#!/usr/pkg/bin/perl
#
# webnew.pl,v 1.5 2001/05/27 22:08:01 kim Exp
#
# For the latest webnew: <URL:http://www.tac.nyc.ny.us/kim/webnew/>
#
#----------------------------------------------------------------------
#
# This program was inspired by
#
# w3new v0.4 by Brooks Cutter <bcutter@stuff.com>
#    http://www.stuff.com/cgi-bin/bbcurn?user=bcutter&pkg=w3new 
#
# I took the parse_html and extract_links_desc functions from it.
# The display_url_list function was fixed to produce working HTML.
#
# The main part of the program has been rewritten, as I wanted to use
# relative links contained in documents at random locations, and also
# recursively fetch modification times for a webbed tree of files.
#
#----------------------------------------------------------------------
#
# You will need the following packages available in the public domain:
#
# libwww-perl v0.40 or later by Roy Fielding (fielding@ics.uci.edu) 
#    http://www.ics.uci.edu/WebSoft/libwww-perl/ 
#
# perl v4.036 by Larry Wall (lwall@netlabs.com) 
#    ftp'able from ftp.uu.net in /systems/gnu as perl-4.036.tar.gz 
# 
#----------------------------------------------------------------------

use Getopt::Std;
use LWP::RobotUA;
use LWP::UserAgent;
use POSIX qw(ctime uname);

# Month and weekday names used when formatting Last-Modified dates;
# indexed directly by the month/wday values returned by gmtime().
@months = (
    'January','February','March','April','May','June','July',
    'August', 'September','October','November','December'
);
@days=('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday');

# if URL isn't fully qualified, this is used as the base
$base = 'http://localhost/';

#
# -A	authorization (username:password)
# -P	don't use proxies
# -R	recursive as robot
# -V	show version
#
# -a	prefer anchors
# -c	content-type
# -d	debug
# -e	email address
# -i	items only (results in an incomplete HTML file)
# -n	nodate
# -r	recursive
# -t	title text
# -v	verbose
# -x	exclude pointers to webnew software
#
# URL is the only parameter.

$opt_c = "text";	# default content-type filter (-c)
$opt_t = "What's new";	# default page title (-t)

getopts('A:PRVac:de:inrt:vx');

if ($opt_V) {
    print "webnew 1.3 (25 May 2001)\n";
    print "Copyright 1996-2001 Kimmo Suominen\n";
    exit 0;
}

die "One URL must be specified!\n" if ($#ARGV != 0);

# Debugging (-d) implies verbose output (-v).
$opt_v = 1 if ($opt_d);

if ($opt_R) {
    # Robot mode implies recursion and requires a From: address so the
    # robot can be identified by the sites it visits.
    $opt_r = 1;

    if (! $opt_e) {
	$opt_e = $ENV{'USER'}
	    || $ENV{'LOGNAME'}
	    || (getpwuid($<))[0]
	    || die "Who are you?\n";
	# Bug fix: '\@' inside single quotes keeps the backslash, which
	# produced a bogus "user\@host" address -- use a plain '@'.
	$opt_e .= '@' . (uname())[1];
    }
    $ua = new LWP::RobotUA("webnew/1.3", $opt_e);
    $ua->delay(0.025);
} else {
    $ua = new LWP::UserAgent;
    $ua->agent("webnew/1.3 " . $ua->agent);
    $ua->from($opt_e) if ($opt_e);
}

# Honor http_proxy etc. from the environment unless -P was given.
$ua->env_proxy unless ($opt_P);

# Canonicalize the command-line URL (resolving it against $base when it
# is relative) and remember its directory as the base for recursion.
$url = URI->new($ARGV[0])->abs($base)->canonical->as_string;
$base = substr($url, 0, rindex($url, '/') + 1);
if ($url =~ /^http:/i) {
    if ($opt_r) {
	# Recursive mode: start from the URL itself; further links are
	# collected while walking the tree in the main loop below.
	push(@todo, $url);
	print STDERR "i $url\n" if ($opt_d);
    } else {
	# Non-recursive mode: only check the links found on this page.
	%ret = &extract_links_desc($url);
	for $i (keys %ret) {
	    if ($i =~ /^http:/i) {
		$desc = $ret{$i};
		$i =~ s/#.*$//;	# drop fragment identifier
		$urls{$i} = $desc;
		push(@todo, $i);
		print STDERR "i $i\n" if ($opt_d);
	    } else {
		print STDERR "- $i\n" if ($opt_d);
	    }
	}
    }
}

# Main loop: HEAD each pending URL to learn its modification time and
# content type.  In recursive mode (-r) matching HTML documents are
# also fetched and parsed for further links under $base.
while ($#todo >= 0) {
    $url = pop(@todo);
    print STDERR "G " if ($opt_d);
    print STDERR "$url" if ($opt_v);
    $q = HTTP::Request->new(HEAD => $url);
    $q->authorization_basic(split (/:/, $opt_A)) if (defined $opt_A);
    $r = $ua->request($q);
    if (! $r->is_success) {
	print STDERR " (".$r->status_line.")\n" if ($opt_v);
	next;
    }
    $lmod = $r->headers->last_modified;
    $ctype = $r->headers->content_type;
    # Skip documents whose content type doesn't match -c (default
    # "text"), but remember them so -n can report the mismatch.
    if ($ctype !~ /$opt_c/io) {
	$wrong_ctype{$url} = $ctype;
	print STDERR " ($ctype)\n" if ($opt_v);
	next;
    }
    if ($lmod) {
	$urls_time{$url} = $lmod;
	print STDERR " $lmod" if ($opt_v);
    }
    # Only HTML documents can contain further links.
    if ($ctype !~ m!text/html!i) {
	print STDERR " ($ctype)\n" if ($opt_v);
	next;
    }
    %ret = &extract_links_desc($url) unless ($opt_a && ! $opt_r);
    if ($opt_r) {
	for $i (keys %ret) {
	    $desc = $ret{$i};
	    $i =~ s/#.*$//;	# drop fragment identifier
	    # Stay inside the original tree: follow only links that
	    # match the base URL.
	    if ($i !~ /$base/o) {
		print STDERR "\n- $i" if ($opt_d);
		next;
	    }
	    if (! defined $urls{$i}) {
		$urls{$i} = $desc;
		push(@todo, $i);
		print STDERR "\n+ $i" if ($opt_d);
	    } else {
		print STDERR "\no $i" if ($opt_d);
	    }
	}
    }
    print STDERR "\n" if ($opt_v);
}

# Assemble the page and emit it on STDOUT.  The structural parts are
# skipped with -i (items only); the webnew credits are skipped with -x.
$page = '';
$page .= &display_decl unless ($opt_i);
$page .= &display_pointer unless ($opt_x);
$page .= &display_comments;
$page .= &display_heading unless ($opt_i);
$page .= &display_url_list;
$page .= &display_pointer unless ($opt_x);
$page .= &display_tail unless ($opt_i);
print STDOUT $page;

exit;

sub display_url_list {
  # Build the HTML listing: URLs with a Last-Modified time grouped by
  # month (newest first), optionally followed by undated URLs (-n).
  # Reads the globals %urls_time, %urls, %urls_nobots, %wrong_ctype,
  # @months and @days; returns the HTML fragment as a string.
  local($url,$old_month,$month,$month_no,$list,$_,$day,$wday,$wdaynum,$ul);
  local($year);	# was leaking into the global namespace
  for $url 
    (reverse sort { $urls_time{$a}<=>$urls_time{$b} } keys %urls_time) {
    $old_month = $month;
    ($day,$month_no,$year,$wdaynum) = (gmtime($urls_time{$url}))[3,4,5,6];
    $year += 1900;
    $month = $months[$month_no];
    $wday = $days[$wdaynum];
    # Start a new <H2>/<UL> section whenever the month changes.
    if ($month ne $old_month) {
      $list .= "</UL>\n" if ($old_month);
      $list .= "<H2>$month $year</H2>\n";
      $list .= "<UL>\n";
    }
    $list .= "<LI><A HREF=\"$url\">";
    if ($urls{$url}) {
      # Trim surrounding whitespace from the description.  (Bug fix:
      # the old s/\s*(.*)\s*/$1/ never removed trailing blanks because
      # the greedy .* swallowed them.)
      $urls{$url} =~ s/^\s+//;
      $urls{$url} =~ s/\s+$//;
      $list .= $urls{$url};
    } else {
      $list .= $url;
    }
    $list .= "</A> ($wday, $month $day)\n";
  }
  $list .= "</UL>\n" if ($month);
  if ($opt_n) {
    # List URLs for which no Last-Modified header was returned, sorted
    # by description.  (Bug fix: descriptions are strings, so compare
    # with cmp -- the numeric <=> always yielded 0.)
    $ul = 0;
    for $url (sort {$urls{$a} cmp $urls{$b}} keys %urls) {
      next if ($urls_time{$url});
      unless($ul) { 
        $list .= <<EOF;
<P>
<H2>No modification date returned</H2>
<UL>
EOF
        $ul = 1;
      }
      $list .= <<EOF;
<LI><A HREF="$url">$urls{$url}</A>
EOF
      $list .= "(Robots not welcome)\n" if ($urls_nobots{$url});
      $list .= "(Content-Type: $wrong_ctype{$url})\n" if ($wrong_ctype{$url});
    }
    $list .= "</UL>\n" if ($ul);
  }
  return($list);
}

sub display_decl {
    # Return the document prologue: doctype, <HTML>, <HEAD> with an
    # optional LINK REV=MADE (when -e supplied an address) and the -t
    # title, down to the opening <BODY>.
    local($list);
    $list = "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n";
    $list .= "<HTML>\n";
    $list .= "<HEAD>\n";
    if ($opt_e) {
	$list .= "<LINK REV=MADE HREF=\"mailto:$opt_e\">\n";
    }
    $list .= "<TITLE>$opt_t</TITLE>\n";
    $list .= "</HEAD>\n";
    $list .= "<BODY>\n";
    return($list);
}

sub display_heading {
    # Return the <H1> page heading built from the -t title text.
    local($list);
    $list = "<H1>" . $opt_t . "</H1>\n";
    return($list);
}

sub display_tail {
    # Return the closing </BODY></HTML> markup (skipped with -i).
    local($list);
    $list = <<EOF;
</BODY>
</HTML>
EOF
    # Fix: return the value explicitly instead of relying on the heredoc
    # assignment happening to be the last evaluated expression; this
    # matches the other display_* routines.
    return($list);
}

sub display_pointer {
    # Return a short <H5> footer crediting webnew, stamped with the
    # current time (ctime's trailing newline is chopped off).
    local($list);
    local($now) = &ctime(time);
    chop($now);
    $list = "<H5>This page was created using\n";
    $list .= "<A HREF=\"http://www.tac.nyc.ny.us/kim/webnew/\">webnew 1.3</A> on \n";
    $list .= $now . "</H5>\n";
    return($list);
}

sub display_comments {
    # Return HTML comments identifying the generator (always emitted,
    # even with -i, since comments do not affect rendering).
    local($list);
    $list = "<!-- This listing was created by webnew 1.3 -->\n";
    $list .= "<!--  http://www.tac.nyc.ny.us/kim/webnew/  -->\n";
    return($list);
}

# subroutine: extract_links_desc
# Argument 1: URL to fetch (GET)
# Returns a flat list of (url, description) pairs for every <A HREF>
# found in the document; relative links are resolved against the
# directory of the fetched URL.  Side effects: the document <TITLE> is
# stored as its name in the global %urls (unless -a), and URLs denied
# by robots.txt are recorded in the global %urls_nobots.
sub extract_links_desc {
    local($in_url) = shift(@_);
    local($links,@links,%links);
    local(@ret,$url,$_,$desc,%headers,$content,$response);
    local($base);

    # Bug fix: the robots.txt check used $url, which is freshly
    # local()-ized above and therefore always undef -- test the URL we
    # were actually asked to fetch.
    if ($opt_R && ! $ua->rules->allowed($in_url)) {
	# Record the refusal so display_url_list can report it (-n).
	$urls_nobots{$in_url} = 1;
	print STDERR " (no robots)\n" if ($opt_v);
    } else {
	$q = HTTP::Request->new(GET => $in_url);
	# Use "defined" so an empty -A value still sends credentials,
	# consistent with the HEAD request in the main loop.
	$q->authorization_basic(split (/:/, $opt_A)) if (defined $opt_A);
	$r = $ua->request($q);
	if ($r->is_success) {
	    # Resolve relative links against the document's directory.
	    $base = substr($in_url, 0, rindex($in_url, '/') + 1);
	    &parse_html(*links,$r->content);
	    # @links alternates between plain text and markup; walk it
	    # using the scalar $links as the running index.
	    $links = -1;
	    while (defined $links[++$links]) {
		$_ = $links[$links];
		if (/<a\s.*href\s*=\s*"?([^" <>]+).*>/i) {
		    $url = URI->new($1)->abs($base)->canonical->as_string;
		    # Collect the anchor text up to </a>, skipping any
		    # nested markup segments.
		    $desc = '';
		    while (defined $links[++$links]) {
			last if ($links[$links] =~ m!<\s*/\s*a\s*>!i);
			next if ($links[$links] =~ m!(<|>)!);
			$desc .= $links[$links];
		    }
		    $desc =~ tr/\n/ /;
		    $desc =~ s/ +/ /g;
		    $desc =~ s/(^ )|( $)//g;
		    push(@ret,$url,$desc);
		} elsif ((! $opt_a) && /<title(\s.*)?>/i) {
		    # Collect the document title the same way.
		    $desc = '';
		    while (defined $links[++$links]) {
			last if ($links[$links] =~ m!<\s*/\s*title\s*>!i);
			next if ($links[$links] =~ m!(<|>)!);
			$desc .= $links[$links];
		    }
		    $desc =~ tr/\n/ /;
		    $desc =~ s/ +/ /g;
		    $desc =~ s/(^ )|( $)//g;
		    $urls{$in_url} = $desc; # store title as name in global
		} else {
		    next;
		}
	    }
	}
    }
    return(@ret);
}

# subroutine: parse_html
# Argument 1: (*data)  - array pointer to return data
# Argument 2-n: variables with HTML data
# Returns in *data HTML split up - first is non-HTML tag, 2nd HTML tag...

sub parse_html {
    # Split HTML text into a flat sequence of segments -- plain text
    # and complete <...> tags -- appended to the array passed by glob
    # (first argument).  State is kept in $save so that a tag split
    # across two input chunks is still recognized once completed.
    local(*data) = shift(@_);
    local($save,$lt,$gt,$data,$_);

  NEXTLINE:
    for (@_) {
	# Accumulate input; defer processing until at least one full
	# '<'...'>' pair is present.
	$save .= $_;
	next if ((($lt = index($save,'<')) == -1) ||
			(index($save,'>',$lt) == -1));
	# $gt == 0 doubles as "no tag consumed yet" below.
	$lt = $gt = 0;
	while (($lt = index($save, '<', $gt)) >= $[) {	# $[ is the array base (0)
	    # This is the data *BEFORE* the '<'
	    if ($lt) { # do If isn't /^</
		if ($gt) {
		    $data = substr($save, ($gt+1), ($lt-$gt-1));
		} else {
		    $data = substr($save, ($gt), ($lt-$gt));
		}
		push(@data, $data);
	    }
	    $gt = index($save, '>', $lt);
	    if ($gt == -1) {
		# Incomplete tag: keep the tail and wait for more input.
		$save = substr($save, $lt);
		next NEXTLINE;
	    }
	    # This is the data *INSIDE* the <>
	    $data = substr($save, $lt, ($gt-$lt+1));
	    push(@data, $data);
	}
	# Keep whatever follows the last consumed '>'.
	$save = substr($save, ($gt+1));
    }
    # Trailing plain text after the final tag.
    push(@data, $save);
    return(@data);
}
