#!/usr/skunk/bin/perl
#!/usr/local/bin/perl
#!/usr/local/bin/proxyperl
# 
# w3get - point it at a http: url and it recursively retrieves href's
#         and img src's starting from that page
#
# Version 0.1 by Brooks Cutter (bcutter@paradyne.com) 2/5/94
# 
# Usage: w3get [-d] [-v] <fully qualified url>
#
# where fully qualified url is like http://host/file.html
# like the Mosaic What's new page: 
# http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html
#
# -d prints debugging information.
# -v is verbose (prints a message for each url it descends)
#
#
# I wrote this program a month ago in preperation for a presentation
# on Mosaic and the World Wide Web.  I had a sun there and wanted to
# display parts of the web without using a slow PPP connection.
# I haven't done anything with it since then (except today to document
# it and clean it up) so don't intend to develop it further until
# a need arises.  Feel free to hack this up and pass it around.
# (and pass me a copy please...)
# 
#
# If you are a AT&T Site behind the proxy gateway, you will need
# my version of proxyperl.  Email me for more info, and then set below to 1.
$att_proxy = 0; 
# Uses Paradyne Automounter setup..
$pdn = 0;

# This string is prepended to the rewritten url's 
# It could also be a 'file://...' or 'ftp://...', etc...
# mark so can post process later since leaves the original URL on page
$redirect = 'HOPS_LOCAL';
# directory where I can write my output to
$outdir = "$ENV{'HOME'}/tmp/html_get";
mkdir($outdir, 0755) unless(-d $outdir);

die "$0: <fully qualified url>\n(like http://www.cis.ohio-state.edu:80/hypertext/faq/usenet/tcl-faq/part1/faq.html)\n" unless (@ARGV);

unshift(@INC,"/pdn/appl/perl/lib","/pdn/appl/perl/lib/sys") if ($pdn);

require 'url.pl';
require 'getopts.pl';

&Getopts('dv');
$'ipcdebug = 1 if (($opt_d) && ($att_proxy));

#$version = "HTTP 1.0";

push(@todo, @ARGV);
FOREVER: while (1) {
	#last unless(@todo);
print '@todo = ',scalar @todo," ($todo[0])\n" if ($opt_d);
	unless(@todo) {
		last unless(@remote);
		@todo = @remote; @remote = ();
	}
	$node_url = shift(@todo);
	$seen{$node_url} = 1; # So I only descend each url once...

	print "Checking url $node_url\n" if ($opt_v);
	unless ($node_url =~ m!^http://!) {
		warn "Argument must be fully qualified url (ie: http://host/file.html):\n$node_url\n";
		next;
	}
# If it's already pulled the page down, it shouldn't retrieve it
# again - but it needs to open it, parse the hyperlinks and then
# retrieve those if necessary.  Right now it pulls everything down
# whether it has it or not.
#
#	($url_fn,$url_dir) = &url2fndir($url);
#	next if (-e "$outdir/$url_fn");
#
	if ($page = &url'get($node_url,$version)) {
		if ($node_url =~ /html/i) {
			$page = &url'abs($node_url,$page);
			# This should be combined into the one above, but it was a quick kludge
			# (like this program)
			$page = &url'img_abs($node_url,$page);
		}
	} else { warn "$!\n"; next; }
	$node_host = '';
	if ($node_url =~ m|^http://([^/]+)/?.*$|) {
		$node_host = $1;
	}

	# I should really get the type from HTTP/1.0 headers...
	if ($node_url =~ /html$/i) { 
		@links = &parse_html($page);
		@http = &extract_http(@links);
		for (@http) {
			s/#.*//; # Delete skipto marks
			next if ($seen{$_});
			next if (/htbin/); # skip hitbin
			next if (/cgi.*bin/); # skip hitbin
			next if (/\?/); # Skip argument urls...
			#next unless (/paradyne.com/); # If you don't want to stray from a domain
			if (($node_host) && ($node_url =~ m!http://$node_host!)) {
				# Do local ones first
				push(@todo, $_);
			} else {
				push(@remote, $_);
			}
			$seen{$_} = 1;
		}
		@links2 = &localize(@links); # Should use pointers
		&save_url($node_url, @links2);
	} else {
		&save_url($node_url, $page);
	}
	next;
}

exit;


sub save_url {
	local($url) = shift(@_);
	local($url_fn, $url_dir) = &url2fndir($url);
	return unless($url_fn);
	if ($url_dir) {
		if ((-e "$outdir/$url_dir") && (!-d "$outdir/$url_dir")) {
			# url was previously referenced like:
			# http://host/directory - and thought it was a file when a
			# directory index was generated.  So move it to index.html...
			system("mv $outdir/$url_dir $outdir/$url_dir.index");
			system("mkdir -p $outdir/$url_dir");
			system("mv $outdir/$url_dir.index $outdir/$url_dir/index.html");
		} elsif (!-e "$outdir/$url_dir") { system("mkdir -p $outdir/$url_dir"); }
	}
	print STDERR "Writing $url to $url_fn\n";
	if (-e "$outdir/$url_fn") {
		print STDERR "--->>> HEY, $url_fn already exists!\n";
		return;
	}
	open(OUT, ">$outdir/$url_fn");
	print OUT @_;
	close(OUT);
}

sub url2fndir {
	local($url) = shift(@_);
	return($cache_url_fn{$url},$cache_url_dir{$url})
		if (($cache_url_fn{$url}) && ($cache_url_dir{$url}));
	local($url_fn,$url_dir);

	if ($url =~ m!^http://(.+)$!) {
		$url_fn = $1;
		$url_fn =~ tr/~/_/d;
		@url_dir = split(/\//, $url_fn); pop(@url_dir);
		$url_dir = join('/',@url_dir);
		$cache_url_fn{$url} = $url_fn;
		$cache_url_dir{$url} = $url_dir;
		return($url_fn,$url_dir);
	}
	return('');
}

sub extract_http {
	local($url);
	local($_,@return);

	for (@_) {
		next unless ((/^<a/i) || (/^<img/i));
		if ((/^<(a\s+.*href)=([^>]+)>$/i) || (/^<(img\s+.*src)=([^>]+)>$/i)) {
			$cmd = $1;
			$url = $2;
			$url =~ tr/'"//d; # Delete quotes

			if (($url =~ /^http:/) || ($url =~ m!^[/a-zA-Z0-9]!)) {
				push(@return, $url);
			}
			next;
		}
	}
	return(@return);
}

sub localize {
	local($_,@return);
	local(@r);

	for (@_) {
		unless ((/^<a/i) || (/^<img/i)) {
			push(@r, $_);
			next;
		}
		if ((/^<(a\s+.*href)=([^>]+)>$/i) || (/^<(img\s+.*src)=([^>]+)>$/i)) {
			$cmd = $1;
			$url = $2;
			$url =~ tr/'"//d; # Delete quotes
#print "localize found url $url\n";

			if ($url =~ m!^http://(.+)$!) {
				push(@r, "<$cmd=\"$redirect/$1\">");
			} else {
				push(@r, "<$cmd=\"$url\">");
			}
			next;
		}
	}
	return(@r);
}

sub parse_html {
	local(@data) = ();
	local($save, $_, $lt, $gt);
	NEXTLINE: for (split(/\r/,$_[0])) {
		$save .= $_;
		if ((($lt = index($save,'<')) == -1) || (index($save,'>',$lt) == -1))
			{ next; }
		$lt = $gt = 0;
		while (($lt = index($save, '<', $gt)) >= $[) {
			# This is the data *BEFORE* the '<'
			if ($lt) { # do If isn't /^</
				if ($gt) {
					$data = substr($save, ($gt+1), ($lt-$gt-1));
				} else {
					$data = substr($save, ($gt), ($lt-$gt));
				}
				push(@data, $data);
			}
			$gt = index($save, '>', $lt);
			if ($gt == -1) {
				$save = substr($save, $lt);
				next NEXTLINE;
			}
			# This is the data *INSIDE* the <>
			$data = substr($save, $lt, ($gt-$lt+1));
			push(@data, $data);
		}
		$save = substr($save, ($gt+1));
	}
	push(@data, $save);
	return(@data);
}

# EOF
