An alternative name for a web spider — a program which automatically fetches
WWW pages, parses them for information and links, and then proceeds to follow those links.
Below is the code for one such crawler, written in everybody's favourite chainsaw. It will start from an arbitrary address or, if none is provided, from the first hit of a Google search for a random dictionary word. From there it will repeatedly pick a random link it hasn't visited yet — always choosing an off-site link in preference to an in-site link — move there, crawl back if it runs out of links, and start a new dictionary search if it runs out of link history.
The code is pretty crufty, but it'll give you the idea. Besides, the code is useful in itself for amusing things; I wrote it this way in order to collect almost-random site addresses for the "today's link" part of my dynamic plan generator for BatMUD. Also available at www.iki.fi/kaatunut/webcrawl.pl where applicable.
#!/usr/bin/perl -w -s
# webcrawler.pl - crawls the web in single thread, collecting URLs and nothing more
# Parameters:
# -start=address set the starting point for crawl
# -output=file print all found valid addresses to file instead of stdout
# -report=file print progress report to file instead of stderr
# -quiet don't print progress reports
use HTML::TreeBuilder;
use LWP;
use IO::Socket;
# Load the word list once, reading it in Perl instead of shelling out to
# `cat`.  On failure @dictionary stays empty (the backtick version behaved
# the same way, modulo cat's own stderr message).
if (open(my $dict, '<', '/usr/dict/words')) {
    @dictionary = <$dict>;
    close $dict;
} else {
    warn "can't read /usr/dict/words: $!";
}
$linkstackmax = 10;    # how many history frames to keep before forgetting the oldest
# Links are followed only when their "extension" is a key here (the values
# are unused; the hash acts as a set, and '' admits extension-less URLs).
%approved_ext = (
    '' => '',
    html => '',
    htm => '',
    shtml => '',
    php => '',
    php3 => '',
    asp => '',
);
use vars qw($start $output $quiet $report);    # filled in by the -s switch parser
# Autoflush the progress handle so partial lines show up immediately.
# NOTE(review): REPORT has not been opened yet at this point; this relies on
# the $| flag set on the bare glob surviving until the later open — confirm.
select REPORT;
$|++;
select STDOUT;
$ua = new LWP::UserAgent;
$ua->agent("Gol " . $ua->agent);    # identify ourselves, keeping LWP's version suffix
# Open the two output channels.  OUTPUT receives the collected addresses
# (-output=file, default stdout); REPORT receives progress chatter
# (-report=file, default stderr, suppressed entirely by -quiet).
# Three-arg open is used throughout: the old two-arg ">$output" form let a
# hostile filename inject an open mode.
if (defined $output) {
    open(OUTPUT, '>', $output) or die "can't open $output for writing: $!";
} else {
    open(OUTPUT, '>&', \*STDOUT) or die "can't redirect stdout to output: $!";
}
if (!$quiet) {
    if ($report) {
        open(REPORT, '>', $report) or die "can't open $report for writing: $!";
    } else {
        open(REPORT, '>&', \*STDERR) or die "can't redirect stderr to report: $!";
    }
}
# Main crawl loop: seed a start page, then walk random links until the
# history stack empties, at which point we reseed from a fresh dictionary word.
while (1) {
    $h = $start || getroot();
    print REPORT "**$h\n" unless $quiet;
    while (1) {
        if (@lt = get_links($h)) {
            # The page yielded links: push them as a new history frame,
            # bounded by $linkstackmax, and record the page as visited.
            push @link_history, [ @lt ];
            shift @link_history if @link_history > $linkstackmax;
            undef $visited{$h};    # key existence is what marks "visited"
            # BUG FIX: found addresses belong on OUTPUT (see -output in the
            # header); the original printed them to REPORT, even under -quiet
            # when REPORT was never opened, and never wrote to OUTPUT at all.
            print OUTPUT "$h\n";
        }
        last unless @link_history;    # history exhausted: go reseed
        # Pull one link at random out of the newest history frame...
        $h = splice @{$link_history[$#link_history]},
            int(rand(@{$link_history[$#link_history]})), 1;
        # ...and once that frame is empty, "crawl back" by dropping it.
        unless (@{$link_history[$#link_history]}) {
            print REPORT "<--\n" unless $quiet;
            pop @link_history;
        }
    }
}
# Pick a random dictionary word and return the URL that Google's
# "I'm Feeling Lucky" search (btnI=) redirects to for it.
# Returns undef when no Location header comes back.
sub getroot {
    my $word;
    chomp($word = $dictionary[int rand @dictionary]);
    print REPORT "word: $word\n" unless $quiet;
    # Percent-escape anything unsafe in a query string; dictionary words can
    # contain apostrophes and other punctuation.
    (my $query = $word) =~ s/([^A-Za-z0-9_.~-])/sprintf("%%%02X", ord $1)/ge;
    my $sock = new IO::Socket::INET(PeerAddr => "www.google.com",
                                    PeerPort => 80, Proto => "tcp");
    die "sock error: $!" unless $sock;
    # BUG FIX: send a Host header — optional in HTTP/1.0 itself, but
    # virtually-hosted servers (Google included) refuse requests without one.
    print $sock "GET /search?q=$query&btnI= HTTP/1.0\r\nHost: www.google.com\r\n\r\n";
    while (<$sock>) {
        if (/^Location: (.*?)\r?\n/) {
            return $1;
        }
    }
    return;    # no redirect found; caller sees undef
}
# Fetch the page at $link and return the list of crawlable URLs found on it
# (both <a href> and <frame src>).  The list is deduplicated, filtered down
# to unvisited http links of "approved" types, sorted same-host-first, and
# only the off-site tail is returned.  Returns the empty list on fetch failure.
sub get_links {
    my $link = shift;
    # Split the URL into protocol, host and directory (the path minus its
    # final filename component); $dir is used to resolve relative links.
    my ($proto, $host, $dir) = $link =~ m!^(\w+):(?://)?([^/]*)((?:/.*?)?)[^/]*$!;
    $dir .= "/" unless $dir =~ /\/$/;
    my @links;
    my @links2;
    my $i;
    my $tree = new HTML::TreeBuilder;
    my $req = new HTTP::Request GET => $link;
    print REPORT "$link ->" unless $quiet;
    my $res = $ua->request($req);
    # Erase the transient "$link ->" progress marker from the report line.
    print REPORT "\r" . (" " x length "$link ->") . "\r" unless $quiet;
    return () unless $res->is_success;
    $tree->parse($res->content);
    @links =
        # sort same-host links first
        sort { (($b =~ m!^\w+://([^/]*)!)[0] eq $host &&
                ($a =~ m!^\w+://([^/]*)!)[0] ne $host) ? 1 :
               (($b =~ m!^\w+://([^/]*)!)[0] ne $host &&
                ($a =~ m!^\w+://([^/]*)!)[0] eq $host) ? -1 : ($a cmp $b) }
        grep { $_ ne $link and !exists $visited{$_} }
        # kill ".."
        map { while (s![^/]*/\.\./!!) { } $_ }
        # handle local links (relative paths get "$proto://$host$dir" prepended)
        map { (/^\w+:(?:\/\/)?/ ? "" : "$proto://$host" . (/^\// ? "" : "$dir")) . $_ }
        # no CGI queries, only "approved" types.
        # BUG FIX: the dot before the extension must be literal (\.); the
        # original unescaped `.` made extension-less paths ("/about") match
        # their own trailing letters and be rejected, although '' is approved.
        grep { !/\?/ && exists $approved_ext{(m!\.([^/.]*)$!)[0] || ""} }
        grep { ((/^(\w+):(?:\/\/)?/)[0] || "http") eq "http" }
        # eliminate #fragments, extract the URL from href (or src for frames)
        map { (($_->attr('_tag') eq "a"
                    ? $_->attr('href')
                    : $_->attr('src')) =~ /^([^#]*)/)[0] }
        # need a link; <frame src=...> is considered a link too
        grep { $_->attr('href') or $_->attr('src') }
        $tree->look_down(sub {
            $_[0]->attr('_tag') eq "a"
                or $_[0]->attr('_tag') eq "frame" });
    # Remove duplicates; adjacency suffices because the list is sorted.
    for ($i = 1; $i < @links; ++$i) {
        splice @links, $i--, 1 if $links[$i] eq $links[$i-1];
    }
    # Count the leading same-host links, then return only the off-site tail.
    # BUG FIX: anchor the match and quote the host (\Q...\E) — an unanchored,
    # unquoted match treated dots in $host as wildcards and counted links that
    # merely *contain* "http://$host" (e.g. in a query string) as same-host.
    my $split;
    for ($split = 0; $split < @links && $links[$split] =~ m!^http://\Q$host\E!; ++$split) { }
    @links2 = splice @links, $split;
    # NOTE(review): when every link is same-host this returns an empty list,
    # so such pages become dead ends.  That may be intentional given the
    # stated off-site preference — confirm before changing.
    return @links2;
}