author:     Lorry Tar Creator <lorry-tar-importer@lorry>   2012-02-18 13:07:29 +0000
committer:  Lorry Tar Creator <lorry-tar-importer@lorry>   2012-02-18 13:07:29 +0000
commit:     80c86924c3437c0ad64852ea29c7eab1197dfe90 (patch)
tree:       11f2945e6c83ffc5d3fec7519d4cb4e9c7194ff5
download:   WWW-RobotRules-tarball-master.tar.gz
refs:       WWW-RobotRules-6.02 (tag), HEAD, master
-rw-r--r--   Changes                              21
-rw-r--r--   MANIFEST                             10
-rw-r--r--   META.yml                             28
-rw-r--r--   Makefile.PL                          48
-rw-r--r--   README                              143
-rw-r--r--   lib/WWW/RobotRules.pm               453
-rw-r--r--   lib/WWW/RobotRules/AnyDBM_File.pm   170
-rwxr-xr-x   t/misc/dbmrobot                      23
-rw-r--r--   t/rules-dbm.t                       128
-rw-r--r--   t/rules.t                           230

10 files changed, 1254 insertions, 0 deletions
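Before the file-by-file diff, a brief orientation to how the imported modules fit together. The sketch below is editorial and illustrative only: it follows the fetch-and-parse pattern from the README SYNOPSIS further down, and the bot name and host are placeholders, not anything shipped in this import.

    #!/usr/bin/perl
    use strict;
    use warnings;

    use WWW::RobotRules;        # rule database (lib/WWW/RobotRules.pm below)
    use LWP::Simple qw(get);    # simple fetcher, as used in the SYNOPSIS

    # Placeholder robot name; the first word ("MyBot") is what gets matched
    # against User-Agent lines in robots.txt.
    my $rules = WWW::RobotRules->new('MyBot/1.0');

    # Placeholder host: fetch its robots.txt and record the rules.
    my $robots_url = "http://example.com/robots.txt";
    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    # allowed() consults the rules recorded for that host:port.
    my $page = "http://example.com/private/page.html";
    print $rules->allowed($page) ? "may fetch $page\n" : "must skip $page\n";

In practice this is usually wired up through LWP::RobotUA (see t/misc/dbmrobot below), which fetches and caches robots.txt itself and enforces a request delay.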
diff --git a/Changes b/Changes
new file mode 100644
--- /dev/null
+++ b/Changes
@@ -0,0 +1,21 @@
+_______________________________________________________________________________
+2012-02-18 WWW-RobotRules 6.02
+
+Restore perl-5.8.1 compatibility.
+
+
+
+_______________________________________________________________________________
+2011-03-13 WWW-RobotRules 6.01
+
+Added legal notice and updated the meta repository link
+
+
+
+_______________________________________________________________________________
+2011-02-25 WWW-RobotRules 6.00
+
+Initial release of WWW-RobotRules as a separate distribution. There are no code
+changes besides incrementing the version number since libwww-perl-5.837.
+
+The WWW::RobotRules module used to be bundled with the libwww-perl distribution.
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..67c9683
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,10 @@
+Changes
+lib/WWW/RobotRules.pm
+lib/WWW/RobotRules/AnyDBM_File.pm
+Makefile.PL
+MANIFEST		This list of files
+README
+t/misc/dbmrobot
+t/rules-dbm.t
+t/rules.t
+META.yml		Module meta-data (added by MakeMaker)
diff --git a/META.yml b/META.yml
new file mode 100644
index 0000000..866f160
--- /dev/null
+++ b/META.yml
@@ -0,0 +1,28 @@
+--- #YAML:1.0
+name: WWW-RobotRules
+version: 6.02
+abstract: database of robots.txt-derived permissions
+author:
+    - Gisle Aas <gisle@activestate.com>
+license: perl
+distribution_type: module
+configure_requires:
+    ExtUtils::MakeMaker: 0
+build_requires:
+    ExtUtils::MakeMaker: 0
+requires:
+    AnyDBM_File: 0
+    Fcntl: 0
+    perl: 5.008001
+    URI: 1.10
+resources:
+    MailingList: mailto:libwww@perl.org
+    repository: http://github.com/gisle/www-robotrules
+no_index:
+    directory:
+        - t
+        - inc
+generated_by: ExtUtils::MakeMaker version 6.57_05
+meta-spec:
+    url: http://module-build.sourceforge.net/META-spec-v1.4.html
+    version: 1.4
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..f52b375
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,48 @@
+#!perl -w
+
+require 5.008001;
+use strict;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+    NAME => 'WWW::RobotRules',
+    VERSION_FROM => 'lib/WWW/RobotRules.pm',
+    ABSTRACT_FROM => 'lib/WWW/RobotRules.pm',
+    AUTHOR => 'Gisle Aas <gisle@activestate.com>',
+    LICENSE => "perl",
+    MIN_PERL_VERSION => 5.008001,
+    PREREQ_PM => {
+        'AnyDBM_File' => 0,
+        'Fcntl' => 0,
+        'URI' => "1.10",
+    },
+    META_MERGE => {
+        resources => {
+            repository => 'http://github.com/gisle/www-robotrules',
+            MailingList => 'mailto:libwww@perl.org',
+        }
+    },
+);
+
+
+BEGIN {
+    # compatibility with older versions of MakeMaker
+    my $developer = -f ".gitignore";
+    my %mm_req = (
+        LICENCE => 6.31,
+        META_MERGE => 6.45,
+        META_ADD => 6.45,
+        MIN_PERL_VERSION => 6.48,
+    );
+    undef(*WriteMakefile);
+    *WriteMakefile = sub {
+        my %arg = @_;
+        for (keys %mm_req) {
+            unless (eval { ExtUtils::MakeMaker->VERSION($mm_req{$_}) }) {
+                warn "$_ $@" if $developer;
+                delete $arg{$_};
+            }
+        }
+        ExtUtils::MakeMaker::WriteMakefile(%arg);
+    };
+}
diff --git a/README b/README
new file mode 100644
--- /dev/null
+++ b/README
@@ -0,0 +1,143 @@
+NAME
+    WWW::RobotRules - database of robots.txt-derived permissions
+
+SYNOPSIS
+     use WWW::RobotRules;
+     my $rules = WWW::RobotRules->new('MOMspider/1.0');
+
+     use LWP::Simple qw(get);
+
+     {
+       my $url = "http://some.place/robots.txt";
+       my $robots_txt = get $url;
+       $rules->parse($url, $robots_txt) if defined $robots_txt;
+     }
+
+     {
+       my $url = "http://some.other.place/robots.txt";
+       my $robots_txt = get $url;
+       $rules->parse($url, $robots_txt) if defined $robots_txt;
+     }
+
+     # Now we can check if a URL is valid for those servers
+     # whose "robots.txt" files we've gotten and parsed:
+     if($rules->allowed($url)) {
+         $c = get $url;
+         ...
+     }
+
+DESCRIPTION
+    This module parses /robots.txt files as specified in "A Standard for
+    Robot Exclusion", at <http://www.robotstxt.org/wc/norobots.html>.
+    Webmasters can use the /robots.txt file to forbid conforming robots from
+    accessing parts of their web site.
+
+    The parsed files are kept in a WWW::RobotRules object, and this object
+    provides methods to check if access to a given URL is prohibited. The
+    same WWW::RobotRules object can be used for one or more parsed
+    /robots.txt files on any number of hosts.
+
+    The following methods are provided:
+
+    $rules = WWW::RobotRules->new($robot_name)
+        This is the constructor for WWW::RobotRules objects. The first
+        argument given to new() is the name of the robot.
+
+    $rules->parse($robot_txt_url, $content, $fresh_until)
+        The parse() method takes as arguments the URL that was used to
+        retrieve the /robots.txt file, and the contents of the file.
+
+    $rules->allowed($uri)
+        Returns TRUE if this robot is allowed to retrieve this URL.
+
+    $rules->agent([$name])
+        Get/set the agent name. NOTE: Changing the agent name will clear the
+        robots.txt rules and expire times out of the cache.
+
+ROBOTS.TXT
+    The format and semantics of the "/robots.txt" file are as follows (this
+    is an edited abstract of <http://www.robotstxt.org/wc/norobots.html>):
+
+    The file consists of one or more records separated by one or more blank
+    lines. Each record contains lines of the form
+
+      <field-name>: <value>
+
+    The field name is case insensitive. Text after the '#' character on a
+    line is ignored during parsing. This is used for comments. The following
+    <field-names> can be used:
+
+    User-Agent
+        The value of this field is the name of the robot the record is
+        describing access policy for. If more than one *User-Agent* field is
+        present the record describes an identical access policy for more than
+        one robot. At least one field needs to be present per record. If the
+        value is '*', the record describes the default access policy for any
+        robot that has not matched any of the other records.
+
+        The *User-Agent* fields must occur before the *Disallow* fields. If a
+        record contains a *User-Agent* field after a *Disallow* field, that
+        constitutes a malformed record. This parser will assume that a blank
+        line should have been placed before that *User-Agent* field, and will
+        break the record into two. All the fields before the *User-Agent*
+        field will constitute a record, and the *User-Agent* field will be
+        the first field in a new record.
+
+    Disallow
+        The value of this field specifies a partial URL that is not to be
+        visited. This can be a full path, or a partial path; any URL that
+        starts with this value will not be retrieved.
+
+    Unrecognized records are ignored.
+
+ROBOTS.TXT EXAMPLES
+    The following example "/robots.txt" file specifies that no robots should
+    visit any URL starting with "/cyberworld/map/" or "/tmp/":
+
+      User-agent: *
+      Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+      Disallow: /tmp/ # these will soon disappear
+
+    This example "/robots.txt" file specifies that no robots should visit
+    any URL starting with "/cyberworld/map/", except the robot called
+    "cybermapper":
+
+      User-agent: *
+      Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+      # Cybermapper knows where to go.
+      User-agent: cybermapper
+      Disallow:
+
+    This example indicates that no robots should visit this site further:
+
+      # go away
+      User-agent: *
+      Disallow: /
+
+    This is an example of a malformed robots.txt file.
+
+      # robots.txt for ancientcastle.example.com
+      # I've locked myself away.
+      User-agent: *
+      Disallow: /
+      # The castle is your home now, so you can go anywhere you like.
+      User-agent: Belle
+      Disallow: /west-wing/ # except the west wing!
+      # It's good to be the Prince...
+      User-agent: Beast
+      Disallow:
+
+    This file is missing the required blank lines between records. However,
+    the intention is clear.
+
+SEE ALSO
+    LWP::RobotUA, WWW::RobotRules::AnyDBM_File
+
+COPYRIGHT
+      Copyright 1995-2009, Gisle Aas
+      Copyright 1995, Martijn Koster
+
+    This library is free software; you can redistribute it and/or modify it
+    under the same terms as Perl itself.
+
diff --git a/lib/WWW/RobotRules.pm b/lib/WWW/RobotRules.pm
new file mode 100644
index 0000000..5dfc453
--- /dev/null
+++ b/lib/WWW/RobotRules.pm
@@ -0,0 +1,453 @@
+package WWW::RobotRules;
+
+$VERSION = "6.02";
+sub Version { $VERSION; }
+
+use strict;
+use URI ();
+
+
+
+sub new {
+    my($class, $ua) = @_;
+
+    # This ugly hack is needed to ensure backwards compatibility.
+    # The "WWW::RobotRules" class is now really abstract.
+    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";
+
+    my $self = bless { }, $class;
+    $self->agent($ua);
+    $self;
+}
+
+
+sub parse {
+    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
+    $robot_txt_uri = URI->new("$robot_txt_uri");
+    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
+
+    $self->clear_rules($netloc);
+    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
+
+    my $ua;
+    my $is_me = 0;              # 1 iff this record is for me
+    my $is_anon = 0;            # 1 iff this record is for *
+    my $seen_disallow = 0;      # watch for missing record separators
+    my @me_disallowed = ();     # rules disallowed for me
+    my @anon_disallowed = ();   # rules disallowed for *
+
+    # blank lines are significant, so turn CRLF into LF to avoid generating
+    # false ones
+    $txt =~ s/\015\012/\012/g;
+
+    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
+    for(split(/[\012\015]/, $txt)) {
+
+        # Lines containing only a comment are discarded completely, and
+        # therefore do not indicate a record boundary.
+        next if /^\s*\#/;
+
+        s/\s*\#.*//;        # remove comments at end-of-line
+
+        if (/^\s*$/) {      # blank line
+            last if $is_me; # That was our record. No need to read the rest.
+            $is_anon = 0;
+            $seen_disallow = 0;
+        }
+        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
+            $ua = $1;
+            $ua =~ s/\s+$//;
+
+            if ($seen_disallow) {
+                # treat as start of a new record
+                $seen_disallow = 0;
+                last if $is_me; # That was our record. No need to read the rest.
+                $is_anon = 0;
+            }
+
+            if ($is_me) {
+                # This record already had a User-agent that
+                # we matched, so just continue.
+            }
+            elsif ($ua eq '*') {
+                $is_anon = 1;
+            }
+            elsif($self->is_me($ua)) {
+                $is_me = 1;
+            }
+        }
+        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
+            unless (defined $ua) {
+                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
+                $is_anon = 1;  # assume that User-agent: * was intended
+            }
+            my $disallow = $1;
+            $disallow =~ s/\s+$//;
+            $seen_disallow = 1;
+            if (length $disallow) {
+                my $ignore;
+                eval {
+                    my $u = URI->new_abs($disallow, $robot_txt_uri);
+                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
+                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
+                    $ignore++ if $u->port ne $robot_txt_uri->port;
+                    $disallow = $u->path_query;
+                    $disallow = "/" unless length $disallow;
+                };
+                next if $@;
+                next if $ignore;
+            }
+
+            if ($is_me) {
+                push(@me_disallowed, $disallow);
+            }
+            elsif ($is_anon) {
+                push(@anon_disallowed, $disallow);
+            }
+        }
+        elsif (/\S\s*:/) {
+            # ignore
+        }
+        else {
+            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
+        }
+    }
+
+    if ($is_me) {
+        $self->push_rules($netloc, @me_disallowed);
+    }
+    else {
+        $self->push_rules($netloc, @anon_disallowed);
+    }
+}
+
+
+#
+# Returns TRUE if the given name matches the
+# name of this robot
+#
+sub is_me {
+    my($self, $ua_line) = @_;
+    my $me = $self->agent;
+
+    # See whether my short-name is a substring of the
+    # "User-Agent: ..." line that we were passed:
+
+    if(index(lc($me), lc($ua_line)) >= 0) {
+        return 1;
+    }
+    else {
+        return '';
+    }
+}
+
+
+sub allowed {
+    my($self, $uri) = @_;
+    $uri = URI->new("$uri");
+
+    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
+     # Robots.txt applies to only those schemes.
+
+    my $netloc = $uri->host . ":" . $uri->port;
+
+    my $fresh_until = $self->fresh_until($netloc);
+    return -1 if !defined($fresh_until) || $fresh_until < time;
+
+    my $str = $uri->path_query;
+    my $rule;
+    for $rule ($self->rules($netloc)) {
+        return 1 unless length $rule;
+        return 0 if index($str, $rule) == 0;
+    }
+    return 1;
+}
+
+
+# The following methods must be provided by the subclass.
+sub agent;
+sub visit;
+sub no_visits;
+sub last_visits;
+sub fresh_until;
+sub push_rules;
+sub clear_rules;
+sub rules;
+sub dump;
+
+
+
+package WWW::RobotRules::InCore;
+
+use vars qw(@ISA);
+@ISA = qw(WWW::RobotRules);
+
+
+
+sub agent {
+    my ($self, $name) = @_;
+    my $old = $self->{'ua'};
+    if ($name) {
+        # Strip it so that it's just the short name.
+        # I.e., "FooBot"                                      => "FooBot"
+        #       "FooBot/1.2"                                  => "FooBot"
+        #       "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
+
+        $name = $1 if $name =~ m/(\S+)/; # get first word
+        $name =~ s!/.*!!;                # get rid of version
+        unless ($old && $old eq $name) {
+            delete $self->{'loc'}; # all old info is now stale
+            $self->{'ua'} = $name;
+        }
+    }
+    $old;
+}
+
+
+sub visit {
+    my($self, $netloc, $time) = @_;
+    return unless $netloc;
+    $time ||= time;
+    $self->{'loc'}{$netloc}{'last'} = $time;
+    my $count = \$self->{'loc'}{$netloc}{'count'};
+    if (!defined $$count) {
+        $$count = 1;
+    }
+    else {
+        $$count++;
+    }
+}
+
+
+sub no_visits {
+    my ($self, $netloc) = @_;
+    $self->{'loc'}{$netloc}{'count'};
+}
+
+
+sub last_visit {
+    my ($self, $netloc) = @_;
+    $self->{'loc'}{$netloc}{'last'};
+}
+
+
+sub fresh_until {
+    my ($self, $netloc, $fresh_until) = @_;
+    my $old = $self->{'loc'}{$netloc}{'fresh'};
+    if (defined $fresh_until) {
+        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
+    }
+    $old;
+}
+
+
+sub push_rules {
+    my($self, $netloc, @rules) = @_;
+    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
+}
+
+
+sub clear_rules {
+    my($self, $netloc) = @_;
+    delete $self->{'loc'}{$netloc}{'rules'};
+}
+
+
+sub rules {
+    my($self, $netloc) = @_;
+    if (defined $self->{'loc'}{$netloc}{'rules'}) {
+        return @{$self->{'loc'}{$netloc}{'rules'}};
+    }
+    else {
+        return ();
+    }
+}
+
+
+sub dump
+{
+    my $self = shift;
+    for (keys %$self) {
+        next if $_ eq 'loc';
+        print "$_ = $self->{$_}\n";
+    }
+    for (keys %{$self->{'loc'}}) {
+        my @rules = $self->rules($_);
+        print "$_: ", join("; ", @rules), "\n";
+    }
+}
+
+
+1;
+
+__END__
+
+
+# Bender: "Well, I don't have anything else
+#         planned for today. Let's get drunk!"
+
+=head1 NAME
+
+WWW::RobotRules - database of robots.txt-derived permissions
+
+=head1 SYNOPSIS
+
+ use WWW::RobotRules;
+ my $rules = WWW::RobotRules->new('MOMspider/1.0');
+
+ use LWP::Simple qw(get);
+
+ {
+   my $url = "http://some.place/robots.txt";
+   my $robots_txt = get $url;
+   $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ {
+   my $url = "http://some.other.place/robots.txt";
+   my $robots_txt = get $url;
+   $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ # Now we can check if a URL is valid for those servers
+ # whose "robots.txt" files we've gotten and parsed:
+ if($rules->allowed($url)) {
+     $c = get $url;
+     ...
+ }
+
+=head1 DESCRIPTION
+
+This module parses F</robots.txt> files as specified in
+"A Standard for Robot Exclusion", at
+<http://www.robotstxt.org/wc/norobots.html>.
+Webmasters can use the F</robots.txt> file to forbid conforming
+robots from accessing parts of their web site.
+
+The parsed files are kept in a WWW::RobotRules object, and this object
+provides methods to check if access to a given URL is prohibited. The
+same WWW::RobotRules object can be used for one or more parsed
+F</robots.txt> files on any number of hosts.
+
+The following methods are provided:
+
+=over 4
+
+=item $rules = WWW::RobotRules->new($robot_name)
+
+This is the constructor for WWW::RobotRules objects. The first
+argument given to new() is the name of the robot.
+
+=item $rules->parse($robot_txt_url, $content, $fresh_until)
+
+The parse() method takes as arguments the URL that was used to
+retrieve the F</robots.txt> file, and the contents of the file.
+
+=item $rules->allowed($uri)
+
+Returns TRUE if this robot is allowed to retrieve this URL.
+
+=item $rules->agent([$name])
+
+Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
+rules and expire times out of the cache.
+
+=back
+
+=head1 ROBOTS.TXT
+
+The format and semantics of the "/robots.txt" file are as follows
+(this is an edited abstract of
+<http://www.robotstxt.org/wc/norobots.html>):
+
+The file consists of one or more records separated by one or more
+blank lines. Each record contains lines of the form
+
+  <field-name>: <value>
+
+The field name is case insensitive. Text after the '#' character on a
+line is ignored during parsing. This is used for comments. The
+following <field-names> can be used:
+
+=over 3
+
+=item User-Agent
+
+The value of this field is the name of the robot the record is
+describing access policy for. If more than one I<User-Agent> field is
+present the record describes an identical access policy for more than
+one robot. At least one field needs to be present per record. If the
+value is '*', the record describes the default access policy for any
+robot that has not matched any of the other records.
+
+The I<User-Agent> fields must occur before the I<Disallow> fields. If a
+record contains a I<User-Agent> field after a I<Disallow> field, that
+constitutes a malformed record. This parser will assume that a blank
+line should have been placed before that I<User-Agent> field, and will
+break the record into two. All the fields before the I<User-Agent> field
+will constitute a record, and the I<User-Agent> field will be the first
+field in a new record.
+
+=item Disallow
+
+The value of this field specifies a partial URL that is not to be
+visited. This can be a full path, or a partial path; any URL that
+starts with this value will not be retrieved.
+
+=back
+
+Unrecognized records are ignored.
+
+=head1 ROBOTS.TXT EXAMPLES
+
+The following example "/robots.txt" file specifies that no robots
+should visit any URL starting with "/cyberworld/map/" or "/tmp/":
+
+  User-agent: *
+  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+  Disallow: /tmp/ # these will soon disappear
+
+This example "/robots.txt" file specifies that no robots should visit
+any URL starting with "/cyberworld/map/", except the robot called
+"cybermapper":
+
+  User-agent: *
+  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+  # Cybermapper knows where to go.
+  User-agent: cybermapper
+  Disallow:
+
+This example indicates that no robots should visit this site further:
+
+  # go away
+  User-agent: *
+  Disallow: /
+
+This is an example of a malformed robots.txt file.
+
+  # robots.txt for ancientcastle.example.com
+  # I've locked myself away.
+  User-agent: *
+  Disallow: /
+  # The castle is your home now, so you can go anywhere you like.
+  User-agent: Belle
+  Disallow: /west-wing/ # except the west wing!
+  # It's good to be the Prince...
+  User-agent: Beast
+  Disallow:
+
+This file is missing the required blank lines between records.
+However, the intention is clear.
+
+=head1 SEE ALSO
+
+L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>
+
+=head1 COPYRIGHT
+
+  Copyright 1995-2009, Gisle Aas
+  Copyright 1995, Martijn Koster
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
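The POD above describes parse() and allowed(), but the three-way return value of allowed() in the in-core class is easy to miss: it returns -1 when the fresh_until time recorded by parse() has passed (or was never set), 0 when a Disallow rule matches, and 1 otherwise. A small illustrative sketch follows; the host, rule text and 60-second lifetime are made up for this example and are not taken from the distribution.

    use strict;
    use warnings;
    use WWW::RobotRules;

    my $rules = WWW::RobotRules->new('MOMspider/1.0');

    # The optional third argument to parse() sets when these rules expire.
    $rules->parse('http://foo/robots.txt',
                  "User-agent: *\nDisallow: /private\n",
                  time + 60);

    my $verdict = $rules->allowed('http://foo/private');
    if    ($verdict < 0) { print "rules are stale; refetch robots.txt first\n" }
    elsif ($verdict)     { print "allowed\n" }
    else                 { print "disallowed\n" }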
diff --git a/lib/WWW/RobotRules/AnyDBM_File.pm b/lib/WWW/RobotRules/AnyDBM_File.pm
new file mode 100644
index 0000000..8daa688
--- /dev/null
+++ b/lib/WWW/RobotRules/AnyDBM_File.pm
@@ -0,0 +1,170 @@
+package WWW::RobotRules::AnyDBM_File;
+
+require WWW::RobotRules;
+@ISA = qw(WWW::RobotRules);
+$VERSION = "6.00";
+
+use Carp ();
+use AnyDBM_File;
+use Fcntl;
+use strict;
+
+=head1 NAME
+
+WWW::RobotRules::AnyDBM_File - Persistent RobotRules
+
+=head1 SYNOPSIS
+
+ require WWW::RobotRules::AnyDBM_File;
+ require LWP::RobotUA;
+
+ # Create a robot useragent that uses a diskcaching RobotRules
+ my $rules = WWW::RobotRules::AnyDBM_File->new( 'my-robot/1.0', 'cachefile' );
+ my $ua = WWW::RobotUA->new( 'my-robot/1.0', 'me@foo.com', $rules );
+
+ # Then just use $ua as usual
+ $res = $ua->request($req);
+
+=head1 DESCRIPTION
+
+This is a subclass of I<WWW::RobotRules> that uses the AnyDBM_File
+package to implement persistent diskcaching of F<robots.txt> and host
+visit information.
+
+The constructor (the new() method) takes an extra argument specifying
+the name of the DBM file to use. If the DBM file already exists, then
+you can specify undef as agent name as the name can be obtained from
+the DBM database.
+
+=cut
+
+sub new
+{
+    my ($class, $ua, $file) = @_;
+    Carp::croak('WWW::RobotRules::AnyDBM_File filename required') unless $file;
+
+    my $self = bless { }, $class;
+    $self->{'filename'} = $file;
+    tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_CREAT|O_RDWR, 0640
+        or Carp::croak("Can't open $file: $!");
+
+    if ($ua) {
+        $self->agent($ua);
+    }
+    else {
+        # Try to obtain name from DBM file
+        $ua = $self->{'dbm'}{"|ua-name|"};
+        Carp::croak("No agent name specified") unless $ua;
+    }
+
+    $self;
+}
+
+sub agent {
+    my($self, $newname) = @_;
+    my $old = $self->{'dbm'}{"|ua-name|"};
+    if (defined $newname) {
+        $newname =~ s!/?\s*\d+.\d+\s*$!!;  # lose version
+        unless ($old && $old eq $newname) {
+            # Old info is now stale.
+            my $file = $self->{'filename'};
+            untie %{$self->{'dbm'}};
+            tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_TRUNC|O_RDWR, 0640;
+            %{$self->{'dbm'}} = ();
+            $self->{'dbm'}{"|ua-name|"} = $newname;
+        }
+    }
+    $old;
+}
+
+sub no_visits {
+    my ($self, $netloc) = @_;
+    my $t = $self->{'dbm'}{"$netloc|vis"};
+    return 0 unless $t;
+    (split(/;\s*/, $t))[0];
+}
+
+sub last_visit {
+    my ($self, $netloc) = @_;
+    my $t = $self->{'dbm'}{"$netloc|vis"};
+    return undef unless $t;
+    (split(/;\s*/, $t))[1];
+}
+
+sub fresh_until {
+    my ($self, $netloc, $fresh) = @_;
+    my $old = $self->{'dbm'}{"$netloc|exp"};
+    if ($old) {
+        $old =~ s/;.*//;  # remove cleartext
+    }
+    if (defined $fresh) {
+        $fresh .= "; " . localtime($fresh);
+        $self->{'dbm'}{"$netloc|exp"} = $fresh;
+    }
+    $old;
+}
+
+sub visit {
+    my($self, $netloc, $time) = @_;
+    $time ||= time;
+
+    my $count = 0;
+    my $old = $self->{'dbm'}{"$netloc|vis"};
+    if ($old) {
+        my $last;
+        ($count,$last) = split(/;\s*/, $old);
+        $time = $last if $last > $time;
+    }
+    $count++;
+    $self->{'dbm'}{"$netloc|vis"} = "$count; $time; " . localtime($time);
+}
+
+sub push_rules {
+    my($self, $netloc, @rules) = @_;
+    my $cnt = 1;
+    $cnt++ while $self->{'dbm'}{"$netloc|r$cnt"};
+
+    foreach (@rules) {
+        $self->{'dbm'}{"$netloc|r$cnt"} = $_;
+        $cnt++;
+    }
+}
+
+sub clear_rules {
+    my($self, $netloc) = @_;
+    my $cnt = 1;
+    while ($self->{'dbm'}{"$netloc|r$cnt"}) {
+        delete $self->{'dbm'}{"$netloc|r$cnt"};
+        $cnt++;
+    }
+}
+
+sub rules {
+    my($self, $netloc) = @_;
+    my @rules = ();
+    my $cnt = 1;
+    while (1) {
+        my $rule = $self->{'dbm'}{"$netloc|r$cnt"};
+        last unless $rule;
+        push(@rules, $rule);
+        $cnt++;
+    }
+    @rules;
+}
+
+sub dump
+{
+}
+
+1;
+
+=head1 SEE ALSO
+
+L<WWW::RobotRules>, L<LWP::RobotUA>
+
+=head1 AUTHORS
+
+Hakan Ardo E<lt>hakan@munin.ub2.lu.seE<gt>, Gisle Aas E<lt>aas@sn.noE<gt>
+
+=cut
+
diff --git a/t/misc/dbmrobot b/t/misc/dbmrobot
new file mode 100755
index 0000000..c1794c4
--- /dev/null
+++ b/t/misc/dbmrobot
@@ -0,0 +1,23 @@
+#!/local/perl/bin/perl -w
+
+use URI::URL;
+$url = url(shift) || die "Usage: $0 <url>\n";
+
+require WWW::RobotRules::AnyDBM_File;
+require LWP::RobotUA;
+
+$botname = "Spider/0.1";
+
+$rules = new WWW::RobotRules::AnyDBM_File $botname, 'robotdb';
+$ua = new LWP::RobotUA $botname, 'gisle@aas.no', $rules;
+$ua->delay(0.1);
+
+my $req = new HTTP::Request GET => $url;
+
+my $res = $ua->request($req);
+print "Got ", $res->code, " ", $res->message, "(", $res->content_type, ")\n";
+
+my $netloc = $url->netloc;
+print "This was visit no ", $ua->no_visits($netloc), " to $netloc\n";
+
+
diff --git a/t/rules-dbm.t b/t/rules-dbm.t
new file mode 100644
index 0000000..2335b94
--- /dev/null
+++ b/t/rules-dbm.t
@@ -0,0 +1,128 @@
+
+print "1..13\n";
+
+
+use WWW::RobotRules::AnyDBM_File;
+
+$file = "test-$$";
+
+$r = new WWW::RobotRules::AnyDBM_File "myrobot/2.0", $file;
+
+$r->parse("http://www.aas.no/robots.txt", "");
+
+$r->visit("www.aas.no:80");
+
+print "not " if $r->no_visits("www.aas.no:80") != 1;
+print "ok 1\n";
+
+
+$r->push_rules("www.sn.no:80", "/aas", "/per");
+$r->push_rules("www.sn.no:80", "/god", "/old");
+
+@r = $r->rules("www.sn.no:80");
+print "Rules: @r\n";
+
+print "not " if "@r" ne "/aas /per /god /old";
+print "ok 2\n";
+
+$r->clear_rules("per");
+$r->clear_rules("www.sn.no:80");
+
+@r = $r->rules("www.sn.no:80");
+print "Rules: @r\n";
+
+print "not " if "@r" ne "";
+print "ok 3\n";
+
+$r->visit("www.aas.no:80", time+10);
+$r->visit("www.sn.no:80");
+
+print "No visits: ", $r->no_visits("www.aas.no:80"), "\n";
+print "Last visit: ", $r->last_visit("www.aas.no:80"), "\n";
+print "Fresh until: ", $r->fresh_until("www.aas.no:80"), "\n";
+
+print "not " if $r->no_visits("www.aas.no:80") != 2;
+print "ok 4\n";
+
+print "not " if abs($r->last_visit("www.sn.no:80") - time) > 2;
+print "ok 5\n";
+
+$r = undef;
+
+# Try to reopen the database without a name specified
+$r = new WWW::RobotRules::AnyDBM_File undef, $file;
+$r->visit("www.aas.no:80");
+
+print "not " if $r->no_visits("www.aas.no:80") != 3;
+print "ok 6\n";
+
+print "Agent-Name: ", $r->agent, "\n";
+print "not " if $r->agent ne "myrobot";
+print "ok 7\n";
+
+$r = undef;
+
+print "*** Dump of database ***\n";
+tie(%cat, AnyDBM_File, $file, 0, 0644) or die "Can't tie: $!";
+while (($key,$val) = each(%cat)) {
+    print "$key\t$val\n";
+}
+print "******\n";
+
+untie %cat;
+
+# Try to open database with a different agent name
+$r = new WWW::RobotRules::AnyDBM_File "MOMSpider/2.0", $file;
+
+print "not " if $r->no_visits("www.sn.no:80");
+print "ok 8\n";
+
+# Try parsing
+$r->parse("http://www.sn.no:8080/robots.txt", <<EOT, (time + 3));
+
+User-Agent: *
+Disallow: /
+
+User-Agent: Momspider
+Disallow: /foo
+Disallow: /bar
+
+EOT
+
+@r = $r->rules("www.sn.no:8080");
+print "not " if "@r" ne "/foo /bar";
+print "ok 9\n";
+
+print "not " if $r->allowed("http://www.sn.no") >= 0;
+print "ok 10\n";
+
+print "not " if $r->allowed("http://www.sn.no:8080/foo/gisle");
+print "ok 11\n";
+
+sleep(5); # wait until file has expired
+print "not " if $r->allowed("http://www.sn.no:8080/foo/gisle") >= 0;
+print "ok 12\n";
+
+
+$r = undef;
+
+print "*** Dump of database ***\n";
+tie(%cat, AnyDBM_File, $file, 0, 0644) or die "Can't tie: $!";
+while (($key,$val) = each(%cat)) {
+    print "$key\t$val\n";
+}
+print "******\n";
+
+untie %cat; # Otherwise the next line fails on DOSish
+
+while (unlink("$file", "$file.pag", "$file.dir", "$file.db")) {}
+
+# Try to open an empty database without specifying a name
+eval {
+    $r = new WWW::RobotRules::AnyDBM_File undef, $file;
+};
+print $@;
+print "not " unless $@; # should fail
+print "ok 13\n";
+
+unlink "$file", "$file.pag", "$file.dir", "$file.db";
diff --git a/t/rules.t b/t/rules.t
new file mode 100644
index 0000000..26b1025
--- /dev/null
+++ b/t/rules.t
@@ -0,0 +1,230 @@
+#!/local/bin/perl
+
+=head1 NAME
+
+robot-rules.t
+
+=head1 DESCRIPTION
+
+Test a number of different F</robots.txt> files against a number
+of different User-agents.
+
+=cut
+
+require WWW::RobotRules;
+use Carp;
+use strict;
+
+print "1..50\n"; # for Test::Harness
+
+# We test a number of different /robots.txt files,
+#
+
+my $content1 = <<EOM;
+# http://foo/robots.txt
+User-agent: *
+Disallow: /private
+Disallow: http://foo/also_private
+
+User-agent: MOMspider
+Disallow:
+EOM
+
+my $content2 = <<EOM;
+# http://foo/robots.txt
+User-agent: MOMspider
+ # comment which should be ignored
+Disallow: /private
+EOM
+
+my $content3 = <<EOM;
+# http://foo/robots.txt
+EOM
+
+my $content4 = <<EOM;
+# http://foo/robots.txt
+User-agent: *
+Disallow: /private
+Disallow: mailto:foo
+
+User-agent: MOMspider
+Disallow: /this
+
+User-agent: Another
+Disallow: /that
+
+
+User-agent: SvartEnke1
+Disallow: http://fOO
+Disallow: http://bar
+
+User-Agent: SvartEnke2
+Disallow: ftp://foo
+Disallow: http://foo:8080/
+Disallow: http://bar/
+
+Sitemap: http://www.adobe.com/sitemap.xml
+EOM
+
+my $content5 = <<EOM;
+# I've locked myself away
+User-agent: *
+Disallow: /
+# The castle is your home now, so you can go anywhere you like.
+User-agent: Belle
+Disallow: /west-wing/ # except the west wing!
+# It's good to be the Prince...
+User-agent: Beast
+Disallow:
+EOM
+
+# same thing backwards
+my $content6 = <<EOM;
+# It's good to be the Prince...
+User-agent: Beast
+Disallow:
+# The castle is your home now, so you can go anywhere you like.
+User-agent: Belle
+Disallow: /west-wing/ # except the west wing!
+# I've locked myself away
+User-agent: *
+Disallow: /
+EOM
+
+# and a number of different robots:
+
+my @tests1 = (
+    [$content1, 'MOMspider' =>
+     1 => 'http://foo/private' => 1,
+     2 => 'http://foo/also_private' => 1,
+    ],
+
+    [$content1, 'Wubble' =>
+     3 => 'http://foo/private' => 0,
+     4 => 'http://foo/also_private' => 0,
+     5 => 'http://foo/other' => 1,
+    ],
+
+    [$content2, 'MOMspider' =>
+     6 => 'http://foo/private' => 0,
+     7 => 'http://foo/other' => 1,
+    ],
+
+    [$content2, 'Wubble' =>
+     8 => 'http://foo/private' => 1,
+     9 => 'http://foo/also_private' => 1,
+    10 => 'http://foo/other' => 1,
+    ],
+
+    [$content3, 'MOMspider' =>
+    11 => 'http://foo/private' => 1,
+    12 => 'http://foo/other' => 1,
+    ],
+
+    [$content3, 'Wubble' =>
+    13 => 'http://foo/private' => 1,
+    14 => 'http://foo/other' => 1,
+    ],
+
+    [$content4, 'MOMspider' =>
+    15 => 'http://foo/private' => 1,
+    16 => 'http://foo/this' => 0,
+    17 => 'http://foo/that' => 1,
+    ],
+
+    [$content4, 'Another' =>
+    18 => 'http://foo/private' => 1,
+    19 => 'http://foo/this' => 1,
+    20 => 'http://foo/that' => 0,
+    ],
+
+    [$content4, 'Wubble' =>
+    21 => 'http://foo/private' => 0,
+    22 => 'http://foo/this' => 1,
+    23 => 'http://foo/that' => 1,
+    ],
+
+    [$content4, 'Another/1.0' =>
+    24 => 'http://foo/private' => 1,
+    25 => 'http://foo/this' => 1,
+    26 => 'http://foo/that' => 0,
+    ],
+
+    [$content4, "SvartEnke1" =>
+    27 => "http://foo/" => 0,
+    28 => "http://foo/this" => 0,
+    29 => "http://bar/" => 1,
+    ],
+
+    [$content4, "SvartEnke2" =>
+    30 => "http://foo/" => 1,
+    31 => "http://foo/this" => 1,
+    32 => "http://bar/" => 1,
+    ],
+
+    [$content4, "MomSpiderJr" =>    # should match "MomSpider"
+    33 => 'http://foo/private' => 1,
+    34 => 'http://foo/also_private' => 1,
+    35 => 'http://foo/this/' => 0,
+    ],
+
+    [$content4, "SvartEnk" =>       # should match "*"
+    36 => "http://foo/" => 1,
+    37 => "http://foo/private/" => 0,
+    38 => "http://bar/" => 1,
+    ],
+
+    [$content5, 'Villager/1.0' =>
+    39 => 'http://foo/west-wing/' => 0,
+    40 => 'http://foo/' => 0,
+    ],
+
+    [$content5, 'Belle/2.0' =>
+    41 => 'http://foo/west-wing/' => 0,
+    42 => 'http://foo/' => 1,
+    ],
+
+    [$content5, 'Beast/3.0' =>
+    43 => 'http://foo/west-wing/' => 1,
+    44 => 'http://foo/' => 1,
+    ],
+
+    [$content6, 'Villager/1.0' =>
+    45 => 'http://foo/west-wing/' => 0,
+    46 => 'http://foo/' => 0,
+    ],
+
+    [$content6, 'Belle/2.0' =>
+    47 => 'http://foo/west-wing/' => 0,
+    48 => 'http://foo/' => 1,
+    ],
+
+    [$content6, 'Beast/3.0' =>
+    49 => 'http://foo/west-wing/' => 1,
+    50 => 'http://foo/' => 1,
+    ],
+
+    # when adding tests, remember to increase
+    # the maximum at the top
+
+    );
+
+my $t;
+
+for $t (@tests1) {
+    my ($content, $ua) = splice(@$t, 0, 2);
+
+    my $robotsrules = new WWW::RobotRules($ua);
+    $robotsrules->parse('http://foo/robots.txt', $content);
+
+    my ($num, $path, $expected);
+    while(($num, $path, $expected) = splice(@$t, 0, 3)) {
+        my $allowed = $robotsrules->allowed($path);
+        $allowed = 1 if $allowed;
+        if($allowed != $expected) {
+            $robotsrules->dump;
+            confess "Test Failed: $ua => $path ($allowed != $expected)";
+        }
+        print "ok $num\n";
+    }
+}
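The DBM-backed test above exercises persistence by closing and reopening the cache file between checks. A minimal sketch of the same idea outside the test harness follows; the cache file name, agent name and URLs are placeholders and the snippet assumes a writable current directory (illustrative, not part of the imported distribution).

    use strict;
    use warnings;
    use WWW::RobotRules::AnyDBM_File;

    my $cache = "robots-cache";   # DBM file name (placeholder)

    # First run: record some rules under the given agent name.
    {
        my $rules = WWW::RobotRules::AnyDBM_File->new('MyBot/1.0', $cache);
        $rules->parse('http://example.com/robots.txt',
                      "User-agent: *\nDisallow: /tmp/\n");
    }

    # A later run can omit the agent name; it is read back from the DBM file.
    {
        my $rules = WWW::RobotRules::AnyDBM_File->new(undef, $cache);
        print $rules->allowed('http://example.com/tmp/x')
            ? "allowed\n" : "blocked\n";
    }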