 Changes                           |  21
 MANIFEST                          |  10
 META.yml                          |  28
 Makefile.PL                       |  48
 README                            | 143
 lib/WWW/RobotRules.pm             | 453
 lib/WWW/RobotRules/AnyDBM_File.pm | 170
 t/misc/dbmrobot                   |  23
 t/rules-dbm.t                     | 128
 t/rules.t                         | 230
 10 files changed, 1254 insertions(+), 0 deletions(-)
diff --git a/Changes b/Changes
new file mode 100644
index 0000000..727c64b
--- /dev/null
+++ b/Changes
@@ -0,0 +1,21 @@
+_______________________________________________________________________________
+2012-02-18 WWW-RobotRules 6.02
+
+Restore perl-5.8.1 compatibility.
+
+
+
+_______________________________________________________________________________
+2011-03-13 WWW-RobotRules 6.01
+
+Added legal notice and updated the meta repository link
+
+
+
+_______________________________________________________________________________
+2011-02-25 WWW-RobotRules 6.00
+
+Initial release of WWW-RobotRules as a separate distribution. There are no code
+changes besides incrementing the version number since libwww-perl-5.837.
+
+The WWW::RobotRules module used to be bundled with the libwww-perl distribution.
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..67c9683
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,10 @@
+Changes
+lib/WWW/RobotRules.pm
+lib/WWW/RobotRules/AnyDBM_File.pm
+Makefile.PL
+MANIFEST This list of files
+README
+t/misc/dbmrobot
+t/rules-dbm.t
+t/rules.t
+META.yml Module meta-data (added by MakeMaker)
diff --git a/META.yml b/META.yml
new file mode 100644
index 0000000..866f160
--- /dev/null
+++ b/META.yml
@@ -0,0 +1,28 @@
+--- #YAML:1.0
+name: WWW-RobotRules
+version: 6.02
+abstract: database of robots.txt-derived permissions
+author:
+ - Gisle Aas <gisle@activestate.com>
+license: perl
+distribution_type: module
+configure_requires:
+ ExtUtils::MakeMaker: 0
+build_requires:
+ ExtUtils::MakeMaker: 0
+requires:
+ AnyDBM_File: 0
+ Fcntl: 0
+ perl: 5.008001
+ URI: 1.10
+resources:
+ MailingList: mailto:libwww@perl.org
+ repository: http://github.com/gisle/www-robotrules
+no_index:
+ directory:
+ - t
+ - inc
+generated_by: ExtUtils::MakeMaker version 6.57_05
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: 1.4
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..f52b375
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,48 @@
+#!perl -w
+
+require 5.008001;
+use strict;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+ NAME => 'WWW::RobotRules',
+ VERSION_FROM => 'lib/WWW/RobotRules.pm',
+ ABSTRACT_FROM => 'lib/WWW/RobotRules.pm',
+ AUTHOR => 'Gisle Aas <gisle@activestate.com>',
+ LICENSE => "perl",
+ MIN_PERL_VERSION => 5.008001,
+ PREREQ_PM => {
+ 'AnyDBM_File' => 0,
+ 'Fcntl' => 0,
+ 'URI' => "1.10",
+ },
+ META_MERGE => {
+ resources => {
+ repository => 'http://github.com/gisle/www-robotrules',
+ MailingList => 'mailto:libwww@perl.org',
+ }
+ },
+);
+
+
+BEGIN {
+ # compatibility with older versions of MakeMaker
+ my $developer = -f ".gitignore";
+ my %mm_req = (
+ LICENCE => 6.31,
+ META_MERGE => 6.45,
+ META_ADD => 6.45,
+ MIN_PERL_VERSION => 6.48,
+ );
+ undef(*WriteMakefile);
+ *WriteMakefile = sub {
+ my %arg = @_;
+ for (keys %mm_req) {
+ unless (eval { ExtUtils::MakeMaker->VERSION($mm_req{$_}) }) {
+ warn "$_ $@" if $developer;
+ delete $arg{$_};
+ }
+ }
+ ExtUtils::MakeMaker::WriteMakefile(%arg);
+ };
+}
diff --git a/README b/README
new file mode 100644
index 0000000..ad48413
--- /dev/null
+++ b/README
@@ -0,0 +1,143 @@
+NAME
+ WWW::RobotRules - database of robots.txt-derived permissions
+
+SYNOPSIS
+ use WWW::RobotRules;
+ my $rules = WWW::RobotRules->new('MOMspider/1.0');
+
+ use LWP::Simple qw(get);
+
+ {
+ my $url = "http://some.place/robots.txt";
+ my $robots_txt = get $url;
+ $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ {
+ my $url = "http://some.other.place/robots.txt";
+ my $robots_txt = get $url;
+ $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ # Now we can check if a URL is valid for those servers
+ # whose "robots.txt" files we've gotten and parsed:
+ if($rules->allowed($url)) {
+ $c = get $url;
+ ...
+ }
+
+DESCRIPTION
+ This module parses /robots.txt files as specified in "A Standard for
+    Robot Exclusion", at <http://www.robotstxt.org/wc/norobots.html>.
+ Webmasters can use the /robots.txt file to forbid conforming robots from
+ accessing parts of their web site.
+
+ The parsed files are kept in a WWW::RobotRules object, and this object
+ provides methods to check if access to a given URL is prohibited. The
+ same WWW::RobotRules object can be used for one or more parsed
+ /robots.txt files on any number of hosts.
+
+ The following methods are provided:
+
+ $rules = WWW::RobotRules->new($robot_name)
+ This is the constructor for WWW::RobotRules objects. The first
+ argument given to new() is the name of the robot.
+
+ $rules->parse($robot_txt_url, $content, $fresh_until)
+ The parse() method takes as arguments the URL that was used to
+ retrieve the /robots.txt file, and the contents of the file.
+
+ $rules->allowed($uri)
+ Returns TRUE if this robot is allowed to retrieve this URL.
+
+ $rules->agent([$name])
+ Get/set the agent name. NOTE: Changing the agent name will clear the
+ robots.txt rules and expire times out of the cache.
+
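+    As a small illustration of how these methods fit together (the host
+    name and the rules below are invented for this example), the
+    robots.txt text can also be handed to parse() as a plain string
+    instead of being fetched over the network:
+
+      use WWW::RobotRules;
+
+      my $rules = WWW::RobotRules->new('ExampleBot/0.1');
+
+      # Pretend this text came from http://example.com/robots.txt
+      my $robots_txt = join "\n",
+          "User-agent: *",
+          "Disallow: /private",
+          "";
+      $rules->parse('http://example.com/robots.txt', $robots_txt);
+
+      print $rules->allowed('http://example.com/private/x'), "\n";   # 0
+      print $rules->allowed('http://example.com/index.html'), "\n";  # 1
+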
+ROBOTS.TXT
+ The format and semantics of the "/robots.txt" file are as follows (this
+ is an edited abstract of <http://www.robotstxt.org/wc/norobots.html>):
+
+ The file consists of one or more records separated by one or more blank
+ lines. Each record contains lines of the form
+
+ <field-name>: <value>
+
+ The field name is case insensitive. Text after the '#' character on a
+ line is ignored during parsing. This is used for comments. The following
+ <field-names> can be used:
+
+ User-Agent
+ The value of this field is the name of the robot the record is
+ describing access policy for. If more than one *User-Agent* field is
+ present the record describes an identical access policy for more than
+ one robot. At least one field needs to be present per record. If the
+ value is '*', the record describes the default access policy for any
+    robot that has not matched any of the other records.
+
+ The *User-Agent* fields must occur before the *Disallow* fields. If a
+ record contains a *User-Agent* field after a *Disallow* field, that
+ constitutes a malformed record. This parser will assume that a blank
+ line should have been placed before that *User-Agent* field, and will
+ break the record into two. All the fields before the *User-Agent*
+ field will constitute a record, and the *User-Agent* field will be
+ the first field in a new record.
+
+ Disallow
+ The value of this field specifies a partial URL that is not to be
+ visited. This can be a full path, or a partial path; any URL that
+    starts with this value will not be retrieved.
+
+ Unrecognized records are ignored.
+
+ROBOTS.TXT EXAMPLES
+ The following example "/robots.txt" file specifies that no robots should
+ visit any URL starting with "/cyberworld/map/" or "/tmp/":
+
+ User-agent: *
+ Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+ Disallow: /tmp/ # these will soon disappear
+
+ This example "/robots.txt" file specifies that no robots should visit
+ any URL starting with "/cyberworld/map/", except the robot called
+ "cybermapper":
+
+ User-agent: *
+ Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+ # Cybermapper knows where to go.
+ User-agent: cybermapper
+ Disallow:
+
+ This example indicates that no robots should visit this site further:
+
+ # go away
+ User-agent: *
+ Disallow: /
+
+ This is an example of a malformed robots.txt file.
+
+ # robots.txt for ancientcastle.example.com
+ # I've locked myself away.
+ User-agent: *
+ Disallow: /
+ # The castle is your home now, so you can go anywhere you like.
+ User-agent: Belle
+ Disallow: /west-wing/ # except the west wing!
+ # It's good to be the Prince...
+ User-agent: Beast
+ Disallow:
+
+ This file is missing the required blank lines between records. However,
+ the intention is clear.
+
+SEE ALSO
+ LWP::RobotUA, WWW::RobotRules::AnyDBM_File
+
+COPYRIGHT
+ Copyright 1995-2009, Gisle Aas
+ Copyright 1995, Martijn Koster
+
+ This library is free software; you can redistribute it and/or modify it
+ under the same terms as Perl itself.
+
diff --git a/lib/WWW/RobotRules.pm b/lib/WWW/RobotRules.pm
new file mode 100644
index 0000000..5dfc453
--- /dev/null
+++ b/lib/WWW/RobotRules.pm
@@ -0,0 +1,453 @@
+package WWW::RobotRules;
+
+$VERSION = "6.02";
+sub Version { $VERSION; }
+
+use strict;
+use URI ();
+
+
+
+sub new {
+ my($class, $ua) = @_;
+
+ # This ugly hack is needed to ensure backwards compatibility.
+ # The "WWW::RobotRules" class is now really abstract.
+ $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";
+
+ my $self = bless { }, $class;
+ $self->agent($ua);
+ $self;
+}
+
+
+sub parse {
+ my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
+ $robot_txt_uri = URI->new("$robot_txt_uri");
+ my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
+
+ $self->clear_rules($netloc);
+ $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
+
+ my $ua;
+ my $is_me = 0; # 1 iff this record is for me
+ my $is_anon = 0; # 1 iff this record is for *
+ my $seen_disallow = 0; # watch for missing record separators
+ my @me_disallowed = (); # rules disallowed for me
+ my @anon_disallowed = (); # rules disallowed for *
+
+ # blank lines are significant, so turn CRLF into LF to avoid generating
+ # false ones
+ $txt =~ s/\015\012/\012/g;
+
+ # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
+ for(split(/[\012\015]/, $txt)) {
+
+ # Lines containing only a comment are discarded completely, and
+ # therefore do not indicate a record boundary.
+ next if /^\s*\#/;
+
+ s/\s*\#.*//; # remove comments at end-of-line
+
+ if (/^\s*$/) { # blank line
+ last if $is_me; # That was our record. No need to read the rest.
+ $is_anon = 0;
+ $seen_disallow = 0;
+ }
+ elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
+ $ua = $1;
+ $ua =~ s/\s+$//;
+
+ if ($seen_disallow) {
+ # treat as start of a new record
+ $seen_disallow = 0;
+ last if $is_me; # That was our record. No need to read the rest.
+ $is_anon = 0;
+ }
+
+ if ($is_me) {
+ # This record already had a User-agent that
+ # we matched, so just continue.
+ }
+ elsif ($ua eq '*') {
+ $is_anon = 1;
+ }
+ elsif($self->is_me($ua)) {
+ $is_me = 1;
+ }
+ }
+ elsif (/^\s*Disallow\s*:\s*(.*)/i) {
+ unless (defined $ua) {
+ warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
+ $is_anon = 1; # assume that User-agent: * was intended
+ }
+ my $disallow = $1;
+ $disallow =~ s/\s+$//;
+ $seen_disallow = 1;
+ if (length $disallow) {
+ my $ignore;
+ eval {
+ my $u = URI->new_abs($disallow, $robot_txt_uri);
+ $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
+ $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
+ $ignore++ if $u->port ne $robot_txt_uri->port;
+ $disallow = $u->path_query;
+ $disallow = "/" unless length $disallow;
+ };
+ next if $@;
+ next if $ignore;
+ }
+
+ if ($is_me) {
+ push(@me_disallowed, $disallow);
+ }
+ elsif ($is_anon) {
+ push(@anon_disallowed, $disallow);
+ }
+ }
+ elsif (/\S\s*:/) {
+ # ignore
+ }
+ else {
+ warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
+ }
+ }
+
+ if ($is_me) {
+ $self->push_rules($netloc, @me_disallowed);
+ }
+ else {
+ $self->push_rules($netloc, @anon_disallowed);
+ }
+}
+
+
+#
+# Returns TRUE if the given name matches the
+# name of this robot
+#
+sub is_me {
+ my($self, $ua_line) = @_;
+ my $me = $self->agent;
+
+    # See whether the name given on the "User-Agent: ..." line
+    # that we were passed is a substring of my short-name:
+
+ if(index(lc($me), lc($ua_line)) >= 0) {
+ return 1;
+ }
+ else {
+ return '';
+ }
+}
+
+
+sub allowed {
+ my($self, $uri) = @_;
+ $uri = URI->new("$uri");
+
+ return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
+ # Robots.txt applies to only those schemes.
+
+ my $netloc = $uri->host . ":" . $uri->port;
+
+ my $fresh_until = $self->fresh_until($netloc);
+ return -1 if !defined($fresh_until) || $fresh_until < time;
+
+ my $str = $uri->path_query;
+ my $rule;
+ for $rule ($self->rules($netloc)) {
+ return 1 unless length $rule;
+ return 0 if index($str, $rule) == 0;
+ }
+ return 1;
+}
+
+
+# The following methods must be provided by the subclass.
+sub agent;
+sub visit;
+sub no_visits;
+sub last_visits;
+sub fresh_until;
+sub push_rules;
+sub clear_rules;
+sub rules;
+sub dump;
+
+
+
+package WWW::RobotRules::InCore;
+
+use vars qw(@ISA);
+@ISA = qw(WWW::RobotRules);
+
+
+
+sub agent {
+ my ($self, $name) = @_;
+ my $old = $self->{'ua'};
+ if ($name) {
+ # Strip it so that it's just the short name.
+ # I.e., "FooBot" => "FooBot"
+ # "FooBot/1.2" => "FooBot"
+ # "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
+
+ $name = $1 if $name =~ m/(\S+)/; # get first word
+ $name =~ s!/.*!!; # get rid of version
+ unless ($old && $old eq $name) {
+ delete $self->{'loc'}; # all old info is now stale
+ $self->{'ua'} = $name;
+ }
+ }
+ $old;
+}
+
+
+sub visit {
+ my($self, $netloc, $time) = @_;
+ return unless $netloc;
+ $time ||= time;
+ $self->{'loc'}{$netloc}{'last'} = $time;
+ my $count = \$self->{'loc'}{$netloc}{'count'};
+ if (!defined $$count) {
+ $$count = 1;
+ }
+ else {
+ $$count++;
+ }
+}
+
+
+sub no_visits {
+ my ($self, $netloc) = @_;
+ $self->{'loc'}{$netloc}{'count'};
+}
+
+
+sub last_visit {
+ my ($self, $netloc) = @_;
+ $self->{'loc'}{$netloc}{'last'};
+}
+
+
+sub fresh_until {
+ my ($self, $netloc, $fresh_until) = @_;
+ my $old = $self->{'loc'}{$netloc}{'fresh'};
+ if (defined $fresh_until) {
+ $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
+ }
+ $old;
+}
+
+
+sub push_rules {
+ my($self, $netloc, @rules) = @_;
+ push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
+}
+
+
+sub clear_rules {
+ my($self, $netloc) = @_;
+ delete $self->{'loc'}{$netloc}{'rules'};
+}
+
+
+sub rules {
+ my($self, $netloc) = @_;
+ if (defined $self->{'loc'}{$netloc}{'rules'}) {
+ return @{$self->{'loc'}{$netloc}{'rules'}};
+ }
+ else {
+ return ();
+ }
+}
+
+
+sub dump
+{
+ my $self = shift;
+ for (keys %$self) {
+ next if $_ eq 'loc';
+ print "$_ = $self->{$_}\n";
+ }
+ for (keys %{$self->{'loc'}}) {
+ my @rules = $self->rules($_);
+ print "$_: ", join("; ", @rules), "\n";
+ }
+}
+
+
+1;
+
+__END__
+
+
+# Bender: "Well, I don't have anything else
+# planned for today. Let's get drunk!"
+
+=head1 NAME
+
+WWW::RobotRules - database of robots.txt-derived permissions
+
+=head1 SYNOPSIS
+
+ use WWW::RobotRules;
+ my $rules = WWW::RobotRules->new('MOMspider/1.0');
+
+ use LWP::Simple qw(get);
+
+ {
+ my $url = "http://some.place/robots.txt";
+ my $robots_txt = get $url;
+ $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ {
+ my $url = "http://some.other.place/robots.txt";
+ my $robots_txt = get $url;
+ $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ # Now we can check if a URL is valid for those servers
+ # whose "robots.txt" files we've gotten and parsed:
+ if($rules->allowed($url)) {
+ $c = get $url;
+ ...
+ }
+
+=head1 DESCRIPTION
+
+This module parses F</robots.txt> files as specified in
+"A Standard for Robot Exclusion", at
+<http://www.robotstxt.org/wc/norobots.html>.
+Webmasters can use the F</robots.txt> file to forbid conforming
+robots from accessing parts of their web site.
+
+The parsed files are kept in a WWW::RobotRules object, and this object
+provides methods to check if access to a given URL is prohibited. The
+same WWW::RobotRules object can be used for one or more parsed
+F</robots.txt> files on any number of hosts.
+
+The following methods are provided:
+
+=over 4
+
+=item $rules = WWW::RobotRules->new($robot_name)
+
+This is the constructor for WWW::RobotRules objects. The first
+argument given to new() is the name of the robot.
+
+=item $rules->parse($robot_txt_url, $content, $fresh_until)
+
+The parse() method takes as arguments the URL that was used to
+retrieve the F</robots.txt> file, and the contents of the file.
+
+=item $rules->allowed($uri)
+
+Returns TRUE if this robot is allowed to retrieve this URL.
+
+=item $rules->agent([$name])
+
+Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
+rules and expire times out of the cache.
+
+=back
+
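+As a hypothetical sketch of the agent() note above (the robot names, host
+and rules are made up for this example), renaming the robot discards the
+rules that were parsed under the old name:
+
+ use WWW::RobotRules;
+
+ my $rules = WWW::RobotRules->new('OldBot/1.0');
+ $rules->parse('http://example.org/robots.txt',
+               "User-agent: *\nDisallow: /private\n");
+
+ print $rules->allowed('http://example.org/private/x'), "\n";  # prints 0
+
+ # Changing the agent name clears the cached rules and expire times,
+ # so any robots.txt files have to be fetched and parsed again.
+ $rules->agent('NewBot/1.0');
+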
+=head1 ROBOTS.TXT
+
+The format and semantics of the "/robots.txt" file are as follows
+(this is an edited abstract of
+<http://www.robotstxt.org/wc/norobots.html>):
+
+The file consists of one or more records separated by one or more
+blank lines. Each record contains lines of the form
+
+ <field-name>: <value>
+
+The field name is case insensitive. Text after the '#' character on a
+line is ignored during parsing. This is used for comments. The
+following <field-names> can be used:
+
+=over 3
+
+=item User-Agent
+
+The value of this field is the name of the robot the record is
+describing access policy for. If more than one I<User-Agent> field is
+present the record describes an identical access policy for more than
+one robot. At least one field needs to be present per record. If the
+value is '*', the record describes the default access policy for any
+robot that has not matched any of the other records.
+
+The I<User-Agent> fields must occur before the I<Disallow> fields. If a
+record contains a I<User-Agent> field after a I<Disallow> field, that
+constitutes a malformed record. This parser will assume that a blank
+line should have been placed before that I<User-Agent> field, and will
+break the record into two. All the fields before the I<User-Agent> field
+will constitute a record, and the I<User-Agent> field will be the first
+field in a new record.
+
+=item Disallow
+
+The value of this field specifies a partial URL that is not to be
+visited. This can be a full path, or a partial path; any URL that
+starts with this value will not be retrieved.
+
+=back
+
+Unrecognized records are ignored.
+
+=head1 ROBOTS.TXT EXAMPLES
+
+The following example "/robots.txt" file specifies that no robots
+should visit any URL starting with "/cyberworld/map/" or "/tmp/":
+
+ User-agent: *
+ Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+ Disallow: /tmp/ # these will soon disappear
+
+This example "/robots.txt" file specifies that no robots should visit
+any URL starting with "/cyberworld/map/", except the robot called
+"cybermapper":
+
+ User-agent: *
+ Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+ # Cybermapper knows where to go.
+ User-agent: cybermapper
+ Disallow:
+
+This example indicates that no robots should visit this site further:
+
+ # go away
+ User-agent: *
+ Disallow: /
+
+This is an example of a malformed robots.txt file.
+
+ # robots.txt for ancientcastle.example.com
+ # I've locked myself away.
+ User-agent: *
+ Disallow: /
+ # The castle is your home now, so you can go anywhere you like.
+ User-agent: Belle
+ Disallow: /west-wing/ # except the west wing!
+ # It's good to be the Prince...
+ User-agent: Beast
+ Disallow:
+
+This file is missing the required blank lines between records.
+However, the intention is clear.
+
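+The bundled test suite (F<t/rules.t>) exercises this same malformed file. A
+quick check along those lines, with the comment lines left out (the parser
+discards them anyway), looks like this:
+
+ use WWW::RobotRules;
+
+ my $txt = join "\n",
+     "User-agent: *",
+     "Disallow: /",
+     "User-agent: Belle",
+     "Disallow: /west-wing/ # except the west wing!",
+     "User-agent: Beast",
+     "Disallow:",
+     "";
+
+ my $rules = WWW::RobotRules->new('Belle/2.0');
+ $rules->parse('http://ancientcastle.example.com/robots.txt', $txt);
+
+ print $rules->allowed('http://ancientcastle.example.com/west-wing/'), "\n"; # 0
+ print $rules->allowed('http://ancientcastle.example.com/'), "\n";           # 1
+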
+=head1 SEE ALSO
+
+L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>
+
+=head1 COPYRIGHT
+
+ Copyright 1995-2009, Gisle Aas
+ Copyright 1995, Martijn Koster
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
diff --git a/lib/WWW/RobotRules/AnyDBM_File.pm b/lib/WWW/RobotRules/AnyDBM_File.pm
new file mode 100644
index 0000000..8daa688
--- /dev/null
+++ b/lib/WWW/RobotRules/AnyDBM_File.pm
@@ -0,0 +1,170 @@
+package WWW::RobotRules::AnyDBM_File;
+
+require WWW::RobotRules;
+@ISA = qw(WWW::RobotRules);
+$VERSION = "6.00";
+
+use Carp ();
+use AnyDBM_File;
+use Fcntl;
+use strict;
+
+=head1 NAME
+
+WWW::RobotRules::AnyDBM_File - Persistent RobotRules
+
+=head1 SYNOPSIS
+
+ require WWW::RobotRules::AnyDBM_File;
+ require LWP::RobotUA;
+
+ # Create a robot useragent that uses a diskcaching RobotRules
+ my $rules = WWW::RobotRules::AnyDBM_File->new( 'my-robot/1.0', 'cachefile' );
+ my $ua = WWW::RobotUA->new( 'my-robot/1.0', 'me@foo.com', $rules );
+
+ # Then just use $ua as usual
+ $res = $ua->request($req);
+
+=head1 DESCRIPTION
+
+This is a subclass of I<WWW::RobotRules> that uses the AnyDBM_File
+package to implement persistent diskcaching of F<robots.txt> and host
+visit information.
+
+The constructor (the new() method) takes an extra argument specifying
+the name of the DBM file to use. If the DBM file already exists, then
+you can specify undef as agent name as the name can be obtained from
+the DBM database.
+
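+For instance (the cache file name here is invented for the example), a
+database created on one run can be reopened later without repeating the
+agent name:
+
+ require WWW::RobotRules::AnyDBM_File;
+
+ # First run: create the cache; the agent name is stored in the DBM file.
+ my $rules = WWW::RobotRules::AnyDBM_File->new('my-robot/1.0', 'robotcache');
+ undef $rules;    # drop the object so the DBM file gets untied
+
+ # Later runs: pass undef and the stored agent name is reused.
+ $rules = WWW::RobotRules::AnyDBM_File->new(undef, 'robotcache');
+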
+=cut
+
+sub new
+{
+ my ($class, $ua, $file) = @_;
+ Carp::croak('WWW::RobotRules::AnyDBM_File filename required') unless $file;
+
+ my $self = bless { }, $class;
+ $self->{'filename'} = $file;
+ tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_CREAT|O_RDWR, 0640
+ or Carp::croak("Can't open $file: $!");
+
+ if ($ua) {
+ $self->agent($ua);
+ }
+ else {
+ # Try to obtain name from DBM file
+ $ua = $self->{'dbm'}{"|ua-name|"};
+ Carp::croak("No agent name specified") unless $ua;
+ }
+
+ $self;
+}
+
+sub agent {
+ my($self, $newname) = @_;
+ my $old = $self->{'dbm'}{"|ua-name|"};
+ if (defined $newname) {
+	$newname =~ s!/?\s*\d+\.\d+\s*$!!; # lose the trailing version number
+ unless ($old && $old eq $newname) {
+ # Old info is now stale.
+ my $file = $self->{'filename'};
+ untie %{$self->{'dbm'}};
+ tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_TRUNC|O_RDWR, 0640;
+ %{$self->{'dbm'}} = ();
+ $self->{'dbm'}{"|ua-name|"} = $newname;
+ }
+ }
+ $old;
+}
+
+sub no_visits {
+ my ($self, $netloc) = @_;
+ my $t = $self->{'dbm'}{"$netloc|vis"};
+ return 0 unless $t;
+ (split(/;\s*/, $t))[0];
+}
+
+sub last_visit {
+ my ($self, $netloc) = @_;
+ my $t = $self->{'dbm'}{"$netloc|vis"};
+ return undef unless $t;
+ (split(/;\s*/, $t))[1];
+}
+
+sub fresh_until {
+ my ($self, $netloc, $fresh) = @_;
+ my $old = $self->{'dbm'}{"$netloc|exp"};
+ if ($old) {
+ $old =~ s/;.*//; # remove cleartext
+ }
+ if (defined $fresh) {
+ $fresh .= "; " . localtime($fresh);
+ $self->{'dbm'}{"$netloc|exp"} = $fresh;
+ }
+ $old;
+}
+
+sub visit {
+ my($self, $netloc, $time) = @_;
+ $time ||= time;
+
+ my $count = 0;
+ my $old = $self->{'dbm'}{"$netloc|vis"};
+ if ($old) {
+ my $last;
+ ($count,$last) = split(/;\s*/, $old);
+ $time = $last if $last > $time;
+ }
+ $count++;
+ $self->{'dbm'}{"$netloc|vis"} = "$count; $time; " . localtime($time);
+}
+
+sub push_rules {
+ my($self, $netloc, @rules) = @_;
+ my $cnt = 1;
+ $cnt++ while $self->{'dbm'}{"$netloc|r$cnt"};
+
+ foreach (@rules) {
+ $self->{'dbm'}{"$netloc|r$cnt"} = $_;
+ $cnt++;
+ }
+}
+
+sub clear_rules {
+ my($self, $netloc) = @_;
+ my $cnt = 1;
+ while ($self->{'dbm'}{"$netloc|r$cnt"}) {
+ delete $self->{'dbm'}{"$netloc|r$cnt"};
+ $cnt++;
+ }
+}
+
+sub rules {
+ my($self, $netloc) = @_;
+ my @rules = ();
+ my $cnt = 1;
+ while (1) {
+ my $rule = $self->{'dbm'}{"$netloc|r$cnt"};
+ last unless $rule;
+ push(@rules, $rule);
+ $cnt++;
+ }
+ @rules;
+}
+
+sub dump
+{
+}
+
+1;
+
+=head1 SEE ALSO
+
+L<WWW::RobotRules>, L<LWP::RobotUA>
+
+=head1 AUTHORS
+
+Hakan Ardo E<lt>hakan@munin.ub2.lu.seE<gt>, Gisle Aas E<lt>aas@sn.noE<gt>
+
+=cut
+
diff --git a/t/misc/dbmrobot b/t/misc/dbmrobot
new file mode 100755
index 0000000..c1794c4
--- /dev/null
+++ b/t/misc/dbmrobot
@@ -0,0 +1,23 @@
+#!/local/perl/bin/perl -w
+
+use URI::URL;
+$url = url(shift) || die "Usage: $0 <url>\n";
+
+require WWW::RobotRules::AnyDBM_File;
+require LWP::RobotUA;
+
+$botname = "Spider/0.1";
+
+$rules = new WWW::RobotRules::AnyDBM_File $botname, 'robotdb';
+$ua = new LWP::RobotUA $botname, 'gisle@aas.no', $rules;
+$ua->delay(0.1);
+
+my $req = new HTTP::Request GET => $url;
+
+my $res = $ua->request($req);
+print "Got ", $res->code, " ", $res->message, "(", $res->content_type, ")\n";
+
+my $netloc = $url->netloc;
+print "This was visit no ", $ua->no_visits($netloc), " to $netloc\n";
+
+
diff --git a/t/rules-dbm.t b/t/rules-dbm.t
new file mode 100644
index 0000000..2335b94
--- /dev/null
+++ b/t/rules-dbm.t
@@ -0,0 +1,128 @@
+
+print "1..13\n";
+
+
+use WWW::RobotRules::AnyDBM_File;
+
+$file = "test-$$";
+
+$r = new WWW::RobotRules::AnyDBM_File "myrobot/2.0", $file;
+
+$r->parse("http://www.aas.no/robots.txt", "");
+
+$r->visit("www.aas.no:80");
+
+print "not " if $r->no_visits("www.aas.no:80") != 1;
+print "ok 1\n";
+
+
+$r->push_rules("www.sn.no:80", "/aas", "/per");
+$r->push_rules("www.sn.no:80", "/god", "/old");
+
+@r = $r->rules("www.sn.no:80");
+print "Rules: @r\n";
+
+print "not " if "@r" ne "/aas /per /god /old";
+print "ok 2\n";
+
+$r->clear_rules("per");
+$r->clear_rules("www.sn.no:80");
+
+@r = $r->rules("www.sn.no:80");
+print "Rules: @r\n";
+
+print "not " if "@r" ne "";
+print "ok 3\n";
+
+$r->visit("www.aas.no:80", time+10);
+$r->visit("www.sn.no:80");
+
+print "No visits: ", $r->no_visits("www.aas.no:80"), "\n";
+print "Last visit: ", $r->last_visit("www.aas.no:80"), "\n";
+print "Fresh until: ", $r->fresh_until("www.aas.no:80"), "\n";
+
+print "not " if $r->no_visits("www.aas.no:80") != 2;
+print "ok 4\n";
+
+print "not " if abs($r->last_visit("www.sn.no:80") - time) > 2;
+print "ok 5\n";
+
+$r = undef;
+
+# Try to reopen the database without a name specified
+$r = new WWW::RobotRules::AnyDBM_File undef, $file;
+$r->visit("www.aas.no:80");
+
+print "not " if $r->no_visits("www.aas.no:80") != 3;
+print "ok 6\n";
+
+print "Agent-Name: ", $r->agent, "\n";
+print "not " if $r->agent ne "myrobot";
+print "ok 7\n";
+
+$r = undef;
+
+print "*** Dump of database ***\n";
+tie(%cat, AnyDBM_File, $file, 0, 0644) or die "Can't tie: $!";
+while (($key,$val) = each(%cat)) {
+ print "$key\t$val\n";
+}
+print "******\n";
+
+untie %cat;
+
+# Try to open database with a different agent name
+$r = new WWW::RobotRules::AnyDBM_File "MOMSpider/2.0", $file;
+
+print "not " if $r->no_visits("www.sn.no:80");
+print "ok 8\n";
+
+# Try parsing
+$r->parse("http://www.sn.no:8080/robots.txt", <<EOT, (time + 3));
+
+User-Agent: *
+Disallow: /
+
+User-Agent: Momspider
+Disallow: /foo
+Disallow: /bar
+
+EOT
+
+@r = $r->rules("www.sn.no:8080");
+print "not " if "@r" ne "/foo /bar";
+print "ok 9\n";
+
+print "not " if $r->allowed("http://www.sn.no") >= 0;
+print "ok 10\n";
+
+print "not " if $r->allowed("http://www.sn.no:8080/foo/gisle");
+print "ok 11\n";
+
+sleep(5); # wait until file has expired
+print "not " if $r->allowed("http://www.sn.no:8080/foo/gisle") >= 0;
+print "ok 12\n";
+
+
+$r = undef;
+
+print "*** Dump of database ***\n";
+tie(%cat, AnyDBM_File, $file, 0, 0644) or die "Can't tie: $!";
+while (($key,$val) = each(%cat)) {
+ print "$key\t$val\n";
+}
+print "******\n";
+
+untie %cat; # Otherwise the next line fails on DOSish
+
+while (unlink("$file", "$file.pag", "$file.dir", "$file.db")) {}
+
+# Try to open an empty database without specifying a name
+eval {
+ $r = new WWW::RobotRules::AnyDBM_File undef, $file;
+};
+print $@;
+print "not " unless $@; # should fail
+print "ok 13\n";
+
+unlink "$file", "$file.pag", "$file.dir", "$file.db";
diff --git a/t/rules.t b/t/rules.t
new file mode 100644
index 0000000..26b1025
--- /dev/null
+++ b/t/rules.t
@@ -0,0 +1,230 @@
+#!/local/bin/perl
+
+=head1 NAME
+
+robot-rules.t
+
+=head1 DESCRIPTION
+
+Test a number of different F</robots.txt> files against a number
+of different User-agents.
+
+=cut
+
+require WWW::RobotRules;
+use Carp;
+use strict;
+
+print "1..50\n"; # for Test::Harness
+
+# We test a number of different /robots.txt files,
+#
+
+my $content1 = <<EOM;
+# http://foo/robots.txt
+User-agent: *
+Disallow: /private
+Disallow: http://foo/also_private
+
+User-agent: MOMspider
+Disallow:
+EOM
+
+my $content2 = <<EOM;
+# http://foo/robots.txt
+User-agent: MOMspider
+ # comment which should be ignored
+Disallow: /private
+EOM
+
+my $content3 = <<EOM;
+# http://foo/robots.txt
+EOM
+
+my $content4 = <<EOM;
+# http://foo/robots.txt
+User-agent: *
+Disallow: /private
+Disallow: mailto:foo
+
+User-agent: MOMspider
+Disallow: /this
+
+User-agent: Another
+Disallow: /that
+
+
+User-agent: SvartEnke1
+Disallow: http://fOO
+Disallow: http://bar
+
+User-Agent: SvartEnke2
+Disallow: ftp://foo
+Disallow: http://foo:8080/
+Disallow: http://bar/
+
+Sitemap: http://www.adobe.com/sitemap.xml
+EOM
+
+my $content5 = <<EOM;
+# I've locked myself away
+User-agent: *
+Disallow: /
+# The castle is your home now, so you can go anywhere you like.
+User-agent: Belle
+Disallow: /west-wing/ # except the west wing!
+# It's good to be the Prince...
+User-agent: Beast
+Disallow:
+EOM
+
+# same thing backwards
+my $content6 = <<EOM;
+# It's good to be the Prince...
+User-agent: Beast
+Disallow:
+# The castle is your home now, so you can go anywhere you like.
+User-agent: Belle
+Disallow: /west-wing/ # except the west wing!
+# I've locked myself away
+User-agent: *
+Disallow: /
+EOM
+
+# and a number of different robots:
+
+my @tests1 = (
+ [$content1, 'MOMspider' =>
+ 1 => 'http://foo/private' => 1,
+ 2 => 'http://foo/also_private' => 1,
+ ],
+
+ [$content1, 'Wubble' =>
+ 3 => 'http://foo/private' => 0,
+ 4 => 'http://foo/also_private' => 0,
+ 5 => 'http://foo/other' => 1,
+ ],
+
+ [$content2, 'MOMspider' =>
+ 6 => 'http://foo/private' => 0,
+ 7 => 'http://foo/other' => 1,
+ ],
+
+ [$content2, 'Wubble' =>
+ 8 => 'http://foo/private' => 1,
+ 9 => 'http://foo/also_private' => 1,
+ 10 => 'http://foo/other' => 1,
+ ],
+
+ [$content3, 'MOMspider' =>
+ 11 => 'http://foo/private' => 1,
+ 12 => 'http://foo/other' => 1,
+ ],
+
+ [$content3, 'Wubble' =>
+ 13 => 'http://foo/private' => 1,
+ 14 => 'http://foo/other' => 1,
+ ],
+
+ [$content4, 'MOMspider' =>
+ 15 => 'http://foo/private' => 1,
+ 16 => 'http://foo/this' => 0,
+ 17 => 'http://foo/that' => 1,
+ ],
+
+ [$content4, 'Another' =>
+ 18 => 'http://foo/private' => 1,
+ 19 => 'http://foo/this' => 1,
+ 20 => 'http://foo/that' => 0,
+ ],
+
+ [$content4, 'Wubble' =>
+ 21 => 'http://foo/private' => 0,
+ 22 => 'http://foo/this' => 1,
+ 23 => 'http://foo/that' => 1,
+ ],
+
+ [$content4, 'Another/1.0' =>
+ 24 => 'http://foo/private' => 1,
+ 25 => 'http://foo/this' => 1,
+ 26 => 'http://foo/that' => 0,
+ ],
+
+ [$content4, "SvartEnke1" =>
+ 27 => "http://foo/" => 0,
+ 28 => "http://foo/this" => 0,
+ 29 => "http://bar/" => 1,
+ ],
+
+ [$content4, "SvartEnke2" =>
+ 30 => "http://foo/" => 1,
+ 31 => "http://foo/this" => 1,
+ 32 => "http://bar/" => 1,
+ ],
+
+ [$content4, "MomSpiderJr" => # should match "MomSpider"
+ 33 => 'http://foo/private' => 1,
+ 34 => 'http://foo/also_private' => 1,
+ 35 => 'http://foo/this/' => 0,
+ ],
+
+ [$content4, "SvartEnk" => # should match "*"
+ 36 => "http://foo/" => 1,
+ 37 => "http://foo/private/" => 0,
+ 38 => "http://bar/" => 1,
+ ],
+
+ [$content5, 'Villager/1.0' =>
+ 39 => 'http://foo/west-wing/' => 0,
+ 40 => 'http://foo/' => 0,
+ ],
+
+ [$content5, 'Belle/2.0' =>
+ 41 => 'http://foo/west-wing/' => 0,
+ 42 => 'http://foo/' => 1,
+ ],
+
+ [$content5, 'Beast/3.0' =>
+ 43 => 'http://foo/west-wing/' => 1,
+ 44 => 'http://foo/' => 1,
+ ],
+
+ [$content6, 'Villager/1.0' =>
+ 45 => 'http://foo/west-wing/' => 0,
+ 46 => 'http://foo/' => 0,
+ ],
+
+ [$content6, 'Belle/2.0' =>
+ 47 => 'http://foo/west-wing/' => 0,
+ 48 => 'http://foo/' => 1,
+ ],
+
+ [$content6, 'Beast/3.0' =>
+ 49 => 'http://foo/west-wing/' => 1,
+ 50 => 'http://foo/' => 1,
+ ],
+
+ # when adding tests, remember to increase
+ # the maximum at the top
+
+ );
+
+my $t;
+
+for $t (@tests1) {
+ my ($content, $ua) = splice(@$t, 0, 2);
+
+ my $robotsrules = new WWW::RobotRules($ua);
+ $robotsrules->parse('http://foo/robots.txt', $content);
+
+ my ($num, $path, $expected);
+ while(($num, $path, $expected) = splice(@$t, 0, 3)) {
+ my $allowed = $robotsrules->allowed($path);
+ $allowed = 1 if $allowed;
+ if($allowed != $expected) {
+ $robotsrules->dump;
+ confess "Test Failed: $ua => $path ($allowed != $expected)";
+ }
+ print "ok $num\n";
+ }
+}