author:     Lorry Tar Creator <lorry-tar-importer@lorry>   2012-02-18 13:07:29 +0000
committer:  Lorry Tar Creator <lorry-tar-importer@lorry>   2012-02-18 13:07:29 +0000
commit:     80c86924c3437c0ad64852ea29c7eab1197dfe90 (patch)
tree:       11f2945e6c83ffc5d3fec7519d4cb4e9c7194ff5
download:   WWW-RobotRules-tarball-master.tar.gz
refs:       WWW-RobotRules-6.02 (tag), HEAD, master
-rw-r--r--   Changes                              21
-rw-r--r--   MANIFEST                             10
-rw-r--r--   META.yml                             28
-rw-r--r--   Makefile.PL                          48
-rw-r--r--   README                              143
-rw-r--r--   lib/WWW/RobotRules.pm               453
-rw-r--r--   lib/WWW/RobotRules/AnyDBM_File.pm   170
-rwxr-xr-x   t/misc/dbmrobot                      23
-rw-r--r--   t/rules-dbm.t                       128
-rw-r--r--   t/rules.t                           230

10 files changed, 1254 insertions, 0 deletions
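Before the file-by-file diff, a brief orientation to how the imported modules fit together. The sketch below is editorial and illustrative only: it follows the fetch-and-parse pattern from the README SYNOPSIS further down, and the bot name and host are placeholders, not anything shipped in this import.

    #!/usr/bin/perl
    use strict;
    use warnings;

    use WWW::RobotRules;        # rule database (lib/WWW/RobotRules.pm below)
    use LWP::Simple qw(get);    # simple fetcher, as used in the SYNOPSIS

    # Placeholder robot name; the first word ("MyBot") is what gets matched
    # against User-Agent lines in robots.txt.
    my $rules = WWW::RobotRules->new('MyBot/1.0');

    # Placeholder host: fetch its robots.txt and record the rules.
    my $robots_url = "http://example.com/robots.txt";
    my $robots_txt = get $robots_url;
    $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

    # allowed() consults the rules recorded for that host:port.
    my $page = "http://example.com/private/page.html";
    print $rules->allowed($page) ? "may fetch $page\n" : "must skip $page\n";

In practice this is usually wired up through LWP::RobotUA (see t/misc/dbmrobot below), which fetches and caches robots.txt itself and enforces a request delay.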
diff --git a/Changes b/Changes
new file mode 100644
--- /dev/null
+++ b/Changes
@@ -0,0 +1,21 @@
+_______________________________________________________________________________
+2012-02-18 WWW-RobotRules 6.02
+
+Restore perl-5.8.1 compatibility.
+
+
+
+_______________________________________________________________________________
+2011-03-13 WWW-RobotRules 6.01
+
+Added legal notice and updated the meta repository link
+
+
+
+_______________________________________________________________________________
+2011-02-25 WWW-RobotRules 6.00
+
+Initial release of WWW-RobotRules as a separate distribution. There are no code
+changes besides incrementing the version number since libwww-perl-5.837.
+
+The WWW::RobotRules module used to be bundled with the libwww-perl distribution.
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..67c9683
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,10 @@
+Changes
+lib/WWW/RobotRules.pm
+lib/WWW/RobotRules/AnyDBM_File.pm
+Makefile.PL
+MANIFEST		This list of files
+README
+t/misc/dbmrobot
+t/rules-dbm.t
+t/rules.t
+META.yml		Module meta-data (added by MakeMaker)
diff --git a/META.yml b/META.yml
new file mode 100644
index 0000000..866f160
--- /dev/null
+++ b/META.yml
@@ -0,0 +1,28 @@
+--- #YAML:1.0
+name: WWW-RobotRules
+version: 6.02
+abstract: database of robots.txt-derived permissions
+author:
+    - Gisle Aas <gisle@activestate.com>
+license: perl
+distribution_type: module
+configure_requires:
+    ExtUtils::MakeMaker: 0
+build_requires:
+    ExtUtils::MakeMaker: 0
+requires:
+    AnyDBM_File: 0
+    Fcntl: 0
+    perl: 5.008001
+    URI: 1.10
+resources:
+    MailingList: mailto:libwww@perl.org
+    repository: http://github.com/gisle/www-robotrules
+no_index:
+    directory:
+        - t
+        - inc
+generated_by: ExtUtils::MakeMaker version 6.57_05
+meta-spec:
+    url: http://module-build.sourceforge.net/META-spec-v1.4.html
+    version: 1.4
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..f52b375
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,48 @@
+#!perl -w
+
+require 5.008001;
+use strict;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+    NAME => 'WWW::RobotRules',
+    VERSION_FROM => 'lib/WWW/RobotRules.pm',
+    ABSTRACT_FROM => 'lib/WWW/RobotRules.pm',
+    AUTHOR => 'Gisle Aas <gisle@activestate.com>',
+    LICENSE => "perl",
+    MIN_PERL_VERSION => 5.008001,
+    PREREQ_PM => {
+        'AnyDBM_File' => 0,
+        'Fcntl' => 0,
+        'URI' => "1.10",
+    },
+    META_MERGE => {
+        resources => {
+            repository => 'http://github.com/gisle/www-robotrules',
+            MailingList => 'mailto:libwww@perl.org',
+        }
+    },
+);
+
+
+BEGIN {
+    # compatibility with older versions of MakeMaker
+    my $developer = -f ".gitignore";
+    my %mm_req = (
+        LICENCE => 6.31,
+        META_MERGE => 6.45,
+        META_ADD => 6.45,
+        MIN_PERL_VERSION => 6.48,
+    );
+    undef(*WriteMakefile);
+    *WriteMakefile = sub {
+        my %arg = @_;
+        for (keys %mm_req) {
+            unless (eval { ExtUtils::MakeMaker->VERSION($mm_req{$_}) }) {
+                warn "$_ $@" if $developer;
+                delete $arg{$_};
+            }
+        }
+        ExtUtils::MakeMaker::WriteMakefile(%arg);
+    };
+}
diff --git a/README b/README
new file mode 100644
--- /dev/null
+++ b/README
@@ -0,0 +1,143 @@
+NAME
+    WWW::RobotRules - database of robots.txt-derived permissions
+
+SYNOPSIS
+     use WWW::RobotRules;
+     my $rules = WWW::RobotRules->new('MOMspider/1.0');
+
+     use LWP::Simple qw(get);
+
+     {
+       my $url = "http://some.place/robots.txt";
+       my $robots_txt = get $url;
+       $rules->parse($url, $robots_txt) if defined $robots_txt;
+     }
+
+     {
+       my $url = "http://some.other.place/robots.txt";
+       my $robots_txt = get $url;
+       $rules->parse($url, $robots_txt) if defined $robots_txt;
+     }
+
+     # Now we can check if a URL is valid for those servers
+     # whose "robots.txt" files we've gotten and parsed:
+     if($rules->allowed($url)) {
+         $c = get $url;
+         ...
+     }
+
+DESCRIPTION
+    This module parses /robots.txt files as specified in "A Standard for
+    Robot Exclusion", at <http://www.robotstxt.org/wc/norobots.html>.
+    Webmasters can use the /robots.txt file to forbid conforming robots from
+    accessing parts of their web site.
+
+    The parsed files are kept in a WWW::RobotRules object, and this object
+    provides methods to check if access to a given URL is prohibited. The
+    same WWW::RobotRules object can be used for one or more parsed
+    /robots.txt files on any number of hosts.
+
+    The following methods are provided:
+
+    $rules = WWW::RobotRules->new($robot_name)
+        This is the constructor for WWW::RobotRules objects. The first
+        argument given to new() is the name of the robot.
+
+    $rules->parse($robot_txt_url, $content, $fresh_until)
+        The parse() method takes as arguments the URL that was used to
+        retrieve the /robots.txt file, and the contents of the file.
+
+    $rules->allowed($uri)
+        Returns TRUE if this robot is allowed to retrieve this URL.
+
+    $rules->agent([$name])
+        Get/set the agent name. NOTE: Changing the agent name will clear the
+        robots.txt rules and expire times out of the cache.
+
+ROBOTS.TXT
+    The format and semantics of the "/robots.txt" file are as follows (this
+    is an edited abstract of <http://www.robotstxt.org/wc/norobots.html>):
+
+    The file consists of one or more records separated by one or more blank
+    lines. Each record contains lines of the form
+
+      <field-name>: <value>
+
+    The field name is case insensitive. Text after the '#' character on a
+    line is ignored during parsing. This is used for comments. The following
+    <field-names> can be used:
+
+    User-Agent
+        The value of this field is the name of the robot the record is
+        describing access policy for. If more than one *User-Agent* field is
+        present the record describes an identical access policy for more than
+        one robot. At least one field needs to be present per record. If the
+        value is '*', the record describes the default access policy for any
+        robot that has not matched any of the other records.
+
+        The *User-Agent* fields must occur before the *Disallow* fields. If a
+        record contains a *User-Agent* field after a *Disallow* field, that
+        constitutes a malformed record. This parser will assume that a blank
+        line should have been placed before that *User-Agent* field, and will
+        break the record into two. All the fields before the *User-Agent*
+        field will constitute a record, and the *User-Agent* field will be
+        the first field in a new record.
+
+    Disallow
+        The value of this field specifies a partial URL that is not to be
+        visited. This can be a full path, or a partial path; any URL that
+        starts with this value will not be retrieved.
+
+    Unrecognized records are ignored.
+
+ROBOTS.TXT EXAMPLES
+    The following example "/robots.txt" file specifies that no robots should
+    visit any URL starting with "/cyberworld/map/" or "/tmp/":
+
+      User-agent: *
+      Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+      Disallow: /tmp/ # these will soon disappear
+
+    This example "/robots.txt" file specifies that no robots should visit
+    any URL starting with "/cyberworld/map/", except the robot called
+    "cybermapper":
+
+      User-agent: *
+      Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+      # Cybermapper knows where to go.
+      User-agent: cybermapper
+      Disallow:
+
+    This example indicates that no robots should visit this site further:
+
+      # go away
+      User-agent: *
+      Disallow: /
+
+    This is an example of a malformed robots.txt file.
+
+      # robots.txt for ancientcastle.example.com
+      # I've locked myself away.
+      User-agent: *
+      Disallow: /
+      # The castle is your home now, so you can go anywhere you like.
+      User-agent: Belle
+      Disallow: /west-wing/ # except the west wing!
+      # It's good to be the Prince...
+      User-agent: Beast
+      Disallow:
+
+    This file is missing the required blank lines between records. However,
+    the intention is clear.
+
+SEE ALSO
+    LWP::RobotUA, WWW::RobotRules::AnyDBM_File
+
+COPYRIGHT
+      Copyright 1995-2009, Gisle Aas
+      Copyright 1995, Martijn Koster
+
+    This library is free software; you can redistribute it and/or modify it
+    under the same terms as Perl itself.
+
diff --git a/lib/WWW/RobotRules.pm b/lib/WWW/RobotRules.pm
new file mode 100644
index 0000000..5dfc453
--- /dev/null
+++ b/lib/WWW/RobotRules.pm
@@ -0,0 +1,453 @@
+package WWW::RobotRules;
+
+$VERSION = "6.02";
+sub Version { $VERSION; }
+
+use strict;
+use URI ();
+
+
+
+sub new {
+    my($class, $ua) = @_;
+
+    # This ugly hack is needed to ensure backwards compatibility.
+    # The "WWW::RobotRules" class is now really abstract.
+    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";
+
+    my $self = bless { }, $class;
+    $self->agent($ua);
+    $self;
+}
+
+
+sub parse {
+    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
+    $robot_txt_uri = URI->new("$robot_txt_uri");
+    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
+
+    $self->clear_rules($netloc);
+    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
+
+    my $ua;
+    my $is_me = 0;              # 1 iff this record is for me
+    my $is_anon = 0;            # 1 iff this record is for *
+    my $seen_disallow = 0;      # watch for missing record separators
+    my @me_disallowed = ();     # rules disallowed for me
+    my @anon_disallowed = ();   # rules disallowed for *
+
+    # blank lines are significant, so turn CRLF into LF to avoid generating
+    # false ones
+    $txt =~ s/\015\012/\012/g;
+
+    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
+    for(split(/[\012\015]/, $txt)) {
+
+        # Lines containing only a comment are discarded completely, and
+        # therefore do not indicate a record boundary.
+        next if /^\s*\#/;
+
+        s/\s*\#.*//;        # remove comments at end-of-line
+
+        if (/^\s*$/) {      # blank line
+            last if $is_me; # That was our record. No need to read the rest.
+            $is_anon = 0;
+            $seen_disallow = 0;
+        }
+        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
+            $ua = $1;
+            $ua =~ s/\s+$//;
+
+            if ($seen_disallow) {
+                # treat as start of a new record
+                $seen_disallow = 0;
+                last if $is_me; # That was our record. No need to read the rest.
+                $is_anon = 0;
+            }
+
+            if ($is_me) {
+                # This record already had a User-agent that
+                # we matched, so just continue.
+            }
+            elsif ($ua eq '*') {
+                $is_anon = 1;
+            }
+            elsif($self->is_me($ua)) {
+                $is_me = 1;
+            }
+        }
+        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
+            unless (defined $ua) {
+                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
+                $is_anon = 1;  # assume that User-agent: * was intended
+            }
+            my $disallow = $1;
+            $disallow =~ s/\s+$//;
+            $seen_disallow = 1;
+            if (length $disallow) {
+                my $ignore;
+                eval {
+                    my $u = URI->new_abs($disallow, $robot_txt_uri);
+                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
+                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
+                    $ignore++ if $u->port ne $robot_txt_uri->port;
+                    $disallow = $u->path_query;
+                    $disallow = "/" unless length $disallow;
+                };
+                next if $@;
+                next if $ignore;
+            }
+
+            if ($is_me) {
+                push(@me_disallowed, $disallow);
+            }
+            elsif ($is_anon) {
+                push(@anon_disallowed, $disallow);
+            }
+        }
+        elsif (/\S\s*:/) {
+            # ignore
+        }
+        else {
+            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
+        }
+    }
+
+    if ($is_me) {
+        $self->push_rules($netloc, @me_disallowed);
+    }
+    else {
+        $self->push_rules($netloc, @anon_disallowed);
+    }
+}
+
+
+#
+# Returns TRUE if the given name matches the
+# name of this robot
+#
+sub is_me {
+    my($self, $ua_line) = @_;
+    my $me = $self->agent;
+
+    # See whether my short-name is a substring of the
+    # "User-Agent: ..." line that we were passed:
+
+    if(index(lc($me), lc($ua_line)) >= 0) {
+        return 1;
+    }
+    else {
+        return '';
+    }
+}
+
+
+sub allowed {
+    my($self, $uri) = @_;
+    $uri = URI->new("$uri");
+
+    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
+     # Robots.txt applies to only those schemes.
+
+    my $netloc = $uri->host . ":" . $uri->port;
+
+    my $fresh_until = $self->fresh_until($netloc);
+    return -1 if !defined($fresh_until) || $fresh_until < time;
+
+    my $str = $uri->path_query;
+    my $rule;
+    for $rule ($self->rules($netloc)) {
+        return 1 unless length $rule;
+        return 0 if index($str, $rule) == 0;
+    }
+    return 1;
+}
+
+
+# The following methods must be provided by the subclass.
+sub agent;
+sub visit;
+sub no_visits;
+sub last_visits;
+sub fresh_until;
+sub push_rules;
+sub clear_rules;
+sub rules;
+sub dump;
+
+
+
+package WWW::RobotRules::InCore;
+
+use vars qw(@ISA);
+@ISA = qw(WWW::RobotRules);
+
+
+
+sub agent {
+    my ($self, $name) = @_;
+    my $old = $self->{'ua'};
+    if ($name) {
+        # Strip it so that it's just the short name.
+        # I.e., "FooBot"                                      => "FooBot"
+        #       "FooBot/1.2"                                  => "FooBot"
+        #       "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
+
+        $name = $1 if $name =~ m/(\S+)/; # get first word
+        $name =~ s!/.*!!;                # get rid of version
+        unless ($old && $old eq $name) {
+            delete $self->{'loc'}; # all old info is now stale
+            $self->{'ua'} = $name;
+        }
+    }
+    $old;
+}
+
+
+sub visit {
+    my($self, $netloc, $time) = @_;
+    return unless $netloc;
+    $time ||= time;
+    $self->{'loc'}{$netloc}{'last'} = $time;
+    my $count = \$self->{'loc'}{$netloc}{'count'};
+    if (!defined $$count) {
+        $$count = 1;
+    }
+    else {
+        $$count++;
+    }
+}
+
+
+sub no_visits {
+    my ($self, $netloc) = @_;
+    $self->{'loc'}{$netloc}{'count'};
+}
+
+
+sub last_visit {
+    my ($self, $netloc) = @_;
+    $self->{'loc'}{$netloc}{'last'};
+}
+
+
+sub fresh_until {
+    my ($self, $netloc, $fresh_until) = @_;
+    my $old = $self->{'loc'}{$netloc}{'fresh'};
+    if (defined $fresh_until) {
+        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
+    }
+    $old;
+}
+
+
+sub push_rules {
+    my($self, $netloc, @rules) = @_;
+    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
+}
+
+
+sub clear_rules {
+    my($self, $netloc) = @_;
+    delete $self->{'loc'}{$netloc}{'rules'};
+}
+
+
+sub rules {
+    my($self, $netloc) = @_;
+    if (defined $self->{'loc'}{$netloc}{'rules'}) {
+        return @{$self->{'loc'}{$netloc}{'rules'}};
+    }
+    else {
+        return ();
+    }
+}
+
+
+sub dump
+{
+    my $self = shift;
+    for (keys %$self) {
+        next if $_ eq 'loc';
+        print "$_ = $self->{$_}\n";
+    }
+    for (keys %{$self->{'loc'}}) {
+        my @rules = $self->rules($_);
+        print "$_: ", join("; ", @rules), "\n";
+    }
+}
+
+
+1;
+
+__END__
+
+
+# Bender: "Well, I don't have anything else
+#         planned for today. Let's get drunk!"
+
+=head1 NAME
+
+WWW::RobotRules - database of robots.txt-derived permissions
+
+=head1 SYNOPSIS
+
+ use WWW::RobotRules;
+ my $rules = WWW::RobotRules->new('MOMspider/1.0');
+
+ use LWP::Simple qw(get);
+
+ {
+   my $url = "http://some.place/robots.txt";
+   my $robots_txt = get $url;
+   $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ {
+   my $url = "http://some.other.place/robots.txt";
+   my $robots_txt = get $url;
+   $rules->parse($url, $robots_txt) if defined $robots_txt;
+ }
+
+ # Now we can check if a URL is valid for those servers
+ # whose "robots.txt" files we've gotten and parsed:
+ if($rules->allowed($url)) {
+     $c = get $url;
+     ...
+ }
+
+=head1 DESCRIPTION
+
+This module parses F</robots.txt> files as specified in
+"A Standard for Robot Exclusion", at
+<http://www.robotstxt.org/wc/norobots.html>.
+Webmasters can use the F</robots.txt> file to forbid conforming
+robots from accessing parts of their web site.
+
+The parsed files are kept in a WWW::RobotRules object, and this object
+provides methods to check if access to a given URL is prohibited. The
+same WWW::RobotRules object can be used for one or more parsed
+F</robots.txt> files on any number of hosts.
+
+The following methods are provided:
+
+=over 4
+
+=item $rules = WWW::RobotRules->new($robot_name)
+
+This is the constructor for WWW::RobotRules objects. The first
+argument given to new() is the name of the robot.
+
+=item $rules->parse($robot_txt_url, $content, $fresh_until)
+
+The parse() method takes as arguments the URL that was used to
+retrieve the F</robots.txt> file, and the contents of the file.
+
+=item $rules->allowed($uri)
+
+Returns TRUE if this robot is allowed to retrieve this URL.
+
+=item $rules->agent([$name])
+
+Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
+rules and expire times out of the cache.
+
+=back
+
+=head1 ROBOTS.TXT
+
+The format and semantics of the "/robots.txt" file are as follows
+(this is an edited abstract of
+<http://www.robotstxt.org/wc/norobots.html>):
+
+The file consists of one or more records separated by one or more
+blank lines. Each record contains lines of the form
+
+  <field-name>: <value>
+
+The field name is case insensitive. Text after the '#' character on a
+line is ignored during parsing. This is used for comments. The
+following <field-names> can be used:
+
+=over 3
+
+=item User-Agent
+
+The value of this field is the name of the robot the record is
+describing access policy for. If more than one I<User-Agent> field is
+present the record describes an identical access policy for more than
+one robot. At least one field needs to be present per record. If the
+value is '*', the record describes the default access policy for any
+robot that has not matched any of the other records.
+
+The I<User-Agent> fields must occur before the I<Disallow> fields. If a
+record contains a I<User-Agent> field after a I<Disallow> field, that
+constitutes a malformed record. This parser will assume that a blank
+line should have been placed before that I<User-Agent> field, and will
+break the record into two. All the fields before the I<User-Agent> field
+will constitute a record, and the I<User-Agent> field will be the first
+field in a new record.
+
+=item Disallow
+
+The value of this field specifies a partial URL that is not to be
+visited. This can be a full path, or a partial path; any URL that
+starts with this value will not be retrieved.
+
+=back
+
+Unrecognized records are ignored.
+
+=head1 ROBOTS.TXT EXAMPLES
+
+The following example "/robots.txt" file specifies that no robots
+should visit any URL starting with "/cyberworld/map/" or "/tmp/":
+
+  User-agent: *
+  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+  Disallow: /tmp/ # these will soon disappear
+
+This example "/robots.txt" file specifies that no robots should visit
+any URL starting with "/cyberworld/map/", except the robot called
+"cybermapper":
+
+  User-agent: *
+  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+  # Cybermapper knows where to go.
+  User-agent: cybermapper
+  Disallow:
+
+This example indicates that no robots should visit this site further:
+
+  # go away
+  User-agent: *
+  Disallow: /
+
+This is an example of a malformed robots.txt file.
+
+  # robots.txt for ancientcastle.example.com
+  # I've locked myself away.
+  User-agent: *
+  Disallow: /
+  # The castle is your home now, so you can go anywhere you like.
+  User-agent: Belle
+  Disallow: /west-wing/ # except the west wing!
+  # It's good to be the Prince...
+  User-agent: Beast
+  Disallow:
+
+This file is missing the required blank lines between records.
+However, the intention is clear.
+
+=head1 SEE ALSO
+
+L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>
+
+=head1 COPYRIGHT
+
+  Copyright 1995-2009, Gisle Aas
+  Copyright 1995, Martijn Koster
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
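The POD above describes parse() and allowed(), but the three-way return value of allowed() in the in-core class is easy to miss: it returns -1 when the fresh_until time recorded by parse() has passed (or was never set), 0 when a Disallow rule matches, and 1 otherwise. A small illustrative sketch follows; the host, rule text and 60-second lifetime are made up for this example and are not taken from the distribution.

    use strict;
    use warnings;
    use WWW::RobotRules;

    my $rules = WWW::RobotRules->new('MOMspider/1.0');

    # The optional third argument to parse() sets when these rules expire.
    $rules->parse('http://foo/robots.txt',
                  "User-agent: *\nDisallow: /private\n",
                  time + 60);

    my $verdict = $rules->allowed('http://foo/private');
    if    ($verdict < 0) { print "rules are stale; refetch robots.txt first\n" }
    elsif ($verdict)     { print "allowed\n" }
    else                 { print "disallowed\n" }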
diff --git a/lib/WWW/RobotRules/AnyDBM_File.pm b/lib/WWW/RobotRules/AnyDBM_File.pm
new file mode 100644
index 0000000..8daa688
--- /dev/null
+++ b/lib/WWW/RobotRules/AnyDBM_File.pm
@@ -0,0 +1,170 @@
+package WWW::RobotRules::AnyDBM_File;
+
+require WWW::RobotRules;
+@ISA = qw(WWW::RobotRules);
+$VERSION = "6.00";
+
+use Carp ();
+use AnyDBM_File;
+use Fcntl;
+use strict;
+
+=head1 NAME
+
+WWW::RobotRules::AnyDBM_File - Persistent RobotRules
+
+=head1 SYNOPSIS
+
+ require WWW::RobotRules::AnyDBM_File;
+ require LWP::RobotUA;
+
+ # Create a robot useragent that uses a diskcaching RobotRules
+ my $rules = WWW::RobotRules::AnyDBM_File->new( 'my-robot/1.0', 'cachefile' );
+ my $ua = WWW::RobotUA->new( 'my-robot/1.0', 'me@foo.com', $rules );
+
+ # Then just use $ua as usual
+ $res = $ua->request($req);
+
+=head1 DESCRIPTION
+
+This is a subclass of I<WWW::RobotRules> that uses the AnyDBM_File
+package to implement persistent diskcaching of F<robots.txt> and host
+visit information.
+
+The constructor (the new() method) takes an extra argument specifying
+the name of the DBM file to use. If the DBM file already exists, then
+you can specify undef as agent name as the name can be obtained from
+the DBM database.
+
+=cut
+
+sub new
+{
+    my ($class, $ua, $file) = @_;
+    Carp::croak('WWW::RobotRules::AnyDBM_File filename required') unless $file;
+
+    my $self = bless { }, $class;
+    $self->{'filename'} = $file;
+    tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_CREAT|O_RDWR, 0640
+        or Carp::croak("Can't open $file: $!");
+
+    if ($ua) {
+        $self->agent($ua);
+    }
+    else {
+        # Try to obtain name from DBM file
+        $ua = $self->{'dbm'}{"|ua-name|"};
+        Carp::croak("No agent name specified") unless $ua;
+    }
+
+    $self;
+}
+
+sub agent {
+    my($self, $newname) = @_;
+    my $old = $self->{'dbm'}{"|ua-name|"};
+    if (defined $newname) {
+        $newname =~ s!/?\s*\d+.\d+\s*$!!;  # lose version
+        unless ($old && $old eq $newname) {
+            # Old info is now stale.
+            my $file = $self->{'filename'};
+            untie %{$self->{'dbm'}};
+            tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_TRUNC|O_RDWR, 0640;
+            %{$self->{'dbm'}} = ();
+            $self->{'dbm'}{"|ua-name|"} = $newname;
+        }
+    }
+    $old;
+}
+
+sub no_visits {
+    my ($self, $netloc) = @_;
+    my $t = $self->{'dbm'}{"$netloc|vis"};
+    return 0 unless $t;
+    (split(/;\s*/, $t))[0];
+}
+
+sub last_visit {
+    my ($self, $netloc) = @_;
+    my $t = $self->{'dbm'}{"$netloc|vis"};
+    return undef unless $t;
+    (split(/;\s*/, $t))[1];
+}
+
+sub fresh_until {
+    my ($self, $netloc, $fresh) = @_;
+    my $old = $self->{'dbm'}{"$netloc|exp"};
+    if ($old) {
+        $old =~ s/;.*//;  # remove cleartext
+    }
+    if (defined $fresh) {
+        $fresh .= "; " . localtime($fresh);
+        $self->{'dbm'}{"$netloc|exp"} = $fresh;
+    }
+    $old;
+}
+
+sub visit {
+    my($self, $netloc, $time) = @_;
+    $time ||= time;
+
+    my $count = 0;
+    my $old = $self->{'dbm'}{"$netloc|vis"};
+    if ($old) {
+        my $last;
+        ($count,$last) = split(/;\s*/, $old);
+        $time = $last if $last > $time;
+    }
+    $count++;
+    $self->{'dbm'}{"$netloc|vis"} = "$count; $time; " . localtime($time);
+}
+
+sub push_rules {
+    my($self, $netloc, @rules) = @_;
+    my $cnt = 1;
+    $cnt++ while $self->{'dbm'}{"$netloc|r$cnt"};
+
+    foreach (@rules) {
+        $self->{'dbm'}{"$netloc|r$cnt"} = $_;
+        $cnt++;
+    }
+}
+
+sub clear_rules {
+    my($self, $netloc) = @_;
+    my $cnt = 1;
+    while ($self->{'dbm'}{"$netloc|r$cnt"}) {
+        delete $self->{'dbm'}{"$netloc|r$cnt"};
+        $cnt++;
+    }
+}
+
+sub rules {
+    my($self, $netloc) = @_;
+    my @rules = ();
+    my $cnt = 1;
+    while (1) {
+        my $rule = $self->{'dbm'}{"$netloc|r$cnt"};
+        last unless $rule;
+        push(@rules, $rule);
+        $cnt++;
+    }
+    @rules;
+}
+
+sub dump
+{
+}
+
+1;
+
+=head1 SEE ALSO
+
+L<WWW::RobotRules>, L<LWP::RobotUA>
+
+=head1 AUTHORS
+
+Hakan Ardo E<lt>hakan@munin.ub2.lu.seE<gt>, Gisle Aas E<lt>aas@sn.noE<gt>
+
+=cut
+
diff --git a/t/misc/dbmrobot b/t/misc/dbmrobot
new file mode 100755
index 0000000..c1794c4
--- /dev/null
+++ b/t/misc/dbmrobot
@@ -0,0 +1,23 @@
+#!/local/perl/bin/perl -w
+
+use URI::URL;
+$url = url(shift) || die "Usage: $0 <url>\n";
+
+require WWW::RobotRules::AnyDBM_File;
+require LWP::RobotUA;
+
+$botname = "Spider/0.1";
+
+$rules = new WWW::RobotRules::AnyDBM_File $botname, 'robotdb';
+$ua = new LWP::RobotUA $botname, 'gisle@aas.no', $rules;
+$ua->delay(0.1);
+
+my $req = new HTTP::Request GET => $url;
+
+my $res = $ua->request($req);
+print "Got ", $res->code, " ", $res->message, "(", $res->content_type, ")\n";
+
+my $netloc = $url->netloc;
+print "This was visit no ", $ua->no_visits($netloc), " to $netloc\n";
+
+
diff --git a/t/rules-dbm.t b/t/rules-dbm.t
new file mode 100644
index 0000000..2335b94
--- /dev/null
+++ b/t/rules-dbm.t
@@ -0,0 +1,128 @@
+
+print "1..13\n";
+
+
+use WWW::RobotRules::AnyDBM_File;
+
+$file = "test-$$";
+
+$r = new WWW::RobotRules::AnyDBM_File "myrobot/2.0", $file;
+
+$r->parse("http://www.aas.no/robots.txt", "");
+
+$r->visit("www.aas.no:80");
+
+print "not " if $r->no_visits("www.aas.no:80") != 1;
+print "ok 1\n";
+
+
+$r->push_rules("www.sn.no:80", "/aas", "/per");
+$r->push_rules("www.sn.no:80", "/god", "/old");
+
+@r = $r->rules("www.sn.no:80");
+print "Rules: @r\n";
+
+print "not " if "@r" ne "/aas /per /god /old";
+print "ok 2\n";
+
+$r->clear_rules("per");
+$r->clear_rules("www.sn.no:80");
+
+@r = $r->rules("www.sn.no:80");
+print "Rules: @r\n";
+
+print "not " if "@r" ne "";
+print "ok 3\n";
+
+$r->visit("www.aas.no:80", time+10);
+$r->visit("www.sn.no:80");
+
+print "No visits: ", $r->no_visits("www.aas.no:80"), "\n";
+print "Last visit: ", $r->last_visit("www.aas.no:80"), "\n";
+print "Fresh until: ", $r->fresh_until("www.aas.no:80"), "\n";
+
+print "not " if $r->no_visits("www.aas.no:80") != 2;
+print "ok 4\n";
+
+print "not " if abs($r->last_visit("www.sn.no:80") - time) > 2;
+print "ok 5\n";
+
+$r = undef;
+
+# Try to reopen the database without a name specified
+$r = new WWW::RobotRules::AnyDBM_File undef, $file;
+$r->visit("www.aas.no:80");
+
+print "not " if $r->no_visits("www.aas.no:80") != 3;
+print "ok 6\n";
+
+print "Agent-Name: ", $r->agent, "\n";
+print "not " if $r->agent ne "myrobot";
+print "ok 7\n";
+
+$r = undef;
+
+print "*** Dump of database ***\n";
+tie(%cat, AnyDBM_File, $file, 0, 0644) or die "Can't tie: $!";
+while (($key,$val) = each(%cat)) {
+    print "$key\t$val\n";
+}
+print "******\n";
+
+untie %cat;
+
+# Try to open database with a different agent name
+$r = new WWW::RobotRules::AnyDBM_File "MOMSpider/2.0", $file;
+
+print "not " if $r->no_visits("www.sn.no:80");
+print "ok 8\n";
+
+# Try parsing
+$r->parse("http://www.sn.no:8080/robots.txt", <<EOT, (time + 3));
+
+User-Agent: *
+Disallow: /
+
+User-Agent: Momspider
+Disallow: /foo
+Disallow: /bar
+
+EOT
+
+@r = $r->rules("www.sn.no:8080");
+print "not " if "@r" ne "/foo /bar";
+print "ok 9\n";
+
+print "not " if $r->allowed("http://www.sn.no") >= 0;
+print "ok 10\n";
+
+print "not " if $r->allowed("http://www.sn.no:8080/foo/gisle");
+print "ok 11\n";
+
+sleep(5); # wait until file has expired
+print "not " if $r->allowed("http://www.sn.no:8080/foo/gisle") >= 0;
+print "ok 12\n";
+
+
+$r = undef;
+
+print "*** Dump of database ***\n";
+tie(%cat, AnyDBM_File, $file, 0, 0644) or die "Can't tie: $!";
+while (($key,$val) = each(%cat)) {
+    print "$key\t$val\n";
+}
+print "******\n";
+
+untie %cat; # Otherwise the next line fails on DOSish
+
+while (unlink("$file", "$file.pag", "$file.dir", "$file.db")) {}
+
+# Try to open an empty database without specifying a name
+eval {
+    $r = new WWW::RobotRules::AnyDBM_File undef, $file;
+};
+print $@;
+print "not " unless $@; # should fail
+print "ok 13\n";
+
+unlink "$file", "$file.pag", "$file.dir", "$file.db";
diff --git a/t/rules.t b/t/rules.t
new file mode 100644
index 0000000..26b1025
--- /dev/null
+++ b/t/rules.t
@@ -0,0 +1,230 @@
+#!/local/bin/perl
+
+=head1 NAME
+
+robot-rules.t
+
+=head1 DESCRIPTION
+
+Test a number of different F</robots.txt> files against a number
+of different User-agents.
+
+=cut
+
+require WWW::RobotRules;
+use Carp;
+use strict;
+
+print "1..50\n"; # for Test::Harness
+
+# We test a number of different /robots.txt files,
+#
+
+my $content1 = <<EOM;
+# http://foo/robots.txt
+User-agent: *
+Disallow: /private
+Disallow: http://foo/also_private
+
+User-agent: MOMspider
+Disallow:
+EOM
+
+my $content2 = <<EOM;
+# http://foo/robots.txt
+User-agent: MOMspider
+ # comment which should be ignored
+Disallow: /private
+EOM
+
+my $content3 = <<EOM;
+# http://foo/robots.txt
+EOM
+
+my $content4 = <<EOM;
+# http://foo/robots.txt
+User-agent: *
+Disallow: /private
+Disallow: mailto:foo
+
+User-agent: MOMspider
+Disallow: /this
+
+User-agent: Another
+Disallow: /that
+
+
+User-agent: SvartEnke1
+Disallow: http://fOO
+Disallow: http://bar
+
+User-Agent: SvartEnke2
+Disallow: ftp://foo
+Disallow: http://foo:8080/
+Disallow: http://bar/
+
+Sitemap: http://www.adobe.com/sitemap.xml
+EOM
+
+my $content5 = <<EOM;
+# I've locked myself away
+User-agent: *
+Disallow: /
+# The castle is your home now, so you can go anywhere you like.
+User-agent: Belle
+Disallow: /west-wing/ # except the west wing!
+# It's good to be the Prince...
+User-agent: Beast
+Disallow:
+EOM
+
+# same thing backwards
+my $content6 = <<EOM;
+# It's good to be the Prince...
+User-agent: Beast
+Disallow:
+# The castle is your home now, so you can go anywhere you like.
+User-agent: Belle
+Disallow: /west-wing/ # except the west wing!
+# I've locked myself away
+User-agent: *
+Disallow: /
+EOM
+
+# and a number of different robots:
+
+my @tests1 = (
+    [$content1, 'MOMspider' =>
+     1 => 'http://foo/private' => 1,
+     2 => 'http://foo/also_private' => 1,
+    ],
+
+    [$content1, 'Wubble' =>
+     3 => 'http://foo/private' => 0,
+     4 => 'http://foo/also_private' => 0,
+     5 => 'http://foo/other' => 1,
+    ],
+
+    [$content2, 'MOMspider' =>
+     6 => 'http://foo/private' => 0,
+     7 => 'http://foo/other' => 1,
+    ],
+
+    [$content2, 'Wubble' =>
+     8 => 'http://foo/private' => 1,
+     9 => 'http://foo/also_private' => 1,
+    10 => 'http://foo/other' => 1,
+    ],
+
+    [$content3, 'MOMspider' =>
+    11 => 'http://foo/private' => 1,
+    12 => 'http://foo/other' => 1,
+    ],
+
+    [$content3, 'Wubble' =>
+    13 => 'http://foo/private' => 1,
+    14 => 'http://foo/other' => 1,
+    ],
+
+    [$content4, 'MOMspider' =>
+    15 => 'http://foo/private' => 1,
+    16 => 'http://foo/this' => 0,
+    17 => 'http://foo/that' => 1,
+    ],
+
+    [$content4, 'Another' =>
+    18 => 'http://foo/private' => 1,
+    19 => 'http://foo/this' => 1,
+    20 => 'http://foo/that' => 0,
+    ],
+
+    [$content4, 'Wubble' =>
+    21 => 'http://foo/private' => 0,
+    22 => 'http://foo/this' => 1,
+    23 => 'http://foo/that' => 1,
+    ],
+
+    [$content4, 'Another/1.0' =>
+    24 => 'http://foo/private' => 1,
+    25 => 'http://foo/this' => 1,
+    26 => 'http://foo/that' => 0,
+    ],
+
+    [$content4, "SvartEnke1" =>
+    27 => "http://foo/" => 0,
+    28 => "http://foo/this" => 0,
+    29 => "http://bar/" => 1,
+    ],
+
+    [$content4, "SvartEnke2" =>
+    30 => "http://foo/" => 1,
+    31 => "http://foo/this" => 1,
+    32 => "http://bar/" => 1,
+    ],
+
+    [$content4, "MomSpiderJr" =>    # should match "MomSpider"
+    33 => 'http://foo/private' => 1,
+    34 => 'http://foo/also_private' => 1,
+    35 => 'http://foo/this/' => 0,
+    ],
+
+    [$content4, "SvartEnk" =>       # should match "*"
+    36 => "http://foo/" => 1,
+    37 => "http://foo/private/" => 0,
+    38 => "http://bar/" => 1,
+    ],
+
+    [$content5, 'Villager/1.0' =>
+    39 => 'http://foo/west-wing/' => 0,
+    40 => 'http://foo/' => 0,
+    ],
+
+    [$content5, 'Belle/2.0' =>
+    41 => 'http://foo/west-wing/' => 0,
+    42 => 'http://foo/' => 1,
+    ],
+
+    [$content5, 'Beast/3.0' =>
+    43 => 'http://foo/west-wing/' => 1,
+    44 => 'http://foo/' => 1,
+    ],
+
+    [$content6, 'Villager/1.0' =>
+    45 => 'http://foo/west-wing/' => 0,
+    46 => 'http://foo/' => 0,
+    ],
+
+    [$content6, 'Belle/2.0' =>
+    47 => 'http://foo/west-wing/' => 0,
+    48 => 'http://foo/' => 1,
+    ],
+
+    [$content6, 'Beast/3.0' =>
+    49 => 'http://foo/west-wing/' => 1,
+    50 => 'http://foo/' => 1,
+    ],
+
+    # when adding tests, remember to increase
+    # the maximum at the top
+
+    );
+
+my $t;
+
+for $t (@tests1) {
+    my ($content, $ua) = splice(@$t, 0, 2);
+
+    my $robotsrules = new WWW::RobotRules($ua);
+    $robotsrules->parse('http://foo/robots.txt', $content);
+
+    my ($num, $path, $expected);
+    while(($num, $path, $expected) = splice(@$t, 0, 3)) {
+        my $allowed = $robotsrules->allowed($path);
+        $allowed = 1 if $allowed;
+        if($allowed != $expected) {
+            $robotsrules->dump;
+            confess "Test Failed: $ua => $path ($allowed != $expected)";
+        }
+        print "ok $num\n";
+    }
+}
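The DBM-backed test above exercises persistence by closing and reopening the cache file between checks. A minimal sketch of the same idea outside the test harness follows; the cache file name, agent name and URLs are placeholders and the snippet assumes a writable current directory (illustrative, not part of the imported distribution).

    use strict;
    use warnings;
    use WWW::RobotRules::AnyDBM_File;

    my $cache = "robots-cache";   # DBM file name (placeholder)

    # First run: record some rules under the given agent name.
    {
        my $rules = WWW::RobotRules::AnyDBM_File->new('MyBot/1.0', $cache);
        $rules->parse('http://example.com/robots.txt',
                      "User-agent: *\nDisallow: /tmp/\n");
    }

    # A later run can omit the agent name; it is read back from the DBM file.
    {
        my $rules = WWW::RobotRules::AnyDBM_File->new(undef, $cache);
        print $rules->allowed('http://example.com/tmp/x')
            ? "allowed\n" : "blocked\n";
    }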