summaryrefslogtreecommitdiff
path: root/lib/URI/Heuristic.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/URI/Heuristic.pm')
-rw-r--r--lib/URI/Heuristic.pm253
1 files changed, 253 insertions, 0 deletions
diff --git a/lib/URI/Heuristic.pm b/lib/URI/Heuristic.pm
new file mode 100644
index 0000000..d4ace34
--- /dev/null
+++ b/lib/URI/Heuristic.pm
@@ -0,0 +1,253 @@
+package URI::Heuristic;
+
+=head1 NAME
+
+URI::Heuristic - Expand URI using heuristics
+
+=head1 SYNOPSIS
+
+ use URI::Heuristic qw(uf_uristr);
+ $u = uf_uristr("perl"); # http://www.perl.com
+ $u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol
+ $u = uf_uristr("aas"); # http://www.aas.no
+ $u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi
+ $u = uf_uristr("/etc/passwd"); # file:/etc/passwd
+
+=head1 DESCRIPTION
+
+This module provides functions that expand strings into real absolute
+URIs using some built-in heuristics. Strings that already represent
+absolute URIs (i.e. that start with a C<scheme:> part) are never modified
+and are returned unchanged. The main use of these functions is to
+allow abbreviated URIs similar to what many web browsers allow for URIs
+typed in by the user.
+
+The following functions are provided:
+
+=over 4
+
+=item uf_uristr($str)
+
+Tries to make the argument string
+into a proper absolute URI string. The "uf_" prefix stands for "User
+Friendly". Under MacOS, it assumes that any string with a common URL
+scheme (http, ftp, etc.) is a URL rather than a local path. So don't name
+your volumes after common URL schemes and expect uf_uristr() to construct
+valid file: URL's on those volumes for you, because it won't.
+
+=item uf_uri($str)
+
+Works the same way as uf_uristr() but
+returns a C<URI> object.
+
+=back
+
+=head1 ENVIRONMENT
+
+If the hostname portion of a URI does not contain any dots, then
+certain qualified guesses are made. These guesses are governed by
+the following environment variables:
+
+=over 10
+
+=item COUNTRY
+
+The two-letter country code (ISO 3166) for your location. If
+the domain name of your host ends with two letters, then it is taken
+to be the default country. See also L<Locale::Country>.
+
+=item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG
+
+If COUNTRY is not set, these standard environment variables are
+examined and country (not language) information possibly found in them
+is used as the default country.
+
+=item URL_GUESS_PATTERN
+
+Contains a space-separated list of URL patterns to try. The string
+"ACME" is for some reason used as a placeholder for the host name in
+the URL provided. Example:
+
+ URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
+ export URL_GUESS_PATTERN
+
+Specifying URL_GUESS_PATTERN disables any guessing rules based on
+country. An empty URL_GUESS_PATTERN disables any guessing that
+involves host name lookups.
+
+=back
+
+=head1 COPYRIGHT
+
+Copyright 1997-1998, Gisle Aas
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+use strict;
+use warnings;
+
+use Exporter 5.57 'import';
+our @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr);
+our $VERSION = "4.20";
+
+our ($MY_COUNTRY, $DEBUG);
+
+sub MY_COUNTRY() {
+ for ($MY_COUNTRY) {
+ return $_ if defined;
+
+ # First try the environment.
+ $_ = $ENV{COUNTRY};
+ return $_ if defined;
+
+ # Try the country part of LC_ALL and LANG from environment
+ my @srcs = ($ENV{LC_ALL}, $ENV{LANG});
+ # ...and HTTP_ACCEPT_LANGUAGE before those if present
+ if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) {
+ # TODO: q-value processing/ordering
+ for $httplang (split(/\s*,\s*/, $httplang)) {
+ if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) {
+ unshift(@srcs, "${1}_${2}");
+ last;
+ }
+ }
+ }
+ for (@srcs) {
+ next unless defined;
+ return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/;
+ }
+
+ # Last bit of domain name. This may access the network.
+ require Net::Domain;
+ my $fqdn = Net::Domain::hostfqdn();
+ $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
+ return $_ if defined;
+
+ # Give up. Defined but false.
+ return ($_ = 0);
+ }
+}
+
+our %LOCAL_GUESSING =
+(
+ 'us' => [qw(www.ACME.gov www.ACME.mil)],
+ 'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
+ 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
+ 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
+ # send corrections and new entries to <gisle@aas.no>
+);
+# Backwards compatibility; uk != United Kingdom in ISO 3166
+$LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb};
+
+
+sub uf_uristr ($)
+{
+ local($_) = @_;
+ print STDERR "uf_uristr: resolving $_\n" if $DEBUG;
+ return unless defined;
+
+ s/^\s+//;
+ s/\s+$//;
+
+ if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) {
+ $_ = "http://$_";
+
+ } elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) {
+ $_ = lc($1) . "://$_";
+
+ } elsif ($^O ne "MacOS" &&
+ (m,^/, || # absolute file name
+ m,^\.\.?/, || # relative file name
+ m,^[a-zA-Z]:[/\\],) # dosish file name
+ )
+ {
+ $_ = "file:$_";
+
+ } elsif ($^O eq "MacOS" && m/:/) {
+ # potential MacOS file name
+ unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) {
+ require URI::file;
+ my $a = URI::file->new($_)->as_string;
+ $_ = ($a =~ m/^file:/) ? $a : "file:$a";
+ }
+ } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
+ $_ = "mailto:$_";
+
+ } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified
+ if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) {
+ my $host = $1;
+
+ my $scheme = "http";
+ if (/^:(\d+)\b/) {
+ # Some more or less well known ports
+ if ($1 =~ /^[56789]?443$/) {
+ $scheme = "https";
+ } elsif ($1 eq "21") {
+ $scheme = "ftp";
+ }
+ }
+
+ if ($host !~ /\./ && $host ne "localhost") {
+ my @guess;
+ if (exists $ENV{URL_GUESS_PATTERN}) {
+ @guess = map { s/\bACME\b/$host/; $_ }
+ split(' ', $ENV{URL_GUESS_PATTERN});
+ } else {
+ if (MY_COUNTRY()) {
+ my $special = $LOCAL_GUESSING{MY_COUNTRY()};
+ if ($special) {
+ my @special = @$special;
+ push(@guess, map { s/\bACME\b/$host/; $_ }
+ @special);
+ } else {
+ push(@guess, "www.$host." . MY_COUNTRY());
+ }
+ }
+ push(@guess, map "www.$host.$_",
+ "com", "org", "net", "edu", "int");
+ }
+
+
+ my $guess;
+ for $guess (@guess) {
+ print STDERR "uf_uristr: gethostbyname('$guess.')..."
+ if $DEBUG;
+ if (gethostbyname("$guess.")) {
+ print STDERR "yes\n" if $DEBUG;
+ $host = $guess;
+ last;
+ }
+ print STDERR "no\n" if $DEBUG;
+ }
+ }
+ $_ = "$scheme://$host$_";
+
+ } else {
+ # pure junk, just return it unchanged...
+
+ }
+ }
+ print STDERR "uf_uristr: ==> $_\n" if $DEBUG;
+
+ $_;
+}
+
+sub uf_uri ($)
+{
+ require URI;
+ URI->new(uf_uristr($_[0]));
+}
+
+# legacy
+*uf_urlstr = \*uf_uristr;
+
+sub uf_url ($)
+{
+ require URI::URL;
+ URI::URL->new(uf_uristr($_[0]));
+}
+
+1;