summary | refs | log | tree | commit | diff
path: root/Lib/urlparse.py
diff options
context:
space:
mode:
author: Guido van Rossum <guido@python.org> 1996-05-28 23:54:24 +0000
committer: Guido van Rossum <guido@python.org> 1996-05-28 23:54:24 +0000
commit: f39882d84b8d414952217d92ce8c385fe5f39808 (patch)
tree: 16ba9b964655ca3f5d18574f0f5b9afe99b8b164 /Lib/urlparse.py
parent: fe11675d322c34bb1ab071679da8e0bc9dea8662 (diff)
download: cpython-f39882d84b8d414952217d92ce8c385fe5f39808.tar.gz
optimizations due to Fred Drake; added urldefrag() function
Diffstat (limited to 'Lib/urlparse.py')
-rw-r--r-- Lib/urlparse.py 53
1 files changed, 35 insertions, 18 deletions
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index 571de46a37..571ef0eb0c 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -3,6 +3,7 @@
# Standard/builtin Python modules
import string
+from string import joinfields, splitfields, find, rfind
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
@@ -18,17 +19,23 @@ uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
# Characters valid in scheme names
scheme_chars = string.letters + string.digits + '+-.'
+_parse_cache = {}
+
+def clear_cache():
+ global _parse_cache
+ _parse_cache = {}
+
+
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up in smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_framents = 1):
- netloc = ''
- path = ''
- params = ''
- query = ''
- fragment = ''
+ key = url, scheme, allow_framents
+ if _parse_cache.has_key(key):
+ return _parse_cache[key]
+ netloc = path = params = query = fragment = ''
i = string.find(url, ':')
if i > 0:
for c in url[:i]:
@@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1):
i = string.find(url, ';')
if i >= 0:
url, params = url[:i], url[i+1:]
- return scheme, netloc, url, params, query, fragment
+ tuple = scheme, netloc, url, params, query, fragment
+ _parse_cache[key] = tuple
+ return tuple
# Put a parsed URL back together again. This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
@@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1):
if not base:
return url
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
- urlparse(base, '', allow_framents)
+ urlparse(base, '', allow_framents)
scheme, netloc, path, params, query, fragment = \
urlparse(url, bscheme, allow_framents)
# XXX Unofficial hack: default netloc to bnetloc even if
@@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1):
scheme in uses_netloc and bscheme in uses_netloc:
netloc = bnetloc
# Strip the port number
- i = string.find(netloc, '@')
+ i = find(netloc, '@')
if i < 0: i = 0
- i = string.find(netloc, ':', i)
+ i = find(netloc, ':', i)
if i >= 0:
netloc = netloc[:i]
if scheme != bscheme or scheme not in uses_relative:
@@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1):
return urlunparse((scheme, netloc, path,
params, query, fragment))
if not path:
- path = bpath
- if not query:
- query = bquery
- return urlunparse((scheme, netloc, path,
- params, query, fragment))
- i = string.rfind(bpath, '/')
+ return urlunparse((scheme, netloc, bpath,
+ params, query or bquery, fragment))
+ i = rfind(bpath, '/')
if i >= 0:
path = bpath[:i] + '/' + path
- segments = string.splitfields(path, '/')
+ segments = splitfields(path, '/')
if segments[-1] == '.':
segments[-1] = ''
while '.' in segments:
@@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1):
break
if len(segments) >= 2 and segments[-1] == '..':
segments[-2:] = ['']
- path = string.joinfields(segments, '/')
- return urlunparse((scheme, netloc, path,
+ return urlunparse((scheme, netloc, joinfields(segments, '/'),
params, query, fragment))
+def urldefrag(url):
+ """Removes any existing fragment from URL.
+
+ Returns a tuple of the defragmented URL and the fragment. If
+ the URL contained no fragments, the second element is the
+ empty string.
+ """
+ s, n, p, a, q, frag = urlparse(url)
+ defrag = urlunparse((s, n, p, a, q, ''))
+ return defrag, frag
+
+
test_input = """
http://a/b/c/d