path: root/urlgrabber
Diffstat (limited to 'urlgrabber')
-rw-r--r--  urlgrabber/__init__.py  |   2
-rw-r--r--  urlgrabber/byterange.py | 189
-rw-r--r--  urlgrabber/grabber.py   | 635
-rw-r--r--  urlgrabber/mirror.py    |  86
-rw-r--r--  urlgrabber/progress.py  |  76
5 files changed, 534 insertions, 454 deletions
diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py
index e1028c6..65b3145 100644
--- a/urlgrabber/__init__.py
+++ b/urlgrabber/__init__.py
@@ -52,4 +52,4 @@ __author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
'Zdenek Pavlas <zpavlas@redhat.com>'
__url__ = 'http://urlgrabber.baseurl.org/'
-from grabber import urlgrab, urlopen, urlread
+from .grabber import urlgrab, urlopen, urlread
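The package-level API is unchanged by the relative import; a minimal usage sketch (URLs and paths are illustrative):

    from urlgrabber import urlgrab, urlopen, urlread

    fo = urlopen('http://example.com/')            # file-like object
    head = fo.read(256)                            # first 256 bytes
    fo.close()
    path = urlgrab('http://example.com/f.txt', '/tmp/f.txt')
    data = urlread('http://example.com/small.txt', limit=1024 * 1024)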
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 5efa160..e341add 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -9,9 +9,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -19,53 +19,71 @@
import os
+import sys
import stat
import urllib
-import urllib2
-import rfc822
+import email.utils
+import ftplib
+import socket
+import mimetypes
+
+try:
+ from urllib.request import BaseHandler, FileHandler, FTPHandler, URLError
+ from urllib.request import addclosehook, addinfourl
+ from urllib.request import ftpwrapper as urllib_ftpwrapper
+ from urllib.parse import splitport, splituser, splitpasswd, splitattr, unquote
+ from urllib.request import url2pathname
+except ImportError:
+ from urllib2 import BaseHandler, FileHandler, FTPHandler, URLError
+ from urllib2 import ftpwrapper as urllib_ftpwrapper
+ from urllib import (splitport, splituser, splitpasswd, splitattr,
+ unquote, addclosehook, addinfourl, url2pathname)
DEBUG = None
-try:
+if sys.version_info >= (3,):
+ # We do an explicit version check here because python2
+ # also has an io module with StringIO, but it is incompatible,
+ # and returns str instead of unicode somewhere.
+ from io import StringIO
+else:
from cStringIO import StringIO
-except ImportError, msg:
- from StringIO import StringIO
class RangeError(IOError):
"""Error raised when an unsatisfiable range is requested."""
pass
-
-class HTTPRangeHandler(urllib2.BaseHandler):
+
+class HTTPRangeHandler(BaseHandler):
"""Handler that enables HTTP Range headers.
-
+
This was extremely simple. The Range header is an HTTP feature to
- begin with so all this class does is tell urllib2 that the
- "206 Partial Content" response from the HTTP server is what we
+ begin with so all this class does is tell urllib.request that the
+ "206 Partial Content" response from the HTTP server is what we
expected.
-
+
Example:
- import urllib2
+ import urllib.request
import byterange
-
+
- range_handler = range.HTTPRangeHandler()
+ range_handler = byterange.HTTPRangeHandler()
- opener = urllib2.build_opener(range_handler)
-
+ opener = urllib.request.build_opener(range_handler)
+
# install it
- urllib2.install_opener(opener)
-
+ urllib.request.install_opener(opener)
+
# create Request and set Range header
- req = urllib2.Request('http://www.python.org/')
+ req = urllib.request.Request('http://www.python.org/')
- req.header['Range'] = 'bytes=30-50'
+ req.add_header('Range', 'bytes=30-50')
- f = urllib2.urlopen(req)
+ f = urllib.request.urlopen(req)
"""
-
+
def http_error_206(self, req, fp, code, msg, hdrs):
# 206 Partial Content Response
- r = urllib.addinfourl(fp, hdrs, req.get_full_url())
+ r = addinfourl(fp, hdrs, req.get_full_url())
r.code = code
r.msg = msg
return r
-
+
def http_error_416(self, req, fp, code, msg, hdrs):
# HTTP's Range Not Satisfiable error
raise RangeError(9, 'Requested Range Not Satisfiable')
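A runnable Python 3 version of the docstring example above (module path assumed to be urlgrabber.byterange):

    import urllib.request
    from urlgrabber.byterange import HTTPRangeHandler

    opener = urllib.request.build_opener(HTTPRangeHandler())
    urllib.request.install_opener(opener)

    req = urllib.request.Request('http://www.python.org/')
    req.add_header('Range', 'bytes=30-50')   # bytes 30..50 of the body
    f = urllib.request.urlopen(req)          # handler accepts the 206 reply
    print(f.read())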
@@ -81,13 +99,13 @@ class HTTPSRangeHandler(HTTPRangeHandler):
class RangeableFileObject:
"""File object wrapper to enable raw range handling.
- This was implemented primarilary for handling range
- specifications for file:// urls. This object effectively makes
- a file object look like it consists only of a range of bytes in
+ This was implemented primarily for handling range
+ specifications for file:// urls. This object effectively makes
+ a file object look like it consists only of a range of bytes in
the stream.
-
+
Examples:
- # expose 10 bytes, starting at byte position 20, from
+ # expose 10 bytes, starting at byte position 20, from
- # /etc/aliases.
+ # /etc/passwd.
- >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
+ >>> fo = RangeableFileObject(open('/etc/passwd', 'r'), (20,30))
# seek seeks within the range (to position 23 in this case)
@@ -99,11 +117,11 @@ class RangeableFileObject:
# byte in the range. the following will return only 7 bytes.
>>> fo.read(30)
"""
-
+
def __init__(self, fo, rangetup):
"""Create a RangeableFileObject.
- fo -- a file like object. only the read() method need be
- supported but supporting an optimized seek() is
+ fo -- a file like object. only the read() method need be
+ supported but supporting an optimized seek() is
preferable.
rangetup -- a (firstbyte,lastbyte) tuple specifying the range
to work over.
@@ -113,24 +131,24 @@ class RangeableFileObject:
(self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
self.realpos = 0
self._do_seek(self.firstbyte)
-
+
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
in self.fo. This includes methods."""
if hasattr(self.fo, name):
return getattr(self.fo, name)
- raise AttributeError, name
-
+ raise AttributeError(name)
+
def tell(self):
"""Return the position within the range.
- This is different from fo.seek in that position 0 is the
+ This is different from fo.seek in that position 0 is the
first byte position of the range tuple. For example, if
this object was created with a range tuple of (500,899),
tell() will return 0 when at byte position 500 of the file.
"""
return (self.realpos - self.firstbyte)
-
+
def seek(self,offset,whence=0):
"""Seek within the byte range.
Positioning is identical to that described under tell().
@@ -143,13 +161,13 @@ class RangeableFileObject:
elif whence == 2: # absolute from end of file
# XXX: are we raising the right Error here?
raise IOError('seek from end of file not supported.')
-
+
# do not allow seek past lastbyte in range
if self.lastbyte and (realoffset >= self.lastbyte):
realoffset = self.lastbyte
-
+
self._do_seek(realoffset - self.realpos)
-
+
def read(self, size=-1):
"""Read within the range.
This method will limit the size read based on the range.
@@ -158,7 +176,7 @@ class RangeableFileObject:
rslt = self.fo.read(size)
self.realpos += len(rslt)
return rslt
-
+
def readline(self, size=-1):
"""Read lines within the range.
This method will limit the size read based on the range.
@@ -167,7 +185,7 @@ class RangeableFileObject:
rslt = self.fo.readline(size)
self.realpos += len(rslt)
return rslt
-
+
def _calc_read_size(self, size):
"""Handles calculating the amount of data to read based on
the range.
@@ -179,7 +197,7 @@ class RangeableFileObject:
else:
size = (self.lastbyte - self.realpos)
return size
-
+
def _do_seek(self,offset):
"""Seek based on whether wrapped object supports seek().
offset is relative to the current position (self.realpos).
@@ -190,7 +208,7 @@ class RangeableFileObject:
else:
self.fo.seek(self.realpos + offset)
self.realpos+= offset
-
+
def _poor_mans_seek(self,offset):
"""Seek by calling the wrapped file objects read() method.
This is used for file like objects that do not have native
@@ -198,7 +216,7 @@ class RangeableFileObject:
to manually seek to the desired position.
offset -- read this number of bytes from the wrapped
file object.
- raise RangeError if we encounter EOF before reaching the
+ raise RangeError if we encounter EOF before reaching the
specified offset.
"""
pos = 0
@@ -211,25 +229,23 @@ class RangeableFileObject:
raise RangeError(9, 'Requested Range Not Satisfiable')
pos+= bufsize
-class FileRangeHandler(urllib2.FileHandler):
+class FileRangeHandler(FileHandler):
"""FileHandler subclass that adds Range support.
This class handles Range headers exactly like an HTTP
server would.
"""
def open_local_file(self, req):
- import mimetypes
- import mimetools
host = req.get_host()
file = req.get_selector()
- localfile = urllib.url2pathname(file)
+ localfile = url2pathname(file)
stats = os.stat(localfile)
size = stats[stat.ST_SIZE]
- modified = rfc822.formatdate(stats[stat.ST_MTIME])
+ modified = email.utils.formatdate(stats[stat.ST_MTIME])
mtype = mimetypes.guess_type(file)[0]
if host:
- host, port = urllib.splitport(host)
+ host, port = splitport(host)
if port or socket.gethostbyname(host) not in self.get_names():
- raise urllib2.URLError('file not on local host')
+ raise URLError('file not on local host')
fo = open(localfile,'rb')
brange = req.headers.get('Range',None)
brange = range_header_to_tuple(brange)
@@ -241,32 +257,24 @@ class FileRangeHandler(urllib2.FileHandler):
raise RangeError(9, 'Requested Range Not Satisfiable')
size = (lb - fb)
fo = RangeableFileObject(fo, (fb,lb))
- headers = mimetools.Message(StringIO(
+ headers = email.message_from_string(
'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
- (mtype or 'text/plain', size, modified)))
+ (mtype or 'text/plain', size, modified))
- return urllib.addinfourl(fo, headers, 'file:'+file)
+ return addinfourl(fo, headers, 'file:'+file)
-# FTP Range Support
+# FTP Range Support
# Unfortunately, a large amount of base FTP code had to be copied
# from urllib and urllib2 in order to insert the FTP REST command.
-# Code modifications for range support have been commented as
+# Code modifications for range support have been commented as
# follows:
# -- range support modifications start/end here
-from urllib import splitport, splituser, splitpasswd, splitattr, \
- unquote, addclosehook, addinfourl
-import ftplib
-import socket
-import sys
-import mimetypes
-import mimetools
-
-class FTPRangeHandler(urllib2.FTPHandler):
+class FTPRangeHandler(FTPHandler):
def ftp_open(self, req):
host = req.get_host()
if not host:
- raise IOError, ('ftp error', 'no host given')
+ raise IOError('ftp error', 'no host given')
host, port = splitport(host)
if port is None:
port = ftplib.FTP_PORT
@@ -282,11 +290,11 @@ class FTPRangeHandler(urllib2.FTPHandler):
host = unquote(host)
user = unquote(user or '')
passwd = unquote(passwd or '')
-
+
try:
host = socket.gethostbyname(host)
- except socket.error, msg:
- raise urllib2.URLError(msg)
+ except socket.error as msg:
+ raise URLError(msg)
path, attrs = splitattr(req.get_selector())
dirs = path.split('/')
- dirs = map(unquote, dirs)
+ dirs = list(map(unquote, dirs))
@@ -301,22 +309,22 @@ class FTPRangeHandler(urllib2.FTPHandler):
if attr.lower() == 'type' and \
value in ('a', 'A', 'i', 'I', 'd', 'D'):
type = value.upper()
-
+
# -- range support modifications start here
rest = None
- range_tup = range_header_to_tuple(req.headers.get('Range',None))
+ range_tup = range_header_to_tuple(req.headers.get('Range',None))
assert range_tup != ()
if range_tup:
(fb,lb) = range_tup
if fb > 0: rest = fb
# -- range support modifications end here
-
+
fp, retrlen = fw.retrfile(file, type, rest)
-
+
# -- range support modifications start here
if range_tup:
(fb,lb) = range_tup
- if lb == '':
+ if lb == '':
if retrlen is None or retrlen == 0:
raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
lb = retrlen
@@ -328,7 +336,7 @@ class FTPRangeHandler(urllib2.FTPHandler):
retrlen = lb - fb
fp = RangeableFileObject(fp, (0,retrlen))
# -- range support modifications end here
-
+
headers = ""
mtype = mimetypes.guess_type(req.get_full_url())[0]
if mtype:
@@ -338,14 +346,14 @@ class FTPRangeHandler(urllib2.FTPHandler):
- sf = StringIO(headers)
- headers = mimetools.Message(sf)
+ headers = email.message_from_string(headers)
return addinfourl(fp, headers, req.get_full_url())
- except ftplib.all_errors, msg:
- raise IOError, ('ftp error', msg), sys.exc_info()[2]
+ except ftplib.all_errors as msg:
+ raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2])
def connect_ftp(self, user, passwd, host, port, dirs):
fw = ftpwrapper(user, passwd, host, port, dirs)
return fw
-class ftpwrapper(urllib.ftpwrapper):
+class ftpwrapper(urllib_ftpwrapper):
# range support note:
# this ftpwrapper code is copied directly from
# urllib. The only enhancement is to add the rest
@@ -364,22 +372,22 @@ class ftpwrapper(urllib.ftpwrapper):
# Use nlst to see if the file exists at all
try:
self.ftp.nlst(file)
- except ftplib.error_perm, reason:
- raise IOError, ('ftp error', reason), sys.exc_info()[2]
+ except ftplib.error_perm as reason:
+ raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2])
# Restore the transfer mode!
self.ftp.voidcmd(cmd)
# Try to retrieve as a file
try:
cmd = 'RETR ' + file
conn = self.ftp.ntransfercmd(cmd, rest)
- except ftplib.error_perm, reason:
+ except ftplib.error_perm as reason:
if str(reason)[:3] == '501':
# workaround for REST not supported error
fp, retrlen = self.retrfile(file, type)
fp = RangeableFileObject(fp, (rest,''))
return (fp, retrlen)
elif str(reason)[:3] != '550':
- raise IOError, ('ftp error', reason), sys.exc_info()[2]
+ raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2])
if not conn:
# Set transfer mode to ASCII!
self.ftp.voidcmd('TYPE A')
@@ -400,17 +408,17 @@ class ftpwrapper(urllib.ftpwrapper):
_rangere = None
def range_header_to_tuple(range_header):
"""Get a (firstbyte,lastbyte) tuple from a Range header value.
-
+
Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
function pulls the firstbyte and lastbyte values and returns
a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
the header value, it is returned as an empty string in the
tuple.
-
+
Return None if range_header is None
- Return () if range_header does not conform to the range spec
+ Return () if range_header does not conform to the range spec
pattern.
-
+
"""
global _rangere
if range_header is None: return None
@@ -418,9 +426,9 @@ def range_header_to_tuple(range_header):
import re
_rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
match = _rangere.match(range_header)
- if match:
+ if match:
tup = range_tuple_normalize(match.group(1,2))
- if tup and tup[1]:
+ if tup and tup[1]:
tup = (tup[0],tup[1]+1)
return tup
return ()
@@ -433,14 +441,14 @@ def range_tuple_to_header(range_tup):
if range_tup is None: return None
range_tup = range_tuple_normalize(range_tup)
if range_tup:
- if range_tup[1]:
+ if range_tup[1]:
range_tup = (range_tup[0],range_tup[1] - 1)
return 'bytes=%s-%s' % range_tup
-
+
def range_tuple_normalize(range_tup):
"""Normalize a (first_byte,last_byte) range tuple.
Return a tuple whose first element is guaranteed to be an int
- and whose second element will be '' (meaning: the last byte) or
+ and whose second element will be '' (meaning: the last byte) or
an int. Finally, return None if the normalized tuple == (0,'')
as that is equivalent to retrieving the entire file.
"""
@@ -452,12 +460,13 @@ def range_tuple_normalize(range_tup):
# handle last byte
try: lb = range_tup[1]
except IndexError: lb = ''
- else:
+ else:
if lb is None: lb = ''
elif lb != '': lb = int(lb)
# check if range is over the entire file
if (fb,lb) == (0,''): return None
# check that the range is valid
- if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
+ if lb != '' and fb >= lb:
+ raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
return (fb,lb)
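Round-trip behavior of the three helpers, as a quick sketch:

    from urlgrabber.byterange import (range_header_to_tuple,
                                      range_tuple_to_header,
                                      range_tuple_normalize)

    print(range_header_to_tuple('bytes=30-50'))  # (30, 51): end made exclusive
    print(range_header_to_tuple('bytes=30-'))    # (30, ''): open-ended
    print(range_header_to_tuple('junk'))         # (): malformed header
    print(range_header_to_tuple(None))           # None
    print(range_tuple_to_header((30, 51)))       # 'bytes=30-50'
    print(range_tuple_normalize((0, '')))        # None: the whole file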
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 074a82f..af5287b 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -9,15 +9,17 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
"""A high-level cross-protocol url-grabber.
GENERAL ARGUMENTS (kwargs)
@@ -70,7 +72,7 @@ GENERAL ARGUMENTS (kwargs)
are shown but there's no in-progress display.
text = None
-
+
specifies alternative text to be passed to the progress meter
object. If not given, the default progress meter will use the
basename of the file.
@@ -116,7 +118,7 @@ GENERAL ARGUMENTS (kwargs)
the first 10 bytes of the file.
If set to None, no range will be used.
-
+
reget = None [None|'simple'|'check_timestamp']
whether to attempt to reget a partially-downloaded file. Reget
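In 'simple' mode the grabber trusts the bytes already on disk and requests only the remainder; a sketch (URL and path are illustrative):

    from urlgrabber import urlgrab

    urlgrab('http://example.com/big.iso', '/tmp/big.iso', reget='simple')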
@@ -206,7 +208,7 @@ GENERAL ARGUMENTS (kwargs)
option. Note that python 2.2 handles the case of these
badly and if you do not use the proper case (shown here), your
values will be overridden with the defaults.
-
+
urlparser = URLParser()
The URLParser class handles pre-processing of URLs, including
@@ -246,12 +248,12 @@ GENERAL ARGUMENTS (kwargs)
ssl_context = None
No-op when using the curl backend (default)
-
+
ssl_verify_peer = True
Check the server's certificate to make sure it is valid with what our CA validates
-
+
ssl_verify_host = True
Check the server's hostname to make sure it matches the certificate DN
@@ -263,7 +265,7 @@ GENERAL ARGUMENTS (kwargs)
ssl_key_type = 'PEM'
PEM or DER - format of key
-
+
ssl_cert = None
Path to the ssl certificate the client should use to authenticate with
@@ -271,26 +273,26 @@ GENERAL ARGUMENTS (kwargs)
ssl_cert_type = 'PEM'
PEM or DER - format of certificate
-
+
ssl_key_pass = None
password to access the ssl_key
-
+
size = None
- size (in bytes) or Maximum size of the thing being downloaded.
+ Maximum size (in bytes) of the thing being downloaded.
This is mostly to keep us from exploding with an endless datastream
-
+
max_header_size = 2097152
Maximum size (in bytes) of the headers.
-
+
ip_resolve = 'whatever'
What type of name-to-IP resolving to use; the default is to do both IPV4 and
IPV6.
- async = (key, limit)
+ async_ = (key, limit)
When this option is set, the urlgrab() is not processed immediately
but queued. parallel_wait() then processes grabs in parallel, limiting
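A sketch of the renamed option with illustrative key/limit values ('async' became a reserved word in python 3.7; URLGrabberOptions still remaps the old spelling for python2 callers):

    from urlgrabber.grabber import urlgrab, parallel_wait

    urlgrab('http://example.com/a.rpm', '/tmp/a.rpm',
            async_=('example.com', 5))
    urlgrab('http://example.com/b.rpm', '/tmp/b.rpm',
            async_=('example.com', 5))
    parallel_wait()   # drain the queue, at most 5 connections for this key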
@@ -347,7 +349,7 @@ RETRY RELATED ARGUMENTS
retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
if 12 not in retrycodes:
retrycodes.append(12)
-
+
checkfunc = None
a function to do additional checks. This defaults to None, which
@@ -378,7 +380,7 @@ RETRY RELATED ARGUMENTS
function(obj, 'arg1', 2, kwarg=3)
# obj.filename = '/tmp/stuff'
# obj.url = 'http://foo.com/stuff'
-
+
NOTE: both the "args" tuple and "kwargs" dict must be present if
you use this syntax, but either (or both) can be empty.
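A sketch of the (func, args, kwargs) form with the names used above:

    import os
    from urlgrabber import urlgrab
    from urlgrabber.grabber import URLGrabError

    def check_size(obj, min_bytes):
        # negative error codes are reserved for user check functions
        if os.path.getsize(obj.filename) < min_bytes:
            raise URLGrabError(-2, 'file is implausibly small')

    urlgrab('http://foo.com/stuff', '/tmp/stuff',
            checkfunc=(check_size, (1024,), {}), retry=3)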
@@ -437,7 +439,7 @@ RETRY RELATED ARGUMENTS
This callback is very similar to failure_callback. They are
passed the same arguments, so you could use the same function for
both.
-
+
retry_no_cache = False
When True, automatically enable no_cache for future retries if
@@ -515,26 +517,45 @@ BANDWIDTH THROTTLING
"""
-
-
+from __future__ import print_function
+
import os
import sys
-import urlparse
import time
import string
import urllib
-import urllib2
-from httplib import responses
-import mimetools
-import thread
import types
import stat
import pycurl
from ftplib import parse150
-from StringIO import StringIO
-from httplib import HTTPException
import socket, select, fcntl
-from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+from io import BytesIO
+import numbers
+import email
+
+try:
+ import urllib.parse as urlparse
+ urlquote, urlunquote = urlparse.quote, urlparse.unquote
+ from urllib.request import HTTPError, url2pathname, pathname2url
+except ImportError:
+ import urlparse
+ from urllib2 import HTTPError
+ urlquote, urlunquote = urllib.quote, urllib.unquote
+ from urllib import url2pathname, pathname2url
+
+try:
+ from http.client import responses, HTTPException
+except ImportError:
+ from httplib import responses, HTTPException
+
+if sys.version_info >= (3,):
+ # We do an explicit version check here because python2
+ # also has an io module with StringIO, but it is incompatible,
+ # and returns str instead of unicode somewhere.
+ from io import StringIO
+else:
+ from cStringIO import StringIO
+
+from six import text_type, string_types
+
+from .byterange import range_tuple_normalize, range_tuple_to_header, RangeError
try:
import xattr
@@ -543,6 +564,17 @@ try:
except ImportError:
xattr = None
+def _bytes_repr(s):
+ "A wrapper to avoid the b'' that python3 insists on when printing bytes"
+ if isinstance(s, string_types):
+ return s
+ else:
+ return repr(s)[2:-1]
+
+def _urlunquote_convert(s):
+ if not isinstance(s, text_type):
+ s = s.decode('utf8')
+ return urlunquote(s)
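Behavior sketch of the two helpers as used inside the module (python3 semantics shown):

    print(_bytes_repr('plain ascii'))       # 'plain ascii', passed through
    print(_bytes_repr(b'caf\xc3\xa9'))      # prints caf\xc3\xa9, no b'' wrapper
    print(_urlunquote_convert(b'a%20b'))    # 'a b': decoded, then unquoted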
########################################################################
# MODULE INITIALIZATION
@@ -555,9 +587,9 @@ except:
try:
# this part isn't going to do much - need to talk to gettext
- from i18n import _
+ from .i18n import _
-except ImportError, msg:
+except ImportError as msg:
def _(st): return st
-
+
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -587,7 +619,7 @@ def _init_default_logger(logspec=None):
the form
URLGRABBER_DEBUG=level,filename
-
+
where "level" can be either an integer or a log level from the
logging module (DEBUG, INFO, etc). If the integer is zero or
less, logging will be disabled. Filename is the filename where
@@ -600,7 +632,7 @@ def _init_default_logger(logspec=None):
URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
URLGRABBER_DEBUG=INFO # log info and higher to stderr
-
+
This function is called during module initialization. It is not
intended to be called from outside. The only reason it is a
function at all is to keep the module-level namespace tidy and to
@@ -634,7 +666,7 @@ def _log_package_state():
if not DEBUG: return
DEBUG.debug('urlgrabber version = %s' % __version__)
DEBUG.debug('trans function "_" = %s' % _)
-
+
_init_default_logger()
_log_package_state()
@@ -656,7 +688,7 @@ def _(st):
def _to_utf8(obj, errors='replace'):
'''convert 'unicode' to an encoded utf-8 byte string '''
# stolen from yum.i18n
- if isinstance(obj, unicode):
+ if isinstance(obj, text_type):
obj = obj.encode('utf-8', errors)
return obj
@@ -665,7 +697,7 @@ def exception2msg(e):
return str(e)
except UnicodeEncodeError:
# always use byte strings
- return unicode(e).encode('utf8')
+ return text_type(e).encode('utf8')
########################################################################
# END UTILITY FUNCTIONS
@@ -694,7 +726,7 @@ class URLGrabError(IOError):
14 - HTTPError (includes .code and .exception attributes)
15 - user abort
16 - error writing to local file
-
+
MirrorGroup error codes (256 -- 511)
256 - No more mirrors left to try
@@ -705,7 +737,7 @@ class URLGrabError(IOError):
-1 - retry the download, unknown reason
Note: to test which group a code is in, you can simply do integer
- division by 256: e.errno / 256
+ division by 256: e.errno // 256
Negative codes are reserved for use by functions passed in to
retrygrab with checkfunc. The value -1 is built in as a generic
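A tiny sketch of the group test using the py3 floor division:

    from urlgrabber.grabber import urlgrab, URLGrabError

    try:
        urlgrab('http://example.com/missing')     # illustrative URL
    except URLGrabError as e:
        group = e.errno // 256    # 0: grabber error, 1: MirrorGroup error
        print('error group %d: %s' % (group, e))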
@@ -749,7 +781,7 @@ def urlgrab(url, filename=None, **kwargs):
If filename is None, the basename of the url is used.
urlgrab returns the filename of the local file, which may be different
from the passed-in filename if the copy_local kwarg == 0.
-
+
See module documentation for a description of possible kwargs.
"""
return default_grabber.urlgrab(url, filename, **kwargs)
@@ -759,7 +791,7 @@ def urlopen(url, **kwargs):
If a progress object or throttle specifications exist, then
a special file object will be returned that supports them.
The file object can be treated like any other file object.
-
+
See module documentation for a description of possible kwargs.
"""
return default_grabber.urlopen(url, **kwargs)
@@ -769,7 +801,7 @@ def urlread(url, limit=None, **kwargs):
If the limit is exceeded, an exception will be thrown. Note that urlread
is NOT intended to be used as a way of saying "I want the first N bytes"
but rather 'read the whole file into memory, but don't use too much'
-
+
See module documentation for a description of possible kwargs.
"""
return default_grabber.urlread(url, limit, **kwargs)
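A sketch of the limit behavior; errno 8 is the documented 'exceeded limit' code:

    from urlgrabber import urlread
    from urlgrabber.grabber import URLGrabError

    try:
        data = urlread('http://example.com/small.txt', limit=1024 * 1024)
    except URLGrabError as e:
        if e.errno == 8:
            print('more than 1 MiB, refusing to slurp:', e)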
@@ -807,36 +839,39 @@ class URLParser:
"""
url = _to_utf8(url)
quote = opts.quote
-
+
if opts.prefix:
url = self.add_prefix(url, opts.prefix)
-
+
parts = urlparse.urlparse(url)
(scheme, host, path, parm, query, frag) = parts
- if not scheme or (len(scheme) == 1 and scheme in string.letters):
+ if not scheme or (len(scheme) == 1 and scheme in string.ascii_letters.encode('ascii')):
# if a scheme isn't specified, we guess that it's "file:"
- if url[0] not in '/\\': url = os.path.abspath(url)
- url = 'file:' + urllib.pathname2url(url)
+ if url[0] not in b'/\\': url = os.path.abspath(url)
+ pathname = pathname2url(url)
+ if not isinstance(pathname, bytes):
+ pathname = pathname.encode('utf8')
+ url = b'file:' + pathname
parts = urlparse.urlparse(url)
quote = 0 # pathname2url quotes, so we won't do it again
-
- if scheme in ['http', 'https']:
+
+ if scheme in [b'http', b'https']:
parts = self.process_http(parts, url)
-
+
if quote is None:
quote = self.guess_should_quote(parts)
if quote:
parts = self.quote(parts)
-
+
url = urlparse.urlunparse(parts)
return url, parts
def add_prefix(self, url, prefix):
- if prefix[-1] == '/' or url[0] == '/':
+ if prefix.endswith(b'/') or url.startswith(b'/'):
url = prefix + url
else:
- url = prefix + '/' + url
+ url = prefix + b'/' + url
return url
def process_http(self, parts, url):
@@ -853,8 +888,10 @@ class URLParser:
passing into urlgrabber.
"""
(scheme, host, path, parm, query, frag) = parts
- path = urllib.quote(path)
- return (scheme, host, path, parm, query, frag)
+ newpath = urlquote(path, safe='/$')
+ if not isinstance(path, text_type) and isinstance(newpath, text_type):
+ newpath = newpath.encode('utf8')
+ return (scheme, host, newpath, parm, query, frag)
hexvals = '0123456789ABCDEF'
def guess_should_quote(self, parts):
@@ -868,9 +905,11 @@ class URLParser:
else -> 1
"""
(scheme, host, path, parm, query, frag) = parts
+ if not isinstance(path, text_type):
+ path = path.decode('utf8')
if ' ' in path:
return 1
- ind = string.find(path, '%')
+ ind = path.find('%')
if ind > -1:
while ind > -1:
if len(path) < ind+3:
@@ -879,10 +918,10 @@ class URLParser:
if code[0] not in self.hexvals or \
code[1] not in self.hexvals:
return 1
- ind = string.find(path, '%', ind+1)
+ ind = path.find('%', ind+1)
return 0
return 1
-
+
class URLGrabberOptions:
"""Class to ease kwargs handling."""
@@ -895,23 +934,23 @@ class URLGrabberOptions:
if delegate is None:
self._set_defaults()
self._set_attributes(**kwargs)
-
+
def __getattr__(self, name):
if self.delegate and hasattr(self.delegate, name):
return getattr(self.delegate, name)
- raise AttributeError, name
-
+ raise AttributeError(name)
+
def raw_throttle(self):
- """Calculate raw throttle value from throttle and bandwidth
+ """Calculate raw throttle value from throttle and bandwidth
values.
"""
- if self.throttle <= 0:
+ if self.throttle <= 0:
return 0
- elif type(self.throttle) == type(0):
+ elif isinstance(self.throttle, int):
return float(self.throttle)
else: # throttle is a float
return self.bandwidth * self.throttle
-
+
def find_proxy(self, url, scheme):
"""Find the proxy to use for this URL.
Use the proxies dictionary first, then libproxy.
@@ -953,19 +992,21 @@ class URLGrabberOptions:
options specified in kwargs.
"""
return URLGrabberOptions(delegate=self, **kwargs)
-
+
def _set_attributes(self, **kwargs):
"""Update object attributes with those provided in kwargs."""
self.__dict__.update(kwargs)
- if kwargs.has_key('range'):
+ if 'range' in kwargs:
# normalize the supplied range value
self.range = range_tuple_normalize(self.range)
+ if 'async' in kwargs:
+ self.async_ = self.__dict__.pop('async')
if not self.reget in [None, 'simple', 'check_timestamp']:
- raise URLGrabError(11, _('Illegal reget mode: %s') \
- % (self.reget, ))
+ raise URLGrabError(11, _('Illegal reget mode: %s')
+ % (self.reget,))
def _set_defaults(self):
- """Set all options to their default values.
+ """Set all options to their default values.
When adding new options, make sure a default is
provided here.
"""
@@ -1014,7 +1055,7 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
- self.async = None # blocking by default
+ self.async_ = None # blocking by default
self.mirror_group = None
self.max_connections = 5
self.timedhosts = None
@@ -1023,10 +1064,10 @@ class URLGrabberOptions:
self.ftp_disable_epsv = False
self.no_cache = False
self.retry_no_cache = False
-
+
def __repr__(self):
return self.format()
-
+
def format(self, indent=' '):
- keys = self.__dict__.keys()
+ keys = sorted(self.__dict__.keys())
if self.delegate is not None:
@@ -1034,8 +1075,7 @@ class URLGrabberOptions:
- keys.sort()
s = '{\n'
for k in keys:
- s = s + indent + '%-15s: %s,\n' % \
- (repr(k), repr(self.__dict__[k]))
+ s = s + indent + '%-15r: %r,\n' % (k, self.__dict__[k])
if self.delegate:
df = self.delegate.format(indent + ' ')
s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
@@ -1055,19 +1095,19 @@ def _run_callback(cb, obj):
class URLGrabber(object):
"""Provides easy opening of URLs with a variety of options.
-
+
All options are specified as kwargs. Options may be specified when
the class is created and may be overridden on a per request basis.
-
+
New objects inherit default values from default_grabber.
"""
-
+
def __init__(self, **kwargs):
self.opts = URLGrabberOptions(**kwargs)
-
+
def _retry(self, opts, func, *args):
tries = 0
- while 1:
+ while True:
# there are only two ways out of this loop. The second has
# several "sub-ways"
# 1) via the return in the "try" block
@@ -1083,13 +1123,13 @@ class URLGrabber(object):
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
try:
- r = apply(func, (opts,) + args, {})
+ r = func(opts, *args)
if DEBUG: DEBUG.info('success')
return r
- except URLGrabError, e:
+ except URLGrabError as e:
exception = e
callback = opts.failure_callback
- except KeyboardInterrupt, e:
+ except KeyboardInterrupt as e:
exception = e
callback = opts.interrupt_callback
if not callback:
@@ -1105,62 +1145,62 @@ class URLGrabber(object):
if (opts.retry is None) or (tries == opts.retry):
if DEBUG: DEBUG.info('retries exceeded, re-raising')
- raise
+ raise exception
retrycode = getattr(exception, 'errno', None)
if (retrycode is not None) and (retrycode not in opts.retrycodes):
if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
retrycode, opts.retrycodes)
- raise
+ raise exception
if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
opts.no_cache = True
-
+
def urlopen(self, url, opts=None, **kwargs):
"""open the url and return a file object
- If a progress object or throttle value specified when this
- object was created, then a special file object will be
- returned that supports them. The file object can be treated
+ If a progress object or throttle value specified when this
+ object was created, then a special file object will be
+ returned that supports them. The file object can be treated
like any other file object.
"""
url = _to_utf8(url)
opts = (opts or self.opts).derive(**kwargs)
- if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
- (url,parts) = opts.urlparser.parse(url, opts)
+ if DEBUG: DEBUG.debug('combined options: %r' % (opts,))
+ (url,parts) = opts.urlparser.parse(url, opts)
opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
-
+
def urlgrab(self, url, filename=None, opts=None, **kwargs):
"""grab the file at <url> and make a local copy at <filename>
If filename is None, the basename of the url is used.
- urlgrab returns the filename of the local file, which may be
+ urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
url = _to_utf8(url)
opts = (opts or self.opts).derive(**kwargs)
- if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
- (url,parts) = opts.urlparser.parse(url, opts)
+ if DEBUG: DEBUG.debug('combined options: %r' % (opts,))
+ (url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
opts.find_proxy(url, scheme)
if filename is None:
- filename = os.path.basename( urllib.unquote(path) )
+ filename = os.path.basename(_urlunquote_convert(path))
if not filename:
# This is better than nothing.
filename = 'index.html'
if scheme == 'file' and not opts.copy_local:
- # just return the name of the local file - don't make a
+ # just return the name of the local file - don't make a
# copy currently
- path = urllib.url2pathname(path)
+ path = url2pathname(path)
if host:
path = os.path.normpath('//' + host + path)
if not os.path.exists(path):
- err = URLGrabError(2,
+ err = URLGrabError(2,
_('Local file does not exist: %s') % (path, ))
err.url = url
raise err
elif not os.path.isfile(path):
- err = URLGrabError(3,
+ err = URLGrabError(3,
_('Not a normal file: %s') % (path, ))
err.url = url
raise err
@@ -1170,8 +1210,8 @@ class URLGrabber(object):
obj = CallbackObject(filename=path, url=url)
_run_callback(opts.checkfunc, obj)
return path
-
- if opts.async:
+
+ if opts.async_:
opts.url = url
opts.filename = filename
opts.size = int(opts.size or 0)
@@ -1192,29 +1232,29 @@ class URLGrabber(object):
finally:
fo.close()
return filename
-
+
try:
return self._retry(opts, retryfunc, url, filename)
- except URLGrabError, e:
+ except URLGrabError as e:
_TH.update(url, 0, 0, e)
opts.exception = e
return _run_callback(opts.failfunc, opts)
-
+
def urlread(self, url, limit=None, opts=None, **kwargs):
"""read the url into a string, up to 'limit' bytes
If the limit is exceeded, an exception will be thrown. Note
- that urlread is NOT intended to be used as a way of saying
- "I want the first N bytes" but rather 'read the whole file
+ that urlread is NOT intended to be used as a way of saying
+ "I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much'
"""
url = _to_utf8(url)
opts = (opts or self.opts).derive(**kwargs)
- if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
- (url,parts) = opts.urlparser.parse(url, opts)
+ if DEBUG: DEBUG.debug('combined options: %r' % (opts,))
+ (url,parts) = opts.urlparser.parse(url, opts)
opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
-
+
def retryfunc(opts, url, limit):
fo = PyCurlFileObject(url, filename=None, opts=opts)
- s = ''
+ s = b''
@@ -1232,16 +1272,16 @@ class URLGrabber(object):
finally:
fo.close()
return s
-
+
s = self._retry(opts, retryfunc, url, limit)
if limit and len(s) > limit:
- err = URLGrabError(8,
+ err = URLGrabError(8,
_('Exceeded limit (%i): %s') % (limit, url))
err.url = url
raise err
return s
-
+
def _make_callback(self, callback_obj):
# not used, left for compatibility
if callable(callback_obj):
@@ -1257,7 +1297,7 @@ default_grabber = URLGrabber()
class PyCurlFileObject(object):
def __init__(self, url, filename, opts):
self.fo = None
- self._hdr_dump = ''
+ self._hdr_dump = b''
self._parsed_hdr = None
self.url = url
self.scheme = urlparse.urlsplit(self.url)[0]
@@ -1266,9 +1306,9 @@ class PyCurlFileObject(object):
self.reget_time = None
self.opts = opts
if self.opts.reget == 'check_timestamp':
- raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
+ raise NotImplementedError("check_timestamp regets are not implemented in this version of urlgrabber. Please report this.")
self._complete = False
- self._rbuf = ''
+ self._rbuf = b''
self._rbufsize = 1024*8
self._ttime = time.time()
self._tsize = 0
@@ -1282,7 +1322,7 @@ class PyCurlFileObject(object):
self._tm_first = None
self._tm_last = None
self._do_open()
-
+
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
@@ -1291,7 +1331,7 @@ class PyCurlFileObject(object):
if hasattr(self.fo, name):
return getattr(self.fo, name)
- raise AttributeError, name
+ raise AttributeError(name)
def _retrieve(self, buf):
try:
@@ -1304,9 +1344,9 @@ class PyCurlFileObject(object):
if not self._prog_running:
if self.opts.progress_obj:
size = self.size + self._reget_length
- self.opts.progress_obj.start(self._prog_reportname,
- urllib.unquote(self.url),
- self._prog_basename,
+ self.opts.progress_obj.start(self._prog_reportname,
+ _urlunquote_convert(self.url),
+ self._prog_basename,
size=size,
text=self.opts.text)
self._prog_running = True
@@ -1323,31 +1363,31 @@ class PyCurlFileObject(object):
self.fo.write(buf[max(start, 0):stop])
else:
self.fo.write(buf)
- except IOError, e:
+ except IOError as e:
self._cb_error = URLGrabError(16, exception2msg(e))
return -1
return len(buf)
except KeyboardInterrupt:
return -1
-
+
def _hdr_retrieve(self, buf):
if self._hdr_ended:
- self._hdr_dump = ''
+ self._hdr_dump = b''
self.size = 0
self._hdr_ended = False
- if self._over_max_size(cur=len(self._hdr_dump),
+ if self._over_max_size(cur=len(self._hdr_dump),
max_size=self.opts.max_header_size):
return -1
try:
# we have to get the size before we do the progress obj start
# but we can't do that w/o making it do 2 connects, which sucks
# so we cheat and stuff it in here in the hdr_retrieve
- if self.scheme in ['http','https']:
- if buf.lower().find('content-length:') != -1:
- length = buf.split(':')[1]
+ if self.scheme in [b'http', b'https']:
+ if buf.lower().find(b'content-length:') != -1:
+ length = buf.split(b':')[1]
self.size = int(length)
- elif (self.append or self.opts.range) and self._hdr_dump == '' and ' 200 ' in buf:
+ elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 ' in buf:
# reget was attempted but server sends it all
# undo what we did in _build_range()
self.append = False
@@ -1356,28 +1396,28 @@ class PyCurlFileObject(object):
self._reget_length = 0
self._range = self.opts.range
self.fo.truncate(0)
- elif self.scheme in ['ftp']:
+ elif self.scheme in [b'ftp']:
s = None
- if buf.startswith('213 '):
+ if buf.startswith(b'213 '):
s = buf[3:].strip()
if len(s) >= 14:
s = None # ignore MDTM responses
- elif buf.startswith('150 '):
+ elif buf.startswith(b'150 '):
s = parse150(buf)
if s:
self.size = int(s)
-
- if buf.lower().find('location') != -1:
- location = ':'.join(buf.split(':')[1:])
+
+ if buf.lower().find(b'location') != -1:
+ location = b':'.join(buf.split(b':')[1:])
location = location.strip()
self.scheme = urlparse.urlsplit(location)[0]
self.url = location
-
+
self._hdr_dump += buf
- if len(self._hdr_dump) != 0 and buf == '\r\n':
+ if len(self._hdr_dump) != 0 and buf == b'\r\n':
self._hdr_ended = True
if DEBUG: DEBUG.debug('header ended:')
-
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1385,14 +1425,14 @@ class PyCurlFileObject(object):
def _return_hdr_obj(self):
if self._parsed_hdr:
return self._parsed_hdr
- statusend = self._hdr_dump.find('\n')
+ statusend = self._hdr_dump.find(b'\n')
statusend += 1 # ridiculous as it may seem.
- hdrfp = StringIO()
- hdrfp.write(self._hdr_dump[statusend:])
- hdrfp.seek(0)
- self._parsed_hdr = mimetools.Message(hdrfp)
+ hdr_str = self._hdr_dump[statusend:]
+ if not isinstance(hdr_str, str):
+ hdr_str = hdr_str.decode('utf8', 'replace')
+ self._parsed_hdr = email.message_from_string(hdr_str)
return self._parsed_hdr
-
+
hdr = property(_return_hdr_obj)
http_code = property(fget=
lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
@@ -1415,7 +1455,7 @@ class PyCurlFileObject(object):
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-
+
if DEBUG and DEBUG.level <= 10:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
@@ -1429,11 +1469,11 @@ class PyCurlFileObject(object):
self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
if ipr == 'ipv6':
self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
-
+
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
-
+
# timeouts
timeout = 300
if hasattr(opts, 'timeout'):
@@ -1443,7 +1483,7 @@ class PyCurlFileObject(object):
self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
- if self.scheme == 'https':
+ if self.scheme == b'https':
if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
@@ -1458,13 +1498,13 @@ class PyCurlFileObject(object):
self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
# if we have a client side cert - turn off reuse b/c nss is odd
self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
- if opts.ssl_cert_type:
+ if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
#headers:
- if self.scheme in ('http', 'https'):
+ if self.scheme in (b'http', b'https'):
headers = []
if opts.http_headers is not None:
for (tag, content) in opts.http_headers:
@@ -1479,11 +1519,11 @@ class PyCurlFileObject(object):
range_str = self._build_range()
if range_str:
self.curl_obj.setopt(pycurl.RANGE, range_str)
-
+
# throttle/bandwidth
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
-
+
# proxy
if opts.proxy is not None:
self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
@@ -1492,7 +1532,7 @@ class PyCurlFileObject(object):
pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)
if opts.username and opts.password:
- if self.scheme in ('http', 'https'):
+ if self.scheme in (b'http', b'https'):
self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
if opts.username and opts.password:
@@ -1511,50 +1551,50 @@ class PyCurlFileObject(object):
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
-
-
+
+
def _do_perform(self):
if self._complete:
return
-
+
try:
self.curl_obj.perform()
- except pycurl.error, e:
+ except pycurl.error as e:
# XXX - break some of these out a bit more clearly
- # to other URLGrabErrors from
+ # to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
# this covers e.args[0] == 22 pretty well - which will be common
-
+
code = self.http_code
errcode = e.args[0]
- errurl = urllib.unquote(self.url)
-
+ errurl = _urlunquote_convert(self.url)
+
if self._error[0]:
errcode = self._error[0]
-
+
if errcode == 23 and 200 <= code <= 299:
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
# since we cannot know what it is -I'm banking on it being
- # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
# figure out what aborted the pycurl process FIXME
raise getattr(self, '_cb_error', KeyboardInterrupt)
-
+
elif errcode == 28:
err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
err.url = errurl
raise err
-
+
elif errcode == 42:
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
# since we cannot know what it is -I'm banking on it being
- # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
# figure out what aborted the pycurl process FIXME
raise KeyboardInterrupt
-
+
else:
pyerr2str = { 5 : _("Couldn't resolve proxy"),
6 : _("Couldn't resolve host"),
@@ -1602,8 +1642,9 @@ class PyCurlFileObject(object):
}
errstr = str(e.args[1]) or pyerr2str.get(errcode, '<Unknown>')
if code and not 200 <= code <= 299:
- msg = '%s Error %d - %s' % (self.scheme.upper(), code,
- self.scheme in ('http', 'https')
+ scheme = _bytes_repr(self.scheme)
+ msg = '%s Error %d - %s' % (scheme.upper(), code,
+ scheme in ('http', 'https')
and responses.get(code) or errstr)
else:
msg = 'curl#%s - "%s"' % (errcode, errstr)
@@ -1618,7 +1659,7 @@ class PyCurlFileObject(object):
if self._error[1]:
msg = self._error[1]
err = URLGrabError(14, msg)
- err.url = urllib.unquote(self.url)
+ err.url = _urlunquote_convert(self.url)
raise err
def _do_open(self):
@@ -1631,11 +1672,11 @@ class PyCurlFileObject(object):
def _add_headers(self):
pass
-
+
def _build_range(self):
reget_length = 0
rt = None
- if self.opts.reget and type(self.filename) in types.StringTypes:
+ if self.opts.reget and isinstance(self.filename, string_types):
# we have reget turned on and we're dumping to a file
try:
s = os.stat(self.filename)
@@ -1646,19 +1687,19 @@ class PyCurlFileObject(object):
reget_length = s[stat.ST_SIZE]
# Set initial length when regetting
- self._amount_read = reget_length
+ self._amount_read = reget_length
self._reget_length = reget_length # set where we started from, too
rt = reget_length, ''
self.append = 1
-
+
if self.opts.range:
rt = self.opts.range
-
+
if rt[0] is None:
rt = (0, rt[1])
rt = (rt[0] + reget_length, rt[1])
-
+
if rt:
header = range_tuple_to_header(rt)
@@ -1670,10 +1711,10 @@ class PyCurlFileObject(object):
def _make_request(self, req, opener):
#XXXX
# This doesn't do anything really, but we could use this
- # instead of do_open() to catch a lot of crap errors as
+ # instead of do_open() to catch a lot of crap errors as
# mstenner did before here
return (self.fo, self.hdr)
-
+
try:
if self.opts.timeout:
old_to = socket.getdefaulttimeout()
@@ -1685,22 +1726,22 @@ class PyCurlFileObject(object):
else:
fo = opener.open(req)
hdr = fo.info()
- except ValueError, e:
+ except ValueError as e:
err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
err.url = self.url
raise err
- except RangeError, e:
+ except RangeError as e:
err = URLGrabError(9, _('%s on %s') % (e, self.url))
err.url = self.url
raise err
- except urllib2.HTTPError, e:
+ except HTTPError as e:
new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
new_e.code = e.code
new_e.exception = e
new_e.url = self.url
raise new_e
- except IOError, e:
+ except IOError as e:
if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
err.url = self.url
@@ -1710,41 +1751,41 @@ class PyCurlFileObject(object):
err.url = self.url
raise err
- except OSError, e:
+ except OSError as e:
err = URLGrabError(5, _('%s on %s') % (e, self.url))
err.url = self.url
raise err
- except HTTPException, e:
- err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
- (e.__class__.__name__, self.url, e))
+ except HTTPException as e:
+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s')
+ % (e.__class__.__name__, self.url, e))
err.url = self.url
raise err
else:
return (fo, hdr)
-
+
def _do_grab(self):
"""dump the file to a filename or StringIO buffer"""
if self._complete:
return
_was_filename = False
- if type(self.filename) in types.StringTypes and self.filename:
+ if isinstance(self.filename, string_types) and self.filename:
_was_filename = True
self._prog_reportname = str(self.filename)
self._prog_basename = os.path.basename(self.filename)
-
+
if self.append: mode = 'ab'
else: mode = 'wb'
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s'
+ % (self.filename, mode))
try:
self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
+ except IOError as e:
+ err = URLGrabError(16, _('error opening local file from %s, IOError: %s')
+ % (self.url, e))
err.url = self.url
raise err
@@ -1752,20 +1793,20 @@ class PyCurlFileObject(object):
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
-
- self.fo = StringIO()
+
+ self.fo = BytesIO()
# if this is to be a tempfile instead....
# it just makes crap in the tempdir
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
- try:
+ try:
self._do_perform()
- except URLGrabError, e:
+ except URLGrabError as e:
self.fo.flush()
self.fo.close()
raise e
-
+
if _was_filename:
# close it up
self.fo.flush()
@@ -1784,27 +1825,26 @@ class PyCurlFileObject(object):
if mod_time != -1:
try:
os.utime(self.filename, (mod_time, mod_time))
- except OSError, e:
- err = URLGrabError(16, _(\
- 'error setting timestamp on file %s from %s, OSError: %s')
- % (self.filename, self.url, e))
+ except OSError as e:
+ err = URLGrabError(16, _('error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
err.url = self.url
raise err
# re open it
try:
- self.fo = open(self.filename, 'r')
+ self.fo = open(self.filename, 'rb')
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening file from %s, IOError: %s') % (self.url, e))
+ except IOError as e:
+ err = URLGrabError(16, _('error opening file from %s, IOError: %s')
+ % (self.url, e))
err.url = self.url
raise err
-
+
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
self._complete = True
-
+
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1821,9 +1861,9 @@ class PyCurlFileObject(object):
# if we've made it here, then we don't have enough in the buffer
# and we need to read more.
-
+
if not self._complete: self._do_grab() #XXX cheater - change on ranges
-
+
buf = [self._rbuf]
bufsize = len(self._rbuf)
while amt is None or amt:
@@ -1833,23 +1873,23 @@ class PyCurlFileObject(object):
(time.time() - self._ttime)
if diff > 0: time.sleep(diff)
self._ttime = time.time()
-
+
# now read some data, up to self._rbufsize
if amt is None: readamount = self._rbufsize
else: readamount = min(amt, self._rbufsize)
try:
new = self.fo.read(readamount)
- except socket.error, e:
+ except socket.error as e:
err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
err.url = self.url
raise err
- except socket.timeout, e:
+ except socket.timeout as e:
- raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
err.url = self.url
raise err
- except IOError, e:
+ except IOError as e:
- raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e))
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
err.url = self.url
raise err
@@ -1865,7 +1905,7 @@ class PyCurlFileObject(object):
#if self.opts.progress_obj:
# self.opts.progress_obj.update(self._amount_read)
- self._rbuf = string.join(buf, '')
+ self._rbuf = b''.join(buf)
return
def _progress_update(self, download_total, downloaded, upload_total, uploaded):
@@ -1878,7 +1918,7 @@ class PyCurlFileObject(object):
self.opts.progress_obj.update(downloaded)
except (KeyboardInterrupt, IOError):
return -1
-
+
def _over_max_size(self, cur, max_size=None):
if not max_size:
@@ -1896,11 +1936,11 @@ class PyCurlFileObject(object):
self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
return True
return False
-
+
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
- s, self._rbuf = self._rbuf, ''
+ s, self._rbuf = self._rbuf, b''
else:
s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
return s
@@ -1908,13 +1948,13 @@ class PyCurlFileObject(object):
def readline(self, limit=-1):
if not self._complete: self._do_grab()
return self.fo.readline()
-
- i = string.find(self._rbuf, '\n')
+
+ i = self._rbuf.find(b'\n')
while i < 0 and not (0 < limit <= len(self._rbuf)):
L = len(self._rbuf)
self._fill_buffer(L + self._rbufsize)
if not len(self._rbuf) > L: break
- i = string.find(self._rbuf, '\n', L)
+ i = self._rbuf.find(b'\n', L)
if i < 0: i = len(self._rbuf)
else: i = i+1
@@ -1927,12 +1967,12 @@ class PyCurlFileObject(object):
if self._prog_running:
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
-
+
def geturl(self):
""" Provide the geturl() method, used to be got from
urllib.addinfourl, via. urllib.URLopener.* """
return self.url
-
+
if hasattr(pycurl, 'GLOBAL_ACK_EINTR'):
# fail immediately on ctrl-c
pycurl.global_init(pycurl.GLOBAL_DEFAULT | pycurl.GLOBAL_ACK_EINTR)
@@ -1945,7 +1985,7 @@ def reset_curl_obj():
_curl_cache = pycurl.Curl()
_libproxy_cache = None
-
+
#####################################################################
# DEPRECATED FUNCTIONS
@@ -1964,23 +2004,23 @@ def set_progress_obj(new_progress_obj):
def set_user_agent(new_user_agent):
"""Deprecated. Use: default_grabber.user_agent = new_user_agent"""
default_grabber.user_agent = new_user_agent
-
+
def retrygrab(url, filename=None, copy_local=0, close_connection=0,
progress_obj=None, throttle=None, bandwidth=None,
numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
"""Deprecated. Use: urlgrab() with the retry arg instead"""
- kwargs = {'copy_local' : copy_local,
+ kwargs = {'copy_local' : copy_local,
'close_connection' : close_connection,
- 'progress_obj' : progress_obj,
- 'throttle' : throttle,
+ 'progress_obj' : progress_obj,
+ 'throttle' : throttle,
'bandwidth' : bandwidth,
'retry' : numtries,
'retrycodes' : retrycodes,
- 'checkfunc' : checkfunc
+ 'checkfunc' : checkfunc
}
return urlgrab(url, filename, **kwargs)
-
+
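The modern spelling of the retrygrab() defaults, per the deprecation note:

    from urlgrabber import urlgrab

    urlgrab('http://example.com/f.txt', '/tmp/f.txt',
            retry=3, retrycodes=[-1, 2, 4, 5, 6, 7])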
#####################################################################
# Serializer + parser: A replacement of the rather bulky Json code.
#
@@ -1989,27 +2029,31 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
#
#####################################################################
-_quoter_map = {}
-for c in '%[(,)] \n':
- _quoter_map[c] = '%%%02x' % ord(c)
-del c
+def _quoter(c):
+ if c in '%[(,)] \n':
+ return '%%%02x' % ord(c)
+ return c
def _dumps(v):
if v is None: return 'None'
if v is True: return 'True'
if v is False: return 'False'
- if type(v) in (int, long, float):
+ if isinstance(v, numbers.Number):
return str(v)
- if type(v) == unicode:
- v = v.encode('UTF8')
- if type(v) == str:
- def quoter(c): return _quoter_map.get(c, c)
- return "'%s'" % ''.join(map(quoter, v))
- if type(v) == tuple:
+ if isinstance(v, (str, text_type, bytes)):
+ # standardize to str on both py2 and py3
+ if sys.version_info < (3,):
+ if isinstance(v, text_type):
+ v = v.encode('utf8')
+ else:
+ if isinstance(v, bytes):
+ v = v.decode('utf8')
+ return "'%s'" % ''.join(map(_quoter, v))
+ if isinstance(v, tuple):
return "(%s)" % ','.join(map(_dumps, v))
- if type(v) == list:
+ if isinstance(v, list):
return "[%s]" % ','.join(map(_dumps, v))
- raise TypeError, 'Can\'t serialize %s' % v
+ raise TypeError("Can't serialize %s" % v)
def _loads(s):
def decode(v):
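A sketch of the serializer's stable output across py2 and py3 (module-private, so subject to change):

    from urlgrabber.grabber import _dumps

    print(_dumps(None))               # None
    print(_dumps(3.5))                # 3.5
    print(_dumps('a b,c'))            # 'a%20b%2cc'
    print(_dumps((1, ['x', None])))   # (1,['x',None])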
@@ -2063,16 +2107,18 @@ def _readlines(fd):
buf = os.read(fd, 4096)
if not buf: return None
# whole lines only, no buffering
- while buf[-1] != '\n':
+ while not buf.endswith(b'\n'):
buf += os.read(fd, 4096)
- return buf[:-1].split('\n')
+ return buf[:-1].split(b'\n')
import subprocess
class _ExternalDownloader:
def __init__(self):
+ urlgrabber_path = (os.getenv('URLGRABBER_EXT_DOWN', None)
+ or '/usr/libexec/urlgrabber-ext-down')
self.popen = subprocess.Popen(
- '/usr/libexec/urlgrabber-ext-down',
+ urlgrabber_path,
stdin = subprocess.PIPE,
stdout = subprocess.PIPE,
)
@@ -2111,7 +2157,7 @@ class _ExternalDownloader:
self.cnt += 1
self.running[self.cnt] = opts
- os.write(self.stdin, arg +'\n')
+ os.write(self.stdin, (arg +'\n').encode('utf8'))
def perform(self):
ret = []
@@ -2121,22 +2167,22 @@ class _ExternalDownloader:
raise KeyboardInterrupt
for line in lines:
# parse downloader output
- line = line.split(' ', 6)
+ line = line.split(b' ', 6)
_id, size = map(int, line[:2])
if len(line) == 2:
self.running[_id]._progress.update(size)
continue
# job done
opts = self.running.pop(_id)
- if line[4] == 'OK':
+ if line[4] == b'OK':
ug_err = None
if DEBUG: DEBUG.info('success')
else:
ug_err = URLGrabError(int(line[4]), line[6])
- if line[5] != '0':
+ if line[5] != b'0':
ug_err.code = int(line[5])
if DEBUG: DEBUG.info('failure: %s', ug_err)
- _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async[0])
+ _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async_[0])
ret.append((opts, size, ug_err))
return ret
@@ -2220,13 +2266,13 @@ def parallel_wait(meter=None):
opts.tries = tries
try:
dl.start(opts)
- except OSError, e:
+ except OSError as e:
# can't spawn downloader, give up immediately
opts.exception = URLGrabError(5, exception2msg(e))
_run_callback(opts.failfunc, opts)
return
- key, limit = opts.async
+ key, limit = opts.async_
host_con[key] = host_con.get(key, 0) + 1
if opts.progress_obj:
if opts.multi_progress_obj:
@@ -2237,13 +2283,15 @@ def parallel_wait(meter=None):
def perform():
for opts, size, ug_err in dl.perform():
- key, limit = opts.async
+ key, limit = opts.async_
host_con[key] -= 1
if ug_err is None:
if opts.checkfunc:
- try: _run_callback(opts.checkfunc, opts)
- except URLGrabError, ug_err: pass
+ try:
+ _run_callback(opts.checkfunc, opts)
+ except URLGrabError as e:
+ ug_err = e
if opts.progress_obj:
if opts.multi_progress_obj:
@@ -2273,8 +2321,10 @@ def parallel_wait(meter=None):
retry = opts.retry or 0
if opts.failure_callback:
opts.exception = ug_err
- try: _run_callback(opts.failure_callback, opts)
- except URLGrabError, ug_err:
+ try:
+ _run_callback(opts.failure_callback, opts)
+ except URLGrabError as e:
+ ug_err = e
retry = 0 # no retries
if opts.tries < retry and ug_err.errno in opts.retrycodes:
if ug_err.errno < 0 and opts.retry_no_cache:
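
As a usage sketch (not part of the patch): a failure_callback can veto further retries by re-raising, which is exactly the `retry = 0` branch above. The CallbackObject attribute names follow urlgrabber's callback convention:

    def give_up_on_404(obj):
        # obj.exception is the URLGrabError that caused this failure
        if getattr(obj.exception, 'code', None) == 404:
            raise obj.exception    # a hard 404 is not worth retrying
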
@@ -2364,7 +2414,7 @@ def parallel_wait(meter=None):
# update the current mirror and limit
key = best['mirror']
limit = best.get('kwargs', {}).get('max_connections')
- opts.async = key, limit
+ opts.async_ = key, limit
# update URL and proxy
url = mg._join_url(key, opts.relative_url)
@@ -2373,7 +2423,7 @@ def parallel_wait(meter=None):
opts.url = url
# check host limit, then start
- key, limit = opts.async
+ key, limit = opts.async_
if key in single:
limit = 1
while host_con.get(key, 0) >= (limit or 2):
@@ -2382,7 +2432,7 @@ def parallel_wait(meter=None):
DEBUG.info('max_connections(%s): %d/%s', key, host_con.get(key, 0), limit)
start(opts, 1)
- except IOError, e:
+ except IOError as e:
if e.errno != 4: raise
raise KeyboardInterrupt
@@ -2435,7 +2485,7 @@ class _TH:
def update(url, dl_size, dl_time, ug_err, baseurl=None):
# Use hostname from URL. If it's a file:// URL, use baseurl.
# If no baseurl, do not update timedhosts.
- host = urlparse.urlsplit(url).netloc.split('@')[-1] or baseurl
+ host = urlparse.urlsplit(url).netloc.split(b'@')[-1] or baseurl
if not host: return
_TH.load()
@@ -2467,7 +2517,7 @@ class _TH:
_TH.load()
# Use just the hostname, unless it's a file:// baseurl.
- host = urlparse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl
+ host = urlparse.urlsplit(baseurl).netloc.split(b'@')[-1] or baseurl
default_speed = default_grabber.opts.default_speed
try: speed, fail, ts = _TH.hosts[host]
@@ -2483,68 +2533,67 @@ class _TH:
def _main_test():
try: url, filename = sys.argv[1:3]
except ValueError:
- print 'usage:', sys.argv[0], \
- '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
- sys.exit()
+ print('usage:', sys.argv[0],
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
+ sys.exit(2)
kwargs = {}
for a in sys.argv[3:]:
- k, v = string.split(a, '=', 1)
+ k, v = a.split('=', 1)
kwargs[k] = int(v)
set_throttle(1.0)
set_bandwidth(32 * 1024)
- print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
- default_grabber.bandwidth)
+ print("throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
+ default_grabber.bandwidth))
- try: from progress import text_progress_meter
- except ImportError, e: pass
+ try: from .progress import text_progress_meter
+ except ImportError as e: pass
else: kwargs['progress_obj'] = text_progress_meter()
- try: name = apply(urlgrab, (url, filename), kwargs)
- except URLGrabError, e: print e
- else: print 'LOCAL FILE:', name
+ try: name = urlgrab(url, filename, **kwargs)
+ except URLGrabError as e: print(e)
+ else: print('LOCAL FILE:', name)
def _retry_test():
try: url, filename = sys.argv[1:3]
except ValueError:
- print 'usage:', sys.argv[0], \
- '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
- sys.exit()
+ print('usage:', sys.argv[0],
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
+ sys.exit(2)
kwargs = {}
for a in sys.argv[3:]:
- k, v = string.split(a, '=', 1)
+ k, v = a.split('=', 1)
kwargs[k] = int(v)
- try: from progress import text_progress_meter
- except ImportError, e: pass
+ try: from .progress import text_progress_meter
+ except ImportError as e: pass
else: kwargs['progress_obj'] = text_progress_meter()
def cfunc(filename, hello, there='foo'):
- print hello, there
+ print(hello, there)
import random
rnum = random.random()
if rnum < .5:
- print 'forcing retry'
+ print('forcing retry')
raise URLGrabError(-1, 'forcing retry')
if rnum < .75:
- print 'forcing failure'
+ print('forcing failure')
raise URLGrabError(-2, 'forcing immediate failure')
- print 'success'
+ print('success')
return
-
+
kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
- try: name = apply(retrygrab, (url, filename), kwargs)
- except URLGrabError, e: print e
- else: print 'LOCAL FILE:', name
+ try: name = retrygrab(url, filename, **kwargs)
+ except URLGrabError as e: print(e)
+ else: print('LOCAL FILE:', name)
def _file_object_test(filename=None):
- import cStringIO
if filename is None:
filename = __file__
- print 'using file "%s" for comparisons' % filename
+ print('using file "%s" for comparisons' % filename)
fo = open(filename)
s_input = fo.read()
fo.close()
@@ -2553,17 +2602,17 @@ def _file_object_test(filename=None):
_test_file_object_readall,
_test_file_object_readline,
_test_file_object_readlines]:
- fo_input = cStringIO.StringIO(s_input)
- fo_output = cStringIO.StringIO()
+ fo_input = StringIO(s_input)
+ fo_output = StringIO()
wrapper = PyCurlFileObject(fo_input, None, 0)
- print 'testing %-30s ' % testfunc.__name__,
+ print('testing %-30s ' % testfunc.__name__, end=' ')
testfunc(wrapper, fo_output)
s_output = fo_output.getvalue()
- if s_output == s_input: print 'passed'
- else: print 'FAILED'
-
+ if s_output == s_input: print('passed')
+ else: print('FAILED')
+
def _test_file_object_smallread(wrapper, fo_output):
- while 1:
+ while True:
s = wrapper.read(23)
fo_output.write(s)
if not s: return
@@ -2573,14 +2622,14 @@ def _test_file_object_readall(wrapper, fo_output):
fo_output.write(s)
def _test_file_object_readline(wrapper, fo_output):
- while 1:
+ while True:
s = wrapper.readline()
fo_output.write(s)
if not s: return
def _test_file_object_readlines(wrapper, fo_output):
li = wrapper.readlines()
- fo_output.write(string.join(li, ''))
+ fo_output.write(''.join(li))
if __name__ == '__main__':
_main_test()
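
The pervasive `async` -> `async_` rename in this file exists because `async` became a reserved keyword in Python 3.7, so `opts.async` is no longer valid syntax. A quick standalone check (illustration only):

    import sys
    try:
        compile("opts.async = 1", "<test>", "exec")
        print("'async' still allowed on %d.%d" % sys.version_info[:2])
    except SyntaxError:
        print("'async' is a keyword on %d.%d" % sys.version_info[:2])
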
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index 988a309..75f0bcb 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -9,9 +9,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -93,14 +93,27 @@ CUSTOMIZATION
import sys
import random
-import thread # needed for locking to make this threadsafe
-from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
-from grabber import _run_callback, _do_raise
-from grabber import exception2msg
-from grabber import _TH
+if sys.version_info >= (3,):
+ # We use a version check because python2 also has _thread
+ import _thread as thread
+else:
+ import thread
-def _(st):
+try:
+ import urllib.parse as urlparse
+except ImportError:
+ import urlparse
+
+from six import string_types
+
+from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from .grabber import _run_callback, _do_raise
+from .grabber import exception2msg
+from .grabber import _TH
+from .grabber import _bytes_repr
+
+def _(st):
return st
class GrabRequest:
@@ -142,7 +155,7 @@ class MirrorGroup:
In addition to the required arguments "grabber" and "mirrors",
MirrorGroup also takes the following optional arguments:
-
+
default_action
A dict that describes the actions to be taken upon failure
@@ -173,7 +186,7 @@ class MirrorGroup:
or by returning an action dict from the failure_callback
return {'fail':0}
in increasing precedence.
-
+
If all three of these were done, the net result would be:
{'increment': 0, # set in method
'increment_master': 1, # class default
@@ -278,19 +291,14 @@ class MirrorGroup:
# methods, they will be stripped before getting passed on to the
# grabber
options = ['default_action', 'failure_callback']
-
+
def _process_kwargs(self, kwargs):
self.failure_callback = kwargs.get('failure_callback')
self.default_action = kwargs.get('default_action')
-
+
def _parse_mirrors(self, mirrors):
- parsed_mirrors = []
- for m in mirrors:
- if isinstance(m, basestring):
- m = {'mirror': _to_utf8(m)}
- parsed_mirrors.append(m)
- return parsed_mirrors
-
+ return [{'mirror':_to_utf8(m)} for m in mirrors]
+
def _load_gr(self, gr):
# OVERRIDE IDEAS:
# shuffle gr list
@@ -316,7 +324,7 @@ class MirrorGroup:
# the callback)
cb = gr.kw.get('failure_callback') or self.failure_callback
if cb:
- if type(cb) == type( () ):
+ if isinstance(cb, tuple):
cb, args, kwargs = cb
else:
args, kwargs = (), {}
@@ -351,7 +359,7 @@ class MirrorGroup:
urlopen, there's no good way for the mirror group to know that
an error occurs mid-download (it's already returned and given
you the file object).
-
+
remove --- can have several values
0 do not remove the mirror from the list
1 remove the mirror for this download only
@@ -373,7 +381,7 @@ class MirrorGroup:
self._next += 1
if self._next >= len(self.mirrors): self._next = 0
self._lock.release()
-
+
if action.get('remove', 1):
del gr.mirrors[gr._next]
elif action.get('increment', 1):
@@ -381,9 +389,9 @@ class MirrorGroup:
if gr._next >= len(gr.mirrors): gr._next = 0
if DEBUG:
- grm = [m['mirror'] for m in gr.mirrors]
+ grm = [m['mirror'].decode() for m in gr.mirrors]
DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
- selfm = [m['mirror'] for m in self.mirrors]
+ selfm = [m['mirror'].decode() for m in self.mirrors]
DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
#####################################################################
@@ -394,11 +402,17 @@ class MirrorGroup:
# by overriding the configuration methods :)
def _join_url(self, base_url, rel_url):
- if base_url.endswith('/') or rel_url.startswith('/'):
- return base_url + rel_url
+ (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
+
+ if isinstance(base_url, bytes):
+ if not isinstance(rel_url, bytes):
+ rel_url = rel_url.encode('utf8')
+ sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/'
else:
- return base_url + '/' + rel_url
-
+ sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
+ return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))
+
def _mirror_try(self, func, url, kw):
gr = GrabRequest()
gr.func = func
@@ -412,7 +426,7 @@ class MirrorGroup:
except KeyError: pass
tries = 0
- while 1:
+ while True:
tries += 1
mirrorchoice = self._get_mirror(gr)
fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
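
Illustration only, not part of the patch: the str branch of the new `_join_url`, standalone. Rebuilding the URL with urlunsplit keeps any query or fragment on the mirror URL in place, which the old string concatenation did not:

    try:
        import urllib.parse as urlparse   # py3
    except ImportError:
        import urlparse                   # py2

    def join_url(base_url, rel_url):
        scheme, netloc, path, query, fragid = urlparse.urlsplit(base_url)
        sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
        return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url,
                                    query, fragid))

    print(join_url('http://mirror.example/repo', 'x86_64/pkg.rpm'))
    # -> http://mirror.example/repo/x86_64/pkg.rpm
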
@@ -420,10 +434,10 @@ class MirrorGroup:
# apply mirrorchoice kwargs on top of grabber.opts
opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
func_ref = getattr(grabber, func)
- if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+ if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl))
try:
return func_ref( *(fullurl,), opts=opts, **kw )
- except URLGrabError, e:
+ except URLGrabError as e:
if DEBUG: DEBUG.info('MIRROR: failed')
gr.errors.append((fullurl, exception2msg(e)))
obj = CallbackObject()
@@ -437,7 +451,7 @@ class MirrorGroup:
def urlgrab(self, url, filename=None, **kwargs):
kw = dict(kwargs)
kw['filename'] = filename
- if kw.get('async'):
+ if kw.get('async_') or kw.get('async'):
# enable mirror failovers in async path
kw['mirror_group'] = self, [], {}, set()
kw['relative_url'] = url
@@ -446,10 +460,10 @@ class MirrorGroup:
func = 'urlgrab'
try:
return self._mirror_try(func, url, kw)
- except URLGrabError, e:
+ except URLGrabError as e:
obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
return _run_callback(kwargs.get('failfunc', _do_raise), obj)
-
+
def urlopen(self, url, **kwargs):
kw = dict(kwargs)
func = 'urlopen'
@@ -460,7 +474,7 @@ class MirrorGroup:
kw['limit'] = limit
func = 'urlread'
return self._mirror_try(func, url, kw)
-
+
class MGRandomStart(MirrorGroup):
"""A mirror group that starts at a random mirror in the list.
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index 9b77c54..5b4c450 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -9,23 +9,31 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+from __future__ import print_function
import sys
import time
import math
-import thread
import fcntl
import struct
import termios
+if sys.version_info >= (3,):
+ # We use a version check because python2 also has _thread
+ import _thread as thread
+else:
+ import thread
+
+from six import integer_types, string_types
+
# Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html
def terminal_width(fd=1):
""" Get the real terminal width """
@@ -107,7 +115,7 @@ class BaseMeter:
self.last_amount_read = 0
self.last_update_time = None
self.re = RateEstimator()
-
+
def start(self, filename=None, url=None, basename=None,
size=None, now=None, text=None):
self.filename = filename
@@ -125,7 +133,7 @@ class BaseMeter:
self.last_amount_read = 0
self.last_update_time = now
self._do_start(now)
-
+
def _do_start(self, now=None):
pass
@@ -152,7 +160,7 @@ class BaseMeter:
def _do_end(self, amount_read, now=None):
pass
-
+
# This is kind of a hack, but progress is gotten from grabber which doesn't
# know about the total size to download. So we do this so we can get the data
# out of band here. This will be "fixed" one way or another soon.
@@ -167,7 +175,7 @@ def text_meter_total_size(size, downloaded=0):
#
# update: No size (minimal: 17 chars)
# -----------------------------------
-# <text> <rate> | <current size> <elapsed time>
+# <text> <rate> | <current size> <elapsed time>
# 8-48 1 8 3 6 1 9 5
#
# Order: 1. <text>+<current size> (17)
@@ -202,7 +210,7 @@ def text_meter_total_size(size, downloaded=0):
#
# end
# ---
-# <text> | <current size> <elapsed time>
+# <text> | <current size> <elapsed time>
# 8-56 3 6 1 9 5
#
# Order: 1. <text> ( 8)
@@ -360,7 +368,7 @@ class MultiFileMeter:
else:
self._lock = _FakeLock()
self.update_period = 0.3 # seconds
-
+
self.numfiles = None
self.finished_files = 0
self.failed_files = 0
@@ -393,7 +401,7 @@ class MultiFileMeter:
if now is None: now = time.time()
self.re.update(self._amount_read(), now)
self._do_end(now)
-
+
def _do_end(self, now):
pass
@@ -406,10 +414,10 @@ class MultiFileMeter:
newmeter = self.helperclass(self)
self.meters.append(newmeter)
return newmeter
-
+
def removeMeter(self, meter):
self.meters.remove(meter)
-
+
###########################################################
# child functions - these should only be called by helpers
def start_meter(self, meter, now):
@@ -423,10 +431,10 @@ class MultiFileMeter:
finally:
self._lock.release()
self._do_start_meter(meter, now)
-
+
def _do_start_meter(self, meter, now):
pass
-
+
def update_meter(self, meter, now):
if not meter in self.meters:
raise ValueError('attempt to use orphaned meter')
@@ -507,7 +515,7 @@ class TextMultiFileMeter(MultiFileMeter):
# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1
# end
# ---
-# <text> | <file size> <file elapsed time>
+# <text> | <file size> <file elapsed time>
# 8-56 3 6 1 9 5
def _do_update_meter(self, meter, now):
self._lock.acquire()
@@ -606,7 +614,7 @@ class TextMultiFileMeter(MultiFileMeter):
try:
format = "%-30.30s %6.6s %s"
fn = meter.text or meter.basename
- if type(message) in (type(''), type(u'')):
+ if isinstance(message, string_types):
message = message.splitlines()
if not message: message = ['']
out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
@@ -622,7 +630,7 @@ class TextMultiFileMeter(MultiFileMeter):
pass
finally:
self._lock.release()
-
+
######################################################################
# support classes and functions
@@ -637,7 +645,7 @@ class RateEstimator:
self.last_update_time = now
self.last_amount_read = 0
self.ave_rate = None
-
+
def update(self, amount_read, now=None):
if now is None: now = time.time()
# libcurl calls the progress callback when fetching headers
@@ -657,11 +665,11 @@ class RateEstimator:
# First update, on reget is the file size
if self.last_amount_read:
self.last_update_time = now
- self.ave_rate = self._temporal_rolling_ave(\
+ self.ave_rate = self._temporal_rolling_ave(
time_diff, read_diff, self.ave_rate, self.timescale)
self.last_amount_read = amount_read
#print 'results', time_diff, read_diff, self.ave_rate
-
+
#####################################################################
# result methods
def average_rate(self):
@@ -697,14 +705,14 @@ class RateEstimator:
epsilon = time_diff / timescale
if epsilon > 1: epsilon = 1.0
return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
-
+
def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
"""perform a "rolling average" iteration
a rolling average "folds" new data into an existing average with
some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
a value of 0.0 means only the old value (initial value) counts,
and a value of 1.0 means only the newest value is considered."""
-
+
try:
recent_rate = read_diff / time_diff
except ZeroDivisionError:
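
Illustration only, not part of the patch: one rolling-average step as described in the docstring above (epsilon = 0.0 keeps the old average, 1.0 takes only the newest rate), standalone:

    def rolling_ave(time_diff, read_diff, last_ave, epsilon):
        try:
            recent_rate = read_diff / time_diff
        except ZeroDivisionError:
            recent_rate = None
        if last_ave is None:
            return recent_rate
        if recent_rate is None:
            return last_ave
        # fold the new rate into the old average with weight epsilon
        return last_ave * (1.0 - epsilon) + recent_rate * epsilon

    print(rolling_ave(2.0, 1024.0, 400.0, 0.5))   # 456.0
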
@@ -733,7 +741,7 @@ class RateEstimator:
rt = int(rt)
if shift <= 0: return rt
return float(int(rt) >> shift << shift)
-
+
def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0:
@@ -743,15 +751,15 @@ def format_time(seconds, use_hours=0):
return 'Infinite'
else:
seconds = int(seconds)
- minutes = seconds / 60
+ minutes = seconds // 60
seconds = seconds % 60
if use_hours:
- hours = minutes / 60
+ hours = minutes // 60
minutes = minutes % 60
return '%02i:%02i:%02i' % (hours, minutes, seconds)
else:
return '%02i:%02i' % (minutes, seconds)
-
+
def format_number(number, SI=0, space=' '):
"""Turn numbers into human-readable metric-like numbers"""
symbols = ['', # (none)
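
The `/` -> `//` changes in the format_time hunk above matter because `/` is true division on py3, so `minutes` would become a float; `//` keeps the integer floor division py2 had. A standalone sketch (illustration only):

    def format_time(seconds, use_hours=0):
        seconds = int(seconds)
        minutes, seconds = seconds // 60, seconds % 60
        if use_hours:
            hours, minutes = minutes // 60, minutes % 60
            return '%02i:%02i:%02i' % (hours, minutes, seconds)
        return '%02i:%02i' % (minutes, seconds)

    print(format_time(3725, use_hours=1))   # 01:02:05
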
@@ -763,14 +771,14 @@ def format_number(number, SI=0, space=' '):
'E', # exa
'Z', # zetta
'Y'] # yotta
-
+
if SI: step = 1000.0
else: step = 1024.0
thresh = 999
depth = 0
max_depth = len(symbols) - 1
-
+
# we want numbers between 0 and thresh, but don't exceed the length
# of our list. In that event, the formatting will be screwed up,
# but it'll still show the right number.
@@ -778,7 +786,7 @@ def format_number(number, SI=0, space=' '):
depth = depth + 1
number = number / step
- if type(number) == type(1) or type(number) == type(1L):
+ if isinstance(number, integer_types):
# it's an int or a long, which means it didn't get divided,
# which means it's already short enough
format = '%i%s%s'
@@ -788,7 +796,7 @@ def format_number(number, SI=0, space=' '):
format = '%.1f%s%s'
else:
format = '%.0f%s%s'
-
+
return(format % (float(number or 0), space, symbols[depth]))
def _tst(fn, cur, tot, beg, size, *args):
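
Illustration only, not part of the patch: a condensed standalone version of format_number (the 9.95 cutoff is an assumption of this sketch). It shows why the integer check the patch ports from type(1)/type(1L) to six.integer_types is needed: an int that never crossed the threshold was never divided, so it prints without a decimal:

    def format_number(number, SI=0, space=' '):
        symbols = ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
        step = 1000.0 if SI else 1024.0
        depth = 0
        while number > 999 and depth < len(symbols) - 1:
            depth += 1
            number = number / step
        if isinstance(number, int):
            fmt = '%i%s%s'       # undivided int: no decimal point
        elif number < 9.95:
            fmt = '%.1f%s%s'
        else:
            fmt = '%.0f%s%s'
        return fmt % (float(number or 0), space, symbols[depth])

    print(format_number(512))     # '512 '
    print(format_number(9216))    # '9.0 k'
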
@@ -806,7 +814,7 @@ def _tst(fn, cur, tot, beg, size, *args):
tm.end(size)
def _mtst(datas, *args):
- print '-' * 79
+ print('-' * 79)
tm = TextMultiFileMeter(threaded=False)
dl_sizes = {}
@@ -850,8 +858,8 @@ def _mtst(datas, *args):
assert not tm.meters
if __name__ == "__main__":
- # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
- # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
+ # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
+ # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
if len(sys.argv) >= 2 and sys.argv[1] == 'multi':
_mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
("s-1.0.1-1.fc8.i386.rpm", 5000),