diff options
author | Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> | 2019-02-11 20:46:45 +0100 |
---|---|---|
committer | Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> | 2019-02-11 20:59:29 +0100 |
commit | 4d97612f94e9d0799ad7786b6e7c6865ac21cd46 (patch) | |
tree | 8cc467b20f321ac22de13d2b79f7067076985ef1 | |
parent | 0772e448446ca1252a4b1012ad2f9d2c96e29d4c (diff) | |
download | urlgrabber-4d97612f94e9d0799ad7786b6e7c6865ac21cd46.tar.gz |
Remove trailing whitespace
-rw-r--r-- | test/grabberperf.py | 18 | ||||
-rw-r--r-- | test/munittest.py | 10 | ||||
-rw-r--r-- | test/runtests.py | 16 | ||||
-rw-r--r-- | test/test_byterange.py | 44 | ||||
-rw-r--r-- | test/test_grabber.py | 78 | ||||
-rw-r--r-- | test/test_mirror.py | 32 | ||||
-rw-r--r-- | test/threading/batchgrabber.py | 24 | ||||
-rw-r--r-- | urlgrabber/byterange.py | 108 | ||||
-rw-r--r-- | urlgrabber/grabber.py | 254 | ||||
-rw-r--r-- | urlgrabber/mirror.py | 28 | ||||
-rw-r--r-- | urlgrabber/progress.py | 54 |
11 files changed, 333 insertions, 333 deletions
diff --git a/test/grabberperf.py b/test/grabberperf.py index 6eeaf71..4cf26f6 100644 --- a/test/grabberperf.py +++ b/test/grabberperf.py @@ -11,9 +11,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -46,7 +46,7 @@ def main(): # remove temp files os.unlink(tempsrc) os.unlink(tempdst) - + def setuptemp(size): if DEBUG: print 'writing %d KB to temporary file (%s).' % (size / 1024, tempsrc) file = open(tempsrc, 'w', 1024) @@ -55,7 +55,7 @@ def setuptemp(size): file.write(chars[i % 10]) file.flush() file.close() - + def speedtest(size): setuptemp(size) full_times = [] @@ -70,7 +70,7 @@ def speedtest(size): print 'not using progress meter' else: tpm = text_progress_meter(fo=open('/dev/null', 'w')) - + # to address concerns that the overhead from the progress meter # and throttling slow things down, we do this little test. # @@ -81,16 +81,16 @@ def speedtest(size): # note: it _is_ even slower to direct the progress meter to a real # tty or file, but I'm just interested in the overhead from _this_ # module. - + # get it nicely cached before we start comparing if DEBUG: print 'pre-caching' for i in range(100): urlgrab(tempsrc, tempdst, copy_local=1, throttle=None, proxies=proxies) - + if DEBUG: print 'running speed test.' reps = 500 for i in range(reps): - if DEBUG: + if DEBUG: print '\r%4i/%-4i' % (i+1, reps), sys.stdout.flush() t = time.time() diff --git a/test/munittest.py b/test/munittest.py index 437248d..f79a7cb 100644 --- a/test/munittest.py +++ b/test/munittest.py @@ -463,12 +463,12 @@ class TestSuite: self._tests = [] self.addTests(tests) self.description = description or '(no description)' - + def __repr__(self): return "<%s tests=%s>" % (_strclass(self.__class__), self._tests) __str__ = __repr__ - + def shortDescription(self): return self.description @@ -494,7 +494,7 @@ class TestSuite: def __call__(self, result): try: result.startSuite(self) except AttributeError: pass - + for test in self._tests: if result.shouldStop: break @@ -575,7 +575,7 @@ class TestLoader: description = (description.splitlines()[0]).strip() suite = self.suiteClass(instance_list, description) return suite - + def loadTestsFromModule(self, module): """Return a suite of all tests cases contained in the given module""" tests = [] @@ -735,7 +735,7 @@ class _TextTestResult(TestResult): except AttributeError: desc = '(no description)' self.stream.writeln(desc) self.depth += 1 - + def startTest(self, test): TestResult.startTest(self, test) if self.showAll: diff --git a/test/runtests.py b/test/runtests.py index c48bd1d..562edc0 100644 --- a/test/runtests.py +++ b/test/runtests.py @@ -1,19 +1,19 @@ #!/usr/bin/python """Usage: python runtests.py [OPTIONS] -Quick script to run all unit tests from source directory +Quick script to run all unit tests from source directory (e.g. without having to install.) OPTIONS: - - -d, --descriptions=NUM Set to 0 to turn off printing + + -d, --descriptions=NUM Set to 0 to turn off printing test doc strings as descriptions. -v, --verbosity=NUM Output verbosity level. Defaults to - 2 which is one line of info per test. Set + 2 which is one line of info per test. Set to 1 to get one char of info per test or 0 to disable status output completely. """ - + # $Id: runtests.py,v 1.7 2004/03/31 17:02:00 mstenner Exp $ import sys @@ -31,7 +31,7 @@ def main(): # it's okay to import now that sys.path is setup. import test_grabber, test_byterange, test_mirror suite = TestSuite( (test_grabber.suite(), - test_byterange.suite(), + test_byterange.suite(), test_mirror.suite()) ) suite.description = 'urlgrabber tests' runner = TextTestRunner(stream=sys.stdout, @@ -52,9 +52,9 @@ def parse_args(): elif o in ('-v', '--verbosity'): verbosity = int(a) return (descriptions,verbosity) - + def usage(): print __doc__ - + if __name__ == '__main__': main() diff --git a/test/test_byterange.py b/test/test_byterange.py index 0f75807..3322b1b 100644 --- a/test/test_byterange.py +++ b/test/test_byterange.py @@ -11,9 +11,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -32,17 +32,17 @@ from base_test_code import * class RangeableFileObjectTestCase(TestCase): """Test range.RangeableFileObject class""" - + def setUp(self): # 0 1 2 3 4 5 6 7 8 9 # 0123456789012345678901234567890123456789012345678901234567 890123456789012345678901234567890 self.test = 'Why cannot we write the entire 24 volumes of Encyclopaedia\nBrittanica on the head of a pin?\n' self.fo = StringIO(self.test) self.rfo = RangeableFileObject(self.fo, (20,69)) - + def tearDown(self): pass - + def test_seek(self): """RangeableFileObject.seek()""" self.rfo.seek(11) @@ -51,25 +51,25 @@ class RangeableFileObjectTestCase(TestCase): self.assertEquals('volumes', self.rfo.read(7)) self.rfo.seek(1,1) self.assertEquals('of', self.rfo.read(2)) - + def test_read(self): """RangeableFileObject.read()""" self.assertEquals('the', self.rfo.read(3)) self.assertEquals(' entire 24 volumes of ', self.rfo.read(22)) self.assertEquals('Encyclopaedia\nBrittanica', self.rfo.read(50)) self.assertEquals('', self.rfo.read()) - + def test_readall(self): """RangeableFileObject.read(): to end of file.""" rfo = RangeableFileObject(StringIO(self.test),(11,)) self.assertEquals(self.test[11:],rfo.read()) - + def test_readline(self): """RangeableFileObject.readline()""" self.assertEquals('the entire 24 volumes of Encyclopaedia\n', self.rfo.readline()) self.assertEquals('Brittanica', self.rfo.readline()) self.assertEquals('', self.rfo.readline()) - + def test_tell(self): """RangeableFileObject.tell()""" self.assertEquals(0,self.rfo.tell()) @@ -77,20 +77,20 @@ class RangeableFileObjectTestCase(TestCase): self.assertEquals(5,self.rfo.tell()) self.rfo.readline() self.assertEquals(39,self.rfo.tell()) - + class RangeModuleTestCase(TestCase): """Test module level functions defined in range.py""" def setUp(self): pass - + def tearDown(self): pass - + def test_range_tuple_normalize(self): """byterange.range_tuple_normalize()""" from urlgrabber.byterange import range_tuple_normalize from urlgrabber.byterange import RangeError - tests = ( + tests = ( ((None,50), (0,50)), ((500,600), (500,600)), ((500,), (500,'')), @@ -101,15 +101,15 @@ class RangeModuleTestCase(TestCase): ) for test, ex in tests: self.assertEquals( range_tuple_normalize(test), ex ) - + try: range_tuple_normalize( (10,8) ) except RangeError: pass else: self.fail("range_tuple_normalize( (10,8) ) should have raised RangeError") - + def test_range_header_to_tuple(self): """byterange.range_header_to_tuple()""" from urlgrabber.byterange import range_header_to_tuple - tests = ( + tests = ( ('bytes=500-600', (500,601)), ('bytes=500-', (500,'')), ('bla bla', ()), @@ -117,11 +117,11 @@ class RangeModuleTestCase(TestCase): ) for test, ex in tests: self.assertEquals( range_header_to_tuple(test), ex ) - + def test_range_tuple_to_header(self): """byterange.range_tuple_to_header()""" from urlgrabber.byterange import range_tuple_to_header - tests = ( + tests = ( ((500,600), 'bytes=500-599'), ((500,''), 'bytes=500-'), ((500,), 'bytes=500-'), @@ -131,15 +131,15 @@ class RangeModuleTestCase(TestCase): ) for test, ex in tests: self.assertEquals( range_tuple_to_header(test), ex ) - + try: range_tuple_to_header( ('not an int',500) ) except ValueError: pass else: self.fail("range_tuple_to_header( ('not an int',500) ) should have raised ValueError") - + try: range_tuple_to_header( (0,'not an int') ) except ValueError: pass else: self.fail("range_tuple_to_header( (0, 'not an int') ) should have raised ValueError") - + def suite(): tl = TestLoader() return tl.loadTestsFromModule(sys.modules[__name__]) diff --git a/test/test_grabber.py b/test/test_grabber.py index 8e45d25..2bfb1b8 100644 --- a/test/test_grabber.py +++ b/test/test_grabber.py @@ -11,9 +11,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -38,7 +38,7 @@ from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject, \ from urlgrabber.progress import text_progress_meter class FileObjectTests(TestCase): - + def setUp(self): self.filename = tempfile.mktemp() fo = file(self.filename, 'wb') @@ -83,7 +83,7 @@ class FileObjectTests(TestCase): self.fo_output.write(s) if not s: break self.assert_(reference_data == self.fo_output.getvalue()) - + class HTTPTests(TestCase): def test_reference_file(self): "download reference file via HTTP" @@ -110,41 +110,41 @@ class URLGrabberModuleTestCase(TestCase): """Test module level functions defined in grabber.py""" def setUp(self): pass - + def tearDown(self): pass - + def test_urlopen(self): "module-level urlopen() function" fo = urlgrabber.urlopen('http://www.python.org') fo.close() - + def test_urlgrab(self): "module-level urlgrab() function" outfile = tempfile.mktemp() - filename = urlgrabber.urlgrab('http://www.python.org', + filename = urlgrabber.urlgrab('http://www.python.org', filename=outfile) os.unlink(outfile) - + def test_urlread(self): "module-level urlread() function" s = urlgrabber.urlread('http://www.python.org') - + class URLGrabberTestCase(TestCase): """Test grabber.URLGrabber class""" - + def setUp(self): - + self.meter = text_progress_meter( fo=cStringIO.StringIO() ) pass - + def tearDown(self): pass - + def testKeywordArgs(self): """grabber.URLGrabber.__init__() **kwargs handling. - + This is a simple test that just passes some arbitrary values into the URLGrabber constructor and checks that they've been set properly. @@ -171,8 +171,8 @@ class URLGrabberTestCase(TestCase): self.assertEquals( opts.user_agent, 'test ua/1.0' ) self.assertEquals( opts.proxies, {'http' : 'http://www.proxy.com:9090'} ) self.assertEquals( opts.opener, opener ) - - nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5, + + nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5, copy_local=0) self.assertEquals( nopts.progress_obj, self.meter ) self.assertEquals( nopts.throttle, 0.5 ) @@ -185,7 +185,7 @@ class URLGrabberTestCase(TestCase): self.assertEquals( nopts.proxies, {'http' : 'http://www.proxy.com:9090'} ) nopts.opener = None self.assertEquals( nopts.opener, None ) - + def test_make_callback(self): """grabber.URLGrabber._make_callback() tests""" def cb(e): pass @@ -197,7 +197,7 @@ class URLGrabberTestCase(TestCase): class URLParserTestCase(TestCase): def setUp(self): pass - + def tearDown(self): pass @@ -207,7 +207,7 @@ class URLParserTestCase(TestCase): bases = [base, base+'/'] filename = 'bar/baz' target = base + '/' + filename - + for b in bases: g = URLGrabber(prefix=b) (url, parts) = g.opts.urlparser.parse(filename, g.opts) @@ -219,7 +219,7 @@ class URLParserTestCase(TestCase): except IndexError: quote = None g.opts.quote = quote (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts) - + if 1: self.assertEquals(url, urllist[1]) self.assertEquals(parts, urllist[2]) @@ -232,7 +232,7 @@ class URLParserTestCase(TestCase): print ' ' + url print ' ' + urllist[2] print ' ' + parts - + url_tests_all = ( ['http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash', @@ -252,13 +252,13 @@ class URLParserTestCase(TestCase): 'http://host.com/Should%2520Not', ('http', 'host.com', '/Should%2520Not', '', '', ''), 1], ) - + url_tests_posix = ( ['/etc/passwd', 'file:///etc/passwd', ('file', '', '/etc/passwd', '', '', '')], ) - + url_tests_nt = ( [r'\\foo.com\path\file.ext', 'file://foo.com/path/file.ext', @@ -296,7 +296,7 @@ class FailureTestCase(TestCase): self.obj = obj self.args = args self.kwargs = kwargs - + def test_failure_callback_called(self): "failure callback is called on retry" self.failure_callback_called = 0 @@ -340,7 +340,7 @@ class InterruptTestCase(TestCase): self.kwargs = kwargs if kwargs.get('exception', None): raise kwargs['exception'] - + def test_interrupt_callback_called(self): "interrupt callback is called on retry" self.interrupt_callback_called = 0 @@ -367,12 +367,12 @@ class CheckfuncTestCase(TestCase): self.g = grabber.URLGrabber(checkfunc=cf) self.filename = tempfile.mktemp() self.data = short_reference_data - + def tearDown(self): try: os.unlink(self.filename) except: pass if hasattr(self, 'obj'): del self.obj - + def _checkfunc(self, obj, *args, **kwargs): self.obj = obj self.args = args @@ -389,7 +389,7 @@ class CheckfuncTestCase(TestCase): if data == self.data: return else: raise URLGrabError(-2, "data doesn't match") - + def _check_common_args(self): "check the args that are common to both urlgrab and urlread" self.assert_(hasattr(self, 'obj')) @@ -456,7 +456,7 @@ class RegetTestBase: data = fo.read() fo.close() return data - + class CommonRegetTests(RegetTestBase, TestCase): def test_bad_reget_type(self): "exception raised for illegal reget mode" @@ -487,14 +487,14 @@ class HTTPRegetTests(FTPRegetTests): def setUp(self): RegetTestBase.setUp(self) self.url = short_ref_http - + def test_older_check_timestamp(self): try: # define this here rather than in the FTP tests because currently, # we get no timestamp information back from ftp servers. self._make_half_zero_file() ts = 1600000000 # set local timestamp to 2020 - os.utime(self.filename, (ts, ts)) + os.utime(self.filename, (ts, ts)) self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp') data = self._read_file() @@ -502,21 +502,21 @@ class HTTPRegetTests(FTPRegetTests): self.assertEquals(data[self.hl:], self.ref[self.hl:]) except NotImplementedError: self.skip() - + def test_newer_check_timestamp(self): try: # define this here rather than in the FTP tests because currently, # we get no timestamp information back from ftp servers. self._make_half_zero_file() ts = 1 # set local timestamp to 1969 - os.utime(self.filename, (ts, ts)) + os.utime(self.filename, (ts, ts)) self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp') data = self._read_file() self.assertEquals(data, self.ref) except: self.skip() - + class FileRegetTests(HTTPRegetTests): def setUp(self): self.ref = short_reference_data @@ -525,7 +525,7 @@ class FileRegetTests(HTTPRegetTests): tmpfo.write(self.ref) tmpfo.close() self.tmp = tmp - + (url, parts) = grabber.default_grabber.opts.urlparser.parse( tmp, grabber.default_grabber.opts) self.url = url @@ -552,7 +552,7 @@ class ProFTPDSucksTests(TestCase): def test_restart_workaround(self): inst = grabber.URLGrabber() rslt = inst.urlread(self.url, range=(500, 1000)) - + class BaseProxyTests(TestCase): good_p = '%s://%s:%s@%s:%i' % (proxy_proto, proxy_user, good_proxy_pass, proxy_host, proxy_port) @@ -605,4 +605,4 @@ if __name__ == '__main__': grabber.DEBUG = 0 runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2) runner.run(suite()) - + diff --git a/test/test_mirror.py b/test/test_mirror.py index a6bb6cb..921c6f3 100644 --- a/test/test_mirror.py +++ b/test/test_mirror.py @@ -11,9 +11,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -110,7 +110,7 @@ class CallbackTests(TestCase): # test assumes mirrors are not re-ordered urlgrabber.grabber._TH.hosts.clear() self.mg = MirrorGroup(self.g, fullmirrors) - + def test_failure_callback(self): "test that MG executes the failure callback correctly" tricky_list = [] @@ -158,7 +158,7 @@ class FailoverTests(TestCase): fo = open(filename) contents = fo.read() fo.close() - + # first be sure that the first mirror failed and that the # callback was called self.assertEqual(len(elist), 1) @@ -172,7 +172,7 @@ class FakeGrabber: self.index = 0 self.calls = [] self.opts = URLGrabberOptions() - + def urlgrab(self, url, filename=None, **kwargs): self.calls.append( (url, filename) ) res = self.resultlist[self.index] @@ -191,7 +191,7 @@ class ActionTests(TestCase): def tearDown(self): urlgrabber.mirror.DEBUG = self.db - + def test_defaults(self): 'test default action policy' self.mg.urlgrab('somefile') @@ -207,10 +207,10 @@ class ActionTests(TestCase): 'GR mirrors: [c d e f] 0', 'MAIN mirrors: [a b c d e f] 2', 'MIRROR: trying somefile -> c/somefile'] - + self.assertEquals(self.g.calls, expected_calls) self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + def test_instance_action(self): 'test the effects of passed-in default_action' self.mg.default_action = {'remove_master': 1} @@ -227,10 +227,10 @@ class ActionTests(TestCase): 'GR mirrors: [c d e f] 0', 'MAIN mirrors: [c d e f] 0', 'MIRROR: trying somefile -> c/somefile'] - + self.assertEquals(self.g.calls, expected_calls) self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + def test_method_action(self): 'test the effects of method-level default_action' self.mg.urlgrab('somefile', default_action={'remove_master': 1}) @@ -246,13 +246,13 @@ class ActionTests(TestCase): 'GR mirrors: [c d e f] 0', 'MAIN mirrors: [c d e f] 0', 'MIRROR: trying somefile -> c/somefile'] - + self.assertEquals(self.g.calls, expected_calls) self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + def callback(self, e): return {'fail': 1} - + def test_callback_action(self): 'test the effects of a callback-returned action' self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile', @@ -267,7 +267,7 @@ class ActionTests(TestCase): self.assertEquals(self.g.calls, expected_calls) self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + import threading, socket LOCALPORT = 'localhost', 2000 @@ -387,4 +387,4 @@ def suite(): if __name__ == '__main__': runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2) runner.run(suite()) - + diff --git a/test/threading/batchgrabber.py b/test/threading/batchgrabber.py index ce2b34a..9fab7fc 100644 --- a/test/threading/batchgrabber.py +++ b/test/threading/batchgrabber.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -19,10 +19,10 @@ """Module for testing urlgrabber under multiple threads. -This module can be used from the command line. Each argument is +This module can be used from the command line. Each argument is a URL to grab. -The BatchURLGrabber class has an interface similar to URLGrabber +The BatchURLGrabber class has an interface similar to URLGrabber but instead of pulling files when urlgrab is called, the request is queued. Calling BatchURLGrabber.batchgrab causes all files to be pulled in multiple threads. @@ -48,10 +48,10 @@ class BatchURLGrabber: self.queue = [] self.threads = [] self.sem = Semaphore() - + def urlgrab(self, url, filename=None, **kwargs): self.queue.append( (url, filename, kwargs) ) - + def batchgrab(self): if hasattr(self.grabber.opts.progress_obj, 'start'): self.grabber.opts.progress_obj.start(len(self.queue)) @@ -71,7 +71,7 @@ class BatchURLGrabber: #if len(self.threads) == self.maxthreads: # sleep(0.2) sleep(0.2) - + class Worker(Thread): def __init__(self, parent, url, filename, kwargs): Thread.__init__(self) @@ -79,7 +79,7 @@ class Worker(Thread): self.url = url self.filename = filename self.kwargs = kwargs - + def run(self): if DEBUG: print "worker thread started." grabber = self.parent.grabber @@ -90,7 +90,7 @@ class Worker(Thread): rslt = self.parent.grabber.urlgrab(self.url, self.filename, **self.kwargs) except URLGrabError as e: print '%s, %s' % (e, self.url) - + def main(): progress_obj = None # uncomment to play with BatchProgressMeter (doesn't work right now) @@ -103,8 +103,8 @@ def main(): g.batchgrab() except KeyboardInterrupt: sys.exit(1) - + if DEBUG: print "after batchgrab" - + if __name__ == '__main__': main() diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py index d2d26ea..fadc9c0 100644 --- a/urlgrabber/byterange.py +++ b/urlgrabber/byterange.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -26,46 +26,46 @@ import rfc822 DEBUG = None -try: +try: from cStringIO import StringIO -except ImportError as msg: +except ImportError as msg: from StringIO import StringIO class RangeError(IOError): """Error raised when an unsatisfiable range is requested.""" pass - + class HTTPRangeHandler(urllib2.BaseHandler): """Handler that enables HTTP Range headers. - + This was extremely simple. The Range header is a HTTP feature to - begin with so all this class does is tell urllib2 that the - "206 Partial Content" response from the HTTP server is what we + begin with so all this class does is tell urllib2 that the + "206 Partial Content" response from the HTTP server is what we expected. - + Example: import urllib2 import byterange - + range_handler = range.HTTPRangeHandler() opener = urllib2.build_opener(range_handler) - + # install it urllib2.install_opener(opener) - + # create Request and set Range header req = urllib2.Request('http://www.python.org/') req.header['Range'] = 'bytes=30-50' f = urllib2.urlopen(req) """ - + def http_error_206(self, req, fp, code, msg, hdrs): # 206 Partial Content Response r = urllib.addinfourl(fp, hdrs, req.get_full_url()) r.code = code r.msg = msg return r - + def http_error_416(self, req, fp, code, msg, hdrs): # HTTP's Range Not Satisfiable error raise RangeError(9, 'Requested Range Not Satisfiable') @@ -81,13 +81,13 @@ class HTTPSRangeHandler(HTTPRangeHandler): class RangeableFileObject: """File object wrapper to enable raw range handling. - This was implemented primarilary for handling range - specifications for file:// urls. This object effectively makes - a file object look like it consists only of a range of bytes in + This was implemented primarilary for handling range + specifications for file:// urls. This object effectively makes + a file object look like it consists only of a range of bytes in the stream. - + Examples: - # expose 10 bytes, starting at byte position 20, from + # expose 10 bytes, starting at byte position 20, from # /etc/aliases. >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30)) # seek seeks within the range (to position 23 in this case) @@ -99,11 +99,11 @@ class RangeableFileObject: # byte in the range. the following will return only 7 bytes. >>> fo.read(30) """ - + def __init__(self, fo, rangetup): """Create a RangeableFileObject. - fo -- a file like object. only the read() method need be - supported but supporting an optimized seek() is + fo -- a file like object. only the read() method need be + supported but supporting an optimized seek() is preferable. rangetup -- a (firstbyte,lastbyte) tuple specifying the range to work over. @@ -113,7 +113,7 @@ class RangeableFileObject: (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) self.realpos = 0 self._do_seek(self.firstbyte) - + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for @@ -121,16 +121,16 @@ class RangeableFileObject: if hasattr(self.fo, name): return getattr(self.fo, name) raise AttributeError(name) - + def tell(self): """Return the position within the range. - This is different from fo.seek in that position 0 is the + This is different from fo.seek in that position 0 is the first byte position of the range tuple. For example, if this object was created with a range tuple of (500,899), tell() will return 0 when at byte position 500 of the file. """ return (self.realpos - self.firstbyte) - + def seek(self,offset,whence=0): """Seek within the byte range. Positioning is identical to that described under tell(). @@ -143,13 +143,13 @@ class RangeableFileObject: elif whence == 2: # absolute from end of file # XXX: are we raising the right Error here? raise IOError('seek from end of file not supported.') - + # do not allow seek past lastbyte in range if self.lastbyte and (realoffset >= self.lastbyte): realoffset = self.lastbyte - + self._do_seek(realoffset - self.realpos) - + def read(self, size=-1): """Read within the range. This method will limit the size read based on the range. @@ -158,7 +158,7 @@ class RangeableFileObject: rslt = self.fo.read(size) self.realpos += len(rslt) return rslt - + def readline(self, size=-1): """Read lines within the range. This method will limit the size read based on the range. @@ -167,7 +167,7 @@ class RangeableFileObject: rslt = self.fo.readline(size) self.realpos += len(rslt) return rslt - + def _calc_read_size(self, size): """Handles calculating the amount of data to read based on the range. @@ -179,7 +179,7 @@ class RangeableFileObject: else: size = (self.lastbyte - self.realpos) return size - + def _do_seek(self,offset): """Seek based on whether wrapped object supports seek(). offset is relative to the current position (self.realpos). @@ -190,7 +190,7 @@ class RangeableFileObject: else: self.fo.seek(self.realpos + offset) self.realpos+= offset - + def _poor_mans_seek(self,offset): """Seek by calling the wrapped file objects read() method. This is used for file like objects that do not have native @@ -198,7 +198,7 @@ class RangeableFileObject: to manually seek to the desired position. offset -- read this number of bytes from the wrapped file object. - raise RangeError if we encounter EOF before reaching the + raise RangeError if we encounter EOF before reaching the specified offset. """ pos = 0 @@ -247,10 +247,10 @@ class FileRangeHandler(urllib2.FileHandler): return urllib.addinfourl(fo, headers, 'file:'+file) -# FTP Range Support +# FTP Range Support # Unfortunately, a large amount of base FTP code had to be copied # from urllib and urllib2 in order to insert the FTP REST command. -# Code modifications for range support have been commented as +# Code modifications for range support have been commented as # follows: # -- range support modifications start/end here @@ -282,7 +282,7 @@ class FTPRangeHandler(urllib2.FTPHandler): host = unquote(host) user = unquote(user or '') passwd = unquote(passwd or '') - + try: host = socket.gethostbyname(host) except socket.error as msg: @@ -301,22 +301,22 @@ class FTPRangeHandler(urllib2.FTPHandler): if attr.lower() == 'type' and \ value in ('a', 'A', 'i', 'I', 'd', 'D'): type = value.upper() - + # -- range support modifications start here rest = None - range_tup = range_header_to_tuple(req.headers.get('Range',None)) + range_tup = range_header_to_tuple(req.headers.get('Range',None)) assert range_tup != () if range_tup: (fb,lb) = range_tup if fb > 0: rest = fb # -- range support modifications end here - + fp, retrlen = fw.retrfile(file, type, rest) - + # -- range support modifications start here if range_tup: (fb,lb) = range_tup - if lb == '': + if lb == '': if retrlen is None or retrlen == 0: raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.') lb = retrlen @@ -328,7 +328,7 @@ class FTPRangeHandler(urllib2.FTPHandler): retrlen = lb - fb fp = RangeableFileObject(fp, (0,retrlen)) # -- range support modifications end here - + headers = "" mtype = mimetypes.guess_type(req.get_full_url())[0] if mtype: @@ -400,17 +400,17 @@ class ftpwrapper(urllib.ftpwrapper): _rangere = None def range_header_to_tuple(range_header): """Get a (firstbyte,lastbyte) tuple from a Range header value. - + Range headers have the form "bytes=<firstbyte>-<lastbyte>". This function pulls the firstbyte and lastbyte values and returns a (firstbyte,lastbyte) tuple. If lastbyte is not specified in the header value, it is returned as an empty string in the tuple. - + Return None if range_header is None - Return () if range_header does not conform to the range spec + Return () if range_header does not conform to the range spec pattern. - + """ global _rangere if range_header is None: return None @@ -418,9 +418,9 @@ def range_header_to_tuple(range_header): import re _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)') match = _rangere.match(range_header) - if match: + if match: tup = range_tuple_normalize(match.group(1,2)) - if tup and tup[1]: + if tup and tup[1]: tup = (tup[0],tup[1]+1) return tup return () @@ -433,14 +433,14 @@ def range_tuple_to_header(range_tup): if range_tup is None: return None range_tup = range_tuple_normalize(range_tup) if range_tup: - if range_tup[1]: + if range_tup[1]: range_tup = (range_tup[0],range_tup[1] - 1) return 'bytes=%s-%s' % range_tup - + def range_tuple_normalize(range_tup): """Normalize a (first_byte,last_byte) range tuple. Return a tuple whose first element is guaranteed to be an int - and whose second element will be '' (meaning: the last byte) or + and whose second element will be '' (meaning: the last byte) or an int. Finally, return None if the normalized tuple == (0,'') as that is equivalent to retrieving the entire file. """ @@ -452,7 +452,7 @@ def range_tuple_normalize(range_tup): # handle last byte try: lb = range_tup[1] except IndexError: lb = '' - else: + else: if lb is None: lb = '' elif lb != '': lb = int(lb) # check if range is over the entire file diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index 1e09e5a..64da884 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -70,7 +70,7 @@ GENERAL ARGUMENTS (kwargs) are shown but there's no in-progress display. text = None - + specifies alternative text to be passed to the progress meter object. If not given, the default progress meter will use the basename of the file. @@ -116,7 +116,7 @@ GENERAL ARGUMENTS (kwargs) the first 10 bytes of the file. If set to None, no range will be used. - + reget = None [None|'simple'|'check_timestamp'] whether to attempt to reget a partially-downloaded file. Reget @@ -206,7 +206,7 @@ GENERAL ARGUMENTS (kwargs) option. Note that python 2.2 handles the case of these badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. - + urlparser = URLParser() The URLParser class handles pre-processing of URLs, including @@ -246,12 +246,12 @@ GENERAL ARGUMENTS (kwargs) ssl_context = None No-op when using the curl backend (default) - + ssl_verify_peer = True Check the server's certificate to make sure it is valid with what our CA validates - + ssl_verify_host = True Check the server's hostname to make sure it matches the certificate DN @@ -263,7 +263,7 @@ GENERAL ARGUMENTS (kwargs) ssl_key_type = 'PEM' PEM or DER - format of key - + ssl_cert = None Path to the ssl certificate the client should use to to authenticate with @@ -271,20 +271,20 @@ GENERAL ARGUMENTS (kwargs) ssl_cert_type = 'PEM' PEM or DER - format of certificate - + ssl_key_pass = None password to access the ssl_key - + size = None - size (in bytes) or Maximum size of the thing being downloaded. + size (in bytes) or Maximum size of the thing being downloaded. This is mostly to keep us from exploding with an endless datastream - + max_header_size = 2097152 Maximum size (in bytes) of the headers. - + ip_resolve = 'whatever' What type of name to IP resolving to use, default is to do both IPV4 and @@ -347,7 +347,7 @@ RETRY RELATED ARGUMENTS retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes if 12 not in retrycodes: retrycodes.append(12) - + checkfunc = None a function to do additional checks. This defaults to None, which @@ -378,7 +378,7 @@ RETRY RELATED ARGUMENTS function(obj, 'arg1', 2, kwarg=3) # obj.filename = '/tmp/stuff' # obj.url = 'http://foo.com/stuff' - + NOTE: both the "args" tuple and "kwargs" dict must be present if you use this syntax, but either (or both) can be empty. @@ -437,7 +437,7 @@ RETRY RELATED ARGUMENTS This callback is very similar to failure_callback. They are passed the same arguments, so you could use the same function for both. - + retry_no_cache = False When True, automatically enable no_cache for future retries if @@ -557,7 +557,7 @@ try: from i18n import _ except ImportError as msg: def _(st): return st - + ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. @@ -587,7 +587,7 @@ def _init_default_logger(logspec=None): the form URLGRABBER_DEBUG=level,filename - + where "level" can be either an integer or a log level from the logging module (DEBUG, INFO, etc). If the integer is zero or less, logging will be disabled. Filename is the filename where @@ -600,7 +600,7 @@ def _init_default_logger(logspec=None): URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout URLGRABBER_DEBUG=INFO # log info and higher to stderr - + This function is called during module initialization. It is not intended to be called from outside. The only reason it is a function at all is to keep the module-level namespace tidy and to @@ -634,7 +634,7 @@ def _log_package_state(): if not DEBUG: return DEBUG.debug('urlgrabber version = %s' % __version__) DEBUG.debug('trans function "_" = %s' % _) - + _init_default_logger() _log_package_state() @@ -694,7 +694,7 @@ class URLGrabError(IOError): 14 - HTTPError (includes .code and .exception attributes) 15 - user abort 16 - error writing to local file - + MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try @@ -749,7 +749,7 @@ def urlgrab(url, filename=None, **kwargs): If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if the copy_local kwarg == 0. - + See module documentation for a description of possible kwargs. """ return default_grabber.urlgrab(url, filename, **kwargs) @@ -759,7 +759,7 @@ def urlopen(url, **kwargs): If a progress object or throttle specifications exist, then a special file object will be returned that supports them. The file object can be treated like any other file object. - + See module documentation for a description of possible kwargs. """ return default_grabber.urlopen(url, **kwargs) @@ -769,7 +769,7 @@ def urlread(url, limit=None, **kwargs): If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' - + See module documentation for a description of possible kwargs. """ return default_grabber.urlread(url, limit, **kwargs) @@ -807,10 +807,10 @@ class URLParser: """ url = _to_utf8(url) quote = opts.quote - + if opts.prefix: url = self.add_prefix(url, opts.prefix) - + parts = urlparse.urlparse(url) (scheme, host, path, parm, query, frag) = parts @@ -820,15 +820,15 @@ class URLParser: url = 'file:' + urllib.pathname2url(url) parts = urlparse.urlparse(url) quote = 0 # pathname2url quotes, so we won't do it again - + if scheme in ['http', 'https']: parts = self.process_http(parts, url) - + if quote is None: quote = self.guess_should_quote(parts) if quote: parts = self.quote(parts) - + url = urlparse.urlunparse(parts) return url, parts @@ -882,7 +882,7 @@ class URLParser: ind = string.find(path, '%', ind+1) return 0 return 1 - + class URLGrabberOptions: """Class to ease kwargs handling.""" @@ -895,23 +895,23 @@ class URLGrabberOptions: if delegate is None: self._set_defaults() self._set_attributes(**kwargs) - + def __getattr__(self, name): if self.delegate and hasattr(self.delegate, name): return getattr(self.delegate, name) raise AttributeError(name) - + def raw_throttle(self): - """Calculate raw throttle value from throttle and bandwidth + """Calculate raw throttle value from throttle and bandwidth values. """ - if self.throttle <= 0: + if self.throttle <= 0: return 0 - elif type(self.throttle) == type(0): + elif type(self.throttle) == type(0): return float(self.throttle) else: # throttle is a float return self.bandwidth * self.throttle - + def find_proxy(self, url, scheme): """Find the proxy to use for this URL. Use the proxies dictionary first, then libproxy. @@ -953,7 +953,7 @@ class URLGrabberOptions: options specified in kwargs. """ return URLGrabberOptions(delegate=self, **kwargs) - + def _set_attributes(self, **kwargs): """Update object attributes with those provided in kwargs.""" self.__dict__.update(kwargs) @@ -965,7 +965,7 @@ class URLGrabberOptions: % (self.reget, )) def _set_defaults(self): - """Set all options to their default values. + """Set all options to their default values. When adding new options, make sure a default is provided here. """ @@ -1023,10 +1023,10 @@ class URLGrabberOptions: self.ftp_disable_epsv = False self.no_cache = False self.retry_no_cache = False - + def __repr__(self): return self.format() - + def format(self, indent=' '): keys = self.__dict__.keys() if self.delegate is not None: @@ -1055,16 +1055,16 @@ def _run_callback(cb, obj): class URLGrabber(object): """Provides easy opening of URLs with a variety of options. - + All options are specified as kwargs. Options may be specified when the class is created and may be overridden on a per request basis. - + New objects inherit default values from default_grabber. """ - + def __init__(self, **kwargs): self.opts = URLGrabberOptions(**kwargs) - + def _retry(self, opts, func, *args): tries = 0 while 1: @@ -1114,33 +1114,33 @@ class URLGrabber(object): raise if retrycode is not None and retrycode < 0 and opts.retry_no_cache: opts.no_cache = True - + def urlopen(self, url, opts=None, **kwargs): """open the url and return a file object - If a progress object or throttle value specified when this - object was created, then a special file object will be - returned that supports them. The file object can be treated + If a progress object or throttle value specified when this + object was created, then a special file object will be + returned that supports them. The file object can be treated like any other file object. """ url = _to_utf8(url) opts = (opts or self.opts).derive(**kwargs) if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + (url,parts) = opts.urlparser.parse(url, opts) opts.find_proxy(url, parts[0]) def retryfunc(opts, url): return PyCurlFileObject(url, filename=None, opts=opts) return self._retry(opts, retryfunc, url) - + def urlgrab(self, url, filename=None, opts=None, **kwargs): """grab the file at <url> and make a local copy at <filename> If filename is none, the basename of the url is used. - urlgrab returns the filename of the local file, which may be + urlgrab returns the filename of the local file, which may be different from the passed-in filename if copy_local == 0. """ url = _to_utf8(url) opts = (opts or self.opts).derive(**kwargs) if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts opts.find_proxy(url, scheme) if filename is None: @@ -1149,18 +1149,18 @@ class URLGrabber(object): # This is better than nothing. filename = 'index.html' if scheme == 'file' and not opts.copy_local: - # just return the name of the local file - don't make a + # just return the name of the local file - don't make a # copy currently path = urllib.url2pathname(path) if host: path = os.path.normpath('//' + host + path) if not os.path.exists(path): - err = URLGrabError(2, + err = URLGrabError(2, _('Local file does not exist: %s') % (path, )) err.url = url raise err elif not os.path.isfile(path): - err = URLGrabError(3, + err = URLGrabError(3, _('Not a normal file: %s') % (path, )) err.url = url raise err @@ -1170,7 +1170,7 @@ class URLGrabber(object): obj = CallbackObject(filename=path, url=url) _run_callback(opts.checkfunc, obj) return path - + if opts.async: opts.url = url opts.filename = filename @@ -1192,29 +1192,29 @@ class URLGrabber(object): finally: fo.close() return filename - + try: return self._retry(opts, retryfunc, url, filename) except URLGrabError as e: _TH.update(url, 0, 0, e) opts.exception = e return _run_callback(opts.failfunc, opts) - + def urlread(self, url, limit=None, opts=None, **kwargs): """read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note - that urlread is NOT intended to be used as a way of saying - "I want the first N bytes" but rather 'read the whole file + that urlread is NOT intended to be used as a way of saying + "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' """ url = _to_utf8(url) opts = (opts or self.opts).derive(**kwargs) if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + (url,parts) = opts.urlparser.parse(url, opts) opts.find_proxy(url, parts[0]) if limit is not None: limit = limit + 1 - + def retryfunc(opts, url, limit): fo = PyCurlFileObject(url, filename=None, opts=opts) s = '' @@ -1232,16 +1232,16 @@ class URLGrabber(object): finally: fo.close() return s - + s = self._retry(opts, retryfunc, url, limit) if limit and len(s) > limit: - err = URLGrabError(8, + err = URLGrabError(8, _('Exceeded limit (%i): %s') % (limit, url)) err.url = url raise err return s - + def _make_callback(self, callback_obj): # not used, left for compatibility if callable(callback_obj): @@ -1282,7 +1282,7 @@ class PyCurlFileObject(object): self._tm_first = None self._tm_last = None self._do_open() - + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. @@ -1304,9 +1304,9 @@ class PyCurlFileObject(object): if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length - self.opts.progress_obj.start(self._prog_reportname, - urllib.unquote(self.url), - self._prog_basename, + self.opts.progress_obj.start(self._prog_reportname, + urllib.unquote(self.url), + self._prog_basename, size=size, text=self.opts.text) self._prog_running = True @@ -1329,14 +1329,14 @@ class PyCurlFileObject(object): return len(buf) except KeyboardInterrupt: return -1 - + def _hdr_retrieve(self, buf): if self._hdr_ended: self._hdr_dump = '' self.size = 0 self._hdr_ended = False - if self._over_max_size(cur=len(self._hdr_dump), + if self._over_max_size(cur=len(self._hdr_dump), max_size=self.opts.max_header_size): return -1 try: @@ -1366,18 +1366,18 @@ class PyCurlFileObject(object): s = parse150(buf) if s: self.size = int(s) - + if buf.lower().find('location') != -1: location = ':'.join(buf.split(':')[1:]) location = location.strip() self.scheme = urlparse.urlsplit(location)[0] self.url = location - + self._hdr_dump += buf if len(self._hdr_dump) != 0 and buf == '\r\n': self._hdr_ended = True if DEBUG: DEBUG.debug('header ended:') - + return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT @@ -1392,7 +1392,7 @@ class PyCurlFileObject(object): hdrfp.seek(0) self._parsed_hdr = mimetools.Message(hdrfp) return self._parsed_hdr - + hdr = property(_return_hdr_obj) http_code = property(fget= lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE)) @@ -1415,7 +1415,7 @@ class PyCurlFileObject(object): self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) - + if DEBUG and DEBUG.level <= 10: self.curl_obj.setopt(pycurl.VERBOSE, True) if opts.user_agent: @@ -1429,11 +1429,11 @@ class PyCurlFileObject(object): self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) if ipr == 'ipv6': self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6) - + # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) self.curl_obj.setopt(pycurl.MAXREDIRS, 5) - + # timeouts timeout = 300 if hasattr(opts, 'timeout'): @@ -1458,7 +1458,7 @@ class PyCurlFileObject(object): self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert) # if we have a client side cert - turn off reuse b/c nss is odd self.curl_obj.setopt(pycurl.FORBID_REUSE, 1) - if opts.ssl_cert_type: + if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass) @@ -1479,11 +1479,11 @@ class PyCurlFileObject(object): range_str = self._build_range() if range_str: self.curl_obj.setopt(pycurl.RANGE, range_str) - + # throttle/bandwidth if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) - + # proxy if opts.proxy is not None: self.curl_obj.setopt(pycurl.PROXY, opts.proxy) @@ -1511,50 +1511,50 @@ class PyCurlFileObject(object): # our url self.curl_obj.setopt(pycurl.URL, self.url) - - + + def _do_perform(self): if self._complete: return - + try: self.curl_obj.perform() except pycurl.error as e: # XXX - break some of these out a bit more clearly - # to other URLGrabErrors from + # to other URLGrabErrors from # http://curl.haxx.se/libcurl/c/libcurl-errors.html # this covers e.args[0] == 22 pretty well - which will be common - + code = self.http_code errcode = e.args[0] errurl = urllib.unquote(self.url) - + if self._error[0]: errcode = self._error[0] - + if errcode == 23 and 200 <= code <= 299: # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside # since we cannot know what it is -I'm banking on it being - # a ctrl-c. XXXX - if there's a way of going back two raises to + # a ctrl-c. XXXX - if there's a way of going back two raises to # figure out what aborted the pycurl process FIXME raise getattr(self, '_cb_error', KeyboardInterrupt) - + elif errcode == 28: err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e)) err.url = errurl raise err - + elif errcode == 42: # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside # since we cannot know what it is -I'm banking on it being - # a ctrl-c. XXXX - if there's a way of going back two raises to + # a ctrl-c. XXXX - if there's a way of going back two raises to # figure out what aborted the pycurl process FIXME raise KeyboardInterrupt - + else: pyerr2str = { 5 : _("Couldn't resolve proxy"), 6 : _("Couldn't resolve host"), @@ -1631,7 +1631,7 @@ class PyCurlFileObject(object): def _add_headers(self): pass - + def _build_range(self): reget_length = 0 rt = None @@ -1646,19 +1646,19 @@ class PyCurlFileObject(object): reget_length = s[stat.ST_SIZE] # Set initial length when regetting - self._amount_read = reget_length + self._amount_read = reget_length self._reget_length = reget_length # set where we started from, too rt = reget_length, '' self.append = 1 - + if self.opts.range: rt = self.opts.range - + if rt[0] is None: rt = (0, rt[1]) rt = (rt[0] + reget_length, rt[1]) - + if rt: header = range_tuple_to_header(rt) @@ -1670,10 +1670,10 @@ class PyCurlFileObject(object): def _make_request(self, req, opener): #XXXX # This doesn't do anything really, but we could use this - # instead of do_open() to catch a lot of crap errors as + # instead of do_open() to catch a lot of crap errors as # mstenner did before here return (self.fo, self.hdr) - + try: if self.opts.timeout: old_to = socket.getdefaulttimeout() @@ -1723,7 +1723,7 @@ class PyCurlFileObject(object): else: return (fo, hdr) - + def _do_grab(self): """dump the file to a filename or StringIO buffer""" @@ -1734,7 +1734,7 @@ class PyCurlFileObject(object): _was_filename = True self._prog_reportname = str(self.filename) self._prog_basename = os.path.basename(self.filename) - + if self.append: mode = 'ab' else: mode = 'wb' @@ -1752,20 +1752,20 @@ class PyCurlFileObject(object): self._prog_reportname = 'MEMORY' self._prog_basename = 'MEMORY' - + self.fo = StringIO() # if this is to be a tempfile instead.... # it just makes crap in the tempdir #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') - try: + try: self._do_perform() except URLGrabError as e: self.fo.flush() self.fo.close() raise e - + if _was_filename: # close it up self.fo.flush() @@ -1786,7 +1786,7 @@ class PyCurlFileObject(object): os.utime(self.filename, (mod_time, mod_time)) except OSError as e: err = URLGrabError(16, _(\ - 'error setting timestamp on file %s from %s, OSError: %s') + 'error setting timestamp on file %s from %s, OSError: %s') % (self.filename, self.url, e)) err.url = self.url raise err @@ -1798,13 +1798,13 @@ class PyCurlFileObject(object): 'error opening file from %s, IOError: %s') % (self.url, e)) err.url = self.url raise err - + else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) self._complete = True - + def _fill_buffer(self, amt=None): """fill the buffer to contain at least 'amt' bytes by reading from the underlying file object. If amt is None, then it will @@ -1821,9 +1821,9 @@ class PyCurlFileObject(object): # if we've made it here, then we don't have enough in the buffer # and we need to read more. - + if not self._complete: self._do_grab() #XXX cheater - change on ranges - + buf = [self._rbuf] bufsize = len(self._rbuf) while amt is None or amt: @@ -1833,7 +1833,7 @@ class PyCurlFileObject(object): (time.time() - self._ttime) if diff > 0: time.sleep(diff) self._ttime = time.time() - + # now read some data, up to self._rbufsize if amt is None: readamount = self._rbufsize else: readamount = min(amt, self._rbufsize) @@ -1878,7 +1878,7 @@ class PyCurlFileObject(object): self.opts.progress_obj.update(downloaded) except (KeyboardInterrupt, IOError): return -1 - + def _over_max_size(self, cur, max_size=None): if not max_size: @@ -1896,7 +1896,7 @@ class PyCurlFileObject(object): self._error = (pycurl.E_FILESIZE_EXCEEDED, msg) return True return False - + def read(self, amt=None): self._fill_buffer(amt) if amt is None: @@ -1908,7 +1908,7 @@ class PyCurlFileObject(object): def readline(self, limit=-1): if not self._complete: self._do_grab() return self.fo.readline() - + i = string.find(self._rbuf, '\n') while i < 0 and not (0 < limit <= len(self._rbuf)): L = len(self._rbuf) @@ -1927,12 +1927,12 @@ class PyCurlFileObject(object): if self._prog_running: self.opts.progress_obj.end(self._amount_read) self.fo.close() - + def geturl(self): """ Provide the geturl() method, used to be got from urllib.addinfourl, via. urllib.URLopener.* """ return self.url - + if hasattr(pycurl, 'GLOBAL_ACK_EINTR'): # fail immediately on ctrl-c pycurl.global_init(pycurl.GLOBAL_DEFAULT | pycurl.GLOBAL_ACK_EINTR) @@ -1945,7 +1945,7 @@ def reset_curl_obj(): _curl_cache = pycurl.Curl() _libproxy_cache = None - + ##################################################################### # DEPRECATED FUNCTIONS @@ -1964,23 +1964,23 @@ def set_progress_obj(new_progress_obj): def set_user_agent(new_user_agent): """Deprecated. Use: default_grabber.user_agent = new_user_agent""" default_grabber.user_agent = new_user_agent - + def retrygrab(url, filename=None, copy_local=0, close_connection=0, progress_obj=None, throttle=None, bandwidth=None, numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None): """Deprecated. Use: urlgrab() with the retry arg instead""" - kwargs = {'copy_local' : copy_local, + kwargs = {'copy_local' : copy_local, 'close_connection' : close_connection, - 'progress_obj' : progress_obj, - 'throttle' : throttle, + 'progress_obj' : progress_obj, + 'throttle' : throttle, 'bandwidth' : bandwidth, 'retry' : numtries, 'retrycodes' : retrycodes, - 'checkfunc' : checkfunc + 'checkfunc' : checkfunc } return urlgrab(url, filename, **kwargs) - + ##################################################################### # Serializer + parser: A replacement of the rather bulky Json code. # @@ -2494,7 +2494,7 @@ def _main_test(): set_throttle(1.0) set_bandwidth(32 * 1024) - print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, + print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, default_grabber.bandwidth) try: from progress import text_progress_meter @@ -2534,7 +2534,7 @@ def _retry_test(): raise URLGrabError(-2, 'forcing immediate failure') print 'success' return - + kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'}) try: name = apply(retrygrab, (url, filename), kwargs) except URLGrabError as e: print e @@ -2561,7 +2561,7 @@ def _file_object_test(filename=None): s_output = fo_output.getvalue() if s_output == s_input: print 'passed' else: print 'FAILED' - + def _test_file_object_smallread(wrapper, fo_output): while 1: s = wrapper.read(23) diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index a2202fe..e4aac7e 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -100,7 +100,7 @@ from grabber import _run_callback, _do_raise from grabber import exception2msg from grabber import _TH -def _(st): +def _(st): return st class GrabRequest: @@ -142,7 +142,7 @@ class MirrorGroup: In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: - + default_action A dict that describes the actions to be taken upon failure @@ -173,7 +173,7 @@ class MirrorGroup: or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. - + If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default @@ -278,11 +278,11 @@ class MirrorGroup: # methods, they will be stripped before getting passed on to the # grabber options = ['default_action', 'failure_callback'] - + def _process_kwargs(self, kwargs): self.failure_callback = kwargs.get('failure_callback') self.default_action = kwargs.get('default_action') - + def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: @@ -290,7 +290,7 @@ class MirrorGroup: m = {'mirror': _to_utf8(m)} parsed_mirrors.append(m) return parsed_mirrors - + def _load_gr(self, gr): # OVERRIDE IDEAS: # shuffle gr list @@ -351,7 +351,7 @@ class MirrorGroup: urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). - + remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only @@ -373,7 +373,7 @@ class MirrorGroup: self._next += 1 if self._next >= len(self.mirrors): self._next = 0 self._lock.release() - + if action.get('remove', 1): del gr.mirrors[gr._next] elif action.get('increment', 1): @@ -398,7 +398,7 @@ class MirrorGroup: return base_url + rel_url else: return base_url + '/' + rel_url - + def _mirror_try(self, func, url, kw): gr = GrabRequest() gr.func = func @@ -449,7 +449,7 @@ class MirrorGroup: except URLGrabError as e: obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) return _run_callback(kwargs.get('failfunc', _do_raise), obj) - + def urlopen(self, url, **kwargs): kw = dict(kwargs) func = 'urlopen' @@ -460,7 +460,7 @@ class MirrorGroup: kw['limit'] = limit func = 'urlread' return self._mirror_try(func, url, kw) - + class MGRandomStart(MirrorGroup): """A mirror group that starts at a random mirror in the list. diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py index 9b77c54..9359f16 100644 --- a/urlgrabber/progress.py +++ b/urlgrabber/progress.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -107,7 +107,7 @@ class BaseMeter: self.last_amount_read = 0 self.last_update_time = None self.re = RateEstimator() - + def start(self, filename=None, url=None, basename=None, size=None, now=None, text=None): self.filename = filename @@ -125,7 +125,7 @@ class BaseMeter: self.last_amount_read = 0 self.last_update_time = now self._do_start(now) - + def _do_start(self, now=None): pass @@ -152,7 +152,7 @@ class BaseMeter: def _do_end(self, amount_read, now=None): pass - + # This is kind of a hack, but progress is gotten from grabber which doesn't # know about the total size to download. So we do this so we can get the data # out of band here. This will be "fixed" one way or anther soon. @@ -167,7 +167,7 @@ def text_meter_total_size(size, downloaded=0): # # update: No size (minimal: 17 chars) # ----------------------------------- -# <text> <rate> | <current size> <elapsed time> +# <text> <rate> | <current size> <elapsed time> # 8-48 1 8 3 6 1 9 5 # # Order: 1. <text>+<current size> (17) @@ -202,7 +202,7 @@ def text_meter_total_size(size, downloaded=0): # # end # --- -# <text> | <current size> <elapsed time> +# <text> | <current size> <elapsed time> # 8-56 3 6 1 9 5 # # Order: 1. <text> ( 8) @@ -360,7 +360,7 @@ class MultiFileMeter: else: self._lock = _FakeLock() self.update_period = 0.3 # seconds - + self.numfiles = None self.finished_files = 0 self.failed_files = 0 @@ -393,7 +393,7 @@ class MultiFileMeter: if now is None: now = time.time() self.re.update(self._amount_read(), now) self._do_end(now) - + def _do_end(self, now): pass @@ -406,10 +406,10 @@ class MultiFileMeter: newmeter = self.helperclass(self) self.meters.append(newmeter) return newmeter - + def removeMeter(self, meter): self.meters.remove(meter) - + ########################################################### # child functions - these should only be called by helpers def start_meter(self, meter, now): @@ -423,10 +423,10 @@ class MultiFileMeter: finally: self._lock.release() self._do_start_meter(meter, now) - + def _do_start_meter(self, meter, now): pass - + def update_meter(self, meter, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') @@ -507,7 +507,7 @@ class TextMultiFileMeter(MultiFileMeter): # 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1 # end # --- -# <text> | <file size> <file elapsed time> +# <text> | <file size> <file elapsed time> # 8-56 3 6 1 9 5 def _do_update_meter(self, meter, now): self._lock.acquire() @@ -622,7 +622,7 @@ class TextMultiFileMeter(MultiFileMeter): pass finally: self._lock.release() - + ###################################################################### # support classes and functions @@ -637,7 +637,7 @@ class RateEstimator: self.last_update_time = now self.last_amount_read = 0 self.ave_rate = None - + def update(self, amount_read, now=None): if now is None: now = time.time() # libcurl calls the progress callback when fetching headers @@ -661,7 +661,7 @@ class RateEstimator: time_diff, read_diff, self.ave_rate, self.timescale) self.last_amount_read = amount_read #print 'results', time_diff, read_diff, self.ave_rate - + ##################################################################### # result methods def average_rate(self): @@ -697,14 +697,14 @@ class RateEstimator: epsilon = time_diff / timescale if epsilon > 1: epsilon = 1.0 return self._rolling_ave(time_diff, read_diff, last_ave, epsilon) - + def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon): """perform a "rolling average" iteration a rolling average "folds" new data into an existing average with some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive) a value of 0.0 means only the old value (initial value) counts, and a value of 1.0 means only the newest value is considered.""" - + try: recent_rate = read_diff / time_diff except ZeroDivisionError: @@ -733,7 +733,7 @@ class RateEstimator: rt = int(rt) if shift <= 0: return rt return float(int(rt) >> shift << shift) - + def format_time(seconds, use_hours=0): if seconds is None or seconds < 0: @@ -751,7 +751,7 @@ def format_time(seconds, use_hours=0): return '%02i:%02i:%02i' % (hours, minutes, seconds) else: return '%02i:%02i' % (minutes, seconds) - + def format_number(number, SI=0, space=' '): """Turn numbers into human-readable metric-like numbers""" symbols = ['', # (none) @@ -763,14 +763,14 @@ def format_number(number, SI=0, space=' '): 'E', # exa 'Z', # zetta 'Y'] # yotta - + if SI: step = 1000.0 else: step = 1024.0 thresh = 999 depth = 0 max_depth = len(symbols) - 1 - + # we want numbers between 0 and thresh, but don't exceed the length # of our list. In that event, the formatting will be screwed up, # but it'll still show the right number. @@ -788,7 +788,7 @@ def format_number(number, SI=0, space=' '): format = '%.1f%s%s' else: format = '%.0f%s%s' - + return(format % (float(number or 0), space, symbols[depth])) def _tst(fn, cur, tot, beg, size, *args): @@ -850,8 +850,8 @@ def _mtst(datas, *args): assert not tm.meters if __name__ == "__main__": - # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 - # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 + # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 + # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 if len(sys.argv) >= 2 and sys.argv[1] == 'multi': _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), ("s-1.0.1-1.fc8.i386.rpm", 5000), |