From 84e3e4853204ecb77d09c0a7495313965386a6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 Feb 2019 20:41:40 +0100 Subject: Apply 'except' fixer --- urlgrabber/mirror.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 988a309..a2202fe 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -423,7 +423,7 @@ class MirrorGroup: if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) try: return func_ref( *(fullurl,), opts=opts, **kw ) - except URLGrabError, e: + except URLGrabError as e: if DEBUG: DEBUG.info('MIRROR: failed') gr.errors.append((fullurl, exception2msg(e))) obj = CallbackObject() @@ -446,7 +446,7 @@ class MirrorGroup: func = 'urlgrab' try: return self._mirror_try(func, url, kw) - except URLGrabError, e: + except URLGrabError as e: obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) return _run_callback(kwargs.get('failfunc', _do_raise), obj) -- cgit v1.2.1 From 4d97612f94e9d0799ad7786b6e7c6865ac21cd46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 Feb 2019 20:46:45 +0100 Subject: Remove trailing whitespace --- urlgrabber/mirror.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index a2202fe..e4aac7e 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -100,7 +100,7 @@ from grabber import _run_callback, _do_raise from grabber import exception2msg from grabber import _TH -def _(st): +def _(st): return st class GrabRequest: @@ -142,7 +142,7 @@ class MirrorGroup: In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: - + default_action A dict that describes the actions to be taken upon failure @@ -173,7 +173,7 @@ class MirrorGroup: or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. - + If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default @@ -278,11 +278,11 @@ class MirrorGroup: # methods, they will be stripped before getting passed on to the # grabber options = ['default_action', 'failure_callback'] - + def _process_kwargs(self, kwargs): self.failure_callback = kwargs.get('failure_callback') self.default_action = kwargs.get('default_action') - + def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: @@ -290,7 +290,7 @@ class MirrorGroup: m = {'mirror': _to_utf8(m)} parsed_mirrors.append(m) return parsed_mirrors - + def _load_gr(self, gr): # OVERRIDE IDEAS: # shuffle gr list @@ -351,7 +351,7 @@ class MirrorGroup: urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). 
- + remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only @@ -373,7 +373,7 @@ class MirrorGroup: self._next += 1 if self._next >= len(self.mirrors): self._next = 0 self._lock.release() - + if action.get('remove', 1): del gr.mirrors[gr._next] elif action.get('increment', 1): @@ -398,7 +398,7 @@ class MirrorGroup: return base_url + rel_url else: return base_url + '/' + rel_url - + def _mirror_try(self, func, url, kw): gr = GrabRequest() gr.func = func @@ -449,7 +449,7 @@ class MirrorGroup: except URLGrabError as e: obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) return _run_callback(kwargs.get('failfunc', _do_raise), obj) - + def urlopen(self, url, **kwargs): kw = dict(kwargs) func = 'urlopen' @@ -460,7 +460,7 @@ class MirrorGroup: kw['limit'] = limit func = 'urlread' return self._mirror_try(func, url, kw) - + class MGRandomStart(MirrorGroup): """A mirror group that starts at a random mirror in the list. 
-- cgit v1.2.1 From 26d021d7b99ce8dd67333b2c12367391484b9ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 Feb 2019 21:14:16 +0100 Subject: Apply 'import' fixer --- urlgrabber/mirror.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index e4aac7e..942140d 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -95,10 +95,10 @@ import sys import random import thread # needed for locking to make this threadsafe -from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 -from grabber import _run_callback, _do_raise -from grabber import exception2msg -from grabber import _TH +from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 +from .grabber import _run_callback, _do_raise +from .grabber import exception2msg +from .grabber import _TH def _(st): return st -- cgit v1.2.1 From 735716711316f0254a90e36e487a6fdbeb11b602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 Feb 2019 21:25:55 +0100 Subject: Apply 'idioms' fixer --- urlgrabber/mirror.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 942140d..06b8a21 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -316,7 +316,7 @@ class MirrorGroup: # the callback) cb = gr.kw.get('failure_callback') or self.failure_callback if cb: - if type(cb) == type( () ): + if isinstance(cb, type( () )): cb, args, kwargs = cb else: args, kwargs = (), {} @@ -412,7 +412,7 @@ class MirrorGroup: except KeyError: pass tries = 0 - while 1: + while True: tries += 1 mirrorchoice = self._get_mirror(gr) fullurl = self._join_url(mirrorchoice['mirror'], gr.url) -- cgit v1.2.1 From 51a074410af8240b625ec236aca292abc79f64d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 
Feb 2019 22:11:19 +0100 Subject: Replace async with async_ The old name is still accepted for compatibility. --- urlgrabber/mirror.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 06b8a21..037902a 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -437,7 +437,7 @@ class MirrorGroup: def urlgrab(self, url, filename=None, **kwargs): kw = dict(kwargs) kw['filename'] = filename - if kw.get('async'): + if kw.get('async_') or kw.get('async'): # enable mirror failovers in async path kw['mirror_group'] = self, [], {}, set() kw['relative_url'] = url -- cgit v1.2.1 From 0c218d654837fba1dd685c99c7e3b0340b1aabbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 Feb 2019 23:37:34 +0100 Subject: Use six.{string_types,text_type} --- urlgrabber/mirror.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 037902a..61102e6 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -95,6 +95,8 @@ import sys import random import thread # needed for locking to make this threadsafe +from six import string_types + from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 from .grabber import _run_callback, _do_raise from .grabber import exception2msg @@ -286,7 +288,7 @@ class MirrorGroup: def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: - if isinstance(m, basestring): + if isinstance(m, string_types): m = {'mirror': _to_utf8(m)} parsed_mirrors.append(m) return parsed_mirrors -- cgit v1.2.1 From a01365ea34ab436a798fb7c53afbcb312b05af44 Mon Sep 17 00:00:00 2001 From: Mattias Giese Date: Thu, 15 Feb 2018 13:05:38 +1300 Subject: Support tokens/queries in URLs Needed for support of some strange sites like SUSE Customer Center. 
--- urlgrabber/mirror.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 61102e6..d9f8190 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -94,6 +94,7 @@ CUSTOMIZATION import sys import random import thread # needed for locking to make this threadsafe +import urlparse from six import string_types @@ -396,10 +397,11 @@ class MirrorGroup: # by overriding the configuration methods :) def _join_url(self, base_url, rel_url): - if base_url.endswith('/') or rel_url.startswith('/'): - return base_url + rel_url + (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) + if path.endswith('/') or rel_url.startswith('/'): + return urlparse.urlunsplit((scheme, netloc, path + rel_url, query, fragid)) else: - return base_url + '/' + rel_url + return urlparse.urlunsplit((scheme, netloc, path + '/' + rel_url, query, fragid)) def _mirror_try(self, func, url, kw): gr = GrabRequest() -- cgit v1.2.1 From 27317cb79ed0791aaf1f7e7c44a8167fbd7ac885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Sun, 24 Feb 2019 10:35:44 +0100 Subject: Refactor MirrorGroup._join_url The function call was common to both branches of the conditional, it can be moved out. 
--- urlgrabber/mirror.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index d9f8190..f2c8ee1 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -398,10 +398,8 @@ class MirrorGroup: def _join_url(self, base_url, rel_url): (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) - if path.endswith('/') or rel_url.startswith('/'): - return urlparse.urlunsplit((scheme, netloc, path + rel_url, query, fragid)) - else: - return urlparse.urlunsplit((scheme, netloc, path + '/' + rel_url, query, fragid)) + sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid)) def _mirror_try(self, func, url, kw): gr = GrabRequest() -- cgit v1.2.1 From be8ee10e35319e80200d4ff384434d46fe7783d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Tue, 12 Feb 2019 10:42:34 +0100 Subject: Simplify mirror conversion to utf8 The code would (potentially) skip some items in the mirror list if they were not strings. But this would be a programming error in the caller. Let's assume that mirror list does not contain garbage with unexpected types and do the conversion unconditionally. Note: mirror URLs are still stored encoded, because that seems to work better with urllib. 
--- urlgrabber/mirror.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index f2c8ee1..764abb6 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -287,12 +287,7 @@ class MirrorGroup: self.default_action = kwargs.get('default_action') def _parse_mirrors(self, mirrors): - parsed_mirrors = [] - for m in mirrors: - if isinstance(m, string_types): - m = {'mirror': _to_utf8(m)} - parsed_mirrors.append(m) - return parsed_mirrors + return [{'mirror':_to_utf8(m)} for m in mirrors] def _load_gr(self, gr): # OVERRIDE IDEAS: -- cgit v1.2.1 From 2113d028dc143560da4338472cb0db6fa658799b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Mon, 11 Feb 2019 23:08:09 +0100 Subject: Convert imports to try new names first This is the 'imports' fixer, but with lots of manual adjustments. There should be no difference under python2. --- urlgrabber/mirror.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 764abb6..7e904e2 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -93,8 +93,17 @@ CUSTOMIZATION import sys import random -import thread # needed for locking to make this threadsafe -import urlparse + +if sys.version_info >= (3,): + # We use a version check because python2 also has _thread + import _thread as thread +else: + import thread + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse from six import string_types -- cgit v1.2.1 From 640db472fd098c1540448ccad1a13a9a4794d387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Tue, 12 Feb 2019 13:58:42 +0100 Subject: Add a wrapper function to avoid b'' in messages --- urlgrabber/mirror.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'urlgrabber/mirror.py') diff --git 
a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 7e904e2..5eb7617 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -111,6 +111,7 @@ from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 from .grabber import _run_callback, _do_raise from .grabber import exception2msg from .grabber import _TH +from .grabber import _bytes_repr def _(st): return st @@ -426,7 +427,7 @@ class MirrorGroup: # apply mirrorchoice kwargs on top of grabber.opts opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {})) func_ref = getattr(grabber, func) - if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) + if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl)) try: return func_ref( *(fullurl,), opts=opts, **kw ) except URLGrabError as e: -- cgit v1.2.1 From 3b7371248f6562ee8afd312c4371b1e4193755d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Tue, 12 Feb 2019 14:45:43 +0100 Subject: Add explicit encode/decode calls This is the patch that has the most potential for trouble in the whole series, because it affects python2 behaviour directly. io.BytesIO is the same as StringIO under python2, so this should have no effect on python2. Under python3 it is necessary to allow reading bytes from a byte data source. Under python2, encoding of an already encoded string is allowed, and actually works fine (is idempotent) for ASCII strings. So the effect of the .encode() calls under python2 should be limited. Under python3, they are of course necessary and can only be done once. So if there are errors here, they should show up when running under python3 pretty easily. 
--- urlgrabber/mirror.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 5eb7617..1d7fd3d 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -389,9 +389,9 @@ class MirrorGroup: if gr._next >= len(gr.mirrors): gr._next = 0 if DEBUG: - grm = [m['mirror'] for m in gr.mirrors] + grm = [m['mirror'].decode() for m in gr.mirrors] DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) - selfm = [m['mirror'] for m in self.mirrors] + selfm = [m['mirror'].decode() for m in self.mirrors] DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) ##################################################################### @@ -403,7 +403,14 @@ class MirrorGroup: def _join_url(self, base_url, rel_url): (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) - sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + + if isinstance(base_url, bytes): + if not isinstance(rel_url, bytes): + rel_url = rel_url.encode('utf8') + sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/' + else: + sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid)) def _mirror_try(self, func, url, kw): -- cgit v1.2.1 From 2ca83d087d32a4a30597af0e60d7927c64f46153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Sun, 24 Feb 2019 10:44:53 +0100 Subject: Replace some type() with specific class names We know what the types of basic types are, let's just put that directly in the code. It seems more idiomatic and slightly more efficient to do things this way. 
--- urlgrabber/mirror.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'urlgrabber/mirror.py') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 1d7fd3d..75f0bcb 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -324,7 +324,7 @@ class MirrorGroup: # the callback) cb = gr.kw.get('failure_callback') or self.failure_callback if cb: - if isinstance(cb, type( () )): + if isinstance(cb, tuple): cb, args, kwargs = cb else: args, kwargs = (), {} -- cgit v1.2.1