diff options
author | Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> | 2019-02-12 14:45:43 +0100 |
---|---|---|
committer | Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> | 2019-02-24 10:38:43 +0100 |
commit | 3b7371248f6562ee8afd312c4371b1e4193755d2 (patch) | |
tree | 20b36fac7d8441a2bb051a59a86873a52d6cfb18 | |
parent | 554d93ea1516d750fcf164e4f4d09edda10000d8 (diff) | |
download | urlgrabber-3b7371248f6562ee8afd312c4371b1e4193755d2.tar.gz |
Add explicit encode/decode calls
This is the patch that has the most potential for trouble in the whole
series, because it affects python2 behaviour directly.
io.BytesIO is the same as StringIO under python2, so this should have
no effect on python2. Under python3 it is necessary to allow reading
bytes from a byte data source.
Under python2, encoding of an already encoded string is allowed, and
actually works fine (is idempotent) for ASCII strings. So the effect
of the .encode() calls under python2 should be limited. Under python3,
they are of course necessary and can only be done once. So if there
are errors here, they should show up when running under python3 pretty
easily.
-rw-r--r-- | test/base_test_code.py | 4 | ||||
-rw-r--r-- | test/test_grabber.py | 20 | ||||
-rw-r--r-- | test/test_mirror.py | 8 | ||||
-rw-r--r-- | urlgrabber/grabber.py | 16 | ||||
-rw-r--r-- | urlgrabber/mirror.py | 13 |
5 files changed, 41 insertions, 20 deletions
diff --git a/test/base_test_code.py b/test/base_test_code.py index 9d4ec71..ae9ac6e 100644 --- a/test/base_test_code.py +++ b/test/base_test_code.py @@ -7,11 +7,11 @@ base_ftp = 'ftp://localhost/test/' # bugs in their implementation in byterange.py. base_proftp = 'ftp://localhost/test/' -reference_data = ''.join(str(i) + '\n' for i in range(20000)) +reference_data = ''.join(str(i) + '\n' for i in range(20000)).encode('utf8') ref_http = base_http + 'reference' ref_ftp = base_ftp + 'reference' ref_proftp = base_proftp + 'reference' -short_reference_data = ' '.join(str(i) for i in range(10)) +short_reference_data = ' '.join(str(i) for i in range(10)).encode('utf8') short_ref_http = base_http + 'short_reference' short_ref_ftp = base_ftp + 'short_reference' diff --git a/test/test_grabber.py b/test/test_grabber.py index 4d3bf93..e026540 100644 --- a/test/test_grabber.py +++ b/test/test_grabber.py @@ -27,6 +27,8 @@ import sys import os import tempfile, random, os import socket +from io import BytesIO +from six import string_types if sys.version_info >= (3,): # We do an explicit version check here because because python2 @@ -56,8 +58,8 @@ class FileObjectTests(TestCase): with open(self.filename, 'wb') as fo: fo.write(reference_data) - self.fo_input = StringIO(reference_data) - self.fo_output = StringIO() + self.fo_input = BytesIO(reference_data) + self.fo_output = BytesIO() (url, parts) = grabber.default_grabber.opts.urlparser.parse( self.filename, grabber.default_grabber.opts) self.wrapper = grabber.PyCurlFileObject( @@ -227,11 +229,14 @@ class URLParserTestCase(TestCase): try: quote = urllist[3] except IndexError: quote = None g.opts.quote = quote - (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts) + url = urllist[0].encode('utf8') + expected_url = urllist[1].encode('utf8') + expected_parts = tuple(part.encode('utf8') for part in urllist[2]) + (url, parts) = g.opts.urlparser.parse(url, g.opts) if 1: - self.assertEquals(url, urllist[1]) - self.assertEquals(parts, 
urllist[2]) + self.assertEquals(url, expected_url) + self.assertEquals(parts, expected_parts) else: if url == urllist[1] and parts == urllist[2]: print('OK: %s' % urllist[0]) @@ -328,7 +333,10 @@ class FailureTestCase(TestCase): self.assertEquals(self.args, ('foo',)) self.assertEquals(self.kwargs, {'bar': 'baz'}) self.assert_(isinstance(self.obj, CallbackObject)) - self.assertEquals(self.obj.url, ref_404) + url = self.obj.url + if not isinstance(url, string_types): + url = url.decode('utf8') + self.assertEquals(url, ref_404) self.assert_(isinstance(self.obj.exception, URLGrabError)) del self.obj diff --git a/test/test_mirror.py b/test/test_mirror.py index 097478c..1c9a83e 100644 --- a/test/test_mirror.py +++ b/test/test_mirror.py @@ -187,7 +187,7 @@ class ActionTests(TestCase): def test_defaults(self): 'test default action policy' self.mg.urlgrab('somefile') - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:3] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -207,7 +207,7 @@ class ActionTests(TestCase): 'test the effects of passed-in default_action' self.mg.default_action = {'remove_master': 1} self.mg.urlgrab('somefile') - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:3] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -226,7 +226,7 @@ class ActionTests(TestCase): def test_method_action(self): 'test the effects of method-level default_action' self.mg.urlgrab('somefile', default_action={'remove_master': 1}) - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:3] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -249,7 +249,7 @@ class ActionTests(TestCase): 'test the effects of a callback-returned action' self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile', 
failure_callback=self.callback) - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:1] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index e8a7ffc..6bda383 100644 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -527,6 +527,7 @@ import stat import pycurl from ftplib import parse150 import socket, select, fcntl +from io import BytesIO try: import urllib.parse as urlparse @@ -847,7 +848,10 @@ class URLParser: if not scheme or (len(scheme) == 1 and scheme in string.letters): # if a scheme isn't specified, we guess that it's "file:" if url[0] not in b'/\\': url = os.path.abspath(url) - url = b'file:' + pathname2url(url) + pathname = pathname2url(url) + if not isinstance(pathname, bytes): + pathname = pathname.encode('utf8') + url = b'file:' + pathname parts = urlparse.urlparse(url) quote = 0 # pathname2url quotes, so we won't do it again @@ -863,7 +867,7 @@ class URLParser: return url, parts def add_prefix(self, url, prefix): - if prefix[-1] == b'/' or url[0] == b'/': + if prefix.endswith(b'/') or url.startswith(b'/'): url = prefix + url else: url = prefix + b'/' + url @@ -900,6 +904,8 @@ class URLParser: else -> 1 """ (scheme, host, path, parm, query, frag) = parts + if not isinstance(path, text_type): + path = path.decode('utf8') if ' ' in path: return 1 ind = path.find('%') @@ -1380,7 +1386,7 @@ class PyCurlFileObject(object): if buf.lower().find(b'content-length:') != -1: length = buf.split(b':')[1] self.size = int(length) - elif (self.append or self.opts.range) and self._hdr_dump == '' and b' 200 ' in buf: + elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 ' in buf: # reget was attempted but server sends it all # undo what we did in _build_range() self.append = False @@ -1669,7 +1675,7 @@ class PyCurlFileObject(object): def _build_range(self): reget_length = 0 rt = None - if 
self.opts.reget and type(self.filename) in types.StringTypes: + if self.opts.reget and isinstance(self.filename, string_types): # we have reget turned on and we're dumping to a file try: s = os.stat(self.filename) @@ -1787,7 +1793,7 @@ class PyCurlFileObject(object): self._prog_basename = 'MEMORY' - self.fo = StringIO() + self.fo = BytesIO() # if this is to be a tempfile instead.... # it just makes crap in the tempdir #fh, self._temp_name = mkstemp() diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index 5eb7617..1d7fd3d 100644 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -389,9 +389,9 @@ class MirrorGroup: if gr._next >= len(gr.mirrors): gr._next = 0 if DEBUG: - grm = [m['mirror'] for m in gr.mirrors] + grm = [m['mirror'].decode() for m in gr.mirrors] DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) - selfm = [m['mirror'] for m in self.mirrors] + selfm = [m['mirror'].decode() for m in self.mirrors] DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) ##################################################################### @@ -403,7 +403,14 @@ class MirrorGroup: def _join_url(self, base_url, rel_url): (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) - sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + + if isinstance(base_url, bytes): + if not isinstance(rel_url, bytes): + rel_url = rel_url.encode('utf8') + sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/' + else: + sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid)) def _mirror_try(self, func, url, kw): |