summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2019-02-12 14:45:43 +0100
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2019-02-24 10:38:43 +0100
commit3b7371248f6562ee8afd312c4371b1e4193755d2 (patch)
tree20b36fac7d8441a2bb051a59a86873a52d6cfb18
parent554d93ea1516d750fcf164e4f4d09edda10000d8 (diff)
downloadurlgrabber-3b7371248f6562ee8afd312c4371b1e4193755d2.tar.gz
Add explicit encode/decode calls
This is the patch that has the most potential for trouble in the whole series, because it affects python2 behaviour directly. io.BytesIO is the same as StringIO under python2, so this should have no effect on python2. Under python3 it is necessary to allow reading bytes from a byte data source. Under python2, encoding of an already encoded string is allowed, and actually works fine (is idempotent) for ASCII strings. So the effect of the .encode() calls under python2 should be limited. Under python3, they are of course necessary and can only be done once. So if there are errors here, they should show up when running under python3 pretty easily.
-rw-r--r--test/base_test_code.py4
-rw-r--r--test/test_grabber.py20
-rw-r--r--test/test_mirror.py8
-rw-r--r--urlgrabber/grabber.py16
-rw-r--r--urlgrabber/mirror.py13
5 files changed, 41 insertions, 20 deletions
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 9d4ec71..ae9ac6e 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -7,11 +7,11 @@ base_ftp = 'ftp://localhost/test/'
# bugs in their implementation in byterange.py.
base_proftp = 'ftp://localhost/test/'
-reference_data = ''.join(str(i) + '\n' for i in range(20000))
+reference_data = ''.join(str(i) + '\n' for i in range(20000)).encode('utf8')
ref_http = base_http + 'reference'
ref_ftp = base_ftp + 'reference'
ref_proftp = base_proftp + 'reference'
-short_reference_data = ' '.join(str(i) for i in range(10))
+short_reference_data = ' '.join(str(i) for i in range(10)).encode('utf8')
short_ref_http = base_http + 'short_reference'
short_ref_ftp = base_ftp + 'short_reference'
diff --git a/test/test_grabber.py b/test/test_grabber.py
index 4d3bf93..e026540 100644
--- a/test/test_grabber.py
+++ b/test/test_grabber.py
@@ -27,6 +27,8 @@ import sys
import os
import tempfile, random, os
import socket
+from io import BytesIO
+from six import string_types
if sys.version_info >= (3,):
# We do an explicit version check here because because python2
@@ -56,8 +58,8 @@ class FileObjectTests(TestCase):
with open(self.filename, 'wb') as fo:
fo.write(reference_data)
- self.fo_input = StringIO(reference_data)
- self.fo_output = StringIO()
+ self.fo_input = BytesIO(reference_data)
+ self.fo_output = BytesIO()
(url, parts) = grabber.default_grabber.opts.urlparser.parse(
self.filename, grabber.default_grabber.opts)
self.wrapper = grabber.PyCurlFileObject(
@@ -227,11 +229,14 @@ class URLParserTestCase(TestCase):
try: quote = urllist[3]
except IndexError: quote = None
g.opts.quote = quote
- (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts)
+ url = urllist[0].encode('utf8')
+ expected_url = urllist[1].encode('utf8')
+ expected_parts = tuple(part.encode('utf8') for part in urllist[2])
+ (url, parts) = g.opts.urlparser.parse(url, g.opts)
if 1:
- self.assertEquals(url, urllist[1])
- self.assertEquals(parts, urllist[2])
+ self.assertEquals(url, expected_url)
+ self.assertEquals(parts, expected_parts)
else:
if url == urllist[1] and parts == urllist[2]:
print('OK: %s' % urllist[0])
@@ -328,7 +333,10 @@ class FailureTestCase(TestCase):
self.assertEquals(self.args, ('foo',))
self.assertEquals(self.kwargs, {'bar': 'baz'})
self.assert_(isinstance(self.obj, CallbackObject))
- self.assertEquals(self.obj.url, ref_404)
+ url = self.obj.url
+ if not isinstance(url, string_types):
+ url = url.decode('utf8')
+ self.assertEquals(url, ref_404)
self.assert_(isinstance(self.obj.exception, URLGrabError))
del self.obj
diff --git a/test/test_mirror.py b/test/test_mirror.py
index 097478c..1c9a83e 100644
--- a/test/test_mirror.py
+++ b/test/test_mirror.py
@@ -187,7 +187,7 @@ class ActionTests(TestCase):
def test_defaults(self):
'test default action policy'
self.mg.urlgrab('somefile')
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -207,7 +207,7 @@ class ActionTests(TestCase):
'test the effects of passed-in default_action'
self.mg.default_action = {'remove_master': 1}
self.mg.urlgrab('somefile')
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -226,7 +226,7 @@ class ActionTests(TestCase):
def test_method_action(self):
'test the effects of method-level default_action'
self.mg.urlgrab('somefile', default_action={'remove_master': 1})
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -249,7 +249,7 @@ class ActionTests(TestCase):
'test the effects of a callback-returned action'
self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile',
failure_callback=self.callback)
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:1] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e8a7ffc..6bda383 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -527,6 +527,7 @@ import stat
import pycurl
from ftplib import parse150
import socket, select, fcntl
+from io import BytesIO
try:
import urllib.parse as urlparse
@@ -847,7 +848,10 @@ class URLParser:
if not scheme or (len(scheme) == 1 and scheme in string.letters):
# if a scheme isn't specified, we guess that it's "file:"
if url[0] not in b'/\\': url = os.path.abspath(url)
- url = b'file:' + pathname2url(url)
+ pathname = pathname2url(url)
+ if not isinstance(pathname, bytes):
+ pathname = pathname.encode('utf8')
+ url = b'file:' + pathname
parts = urlparse.urlparse(url)
quote = 0 # pathname2url quotes, so we won't do it again
@@ -863,7 +867,7 @@ class URLParser:
return url, parts
def add_prefix(self, url, prefix):
- if prefix[-1] == b'/' or url[0] == b'/':
+ if prefix.endswith(b'/') or url.startswith(b'/'):
url = prefix + url
else:
url = prefix + b'/' + url
@@ -900,6 +904,8 @@ class URLParser:
else -> 1
"""
(scheme, host, path, parm, query, frag) = parts
+ if not isinstance(path, text_type):
+ path = path.decode('utf8')
if ' ' in path:
return 1
ind = path.find('%')
@@ -1380,7 +1386,7 @@ class PyCurlFileObject(object):
if buf.lower().find(b'content-length:') != -1:
length = buf.split(b':')[1]
self.size = int(length)
- elif (self.append or self.opts.range) and self._hdr_dump == '' and b' 200 ' in buf:
+ elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 ' in buf:
# reget was attempted but server sends it all
# undo what we did in _build_range()
self.append = False
@@ -1669,7 +1675,7 @@ class PyCurlFileObject(object):
def _build_range(self):
reget_length = 0
rt = None
- if self.opts.reget and type(self.filename) in types.StringTypes:
+ if self.opts.reget and isinstance(self.filename, string_types):
# we have reget turned on and we're dumping to a file
try:
s = os.stat(self.filename)
@@ -1787,7 +1793,7 @@ class PyCurlFileObject(object):
self._prog_basename = 'MEMORY'
- self.fo = StringIO()
+ self.fo = BytesIO()
# if this is to be a tempfile instead....
# it just makes crap in the tempdir
#fh, self._temp_name = mkstemp()
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index 5eb7617..1d7fd3d 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -389,9 +389,9 @@ class MirrorGroup:
if gr._next >= len(gr.mirrors): gr._next = 0
if DEBUG:
- grm = [m['mirror'] for m in gr.mirrors]
+ grm = [m['mirror'].decode() for m in gr.mirrors]
DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
- selfm = [m['mirror'] for m in self.mirrors]
+ selfm = [m['mirror'].decode() for m in self.mirrors]
DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
#####################################################################
@@ -403,7 +403,14 @@ class MirrorGroup:
def _join_url(self, base_url, rel_url):
(scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
- sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
+ if isinstance(base_url, bytes):
+ if not isinstance(rel_url, bytes):
+ rel_url = rel_url.encode('utf8')
+ sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/'
+ else:
+ sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))
def _mirror_try(self, func, url, kw):