summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2019-02-12 14:45:43 +0100
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2019-02-24 10:38:43 +0100
commit3b7371248f6562ee8afd312c4371b1e4193755d2 (patch)
tree20b36fac7d8441a2bb051a59a86873a52d6cfb18
parent554d93ea1516d750fcf164e4f4d09edda10000d8 (diff)
downloadurlgrabber-3b7371248f6562ee8afd312c4371b1e4193755d2.tar.gz
Add explicit encode/decode calls
This is the patch that has the most potential for trouble in the whole series, because it affects python2 behaviour directly. io.BytesIO is the same as StringIO under python2, so this should have no effect on python2. Under python3 it is necessary to allow reading bytes from a byte data source. Under python2, encoding of an already encoded string is allowed, and actually works fine (is idempotent) for ASCII strings. So the effect of the .encode() calls under python2 should be limited. Under python3, they are of course necessary and can only be done once. So if there are errors here, they should show up when running under python3 pretty easily.
-rw-r--r--test/base_test_code.py4
-rw-r--r--test/test_grabber.py20
-rw-r--r--test/test_mirror.py8
-rw-r--r--urlgrabber/grabber.py16
-rw-r--r--urlgrabber/mirror.py13
5 files changed, 41 insertions, 20 deletions
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 9d4ec71..ae9ac6e 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -7,11 +7,11 @@ base_ftp = 'ftp://localhost/test/'
# bugs in their implementation in byterange.py.
base_proftp = 'ftp://localhost/test/'
-reference_data = ''.join(str(i) + '\n' for i in range(20000))
+reference_data = ''.join(str(i) + '\n' for i in range(20000)).encode('utf8')
ref_http = base_http + 'reference'
ref_ftp = base_ftp + 'reference'
ref_proftp = base_proftp + 'reference'
-short_reference_data = ' '.join(str(i) for i in range(10))
+short_reference_data = ' '.join(str(i) for i in range(10)).encode('utf8')
short_ref_http = base_http + 'short_reference'
short_ref_ftp = base_ftp + 'short_reference'
diff --git a/test/test_grabber.py b/test/test_grabber.py
index 4d3bf93..e026540 100644
--- a/test/test_grabber.py
+++ b/test/test_grabber.py
@@ -27,6 +27,8 @@ import sys
import os
import tempfile, random, os
import socket
+from io import BytesIO
+from six import string_types
if sys.version_info >= (3,):
# We do an explicit version check here because because python2
@@ -56,8 +58,8 @@ class FileObjectTests(TestCase):
with open(self.filename, 'wb') as fo:
fo.write(reference_data)
- self.fo_input = StringIO(reference_data)
- self.fo_output = StringIO()
+ self.fo_input = BytesIO(reference_data)
+ self.fo_output = BytesIO()
(url, parts) = grabber.default_grabber.opts.urlparser.parse(
self.filename, grabber.default_grabber.opts)
self.wrapper = grabber.PyCurlFileObject(
@@ -227,11 +229,14 @@ class URLParserTestCase(TestCase):
try: quote = urllist[3]
except IndexError: quote = None
g.opts.quote = quote
- (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts)
+ url = urllist[0].encode('utf8')
+ expected_url = urllist[1].encode('utf8')
+ expected_parts = tuple(part.encode('utf8') for part in urllist[2])
+ (url, parts) = g.opts.urlparser.parse(url, g.opts)
if 1:
- self.assertEquals(url, urllist[1])
- self.assertEquals(parts, urllist[2])
+ self.assertEquals(url, expected_url)
+ self.assertEquals(parts, expected_parts)
else:
if url == urllist[1] and parts == urllist[2]:
print('OK: %s' % urllist[0])
@@ -328,7 +333,10 @@ class FailureTestCase(TestCase):
self.assertEquals(self.args, ('foo',))
self.assertEquals(self.kwargs, {'bar': 'baz'})
self.assert_(isinstance(self.obj, CallbackObject))
- self.assertEquals(self.obj.url, ref_404)
+ url = self.obj.url
+ if not isinstance(url, string_types):
+ url = url.decode('utf8')
+ self.assertEquals(url, ref_404)
self.assert_(isinstance(self.obj.exception, URLGrabError))
del self.obj
diff --git a/test/test_mirror.py b/test/test_mirror.py
index 097478c..1c9a83e 100644
--- a/test/test_mirror.py
+++ b/test/test_mirror.py
@@ -187,7 +187,7 @@ class ActionTests(TestCase):
def test_defaults(self):
'test default action policy'
self.mg.urlgrab('somefile')
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -207,7 +207,7 @@ class ActionTests(TestCase):
'test the effects of passed-in default_action'
self.mg.default_action = {'remove_master': 1}
self.mg.urlgrab('somefile')
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -226,7 +226,7 @@ class ActionTests(TestCase):
def test_method_action(self):
'test the effects of method-level default_action'
self.mg.urlgrab('somefile', default_action={'remove_master': 1})
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -249,7 +249,7 @@ class ActionTests(TestCase):
'test the effects of a callback-returned action'
self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile',
failure_callback=self.callback)
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:1] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e8a7ffc..6bda383 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -527,6 +527,7 @@ import stat
import pycurl
from ftplib import parse150
import socket, select, fcntl
+from io import BytesIO
try:
import urllib.parse as urlparse
@@ -847,7 +848,10 @@ class URLParser:
if not scheme or (len(scheme) == 1 and scheme in string.letters):
# if a scheme isn't specified, we guess that it's "file:"
if url[0] not in b'/\\': url = os.path.abspath(url)
- url = b'file:' + pathname2url(url)
+ pathname = pathname2url(url)
+ if not isinstance(pathname, bytes):
+ pathname = pathname.encode('utf8')
+ url = b'file:' + pathname
parts = urlparse.urlparse(url)
quote = 0 # pathname2url quotes, so we won't do it again
@@ -863,7 +867,7 @@ class URLParser:
return url, parts
def add_prefix(self, url, prefix):
- if prefix[-1] == b'/' or url[0] == b'/':
+ if prefix.endswith(b'/') or url.startswith(b'/'):
url = prefix + url
else:
url = prefix + b'/' + url
@@ -900,6 +904,8 @@ class URLParser:
else -> 1
"""
(scheme, host, path, parm, query, frag) = parts
+ if not isinstance(path, text_type):
+ path = path.decode('utf8')
if ' ' in path:
return 1
ind = path.find('%')
@@ -1380,7 +1386,7 @@ class PyCurlFileObject(object):
if buf.lower().find(b'content-length:') != -1:
length = buf.split(b':')[1]
self.size = int(length)
- elif (self.append or self.opts.range) and self._hdr_dump == '' and b' 200 ' in buf:
+ elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 ' in buf:
# reget was attempted but server sends it all
# undo what we did in _build_range()
self.append = False
@@ -1669,7 +1675,7 @@ class PyCurlFileObject(object):
def _build_range(self):
reget_length = 0
rt = None
- if self.opts.reget and type(self.filename) in types.StringTypes:
+ if self.opts.reget and isinstance(self.filename, string_types):
# we have reget turned on and we're dumping to a file
try:
s = os.stat(self.filename)
@@ -1787,7 +1793,7 @@ class PyCurlFileObject(object):
self._prog_basename = 'MEMORY'
- self.fo = StringIO()
+ self.fo = BytesIO()
# if this is to be a tempfile instead....
# it just makes crap in the tempdir
#fh, self._temp_name = mkstemp()
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index 5eb7617..1d7fd3d 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -389,9 +389,9 @@ class MirrorGroup:
if gr._next >= len(gr.mirrors): gr._next = 0
if DEBUG:
- grm = [m['mirror'] for m in gr.mirrors]
+ grm = [m['mirror'].decode() for m in gr.mirrors]
DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
- selfm = [m['mirror'] for m in self.mirrors]
+ selfm = [m['mirror'].decode() for m in self.mirrors]
DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
#####################################################################
@@ -403,7 +403,14 @@ class MirrorGroup:
def _join_url(self, base_url, rel_url):
(scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
- sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
+ if isinstance(base_url, bytes):
+ if not isinstance(rel_url, bytes):
+ rel_url = rel_url.encode('utf8')
+ sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/'
+ else:
+ sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))
def _mirror_try(self, func, url, kw):