diff options
author | Andrey Petrov <andrey.petrov@shazow.net> | 2013-11-29 17:11:45 -0500 |
---|---|---|
committer | Andrey Petrov <andrey.petrov@shazow.net> | 2013-11-29 17:11:45 -0500 |
commit | eefde5ca3c2d07764b02475d84a0b8b41dc15b31 (patch) | |
tree | ceab1b3cabbdaaac989981b4c9dfa03a121869ec | |
parent | 9796a765ceab81aa15b4ad3c61e70e4f71748cf9 (diff) | |
download | urllib3-filepost-stream.tar.gz |
Importing old branch for posterity. (branch: filepost-stream)
Original message: My very ambitious attempt at allowing streamable
iterator files for POST uploading. Not too happy with it yet.
-rw-r--r-- | test/dummy_server.py | 3 | ||||
-rw-r--r-- | test/test_filepost.py | 31 | ||||
-rw-r--r-- | test/test_withdummy.py | 20 | ||||
-rw-r--r-- | urllib3/connectionpool.py | 30 | ||||
-rw-r--r-- | urllib3/filepost.py | 181 |
5 files changed, 232 insertions, 33 deletions
diff --git a/test/dummy_server.py b/test/dummy_server.py index 676ba5f8..8e79b0bd 100644 --- a/test/dummy_server.py +++ b/test/dummy_server.py @@ -39,6 +39,9 @@ class TestingApp(object): size = int(request.params.get('upload_size', '0')) file = request.params.get(param) + if file == None: + return Response("Missing parameter: %s" % param, status='400') + if not isinstance(file, FieldStorage): return Response("Not a file: %s" % param, status='400') diff --git a/test/test_filepost.py b/test/test_filepost.py new file mode 100644 index 00000000..599cd037 --- /dev/null +++ b/test/test_filepost.py @@ -0,0 +1,31 @@ +import unittest + +import sys +sys.path.append('../') + +from urllib3 import filepost +from StringIO import StringIO + +class TestFilePost(unittest.TestCase): + def test_generator(self): + fields = { + 'foo': 'bar', + 'somefile': ('name.txt', StringIO('trolololol')), + } + + stream = filepost.MultipartEncoderGenerator(fields, boundary="boundary") + body = ''.join(chunk for chunk in filepost.IterStreamer(stream)) + self.assertEqual(body, u'--boundary\r\nContent-Disposition: form-data; name="somefile"; filename="name.txt"\r\nContent-Type: text/plain\r\n\r\ntrolololol\r\n--boundary\r\nContent-Disposition: form-data; name="foo"\r\nContent-Type: text/plain\r\n\r\nbar\r\n--boundary--\r\n') + + def test_len(self): + fields = { + 'foo': 'bar', + 'somefile': ('name.txt', StringIO('trolololol')), + } + + iterdata = filepost.MultipartEncoderGenerator(fields, boundary="boundary") + predicted_size = len(iterdata) + + body = ''.join(chunk for chunk in filepost.IterStreamer(iterdata)) + + self.assertEqual(len(body), predicted_size) diff --git a/test/test_withdummy.py b/test/test_withdummy.py index a5f79f33..69e4efb7 100644 --- a/test/test_withdummy.py +++ b/test/test_withdummy.py @@ -21,7 +21,7 @@ class TestConnectionPool(unittest.TestCase): r = self.http_pool.get_url('/', retries=1) if r.data != "Dummy server!": raise Exception("Got unexpected response: %s" % r.data) - 
except Exception, e: + except MaxRetryError, e: raise Exception("Dummy server not running, make sure HOST and PORT correspond to the dummy server: %s" % e.message) return super(TestConnectionPool, self).__init__(*args, **kw) @@ -57,11 +57,23 @@ class TestConnectionPool(unittest.TestCase): r = self.http_pool.post_url('/upload', fields=fields) self.assertEquals(r.status, 200, r.data) - def test_unicode_upload(self): - fields = { - u'\xe2\x99\xa5': (u'\xe2\x99\xa5.txt', u'\xe2\x99\xa5'), + def _make_fields(self, s): + return { + u'upload_param': s, + u'upload_filename': '%s.txt' % s, + u'upload_size': u'3', + s: ('%s.txt' % s, s), } + def test_unicode_decoded(self): + fields = self._make_fields('\xe2\x99\xa5') + + r = self.http_pool.post_url('/upload', fields=fields) + self.assertEquals(r.status, 200, r.data) + + def test_unicode_encoded(self): + fields = self._make_fields(u'\u2665') + r = self.http_pool.post_url('/upload', fields=fields) self.assertEquals(r.status, 200, r.data) diff --git a/urllib3/connectionpool.py b/urllib3/connectionpool.py index 3fc073ae..35cc6c4d 100644 --- a/urllib3/connectionpool.py +++ b/urllib3/connectionpool.py @@ -10,7 +10,8 @@ except ImportError, e: try: import ssl except ImportError, e: - ssl = None + class ssl(object): + SSLError = None from urllib import urlencode from httplib import HTTPConnection, HTTPSConnection, HTTPException @@ -106,7 +107,28 @@ class HTTPResponse(object): ## Connection objects -class VerifiedHTTPSConnection(HTTPSConnection): +class StreamableMixin(object): + def send(self, data): + if isinstance(data, str): + HTTPConnection.send(self, data) + elif hasattr(data, '__iter__'): + for chunk in data: + HTTPConnection.send(self, chunk) + else: + raise TypeError("data object is not an iterable", data) + +class StreamableHTTPConnection(HTTPConnection, StreamableMixin): + # FIXME: Hack for old-style Python classes with broken inheritance + def send(self, data): + StreamableMixin.send(self, data) + +class 
StreamableHTTPSConnection(HTTPSConnection, StreamableMixin): + # FIXME: Hack for old-style Python classes with broken inheritance + def send(self, data): + StreamableMixin.send(self, data) + + +class VerifiedHTTPSConnection(StreamableHTTPSConnection): """ Based on httplib.HTTPSConnection but wraps the socket with SSL certification. """ @@ -189,7 +211,7 @@ class HTTPConnectionPool(object): """ self.num_connections += 1 log.info("Starting new HTTP connection (%d): %s" % (self.num_connections, self.host)) - return HTTPConnection(host=self.host, port=self.port) + return StreamableHTTPConnection(host=self.host, port=self.port) def _get_conn(self, timeout=None): """ @@ -369,7 +391,7 @@ class HTTPSConnectionPool(HTTPConnectionPool): log.info("Starting new HTTPS connection (%d): %s" % (self.num_connections, self.host)) if not ssl: - return HTTPSConnection(host=self.host, port=self.port) + return StreamableHTTPSConnection(host=self.host, port=self.port) connection = VerifiedHTTPSConnection(host=self.host, port=self.port) connection.set_cert(key_file=self.key_file, cert_file=self.cert_file, cert_reqs=self.cert_reqs, ca_certs=self.ca_certs) diff --git a/urllib3/filepost.py b/urllib3/filepost.py index a4f54645..ebc8214a 100644 --- a/urllib3/filepost.py +++ b/urllib3/filepost.py @@ -5,40 +5,171 @@ try: except: from StringIO import StringIO -import codecs -writer = codecs.lookup('utf-8')[3] def get_content_type(filename): return mimetypes.guess_type(filename)[0] or 'application/octet-stream' -def encode_multipart_formdata(fields): - body = StringIO() - BOUNDARY = mimetools.choose_boundary() - for fieldname, value in fields.iteritems(): - body.write("--%s\r\n" % (BOUNDARY)) +def encode(str): + return str.encode('utf-8') - if isinstance(value, tuple): - filename, data = value - body.write('Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (fieldname, filename)) - body.write('Content-Type: %s\r\n\r\n' % (get_content_type(filename))) - else: - data = value - 
body.write('Content-Disposition: form-data; name="%s"\r\n' % (fieldname)) - body.write('Content-Type: text/plain\r\n\r\n') +def file_size(fp): + pos = fp.tell() + fp.seek(0, 2) + size = fp.tell() + fp.seek(pos) + return size-pos - if isinstance(data, int): - data = str(data) # Backwards compatibility +class IterStreamer(object): + """ + File-like streaming iterator. + """ + def __init__(self, generator): + self.generator = generator + self.iterator = iter(generator) + self.leftover = '' - if isinstance(data, unicode): - writer(body).write(data) - else: - body.write(data) + def __len__(self): + return self.generator.__len__() - body.write('\r\n') + def __iter__(self): + return self.iterator - body.write('--%s--\r\n' % (BOUNDARY)) + def next(self): + return self.iterator.next() - content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + def read(self, size): + data = self.leftover + count = len(self.leftover) + try: + while count < size: + chunk = self.next() + data += chunk + count += len(chunk) + except StopIteration, e: + pass - return body.getvalue(), content_type + if count > size: + self.leftover = data[size:] + + return data[:size] + + +class MultipartEncoderGenerator(object): + """ + Generator yielding chunk-by-chunk streaming data from fields, with proper + headers and boundary separators along the way. This is useful for streaming + large files as iterators without loading the entire data body into memory. + + ``fields`` is a dictionary where the parameter name is the key and the value + is either a (filename, data) tuple or just data. + + The data can be a unicode string, an iterator producing strings, or a file-like + object. File-like objects are read ``chunk_size`` bytes at a time. + + If no ``boundary`` is specified then a random one is used. 
+ """ + def __init__(self, fields, boundary=None, chunk_size=8192): + self.fields = fields + self.chunk_size = chunk_size + self.boundary = boundary or mimetools.choose_boundary() + + def get_content_type(self): + return 'multipart/form-data; boundary=%s' % self.boundary + + def __len__(self): + """ + Figure out the expected body size by iterating over the fields as if they + contained empty files, while accumulating the value file sizes as + efficiently as we can. + """ + empty_fields = {} + size = 0 + for fieldname, value in self.fields.iteritems(): + if isinstance(value, tuple): + filename, data = value + empty_fields[fieldname] = (filename, '') + else: + data = value + empty_fields[fieldname] = '' + + if hasattr(data, '__len__'): + size += len(data) + elif isinstance(data, int): + size += len(str(data)) + elif hasattr(data, 'seek'): + size += file_size(data) + elif hasattr(data, 'read'): + size += len(data.read()) # This is undesired + elif hasattr(data, '__iter__'): + size += sum(len(chunk) for chunk in data) # This is also undesired + else: + size += len(unicode(data)) # Hope for the best + + return size + sum(len(chunk) for chunk in iter(MultipartEncoderGenerator(empty_fields, boundary=self.boundary))) + + def __iter__(self): + + for fieldname, value in self.fields.iteritems(): + yield encode(u'--%s\r\n' % (self.boundary)) + + if isinstance(value, tuple): + filename, data = value + yield encode(u'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (fieldname, filename)) + yield encode(u'Content-Type: %s\r\n\r\n' % (get_content_type(filename))) + else: + data = value + yield encode(u'Content-Disposition: form-data; name="%s"\r\n' % fieldname) + yield encode(u'Content-Type: text/plain\r\n\r\n') + + if isinstance(data, unicode): + yield encode(data) + + elif isinstance(data, int): + # Handle integers for backwards compatibility + yield str(data) + + elif hasattr(data, 'read'): + # Stream from a file-like object + while True: + chunk = 
data.read(self.chunk_size) + if not chunk: + break + yield encode(chunk) + + elif hasattr(data, '__iter__'): + # Stream from an iterator + for chunk in data: + yield encode(chunk) + + else: + # Hope for the best + yield unicode(data) + + yield encode(u'\r\n') + + yield encode(u'--%s--\r\n' % (self.boundary)) + + +def encode_multipart_formdata(fields, boundary=None, chunk_size=8192): + """ + ``fields`` is a dictionary where the parameter name is the key and the value + is either a (filename, data) tuple or just data. Data can be a string, file-like + object, or iterator. + + Example: + fields = { + 'foo': 'bar', + 'upload_file': ('file.txt', 'data'), + 'huge_huge': ('video.mpg', fp), + 'hihihi_42_times': ('hi' for i in xrange(42)), + } + + File-like objects are read ``chunk_size`` bytes at a time. + + If no ``boundary`` is given, a random one is chosen. + + See MultipartEncoderGenerator for more details. + """ + stream = MultipartEncoderGenerator(fields, boundary=boundary, chunk_size=chunk_size) + return IterStreamer(stream), stream.get_content_type() |