diff options
author | Andrey Petrov <andrey.petrov@shazow.net> | 2010-10-10 14:40:33 -0700 |
---|---|---|
committer | Andrey Petrov <andrey.petrov@shazow.net> | 2010-10-10 14:40:33 -0700 |
commit | fd934ae60c2129be2f7b76159a2f21d871f75b7c (patch) | |
tree | 9b2e243a90c4ba36fe63766fbb90c8878b0b9b66 | |
parent | c039803b439a7bbe542a955508b524c33f4a98cd (diff) | |
download | urllib3-fd934ae60c2129be2f7b76159a2f21d871f75b7c.tar.gz |
Added client-side verified SSL and gzip/deflate response decompression patch from niphlod, with refactoring and tests by me.
--HG--
branch : ssl_and_compression
-rw-r--r-- | README.txt | 14 | ||||
-rw-r--r-- | setup.py | 4 | ||||
-rw-r--r-- | test/test_connectionpool.py | 17 | ||||
-rw-r--r-- | urllib3.egg-info/PKG-INFO | 2 | ||||
-rw-r--r-- | urllib3/__init__.py | 4 | ||||
-rw-r--r-- | urllib3/connectionpool.py | 132 |
6 files changed, 155 insertions, 18 deletions
@@ -2,9 +2,10 @@ Highlights
 ==========

  * Re-use the same socket connection for multiple requests
-   (``HTTPConnectionPool`` and ``HTTPSConnectionPool``)
+   (``HTTPConnectionPool`` and ``HTTPSConnectionPool``) (with client-side certificates)
  * File posting (``encode_multipart_formdata``)
  * Built-in redirection and retries (optional)
+ * Supports gzip and deflate (big thanks to niphlod)
  * Thread-safe
  * Small and easy to understand codebase perfect for extending and building upon.
    For a more comprehensive alternative, have a look at `httplib2 <http://code.google.com/p/httplib2/>`_.
@@ -12,7 +13,7 @@ What's wrong with urllib and urllib2?
 =====================================

 There are two critical features missing from the Python standard library:
-Connection re-using/pooling and file posting. It's not terribly hard to
+Connection re-using/pooling and file posting. It's not terribly hard to
 implement these yourself, but it's much easier to use a module that already
 did the work for you.

@@ -28,7 +29,7 @@ Performance. When you normally do a urllib call, a separate socket
 connection is created with each request. By reusing existing sockets
 (supported since HTTP 1.1), the requests will take up less resources on the
 server's end, and also provide a faster response time at the client's end.
-With some simple benchmarks (see `test/benchmark.py
+With some simple benchmarks (see `test/benchmark.py
 <http://code.google.com/p/urllib3/source/browse/trunk/test/benchmark.py>`_
 ), downloading 15 URLs from google.com is about twice as fast when using
 HTTPConnectionPool (which uses 1 connection) than using plain urllib (which
@@ -53,11 +54,10 @@ But, long story short::

     import urllib3

     API_URL = 'http://ajax.googleapis.com/ajax/services/search/web'
-
+
     http_pool = urllib3.connection_from_url(API_URL)
-
+
     fields = {'v': '1.0', 'q': 'urllib3'}
     r = http_pool.get_url(API_URL, fields)
-
-    print r.status, r.data
+    print r.status, r.data
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages

-version = '0.3.1'
+version = '0.4.0'

 long_description = open('README.txt').read()

@@ -21,7 +21,7 @@ setup(name='urllib3',
       author='Andrey Petrov',
       author_email='andrey.petrov@shazow.net',
       url='http://code.google.com/p/urllib3/',
-      download_url='http://urllib3.googlecode.com/files/urllib3-0.3.1.tar.gz',
+      download_url='http://urllib3.googlecode.com/files/urllib3-0.4.0.tar.gz',
       license='MIT',
       packages=find_packages(exclude=['ez_setup', 'tests']),
       include_package_data=True,
diff --git a/test/test_connectionpool.py b/test/test_connectionpool.py
index de7b725f..85c75632 100644
--- a/test/test_connectionpool.py
+++ b/test/test_connectionpool.py
@@ -3,7 +3,7 @@ import unittest
 import sys
 sys.path.append('../')

-from urllib3.connectionpool import HTTPConnectionPool, get_host, connection_from_url, HostChangedError
+from urllib3.connectionpool import HTTPConnectionPool, get_host, connection_from_url, make_headers

 class TestConnectionPool(unittest.TestCase):
     def test_get_host(self):
@@ -47,3 +47,18 @@ class TestConnectionPool(unittest.TestCase):
             c = connection_from_url(a)
             self.assertFalse(c.is_same_host(b), "%s =? %s" % (a,b))

+    def test_make_headers(self):
+        self.assertEqual(make_headers(accept_encoding=True),
+            {'accept-encoding': 'gzip,deflate'})
+
+        self.assertEqual(make_headers(accept_encoding='foo,bar'),
+            {'accept-encoding': 'foo,bar'})
+
+        self.assertEqual(make_headers(accept_encoding=['foo','bar']),
+            {'accept-encoding': 'foo,bar'})
+
+        self.assertEqual(make_headers(accept_encoding=True, user_agent='banana'),
+            {'accept-encoding': 'gzip,deflate', 'user-agent': 'banana'})
+
+        self.assertEqual(make_headers(user_agent='banana'),
+            {'user-agent': 'banana'})
diff --git a/urllib3.egg-info/PKG-INFO b/urllib3.egg-info/PKG-INFO
index 2fa40c34..80c8a8d0 100644
--- a/urllib3.egg-info/PKG-INFO
+++ b/urllib3.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: urllib3
-Version: 0.3.1
+Version: 0.4.0
 Summary: HTTP library with thread-safe connection pooling and file post support
 Home-page: http://code.google.com/p/urllib3/
 Author: Andrey Petrov
diff --git a/urllib3/__init__.py b/urllib3/__init__.py
index 9033f57e..190cc026 100644
--- a/urllib3/__init__.py
+++ b/urllib3/__init__.py
@@ -6,8 +6,8 @@ __author__ = "Andrey Petrov (andrey.petrov@shazow.net)"
 __license__ = "MIT"
 __version__ = "$Rev$"

-from connectionpool import HTTPConnectionPool, HTTPSConnectionPool, get_host, connection_from_url
+from connectionpool import HTTPConnectionPool, HTTPSConnectionPool, get_host, connection_from_url, make_headers
 from filepost import encode_multipart_formdata

 # Possible exceptions
-from connectionpool import HTTPError, MaxRetryError, TimeoutError
+from connectionpool import HTTPError, SSLError, MaxRetryError, TimeoutError
diff --git a/urllib3/connectionpool.py b/urllib3/connectionpool.py
index d2bec162..a7f7b8d7 100644
--- a/urllib3/connectionpool.py
+++ b/urllib3/connectionpool.py
@@ -2,21 +2,37 @@ import logging
 log = logging.getLogger(__name__)

 from Queue import Queue, Empty, Full
-from StringIO import StringIO
-from itertools import count
+try:
+    from cStringIO import StringIO
+except ImportError, e:
+    from StringIO import StringIO
+
+try:
+    import ssl
+except ImportError, e:
+    ssl = None

 from urllib import urlencode
 from httplib import HTTPConnection, HTTPSConnection, HTTPException
+import socket
 from socket import error as SocketError, timeout as SocketTimeout

+import gzip
+import zlib
+
 from filepost import encode_multipart_formdata

+
 ## Exceptions

 class HTTPError(Exception):
     "Base exception used by this module."
     pass

+class SSLError(Exception):
+    "Raised when SSL certificate fails in an HTTPS connection."
+    pass
+
 class MaxRetryError(HTTPError):
     "Raised when the maximum number of retries is exceeded."
     pass
@@ -54,7 +70,26 @@ class HTTPResponse(object):
         NOTE: This method will perform r.read() which will have side effects
         on the original http.HTTPResponse object.
         """
-        return HTTPResponse(data=r.read(),
+        tmp_data = StringIO(r.read())
+        try:
+            if r.getheader('content-encoding') == 'gzip':
+                log.debug("Received response with content-encoding: gzip, decompressing with gzip.")
+
+                gzipper = gzip.GzipFile(fileobj=tmp_data)
+                data = gzipper.read()
+            elif r.getheader('content-encoding') == 'deflate':
+                log.debug("Received response with content-encoding: deflate, decompressing with zlib.")
+                try:
+                    data = zlib.decompress(tmp_data)
+                except zlib.error, e:
+                    data = zlib.decompress(tmp_data, -zlib.MAX_WBITS)
+            else:
+                data = tmp_data.read()
+
+        except IOError:
+            raise HTTPError("Received response with content-encoding: %s, but failed to decompress it." % (r.getheader('content-encoding')))
+
+        return HTTPResponse(data=data,
                     headers=dict(r.getheaders()),
                     status=r.status,
                     version=r.version,
@@ -68,6 +103,34 @@ class HTTPResponse(object):
     def getheader(self, name, default=None):
         return self.headers.get(name, default)

+
+## Connection objects
+
+class VerifiedHTTPSConnection(HTTPSConnection):
+    """
+    Based on httplib.HTTPSConnection but wraps the socket with SSL certification.
+    """
+
+    def add_cert(self, key_file=None, cert_file=None, cert_reqs='CERT_NONE', ca_certs=None):
+        ssl_req_scheme = {
+            'CERT_NONE' : ssl.CERT_NONE,
+            'CERT_OPTIONAL' : ssl.CERT_OPTIONAL,
+            'CERT_REQUIRED' : ssl.CERT_REQUIRED
+        }
+
+        self.key_file = key_file
+        self.cert_file = cert_file
+        self.cert_reqs = ssl_req_scheme.get(cert_reqs) or ssl.CERT_NONE
+        self.ca_certs = ca_certs
+
+    def connect(self):
+        # Add certificate verification
+        sock = socket.create_connection((self.host, self.port), self.timeout)
+
+        # Wrap socket using verification with the root certs in trusted_root_certs
+        self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, cert_reqs=self.cert_reqs, ca_certs=self.ca_certs)
+
+
 ## Pool objects

 class HTTPConnectionPool(object):
@@ -184,7 +247,6 @@ class HTTPConnectionPool(object):
             raise HostChangedError("Connection pool with host '%s' tried to open a foreign host: %s" % (host, url))

-
         try:
             # Request a connection from the queue
             conn = self._get_conn()
@@ -208,6 +270,10 @@ class HTTPConnectionPool(object):
             # Timed out either by socket or queue
             raise TimeoutError("Request timed out after %f seconds" % self.timeout)

+        except (ssl.SSLError), e:
+            # SSL certificate error
+            raise SSLError(message=e.message)
+
         except (HTTPException, SocketError), e:
             log.warn("Retrying (%d attempts remain) after connection broken by '%r': %s" % (retries, e, url))
             return self.urlopen(method, url, body, headers, retries-1, redirect) # Try again
@@ -258,17 +324,73 @@ class HTTPSConnectionPool(HTTPConnectionPool):

     scheme = 'https'

+    def __init__(self, host, port=None, timeout=None, maxsize=1, block=False, key_file=None, cert_file=None, cert_reqs='CERT_NONE', ca_certs=None):
+        self.host = host
+        self.port = port
+        self.timeout = timeout
+        self.pool = Queue(maxsize)
+        self.block = block
+
+        self.key_file = key_file
+        self.cert_file = cert_file
+
+        self.cert_reqs = cert_reqs
+
+        self.ca_certs = ca_certs
+
+        # Fill the queue up so that doing get() on it will block properly
+        [self.pool.put(None) for i in xrange(maxsize)]
+
+        self.num_connections = 0
+        self.num_requests = 0
+
     def _new_conn(self):
         """
         Return a fresh HTTPSConnection.
         """
         self.num_connections += 1
         log.info("Starting new HTTPS connection (%d): %s" % (self.num_connections, self.host))
-        return HTTPSConnection(host=self.host, port=self.port)
+
+        if not ssl:
+            return HTTPSConnection(host=self.host, port=self.port)
+
+        connection = VerifiedHTTPSConnection(host=self.host, port=self.port)
+        connection.add_cert(key_file=self.key_file, cert_file=self.cert_file, cert_reqs=self.cert_file, ca_certs=self.ca_certs)
+        return connection


 ## Helpers

+
+def make_headers(accept_encoding=None, user_agent=None):
+    """
+    Shortcuts for generating request headers.
+
+    accept_encoding
+        Can be a boolean, list, or string.
+        True translates to 'gzip,deflate'.
+        List will get joined by comma.
+        String will be used as provided.
+
+    user_agent
+        String representing the user-agent you want, such as "python-urllib3/0.6"
+    """
+    headers = {}
+    if accept_encoding:
+        if isinstance(accept_encoding, str):
+            pass
+        elif isinstance(accept_encoding, list):
+            accept_encoding = ','.join(accept_encoding)
+        else:
+            accept_encoding = 'gzip,deflate'
+        headers['accept-encoding'] = accept_encoding
+
+    if user_agent:
+        headers['user-agent'] = user_agent
+
+    return headers
+
+
 def get_host(url):
     """
     Given a url, return its scheme, host and port (None if it's not there). |