summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andrey Petrov <andrey.petrov@shazow.net> 2010-10-10 14:40:33 -0700
committer: Andrey Petrov <andrey.petrov@shazow.net> 2010-10-10 14:40:33 -0700
commit: fd934ae60c2129be2f7b76159a2f21d871f75b7c (patch)
tree: 9b2e243a90c4ba36fe63766fbb90c8878b0b9b66
parent: c039803b439a7bbe542a955508b524c33f4a98cd (diff)
download: urllib3-fd934ae60c2129be2f7b76159a2f21d871f75b7c.tar.gz
Added client-side verified SSL and gzip/deflate response decompression patch from niphlod, with refactoring and tests by me.
--HG-- branch : ssl_and_compression
-rw-r--r-- README.txt 14
-rw-r--r-- setup.py 4
-rw-r--r-- test/test_connectionpool.py 17
-rw-r--r-- urllib3.egg-info/PKG-INFO 2
-rw-r--r-- urllib3/__init__.py 4
-rw-r--r-- urllib3/connectionpool.py 132
6 files changed, 155 insertions, 18 deletions
diff --git a/README.txt b/README.txt
index baa5565a..9766badf 100644
--- a/README.txt
+++ b/README.txt
@@ -2,9 +2,10 @@ Highlights
==========
* Re-use the same socket connection for multiple requests
- (``HTTPConnectionPool`` and ``HTTPSConnectionPool``)
+ (``HTTPConnectionPool`` and ``HTTPSConnectionPool``) (with client-side certificates)
* File posting (``encode_multipart_formdata``)
* Built-in redirection and retries (optional)
+ * Supports gzip and deflate (big thanks to niphlod)
* Thread-safe
* Small and easy to understand codebase perfect for extending and building upon. For a more comprehensive alternative, have a look at `httplib2 <http://code.google.com/p/httplib2/>`_.
@@ -12,7 +13,7 @@ What's wrong with urllib and urllib2?
=====================================
There are two critical features missing from the Python standard library:
-Connection re-using/pooling and file posting. It's not terribly hard to
+Connection re-using/pooling and file posting. It's not terribly hard to
implement these yourself, but it's much easier to use a module that already
did the work for you.
@@ -28,7 +29,7 @@ Performance. When you normally do a urllib call, a separate socket
connection is created with each request. By reusing existing sockets
(supported since HTTP 1.1), the requests will take up less resources on the
server's end, and also provide a faster response time at the client's end.
-With some simple benchmarks (see `test/benchmark.py
+With some simple benchmarks (see `test/benchmark.py
<http://code.google.com/p/urllib3/source/browse/trunk/test/benchmark.py>`_
), downloading 15 URLs from google.com is about twice as fast when using
HTTPConnectionPool (which uses 1 connection) than using plain urllib (which
@@ -53,11 +54,10 @@ But, long story short::
import urllib3
API_URL = 'http://ajax.googleapis.com/ajax/services/search/web'
-
+
http_pool = urllib3.connection_from_url(API_URL)
-
+
fields = {'v': '1.0', 'q': 'urllib3'}
r = http_pool.get_url(API_URL, fields)
-
- print r.status, r.data
+ print r.status, r.data
diff --git a/setup.py b/setup.py
index cc88d272..a8e872e4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
from setuptools import setup, find_packages
-version = '0.3.1'
+version = '0.4.0'
long_description = open('README.txt').read()
@@ -21,7 +21,7 @@ setup(name='urllib3',
author='Andrey Petrov',
author_email='andrey.petrov@shazow.net',
url='http://code.google.com/p/urllib3/',
- download_url='http://urllib3.googlecode.com/files/urllib3-0.3.1.tar.gz',
+ download_url='http://urllib3.googlecode.com/files/urllib3-0.4.0.tar.gz',
license='MIT',
packages=find_packages(exclude=['ez_setup', 'tests']),
include_package_data=True,
diff --git a/test/test_connectionpool.py b/test/test_connectionpool.py
index de7b725f..85c75632 100644
--- a/test/test_connectionpool.py
+++ b/test/test_connectionpool.py
@@ -3,7 +3,7 @@ import unittest
import sys
sys.path.append('../')
-from urllib3.connectionpool import HTTPConnectionPool, get_host, connection_from_url, HostChangedError
+from urllib3.connectionpool import HTTPConnectionPool, get_host, connection_from_url, make_headers
class TestConnectionPool(unittest.TestCase):
def test_get_host(self):
@@ -47,3 +47,18 @@ class TestConnectionPool(unittest.TestCase):
c = connection_from_url(a)
self.assertFalse(c.is_same_host(b), "%s =? %s" % (a,b))
+ def test_make_headers(self):
+ self.assertEqual(make_headers(accept_encoding=True),
+ {'accept-encoding': 'gzip,deflate'})
+
+ self.assertEqual(make_headers(accept_encoding='foo,bar'),
+ {'accept-encoding': 'foo,bar'})
+
+ self.assertEqual(make_headers(accept_encoding=['foo','bar']),
+ {'accept-encoding': 'foo,bar'})
+
+ self.assertEqual(make_headers(accept_encoding=True, user_agent='banana'),
+ {'accept-encoding': 'gzip,deflate', 'user-agent': 'banana'})
+
+ self.assertEqual(make_headers(user_agent='banana'),
+ {'user-agent': 'banana'})
diff --git a/urllib3.egg-info/PKG-INFO b/urllib3.egg-info/PKG-INFO
index 2fa40c34..80c8a8d0 100644
--- a/urllib3.egg-info/PKG-INFO
+++ b/urllib3.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.0
Name: urllib3
-Version: 0.3.1
+Version: 0.4.0
Summary: HTTP library with thread-safe connection pooling and file post support
Home-page: http://code.google.com/p/urllib3/
Author: Andrey Petrov
diff --git a/urllib3/__init__.py b/urllib3/__init__.py
index 9033f57e..190cc026 100644
--- a/urllib3/__init__.py
+++ b/urllib3/__init__.py
@@ -6,8 +6,8 @@ __author__ = "Andrey Petrov (andrey.petrov@shazow.net)"
__license__ = "MIT"
__version__ = "$Rev$"
-from connectionpool import HTTPConnectionPool, HTTPSConnectionPool, get_host, connection_from_url
+from connectionpool import HTTPConnectionPool, HTTPSConnectionPool, get_host, connection_from_url, make_headers
from filepost import encode_multipart_formdata
# Possible exceptions
-from connectionpool import HTTPError, MaxRetryError, TimeoutError
+from connectionpool import HTTPError, SSLError, MaxRetryError, TimeoutError
diff --git a/urllib3/connectionpool.py b/urllib3/connectionpool.py
index d2bec162..a7f7b8d7 100644
--- a/urllib3/connectionpool.py
+++ b/urllib3/connectionpool.py
@@ -2,21 +2,37 @@ import logging
log = logging.getLogger(__name__)
from Queue import Queue, Empty, Full
-from StringIO import StringIO
-from itertools import count
+try:
+ from cStringIO import StringIO
+except ImportError, e:
+ from StringIO import StringIO
+
+try:
+ import ssl
+except ImportError, e:
+ ssl = None
from urllib import urlencode
from httplib import HTTPConnection, HTTPSConnection, HTTPException
+import socket
from socket import error as SocketError, timeout as SocketTimeout
+import gzip
+import zlib
+
from filepost import encode_multipart_formdata
+
## Exceptions
class HTTPError(Exception):
    "Base exception used by this module."
    pass


class SSLError(HTTPError):
    """
    Raised when SSL certificate fails in an HTTPS connection.

    Derives from HTTPError so that callers catching the module's base
    exception also see SSL failures.
    """

    def __init__(self, message='', *args):
        # The pool raises this as ``SSLError(message=...)``; built-in
        # exceptions reject keyword arguments, so accept ``message``
        # explicitly and forward it positionally.
        HTTPError.__init__(self, message, *args)


class MaxRetryError(HTTPError):
    "Raised when the maximum number of retries is exceeded."
    pass
@@ -54,7 +70,26 @@ class HTTPResponse(object):
NOTE: This method will perform r.read() which will have side effects
on the original http.HTTPResponse object.
"""
- return HTTPResponse(data=r.read(),
+ tmp_data = StringIO(r.read())
+ try:
+ if r.getheader('content-encoding') == 'gzip':
+ log.debug("Received response with content-encoding: gzip, decompressing with gzip.")
+
+ gzipper = gzip.GzipFile(fileobj=tmp_data)
+ data = gzipper.read()
+ elif r.getheader('content-encoding') == 'deflate':
+ log.debug("Received response with content-encoding: deflate, decompressing with zlib.")
+ try:
+ data = zlib.decompress(tmp_data)
+ except zlib.error, e:
+ data = zlib.decompress(tmp_data, -zlib.MAX_WBITS)
+ else:
+ data = tmp_data.read()
+
+ except IOError:
+ raise HTTPError("Received response with content-encoding: %s, but failed to decompress it." % (r.getheader('content-encoding')))
+
+ return HTTPResponse(data=data,
headers=dict(r.getheaders()),
status=r.status,
version=r.version,
@@ -68,6 +103,34 @@ class HTTPResponse(object):
def getheader(self, name, default=None):
return self.headers.get(name, default)
+
+## Connection objects
+
class VerifiedHTTPSConnection(HTTPSConnection):
    """
    Based on httplib.HTTPSConnection but wraps the socket with SSL certification.

    Call add_cert() before connecting to choose the client key/certificate,
    the verification mode and the CA bundle. Without it, connect() wraps the
    socket with no certificate verification.
    """

    # Class-level defaults so connect() works even when add_cert() was never
    # called; None is resolved to ssl.CERT_NONE at connect time.
    cert_reqs = None
    ca_certs = None

    def add_cert(self, key_file=None, cert_file=None, cert_reqs='CERT_NONE', ca_certs=None):
        """
        Store the SSL settings used by the next connect().

        key_file, cert_file
            Paths to the client private key and certificate, or None.

        cert_reqs
            Verification mode: one of the strings 'CERT_NONE',
            'CERT_OPTIONAL', 'CERT_REQUIRED', or the matching ssl.CERT_*
            constant.
            NOTE: any unrecognized value falls back to ssl.CERT_NONE, i.e.
            no verification (kept for backward compatibility).

        ca_certs
            Path to the CA bundle used to verify the peer, or None.
        """
        ssl_req_scheme = {
            'CERT_NONE': ssl.CERT_NONE,
            'CERT_OPTIONAL': ssl.CERT_OPTIONAL,
            'CERT_REQUIRED': ssl.CERT_REQUIRED,
        }

        self.key_file = key_file
        self.cert_file = cert_file

        if cert_reqs in ssl_req_scheme.values():
            # Already an ssl.CERT_* constant; honor it instead of silently
            # downgrading it to CERT_NONE.
            self.cert_reqs = cert_reqs
        else:
            self.cert_reqs = ssl_req_scheme.get(cert_reqs, ssl.CERT_NONE)

        self.ca_certs = ca_certs

    def connect(self):
        """
        Open the TCP connection and wrap it in SSL using the stored settings.
        """
        sock = socket.create_connection((self.host, self.port), self.timeout)

        cert_reqs = self.cert_reqs
        if cert_reqs is None:
            cert_reqs = ssl.CERT_NONE

        # Wrap the socket, verifying the peer against self.ca_certs according
        # to cert_reqs. Close the raw socket if the handshake fails so it is
        # not leaked.
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        cert_reqs=cert_reqs,
                                        ca_certs=self.ca_certs)
        except Exception:
            sock.close()
            raise
+
+
## Pool objects
class HTTPConnectionPool(object):
@@ -184,7 +247,6 @@ class HTTPConnectionPool(object):
raise HostChangedError("Connection pool with host '%s' tried to open a foreign host: %s" % (host, url))
-
try:
# Request a connection from the queue
conn = self._get_conn()
@@ -208,6 +270,10 @@ class HTTPConnectionPool(object):
# Timed out either by socket or queue
raise TimeoutError("Request timed out after %f seconds" % self.timeout)
+ except (ssl.SSLError), e:
+ # SSL certificate error
+ raise SSLError(message=e.message)
+
except (HTTPException, SocketError), e:
log.warn("Retrying (%d attempts remain) after connection broken by '%r': %s" % (retries, e, url))
return self.urlopen(method, url, body, headers, retries-1, redirect) # Try again
@@ -258,17 +324,73 @@ class HTTPSConnectionPool(HTTPConnectionPool):
scheme = 'https'
+ def __init__(self, host, port=None, timeout=None, maxsize=1, block=False, key_file=None, cert_file=None, cert_reqs='CERT_NONE', ca_certs=None):
+ self.host = host
+ self.port = port
+ self.timeout = timeout
+ self.pool = Queue(maxsize)
+ self.block = block
+
+ self.key_file = key_file
+ self.cert_file = cert_file
+
+ self.cert_reqs = cert_reqs
+
+ self.ca_certs = ca_certs
+
+ # Fill the queue up so that doing get() on it will block properly
+ [self.pool.put(None) for i in xrange(maxsize)]
+
+ self.num_connections = 0
+ self.num_requests = 0
+
def _new_conn(self):
"""
Return a fresh HTTPSConnection.
"""
self.num_connections += 1
log.info("Starting new HTTPS connection (%d): %s" % (self.num_connections, self.host))
- return HTTPSConnection(host=self.host, port=self.port)
+
+ if not ssl:
+ return HTTPSConnection(host=self.host, port=self.port)
+
+ connection = VerifiedHTTPSConnection(host=self.host, port=self.port)
+ connection.add_cert(key_file=self.key_file, cert_file=self.cert_file, cert_reqs=self.cert_file, ca_certs=self.ca_certs)
+ return connection
## Helpers
+
def make_headers(accept_encoding=None, user_agent=None):
    """
    Shortcuts for generating request headers.

    accept_encoding
        Can be a boolean, list/tuple, or string.
        True translates to 'gzip,deflate'.
        List or tuple will get joined by comma.
        String (byte or unicode) will be used as provided.
        Any other truthy value is treated like True.

    user_agent
        String representing the user-agent you want, such as "python-urllib3/0.6"

    Returns a dict containing only the headers that were requested; falsy
    arguments add nothing.
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str

    headers = {}
    if accept_encoding:
        if isinstance(accept_encoding, string_types):
            pass
        elif isinstance(accept_encoding, (list, tuple)):
            accept_encoding = ','.join(accept_encoding)
        else:
            accept_encoding = 'gzip,deflate'
        headers['accept-encoding'] = accept_encoding

    if user_agent:
        headers['user-agent'] = user_agent

    return headers
+
+
def get_host(url):
"""
Given a url, return its scheme, host and port (None if it's not there).