1 files changed, 74 insertions, 0 deletions
diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..86e0af7
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,74 @@
+"""Compare the speed of downloading URLs sequentially vs. using futures."""
+
+import functools
+import time
+import timeit
+import sys
+
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+
+from concurrent.futures import (as_completed, ThreadPoolExecutor,
+                                ProcessPoolExecutor)
+
+# Sites fetched by every benchmark run in this module.
+URLS = ['http://www.google.com/',
+        'http://www.apple.com/',
+        'http://www.ibm.com',
+        # NOTE(review): presumably a deliberately unresolvable host, so that
+        # the "N of M downloaded" count exercises the failure path - confirm.
+        'http://www.thisurlprobablydoesnotexist.com',
+        'http://www.slashdot.org/',
+        'http://www.python.org/',
+        'http://www.bing.com/',
+        'http://www.facebook.com/',
+        'http://www.yahoo.com/',
+        'http://www.youtube.com/',
+        'http://www.blogger.com/']
+
+def load_url(url, timeout):
+    kwargs = {'timeout': timeout} if sys.version_info >= (2, 6) else {}
+    return urlopen(url, **kwargs).read()
+
+def download_urls_sequential(urls, timeout=60):
+    url_to_content = {}
+    for url in urls:
+        try:
+            url_to_content[url] = load_url(url, timeout=timeout)
+        except:
+            pass
+    return url_to_content
+
+def download_urls_with_executor(urls, executor, timeout=60):
+    try:
+        url_to_content = {}
+        future_to_url = dict((executor.submit(load_url, url, timeout), url)
+                             for url in urls)
+
+        for future in as_completed(future_to_url):
+            try:
+                url_to_content[future_to_url[future]] = future.result()
+            except:
+                pass
+        return url_to_content
+    finally:
+        executor.shutdown()
+
+def main():
+    for name, fn in [('sequential',
+                      functools.partial(download_urls_sequential, URLS)),
+                     ('processes',
+                      functools.partial(download_urls_with_executor,
+                                        URLS,
+                                        ProcessPoolExecutor(10))),
+                     ('threads',
+                      functools.partial(download_urls_with_executor,
+                                        URLS,
+                                        ThreadPoolExecutor(10)))]:
+        sys.stdout.write('%s: ' % name.ljust(12))
+        start = time.time()
+        url_map = fn()
+        sys.stdout.write('%.2f seconds (%d of %d downloaded)\n' %
+                         (time.time() - start, len(url_map), len(URLS)))
+
+# Run the benchmark only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == '__main__':
+    main()