summaryrefslogtreecommitdiff
path: root/crawl.py
diff options
context:
space:
mode:
Diffstat (limited to 'crawl.py')
-rw-r--r--crawl.py74
1 files changed, 74 insertions, 0 deletions
diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..86e0af7
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,74 @@
+"""Compare the speed of downloading URLs sequentially vs. using futures."""
+
+import functools
+import time
+import timeit
+import sys
+
+try:
+ from urllib2 import urlopen
+except ImportError:
+ from urllib.request import urlopen
+
+from concurrent.futures import (as_completed, ThreadPoolExecutor,
+ ProcessPoolExecutor)
+
+URLS = ['http://www.google.com/',
+ 'http://www.apple.com/',
+ 'http://www.ibm.com',
+ 'http://www.thisurlprobablydoesnotexist.com',
+ 'http://www.slashdot.org/',
+ 'http://www.python.org/',
+ 'http://www.bing.com/',
+ 'http://www.facebook.com/',
+ 'http://www.yahoo.com/',
+ 'http://www.youtube.com/',
+ 'http://www.blogger.com/']
+
+def load_url(url, timeout):
+ kwargs = {'timeout': timeout} if sys.version_info >= (2, 6) else {}
+ return urlopen(url, **kwargs).read()
+
+def download_urls_sequential(urls, timeout=60):
+ url_to_content = {}
+ for url in urls:
+ try:
+ url_to_content[url] = load_url(url, timeout=timeout)
+ except:
+ pass
+ return url_to_content
+
+def download_urls_with_executor(urls, executor, timeout=60):
+ try:
+ url_to_content = {}
+ future_to_url = dict((executor.submit(load_url, url, timeout), url)
+ for url in urls)
+
+ for future in as_completed(future_to_url):
+ try:
+ url_to_content[future_to_url[future]] = future.result()
+ except:
+ pass
+ return url_to_content
+ finally:
+ executor.shutdown()
+
+def main():
+ for name, fn in [('sequential',
+ functools.partial(download_urls_sequential, URLS)),
+ ('processes',
+ functools.partial(download_urls_with_executor,
+ URLS,
+ ProcessPoolExecutor(10))),
+ ('threads',
+ functools.partial(download_urls_with_executor,
+ URLS,
+ ThreadPoolExecutor(10)))]:
+ sys.stdout.write('%s: ' % name.ljust(12))
+ start = time.time()
+ url_map = fn()
+ sys.stdout.write('%.2f seconds (%d of %d downloaded)\n' %
+ (time.time() - start, len(url_map), len(URLS)))
+
+if __name__ == '__main__':
+ main()