diff options
Diffstat (limited to 'crawl.py')
-rw-r--r-- | crawl.py | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/crawl.py b/crawl.py new file mode 100644 index 0000000..86e0af7 --- /dev/null +++ b/crawl.py @@ -0,0 +1,74 @@ +"""Compare the speed of downloading URLs sequentially vs. using futures.""" + +import functools +import time +import timeit +import sys + +try: + from urllib2 import urlopen +except ImportError: + from urllib.request import urlopen + +from concurrent.futures import (as_completed, ThreadPoolExecutor, + ProcessPoolExecutor) + +URLS = ['http://www.google.com/', + 'http://www.apple.com/', + 'http://www.ibm.com', + 'http://www.thisurlprobablydoesnotexist.com', + 'http://www.slashdot.org/', + 'http://www.python.org/', + 'http://www.bing.com/', + 'http://www.facebook.com/', + 'http://www.yahoo.com/', + 'http://www.youtube.com/', + 'http://www.blogger.com/'] + +def load_url(url, timeout): + kwargs = {'timeout': timeout} if sys.version_info >= (2, 6) else {} + return urlopen(url, **kwargs).read() + +def download_urls_sequential(urls, timeout=60): + url_to_content = {} + for url in urls: + try: + url_to_content[url] = load_url(url, timeout=timeout) + except: + pass + return url_to_content + +def download_urls_with_executor(urls, executor, timeout=60): + try: + url_to_content = {} + future_to_url = dict((executor.submit(load_url, url, timeout), url) + for url in urls) + + for future in as_completed(future_to_url): + try: + url_to_content[future_to_url[future]] = future.result() + except: + pass + return url_to_content + finally: + executor.shutdown() + +def main(): + for name, fn in [('sequential', + functools.partial(download_urls_sequential, URLS)), + ('processes', + functools.partial(download_urls_with_executor, + URLS, + ProcessPoolExecutor(10))), + ('threads', + functools.partial(download_urls_with_executor, + URLS, + ThreadPoolExecutor(10)))]: + sys.stdout.write('%s: ' % name.ljust(12)) + start = time.time() + url_map = fn() + sys.stdout.write('%.2f seconds (%d of %d downloaded)\n' % + (time.time() - start, len(url_map), len(URLS))) + +if __name__ == '__main__': + main() |