diff options
Diffstat (limited to 'python2/crawl.py')
-rw-r--r-- | python2/crawl.py | 31 |
1 file changed, 17 insertions, 14 deletions
diff --git a/python2/crawl.py b/python2/crawl.py
index a597c76..88d7c9e 100644
--- a/python2/crawl.py
+++ b/python2/crawl.py
@@ -1,3 +1,5 @@
+"""Compare the speed of downloading URLs sequentially vs. using futures."""
+
 import datetime
 import functools
 import futures.thread
@@ -11,26 +13,29 @@ URLS = ['http://www.google.com/',
         'http://www.thisurlprobablydoesnotexist.com',
         'http://www.slashdot.org/',
         'http://www.python.org/',
-        'http://www.sweetapp.com/'] * 5
+        'http://www.bing.com/',
+        'http://www.facebook.com/',
+        'http://www.yahoo.com/',
+        'http://www.youtube.com/',
+        'http://www.blogger.com/']
 
-def load_url(url, timeout):
-    return urllib2.urlopen(url, timeout=timeout).read()
+def load_url(url):
+    return urllib2.urlopen(url).read()
 
-def download_urls_sequential(urls, timeout=60):
+def download_urls_sequential(urls):
     url_to_content = {}
     for url in urls:
         try:
-            url_to_content[url] = load_url(url, timeout=timeout)
+            url_to_content[url] = load_url(url)
         except:
             pass
     return url_to_content
 
-def download_urls_with_executor(urls, executor, timeout=60):
+def download_urls_with_executor(urls, executor):
     try:
         url_to_content = {}
         fs = executor.run_to_futures(
-            (functools.partial(load_url, url, timeout) for url in urls),
-            timeout=timeout)
+            (functools.partial(load_url, url) for url in urls))
         for future in fs.successful_futures():
             url = urls[future.index]
             url_to_content[url] = future.result()
@@ -41,17 +46,15 @@ def download_urls_with_executor(urls, executor, timeout=60):
 def main():
     for name, fn in [('sequential',
                       functools.partial(download_urls_sequential, URLS)),
-                     ('processes',
-                      functools.partial(download_urls_with_executor,
-                                        URLS,
-                                        futures.ProcessPoolExecutor(10))),
                      ('threads',
                       functools.partial(download_urls_with_executor,
                                         URLS,
                                         futures.ThreadPoolExecutor(10)))]:
         print '%s: ' % name.ljust(12),
         start = time.time()
-        fn()
-        print '%.2f seconds' % (time.time() - start)
+        url_map = fn()
+        print '%.2f seconds (%d of %d downloaded)' % (time.time() - start,
+                                                      len(url_map),
+                                                      len(URLS))
 
 main()