summary refs log tree commit diff
path: root/python2/crawl.py
diff options
context:
space:
mode:
Diffstat (limited to 'python2/crawl.py')
-rw-r--r--    python2/crawl.py    31
1 files changed, 17 insertions, 14 deletions
diff --git a/python2/crawl.py b/python2/crawl.py
index a597c76..88d7c9e 100644
--- a/python2/crawl.py
+++ b/python2/crawl.py
@@ -1,3 +1,5 @@
+"""Compare the speed of downloading URLs sequentially vs. using futures."""
+
import datetime
import functools
import futures.thread
@@ -11,26 +13,29 @@ URLS = ['http://www.google.com/',
'http://www.thisurlprobablydoesnotexist.com',
'http://www.slashdot.org/',
'http://www.python.org/',
- 'http://www.sweetapp.com/'] * 5
+ 'http://www.bing.com/',
+ 'http://www.facebook.com/',
+ 'http://www.yahoo.com/',
+ 'http://www.youtube.com/',
+ 'http://www.blogger.com/']
-def load_url(url, timeout):
- return urllib2.urlopen(url, timeout=timeout).read()
+def load_url(url):
+ return urllib2.urlopen(url).read()
-def download_urls_sequential(urls, timeout=60):
+def download_urls_sequential(urls):
url_to_content = {}
for url in urls:
try:
- url_to_content[url] = load_url(url, timeout=timeout)
+ url_to_content[url] = load_url(url)
except:
pass
return url_to_content
-def download_urls_with_executor(urls, executor, timeout=60):
+def download_urls_with_executor(urls, executor):
try:
url_to_content = {}
fs = executor.run_to_futures(
- (functools.partial(load_url, url, timeout) for url in urls),
- timeout=timeout)
+ (functools.partial(load_url, url) for url in urls))
for future in fs.successful_futures():
url = urls[future.index]
url_to_content[url] = future.result()
@@ -41,17 +46,15 @@ def download_urls_with_executor(urls, executor, timeout=60):
def main():
for name, fn in [('sequential',
functools.partial(download_urls_sequential, URLS)),
- ('processes',
- functools.partial(download_urls_with_executor,
- URLS,
- futures.ProcessPoolExecutor(10))),
('threads',
functools.partial(download_urls_with_executor,
URLS,
futures.ThreadPoolExecutor(10)))]:
print '%s: ' % name.ljust(12),
start = time.time()
- fn()
- print '%.2f seconds' % (time.time() - start)
+ url_map = fn()
+ print '%.2f seconds (%d of %d downloaded)' % (time.time() - start,
+ len(url_map),
+ len(URLS))
main()