summaryrefslogtreecommitdiff
path: root/python2/crawl.py
diff options
context:
space:
mode:
Diffstat (limited to 'python2/crawl.py')
-rw-r--r--python2/crawl.py38
1 files changed, 21 insertions, 17 deletions
diff --git a/python2/crawl.py b/python2/crawl.py
index cf4a218..f1d29ca 100644
--- a/python2/crawl.py
+++ b/python2/crawl.py
@@ -19,26 +19,29 @@ URLS = ['http://www.google.com/',
'http://www.youtube.com/',
'http://www.blogger.com/']
-def load_url(url):
- return urllib2.urlopen(url).read()
+def load_url(url, timeout):
+ return urllib2.urlopen(url, timeout=timeout).read()
-def download_urls_sequential(urls):
+def download_urls_sequential(urls, timeout=60):
url_to_content = {}
for url in urls:
try:
- url_to_content[url] = load_url(url)
+ url_to_content[url] = load_url(url, timeout=timeout)
except:
pass
return url_to_content
-def download_urls_with_executor(urls, executor):
+def download_urls_with_executor(urls, executor, timeout=60):
try:
url_to_content = {}
- fs = executor.run_to_futures(
- (functools.partial(load_url, url) for url in urls))
- for future in fs.successful_futures():
- url = urls[future.index]
- url_to_content[url] = future.result()
+ future_to_url = dict((executor.submit(load_url, url, timeout), url)
+ for url in urls)
+
+ for future in futures.as_completed(future_to_url):
+ try:
+ url_to_content[future_to_url[future]] = future.result()
+ except:
+ pass
return url_to_content
finally:
executor.shutdown()
@@ -46,19 +49,20 @@ def download_urls_with_executor(urls, executor):
def main():
for name, fn in [('sequential',
functools.partial(download_urls_sequential, URLS)),
- ('threads',
+ ('processes',
functools.partial(download_urls_with_executor,
URLS,
- futures.ThreadPoolExecutor(10))),
- ('processes',
+ futures.ProcessPoolExecutor(10))),
+ ('threads',
functools.partial(download_urls_with_executor,
URLS,
futures.ThreadPoolExecutor(10)))]:
- print '%s: ' % name.ljust(12),
+ print name.ljust(12),
start = time.time()
url_map = fn()
print '%.2f seconds (%d of %d downloaded)' % (time.time() - start,
- len(url_map),
- len(URLS))
+ len(url_map),
+ len(URLS))
-main()
+if __name__ == '__main__':
+ main()