summary | refs | log | tree | commit | diff
path: root/crawl.py
diff options
context:
space:
mode:
Diffstat (limited to 'crawl.py')
-rw-r--r-- crawl.py 38
1 files changed, 38 insertions, 0 deletions
diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..57f9e8e
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,38 @@
+import datetime
+import functools
+import futures.thread
+import time
+import timeit
+import urllib.request
+
# Sample pages fetched by the download functions in this script.  One of the
# hostnames is, per its name, expected not to exist, so the failure path of
# the downloaders gets exercised as well.
URLS = [
    'http://www.google.com/',
    'http://www.apple.com/',
    'http://www.ibm.com',
    'http://www.thisurlprobablydoesnotexist.com',
    'http://www.slashdot.org/',
    'http://www.python.org/',
    'http://www.sweetapp.com/',
]
+
def load_url(url, timeout):
    """Fetch *url* and return the complete response body.

    Args:
        url: The URL to open; passed straight to ``urllib.request.urlopen``.
        timeout: Timeout in seconds, forwarded to ``urlopen``.

    Returns:
        The response body as ``bytes``.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by reading the
        response (for example ``urllib.error.URLError``) is propagated to
        the caller unchanged.
    """
    # Use the response as a context manager so the underlying connection is
    # closed deterministically, even if read() raises part-way through.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read()
+
def download_urls(urls, timeout=60):
    """Download every URL in *urls* one after another.

    Args:
        urls: Iterable of URLs to fetch.
        timeout: Per-request timeout in seconds, forwarded to ``load_url``.

    Returns:
        A dict mapping each successfully fetched URL to the value returned
        by ``load_url`` for it.  URLs whose download raised an error are
        left out of the result.
    """
    url_to_content = {}
    for url in urls:
        try:
            url_to_content[url] = load_url(url, timeout=timeout)
        except Exception:
            # Best-effort crawl: a failing URL is skipped rather than
            # aborting the whole run.  Catching Exception (instead of a bare
            # ``except``) lets KeyboardInterrupt and SystemExit propagate,
            # so the crawl can still be interrupted.
            continue
    return url_to_content
+
# Shared thread pool used by download_urls_with_futures.
executor = futures.thread.ThreadPoolExecutor(max_threads=100)


def download_urls_with_futures(urls, timeout=60):
    """Download *urls* through the module-level ``executor``.

    One ``load_url(url, timeout)`` call is built per URL and the whole batch
    is handed to ``executor.run``; the futures it reports are then paired
    with the URLs positionally.

    Args:
        urls: URLs to fetch.  The argument is iterated twice (once to build
            the calls, once to pair URLs with futures).
            NOTE(review): a one-shot iterator would presumably be exhausted
            by the first pass -- confirm callers pass a list.
        timeout: Used both as the per-request timeout given to ``load_url``
            and as the ``timeout`` argument of ``executor.run``.

    Returns:
        A dict mapping URL to ``future.result()``.

    Unlike ``download_urls``, nothing is caught here: whatever
    ``future.result()`` raises propagates to the caller.
    """
    calls = (functools.partial(load_url, url, timeout) for url in urls)
    future_list = executor.run(calls, timeout=timeout)
    # NOTE(review): assumes result_futures() yields futures in the same
    # order as the submitted calls -- confirm against the futures library.
    return {
        url: future.result()
        for url, future in zip(urls, future_list.result_futures())
    }
+
# Script body: fetch the sample URLs sequentially and print the resulting
# url -> content mapping.
downloaded = download_urls(URLS)
print(downloaded)