Added cleaner recursive web crawler example.

author: Ryan Williams <breath@alum.mit.edu> 2010-06-21 21:44:44 -0700
committer: Ryan Williams <breath@alum.mit.edu> 2010-06-21 21:44:44 -0700
commit: 7befa471d5ad5d8d7985a2489723848e7e35898d (patch)
tree: 8a5fd455bf9628df30e258d5c448b7f9bc24f05b /examples
parent: 0512896657acdf08a9854053f713d9194018057a (diff)
download: eventlet-7befa471d5ad5d8d7985a2489723848e7e35898d.tar.gz
1 files changed, 49 insertions, 0 deletions
diff --git a/examples/recursive_crawler.py b/examples/recursive_crawler.py
new file mode 100644
index 0000000..2e8701e
--- /dev/null
+++ b/examples/recursive_crawler.py
@@ -0,0 +1,49 @@
+"""This is a recursive web crawler.  Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it 
+fetches pages.
+
+The code for this is very short; this is perhaps a good indication
+that this is making the most effective use of the primitives at hand.
+The fetch function does all the work of making http requests,
+searching for new urls, and dispatching new fetches.  The GreenPool
+acts as sort of a job coordinator (and concurrency controller of
+course).
+"""
+from __future__ import with_statement
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, seen, pool):
+    """Fetch a url, stick any found urls into the seen set, and
+    dispatch any new ones to the pool."""
+    print "fetching", url
+    data = ''
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        # only send requests to eventlet.net so as not to destroy the internet
+        if new_url not in seen and 'eventlet.net' in new_url:
+            seen.add(new_url)
+            # while this seems stack-recursive, it's actually not:
+            # spawned greenthreads start their own stacks
+            pool.spawn_n(fetch, new_url, seen, pool)
+            
+def crawl(start_url):
+    """Recursively crawl starting from *start_url*.  Returns a set of 
+    urls that were found."""
+    pool = eventlet.GreenPool()
+    seen = set()
+    fetch(start_url, seen, pool)
+    pool.waitall()
+    return seen
+
+seen = crawl("http://eventlet.net")
+print "I saw these urls:"
+print "\n".join(seen)
author	Ryan Williams <breath@alum.mit.edu>	2010-06-21 21:44:44 -0700
committer	Ryan Williams <breath@alum.mit.edu>	2010-06-21 21:44:44 -0700
commit	7befa471d5ad5d8d7985a2489723848e7e35898d (patch)
tree	8a5fd455bf9628df30e258d5c448b7f9bc24f05b /examples
parent	0512896657acdf08a9854053f713d9194018057a (diff)
download	eventlet-7befa471d5ad5d8d7985a2489723848e7e35898d.tar.gz