summaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authorRyan Williams <rdw@lindenlab.com>2010-03-01 23:30:35 -0800
committerRyan Williams <rdw@lindenlab.com>2010-03-01 23:30:35 -0800
commit67ddcdfead13b8f3525de0b5977c8c233cfb8031 (patch)
tree58b0832b13358141f2f089de778d6bb735f7d1e4 /examples
parent7e6db36e0a13ba582a3f272df55bb8cc967f1546 (diff)
downloadeventlet-67ddcdfead13b8f3525de0b5977c8c233cfb8031.tar.gz
Recursive crawler example added.
Diffstat (limited to 'examples')
-rw-r--r--examples/producer_consumer.py51
1 file changed, 51 insertions, 0 deletions
diff --git a/examples/producer_consumer.py b/examples/producer_consumer.py
new file mode 100644
index 0000000..84e2819
--- /dev/null
+++ b/examples/producer_consumer.py
@@ -0,0 +1,51 @@
+"""This is a recursive web crawler. Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+This is a kind of "producer/consumer" example; the producer function produces
+jobs, and the GreenPool itself is the consumer, farming out work concurrently.
+It's easier to write it this way rather than writing a standard consumer loop;
+GreenPool handles any exceptions raised and arranges so that there's a set
+number of "workers", so you don't have to write that tedious management code
+yourself.
+"""
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+# NOTE(review): the original "liberal regex" uses the POSIX character
+# class [:punct:], which Python's re module does not support.  Inside
+# [^[:punct:]\s] the class appears to close at the first ']', leaving
+# '\s]' as extra literal pattern elements -- the regex still finds
+# url-like strings, but its exact trailing-boundary behavior differs
+# from the POSIX original; confirm before relying on precise matches.
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, outq):
+    """Fetch a url and push any urls found into a queue.
+
+    The page body is scanned with the module-level url_regex and every
+    match is put onto *outq* for the caller to examine.
+    """
+    print "fetching", url
+    data = ''
+    # Passing False as the Timeout's second argument suppresses the
+    # timeout exception: a fetch slower than 5 seconds just leaves
+    # data as '' and the page contributes no urls -- TODO confirm
+    # against the eventlet.Timeout documentation.
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        outq.put(new_url)
+
+
+def producer(start_url):
+    """Recursively crawl starting from *start_url*. Returns a set of
+    urls that were found.
+
+    Spawns one fetch() worker per new url; workers push the urls they
+    discover back onto the shared queue, which this loop drains.
+    """
+    pool = eventlet.GreenPool()
+    seen = set()
+    q = eventlet.Queue()
+    q.put(start_url)
+    # keep looping if there are new urls, or workers that may produce more urls
+    while not q.empty() or pool.running() != 0:
+        # Poll the queue for at most 0.1s; on timeout '' is returned so
+        # the loop can re-check pool.running() instead of blocking
+        # forever.  '' never passes the filter below, so it is discarded.
+        url = eventlet.with_timeout(0.1, q.get, timeout_value='')
+        # limit requests to eventlet.net so we don't crash all over the internet
+        if url not in seen and 'eventlet.net' in url:
+            seen.add(url)
+            pool.spawn(fetch, url, q)
+    return seen
+
+
+# Script entry point: crawl starting at eventlet.net and report every
+# unique url that was visited.
+seen = producer("http://eventlet.net")
+print "I saw these urls:"
+print "\n".join(seen) \ No newline at end of file