diff options
| author | Ryan Williams <rdw@lindenlab.com> | 2010-03-01 23:30:35 -0800 |
|---|---|---|
| committer | Ryan Williams <rdw@lindenlab.com> | 2010-03-01 23:30:35 -0800 |
| commit | 67ddcdfead13b8f3525de0b5977c8c233cfb8031 (patch) | |
| tree | 58b0832b13358141f2f089de778d6bb735f7d1e4 /examples | |
| parent | 7e6db36e0a13ba582a3f272df55bb8cc967f1546 (diff) | |
| download | eventlet-67ddcdfead13b8f3525de0b5977c8c233cfb8031.tar.gz | |
Recursive crawler example added.
Diffstat (limited to 'examples')
| -rw-r--r-- | examples/producer_consumer.py | 51 |
1 files changed, 51 insertions, 0 deletions
# examples/producer_consumer.py
"""This is a recursive web crawler.  Don't go pointing this at random sites;
it doesn't respect robots.txt and it is pretty brutal about how quickly it
fetches pages.

This is a kind of "producer/consumer" example; the producer function produces
jobs, and the GreenPool itself is the consumer, farming out work concurrently.
It's easier to write it this way rather than writing a standard consumer loop;
GreenPool handles any exceptions raised and arranges so that there's a set
number of "workers", so you don't have to write that tedious management code
yourself.
"""

from eventlet.green import urllib2
import contextlib
import eventlet
import re
import string

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
#
# The pattern published there uses the POSIX class [:punct:], which Python's
# re module does not understand: "[^[:punct:]\s]" is parsed as the character
# set "[^[:punct:]" followed by whitespace and a literal "]", so that branch
# practically never matches and urls get cut back to their last "/".  The
# class is therefore spelled out explicitly from string.punctuation.
_NOT_PUNCT_OR_SPACE = r'[^%s\s]' % re.escape(string.punctuation)
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|('
    + _NOT_PUNCT_OR_SPACE
    + r'|/)))')


def fetch(url, outq):
    """Fetch a url and push any urls found into a queue.

    *url* is the address to download and *outq* is the queue that receives
    every url matched by ``url_regex`` in the downloaded body.  If the
    download takes longer than 5 seconds it is abandoned silently and
    nothing is pushed.
    """
    print("fetching %s" % url)
    data = ''
    # Timeout(5, False) cancels the body after 5 seconds without raising,
    # leaving data as the empty string.
    with eventlet.Timeout(5, False):
        # closing() releases the connection even if the read is interrupted.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            data = response.read()
    for url_match in url_regex.finditer(data):
        outq.put(url_match.group(0))


def producer(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of
    urls that were found."""
    pool = eventlet.GreenPool()
    seen = set()
    q = eventlet.Queue()
    q.put(start_url)
    # keep looping if there are new urls, or workers that may produce more urls
    while not q.empty() or pool.running() != 0:
        # Wait briefly for the next url; '' is returned on timeout and is
        # rejected by the 'eventlet.net' filter below, so the loop simply
        # re-checks its exit condition.
        url = eventlet.with_timeout(0.1, q.get, timeout_value='')
        # limit requests to eventlet.net so we don't crash all over the internet
        if url not in seen and 'eventlet.net' in url:
            seen.add(url)
            pool.spawn(fetch, url, q)
    return seen


if __name__ == "__main__":
    seen = producer("http://eventlet.net")
    print("I saw these urls:")
    print("\n".join(seen))
\ No newline at end of file |
