diff options
| author | Ryan Williams <breath@alum.mit.edu> | 2010-06-21 21:44:44 -0700 |
|---|---|---|
| committer | Ryan Williams <breath@alum.mit.edu> | 2010-06-21 21:44:44 -0700 |
| commit | 7befa471d5ad5d8d7985a2489723848e7e35898d (patch) | |
| tree | 8a5fd455bf9628df30e258d5c448b7f9bc24f05b /examples | |
| parent | 0512896657acdf08a9854053f713d9194018057a (diff) | |
| download | eventlet-7befa471d5ad5d8d7985a2489723848e7e35898d.tar.gz | |
Added cleaner recursive web crawler example.
Diffstat (limited to 'examples')
| -rw-r--r-- | examples/recursive_crawler.py | 49 |
1 files changed, 49 insertions, 0 deletions
diff --git a/examples/recursive_crawler.py b/examples/recursive_crawler.py
new file mode 100644
index 0000000..2e8701e
--- /dev/null
+++ b/examples/recursive_crawler.py
@@ -0,0 +1,49 @@
+"""This is a recursive web crawler.  Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+The code for this is very short; this is perhaps a good indication
+that this is making the most effective use of the primitives at hand.
+The fetch function does all the work of making http requests,
+searching for new urls, and dispatching new fetches.  The GreenPool
+acts as sort of a job coordinator (and concurrency controller of
+course).
+"""
+from __future__ import with_statement
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, seen, pool):
+    """Fetch a url, stick any found urls into the seen set, and
+    dispatch any new ones to the pool."""
+    print "fetching", url
+    data = ''
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        # only send requests to eventlet.net so as not to destroy the internet
+        if new_url not in seen and 'eventlet.net' in new_url:
+            seen.add(new_url)
+            # while this seems stack-recursive, it's actually not:
+            # spawned greenthreads start their own stacks
+            pool.spawn_n(fetch, new_url, seen, pool)
+
+def crawl(start_url):
+    """Recursively crawl starting from *start_url*.  Returns a set of
+    urls that were found."""
+    pool = eventlet.GreenPool()
+    seen = set()
+    fetch(start_url, seen, pool)
+    pool.waitall()
+    return seen
+
+seen = crawl("http://eventlet.net")
+print "I saw these urls:"
+print "\n".join(seen)
