diff options
Diffstat (limited to 'trunk/src/examples/urlExtractor.py')
-rw-r--r-- | trunk/src/examples/urlExtractor.py | 33 |
1 files changed, 0 insertions, 33 deletions
diff --git a/trunk/src/examples/urlExtractor.py b/trunk/src/examples/urlExtractor.py deleted file mode 100644 index 2c66d78..0000000 --- a/trunk/src/examples/urlExtractor.py +++ /dev/null @@ -1,33 +0,0 @@ -# URL extractor
-# Copyright 2004, Paul McGuire
-from pyparsing import makeHTMLTags, SkipTo, pyparsing_common
-import urllib.request
-from contextlib import closing
-import pprint
-
-linkOpenTag, linkCloseTag = makeHTMLTags('a')
-
-linkBody = SkipTo(linkCloseTag)
-linkBody.setParseAction(pyparsing_common.stripHTMLTags)
-linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))
-
-link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()
-
-# Go get some HTML with some links in it.
-with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage:
- htmlText = serverListPage.read().decode("UTF-8")
-
-# scanString is a generator that loops through the input htmlText, and for each
-# match yields the tokens and start and end locations (for this application, we are
-# not interested in the start and end values).
-for toks,strt,end in link.scanString(htmlText):
- print(toks.asList())
-
-# Create dictionary from list comprehension, assembled from each pair of tokens returned
-# from a matched URL.
-pprint.pprint(
- dict((toks.body, toks.href) for toks,strt,end in link.scanString(htmlText))
- )
-
-
-
|