author    Paul McGuire <ptmcg@austin.rr.com>    2019-04-06 23:44:02 -0500
committer Paul McGuire <ptmcg@austin.rr.com>    2019-04-06 23:44:02 -0500
commit    a2439508ba5c94546db98593cfa676de9b59babe (patch)
tree      80b02178820811c09b4befc9a9b5efb092813466 /examples/urlExtractor.py
parent    832986ffccac943b363da43795c335eafc31b5da (diff)
download  pyparsing-git-a2439508ba5c94546db98593cfa676de9b59babe.tar.gz
Fixed dict structure in makeHTMLTags expressions, and added tag_body attribute to the generated start expression, giving easy access to a SkipTo(closeTag) that will parse the tag's body text; also some code cleanup and removal of duplication among examples
Diffstat (limited to 'examples/urlExtractor.py')
-rw-r--r--  examples/urlExtractor.py  60
1 file changed, 30 insertions, 30 deletions
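The commit message above describes the new tag_body attribute attached to the start expression returned by makeHTMLTags. A minimal sketch of how that attribute reads in isolation (the sample HTML string below is made up for illustration and is not part of the commit):

from pyparsing import makeHTMLTags

anchorStart, anchorEnd = makeHTMLTags("a")

# tag_body is a SkipTo(anchorEnd) expression attached to the generated start
# expression, so the text between <a ...> and </a> can be captured without
# hand-building the SkipTo expression.
link = anchorStart + anchorStart.tag_body("body") + anchorEnd.suppress()

sample = '<p>See <a href="https://example.com">the example site</a> for details.</p>'
for toks, _, _ in link.scanString(sample):
    print(toks.href, "->", toks.body)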
diff --git a/examples/urlExtractor.py b/examples/urlExtractor.py
index e4299b9..fbc2fa6 100644
--- a/examples/urlExtractor.py
+++ b/examples/urlExtractor.py
@@ -1,30 +1,30 @@
-# URL extractor
-# Copyright 2004, Paul McGuire
-from pyparsing import makeHTMLTags, SkipTo, pyparsing_common as ppc
-import urllib.request
-from contextlib import closing
-import pprint
-
-linkOpenTag, linkCloseTag = makeHTMLTags('a')
-
-linkBody = SkipTo(linkCloseTag)
-linkBody.setParseAction(ppc.stripHTMLTags)
-linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))
-
-link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()
-
-# Go get some HTML with some links in it.
-with closing(urllib.request.urlopen("https://www.yahoo.com/")) as serverListPage:
- htmlText = serverListPage.read().decode("UTF-8")
-
-# scanString is a generator that loops through the input htmlText, and for each
-# match yields the tokens and start and end locations (for this application, we are
-# not interested in the start and end values).
-for toks,strt,end in link.scanString(htmlText):
- print(toks.asList())
-
-# Create dictionary from list comprehension, assembled from each pair of tokens returned
-# from a matched URL.
-pprint.pprint(
- {toks.body: toks.href for toks,strt,end in link.scanString(htmlText)}
- )
+# URL extractor
+# Copyright 2004, Paul McGuire
+from pyparsing import makeHTMLTags, pyparsing_common as ppc
+import urllib.request
+from contextlib import closing
+import pprint
+
+linkOpenTag, linkCloseTag = makeHTMLTags('a')
+
+linkBody = linkOpenTag.tag_body
+linkBody.setParseAction(ppc.stripHTMLTags)
+linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))
+
+link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()
+
+# Go get some HTML with some links in it.
+with closing(urllib.request.urlopen("https://www.cnn.com/")) as serverListPage:
+ htmlText = serverListPage.read().decode("UTF-8")
+
+# scanString is a generator that loops through the input htmlText, and for each
+# match yields the tokens and start and end locations (for this application, we are
+# not interested in the start and end values).
+for toks, strt, end in link.scanString(htmlText):
+ print(toks.asList())
+
+# Build a dictionary with a dict comprehension, mapping the body text to the href
+# attribute for each matched link.
+pprint.pprint(
+ {toks.body: toks.href for toks, strt, end in link.scanString(htmlText)}
+ )