diff options
author | Paul McGuire <ptmcg@austin.rr.com> | 2019-04-06 23:44:02 -0500 |
---|---|---|
committer | Paul McGuire <ptmcg@austin.rr.com> | 2019-04-06 23:44:02 -0500 |
commit | a2439508ba5c94546db98593cfa676de9b59babe (patch) | |
tree | 80b02178820811c09b4befc9a9b5efb092813466 /examples/htmlStripper.py | |
parent | 832986ffccac943b363da43795c335eafc31b5da (diff) | |
download | pyparsing-git-a2439508ba5c94546db98593cfa676de9b59babe.tar.gz |
Fixed dict structure in makeHTMLTags expressions, and added a tag_body attribute to the generated start expression, giving easy access to a SkipTo(closeTag) that will parse the tag's body text. Also did some code cleanup and removed duplication among examples.
Diffstat (limited to 'examples/htmlStripper.py')
-rw-r--r-- | examples/htmlStripper.py | 64 |
1 files changed, 32 insertions, 32 deletions
diff --git a/examples/htmlStripper.py b/examples/htmlStripper.py
index c3dbcf1..18f3395 100644
--- a/examples/htmlStripper.py
+++ b/examples/htmlStripper.py
@@ -1,32 +1,32 @@
-#
-# htmlStripper.py
-#
-# Sample code for stripping HTML markup tags and scripts from
-# HTML source files.
-#
-# Copyright (c) 2006, 2016, Paul McGuire
-#
-from contextlib import closing
-import urllib.request, urllib.parse, urllib.error
-from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
- htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
-
-scriptOpen,scriptClose = makeHTMLTags("script")
-scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
-commonHTMLEntity.setParseAction(replaceHTMLEntity)
-
-# get some HTML
-targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
-with closing(urllib.request.urlopen( targetURL )) as targetPage:
- targetHTML = targetPage.read().decode("UTF-8")
-
-# first pass, strip out tags and translate entities
-firstPass = (htmlComment | scriptBody | commonHTMLEntity |
- anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
-
-# first pass leaves many blank lines, collapse these down
-repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
-repeatedNewlines.setParseAction(replaceWith("\n\n"))
-secondPass = repeatedNewlines.transformString(firstPass)
-
-print(secondPass)
+#
+# htmlStripper.py
+#
+# Sample code for stripping HTML markup tags and scripts from
+# HTML source files.
+#
+# Copyright (c) 2006, 2016, Paul McGuire
+#
+from contextlib import closing
+import urllib.request, urllib.parse, urllib.error
+from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity,
+ htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
+
+scriptOpen, scriptClose = makeHTMLTags("script")
+scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
+commonHTMLEntity.setParseAction(replaceHTMLEntity)
+
+# get some HTML
+targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
+with closing(urllib.request.urlopen( targetURL )) as targetPage:
+ targetHTML = targetPage.read().decode("UTF-8")
+
+# first pass, strip out tags and translate entities
+firstPass = (htmlComment | scriptBody | commonHTMLEntity |
+ anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
+
+# first pass leaves many blank lines, collapse these down
+repeatedNewlines = LineEnd()*(2,)
+repeatedNewlines.setParseAction(replaceWith("\n\n"))
+secondPass = repeatedNewlines.transformString(firstPass)
+
+print(secondPass)