Fixed dict structure in makeHTMLTags expressions, and added a tag_body attribute to the generated start expression, giving easy access to a SkipTo(closeTag) that will parse the tag's body text; also cleaned up some code and removed duplication among examples

author: Paul McGuire <ptmcg@austin.rr.com> 2019-04-06 23:44:02 -0500
committer: Paul McGuire <ptmcg@austin.rr.com> 2019-04-06 23:44:02 -0500
commit: a2439508ba5c94546db98593cfa676de9b59babe (patch)
tree: 80b02178820811c09b4befc9a9b5efb092813466 /examples/htmlStripper.py
parent: 832986ffccac943b363da43795c335eafc31b5da (diff)
download: pyparsing-git-a2439508ba5c94546db98593cfa676de9b59babe.tar.gz
1 files changed, 32 insertions, 32 deletions
diff --git a/examples/htmlStripper.py b/examples/htmlStripper.py
index c3dbcf1..18f3395 100644
--- a/examples/htmlStripper.py
+++ b/examples/htmlStripper.py
@@ -1,32 +1,32 @@
-#
-# htmlStripper.py
-#
-#  Sample code for stripping HTML markup tags and scripts from
-#  HTML source files.
-#
-# Copyright (c) 2006, 2016, Paul McGuire
-#
-from contextlib import closing
-import urllib.request, urllib.parse, urllib.error
-from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
-    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
-
-scriptOpen,scriptClose = makeHTMLTags("script")
-scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
-commonHTMLEntity.setParseAction(replaceHTMLEntity)
-
-# get some HTML
-targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
-with closing(urllib.request.urlopen( targetURL )) as targetPage:
-    targetHTML = targetPage.read().decode("UTF-8")
-
-# first pass, strip out tags and translate entities
-firstPass = (htmlComment | scriptBody | commonHTMLEntity |
-             anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
-
-# first pass leaves many blank lines, collapse these down
-repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
-repeatedNewlines.setParseAction(replaceWith("\n\n"))
-secondPass = repeatedNewlines.transformString(firstPass)
-
-print(secondPass)
+#
+# htmlStripper.py
+#
+#  Sample code for stripping HTML markup tags and scripts from
+#  HTML source files.
+#
+# Copyright (c) 2006, 2016, Paul McGuire
+#
+from contextlib import closing
+import urllib.request, urllib.parse, urllib.error
+from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity,
+    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
+
+scriptOpen, scriptClose = makeHTMLTags("script")
+scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
+commonHTMLEntity.setParseAction(replaceHTMLEntity)
+
+# get some HTML
+targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
+with closing(urllib.request.urlopen( targetURL )) as targetPage:
+    targetHTML = targetPage.read().decode("UTF-8")
+
+# first pass, strip out tags and translate entities
+firstPass = (htmlComment | scriptBody | commonHTMLEntity |
+             anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
+
+# first pass leaves many blank lines, collapse these down
+repeatedNewlines = LineEnd()*(2,)
+repeatedNewlines.setParseAction(replaceWith("\n\n"))
+secondPass = repeatedNewlines.transformString(firstPass)
+
+print(secondPass)
author	Paul McGuire <ptmcg@austin.rr.com>	2019-04-06 23:44:02 -0500
committer	Paul McGuire <ptmcg@austin.rr.com>	2019-04-06 23:44:02 -0500
commit	a2439508ba5c94546db98593cfa676de9b59babe (patch)
tree	80b02178820811c09b4befc9a9b5efb092813466 /examples/htmlStripper.py
parent	832986ffccac943b363da43795c335eafc31b5da (diff)
download	pyparsing-git-a2439508ba5c94546db98593cfa676de9b59babe.tar.gz