From a2439508ba5c94546db98593cfa676de9b59babe Mon Sep 17 00:00:00 2001
From: Paul McGuire
Date: Sat, 6 Apr 2019 23:44:02 -0500
Subject: Fixed dict structure in makeHTMLTags expressions, and added tag_body attribute to the generated start expression giving easy access to a SkipTo(closeTag) that will parse the tag's body text; some code cleanup and removed duplication among examples

---
 examples/htmlStripper.py | 64 ++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'examples/htmlStripper.py')

diff --git a/examples/htmlStripper.py b/examples/htmlStripper.py
index c3dbcf1..18f3395 100644
--- a/examples/htmlStripper.py
+++ b/examples/htmlStripper.py
@@ -1,32 +1,32 @@
-#
-# htmlStripper.py
-#
-# Sample code for stripping HTML markup tags and scripts from
-# HTML source files.
-#
-# Copyright (c) 2006, 2016, Paul McGuire
-#
-from contextlib import closing
-import urllib.request, urllib.parse, urllib.error
-from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
-    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
-
-scriptOpen,scriptClose = makeHTMLTags("script")
-scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
-commonHTMLEntity.setParseAction(replaceHTMLEntity)
-
-# get some HTML
-targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
-with closing(urllib.request.urlopen( targetURL )) as targetPage:
-    targetHTML = targetPage.read().decode("UTF-8")
-
-# first pass, strip out tags and translate entities
-firstPass = (htmlComment | scriptBody | commonHTMLEntity |
-             anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
-
-# first pass leaves many blank lines, collapse these down
-repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
-repeatedNewlines.setParseAction(replaceWith("\n\n"))
-secondPass = repeatedNewlines.transformString(firstPass)
-
-print(secondPass)
+#
+# htmlStripper.py
+#
+# Sample code for stripping HTML markup tags and scripts from
+# HTML source files.
+#
+# Copyright (c) 2006, 2016, Paul McGuire
+#
+from contextlib import closing
+import urllib.request, urllib.parse, urllib.error
+from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity,
+    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
+
+scriptOpen, scriptClose = makeHTMLTags("script")
+scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
+commonHTMLEntity.setParseAction(replaceHTMLEntity)
+
+# get some HTML
+targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
+with closing(urllib.request.urlopen( targetURL )) as targetPage:
+    targetHTML = targetPage.read().decode("UTF-8")
+
+# first pass, strip out tags and translate entities
+firstPass = (htmlComment | scriptBody | commonHTMLEntity |
+             anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)
+
+# first pass leaves many blank lines, collapse these down
+repeatedNewlines = LineEnd()*(2,)
+repeatedNewlines.setParseAction(replaceWith("\n\n"))
+secondPass = repeatedNewlines.transformString(firstPass)
+
+print(secondPass)
-- 
cgit v1.2.1