#
# htmlStripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, Paul McGuire
#
# Provenance: created as examples/htmlStripper.py by commit
# 27e183a78c8062ed7c2bbb91655a5e56cd697bba ("Move src to root",
# Cengiz Kaygusuz, Mon, 20 Nov 2017 20:46:39 -0500).
#
from contextlib import closing
import urllib.request
import urllib.parse
import urllib.error

from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)

# A <script> element is matched as one unit (open tag, everything up to the
# close tag, close tag) so its body is removed together with the tags.
scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose

# Entities such as "&amp;" are replaced by the character they stand for.
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
with closing(urllib.request.urlopen(targetURL)) as targetPage:
    # Use the charset announced in the response headers when there is one;
    # fall back to UTF-8, which is what this script always assumed before.
    pageCharset = targetPage.headers.get_content_charset() or "UTF-8"
    targetHTML = targetPage.read().decode(pageCharset)

# first pass, strip out tags and translate entities
#
# Only the markup alternatives are suppressed.  commonHTMLEntity must stay
# outside of suppress(): a suppressed expression discards its tokens, which
# would throw away the text produced by replaceHTMLEntity and delete the
# entities instead of translating them.
markupToStrip = (htmlComment | scriptBody | anyOpenTag | anyCloseTag).suppress()
firstPass = (markupToStrip | commonHTMLEntity).transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)

print(secondPass)