summaryrefslogtreecommitdiff
path: root/trunk/src/examples/htmlStripper.py
diff options
context:
space:
mode:
Diffstat (limited to 'trunk/src/examples/htmlStripper.py')
-rw-r--r--trunk/src/examples/htmlStripper.py39
1 files changed, 0 insertions, 39 deletions
diff --git a/trunk/src/examples/htmlStripper.py b/trunk/src/examples/htmlStripper.py
deleted file mode 100644
index 0b0f459..0000000
--- a/trunk/src/examples/htmlStripper.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#
-# htmlStripper.py
-#
-# Sample code for stripping HTML markup tags and scripts from
-# HTML source files.
-#
-# Copyright (c) 2006, Paul McGuire
-#
-from pyparsing import *
-import urllib.request, urllib.parse, urllib.error
-
-removeText = replaceWith("")
-scriptOpen,scriptClose = makeHTMLTags("script")
-scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
-scriptBody.setParseAction(removeText)
-
-anyTag,anyClose = makeHTMLTags(Word(alphas,alphanums+":_"))
-anyTag.setParseAction(removeText)
-anyClose.setParseAction(removeText)
-htmlComment.setParseAction(removeText)
-
-commonHTMLEntity.setParseAction(replaceHTMLEntity)
-
-# get some HTML
-targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
-targetPage = urllib.request.urlopen( targetURL )
-targetHTML = targetPage.read()
-targetPage.close()
-
-# first pass, strip out tags and translate entities
-firstPass = (htmlComment | scriptBody | commonHTMLEntity |
- anyTag | anyClose ).transformString(targetHTML)
-
-# first pass leaves many blank lines, collapse these down
-repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
-repeatedNewlines.setParseAction(replaceWith("\n\n"))
-secondPass = repeatedNewlines.transformString(firstPass)
-
-print(secondPass) \ No newline at end of file