Added a library full of diagnostics to make tech support easier.

author: Leonard Richardson <leonardr@segfault.org> 2013-05-06 13:55:56 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-05-06 13:55:56 -0400
commit: 6b77b294760641ebd3582e8124cec119ce874649 (patch)
tree: 87204be4308fde67c58f6cd3e35c25adb478dde1 /bs4/diagnose.py
parent: e54dfca6930f367d0092034ead1a7d7e0228a4a7 (diff)
download: beautifulsoup4-6b77b294760641ebd3582e8124cec119ce874649.tar.gz
1 files changed, 85 insertions, 0 deletions
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
new file mode 100644
index 0000000..5192a3f
--- /dev/null
+++ b/bs4/diagnose.py
@@ -0,0 +1,85 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+from StringIO import StringIO
+from HTMLParser import HTMLParser
+from bs4 import BeautifulSoup
+from bs4.builder import builder_registry
+import traceback
+import sys
+
+def diagnose(data):
+    """Diagnostic suite for isolating common problems."""
+    basic_parsers = ["html.parser", "html5lib", "lxml"]
+    for name in basic_parsers:
+        for builder in builder_registry.builders:
+            if name in builder.features:
+                break
+        else:
+            basic_parsers.remove(name)
+            print (
+                "I noticed that %s is not installed. Installing it may help." %
+                name)
+
+    if 'lxml' in basic_parsers:
+        basic_parsers.append(["lxml", "xml"])
+
+    for parser in basic_parsers:
+        print "Trying to parse your data with %s" % parser
+        try:
+            soup = BeautifulSoup(data, parser)
+            print "Here's what %s did with the document:" % parser
+            print soup.prettify()
+        except Exception, e:
+            print "%s could not parse the document." % parser
+            traceback.print_exc()
+        print "-" * 80
+
+def lxml_trace(data, html=True):
+    """Print out the lxml events that occur during parsing.
+
+    This lets you see how lxml parses a document when no Beautiful
+    Soup code is running.
+    """
+    from lxml import etree
+    for event, element in etree.iterparse(StringIO(data), html=html):
+        print("%s, %4s, %s" % (event, element.tag, element.text))
+
+class AnnouncingParser(HTMLParser):
+    """Announces HTMLParser parse events, without doing anything else."""
+    def handle_starttag(self, name, attrs):
+        print "%s START" % name
+
+    def handle_endtag(self, name):
+        print "%s END" % name
+
+    def handle_data(self, data):
+        print "%s DATA" % data
+
+    def handle_charref(self, name):
+        print "%s CHARREF" % name
+
+    def handle_entityref(self, name):
+        print "%s ENTITYREF" % name
+
+    def handle_comment(self, data):
+        print "%s COMMENT" % data
+
+    def handle_decl(self, data):
+        print "%s DECL" % data
+
+    def unknown_decl(self, data):
+        print "%s UNKNOWN-DECL" % data
+
+    def handle_pi(self, data):
+        print "%s PI" % data
+
+def htmlparser_trace(data):
+    """Print out the HTMLParser events that occur during parsing.
+
+    This lets you see how HTMLParser parses a document when no
+    Beautiful Soup code is running.
+    """
+    parser = AnnouncingParser()
+    parser.feed(data)
+
+if __name__ == '__main__':
+    diagnose(sys.stdin.read())
author	Leonard Richardson <leonardr@segfault.org>	2013-05-06 13:55:56 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2013-05-06 13:55:56 -0400
commit	6b77b294760641ebd3582e8124cec119ce874649 (patch)
tree	87204be4308fde67c58f6cd3e35c25adb478dde1 /bs4/diagnose.py
parent	e54dfca6930f367d0092034ead1a7d7e0228a4a7 (diff)
download	beautifulsoup4-6b77b294760641ebd3582e8124cec119ce874649.tar.gz