diff options
author | Leonard Richardson <leonardr@segfault.org> | 2013-05-06 13:55:56 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2013-05-06 13:55:56 -0400 |
commit | 6b77b294760641ebd3582e8124cec119ce874649 (patch) | |
tree | 87204be4308fde67c58f6cd3e35c25adb478dde1 /bs4/diagnose.py | |
parent | e54dfca6930f367d0092034ead1a7d7e0228a4a7 (diff) | |
download | beautifulsoup4-6b77b294760641ebd3582e8124cec119ce874649.tar.gz |
Added a library full of diagnostics to make tech support easier.
Diffstat (limited to 'bs4/diagnose.py')
-rw-r--r-- | bs4/diagnose.py | 85 |
1 files changed, 85 insertions, 0 deletions
# diff --git a/bs4/diagnose.py b/bs4/diagnose.py -- new file (index 0000000..5192a3f)
"""Diagnostic functions, mainly for use when doing tech support."""
from StringIO import StringIO
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup
from bs4.builder import builder_registry
import traceback
import sys


def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Checks which of the well-known parser names have a registered tree
    builder, prints a hint for each one that does not, then parses
    `data` with every available parser and prints either the prettified
    result or the traceback of the failure.

    :param data: The markup to run through each parser.
    """
    # Build the list of usable parsers instead of removing entries from
    # the list being iterated: removing during iteration makes the loop
    # skip the entry that follows a removed one, so a missing parser
    # could go unreported and still be tried below.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # Also try the builder that matches both the "lxml" and "xml"
        # features; a feature list is passed to BeautifulSoup as-is.
        basic_parsers.append(["lxml", "xml"])

    for parser in basic_parsers:
        print("Trying to parse your data with %s" % parser)
        try:
            soup = BeautifulSoup(data, parser)
            print("Here's what %s did with the document:" % parser)
            print(soup.prettify())
        except Exception:
            # Deliberately broad: this is a diagnostic tool, so any
            # failure is reported with its traceback and the next parser
            # is still tried.
            print("%s could not parse the document." % parser)
            traceback.print_exc()
        print("-" * 80)


def lxml_trace(data, html=True):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: The markup to parse; it is wrapped in a StringIO.
    :param html: Passed through to `etree.iterparse` as its `html`
        argument.
    """
    # Imported here so the rest of the module works without lxml.
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html):
        print("%s, %4s, %s" % (event, element.tag, element.text))


class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def handle_starttag(self, name, attrs):
        # Only the tag name is announced; `attrs` is ignored.
        print("%s START" % name)

    def handle_endtag(self, name):
        print("%s END" % name)

    def handle_data(self, data):
        print("%s DATA" % data)

    def handle_charref(self, name):
        print("%s CHARREF" % name)

    def handle_entityref(self, name):
        print("%s ENTITYREF" % name)

    def handle_comment(self, data):
        print("%s COMMENT" % data)

    def handle_decl(self, data):
        print("%s DECL" % data)

    def unknown_decl(self, data):
        print("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        print("%s PI" % data)


def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: The markup to feed to an `AnnouncingParser`.
    """
    parser = AnnouncingParser()
    parser.feed(data)


if __name__ == '__main__':
    # Run the diagnostic suite on whatever markup arrives on stdin.
    diagnose(sys.stdin.read())