diff options
| author | Jeff Dairiki <dairiki@dairiki.org> | 2012-04-01 09:35:46 -0700 |
|---|---|---|
| committer | Jeff Dairiki <dairiki@dairiki.org> | 2012-04-01 09:35:46 -0700 |
| commit | 820b014ffff71b130bc490a9a72d1e9c944c270d (patch) | |
| tree | 94bdf44e9fdcec9c2e53e00653c1875573476347 | |
| parent | 3a872ae9b45c39eb66720b28b8609638938b83f8 (diff) | |
| download | python-lxml-820b014ffff71b130bc490a9a72d1e9c944c270d.tar.gz | |
Fixes so that unit tests run under Python 3.1.
Note, however, that while there is a Python 3 version of html5lib,
it appears to be unmaintained, so the worth of all this is
questionable.
References:
http://code.google.com/p/html5lib/issues/detail?id=144
http://code.google.com/p/html5lib/source/browse/#hg%2Fpython3
--HG--
extra : rebase_source : a4ce702ad841c25d63f4a6a56ea106bcd986bd47
| -rw-r--r-- | src/lxml/html/html5parser.py | 26 |
1 file changed, 18 insertions, 8 deletions
```diff
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 00c2437a..5489d8e9 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -2,8 +2,6 @@
 An interface to html5lib that mimics the lxml.html interface.
 """
 
-import urllib
-
 from html5lib import HTMLParser as _HTMLParser
 from html5lib.treebuilders.etree_lxml import TreeBuilder
 
@@ -15,7 +13,14 @@ try:
     _strings = basestring
 except NameError:
     _strings = (bytes, str)
-
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
 
 class HTMLParser(_HTMLParser):
     """An html5lib HTML parser with lxml as tree."""
@@ -104,11 +109,11 @@ def fragment_fromstring(html, create_parent=False,
         no_leading_text=not accept_leading_text)
 
     if create_parent:
-        if not isinstance(create_parent, basestring):
+        if not isinstance(create_parent, _strings):
             create_parent = 'div'
         new_root = Element(create_parent)
         if elements:
-            if isinstance(elements[0], basestring):
+            if isinstance(elements[0], _strings):
                 new_root.text = elements[0]
                 del elements[0]
         new_root.extend(elements)
@@ -174,11 +179,16 @@ def parse(filename_url_or_file, guess_charset=True, parser=None):
     """
     if parser is None:
         parser = html_parser
-    if isinstance(filename_url_or_file, basestring):
-        fp = urllib.urlopen(filename_url_or_file)
-    else:
+    if not isinstance(filename_url_or_file, _strings):
         fp = filename_url_or_file
+    elif _looks_like_url(filename_url_or_file):
+        fp = urlopen(filename_url_or_file)
+    else:
+        fp = open(filename_url_or_file, 'rb')
     return parser.parse(fp, useChardet=guess_charset)
 
+def _looks_like_url(str):
+    scheme = urlparse(str)[0]
+    return scheme != ''
 
 html_parser = HTMLParser()
```
