summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jeff Dairiki <dairiki@dairiki.org> 2012-04-01 09:35:46 -0700
committer: Jeff Dairiki <dairiki@dairiki.org> 2012-04-01 09:35:46 -0700
commit: 820b014ffff71b130bc490a9a72d1e9c944c270d (patch)
tree: 94bdf44e9fdcec9c2e53e00653c1875573476347
parent: 3a872ae9b45c39eb66720b28b8609638938b83f8 (diff)
download: python-lxml-820b014ffff71b130bc490a9a72d1e9c944c270d.tar.gz
Fixes so that the unit tests run under Python 3.1
Note, however, that while there is a Python 3 version of html5lib, it appears to be unmaintained, so the worth of all this is questionable.
References:
http://code.google.com/p/html5lib/issues/detail?id=144
http://code.google.com/p/html5lib/source/browse/#hg%2Fpython3
--HG--
extra : rebase_source : a4ce702ad841c25d63f4a6a56ea106bcd986bd47
-rw-r--r-- src/lxml/html/html5parser.py 26
1 files changed, 18 insertions, 8 deletions
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 00c2437a..5489d8e9 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -2,8 +2,6 @@
An interface to html5lib that mimics the lxml.html interface.
"""
-import urllib
-
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
@@ -15,7 +13,14 @@ try:
_strings = basestring
except NameError:
_strings = (bytes, str)
-
+try:
+ from urllib2 import urlopen
+except ImportError:
+ from urllib.request import urlopen
+try:
+ from urlparse import urlparse
+except ImportError:
+ from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
@@ -104,11 +109,11 @@ def fragment_fromstring(html, create_parent=False,
no_leading_text=not accept_leading_text)
if create_parent:
- if not isinstance(create_parent, basestring):
+ if not isinstance(create_parent, _strings):
create_parent = 'div'
new_root = Element(create_parent)
if elements:
- if isinstance(elements[0], basestring):
+ if isinstance(elements[0], _strings):
new_root.text = elements[0]
del elements[0]
new_root.extend(elements)
@@ -174,11 +179,16 @@ def parse(filename_url_or_file, guess_charset=True, parser=None):
"""
if parser is None:
parser = html_parser
- if isinstance(filename_url_or_file, basestring):
- fp = urllib.urlopen(filename_url_or_file)
- else:
+ if not isinstance(filename_url_or_file, _strings):
fp = filename_url_or_file
+ elif _looks_like_url(filename_url_or_file):
+ fp = urlopen(filename_url_or_file)
+ else:
+ fp = open(filename_url_or_file, 'rb')
return parser.parse(fp, useChardet=guess_charset)
+def _looks_like_url(str):
+ scheme = urlparse(str)[0]
+ return scheme != ''
html_parser = HTMLParser()