Fixes so that the unit tests run under Python 3.1

Note, however, that while there is a Python 3 version of html5lib, it appears to be unmaintained, so the value of all this work is questionable.

References:
http://code.google.com/p/html5lib/issues/detail?id=144
http://code.google.com/p/html5lib/source/browse/#hg%2Fpython3

--HG--
extra : rebase_source : a4ce702ad841c25d63f4a6a56ea106bcd986bd47
author: Jeff Dairiki <dairiki@dairiki.org> 2012-04-01 09:35:46 -0700
committer: Jeff Dairiki <dairiki@dairiki.org> 2012-04-01 09:35:46 -0700
commit: 820b014ffff71b130bc490a9a72d1e9c944c270d (patch)
tree: 94bdf44e9fdcec9c2e53e00653c1875573476347
parent: 3a872ae9b45c39eb66720b28b8609638938b83f8 (diff)
download: python-lxml-820b014ffff71b130bc490a9a72d1e9c944c270d.tar.gz
1 files changed, 18 insertions, 8 deletions
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 00c2437a..5489d8e9 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -2,8 +2,6 @@
 An interface to html5lib that mimics the lxml.html interface.
 """
 
-import urllib
-
 from html5lib import HTMLParser as _HTMLParser
 from html5lib.treebuilders.etree_lxml import TreeBuilder
 
@@ -15,7 +13,14 @@ try:
     _strings = basestring
 except NameError:
     _strings = (bytes, str)
-
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
 
 class HTMLParser(_HTMLParser):
     """An html5lib HTML parser with lxml as tree."""
@@ -104,11 +109,11 @@ def fragment_fromstring(html, create_parent=False,
         no_leading_text=not accept_leading_text)
 
     if create_parent:
-        if not isinstance(create_parent, basestring):
+        if not isinstance(create_parent, _strings):
             create_parent = 'div'
         new_root = Element(create_parent)
         if elements:
-            if isinstance(elements[0], basestring):
+            if isinstance(elements[0], _strings):
                 new_root.text = elements[0]
                 del elements[0]
             new_root.extend(elements)
@@ -174,11 +179,16 @@ def parse(filename_url_or_file, guess_charset=True, parser=None):
     """
     if parser is None:
         parser = html_parser
-    if isinstance(filename_url_or_file, basestring):
-        fp = urllib.urlopen(filename_url_or_file)
-    else:
+    if not isinstance(filename_url_or_file, _strings):
         fp = filename_url_or_file
+    elif _looks_like_url(filename_url_or_file):
+        fp = urlopen(filename_url_or_file)
+    else:
+        fp = open(filename_url_or_file, 'rb')
     return parser.parse(fp, useChardet=guess_charset)
 
+def _looks_like_url(str):
+    scheme = urlparse(str)[0]
+    return scheme != ''
 
 html_parser = HTMLParser()
author	Jeff Dairiki <dairiki@dairiki.org>	2012-04-01 09:35:46 -0700
committer	Jeff Dairiki <dairiki@dairiki.org>	2012-04-01 09:35:46 -0700
commit	820b014ffff71b130bc490a9a72d1e9c944c270d (patch)
tree	94bdf44e9fdcec9c2e53e00653c1875573476347
parent	3a872ae9b45c39eb66720b28b8609638938b83f8 (diff)
download	python-lxml-820b014ffff71b130bc490a9a72d1e9c944c270d.tar.gz