summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Stefan Behnel <stefan_ml@behnel.de>, 2013-04-28 16:04:06 +0200
committer: Stefan Behnel <stefan_ml@behnel.de>, 2013-04-28 16:04:06 +0200
commit: aa847aee79d2b8889688ef163e3c36228191ba64 (patch)
tree: d85bd956cb147940b494e6f0bde44af5f6ee1778
parent: d87c6de1b5bc1f57d301680463022a7c35a30f5a (diff)
download: python-lxml-aa847aee79d2b8889688ef163e3c36228191ba64.tar.gz
improve HTML parsing in lxml.html in the face of preceding whitespace
-rw-r--r-- src/lxml/html/__init__.py | 30
1 files changed, 18 insertions, 12 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 78b0e9f4..4e2fbc60 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -557,6 +557,11 @@ class HtmlElementClassLookup(etree.CustomElementClassLookup):
# parsing
################################################################################
+_looks_like_full_html_unicode = re.compile(
+ unicode(r'^\s*<(?:html|!doctype)'), re.I).match
+_looks_like_full_html_bytes = re.compile(
+ r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
+
def document_fromstring(html, parser=None, **kw):
if parser is None:
parser = html_parser
@@ -581,11 +586,12 @@ def fragments_fromstring(html, no_leading_text=False, base_url=None,
if parser is None:
parser = html_parser
# FIXME: check what happens when you give html with a body, head, etc.
- start = html[:20].lstrip().lower()
- if sys.version_info[0] >= 3 and hasattr(start, 'decode'): # Py3 can't mix bytes into startswith()
- start = start.decode('ISO8859-1')
- if not start.startswith('<html') and not start.startswith('<!doctype'):
- html = '<html><body>%s</body></html>' % html
+ if isinstance(html, bytes):
+ if not _looks_like_full_html_bytes(html):
+ html = '<html><body>%s</body></html>'.encode('ascii') % html
+ else:
+ if not _looks_like_full_html_unicode(html):
+ html = '<html><body>%s</body></html>' % html
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
assert _nons(doc.tag) == 'html'
bodies = [e for e in doc if _nons(e.tag) == 'body']
@@ -659,14 +665,14 @@ def fromstring(html, base_url=None, parser=None, **kw):
"""
if parser is None:
parser = html_parser
- start = html[:10].lstrip().lower()
- if sys.version_info[0] >= 3 and hasattr(start, 'decode'): # Py3 can't mix bytes into startswith()
- start = start.decode('ISO8859-1')
- if start.startswith('<html') or start.startswith('<!doctype'):
- # Looks like a full HTML document
- return document_fromstring(html, parser=parser, base_url=base_url, **kw)
- # otherwise, lets parse it out...
+ if isinstance(html, bytes):
+ is_full_html = _looks_like_full_html_bytes(html)
+ else:
+ is_full_html = _looks_like_full_html_unicode(html)
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+ if is_full_html:
+ return doc
+ # otherwise, lets parse it out...
bodies = doc.findall('body')
if not bodies:
bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)