diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2013-04-28 16:04:06 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2013-04-28 16:04:06 +0200 |
commit | aa847aee79d2b8889688ef163e3c36228191ba64 (patch) | |
tree | d85bd956cb147940b494e6f0bde44af5f6ee1778 | |
parent | d87c6de1b5bc1f57d301680463022a7c35a30f5a (diff) | |
download | python-lxml-aa847aee79d2b8889688ef163e3c36228191ba64.tar.gz |
improve HTML parsing in lxml.html in the face of preceding whitespace
-rw-r--r-- | src/lxml/html/__init__.py | 30 |
1 files changed, 18 insertions, 12 deletions
```diff
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 78b0e9f4..4e2fbc60 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -557,6 +557,11 @@ class HtmlElementClassLookup(etree.CustomElementClassLookup):
 # parsing
 ################################################################################
 
+_looks_like_full_html_unicode = re.compile(
+    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
+_looks_like_full_html_bytes = re.compile(
+    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
+
 def document_fromstring(html, parser=None, **kw):
     if parser is None:
         parser = html_parser
@@ -581,11 +586,12 @@ def fragments_fromstring(html, no_leading_text=False, base_url=None,
     if parser is None:
         parser = html_parser
     # FIXME: check what happens when you give html with a body, head, etc.
-    start = html[:20].lstrip().lower()
-    if sys.version_info[0] >= 3 and hasattr(start, 'decode'): # Py3 can't mix bytes into startswith()
-        start = start.decode('ISO8859-1')
-    if not start.startswith('<html') and not start.startswith('<!doctype'):
-        html = '<html><body>%s</body></html>' % html
+    if isinstance(html, bytes):
+        if not _looks_like_full_html_bytes(html):
+            html = '<html><body>%s</body></html>'.encode('ascii') % html
+    else:
+        if not _looks_like_full_html_unicode(html):
+            html = '<html><body>%s</body></html>' % html
     doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
     assert _nons(doc.tag) == 'html'
     bodies = [e for e in doc if _nons(e.tag) == 'body']
@@ -659,14 +665,14 @@ def fromstring(html, base_url=None, parser=None, **kw):
     """
     if parser is None:
         parser = html_parser
-    start = html[:10].lstrip().lower()
-    if sys.version_info[0] >= 3 and hasattr(start, 'decode'): # Py3 can't mix bytes into startswith()
-        start = start.decode('ISO8859-1')
-    if start.startswith('<html') or start.startswith('<!doctype'):
-        # Looks like a full HTML document
-        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
-    # otherwise, lets parse it out...
+    if isinstance(html, bytes):
+        is_full_html = _looks_like_full_html_bytes(html)
+    else:
+        is_full_html = _looks_like_full_html_unicode(html)
     doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+    if is_full_html:
+        return doc
+    # otherwise, lets parse it out...
     bodies = doc.findall('body')
     if not bodies:
         bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
```