Merge pull request #119 from seelmann/599318-690319-fromstring-error-if-no-body

Avoid error in lxml.html.fromstring() if content contains no body (bugs ...
author: scoder <stefan_ml@behnel.de> 2013-04-27 08:24:40 -0700
committer: scoder <stefan_ml@behnel.de> 2013-04-27 08:24:40 -0700
commit: a8a5a39a135a2c2c31a764f0f759217a937df3bb (patch)
tree: e1f53ebc63d334442b9729eb854333e30954e381
parent: d96e70fa8fd551bd3a7724ba17cad01dacaa4c4b (diff)
parent: 7b7958e175f0218cea58d4f42644f8ee07437f2e (diff)
download: python-lxml-a8a5a39a135a2c2c31a764f0f759217a937df3bb.tar.gz
2 files changed, 44 insertions, 0 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 23a96c1e..78b0e9f4 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -699,6 +699,8 @@ def fromstring(html, base_url=None, parser=None, **kw):
                 # We don't care about text or tail in a head
                 other_head.drop_tree()
         return doc
+    if body is None:
+        return doc
     if (len(body) == 1 and (not body.text or not body.text.strip())
         and (not body[-1].tail or not body[-1].tail.strip())):
         # The body has just one element, so it was probably a single
diff --git a/src/lxml/html/tests/test_basic.txt b/src/lxml/html/tests/test_basic.txt
index 074dddef..d7066402 100644
--- a/src/lxml/html/tests/test_basic.txt
+++ b/src/lxml/html/tests/test_basic.txt
@@ -118,3 +118,45 @@ least if an encoding is given.
     1
     >>> print(docs[0])
     Test
+
+Bug 599318: Calling fromstring with a frameset fragment should not raise an error;
+the whole document is returned.
+
+    >>> import lxml.html
+    >>> content='''
+    ... <frameset>
+    ...  <frame src="main.php" name="srcpg">
+    ... </frameset>'''
+    >>> etree_document = lxml.html.fromstring(content)
+    >>> print(tostring(etree_document, encoding=unicode))
+    <html><frameset><frame src="main.php" name="srcpg"></frameset></html>
+
+Bug 599318: Calling fromstring with a div fragment should not raise an error;
+only the element is returned.
+
+    >>> import lxml.html
+    >>> content='<div></div>'
+    >>> etree_document = lxml.html.fromstring(content)
+    >>> print(tostring(etree_document, encoding=unicode))
+    <div></div>
+
+Bug 599318: Calling fromstring with a head fragment should not raise an error;
+the whole document is returned.
+
+    >>> import lxml.html
+    >>> content='<head></head>'
+    >>> etree_document = lxml.html.fromstring(content)
+    >>> print(tostring(etree_document, encoding=unicode))
+    <html><head></head></html>
+
+Bug 690319: Leading whitespace before the doctype declaration should not raise an error.
+
+    >>> import lxml.html
+    >>> content='''
+    ...     <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+    ...     <html>
+    ...     </html>'''
+    >>> etree_document = lxml.html.fromstring(content)
+    >>> print(tostring(etree_document, encoding=unicode))
+    <html></html>
+
author	scoder <stefan_ml@behnel.de>	2013-04-27 08:24:40 -0700
committer	scoder <stefan_ml@behnel.de>	2013-04-27 08:24:40 -0700
commit	a8a5a39a135a2c2c31a764f0f759217a937df3bb (patch)
tree	e1f53ebc63d334442b9729eb854333e30954e381
parent	d96e70fa8fd551bd3a7724ba17cad01dacaa4c4b (diff)
parent	7b7958e175f0218cea58d4f42644f8ee07437f2e (diff)
download	python-lxml-a8a5a39a135a2c2c31a764f0f759217a937df3bb.tar.gz