# -*- coding: utf-8 -*- """ HTML parser test cases for etree """ import unittest import tempfile, os, os.path, sys this_dir = os.path.dirname(__file__) if this_dir not in sys.path: sys.path.insert(0, this_dir) # needed for Py3 from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str from common_imports import SillyFileLike, HelperTestCase, write_to_file try: unicode except NameError: unicode = str class HtmlParserTestCase(HelperTestCase): """HTML parser test cases """ etree = etree html_str = _bytes("test

page title

") html_str_pretty = _bytes("""\ test

page title

""") broken_html_str = _bytes("test<body><h1>page title</h3></p></html>") uhtml_str = _str("<html><head><title>test á\uF8D2

page á\uF8D2 title

") def tearDown(self): super(HtmlParserTestCase, self).tearDown() self.etree.set_default_parser() def test_module_HTML(self): element = self.etree.HTML(self.html_str) self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) self.assertEqual(unicode(self.etree.tostring(element, method="html", encoding='UTF8'), 'UTF8'), unicode(self.uhtml_str.encode('UTF8'), 'UTF8')) def test_module_HTML_pretty_print(self): element = self.etree.HTML(self.html_str) self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), self.html_str_pretty) def test_module_parse_html_error(self): parser = self.etree.HTMLParser(recover=False) parse = self.etree.parse f = BytesIO("") self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser) def test_html_element_name_empty(self): parser = self.etree.HTMLParser() Element = parser.makeelement el = Element('name') self.assertRaises(ValueError, Element, '{}') self.assertRaises(ValueError, setattr, el, 'tag', '{}') self.assertRaises(ValueError, Element, '{test}') self.assertRaises(ValueError, setattr, el, 'tag', '{test}') def test_html_element_name_colon(self): parser = self.etree.HTMLParser() Element = parser.makeelement pname = Element('p:name') self.assertEquals(pname.tag, 'p:name') pname = Element('{test}p:name') self.assertEquals(pname.tag, '{test}p:name') pname = Element('name') pname.tag = 'p:name' self.assertEquals(pname.tag, 'p:name') def test_html_element_name_quote(self): parser = self.etree.HTMLParser() Element = parser.makeelement self.assertRaises(ValueError, Element, 'p"name') self.assertRaises(ValueError, Element, "na'me") self.assertRaises(ValueError, Element, '{test}"name') self.assertRaises(ValueError, Element, "{test}name'") el = Element('name') self.assertRaises(ValueError, setattr, el, 'tag', "pname'") self.assertRaises(ValueError, setattr, el, 'tag', '"pname') self.assertEquals(el.tag, "name") def test_html_element_name_space(self): parser = self.etree.HTMLParser() Element = parser.makeelement self.assertRaises(ValueError, Element, ' name ') self.assertRaises(ValueError, Element, 'na me') self.assertRaises(ValueError, Element, '{test} name') el = Element('name') self.assertRaises(ValueError, setattr, el, 'tag', ' name ') self.assertEquals(el.tag, "name") def test_html_subelement_name_empty(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') self.assertRaises(ValueError, SubElement, el, '{}') self.assertRaises(ValueError, SubElement, el, '{test}') def test_html_subelement_name_colon(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') pname = SubElement(el, 'p:name') self.assertEquals(pname.tag, 'p:name') pname = SubElement(el, '{test}p:name') self.assertEquals(pname.tag, '{test}p:name') def test_html_subelement_name_quote(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') self.assertRaises(ValueError, SubElement, el, "name'") self.assertRaises(ValueError, SubElement, el, 'na"me') self.assertRaises(ValueError, SubElement, el, "{test}na'me") self.assertRaises(ValueError, SubElement, el, '{test}"name') def test_html_subelement_name_space(self): parser = self.etree.HTMLParser() Element = parser.makeelement SubElement = self.etree.SubElement el = Element('name') self.assertRaises(ValueError, SubElement, el, ' name ') self.assertRaises(ValueError, SubElement, el, 'na me') self.assertRaises(ValueError, SubElement, el, '{test} name') def test_module_parse_html_norecover(self): parser = self.etree.HTMLParser(recover=False) parse = self.etree.parse f = BytesIO(self.broken_html_str) self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser) def test_parse_encoding_8bit_explicit(self): text = _str('Søk på nettet') html_latin1 = (_str('

%s

') % text).encode('iso-8859-1') tree = self.etree.parse( BytesIO(html_latin1), self.etree.HTMLParser(encoding="iso-8859-1")) p = tree.find("//p") self.assertEquals(p.text, text) def test_parse_encoding_8bit_override(self): text = _str('Søk på nettet') wrong_head = _str(''' ''') html_latin1 = (_str('%s

%s

') % (wrong_head, text) ).encode('iso-8859-1') self.assertRaises(self.etree.ParseError, self.etree.parse, BytesIO(html_latin1)) tree = self.etree.parse( BytesIO(html_latin1), self.etree.HTMLParser(encoding="iso-8859-1")) p = tree.find("//p") self.assertEquals(p.text, text) def test_module_HTML_broken(self): element = self.etree.HTML(self.broken_html_str) self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_cdata(self): # by default, libxml2 generates CDATA nodes for