The rain' expected = [ ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]), ('starttag', 'b', []), ('data', 'The '), ('starttag', 'a', [('href', 'some_url')]), ('data', 'rain'), ('endtag', 'a'), ] self._run_check(html, expected) def test_EOF_in_charref(self): # see #17802 # This test checks that the UnboundLocalError reported in the issue # is not raised, however I'm not sure the returned values are correct. # Maybe HTMLParser should use self.unescape for these data = [ ('a&', [('data', 'a&')]), ('a&b', [('data', 'ab')]), ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]), ('a&b;', [('data', 'a'), ('entityref', 'b')]), ] for html, expected in data: self._run_check(html, expected) def test_broken_comments(self): html = ('' '' '' '' '') expected = [ ('comment', ' not really a comment '), ('comment', ' not a comment either --'), ('comment', ' -- close enough --'), ('comment', ''), ('comment', '<-- this was an empty comment'), ('comment', '!! another bogus comment !!!'), ] self._run_check(html, expected) def test_broken_condcoms(self): # these condcoms are missing the '--' after '' html = ('broken condcom' '' '' 'foo' '') # According to the HTML5 specs sections "8.2.4.44 Bogus comment state" # and "8.2.4.45 Markup declaration open state", comment tokens should # be emitted instead of 'unknown decl', but calling unknown_decl # provides more flexibility. # See also Lib/_markupbase.py:parse_declaration expected = [ ('unknown decl', 'if !(IE)'), ('data', 'broken condcom'), ('unknown decl', 'endif'), ('unknown decl', 'if ! IE'), ('startendtag', 'link', [('href', 'favicon.tiff')]), ('unknown decl', 'endif'), ('unknown decl', 'if !IE 6'), ('startendtag', 'img', [('src', 'firefox.png')]), ('unknown decl', 'endif'), ('unknown decl', 'if !ie 6'), ('starttag', 'b', []), ('data', 'foo'), ('endtag', 'b'), ('unknown decl', 'endif'), ('unknown decl', 'if (!IE)|(lt IE 9)'), ('startendtag', 'img', [('src', 'mammoth.bmp')]), ('unknown decl', 'endif') ] self._run_check(html, expected) def test_convert_charrefs_dropped_text(self): # #23144: make sure that all the events are triggered when # convert_charrefs is True, even if we don't call .close() parser = EventCollector(convert_charrefs=True) # before the fix, bar & baz was missing parser.feed("foo link bar & baz") self.assertEqual( parser.get_events(), [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'), ('endtag', 'a'), ('data', ' bar & baz')] ) class AttributesTestCase(TestCaseBase): def test_attr_syntax(self): output = [ ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) ] self._run_check("""""", output) self._run_check("""""", output) self._run_check("""""", output) self._run_check("""""", output) def test_attr_values(self): self._run_check("""""", [("starttag", "a", [("b", "xxx\n\txxx"), ("c", "yyy\t\nyyy"), ("d", "\txyz\n")])]) self._run_check("""""", [("starttag", "a", [("b", ""), ("c", "")])]) # Regression test for SF patch #669683. self._run_check("", [("starttag", "e", [("a", "rgb(1,2,3)")])]) # Regression test for SF bug #921657. self._run_check( "", [("starttag", "a", [("href", "mailto:xyz@example.com")])]) def test_attr_nonascii(self): # see issue 7311 self._run_check( " $\u4e2d\u6587$ ", [("starttag", "img", [("src", "/foo/bar.png"), ("alt", "\u4e2d\u6587")])]) self._run_check( "", [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), ("href", "\u30c6\u30b9\u30c8.html")])]) self._run_check( '', [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), ("href", "\u30c6\u30b9\u30c8.html")])]) def test_attr_entity_replacement(self): self._run_check( "", [("starttag", "a", [("b", "&><\"'")])]) def test_attr_funky_names(self): self._run_check( "", [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) def test_entityrefs_in_attributes(self): self._run_check( "", [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) def test_attr_funky_names2(self): self._run_check( r"", [("starttag", "a", [("$", None)]), ("starttag", "b", [("$", "%")]), ("starttag", "c", [("\\", "/")])]) def test_entities_in_attribute_value(self): # see #1200313 for entity in ['&', '&', '&', '&']: self._run_check('' % entity, [("starttag", "a", [("href", "&")])]) self._run_check("" % entity, [("starttag", "a", [("href", "&")])]) self._run_check("" % entity, [("starttag", "a", [("href", "&")])]) def test_malformed_attributes(self): # see #13357 html = ( "test - bad1" "test - bad2" "test - bad3" "test - bad4" ) expected = [ ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), ('data', 'test - bad1'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), ('data', 'test - bad2'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), ('data', 'test - bad3'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), ('data', 'test - bad4'), ('endtag', 'a') ] self._run_check(html, expected) def test_malformed_adjacent_attributes(self): # see #12629 self._run_check('', [('starttag', 'x', []), ('startendtag', 'y', [('z', ''), ('o""', None)]), ('endtag', 'x')]) self._run_check('', [('starttag', 'x', []), ('startendtag', 'y', [('z', ''), ('""', None)]), ('endtag', 'x')]) # see #755670 for the following 3 tests def test_adjacent_attributes(self): self._run_check('', [("starttag", "a", [("width", "100%"), ("cellspacing","0")])]) self._run_check('', [("starttag", "a", [("id", "foo"), ("class","bar")])]) def test_missing_attribute_value(self): self._run_check('', [("starttag", "a", [("v", "")])]) def test_javascript_attribute_value(self): self._run_check("", [("starttag", "a", [("href", "javascript:popup('/popup/help.html')")])]) def test_end_tag_in_attribute_value(self): # see #1745761 self._run_check("spam", [("starttag", "a", [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) def test_with_unquoted_attributes(self): # see #12008 html = ("" "" "
" "- software-and-i" "- library
") expected = [ ('starttag', 'html', []), ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), ('starttag', 'table', [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), ('starttag', 'tr', []), ('starttag', 'td', [('align', 'left')]), ('starttag', 'font', [('size', '-1')]), ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), ('endtag', 'span'), ('endtag', 'a'), ('data', '- '), ('starttag', 'a', [('href', '/1/')]), ('starttag', 'span', [('class', 'en')]), ('data', ' library'), ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') ] self._run_check(html, expected) def test_comma_between_attributes(self): # see bpo 41478 # HTMLParser preserves duplicate attributes, leaving the task of # removing duplicate attributes to a conformant html tree builder html = ('
' # between attrs (unquoted) '
' # between attrs (quoted) '
' # after values (unquoted) '
' # after values (quoted) '
' # one comma values (quoted) '
' # before values (unquoted) '
' # before values (quoted) '
' # before names '
' # after names ) expected = [ ('starttag', 'div', [('class', 'bar,baz=asd'),]), ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]), ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]), ('starttag', 'div', [('class', 'bar'), (',', None), ('baz', 'asd'), (',', None)]), ('starttag', 'div', [('class', 'bar'), (',', None)]), ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]), ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]), ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]), ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]), ] self._run_check(html, expected) def test_weird_chars_in_unquoted_attribute_values(self): self._run_check('
', [ ('starttag', 'form', [('action', 'bogus|&#()value')])]) if __name__ == "__main__": unittest.main()