The rain' expected = [ ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]), ('starttag', 'b', []), ('data', 'The '), ('starttag', 'a', [('href', 'some_url')]), ('data', 'rain'), ('endtag', 'a'), ] self._run_check(html, expected) def test_EOF_in_charref(self): # see #17802 # This test checks that the UnboundLocalError reported in the issue # is not raised, however I'm not sure the returned values are correct. # Maybe HTMLParser should use self.unescape for these data = [ ('a&', [('data', 'a&')]), ('a&b', [('data', 'ab')]), ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]), ('a&b;', [('data', 'a'), ('entityref', 'b')]), ] for html, expected in data: self._run_check(html, expected) def test_broken_comments(self): html = ('' '' '' '' '') expected = [ ('comment', ' not really a comment '), ('comment', ' not a comment either --'), ('comment', ' -- close enough --'), ('comment', ''), ('comment', '<-- this was an empty comment'), ('comment', '!! another bogus comment !!!'), ] self._run_check(html, expected) def test_broken_condcoms(self): # these condcoms are missing the '--' after '' html = ('broken condcom' '' '' 'foo' '') # According to the HTML5 specs sections "8.2.4.44 Bogus comment state" # and "8.2.4.45 Markup declaration open state", comment tokens should # be emitted instead of 'unknown decl', but calling unknown_decl # provides more flexibility. # See also Lib/_markupbase.py:parse_declaration expected = [ ('unknown decl', 'if !(IE)'), ('data', 'broken condcom'), ('unknown decl', 'endif'), ('unknown decl', 'if ! IE'), ('startendtag', 'link', [('href', 'favicon.tiff')]), ('unknown decl', 'endif'), ('unknown decl', 'if !IE 6'), ('startendtag', 'img', [('src', 'firefox.png')]), ('unknown decl', 'endif'), ('unknown decl', 'if !ie 6'), ('starttag', 'b', []), ('data', 'foo'), ('endtag', 'b'), ('unknown decl', 'endif'), ('unknown decl', 'if (!IE)|(lt IE 9)'), ('startendtag', 'img', [('src', 'mammoth.bmp')]), ('unknown decl', 'endif') ] self._run_check(html, expected) def test_convert_charrefs_dropped_text(self): # #23144: make sure that all the events are triggered when # convert_charrefs is True, even if we don't call .close() parser = EventCollector(convert_charrefs=True) # before the fix, bar & baz was missing parser.feed("foo link bar & baz") self.assertEqual( parser.get_events(), [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'), ('endtag', 'a'), ('data', ' bar & baz')] ) class AttributesTestCase(TestCaseBase): def test_attr_syntax(self): output = [ ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) ] self._run_check("""""", output) self._run_check("""""", output) self._run_check("""""", output) self._run_check("""""", output) def test_attr_values(self): self._run_check("""""", [("starttag", "a", [("b", "xxx\n\txxx"), ("c", "yyy\t\nyyy"), ("d", "\txyz\n")])]) self._run_check("""""", [("starttag", "a", [("b", ""), ("c", "")])]) # Regression test for SF patch #669683. self._run_check("", [("starttag", "e", [("a", "rgb(1,2,3)")])]) # Regression test for SF bug #921657. self._run_check( "", [("starttag", "a", [("href", "mailto:xyz@example.com")])]) def test_attr_nonascii(self): # see issue 7311 self._run_check( " $\u4e2d\u6587$ ", [("starttag", "img", [("src", "/foo/bar.png"), ("alt", "\u4e2d\u6587")])]) self._run_check( "", [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), ("href", "\u30c6\u30b9\u30c8.html")])]) self._run_check( '', [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), ("href", "\u30c6\u30b9\u30c8.html")])]) def test_attr_entity_replacement(self): self._run_check( "", [("starttag", "a", [("b", "&><\"'")])]) def test_attr_funky_names(self): self._run_check( "", [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) def test_entityrefs_in_attributes(self): self._run_check( "", [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) def test_attr_funky_names2(self): self._run_check( r"", [("starttag", "a", [("$", None)]), ("starttag", "b", [("$", "%")]), ("starttag", "c", [("\\", "/")])]) def test_entities_in_attribute_value(self): # see #1200313 for entity in ['&', '&', '&', '&']: self._run_check('' % entity, [("starttag", "a", [("href", "&")])]) self._run_check("" % entity, [("starttag", "a", [("href", "&")])]) self._run_check("" % entity, [("starttag", "a", [("href", "&")])]) def test_malformed_attributes(self): # see #13357 html = ( "test - bad1" "test - bad2" "test - bad3" "test - bad4" ) expected = [ ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), ('data', 'test - bad1'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), ('data', 'test - bad2'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), ('data', 'test - bad3'), ('endtag', 'a'), ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), ('data', 'test - bad4'), ('endtag', 'a') ] self._run_check(html, expected) def test_malformed_adjacent_attributes(self): # see #12629 self._run_check('', [('starttag', 'x', []), ('startendtag', 'y', [('z', ''), ('o""', None)]), ('endtag', 'x')]) self._run_check('', [('starttag', 'x', []), ('startendtag', 'y', [('z', ''), ('""', None)]), ('endtag', 'x')]) # see #755670 for the following 3 tests def test_adjacent_attributes(self): self._run_check('', [("starttag", "a", [("width", "100%"), ("cellspacing","0")])]) self._run_check('', [("starttag", "a", [("id", "foo"), ("class","bar")])]) def test_missing_attribute_value(self): self._run_check('', [("starttag", "a", [("v", "")])]) def test_javascript_attribute_value(self): self._run_check("", [("starttag", "a", [("href", "javascript:popup('/popup/help.html')")])]) def test_end_tag_in_attribute_value(self): # see #1745761 self._run_check("spam", [("starttag", "a", [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) if __name__ == "__main__": unittest.main()