summaryrefslogtreecommitdiff
path: root/libgo/go/html/token_test.go
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/html/token_test.go')
-rw-r--r--libgo/go/html/token_test.go285
1 files changed, 258 insertions, 27 deletions
diff --git a/libgo/go/html/token_test.go b/libgo/go/html/token_test.go
index 0a0beb201b3..310cd97d670 100644
--- a/libgo/go/html/token_test.go
+++ b/libgo/go/html/token_test.go
@@ -21,6 +21,11 @@ type tokenTest struct {
}
var tokenTests = []tokenTest{
+ {
+ "empty",
+ "",
+ "",
+ },
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
{
@@ -41,6 +46,88 @@ var tokenTests = []tokenTest{
"<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>",
},
+ // Angle brackets that aren't a tag.
+ {
+ "not a tag #0",
+ "<",
+ "&lt;",
+ },
+ {
+ "not a tag #1",
+ "</",
+ "&lt;/",
+ },
+ {
+ "not a tag #2",
+ "</>",
+ "",
+ },
+ {
+ "not a tag #3",
+ "a</>b",
+ "a$b",
+ },
+ {
+ "not a tag #4",
+ "</ >",
+ "<!-- -->",
+ },
+ {
+ "not a tag #5",
+ "</.",
+ "<!--.-->",
+ },
+ {
+ "not a tag #6",
+ "</.>",
+ "<!--.-->",
+ },
+ {
+ "not a tag #7",
+ "a < b",
+ "a &lt; b",
+ },
+ {
+ "not a tag #8",
+ "<.>",
+ "&lt;.&gt;",
+ },
+ {
+ "not a tag #9",
+ "a<<<b>>>c",
+ "a&lt;&lt;$<b>$&gt;&gt;c",
+ },
+ {
+ "not a tag #10",
+ "if x<0 and y < 0 then x*y>0",
+ "if x&lt;0 and y &lt; 0 then x*y&gt;0",
+ },
+ // EOF in a tag name.
+ {
+ "tag name eof #0",
+ "<a",
+ "",
+ },
+ {
+ "tag name eof #1",
+ "<a ",
+ "",
+ },
+ {
+ "tag name eof #2",
+ "a<b",
+ "a",
+ },
+ {
+ "tag name eof #3",
+ "<a><b",
+ "<a>",
+ },
+ {
+ "tag name eof #4",
+ `<a x`,
+ `<a x="">`,
+ },
// Some malformed tags that are missing a '>'.
{
"malformed tag #0",
@@ -54,70 +141,198 @@ var tokenTests = []tokenTest{
},
{
"malformed tag #2",
+ `<p id`,
+ `<p id="">`,
+ },
+ {
+ "malformed tag #3",
+ `<p id=`,
+ `<p id="">`,
+ },
+ {
+ "malformed tag #4",
+ `<p id=>`,
+ `<p id="">`,
+ },
+ {
+ "malformed tag #5",
+ `<p id=0`,
+ `<p id="0">`,
+ },
+ {
+ "malformed tag #6",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
- "malformed tag #3",
+ "malformed tag #7",
`<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
{
- "malformed tag #4",
+ "malformed tag #8",
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
+ // Raw text and RCDATA.
+ {
+ "basic raw text",
+ "<script><a></b></script>",
+ "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
+ },
+ {
+ "unfinished script end tag",
+ "<SCRIPT>a</SCR",
+ "<script>$a&lt;/SCR",
+ },
+ {
+ "broken script end tag",
+ "<SCRIPT>a</SCR ipt>",
+ "<script>$a&lt;/SCR ipt&gt;",
+ },
+ {
+ "EOF in script end tag",
+ "<SCRIPT>a</SCRipt",
+ "<script>$a&lt;/SCRipt",
+ },
+ {
+ "scriptx end tag",
+ "<SCRIPT>a</SCRiptx",
+ "<script>$a&lt;/SCRiptx",
+ },
+ {
+ "' ' completes script end tag",
+ "<SCRIPT>a</SCRipt ",
+ "<script>$a$</script>",
+ },
+ {
+ "'>' completes script end tag",
+ "<SCRIPT>a</SCRipt>",
+ "<script>$a$</script>",
+ },
+ {
+ "self-closing script end tag",
+ "<SCRIPT>a</SCRipt/>",
+ "<script>$a$</script>",
+ },
+ {
+ "nested script tag",
+ "<SCRIPT>a</SCRipt<script>",
+ "<script>$a&lt;/SCRipt&lt;script&gt;",
+ },
+ {
+ "script end tag after unfinished",
+ "<SCRIPT>a</SCRipt</script>",
+ "<script>$a&lt;/SCRipt$</script>",
+ },
+ {
+ "script/style mismatched tags",
+ "<script>a</style>",
+ "<script>$a&lt;/style&gt;",
+ },
+ {
+ "style element with entity",
+ "<style>&apos;",
+ "<style>$&amp;apos;",
+ },
+ {
+ "textarea with tag",
+ "<textarea><div></textarea>",
+ "<textarea>$&lt;div&gt;$</textarea>",
+ },
+ {
+ "title with tag and entity",
+ "<title><b>K&amp;R C</b></title>",
+ "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
+ },
+ // DOCTYPE tests.
+ {
+ "Proper DOCTYPE",
+ "<!DOCTYPE html>",
+ "<!DOCTYPE html>",
+ },
+ {
+ "DOCTYPE with no space",
+ "<!doctypehtml>",
+ "<!DOCTYPE html>",
+ },
+ {
+ "DOCTYPE with two spaces",
+ "<!doctype html>",
+ "<!DOCTYPE html>",
+ },
+ {
+ "looks like DOCTYPE but isn't",
+ "<!DOCUMENT html>",
+ "<!--DOCUMENT html-->",
+ },
+ {
+ "DOCTYPE at EOF",
+ "<!DOCtype",
+ "<!DOCTYPE >",
+ },
+ // XML processing instructions.
+ {
+ "XML processing instruction",
+ "<?xml?>",
+ "<!--?xml?-->",
+ },
// Comments.
{
"comment0",
"abc<b><!-- skipme --></b>def",
- "abc$<b>$</b>$def",
+ "abc$<b>$<!-- skipme -->$</b>$def",
},
{
"comment1",
"a<!-->z",
- "a$z",
+ "a$<!---->$z",
},
{
"comment2",
"a<!--->z",
- "a$z",
+ "a$<!---->$z",
},
{
"comment3",
"a<!--x>-->z",
- "a$z",
+ "a$<!--x>-->$z",
},
{
"comment4",
"a<!--x->-->z",
- "a$z",
+ "a$<!--x->-->$z",
},
{
"comment5",
"a<!>z",
- "a$&lt;!&gt;z",
+ "a$<!---->$z",
},
{
"comment6",
"a<!->z",
- "a$&lt;!-&gt;z",
+ "a$<!----->$z",
},
{
"comment7",
"a<!---<>z",
- "a$&lt;!---&lt;&gt;z",
+ "a$<!---<>z-->",
},
{
"comment8",
"a<!--z",
- "a$&lt;!--z",
+ "a$<!--z-->",
+ },
+ {
+ "comment9",
+ "a<!--x--!>z",
+ "a$<!--x-->$z",
},
// An attribute with a backslash.
{
"backslash",
`<p id="a\"b">`,
- `<p id="a&quot;b">`,
+ `<p id="a\" b"="">`,
},
// Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag.
@@ -133,11 +348,14 @@ var tokenTests = []tokenTest{
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
},
- {
- "entity without semicolon",
- `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
- `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
- },
+ /*
+ // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
+ {
+ "entity without semicolon",
+ `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
+ `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
+ },
+ */
{
"entity with digits",
"&frac12;",
@@ -190,21 +408,34 @@ var tokenTests = []tokenTest{
`<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`,
},
+ {
+ "Mixed attributes",
+ `a<P V="0 1" w='2' X=3 y>z`,
+ `a$<p v="0 1" w="2" x="3" y="">$z`,
+ },
+ {
+ "Attributes with a solitary single quote",
+ `<p id=can't><p id=won't>`,
+ `<p id="can&apos;t">$<p id="won&apos;t">`,
+ },
}
func TestTokenizer(t *testing.T) {
loop:
for _, tt := range tokenTests {
- z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
- for i, s := range strings.Split(tt.golden, "$") {
- if z.Next() == ErrorToken {
- t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
- continue loop
- }
- actual := z.Token().String()
- if s != actual {
- t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
- continue loop
+ z := NewTokenizer(strings.NewReader(tt.html))
+ z.ReturnComments = true
+ if tt.golden != "" {
+ for i, s := range strings.Split(tt.golden, "$") {
+ if z.Next() == ErrorToken {
+ t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+ continue loop
+ }
+ actual := z.Token().String()
+ if s != actual {
+ t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+ continue loop
+ }
}
}
z.Next()