1 files changed, 258 insertions, 27 deletions
diff --git a/libgo/go/html/token_test.go b/libgo/go/html/token_test.go
index 0a0beb201b3..310cd97d670 100644
--- a/libgo/go/html/token_test.go
+++ b/libgo/go/html/token_test.go
@@ -21,6 +21,11 @@ type tokenTest struct {
 }
 
 var tokenTests = []tokenTest{
+	{
+		"empty",
+		"",
+		"",
+	},
 	// A single text node. The tokenizer should not break text nodes on whitespace,
 	// nor should it normalize whitespace within a text node.
 	{
@@ -41,6 +46,88 @@ var tokenTests = []tokenTest{
 		"<a>b<c/>d</e>",
 		"<a>$b$<c/>$d$</e>",
 	},
+	// Angle brackets that aren't a tag.
+	{
+		"not a tag #0",
+		"<",
+		"&lt;",
+	},
+	{
+		"not a tag #1",
+		"</",
+		"&lt;/",
+	},
+	{
+		"not a tag #2",
+		"</>",
+		"",
+	},
+	{
+		"not a tag #3",
+		"a</>b",
+		"a$b",
+	},
+	{
+		"not a tag #4",
+		"</ >",
+		"<!-- -->",
+	},
+	{
+		"not a tag #5",
+		"</.",
+		"<!--.-->",
+	},
+	{
+		"not a tag #6",
+		"</.>",
+		"<!--.-->",
+	},
+	{
+		"not a tag #7",
+		"a < b",
+		"a &lt; b",
+	},
+	{
+		"not a tag #8",
+		"<.>",
+		"&lt;.&gt;",
+	},
+	{
+		"not a tag #9",
+		"a<<<b>>>c",
+		"a&lt;&lt;$<b>$&gt;&gt;c",
+	},
+	{
+		"not a tag #10",
+		"if x<0 and y < 0 then x*y>0",
+		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
+	},
+	// EOF in a tag name.
+	{
+		"tag name eof #0",
+		"<a",
+		"",
+	},
+	{
+		"tag name eof #1",
+		"<a ",
+		"",
+	},
+	{
+		"tag name eof #2",
+		"a<b",
+		"a",
+	},
+	{
+		"tag name eof #3",
+		"<a><b",
+		"<a>",
+	},
+	{
+		"tag name eof #4",
+		`<a x`,
+		`<a x="">`,
+	},
 	// Some malformed tags that are missing a '>'.
 	{
 		"malformed tag #0",
@@ -54,70 +141,198 @@ var tokenTests = []tokenTest{
 	},
 	{
 		"malformed tag #2",
+		`<p id`,
+		`<p id="">`,
+	},
+	{
+		"malformed tag #3",
+		`<p id=`,
+		`<p id="">`,
+	},
+	{
+		"malformed tag #4",
+		`<p id=>`,
+		`<p id="">`,
+	},
+	{
+		"malformed tag #5",
+		`<p id=0`,
+		`<p id="0">`,
+	},
+	{
+		"malformed tag #6",
 		`<p id=0</p>`,
 		`<p id="0&lt;/p">`,
 	},
 	{
-		"malformed tag #3",
+		"malformed tag #7",
 		`<p id="0</p>`,
 		`<p id="0&lt;/p&gt;">`,
 	},
 	{
-		"malformed tag #4",
+		"malformed tag #8",
 		`<p id="0"</p>`,
 		`<p id="0" <="" p="">`,
 	},
+	// Raw text and RCDATA.
+	{
+		"basic raw text",
+		"<script><a></b></script>",
+		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
+	},
+	{
+		"unfinished script end tag",
+		"<SCRIPT>a</SCR",
+		"<script>$a&lt;/SCR",
+	},
+	{
+		"broken script end tag",
+		"<SCRIPT>a</SCR ipt>",
+		"<script>$a&lt;/SCR ipt&gt;",
+	},
+	{
+		"EOF in script end tag",
+		"<SCRIPT>a</SCRipt",
+		"<script>$a&lt;/SCRipt",
+	},
+	{
+		"scriptx end tag",
+		"<SCRIPT>a</SCRiptx",
+		"<script>$a&lt;/SCRiptx",
+	},
+	{
+		"' ' completes script end tag",
+		"<SCRIPT>a</SCRipt ",
+		"<script>$a$</script>",
+	},
+	{
+		"'>' completes script end tag",
+		"<SCRIPT>a</SCRipt>",
+		"<script>$a$</script>",
+	},
+	{
+		"self-closing script end tag",
+		"<SCRIPT>a</SCRipt/>",
+		"<script>$a$</script>",
+	},
+	{
+		"nested script tag",
+		"<SCRIPT>a</SCRipt<script>",
+		"<script>$a&lt;/SCRipt&lt;script&gt;",
+	},
+	{
+		"script end tag after unfinished",
+		"<SCRIPT>a</SCRipt</script>",
+		"<script>$a&lt;/SCRipt$</script>",
+	},
+	{
+		"script/style mismatched tags",
+		"<script>a</style>",
+		"<script>$a&lt;/style&gt;",
+	},
+	{
+		"style element with entity",
+		"<style>&apos;",
+		"<style>$&amp;apos;",
+	},
+	{
+		"textarea with tag",
+		"<textarea><div></textarea>",
+		"<textarea>$&lt;div&gt;$</textarea>",
+	},
+	{
+		"title with tag and entity",
+		"<title><b>K&amp;R C</b></title>",
+		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
+	},
+	// DOCTYPE tests.
+	{
+		"Proper DOCTYPE",
+		"<!DOCTYPE html>",
+		"<!DOCTYPE html>",
+	},
+	{
+		"DOCTYPE with no space",
+		"<!doctypehtml>",
+		"<!DOCTYPE html>",
+	},
+	{
+		"DOCTYPE with two spaces",
+		"<!doctype  html>",
+		"<!DOCTYPE html>",
+	},
+	{
+		"looks like DOCTYPE but isn't",
+		"<!DOCUMENT html>",
+		"<!--DOCUMENT html-->",
+	},
+	{
+		"DOCTYPE at EOF",
+		"<!DOCtype",
+		"<!DOCTYPE >",
+	},
+	// XML processing instructions.
+	{
+		"XML processing instruction",
+		"<?xml?>",
+		"<!--?xml?-->",
+	},
 	// Comments.
 	{
 		"comment0",
 		"abc<b><!-- skipme --></b>def",
-		"abc$<b>$</b>$def",
+		"abc$<b>$<!-- skipme -->$</b>$def",
 	},
 	{
 		"comment1",
 		"a<!-->z",
-		"a$z",
+		"a$<!---->$z",
 	},
 	{
 		"comment2",
 		"a<!--->z",
-		"a$z",
+		"a$<!---->$z",
 	},
 	{
 		"comment3",
 		"a<!--x>-->z",
-		"a$z",
+		"a$<!--x>-->$z",
 	},
 	{
 		"comment4",
 		"a<!--x->-->z",
-		"a$z",
+		"a$<!--x->-->$z",
 	},
 	{
 		"comment5",
 		"a<!>z",
-		"a$&lt;!&gt;z",
+		"a$<!---->$z",
 	},
 	{
 		"comment6",
 		"a<!->z",
-		"a$&lt;!-&gt;z",
+		"a$<!----->$z",
 	},
 	{
 		"comment7",
 		"a<!---<>z",
-		"a$&lt;!---&lt;&gt;z",
+		"a$<!---<>z-->",
 	},
 	{
 		"comment8",
 		"a<!--z",
-		"a$&lt;!--z",
+		"a$<!--z-->",
+	},
+	{
+		"comment9",
+		"a<!--x--!>z",
+		"a$<!--x-->$z",
 	},
 	// An attribute with a backslash.
 	{
 		"backslash",
 		`<p id="a\"b">`,
-		`<p id="a&quot;b">`,
+		`<p id="a\" b"="">`,
 	},
 	// Entities, tag name and attribute key lower-casing, and whitespace
 	// normalization within a tag.
@@ -133,11 +348,14 @@ var tokenTests = []tokenTest{
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
-	{
-		"entity without semicolon",
-		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
-		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
-	},
+	/*
+		// TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
+		{
+			"entity without semicolon",
+			`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
+			`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
+		},
+	*/
 	{
 		"entity with digits",
 		"&frac12;",
@@ -190,21 +408,34 @@ var tokenTests = []tokenTest{
 		`<meta http-equiv="content-type">`,
 		`<meta http-equiv="content-type">`,
 	},
+	{
+		"Mixed attributes",
+		`a<P V="0 1" w='2' X=3 y>z`,
+		`a$<p v="0 1" w="2" x="3" y="">$z`,
+	},
+	{
+		"Attributes with a solitary single quote",
+		`<p id=can't><p id=won't>`,
+		`<p id="can&apos;t">$<p id="won&apos;t">`,
+	},
 }
 
 func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
-		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
-		for i, s := range strings.Split(tt.golden, "$") {
-			if z.Next() == ErrorToken {
-				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
-				continue loop
-			}
-			actual := z.Token().String()
-			if s != actual {
-				t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
-				continue loop
+		z := NewTokenizer(strings.NewReader(tt.html))
+		z.ReturnComments = true
+		if tt.golden != "" {
+			for i, s := range strings.Split(tt.golden, "$") {
+				if z.Next() == ErrorToken {
+					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+					continue loop
+				}
+				actual := z.Token().String()
+				if s != actual {
+					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+					continue loop
+				}
 			}
 		}
 		z.Next()