diff options
author | Ian Lance Taylor <ian@gcc.gnu.org> | 2011-12-13 19:16:27 +0000 |
---|---|---|
committer | Ian Lance Taylor <ian@gcc.gnu.org> | 2011-12-13 19:16:27 +0000 |
commit | 7b1c3dd9e670da2041ff1af415999310f88888ad (patch) | |
tree | c5132538d5da85ed816c7e1f9d93c4a503b838ab /libgo/go/html/parse.go | |
parent | 36cfbee133027429a681ce585643d38228ab1213 (diff) | |
download | gcc-7b1c3dd9e670da2041ff1af415999310f88888ad.tar.gz |
libgo: Update to weekly.2011-12-02.
From-SVN: r182295
Diffstat (limited to 'libgo/go/html/parse.go')
-rw-r--r-- | libgo/go/html/parse.go | 169 |
1 file changed, 144 insertions, 25 deletions
diff --git a/libgo/go/html/parse.go b/libgo/go/html/parse.go index 9b7e934ac34..97fbc514d82 100644 --- a/libgo/go/html/parse.go +++ b/libgo/go/html/parse.go @@ -37,6 +37,11 @@ type parser struct { // fosterParenting is whether new elements should be inserted according to // the foster parenting rules (section 11.2.5.3). fosterParenting bool + // quirks is whether the parser is operating in "quirks mode." + quirks bool + // context is the context element when parsing an HTML fragment + // (section 11.4). + context *Node } func (p *parser) top() *Node { @@ -285,9 +290,10 @@ func (p *parser) setOriginalIM() { func (p *parser) resetInsertionMode() { for i := len(p.oe) - 1; i >= 0; i-- { n := p.oe[i] - if i == 0 { - // TODO: set n to the context element, for HTML fragment parsing. + if i == 0 && p.context != nil { + n = p.context } + switch n.Data { case "select": p.im = inSelectIM @@ -319,9 +325,17 @@ func (p *parser) resetInsertionMode() { p.im = inBodyIM } +const whitespace = " \t\r\n\f" + // Section 11.2.5.4.1. func initialIM(p *parser) bool { switch p.tok.Type { + case TextToken: + p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) + if len(p.tok.Data) == 0 { + // It was all whitespace, so ignore it. + return true + } case CommentToken: p.doc.Add(&Node{ Type: CommentNode, @@ -329,15 +343,13 @@ func initialIM(p *parser) bool { }) return true case DoctypeToken: - p.doc.Add(&Node{ - Type: DoctypeNode, - Data: p.tok.Data, - }) + n, quirks := parseDoctype(p.tok.Data) + p.doc.Add(n) + p.quirks = quirks p.im = beforeHTMLIM return true } - // TODO: set "quirks mode"? It's defined in the DOM spec instead of HTML5 proper, - // and so switching on "quirks mode" might belong in a different package. + p.quirks = true p.im = beforeHTMLIM return false } @@ -345,6 +357,12 @@ func initialIM(p *parser) bool { // Section 11.2.5.4.2. 
func beforeHTMLIM(p *parser) bool { switch p.tok.Type { + case TextToken: + p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) + if len(p.tok.Data) == 0 { + // It was all whitespace, so ignore it. + return true + } case StartTagToken: if p.tok.Data == "html" { p.addElement(p.tok.Data, p.tok.Attr) @@ -383,7 +401,11 @@ func beforeHeadIM(p *parser) bool { case ErrorToken: implied = true case TextToken: - // TODO: distinguish whitespace text from others. + p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) + if len(p.tok.Data) == 0 { + // It was all whitespace, so ignore it. + return true + } implied = true case StartTagToken: switch p.tok.Data { @@ -417,8 +439,6 @@ func beforeHeadIM(p *parser) bool { return !implied } -const whitespace = " \t\r\n\f" - // Section 11.2.5.4.4. func inHeadIM(p *parser) bool { var ( @@ -441,6 +461,8 @@ func inHeadIM(p *parser) bool { implied = true case StartTagToken: switch p.tok.Data { + case "html": + return inBodyIM(p) case "base", "basefont", "bgsound", "command", "link", "meta": p.addElement(p.tok.Data, p.tok.Attr) p.oe.pop() @@ -450,6 +472,9 @@ func inHeadIM(p *parser) bool { p.setOriginalIM() p.im = textIM return true + case "head": + // Ignore the token. + return true default: implied = true } @@ -560,11 +585,30 @@ func copyAttributes(dst *Node, src Token) { func inBodyIM(p *parser) bool { switch p.tok.Type { case TextToken: + switch n := p.oe.top(); n.Data { + case "pre", "listing", "textarea": + if len(n.Child) == 0 { + // Ignore a newline at the start of a <pre> block. 
+ d := p.tok.Data + if d != "" && d[0] == '\r' { + d = d[1:] + } + if d != "" && d[0] == '\n' { + d = d[1:] + } + if d == "" { + return true + } + p.tok.Data = d + } + } p.reconstructActiveFormattingElements() p.addText(p.tok.Data) p.framesetOK = false case StartTagToken: switch p.tok.Data { + case "html": + copyAttributes(p.oe[0], p.tok) case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul": p.popUntil(buttonScopeStopTags, "p") p.addElement(p.tok.Data, p.tok.Attr) @@ -589,6 +633,13 @@ func inBodyIM(p *parser) bool { case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u": p.reconstructActiveFormattingElements() p.addFormattingElement(p.tok.Data, p.tok.Attr) + case "nobr": + p.reconstructActiveFormattingElements() + if p.elementInScope(defaultScopeStopTags, "nobr") { + p.inBodyEndTagFormatting("nobr") + p.reconstructActiveFormattingElements() + } + p.addFormattingElement(p.tok.Data, p.tok.Attr) case "applet", "marquee", "object": p.reconstructActiveFormattingElements() p.addElement(p.tok.Data, p.tok.Attr) @@ -601,7 +652,9 @@ func inBodyIM(p *parser) bool { p.acknowledgeSelfClosingTag() p.framesetOK = false case "table": - p.popUntil(buttonScopeStopTags, "p") // TODO: skip this step in quirks mode. + if !p.quirks { + p.popUntil(buttonScopeStopTags, "p") + } p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false p.im = inTableIM @@ -721,6 +774,11 @@ func inBodyIM(p *parser) bool { p.oe.pop() p.oe.pop() p.form = nil + case "xmp": + p.popUntil(buttonScopeStopTags, "p") + p.reconstructActiveFormattingElements() + p.framesetOK = false + p.addElement(p.tok.Data, p.tok.Attr) case "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr": // Ignore the token. 
default: @@ -1462,18 +1520,7 @@ func afterAfterFramesetIM(p *parser) bool { return true } -// Parse returns the parse tree for the HTML from the given Reader. -// The input is assumed to be UTF-8 encoded. -func Parse(r io.Reader) (*Node, error) { - p := &parser{ - tokenizer: NewTokenizer(r), - doc: &Node{ - Type: DocumentNode, - }, - scripting: true, - framesetOK: true, - im: initialIM, - } +func (p *parser) parse() error { // Iterate until EOF. Any other error will cause an early return. consumed := true for { @@ -1482,7 +1529,7 @@ func Parse(r io.Reader) (*Node, error) { if err == io.EOF { break } - return nil, err + return err } } consumed = p.im(p) @@ -1493,5 +1540,77 @@ func Parse(r io.Reader) (*Node, error) { break } } + return nil +} + +// Parse returns the parse tree for the HTML from the given Reader. +// The input is assumed to be UTF-8 encoded. +func Parse(r io.Reader) (*Node, error) { + p := &parser{ + tokenizer: NewTokenizer(r), + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + framesetOK: true, + im: initialIM, + } + err := p.parse() + if err != nil { + return nil, err + } return p.doc, nil } + +// ParseFragment parses a fragment of HTML and returns the nodes that were +// found. If the fragment is the InnerHTML for an existing element, pass that +// element in context. 
+func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { + p := &parser{ + tokenizer: NewTokenizer(r), + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + context: context, + } + + if context != nil { + switch context.Data { + case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp": + p.tokenizer.rawTag = context.Data + } + } + + root := &Node{ + Type: ElementNode, + Data: "html", + } + p.doc.Add(root) + p.oe = nodeStack{root} + p.resetInsertionMode() + + for n := context; n != nil; n = n.Parent { + if n.Type == ElementNode && n.Data == "form" { + p.form = n + break + } + } + + err := p.parse() + if err != nil { + return nil, err + } + + parent := p.doc + if context != nil { + parent = root + } + + result := parent.Child + parent.Child = nil + for _, n := range result { + n.Parent = nil + } + return result, nil +} |