From 1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Tue, 27 Jun 2017 06:07:23 +0000 Subject: webkitgtk-2.16.5 --- Source/WebCore/html/parser/HTMLTokenizer.h | 339 ++++++++++++++++++----------- 1 file changed, 213 insertions(+), 126 deletions(-) (limited to 'Source/WebCore/html/parser/HTMLTokenizer.h') diff --git a/Source/WebCore/html/parser/HTMLTokenizer.h b/Source/WebCore/html/parser/HTMLTokenizer.h index 38021e87d..a553acbd9 100644 --- a/Source/WebCore/html/parser/HTMLTokenizer.h +++ b/Source/WebCore/html/parser/HTMLTokenizer.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Apple Inc. All Rights Reserved. + * Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved. * Copyright (C) 2010 Google, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without @@ -24,25 +24,62 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef HTMLTokenizer_h -#define HTMLTokenizer_h +#pragma once #include "HTMLParserOptions.h" #include "HTMLToken.h" #include "InputStreamPreprocessor.h" -#include "SegmentedString.h" namespace WebCore { +class SegmentedString; + class HTMLTokenizer { - WTF_MAKE_NONCOPYABLE(HTMLTokenizer); - WTF_MAKE_FAST_ALLOCATED; public: - explicit HTMLTokenizer(const HTMLParserOptions&); - ~HTMLTokenizer(); + explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions()); + + // If we can't parse a whole token, this returns null. + class TokenPtr; + TokenPtr nextToken(SegmentedString&); + + // Used by HTMLSourceTracker. + void setTokenAttributeBaseOffset(unsigned); + + // Returns a copy of any characters buffered internally by the tokenizer. + // The tokenizer buffers characters when searching for the token that terminates a script element. + String bufferedCharacters() const; + size_t numberOfBufferedCharacters() const; + + // Updates the tokenizer's state according to the given tag name. 
This is an approximation of how the tree + // builder would update the tokenizer's state. This method is useful for approximating HTML tokenization. + // To get exactly the correct tokenization, you need the real tree builder. + // + // The main failures in the approximation are as follows: + // + // * The first set of character tokens emitted for a
<pre> element might contain an extra leading newline.
+    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode.
+    //  * CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens.
+    //
+    // This approximation is also the algorithm called for when parsing an HTML fragment.
+    // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
+    void updateStateFor(const AtomicString& tagName);
+
+    void setForceNullCharacterReplacement(bool);
+
+    bool shouldAllowCDATA() const;
+    void setShouldAllowCDATA(bool);
 
-    void reset();
+    bool isInDataState() const;
 
+    void setDataState();
+    void setPLAINTEXTState();
+    void setRAWTEXTState();
+    void setRCDATAState();
+    void setScriptDataState();
+
+    bool neverSkipNullCharacters() const;
+
+private:
     enum State {
         DataState,
         CharacterReferenceInDataState,
@@ -88,10 +125,7 @@ public:
         AfterAttributeValueQuotedState,
         SelfClosingStartTagState,
         BogusCommentState,
-        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
-        // it internally to keep track of whether we've started the bogus
-        // comment token yet.
-        ContinueBogusCommentState,
+        ContinueBogusCommentState, // Not in the HTML spec, used internally to track whether we started the bogus comment token.
         MarkupDeclarationOpenState,
         CommentStartState,
         CommentStartDashState,
@@ -121,148 +155,201 @@ public:
         CDATASectionDoubleRightSquareBracketState,
     };
 
-    // This function returns true if it emits a token. Otherwise, callers
-    // must provide the same (in progress) token on the next call (unless
-    // they call reset() first).
-    bool nextToken(SegmentedString&, HTMLToken&);
+    bool processToken(SegmentedString&);
+    bool processEntity(SegmentedString&);
 
-    // Returns a copy of any characters buffered internally by the tokenizer.
-    // The tokenizer buffers characters when searching for the </script> token
-    // that terminates a script element.
-    String bufferedCharacters() const;
+    void parseError();
 
-    size_t numberOfBufferedCharacters() const
-    {
-        // Notice that we add 2 to the length of the m_temporaryBuffer to
-        // account for the " element might
-    //    contain an extra leading newline.
-    //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
-    //    tree builder's insertion mode.
-    //  * CDATA sections in foreign content will be tokenized as bogus comments
-    //    instead of as character tokens.
-    //
-    void updateStateFor(const AtomicString& tagName);
+    bool emitAndResumeInDataState(SegmentedString&);
+    bool emitAndReconsumeInDataState();
+    bool emitEndOfFile(SegmentedString&);
 
-    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
-    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
+    // Return true if we will emit a character token before dealing with the buffered end tag.
+    void flushBufferedEndTag();
+    bool commitToPartialEndTag(SegmentedString&, UChar, State);
+    bool commitToCompleteEndTag(SegmentedString&);
 
-    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
-    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
+    void appendToTemporaryBuffer(UChar);
+    bool temporaryBufferIs(const char*);
 
-    State state() const { return m_state; }
-    void setState(State state) { m_state = state; }
+    // Sometimes we speculatively consume input characters and we don't know whether they represent
+    // end tags or RCDATA, etc. These functions help manage this state.
+    bool inEndTagBufferingState() const;
+    void appendToPossibleEndTag(UChar);
+    void saveEndTagNameIfNeeded();
+    bool isAppropriateEndTag() const;
 
-    inline bool shouldSkipNullCharacters() const
-    {
-        return !m_forceNullCharacterReplacement
-            && (m_state == HTMLTokenizer::DataState
-                || m_state == HTMLTokenizer::RCDATAState
-                || m_state == HTMLTokenizer::RAWTEXTState);
-    }
+    bool haveBufferedCharacterToken() const;
+
+    static bool isNullCharacterSkippingState(State);
+
+    State m_state { DataState };
+    bool m_forceNullCharacterReplacement { false };
+    bool m_shouldAllowCDATA { false };
+
+    mutable HTMLToken m_token;
+
+    // https://html.spec.whatwg.org/#additional-allowed-character
+    UChar m_additionalAllowedCharacter { 0 };
+
+    // https://html.spec.whatwg.org/#preprocessing-the-input-stream
+    InputStreamPreprocessor m_preprocessor;
+
+    Vector m_appropriateEndTagName;
+
+    // https://html.spec.whatwg.org/#temporary-buffer
+    Vector m_temporaryBuffer;
+
+    // We occasionally want to emit both a character token and an end tag
+    // token (e.g., when lexing script). We buffer the name of the end tag
+    // token here so we remember it next time we re-enter the tokenizer.
+    Vector m_bufferedEndTagName;
+
+    const HTMLParserOptions m_options;
+};
+
+class HTMLTokenizer::TokenPtr {
+public:
+    TokenPtr();
+    ~TokenPtr();
+
+    TokenPtr(TokenPtr&&);
+    TokenPtr& operator=(TokenPtr&&) = delete;
+
+    void clear();
+
+    operator bool() const;
+
+    HTMLToken& operator*() const;
+    HTMLToken* operator->() const;
 
 private:
-    inline bool processEntity(SegmentedString&);
+    friend class HTMLTokenizer;
+    explicit TokenPtr(HTMLToken*);
 
-    inline void parseError();
+    HTMLToken* m_token { nullptr };
+};
 
-    inline void bufferCharacter(UChar character)
-    {
-        ASSERT(character != kEndOfFileMarker);
-        m_token->ensureIsCharacterToken();
-        m_token->appendToCharacter(character);
-    }
+inline HTMLTokenizer::TokenPtr::TokenPtr()
+{
+}
 
-    inline bool emitAndResumeIn(SegmentedString& source, State state)
-    {
-        saveEndTagNameIfNeeded();
-        m_state = state;
-        source.advanceAndUpdateLineNumber();
-        return true;
-    }
-    
-    inline bool emitAndReconsumeIn(SegmentedString&, State state)
-    {
-        saveEndTagNameIfNeeded();
-        m_state = state;
-        return true;
-    }
+inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
+    : m_token(token)
+{
+}
 
-    inline bool emitEndOfFile(SegmentedString& source)
-    {
-        if (haveBufferedCharacterToken())
-            return true;
-        m_state = HTMLTokenizer::DataState;
-        source.advanceAndUpdateLineNumber();
+inline HTMLTokenizer::TokenPtr::~TokenPtr()
+{
+    if (m_token)
         m_token->clear();
-        m_token->makeEndOfFile();
-        return true;
+}
+
+inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
+    : m_token(other.m_token)
+{
+    other.m_token = nullptr;
+}
+
+inline void HTMLTokenizer::TokenPtr::clear()
+{
+    if (m_token) {
+        m_token->clear();
+        m_token = nullptr;
     }
+}
 
-    inline bool flushEmitAndResumeIn(SegmentedString&, State);
+inline HTMLTokenizer::TokenPtr::operator bool() const
+{
+    return m_token;
+}
 
-    // Return whether we need to emit a character token before dealing with
-    // the buffered end tag.
-    inline bool flushBufferedEndTag(SegmentedString&);
-    inline bool temporaryBufferIs(const String&);
+inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const
+{
+    ASSERT(m_token);
+    return *m_token;
+}
 
-    // Sometimes we speculatively consume input characters and we don't
-    // know whether they represent end tags or RCDATA, etc. These
-    // functions help manage these state.
-    inline void addToPossibleEndTag(LChar cc);
+inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
+{
+    ASSERT(m_token);
+    return m_token;
+}
 
-    inline void saveEndTagNameIfNeeded()
-    {
-        ASSERT(m_token->type() != HTMLToken::Uninitialized);
-        if (m_token->type() == HTMLToken::StartTag)
-            m_appropriateEndTagName = m_token->name();
-    }
-    inline bool isAppropriateEndTag();
+inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
+{
+    return TokenPtr(processToken(source) ? &m_token : nullptr);
+}
 
+inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
+{
+    m_token.setAttributeBaseOffset(offset);
+}
 
-    inline bool haveBufferedCharacterToken()
-    {
-        return m_token->type() == HTMLToken::Character;
-    }
+inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
+{
+    // Notice that we add 2 to the length of the m_temporaryBuffer to
+    // account for the " m_inputStreamPreprocessor;
+inline bool HTMLTokenizer::isInDataState() const
+{
+    return m_state == DataState;
+}
 
-    Vector m_appropriateEndTagName;
+inline void HTMLTokenizer::setDataState()
+{
+    m_state = DataState;
+}
 
-    // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
-    Vector m_temporaryBuffer;
+inline void HTMLTokenizer::setPLAINTEXTState()
+{
+    m_state = PLAINTEXTState;
+}
 
-    // We occationally want to emit both a character token and an end tag
-    // token (e.g., when lexing script). We buffer the name of the end tag
-    // token here so we remember it next time we re-enter the tokenizer.
-    Vector m_bufferedEndTagName;
+inline void HTMLTokenizer::setRAWTEXTState()
+{
+    m_state = RAWTEXTState;
+}
 
-    HTMLParserOptions m_options;
-};
+inline void HTMLTokenizer::setRCDATAState()
+{
+    m_state = RCDATAState;
+}
+
+inline void HTMLTokenizer::setScriptDataState()
+{
+    m_state = ScriptDataState;
+}
+
+inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
+{
+    return state == DataState || state == RCDATAState || state == RAWTEXTState;
+}
 
+inline bool HTMLTokenizer::neverSkipNullCharacters() const
+{
+    return m_forceNullCharacterReplacement;
 }
 
-#endif
+} // namespace WebCore
-- 
cgit v1.2.1