Diffstat (limited to 'Source/WebCore/html/parser/HTMLTokenizer.h')
-rw-r--r-- | Source/WebCore/html/parser/HTMLTokenizer.h | 339 |
1 file changed, 213 insertions, 126 deletions
diff --git a/Source/WebCore/html/parser/HTMLTokenizer.h b/Source/WebCore/html/parser/HTMLTokenizer.h
index 38021e87d..a553acbd9 100644
--- a/Source/WebCore/html/parser/HTMLTokenizer.h
+++ b/Source/WebCore/html/parser/HTMLTokenizer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Apple Inc. All Rights Reserved.
+ * Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved.
  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -24,25 +24,62 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

-#ifndef HTMLTokenizer_h
-#define HTMLTokenizer_h
+#pragma once

 #include "HTMLParserOptions.h"
 #include "HTMLToken.h"
 #include "InputStreamPreprocessor.h"
-#include "SegmentedString.h"

 namespace WebCore {

+class SegmentedString;
+
 class HTMLTokenizer {
-    WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
-    WTF_MAKE_FAST_ALLOCATED;
 public:
-    explicit HTMLTokenizer(const HTMLParserOptions&);
-    ~HTMLTokenizer();
+    explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions());
+
+    // If we can't parse a whole token, this returns null.
+    class TokenPtr;
+    TokenPtr nextToken(SegmentedString&);
+
+    // Used by HTMLSourceTracker.
+    void setTokenAttributeBaseOffset(unsigned);
+
+    // Returns a copy of any characters buffered internally by the tokenizer.
+    // The tokenizer buffers characters when searching for the </script> token that terminates a script element.
+    String bufferedCharacters() const;
+    size_t numberOfBufferedCharacters() const;
+
+    // Updates the tokenizer's state according to the given tag name. This is an approximation of how the tree
+    // builder would update the tokenizer's state. This method is useful for approximating HTML tokenization.
+    // To get exactly the correct tokenization, you need the real tree builder.
+    //
+    // The main failures in the approximation are as follows:
+    //
+    // * The first set of character tokens emitted for a <pre> element might contain an extra leading newline.
+    // * The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode.
+    // * CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens.
+    //
+    // This approximation is also the algorithm called for when parsing an HTML fragment.
+    // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
+    void updateStateFor(const AtomicString& tagName);
+
+    void setForceNullCharacterReplacement(bool);
+
+    bool shouldAllowCDATA() const;
+    void setShouldAllowCDATA(bool);

-    void reset();
+    bool isInDataState() const;
+    void setDataState();
+    void setPLAINTEXTState();
+    void setRAWTEXTState();
+    void setRCDATAState();
+    void setScriptDataState();
+
+    bool neverSkipNullCharacters() const;
+
+private:

     enum State {
         DataState,
         CharacterReferenceInDataState,
@@ -88,10 +125,7 @@ public:
         AfterAttributeValueQuotedState,
         SelfClosingStartTagState,
         BogusCommentState,
-        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
-        // it internally to keep track of whether we've started the bogus
-        // comment token yet.
-        ContinueBogusCommentState,
+        ContinueBogusCommentState, // Not in the HTML spec, used internally to track whether we started the bogus comment token.
         MarkupDeclarationOpenState,
         CommentStartState,
         CommentStartDashState,
@@ -121,148 +155,201 @@
         CDATASectionDoubleRightSquareBracketState,
     };

-    // This function returns true if it emits a token. Otherwise, callers
-    // must provide the same (in progress) token on the next call (unless
-    // they call reset() first).
-    bool nextToken(SegmentedString&, HTMLToken&);
+    bool processToken(SegmentedString&);
+    bool processEntity(SegmentedString&);

-    // Returns a copy of any characters buffered internally by the tokenizer.
-    // The tokenizer buffers characters when searching for the </script> token
-    // that terminates a script element.
-    String bufferedCharacters() const;
+    void parseError();

-    size_t numberOfBufferedCharacters() const
-    {
-        // Notice that we add 2 to the length of the m_temporaryBuffer to
-        // account for the "</" characters, which are effecitvely buffered in
-        // the tokenizer's state machine.
-        return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
-    }
+    void bufferASCIICharacter(UChar);
+    void bufferCharacter(UChar);

-    // Updates the tokenizer's state according to the given tag name. This is
-    // an approximation of how the tree builder would update the tokenizer's
-    // state. This method is useful for approximating HTML tokenization. To
-    // get exactly the correct tokenization, you need the real tree builder.
-    //
-    // The main failures in the approximation are as follows:
-    //
-    // * The first set of character tokens emitted for a <pre> element might
-    //   contain an extra leading newline.
-    // * The replacement of U+0000 with U+FFFD will not be sensitive to the
-    //   tree builder's insertion mode.
-    // * CDATA sections in foreign content will be tokenized as bogus comments
-    //   instead of as character tokens.
-    //
-    void updateStateFor(const AtomicString& tagName);
+    bool emitAndResumeInDataState(SegmentedString&);
+    bool emitAndReconsumeInDataState();
+    bool emitEndOfFile(SegmentedString&);

-    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
-    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
+    // Returns true if we will emit a character token before dealing with the buffered end tag.
+    void flushBufferedEndTag();
+    bool commitToPartialEndTag(SegmentedString&, UChar, State);
+    bool commitToCompleteEndTag(SegmentedString&);

-    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
-    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
+    void appendToTemporaryBuffer(UChar);
+    bool temporaryBufferIs(const char*);

-    State state() const { return m_state; }
-    void setState(State state) { m_state = state; }
+    // Sometimes we speculatively consume input characters and we don't know whether they represent
+    // end tags or RCDATA, etc. These functions help manage this state.
+    bool inEndTagBufferingState() const;
+    void appendToPossibleEndTag(UChar);
+    void saveEndTagNameIfNeeded();
+    bool isAppropriateEndTag() const;

-    inline bool shouldSkipNullCharacters() const
-    {
-        return !m_forceNullCharacterReplacement
-            && (m_state == HTMLTokenizer::DataState
-                || m_state == HTMLTokenizer::RCDATAState
-                || m_state == HTMLTokenizer::RAWTEXTState);
-    }
+    bool haveBufferedCharacterToken() const;
+
+    static bool isNullCharacterSkippingState(State);
+
+    State m_state { DataState };
+    bool m_forceNullCharacterReplacement { false };
+    bool m_shouldAllowCDATA { false };
+
+    mutable HTMLToken m_token;
+
+    // https://html.spec.whatwg.org/#additional-allowed-character
+    UChar m_additionalAllowedCharacter { 0 };
+
+    // https://html.spec.whatwg.org/#preprocessing-the-input-stream
+    InputStreamPreprocessor<HTMLTokenizer> m_preprocessor;
+
+    Vector<UChar, 32> m_appropriateEndTagName;
+
+    // https://html.spec.whatwg.org/#temporary-buffer
+    Vector<LChar, 32> m_temporaryBuffer;
+
+    // We occasionally want to emit both a character token and an end tag
+    // token (e.g., when lexing script). We buffer the name of the end tag
+    // token here so we remember it next time we re-enter the tokenizer.
+    Vector<LChar, 32> m_bufferedEndTagName;
+
+    const HTMLParserOptions m_options;
+};
+
+class HTMLTokenizer::TokenPtr {
+public:
+    TokenPtr();
+    ~TokenPtr();
+
+    TokenPtr(TokenPtr&&);
+    TokenPtr& operator=(TokenPtr&&) = delete;
+
+    void clear();
+
+    operator bool() const;
+
+    HTMLToken& operator*() const;
+    HTMLToken* operator->() const;

 private:
-    inline bool processEntity(SegmentedString&);
+    friend class HTMLTokenizer;
+    explicit TokenPtr(HTMLToken*);

-    inline void parseError();
+    HTMLToken* m_token { nullptr };
+};

-    inline void bufferCharacter(UChar character)
-    {
-        ASSERT(character != kEndOfFileMarker);
-        m_token->ensureIsCharacterToken();
-        m_token->appendToCharacter(character);
-    }
+inline HTMLTokenizer::TokenPtr::TokenPtr()
+{
+}

-    inline bool emitAndResumeIn(SegmentedString& source, State state)
-    {
-        saveEndTagNameIfNeeded();
-        m_state = state;
-        source.advanceAndUpdateLineNumber();
-        return true;
-    }
-
-    inline bool emitAndReconsumeIn(SegmentedString&, State state)
-    {
-        saveEndTagNameIfNeeded();
-        m_state = state;
-        return true;
-    }
+inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
+    : m_token(token)
+{
+}

-    inline bool emitEndOfFile(SegmentedString& source)
-    {
-        if (haveBufferedCharacterToken())
-            return true;
-        m_state = HTMLTokenizer::DataState;
-        source.advanceAndUpdateLineNumber();
+inline HTMLTokenizer::TokenPtr::~TokenPtr()
+{
+    if (m_token)
         m_token->clear();
-        m_token->makeEndOfFile();
-        return true;
+}
+
+inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
+    : m_token(other.m_token)
+{
+    other.m_token = nullptr;
+}
+
+inline void HTMLTokenizer::TokenPtr::clear()
+{
+    if (m_token) {
+        m_token->clear();
+        m_token = nullptr;
     }
+}

-    inline bool flushEmitAndResumeIn(SegmentedString&, State);
+inline HTMLTokenizer::TokenPtr::operator bool() const
+{
+    return m_token;
+}

-    // Return whether we need to emit a character token before dealing with
-    // the buffered end tag.
-    inline bool flushBufferedEndTag(SegmentedString&);
-    inline bool temporaryBufferIs(const String&);
+inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const
+{
+    ASSERT(m_token);
+    return *m_token;
+}

-    // Sometimes we speculatively consume input characters and we don't
-    // know whether they represent end tags or RCDATA, etc. These
-    // functions help manage these state.
-    inline void addToPossibleEndTag(LChar cc);
+inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
+{
+    ASSERT(m_token);
+    return m_token;
+}

-    inline void saveEndTagNameIfNeeded()
-    {
-        ASSERT(m_token->type() != HTMLToken::Uninitialized);
-        if (m_token->type() == HTMLToken::StartTag)
-            m_appropriateEndTagName = m_token->name();
-    }
-    inline bool isAppropriateEndTag();
+inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
+{
+    return TokenPtr(processToken(source) ? &m_token : nullptr);
+}

+inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
+{
+    m_token.setAttributeBaseOffset(offset);
+}

-    inline bool haveBufferedCharacterToken()
-    {
-        return m_token->type() == HTMLToken::Character;
-    }
+inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
+{
+    // Notice that we add 2 to the length of the m_temporaryBuffer to
+    // account for the "</" characters, which are effectively buffered in
+    // the tokenizer's state machine.
+    return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
+}

-    State m_state;
-    bool m_forceNullCharacterReplacement;
-    bool m_shouldAllowCDATA;
+inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value)
+{
+    m_forceNullCharacterReplacement = value;
+}

-    // m_token is owned by the caller. If nextToken is not on the stack,
-    // this member might be pointing to unallocated memory.
-    HTMLToken* m_token;
+inline bool HTMLTokenizer::shouldAllowCDATA() const
+{
+    return m_shouldAllowCDATA;
+}

-    // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
-    UChar m_additionalAllowedCharacter;
+inline void HTMLTokenizer::setShouldAllowCDATA(bool value)
+{
+    m_shouldAllowCDATA = value;
+}

-    // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
-    InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
+inline bool HTMLTokenizer::isInDataState() const
+{
+    return m_state == DataState;
+}

-    Vector<UChar, 32> m_appropriateEndTagName;
+inline void HTMLTokenizer::setDataState()
+{
+    m_state = DataState;
+}

-    // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
-    Vector<LChar, 32> m_temporaryBuffer;
+inline void HTMLTokenizer::setPLAINTEXTState()
+{
+    m_state = PLAINTEXTState;
+}

-    // We occationally want to emit both a character token and an end tag
-    // token (e.g., when lexing script). We buffer the name of the end tag
-    // token here so we remember it next time we re-enter the tokenizer.
-    Vector<LChar, 32> m_bufferedEndTagName;
+inline void HTMLTokenizer::setRAWTEXTState()
+{
+    m_state = RAWTEXTState;
+}

-    HTMLParserOptions m_options;
-};
+inline void HTMLTokenizer::setRCDATAState()
+{
+    m_state = RCDATAState;
+}
+
+inline void HTMLTokenizer::setScriptDataState()
+{
+    m_state = ScriptDataState;
+}
+
+inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
+{
+    return state == DataState || state == RCDATAState || state == RAWTEXTState;
+}

+inline bool HTMLTokenizer::neverSkipNullCharacters() const
+{
+    return m_forceNullCharacterReplacement;
 }

-#endif
+} // namespace WebCore
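
As an illustration of how the reworked interface is meant to be driven, here is a minimal caller sketch. It is hypothetical and not part of this patch: pumpTokenizer is invented for illustration, and it assumes HTMLToken exposes a type() accessor and an EndOfFile token type, consistent with the HTMLToken::StartTag and HTMLToken::Character usage visible in the removed code.

#include "HTMLTokenizer.h"
#include "SegmentedString.h"

namespace WebCore {

// Hypothetical driver loop, for illustration only.
static void pumpTokenizer(HTMLTokenizer& tokenizer, SegmentedString& input)
{
    // nextToken() returns a null TokenPtr when it cannot assemble a complete
    // token from the input consumed so far (for example, input ending mid-tag).
    while (auto token = tokenizer.nextToken(input)) {
        // Assumed token type, mirroring the HTMLToken::StartTag and
        // HTMLToken::Character names used elsewhere in this header.
        if (token->type() == HTMLToken::EndOfFile)
            break;
        // Hand *token to the tree builder here; the tree builder can then call
        // updateStateFor() or the set*State() functions to steer tokenization.
        // When `token` goes out of scope, ~TokenPtr() clears the tokenizer's
        // internal HTMLToken so the next nextToken() call starts fresh.
    }
}

} // namespace WebCore

The design point worth noting is that TokenPtr is not an owning smart pointer: the storage is the tokenizer's mutable m_token member, and TokenPtr only scopes access to it, clearing the token on destruction so the single buffer can be reused for the next token.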