summaryrefslogtreecommitdiff
path: root/Source/WebCore/html/parser/HTMLTokenizer.h
diff options
context:
space:
mode:
Diffstat (limited to 'Source/WebCore/html/parser/HTMLTokenizer.h')
-rw-r--r--Source/WebCore/html/parser/HTMLTokenizer.h339
1 files changed, 213 insertions, 126 deletions
diff --git a/Source/WebCore/html/parser/HTMLTokenizer.h b/Source/WebCore/html/parser/HTMLTokenizer.h
index 38021e87d..a553acbd9 100644
--- a/Source/WebCore/html/parser/HTMLTokenizer.h
+++ b/Source/WebCore/html/parser/HTMLTokenizer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 Apple Inc. All Rights Reserved.
+ * Copyright (C) 2008, 2015 Apple Inc. All Rights Reserved.
* Copyright (C) 2010 Google, Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -24,25 +24,62 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef HTMLTokenizer_h
-#define HTMLTokenizer_h
+#pragma once
#include "HTMLParserOptions.h"
#include "HTMLToken.h"
#include "InputStreamPreprocessor.h"
-#include "SegmentedString.h"
namespace WebCore {
+class SegmentedString;
+
class HTMLTokenizer {
- WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
- WTF_MAKE_FAST_ALLOCATED;
public:
- explicit HTMLTokenizer(const HTMLParserOptions&);
- ~HTMLTokenizer();
+ explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions());
+
+ // If we can't parse a whole token, this returns null.
+ class TokenPtr;
+ TokenPtr nextToken(SegmentedString&);
+
+ // Used by HTMLSourceTracker.
+ void setTokenAttributeBaseOffset(unsigned);
+
+ // Returns a copy of any characters buffered internally by the tokenizer.
+ // The tokenizer buffers characters when searching for the </script> token that terminates a script element.
+ String bufferedCharacters() const;
+ size_t numberOfBufferedCharacters() const;
+
+ // Updates the tokenizer's state according to the given tag name. This is an approximation of how the tree
+ // builder would update the tokenizer's state. This method is useful for approximating HTML tokenization.
+ // To get exactly the correct tokenization, you need the real tree builder.
+ //
+ // The main failures in the approximation are as follows:
+ //
+ // * The first set of character tokens emitted for a <pre> element might contain an extra leading newline.
+ // * The replacement of U+0000 with U+FFFD will not be sensitive to the tree builder's insertion mode.
+ // * CDATA sections in foreign content will be tokenized as bogus comments instead of as character tokens.
+ //
+ // This approximation is also the algorithm called for when parsing an HTML fragment.
+ // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
+ void updateStateFor(const AtomicString& tagName);
+
+ void setForceNullCharacterReplacement(bool);
+
+ bool shouldAllowCDATA() const;
+ void setShouldAllowCDATA(bool);
- void reset();
+ bool isInDataState() const;
+ void setDataState();
+ void setPLAINTEXTState();
+ void setRAWTEXTState();
+ void setRCDATAState();
+ void setScriptDataState();
+
+ bool neverSkipNullCharacters() const;
+
+private:
enum State {
DataState,
CharacterReferenceInDataState,
@@ -88,10 +125,7 @@ public:
AfterAttributeValueQuotedState,
SelfClosingStartTagState,
BogusCommentState,
- // The ContinueBogusCommentState is not in the HTML5 spec, but we use
- // it internally to keep track of whether we've started the bogus
- // comment token yet.
- ContinueBogusCommentState,
+ ContinueBogusCommentState, // Not in the HTML spec, used internally to track whether we started the bogus comment token.
MarkupDeclarationOpenState,
CommentStartState,
CommentStartDashState,
@@ -121,148 +155,201 @@ public:
CDATASectionDoubleRightSquareBracketState,
};
- // This function returns true if it emits a token. Otherwise, callers
- // must provide the same (in progress) token on the next call (unless
- // they call reset() first).
- bool nextToken(SegmentedString&, HTMLToken&);
+ bool processToken(SegmentedString&);
+ bool processEntity(SegmentedString&);
- // Returns a copy of any characters buffered internally by the tokenizer.
- // The tokenizer buffers characters when searching for the </script> token
- // that terminates a script element.
- String bufferedCharacters() const;
+ void parseError();
- size_t numberOfBufferedCharacters() const
- {
- // Notice that we add 2 to the length of the m_temporaryBuffer to
- // account for the "</" characters, which are effecitvely buffered in
- // the tokenizer's state machine.
- return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
- }
+ void bufferASCIICharacter(UChar);
+ void bufferCharacter(UChar);
- // Updates the tokenizer's state according to the given tag name. This is
- // an approximation of how the tree builder would update the tokenizer's
- // state. This method is useful for approximating HTML tokenization. To
- // get exactly the correct tokenization, you need the real tree builder.
- //
- // The main failures in the approximation are as follows:
- //
- // * The first set of character tokens emitted for a <pre> element might
- // contain an extra leading newline.
- // * The replacement of U+0000 with U+FFFD will not be sensitive to the
- // tree builder's insertion mode.
- // * CDATA sections in foreign content will be tokenized as bogus comments
- // instead of as character tokens.
- //
- void updateStateFor(const AtomicString& tagName);
+ bool emitAndResumeInDataState(SegmentedString&);
+ bool emitAndReconsumeInDataState();
+ bool emitEndOfFile(SegmentedString&);
- bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
- void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
+ // Return true if we will emit a character token before dealing with the buffered end tag.
+ void flushBufferedEndTag();
+ bool commitToPartialEndTag(SegmentedString&, UChar, State);
+ bool commitToCompleteEndTag(SegmentedString&);
- bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
- void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
+ void appendToTemporaryBuffer(UChar);
+ bool temporaryBufferIs(const char*);
- State state() const { return m_state; }
- void setState(State state) { m_state = state; }
+ // Sometimes we speculatively consume input characters and we don't know whether they represent
+ // end tags or RCDATA, etc. These functions help manage this state.
+ bool inEndTagBufferingState() const;
+ void appendToPossibleEndTag(UChar);
+ void saveEndTagNameIfNeeded();
+ bool isAppropriateEndTag() const;
- inline bool shouldSkipNullCharacters() const
- {
- return !m_forceNullCharacterReplacement
- && (m_state == HTMLTokenizer::DataState
- || m_state == HTMLTokenizer::RCDATAState
- || m_state == HTMLTokenizer::RAWTEXTState);
- }
+ bool haveBufferedCharacterToken() const;
+
+ static bool isNullCharacterSkippingState(State);
+
+ State m_state { DataState };
+ bool m_forceNullCharacterReplacement { false };
+ bool m_shouldAllowCDATA { false };
+
+ mutable HTMLToken m_token;
+
+ // https://html.spec.whatwg.org/#additional-allowed-character
+ UChar m_additionalAllowedCharacter { 0 };
+
+ // https://html.spec.whatwg.org/#preprocessing-the-input-stream
+ InputStreamPreprocessor<HTMLTokenizer> m_preprocessor;
+
+ Vector<UChar, 32> m_appropriateEndTagName;
+
+ // https://html.spec.whatwg.org/#temporary-buffer
+ Vector<LChar, 32> m_temporaryBuffer;
+
+ // We occasionally want to emit both a character token and an end tag
+ // token (e.g., when lexing script). We buffer the name of the end tag
+ // token here so we remember it next time we re-enter the tokenizer.
+ Vector<LChar, 32> m_bufferedEndTagName;
+
+ const HTMLParserOptions m_options;
+};
+
+class HTMLTokenizer::TokenPtr {
+public:
+ TokenPtr();
+ ~TokenPtr();
+
+ TokenPtr(TokenPtr&&);
+ TokenPtr& operator=(TokenPtr&&) = delete;
+
+ void clear();
+
+ operator bool() const;
+
+ HTMLToken& operator*() const;
+ HTMLToken* operator->() const;
private:
- inline bool processEntity(SegmentedString&);
+ friend class HTMLTokenizer;
+ explicit TokenPtr(HTMLToken*);
- inline void parseError();
+ HTMLToken* m_token { nullptr };
+};
- inline void bufferCharacter(UChar character)
- {
- ASSERT(character != kEndOfFileMarker);
- m_token->ensureIsCharacterToken();
- m_token->appendToCharacter(character);
- }
+inline HTMLTokenizer::TokenPtr::TokenPtr()
+{
+}
- inline bool emitAndResumeIn(SegmentedString& source, State state)
- {
- saveEndTagNameIfNeeded();
- m_state = state;
- source.advanceAndUpdateLineNumber();
- return true;
- }
-
- inline bool emitAndReconsumeIn(SegmentedString&, State state)
- {
- saveEndTagNameIfNeeded();
- m_state = state;
- return true;
- }
+inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
+ : m_token(token)
+{
+}
- inline bool emitEndOfFile(SegmentedString& source)
- {
- if (haveBufferedCharacterToken())
- return true;
- m_state = HTMLTokenizer::DataState;
- source.advanceAndUpdateLineNumber();
+inline HTMLTokenizer::TokenPtr::~TokenPtr()
+{
+ if (m_token)
m_token->clear();
- m_token->makeEndOfFile();
- return true;
+}
+
+inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
+ : m_token(other.m_token)
+{
+ other.m_token = nullptr;
+}
+
+inline void HTMLTokenizer::TokenPtr::clear()
+{
+ if (m_token) {
+ m_token->clear();
+ m_token = nullptr;
}
+}
- inline bool flushEmitAndResumeIn(SegmentedString&, State);
+inline HTMLTokenizer::TokenPtr::operator bool() const
+{
+ return m_token;
+}
- // Return whether we need to emit a character token before dealing with
- // the buffered end tag.
- inline bool flushBufferedEndTag(SegmentedString&);
- inline bool temporaryBufferIs(const String&);
+inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const
+{
+ ASSERT(m_token);
+ return *m_token;
+}
- // Sometimes we speculatively consume input characters and we don't
- // know whether they represent end tags or RCDATA, etc. These
- // functions help manage these state.
- inline void addToPossibleEndTag(LChar cc);
+inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
+{
+ ASSERT(m_token);
+ return m_token;
+}
- inline void saveEndTagNameIfNeeded()
- {
- ASSERT(m_token->type() != HTMLToken::Uninitialized);
- if (m_token->type() == HTMLToken::StartTag)
- m_appropriateEndTagName = m_token->name();
- }
- inline bool isAppropriateEndTag();
+inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
+{
+ return TokenPtr(processToken(source) ? &m_token : nullptr);
+}
+inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
+{
+ m_token.setAttributeBaseOffset(offset);
+}
- inline bool haveBufferedCharacterToken()
- {
- return m_token->type() == HTMLToken::Character;
- }
+inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
+{
+ // Notice that we add 2 to the length of the m_temporaryBuffer to
+ // account for the "</" characters, which are effectively buffered in
+ // the tokenizer's state machine.
+ return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
+}
- State m_state;
- bool m_forceNullCharacterReplacement;
- bool m_shouldAllowCDATA;
+inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value)
+{
+ m_forceNullCharacterReplacement = value;
+}
- // m_token is owned by the caller. If nextToken is not on the stack,
- // this member might be pointing to unallocated memory.
- HTMLToken* m_token;
+inline bool HTMLTokenizer::shouldAllowCDATA() const
+{
+ return m_shouldAllowCDATA;
+}
- // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
- UChar m_additionalAllowedCharacter;
+inline void HTMLTokenizer::setShouldAllowCDATA(bool value)
+{
+ m_shouldAllowCDATA = value;
+}
- // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
- InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
+inline bool HTMLTokenizer::isInDataState() const
+{
+ return m_state == DataState;
+}
- Vector<UChar, 32> m_appropriateEndTagName;
+inline void HTMLTokenizer::setDataState()
+{
+ m_state = DataState;
+}
- // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
- Vector<LChar, 32> m_temporaryBuffer;
+inline void HTMLTokenizer::setPLAINTEXTState()
+{
+ m_state = PLAINTEXTState;
+}
- // We occationally want to emit both a character token and an end tag
- // token (e.g., when lexing script). We buffer the name of the end tag
- // token here so we remember it next time we re-enter the tokenizer.
- Vector<LChar, 32> m_bufferedEndTagName;
+inline void HTMLTokenizer::setRAWTEXTState()
+{
+ m_state = RAWTEXTState;
+}
- HTMLParserOptions m_options;
-};
+inline void HTMLTokenizer::setRCDATAState()
+{
+ m_state = RCDATAState;
+}
+
+inline void HTMLTokenizer::setScriptDataState()
+{
+ m_state = ScriptDataState;
+}
+
+inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
+{
+ return state == DataState || state == RCDATAState || state == RAWTEXTState;
+}
+inline bool HTMLTokenizer::neverSkipNullCharacters() const
+{
+ return m_forceNullCharacterReplacement;
}
-#endif
+} // namespace WebCore