summaryrefslogtreecommitdiff
path: root/Source/WebCore/css/parser/CSSTokenizer.cpp
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
commit1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch)
tree46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WebCore/css/parser/CSSTokenizer.cpp
parent32761a6cee1d0dee366b885b7b9c777e67885688 (diff)
downloadWebKitGtk-tarball-1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c.tar.gz
Diffstat (limited to 'Source/WebCore/css/parser/CSSTokenizer.cpp')
-rw-r--r--Source/WebCore/css/parser/CSSTokenizer.cpp879
1 files changed, 879 insertions, 0 deletions
diff --git a/Source/WebCore/css/parser/CSSTokenizer.cpp b/Source/WebCore/css/parser/CSSTokenizer.cpp
new file mode 100644
index 000000000..1f1a23e4d
--- /dev/null
+++ b/Source/WebCore/css/parser/CSSTokenizer.cpp
@@ -0,0 +1,879 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Copyright (C) 2016 Apple Inc. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "config.h"
+#include "CSSTokenizer.h"
+
+#include "CSSParserIdioms.h"
+#include "CSSParserObserverWrapper.h"
+#include "CSSParserTokenRange.h"
+#include "CSSTokenizerInputStream.h"
+#include "HTMLParserIdioms.h"
+#include <wtf/text/StringBuilder.h>
+#include <wtf/unicode/CharacterNames.h>
+
+namespace WebCore {
+
+// Tokenizes |string| eagerly, keeping every token except comments and the
+// terminating EOF in m_tokens.
+CSSTokenizer::CSSTokenizer(const String& string)
+    : m_input(string)
+{
+    // The spec asks for an input preprocessing pass here
+    // (http://dev.w3.org/csswg/css-syntax/#input-preprocessing).
+    // It is deliberately omitted because:
+    // * HTML space checks are used, and those already accept \r and \f as white space
+    // * White space is not counted
+    // * CSSTokenizerInputStream::nextInputChar() substitutes replacement characters for NULLs
+
+    if (string.isEmpty())
+        return;
+
+    // Over-reserve rather than resize later: most inputs have about 3.5 to 5
+    // characters per token, so length / 3 errs on the high side.
+    m_tokens.reserveInitialCapacity(string.length() / 3);
+
+    for (;;) {
+        CSSParserToken token = nextToken();
+        const auto tokenType = token.type();
+        if (tokenType == EOFToken)
+            break;
+        if (tokenType != CommentToken)
+            m_tokens.append(token);
+    }
+}
+
+// Tokenizes |string| while reporting source offsets to |wrapper|: each stored
+// token's start offset goes to addToken(), each comment's start/end offsets
+// (plus the current token count) go to addComment().
+CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper)
+    : m_input(string)
+{
+    if (string.isEmpty())
+        return;
+
+    // Input offset at which the token currently being produced begins.
+    unsigned tokenStart = 0;
+    for (;;) {
+        CSSParserToken token = nextToken();
+        const auto tokenType = token.type();
+        if (tokenType == EOFToken)
+            break;
+
+        if (tokenType != CommentToken) {
+            m_tokens.append(token);
+            wrapper.addToken(tokenStart);
+        } else
+            wrapper.addComment(tokenStart, m_input.offset(), m_tokens.size());
+
+        tokenStart = m_input.offset();
+    }
+
+    // One trailing offset is reported after the last token before the
+    // wrapper is finalized.
+    wrapper.addToken(tokenStart);
+    wrapper.finalizeConstruction(m_tokens.begin());
+}
+
+// Returns the stored tokens converted to a CSSParserTokenRange.
+CSSParserTokenRange CSSTokenizer::tokenRange() const
+{
+    CSSParserTokenRange range = m_tokens;
+    return range;
+}
+
+// Returns how many tokens are stored in m_tokens.
+unsigned CSSTokenizer::tokenCount()
+{
+    const unsigned count = m_tokens.size();
+    return count;
+}
+
+// Returns true for \n, \r and \f. The latter two are tested directly because
+// there is no preprocessing stage that would have normalised them.
+static bool isNewLine(UChar cc)
+{
+    switch (cc) {
+    case '\n':
+    case '\r':
+    case '\f':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
+// A backslash starts a valid escape unless a newline immediately follows it.
+static bool twoCharsAreValidEscape(UChar first, UChar second)
+{
+    if (first != '\\')
+        return false;
+    return !isNewLine(second);
+}
+
+// Hands |c| back to the input stream via pushBack().
+void CSSTokenizer::reconsume(UChar c)
+{
+    m_input.pushBack(c);
+}
+
+// Reads the next input character, advances the stream by one position and
+// returns the character that was read.
+UChar CSSTokenizer::consume()
+{
+    const UChar next = m_input.nextInputChar();
+    m_input.advance();
+    return next;
+}
+
+// White space handler: skips the rest of the white space run and reports the
+// whole run as a single WhitespaceToken.
+CSSParserToken CSSTokenizer::whiteSpace(UChar /*cc*/)
+{
+    m_input.advanceUntilNonWhitespace();
+    CSSParserToken token(WhitespaceToken);
+    return token;
+}
+
+// Records |type| on the open-block stack and returns a token of that type
+// flagged as BlockStart.
+CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type)
+{
+    m_blockStack.append(type);
+    CSSParserToken token(type, CSSParserToken::BlockStart);
+    return token;
+}
+
+// Records |blockType| on the open-block stack and returns a BlockStart token
+// of |type| carrying |name|.
+CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name)
+{
+    m_blockStack.append(blockType);
+    CSSParserToken token(type, name, CSSParserToken::BlockStart);
+    return token;
+}
+
+// Returns a token of |type|. It is flagged BlockEnd (and the stack is popped)
+// only when the innermost open block was started by |startType|; otherwise a
+// plain token is returned and the stack is left alone.
+CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType)
+{
+    const bool closesInnermostBlock = !m_blockStack.isEmpty() && m_blockStack.last() == startType;
+    if (!closesInnermostBlock)
+        return CSSParserToken(type);
+
+    m_blockStack.removeLast();
+    return CSSParserToken(type, CSSParserToken::BlockEnd);
+}
+
+// Bracket handlers. Each opening bracket starts a block of its own token
+// type; each closing bracket ends a block only if the matching opening type
+// is innermost (see blockEnd()).
+
+// '(' handler.
+CSSParserToken CSSTokenizer::leftParenthesis(UChar /*cc*/)
+{
+    const CSSParserTokenType opening = LeftParenthesisToken;
+    return blockStart(opening);
+}
+
+// ')' handler.
+CSSParserToken CSSTokenizer::rightParenthesis(UChar /*cc*/)
+{
+    const CSSParserTokenType opening = LeftParenthesisToken;
+    return blockEnd(RightParenthesisToken, opening);
+}
+
+// '[' handler.
+CSSParserToken CSSTokenizer::leftBracket(UChar /*cc*/)
+{
+    const CSSParserTokenType opening = LeftBracketToken;
+    return blockStart(opening);
+}
+
+// ']' handler.
+CSSParserToken CSSTokenizer::rightBracket(UChar /*cc*/)
+{
+    const CSSParserTokenType opening = LeftBracketToken;
+    return blockEnd(RightBracketToken, opening);
+}
+
+// '{' handler.
+CSSParserToken CSSTokenizer::leftBrace(UChar /*cc*/)
+{
+    const CSSParserTokenType opening = LeftBraceToken;
+    return blockStart(opening);
+}
+
+// '}' handler.
+CSSParserToken CSSTokenizer::rightBrace(UChar /*cc*/)
+{
+    const CSSParserTokenType opening = LeftBraceToken;
+    return blockEnd(RightBraceToken, opening);
+}
+
+// '+' / '.' handler: if |cc| plus the following input starts a number, the
+// character is put back and a numeric token is consumed; otherwise |cc| is a
+// delimiter.
+CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc)
+{
+    if (!nextCharsAreNumber(cc))
+        return CSSParserToken(DelimiterToken, cc);
+
+    reconsume(cc);
+    return consumeNumericToken();
+}
+
+// '*' handler: "*=" is a SubstringMatchToken, a lone '*' is a delimiter.
+CSSParserToken CSSTokenizer::asterisk(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '*');
+    if (!consumeIfNext('='))
+        return CSSParserToken(DelimiterToken, '*');
+    return CSSParserToken(SubstringMatchToken);
+}
+
+// '<' handler: "<!--" is a CDOToken, anything else leaves '<' as a delimiter.
+CSSParserToken CSSTokenizer::lessThan(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '<');
+    const bool startsCDO = m_input.peekWithoutReplacement(0) == '!'
+        && m_input.peekWithoutReplacement(1) == '-'
+        && m_input.peekWithoutReplacement(2) == '-';
+    if (!startsCDO)
+        return CSSParserToken(DelimiterToken, '<');
+
+    // Skip the "!--" that follows the already-consumed '<'.
+    m_input.advance(3);
+    return CSSParserToken(CDOToken);
+}
+
+// ',' handler: always a CommaToken.
+CSSParserToken CSSTokenizer::comma(UChar /*cc*/)
+{
+    CSSParserToken token(CommaToken);
+    return token;
+}
+
+// '-' handler. Tried in order: a number ("-1", "-.5"), the CDC marker
+// ("-->"), an identifier ("-foo", "--bar", "-\escape"); failing all of those
+// the '-' is a delimiter.
+CSSParserToken CSSTokenizer::hyphenMinus(UChar cc)
+{
+    if (nextCharsAreNumber(cc)) {
+        reconsume(cc);
+        return consumeNumericToken();
+    }
+
+    const bool startsCDC = m_input.peekWithoutReplacement(0) == '-'
+        && m_input.peekWithoutReplacement(1) == '>';
+    if (startsCDC) {
+        // Skip the "->" that follows the already-consumed '-'.
+        m_input.advance(2);
+        return CSSParserToken(CDCToken);
+    }
+
+    if (!nextCharsAreIdentifier(cc))
+        return CSSParserToken(DelimiterToken, cc);
+
+    reconsume(cc);
+    return consumeIdentLikeToken();
+}
+
+// '/' handler: "/*" opens a comment, which is consumed up to its end and
+// reported as a CommentToken; otherwise '/' is a delimiter.
+CSSParserToken CSSTokenizer::solidus(UChar cc)
+{
+    if (!consumeIfNext('*'))
+        return CSSParserToken(DelimiterToken, cc);
+
+    // Comment tokens are ignored by callers, but something must be returned.
+    consumeUntilCommentEndFound();
+    return CSSParserToken(CommentToken);
+}
+
+// ':' handler: always a ColonToken.
+CSSParserToken CSSTokenizer::colon(UChar /*cc*/)
+{
+    CSSParserToken token(ColonToken);
+    return token;
+}
+
+// ';' handler: always a SemicolonToken.
+CSSParserToken CSSTokenizer::semiColon(UChar /*cc*/)
+{
+    CSSParserToken token(SemicolonToken);
+    return token;
+}
+
+// '#' handler: when a name code point or a valid escape follows, the name is
+// consumed into a hash token (typed HashTokenId if the name would also start
+// an identifier, HashTokenUnrestricted otherwise). Otherwise '#' is a
+// delimiter.
+CSSParserToken CSSTokenizer::hash(UChar cc)
+{
+    const UChar following = m_input.peekWithoutReplacement(0);
+    const bool nameFollows = isNameCodePoint(following)
+        || twoCharsAreValidEscape(following, m_input.peekWithoutReplacement(1));
+    if (!nameFollows)
+        return CSSParserToken(DelimiterToken, cc);
+
+    // The hash type must be decided before the name is consumed.
+    const HashTokenType hashType = nextCharsAreIdentifier() ? HashTokenId : HashTokenUnrestricted;
+    return CSSParserToken(hashType, consumeName());
+}
+
+// '^' handler: "^=" is a PrefixMatchToken, a lone '^' is a delimiter.
+CSSParserToken CSSTokenizer::circumflexAccent(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '^');
+    if (!consumeIfNext('='))
+        return CSSParserToken(DelimiterToken, '^');
+    return CSSParserToken(PrefixMatchToken);
+}
+
+// '$' handler: "$=" is a SuffixMatchToken, a lone '$' is a delimiter.
+CSSParserToken CSSTokenizer::dollarSign(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '$');
+    if (!consumeIfNext('='))
+        return CSSParserToken(DelimiterToken, '$');
+    return CSSParserToken(SuffixMatchToken);
+}
+
+// '|' handler: "|=" is a DashMatchToken, "||" is a ColumnToken, a lone '|'
+// is a delimiter. '=' is tested before '|'.
+CSSParserToken CSSTokenizer::verticalLine(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '|');
+    if (consumeIfNext('='))
+        return CSSParserToken(DashMatchToken);
+    if (!consumeIfNext('|'))
+        return CSSParserToken(DelimiterToken, '|');
+    return CSSParserToken(ColumnToken);
+}
+
+// '~' handler: "~=" is an IncludeMatchToken, a lone '~' is a delimiter.
+CSSParserToken CSSTokenizer::tilde(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '~');
+    if (!consumeIfNext('='))
+        return CSSParserToken(DelimiterToken, '~');
+    return CSSParserToken(IncludeMatchToken);
+}
+
+// '@' handler: when an identifier follows, its name becomes an
+// AtKeywordToken; otherwise '@' is a delimiter.
+CSSParserToken CSSTokenizer::commercialAt(UChar cc)
+{
+    ASSERT_UNUSED(cc, cc == '@');
+    if (!nextCharsAreIdentifier())
+        return CSSParserToken(DelimiterToken, '@');
+    return CSSParserToken(AtKeywordToken, consumeName());
+}
+
+// '\' handler: a backslash that begins a valid escape starts an ident-like
+// token (the backslash is put back first); otherwise it is a delimiter.
+CSSParserToken CSSTokenizer::reverseSolidus(UChar cc)
+{
+    const bool startsEscape = twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0));
+    if (!startsEscape)
+        return CSSParserToken(DelimiterToken, cc);
+
+    reconsume(cc);
+    return consumeIdentLikeToken();
+}
+
+// Digit handler: puts the digit back so that the numeric token consumer sees
+// the whole number from its first character.
+CSSParserToken CSSTokenizer::asciiDigit(UChar cc)
+{
+    reconsume(cc);
+    CSSParserToken numeric = consumeNumericToken();
+    return numeric;
+}
+
+// 'U' / 'u' handler: "u+" followed by a hex digit or '?' starts a unicode
+// range; anything else is treated as the start of an ident-like token.
+CSSParserToken CSSTokenizer::letterU(UChar cc)
+{
+    if (m_input.peekWithoutReplacement(0) == '+') {
+        const UChar afterPlus = m_input.peekWithoutReplacement(1);
+        if (isASCIIHexDigit(afterPlus) || afterPlus == '?') {
+            // Step over the '+'; the range consumer starts at the first digit or '?'.
+            m_input.advance();
+            return consumeUnicodeRange();
+        }
+    }
+
+    reconsume(cc);
+    return consumeIdentLikeToken();
+}
+
+// Name-start handler: puts |cc| back and consumes an ident-like token that
+// begins with it.
+CSSParserToken CSSTokenizer::nameStart(UChar cc)
+{
+    reconsume(cc);
+    CSSParserToken identLike = consumeIdentLikeToken();
+    return identLike;
+}
+
+// Quote handler: |cc| is the opening quote, which is also the code point
+// that terminates the string.
+CSSParserToken CSSTokenizer::stringStart(UChar cc)
+{
+    const UChar closingQuote = cc;
+    return consumeStringTokenUntil(closingQuote);
+}
+
+// Handler for code point 0: reports an EOFToken.
+CSSParserToken CSSTokenizer::endOfFile(UChar /*cc*/)
+{
+    CSSParserToken token(EOFToken);
+    return token;
+}
+
+// Dispatch table indexed by ASCII code point (0-127). nextToken() looks up
+// the entry for the character it just consumed and calls it; a 0 entry means
+// there is no dedicated handler and nextToken() returns a DelimiterToken for
+// that character. The position of every entry is significant.
+const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = {
+ // 0x00
+ &CSSTokenizer::endOfFile,
+ // 0x01 - 0x08
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ // 0x09 (tab), 0x0A (line feed)
+ &CSSTokenizer::whiteSpace,
+ &CSSTokenizer::whiteSpace,
+ // 0x0B
+ 0,
+ // 0x0C (form feed), 0x0D (carriage return)
+ &CSSTokenizer::whiteSpace,
+ &CSSTokenizer::whiteSpace,
+ // 0x0E - 0x1F
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ // 0x20 (space)
+ &CSSTokenizer::whiteSpace,
+ // '!'
+ 0,
+ // '"' '#' '$'
+ &CSSTokenizer::stringStart,
+ &CSSTokenizer::hash,
+ &CSSTokenizer::dollarSign,
+ // '%' '&'
+ 0,
+ 0,
+ // '\'' '(' ')' '*' '+' ',' '-' '.' '/'
+ &CSSTokenizer::stringStart,
+ &CSSTokenizer::leftParenthesis,
+ &CSSTokenizer::rightParenthesis,
+ &CSSTokenizer::asterisk,
+ &CSSTokenizer::plusOrFullStop,
+ &CSSTokenizer::comma,
+ &CSSTokenizer::hyphenMinus,
+ &CSSTokenizer::plusOrFullStop,
+ &CSSTokenizer::solidus,
+ // '0' - '9'
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ &CSSTokenizer::asciiDigit,
+ // ':' ';' '<'
+ &CSSTokenizer::colon,
+ &CSSTokenizer::semiColon,
+ &CSSTokenizer::lessThan,
+ // '=' '>' '?'
+ 0,
+ 0,
+ 0,
+ // '@'
+ &CSSTokenizer::commercialAt,
+ // 'A' - 'T'
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ // 'U' (may start a unicode range)
+ &CSSTokenizer::letterU,
+ // 'V' - 'Z'
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ // '[' '\\' ']' '^' '_'
+ &CSSTokenizer::leftBracket,
+ &CSSTokenizer::reverseSolidus,
+ &CSSTokenizer::rightBracket,
+ &CSSTokenizer::circumflexAccent,
+ &CSSTokenizer::nameStart,
+ // '`'
+ 0,
+ // 'a' - 't'
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ // 'u' (may start a unicode range)
+ &CSSTokenizer::letterU,
+ // 'v' - 'z'
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ &CSSTokenizer::nameStart,
+ // '{' '|' '}' '~'
+ &CSSTokenizer::leftBrace,
+ &CSSTokenizer::verticalLine,
+ &CSSTokenizer::rightBrace,
+ &CSSTokenizer::tilde,
+ // 0x7F (DEL)
+ 0,
+};
+// Size of the table above; only defined when the security assertion in
+// nextToken() that uses it is compiled in.
+#if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED
+const unsigned codePointsNumber = 128;
+#endif
+
+// Consumes one character and produces the token that starts with it.
+CSSParserToken CSSTokenizer::nextToken()
+{
+    // The CSS Syntax spec, unlike the HTMLTokenizer, describes a stateless
+    // tokenizer with fixed-size look-ahead. A stateful model with dedicated
+    // states for every "next 3 codepoints are X" case would be possible, and
+    // state machines make incremental tokenization of partial sources easier
+    // to write, but for now the spec is followed exactly.
+    const UChar cc = consume();
+
+    // Every non-ASCII character is handled as the start of a name.
+    if (!isASCII(cc))
+        return nameStart(cc);
+
+    ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
+    const CodePoint handler = codePoints[cc];
+    if (!handler)
+        return CSSParserToken(DelimiterToken, cc);
+    return (this->*handler)(cc);
+}
+
+// This method merges the following spec sections for efficiency
+// http://www.w3.org/TR/css3-syntax/#consume-a-number
+// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
+//
+// Measures the number at the current input position using look-ahead only,
+// converts that span with m_input.getDouble() and then advances past it.
+// Returns a NumberToken carrying the value, whether it is an integer or a
+// number, and which explicit sign (if any) was written.
+CSSParserToken CSSTokenizer::consumeNumber()
+{
+ ASSERT(nextCharsAreNumber());
+
+ NumericValueType type = IntegerValueType;
+ NumericSign sign = NoSign;
+ // Count of characters, from the current position, that belong to the number.
+ unsigned numberLength = 0;
+
+ // Optional leading sign.
+ UChar next = m_input.peekWithoutReplacement(0);
+ if (next == '+') {
+ ++numberLength;
+ sign = PlusSign;
+ } else if (next == '-') {
+ ++numberLength;
+ sign = MinusSign;
+ }
+
+ // Integer digits.
+ // NOTE(review): skipWhilePredicate<isASCIIDigit>(n) is assumed to return the
+ // offset of the first non-digit at or after offset n - confirm against
+ // CSSTokenizerInputStream.
+ numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength);
+ next = m_input.peekWithoutReplacement(numberLength);
+ // Fraction: only taken when a digit follows the '.'; the + 2 steps over the
+ // '.' and that already-verified digit.
+ if (next == '.' && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 1))) {
+ type = NumberValueType;
+ numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 2);
+ next = m_input.peekWithoutReplacement(numberLength);
+ }
+
+ // Exponent: 'e'/'E' followed by a digit, or by a sign and a digit. If neither
+ // follows, the 'e' is left in the input and is not part of the number.
+ if (next == 'E' || next == 'e') {
+ next = m_input.peekWithoutReplacement(numberLength + 1);
+ if (isASCIIDigit(next)) {
+ type = NumberValueType;
+ // + 1 steps over the 'e'.
+ numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 1);
+ } else if ((next == '+' || next == '-') && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 2))) {
+ type = NumberValueType;
+ // + 3 steps over the 'e', the sign and the already-verified digit.
+ numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 3);
+ }
+ }
+
+ // Convert first, then consume: nothing has been advanced up to this point.
+ double value = m_input.getDouble(0, numberLength);
+ m_input.advance(numberLength);
+
+ return CSSParserToken(NumberToken, value, type, sign);
+}
+
+// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
+// Consumes a number and then, if an identifier follows, turns it into a
+// dimension with that unit; failing that, a following '%' turns it into a
+// percentage.
+CSSParserToken CSSTokenizer::consumeNumericToken()
+{
+    CSSParserToken numeric = consumeNumber();
+    if (nextCharsAreIdentifier()) {
+        numeric.convertToDimensionWithUnit(consumeName());
+        return numeric;
+    }
+
+    if (consumeIfNext('%'))
+        numeric.convertToPercentage();
+    return numeric;
+}
+
+// http://dev.w3.org/csswg/css-syntax/#consume-ident-like-token
+// Consumes a name and classifies it: a plain identifier, a function (name
+// followed by '('), or an unquoted url(...) token.
+CSSParserToken CSSTokenizer::consumeIdentLikeToken()
+{
+    StringView name = consumeName();
+    if (!consumeIfNext('('))
+        return CSSParserToken(IdentToken, name);
+
+    if (equalIgnoringASCIICase(name, "url")) {
+        // The spec differs slightly here so that whitespace tokens are not
+        // dropped, but they would go unused and this is simpler.
+        m_input.advanceUntilNonWhitespace();
+        const UChar afterSpaces = m_input.peekWithoutReplacement(0);
+        const bool argumentIsQuoted = afterSpaces == '"' || afterSpaces == '\'';
+        if (!argumentIsQuoted)
+            return consumeUrlToken();
+    }
+
+    return blockStart(LeftParenthesisToken, FunctionToken, name);
+}
+
+// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
+//
+// Consumes the remainder of a quoted string. stringStart() calls this with
+// the opening quote it was dispatched on, so endingCodePoint is the quote
+// that closes the string. Returns a StringToken, or a BadStringToken when an
+// unescaped newline is reached first.
+CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
+{
+ // Strings without escapes get handled without allocations
+ // This loop only peeks; the input is advanced once an outcome is known.
+ for (unsigned size = 0; ; size++) {
+ UChar cc = m_input.peekWithoutReplacement(size);
+ if (cc == endingCodePoint) {
+ unsigned startOffset = m_input.offset();
+ // Step over the |size| content characters plus the closing quote.
+ m_input.advance(size + 1);
+ return CSSParserToken(StringToken, m_input.rangeAt(startOffset, size));
+ }
+ if (isNewLine(cc)) {
+ // Stop in front of the newline; it is not consumed.
+ m_input.advance(size);
+ return CSSParserToken(BadStringToken);
+ }
+ // A NUL (which is also what peeking past the end of input yields) or a
+ // backslash needs the slower path below. Nothing has been consumed yet,
+ // so that path restarts from the first character of the string.
+ if (cc == '\0' || cc == '\\')
+ break;
+ }
+
+ StringBuilder output;
+ while (true) {
+ UChar cc = consume();
+ // End of input terminates the string the same way the closing quote does.
+ if (cc == endingCodePoint || cc == kEndOfFileMarker)
+ return CSSParserToken(StringToken, registerString(output.toString()));
+ if (isNewLine(cc)) {
+ // Put the newline back so it is tokenized separately.
+ reconsume(cc);
+ return CSSParserToken(BadStringToken);
+ }
+ if (cc == '\\') {
+ // Backslash at end of input: append nothing; the next iteration reads
+ // the end-of-file marker and returns the string.
+ if (m_input.nextInputChar() == kEndOfFileMarker)
+ continue;
+ // Backslash followed by a newline: the newline is consumed and nothing
+ // is appended to the output.
+ if (isNewLine(m_input.peekWithoutReplacement(0)))
+ consumeSingleWhitespaceIfNext(); // This handles \r\n for us
+ else
+ output.append(consumeEscape());
+ } else
+ output.append(cc);
+ }
+}
+
+// Consumes the part of a unicode-range that follows "u+" and returns a
+// UnicodeRangeToken holding its start and end. Accepted forms are up to six
+// hex digits, optionally padded with '?' wildcards to six characters in
+// total, or hex digits followed by '-' and up to six more hex digits.
+CSSParserToken CSSTokenizer::consumeUnicodeRange()
+{
+    ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?');
+    int digitsLeft = 6;
+    UChar32 rangeStart = 0;
+
+    for (; digitsLeft && isASCIIHexDigit(m_input.peekWithoutReplacement(0)); --digitsLeft)
+        rangeStart = rangeStart * 16 + toASCIIHexValue(consume());
+
+    UChar32 rangeEnd = rangeStart;
+    if (digitsLeft && consumeIfNext('?')) {
+        // Each '?' contributes a 0 digit to the start and an F digit to the end.
+        do {
+            rangeStart *= 16;
+            rangeEnd = rangeEnd * 16 + 0xF;
+            --digitsLeft;
+        } while (digitsLeft && consumeIfNext('?'));
+        return CSSParserToken(UnicodeRangeToken, rangeStart, rangeEnd);
+    }
+
+    const bool hasExplicitEnd = m_input.peekWithoutReplacement(0) == '-'
+        && isASCIIHexDigit(m_input.peekWithoutReplacement(1));
+    if (hasExplicitEnd) {
+        // Step over the '-', then read the end value with a fresh six-digit budget.
+        m_input.advance();
+        digitsLeft = 6;
+        rangeEnd = 0;
+        do {
+            rangeEnd = rangeEnd * 16 + toASCIIHexValue(consume());
+            --digitsLeft;
+        } while (digitsLeft && isASCIIHexDigit(m_input.peekWithoutReplacement(0)));
+    }
+
+    return CSSParserToken(UnicodeRangeToken, rangeStart, rangeEnd);
+}
+
+// http://dev.w3.org/csswg/css-syntax/#non-printable-code-point
+// True for U+0000-U+0008, U+000B, U+000E-U+001F and U+007F.
+static bool isNonPrintableCodePoint(UChar cc)
+{
+    if (cc <= 0x08)
+        return true;
+    if (cc == 0x0B)
+        return true;
+    if (cc >= 0x0E && cc <= 0x1F)
+        return true;
+    return cc == 0x7F;
+}
+
+// http://dev.w3.org/csswg/css-syntax/#consume-url-token
+//
+// Consumes the unquoted contents of url( ... ) through the closing ')'.
+// consumeIdentLikeToken() calls this after it has consumed "url(" and found
+// that the argument does not start with a quote. Returns a UrlToken, or a
+// BadUrlToken after discarding the rest of a malformed URL.
+CSSParserToken CSSTokenizer::consumeUrlToken()
+{
+ m_input.advanceUntilNonWhitespace();
+
+ // URL tokens without escapes get handled without allocations
+ // This loop only peeks; the input is advanced once the ')' is found.
+ for (unsigned size = 0; ; size++) {
+ UChar cc = m_input.peekWithoutReplacement(size);
+ if (cc == ')') {
+ unsigned startOffset = m_input.offset();
+ // Step over the |size| URL characters plus the ')'.
+ m_input.advance(size + 1);
+ return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, size));
+ }
+ // Anything at or below space (control characters, white space, and the NUL
+ // seen at end of input), a backslash, a quote, '(' or DEL needs the slower
+ // path below. Nothing has been consumed yet, so that path restarts from
+ // the first URL character.
+ if (cc <= ' ' || cc == '\\' || cc == '"' || cc == '\'' || cc == '(' || cc == '\x7f')
+ break;
+ }
+
+ StringBuilder result;
+ while (true) {
+ UChar cc = consume();
+ // End of input terminates the URL the same way ')' does.
+ if (cc == ')' || cc == kEndOfFileMarker)
+ return CSSParserToken(UrlToken, registerString(result.toString()));
+
+ // White space inside the URL may only be trailing: after skipping it, the
+ // next thing must be ')' or end of input, otherwise the URL is bad.
+ if (isHTMLSpace(cc)) {
+ m_input.advanceUntilNonWhitespace();
+ if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker)
+ return CSSParserToken(UrlToken, registerString(result.toString()));
+ break;
+ }
+
+ // Quotes, '(' and non-printable code points make the URL bad.
+ if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc))
+ break;
+
+ // A backslash is accepted only as the start of a valid escape.
+ if (cc == '\\') {
+ if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) {
+ result.append(consumeEscape());
+ continue;
+ }
+ break;
+ }
+
+ result.append(cc);
+ }
+
+ // Reached only through one of the breaks above: discard what is left of the
+ // malformed URL.
+ consumeBadUrlRemnants();
+ return CSSParserToken(BadUrlToken);
+}
+
+// http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url
+// Discards input up to and including the next unescaped ')', or up to the
+// end of the input. Valid escapes are consumed whole so that an escaped ')'
+// does not end the scan.
+void CSSTokenizer::consumeBadUrlRemnants()
+{
+    for (UChar cc = consume(); cc != ')' && cc != kEndOfFileMarker; cc = consume()) {
+        if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
+            consumeEscape();
+    }
+}
+
+// Consumes at most one unit of white space: a "\r\n" pair counts as a single
+// unit, otherwise one HTML space character. Does nothing if neither is next.
+void CSSTokenizer::consumeSingleWhitespaceIfNext()
+{
+    // \r\n and HTML spaces are checked explicitly since there is no
+    // preprocessing stage.
+    const UChar upcoming = m_input.peekWithoutReplacement(0);
+    if (upcoming == '\r' && m_input.peekWithoutReplacement(1) == '\n') {
+        m_input.advance(2);
+        return;
+    }
+
+    if (isHTMLSpace(upcoming))
+        m_input.advance();
+}
+
+// Consumes input through the "*/" that closes a comment, or through the end
+// of the input if the comment is never closed.
+void CSSTokenizer::consumeUntilCommentEndFound()
+{
+    UChar current = consume();
+    while (current != kEndOfFileMarker) {
+        // Remember whether this character was a '*' before reading the next
+        // one; a run such as "**/" therefore still ends the comment.
+        const bool previousWasStar = current == '*';
+        current = consume();
+        if (previousWasStar && current == '/')
+            return;
+    }
+}
+
+// Consumes the next character and returns true when it equals |character|;
+// otherwise leaves the input untouched and returns false.
+bool CSSTokenizer::consumeIfNext(UChar character)
+{
+    // The peek does no replacement, so a NUL in the middle of the input
+    // cannot be told apart from kEndOfFileMarker. |character| must therefore
+    // never be NUL.
+    ASSERT(character);
+    if (m_input.peekWithoutReplacement(0) != character)
+        return false;
+
+    m_input.advance();
+    return true;
+}
+
+// http://www.w3.org/TR/css3-syntax/#consume-a-name
+//
+// Consumes the longest run of name code points and valid escapes at the
+// current position and returns it as a StringView. The result may be empty
+// if the input does not start with either.
+StringView CSSTokenizer::consumeName()
+{
+ // Names without escapes get handled without allocations
+ // This loop only peeks; the input is advanced once the end of the name is known.
+ for (unsigned size = 0; ; ++size) {
+ UChar cc = m_input.peekWithoutReplacement(size);
+ if (isNameCodePoint(cc))
+ continue;
+ // peekWithoutReplacement will return NUL when we hit the end of the
+ // input. In that case we want to still use the rangeAt() fast path
+ // below.
+ // A NUL found before the end of the input is a real character in the
+ // source and goes to the slower path instead.
+ if (cc == '\0' && m_input.offset() + size < m_input.length())
+ break;
+ // A backslash may start an escape, which the slower path decodes.
+ if (cc == '\\')
+ break;
+ // Any other character ends the name: return the |size| characters seen so far.
+ unsigned startOffset = m_input.offset();
+ m_input.advance(size);
+ return m_input.rangeAt(startOffset, size);
+ }
+
+ // Slower path: nothing has been consumed yet, so the name is rebuilt from its
+ // first character with escapes decoded along the way.
+ StringBuilder result;
+ while (true) {
+ UChar cc = consume();
+ if (isNameCodePoint(cc)) {
+ result.append(cc);
+ continue;
+ }
+ if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) {
+ result.append(consumeEscape());
+ continue;
+ }
+ // First character that is not part of the name: put it back and finish.
+ reconsume(cc);
+ return registerString(result.toString());
+ }
+}
+
+// http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
+//
+// Consumes what follows a backslash that the caller has already consumed and
+// returns the code point the escape stands for:
+// * one to six hex digits, plus one optional trailing white space unit,
+//   yield the value they spell; zero, surrogates and values above U+10FFFF
+//   yield the replacement character instead
+// * end of input yields the replacement character
+// * any other character yields itself
+// Callers must have checked that the escape is valid, i.e. that the
+// backslash is not followed by a newline.
+UChar32 CSSTokenizer::consumeEscape()
+{
+    UChar cc = consume();
+    ASSERT(!isNewLine(cc));
+    if (isASCIIHexDigit(cc)) {
+        // Accumulate the value digit by digit instead of collecting the digits
+        // in a string and parsing it afterwards. At most six digits are read,
+        // so the value never exceeds 0xFFFFFF and cannot overflow UChar32.
+        UChar32 codePoint = toASCIIHexValue(cc);
+        unsigned consumedHexDigits = 1;
+        while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) {
+            codePoint = codePoint * 16 + toASCIIHexValue(consume());
+            ++consumedHexDigits;
+        }
+        // One white space unit after the digits belongs to the escape.
+        consumeSingleWhitespaceIfNext();
+        if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF)
+            return replacementCharacter;
+        return codePoint;
+    }
+
+    if (cc == kEndOfFileMarker)
+        return replacementCharacter;
+    return cc;
+}
+
+// Reports whether the next two unconsumed characters form a valid escape.
+bool CSSTokenizer::nextTwoCharsAreValidEscape()
+{
+    const UChar first = m_input.peekWithoutReplacement(0);
+    const UChar second = m_input.peekWithoutReplacement(1);
+    return twoCharsAreValidEscape(first, second);
+}
+
+// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
+// |first| has already been consumed; the characters after it are peeked.
+// A number starts with a digit, a sign followed by a digit or by ".digit",
+// or a '.' followed by a digit.
+bool CSSTokenizer::nextCharsAreNumber(UChar first)
+{
+    const UChar second = m_input.peekWithoutReplacement(0);
+    if (isASCIIDigit(first))
+        return true;
+
+    switch (first) {
+    case '+':
+    case '-':
+        if (isASCIIDigit(second))
+            return true;
+        return second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1));
+    case '.':
+        return isASCIIDigit(second);
+    default:
+        return false;
+    }
+}
+
+// Same check for input where nothing has been consumed yet: the first
+// character is consumed for the test and then put back.
+bool CSSTokenizer::nextCharsAreNumber()
+{
+    const UChar leading = consume();
+    const bool startsNumber = nextCharsAreNumber(leading);
+    reconsume(leading);
+    return startsNumber;
+}
+
+// http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier
+// |first| has already been consumed; the characters after it are peeked.
+// An identifier starts with a name-start code point or a valid escape, or
+// with '-' followed by a name-start code point, another '-' or a valid
+// escape.
+bool CSSTokenizer::nextCharsAreIdentifier(UChar first)
+{
+    const UChar second = m_input.peekWithoutReplacement(0);
+    if (isNameStartCodePoint(first) || twoCharsAreValidEscape(first, second))
+        return true;
+
+    if (first != '-')
+        return false;
+
+    return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape();
+}
+
+// Same check for input where nothing has been consumed yet: the first
+// character is consumed for the test and then put back.
+bool CSSTokenizer::nextCharsAreIdentifier()
+{
+    const UChar leading = consume();
+    const bool startsIdentifier = nextCharsAreIdentifier(leading);
+    reconsume(leading);
+    return startsIdentifier;
+}
+
+// Appends |string| to m_stringPool and returns a view of it.
+// NOTE(review): presumably the pool entry is what keeps the returned view's
+// characters alive for the tokenizer's lifetime - confirm.
+StringView CSSTokenizer::registerString(const String& string)
+{
+    m_stringPool.append(string);
+    StringView view = string;
+    return view;
+}
+
+} // namespace WebCore