diff options
Diffstat (limited to 'Source/WebCore/css/parser/CSSTokenizer.cpp')
-rw-r--r-- | Source/WebCore/css/parser/CSSTokenizer.cpp | 879 |
1 files changed, 879 insertions, 0 deletions
diff --git a/Source/WebCore/css/parser/CSSTokenizer.cpp b/Source/WebCore/css/parser/CSSTokenizer.cpp new file mode 100644 index 000000000..1f1a23e4d --- /dev/null +++ b/Source/WebCore/css/parser/CSSTokenizer.cpp @@ -0,0 +1,879 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Copyright (C) 2016 Apple Inc. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "config.h" +#include "CSSTokenizer.h" + +#include "CSSParserIdioms.h" +#include "CSSParserObserverWrapper.h" +#include "CSSParserTokenRange.h" +#include "CSSTokenizerInputStream.h" +#include "HTMLParserIdioms.h" +#include <wtf/text/StringBuilder.h> +#include <wtf/unicode/CharacterNames.h> + +namespace WebCore { + +CSSTokenizer::CSSTokenizer(const String& string) + : m_input(string) +{ + // According to the spec, we should perform preprocessing here. + // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing + // + // However, we can skip this step since: + // * We're using HTML spaces (which accept \r and \f as a valid white space) + // * Do not count white spaces + // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement characters + + if (string.isEmpty()) + return; + + // To avoid resizing we err on the side of reserving too much space. + // Most strings we tokenize have about 3.5 to 5 characters per token. + m_tokens.reserveInitialCapacity(string.length() / 3); + + while (true) { + CSSParserToken token = nextToken(); + if (token.type() == CommentToken) + continue; + if (token.type() == EOFToken) + return; + m_tokens.append(token); + } +} + +CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper) + : m_input(string) +{ + if (string.isEmpty()) + return; + + unsigned offset = 0; + while (true) { + CSSParserToken token = nextToken(); + if (token.type() == EOFToken) + break; + if (token.type() == CommentToken) + wrapper.addComment(offset, m_input.offset(), m_tokens.size()); + else { + m_tokens.append(token); + wrapper.addToken(offset); + } + offset = m_input.offset(); + } + + wrapper.addToken(offset); + wrapper.finalizeConstruction(m_tokens.begin()); +} + +CSSParserTokenRange CSSTokenizer::tokenRange() const +{ + return m_tokens; +} + +unsigned CSSTokenizer::tokenCount() +{ + return m_tokens.size(); +} + +static bool isNewLine(UChar cc) +{ + // We check \r and \f here, since we have no preprocessing stage + return (cc == '\r' || cc == '\n' || cc == '\f'); +} + +// http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape +static bool twoCharsAreValidEscape(UChar first, UChar second) +{ + return first == '\\' && !isNewLine(second); +} + +void CSSTokenizer::reconsume(UChar c) +{ + m_input.pushBack(c); +} + +UChar CSSTokenizer::consume() +{ + UChar current = m_input.nextInputChar(); + m_input.advance(); + return current; +} + +CSSParserToken CSSTokenizer::whiteSpace(UChar /*cc*/) +{ + m_input.advanceUntilNonWhitespace(); + return CSSParserToken(WhitespaceToken); +} + +CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type) +{ + m_blockStack.append(type); + return CSSParserToken(type, CSSParserToken::BlockStart); +} + +CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name) +{ + m_blockStack.append(blockType); + return CSSParserToken(type, name, CSSParserToken::BlockStart); +} + +CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType) +{ + if (!m_blockStack.isEmpty() && m_blockStack.last() == startType) { + m_blockStack.removeLast(); + return CSSParserToken(type, CSSParserToken::BlockEnd); + } + return CSSParserToken(type); +} + +CSSParserToken CSSTokenizer::leftParenthesis(UChar /*cc*/) +{ + return blockStart(LeftParenthesisToken); +} + +CSSParserToken CSSTokenizer::rightParenthesis(UChar /*cc*/) +{ + return blockEnd(RightParenthesisToken, LeftParenthesisToken); +} + +CSSParserToken CSSTokenizer::leftBracket(UChar /*cc*/) +{ + return blockStart(LeftBracketToken); +} + +CSSParserToken CSSTokenizer::rightBracket(UChar /*cc*/) +{ + return blockEnd(RightBracketToken, LeftBracketToken); +} + +CSSParserToken CSSTokenizer::leftBrace(UChar /*cc*/) +{ + return blockStart(LeftBraceToken); +} + +CSSParserToken CSSTokenizer::rightBrace(UChar /*cc*/) +{ + return blockEnd(RightBraceToken, LeftBraceToken); +} + +CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc) +{ + if (nextCharsAreNumber(cc)) { + reconsume(cc); + return consumeNumericToken(); + } + return CSSParserToken(DelimiterToken, cc); +} + +CSSParserToken CSSTokenizer::asterisk(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '*'); + if (consumeIfNext('=')) + return CSSParserToken(SubstringMatchToken); + return CSSParserToken(DelimiterToken, '*'); +} + +CSSParserToken CSSTokenizer::lessThan(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '<'); + if (m_input.peekWithoutReplacement(0) == '!' + && m_input.peekWithoutReplacement(1) == '-' + && m_input.peekWithoutReplacement(2) == '-') { + m_input.advance(3); + return CSSParserToken(CDOToken); + } + return CSSParserToken(DelimiterToken, '<'); +} + +CSSParserToken CSSTokenizer::comma(UChar /*cc*/) +{ + return CSSParserToken(CommaToken); +} + +CSSParserToken CSSTokenizer::hyphenMinus(UChar cc) +{ + if (nextCharsAreNumber(cc)) { + reconsume(cc); + return consumeNumericToken(); + } + if (m_input.peekWithoutReplacement(0) == '-' + && m_input.peekWithoutReplacement(1) == '>') { + m_input.advance(2); + return CSSParserToken(CDCToken); + } + if (nextCharsAreIdentifier(cc)) { + reconsume(cc); + return consumeIdentLikeToken(); + } + return CSSParserToken(DelimiterToken, cc); +} + +CSSParserToken CSSTokenizer::solidus(UChar cc) +{ + if (consumeIfNext('*')) { + // These get ignored, but we need a value to return. + consumeUntilCommentEndFound(); + return CSSParserToken(CommentToken); + } + + return CSSParserToken(DelimiterToken, cc); +} + +CSSParserToken CSSTokenizer::colon(UChar /*cc*/) +{ + return CSSParserToken(ColonToken); +} + +CSSParserToken CSSTokenizer::semiColon(UChar /*cc*/) +{ + return CSSParserToken(SemicolonToken); +} + +CSSParserToken CSSTokenizer::hash(UChar cc) +{ + UChar nextChar = m_input.peekWithoutReplacement(0); + if (isNameCodePoint(nextChar) || twoCharsAreValidEscape(nextChar, m_input.peekWithoutReplacement(1))) { + HashTokenType type = nextCharsAreIdentifier() ? HashTokenId : HashTokenUnrestricted; + return CSSParserToken(type, consumeName()); + } + + return CSSParserToken(DelimiterToken, cc); +} + +CSSParserToken CSSTokenizer::circumflexAccent(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '^'); + if (consumeIfNext('=')) + return CSSParserToken(PrefixMatchToken); + return CSSParserToken(DelimiterToken, '^'); +} + +CSSParserToken CSSTokenizer::dollarSign(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '$'); + if (consumeIfNext('=')) + return CSSParserToken(SuffixMatchToken); + return CSSParserToken(DelimiterToken, '$'); +} + +CSSParserToken CSSTokenizer::verticalLine(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '|'); + if (consumeIfNext('=')) + return CSSParserToken(DashMatchToken); + if (consumeIfNext('|')) + return CSSParserToken(ColumnToken); + return CSSParserToken(DelimiterToken, '|'); +} + +CSSParserToken CSSTokenizer::tilde(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '~'); + if (consumeIfNext('=')) + return CSSParserToken(IncludeMatchToken); + return CSSParserToken(DelimiterToken, '~'); +} + +CSSParserToken CSSTokenizer::commercialAt(UChar cc) +{ + ASSERT_UNUSED(cc, cc == '@'); + if (nextCharsAreIdentifier()) + return CSSParserToken(AtKeywordToken, consumeName()); + return CSSParserToken(DelimiterToken, '@'); +} + +CSSParserToken CSSTokenizer::reverseSolidus(UChar cc) +{ + if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { + reconsume(cc); + return consumeIdentLikeToken(); + } + return CSSParserToken(DelimiterToken, cc); +} + +CSSParserToken CSSTokenizer::asciiDigit(UChar cc) +{ + reconsume(cc); + return consumeNumericToken(); +} + +CSSParserToken CSSTokenizer::letterU(UChar cc) +{ + if (m_input.peekWithoutReplacement(0) == '+' + && (isASCIIHexDigit(m_input.peekWithoutReplacement(1)) + || m_input.peekWithoutReplacement(1) == '?')) { + m_input.advance(); + return consumeUnicodeRange(); + } + reconsume(cc); + return consumeIdentLikeToken(); +} + +CSSParserToken CSSTokenizer::nameStart(UChar cc) +{ + reconsume(cc); + return consumeIdentLikeToken(); +} + +CSSParserToken CSSTokenizer::stringStart(UChar cc) +{ + return consumeStringTokenUntil(cc); +} + +CSSParserToken CSSTokenizer::endOfFile(UChar /*cc*/) +{ + return CSSParserToken(EOFToken); +} + +const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = { + &CSSTokenizer::endOfFile, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + &CSSTokenizer::whiteSpace, + &CSSTokenizer::whiteSpace, + 0, + &CSSTokenizer::whiteSpace, + &CSSTokenizer::whiteSpace, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + &CSSTokenizer::whiteSpace, + 0, + &CSSTokenizer::stringStart, + &CSSTokenizer::hash, + &CSSTokenizer::dollarSign, + 0, + 0, + &CSSTokenizer::stringStart, + &CSSTokenizer::leftParenthesis, + &CSSTokenizer::rightParenthesis, + &CSSTokenizer::asterisk, + &CSSTokenizer::plusOrFullStop, + &CSSTokenizer::comma, + &CSSTokenizer::hyphenMinus, + &CSSTokenizer::plusOrFullStop, + &CSSTokenizer::solidus, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::asciiDigit, + &CSSTokenizer::colon, + &CSSTokenizer::semiColon, + &CSSTokenizer::lessThan, + 0, + 0, + 0, + &CSSTokenizer::commercialAt, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::letterU, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::leftBracket, + &CSSTokenizer::reverseSolidus, + &CSSTokenizer::rightBracket, + &CSSTokenizer::circumflexAccent, + &CSSTokenizer::nameStart, + 0, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::letterU, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::nameStart, + &CSSTokenizer::leftBrace, + &CSSTokenizer::verticalLine, + &CSSTokenizer::rightBrace, + &CSSTokenizer::tilde, + 0, +}; +#if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED +const unsigned codePointsNumber = 128; +#endif + +CSSParserToken CSSTokenizer::nextToken() +{ + // Unlike the HTMLTokenizer, the CSS Syntax spec is written + // as a stateless, (fixed-size) look-ahead tokenizer. + // We could move to the stateful model and instead create + // states for all the "next 3 codepoints are X" cases. + // State-machine tokenizers are easier to write to handle + // incremental tokenization of partial sources. + // However, for now we follow the spec exactly. + UChar cc = consume(); + CodePoint codePointFunc = 0; + + if (isASCII(cc)) { + ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber); + codePointFunc = codePoints[cc]; + } else + codePointFunc = &CSSTokenizer::nameStart; + + if (codePointFunc) + return ((this)->*(codePointFunc))(cc); + return CSSParserToken(DelimiterToken, cc); +} + +// This method merges the following spec sections for efficiency +// http://www.w3.org/TR/css3-syntax/#consume-a-number +// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number +CSSParserToken CSSTokenizer::consumeNumber() +{ + ASSERT(nextCharsAreNumber()); + + NumericValueType type = IntegerValueType; + NumericSign sign = NoSign; + unsigned numberLength = 0; + + UChar next = m_input.peekWithoutReplacement(0); + if (next == '+') { + ++numberLength; + sign = PlusSign; + } else if (next == '-') { + ++numberLength; + sign = MinusSign; + } + + numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength); + next = m_input.peekWithoutReplacement(numberLength); + if (next == '.' && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 1))) { + type = NumberValueType; + numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 2); + next = m_input.peekWithoutReplacement(numberLength); + } + + if (next == 'E' || next == 'e') { + next = m_input.peekWithoutReplacement(numberLength + 1); + if (isASCIIDigit(next)) { + type = NumberValueType; + numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 1); + } else if ((next == '+' || next == '-') && isASCIIDigit(m_input.peekWithoutReplacement(numberLength + 2))) { + type = NumberValueType; + numberLength = m_input.skipWhilePredicate<isASCIIDigit>(numberLength + 3); + } + } + + double value = m_input.getDouble(0, numberLength); + m_input.advance(numberLength); + + return CSSParserToken(NumberToken, value, type, sign); +} + +// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token +CSSParserToken CSSTokenizer::consumeNumericToken() +{ + CSSParserToken token = consumeNumber(); + if (nextCharsAreIdentifier()) + token.convertToDimensionWithUnit(consumeName()); + else if (consumeIfNext('%')) + token.convertToPercentage(); + return token; +} + +// http://dev.w3.org/csswg/css-syntax/#consume-ident-like-token +CSSParserToken CSSTokenizer::consumeIdentLikeToken() +{ + StringView name = consumeName(); + if (consumeIfNext('(')) { + if (equalIgnoringASCIICase(name, "url")) { + // The spec is slightly different so as to avoid dropping whitespace + // tokens, but they wouldn't be used and this is easier. + m_input.advanceUntilNonWhitespace(); + UChar next = m_input.peekWithoutReplacement(0); + if (next != '"' && next != '\'') + return consumeUrlToken(); + } + return blockStart(LeftParenthesisToken, FunctionToken, name); + } + return CSSParserToken(IdentToken, name); +} + +// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token +CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) +{ + // Strings without escapes get handled without allocations + for (unsigned size = 0; ; size++) { + UChar cc = m_input.peekWithoutReplacement(size); + if (cc == endingCodePoint) { + unsigned startOffset = m_input.offset(); + m_input.advance(size + 1); + return CSSParserToken(StringToken, m_input.rangeAt(startOffset, size)); + } + if (isNewLine(cc)) { + m_input.advance(size); + return CSSParserToken(BadStringToken); + } + if (cc == '\0' || cc == '\\') + break; + } + + StringBuilder output; + while (true) { + UChar cc = consume(); + if (cc == endingCodePoint || cc == kEndOfFileMarker) + return CSSParserToken(StringToken, registerString(output.toString())); + if (isNewLine(cc)) { + reconsume(cc); + return CSSParserToken(BadStringToken); + } + if (cc == '\\') { + if (m_input.nextInputChar() == kEndOfFileMarker) + continue; + if (isNewLine(m_input.peekWithoutReplacement(0))) + consumeSingleWhitespaceIfNext(); // This handles \r\n for us + else + output.append(consumeEscape()); + } else + output.append(cc); + } +} + +CSSParserToken CSSTokenizer::consumeUnicodeRange() +{ + ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?'); + int lengthRemaining = 6; + UChar32 start = 0; + + while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) { + start = start * 16 + toASCIIHexValue(consume()); + --lengthRemaining; + } + + UChar32 end = start; + if (lengthRemaining && consumeIfNext('?')) { + do { + start *= 16; + end = end * 16 + 0xF; + --lengthRemaining; + } while (lengthRemaining && consumeIfNext('?')); + } else if (m_input.peekWithoutReplacement(0) == '-' && isASCIIHexDigit(m_input.peekWithoutReplacement(1))) { + m_input.advance(); + lengthRemaining = 6; + end = 0; + do { + end = end * 16 + toASCIIHexValue(consume()); + --lengthRemaining; + } while (lengthRemaining && isASCIIHexDigit(m_input.peekWithoutReplacement(0))); + } + + return CSSParserToken(UnicodeRangeToken, start, end); +} + +// http://dev.w3.org/csswg/css-syntax/#non-printable-code-point +static bool isNonPrintableCodePoint(UChar cc) +{ + return cc <= '\x8' || cc == '\xb' || (cc >= '\xe' && cc <= '\x1f') || cc == '\x7f'; +} + +// http://dev.w3.org/csswg/css-syntax/#consume-url-token +CSSParserToken CSSTokenizer::consumeUrlToken() +{ + m_input.advanceUntilNonWhitespace(); + + // URL tokens without escapes get handled without allocations + for (unsigned size = 0; ; size++) { + UChar cc = m_input.peekWithoutReplacement(size); + if (cc == ')') { + unsigned startOffset = m_input.offset(); + m_input.advance(size + 1); + return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, size)); + } + if (cc <= ' ' || cc == '\\' || cc == '"' || cc == '\'' || cc == '(' || cc == '\x7f') + break; + } + + StringBuilder result; + while (true) { + UChar cc = consume(); + if (cc == ')' || cc == kEndOfFileMarker) + return CSSParserToken(UrlToken, registerString(result.toString())); + + if (isHTMLSpace(cc)) { + m_input.advanceUntilNonWhitespace(); + if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker) + return CSSParserToken(UrlToken, registerString(result.toString())); + break; + } + + if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc)) + break; + + if (cc == '\\') { + if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { + result.append(consumeEscape()); + continue; + } + break; + } + + result.append(cc); + } + + consumeBadUrlRemnants(); + return CSSParserToken(BadUrlToken); +} + +// http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url +void CSSTokenizer::consumeBadUrlRemnants() +{ + while (true) { + UChar cc = consume(); + if (cc == ')' || cc == kEndOfFileMarker) + return; + if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) + consumeEscape(); + } +} + +void CSSTokenizer::consumeSingleWhitespaceIfNext() +{ + // We check for \r\n and HTML spaces since we don't do preprocessing + UChar next = m_input.peekWithoutReplacement(0); + if (next == '\r' && m_input.peekWithoutReplacement(1) == '\n') + m_input.advance(2); + else if (isHTMLSpace(next)) + m_input.advance(); +} + +void CSSTokenizer::consumeUntilCommentEndFound() +{ + UChar c = consume(); + while (true) { + if (c == kEndOfFileMarker) + return; + if (c != '*') { + c = consume(); + continue; + } + c = consume(); + if (c == '/') + return; + } +} + +bool CSSTokenizer::consumeIfNext(UChar character) +{ + // Since we're not doing replacement we can't tell the difference from + // a NUL in the middle and the kEndOfFileMarker, so character must not be + // NUL. + ASSERT(character); + if (m_input.peekWithoutReplacement(0) == character) { + m_input.advance(); + return true; + } + return false; +} + +// http://www.w3.org/TR/css3-syntax/#consume-a-name +StringView CSSTokenizer::consumeName() +{ + // Names without escapes get handled without allocations + for (unsigned size = 0; ; ++size) { + UChar cc = m_input.peekWithoutReplacement(size); + if (isNameCodePoint(cc)) + continue; + // peekWithoutReplacement will return NUL when we hit the end of the + // input. In that case we want to still use the rangeAt() fast path + // below. + if (cc == '\0' && m_input.offset() + size < m_input.length()) + break; + if (cc == '\\') + break; + unsigned startOffset = m_input.offset(); + m_input.advance(size); + return m_input.rangeAt(startOffset, size); + } + + StringBuilder result; + while (true) { + UChar cc = consume(); + if (isNameCodePoint(cc)) { + result.append(cc); + continue; + } + if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0))) { + result.append(consumeEscape()); + continue; + } + reconsume(cc); + return registerString(result.toString()); + } +} + +// http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point +UChar32 CSSTokenizer::consumeEscape() +{ + UChar cc = consume(); + ASSERT(!isNewLine(cc)); + if (isASCIIHexDigit(cc)) { + unsigned consumedHexDigits = 1; + StringBuilder hexChars; + hexChars.append(cc); + while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0))) { + cc = consume(); + hexChars.append(cc); + consumedHexDigits++; + }; + consumeSingleWhitespaceIfNext(); + bool ok = false; + UChar32 codePoint = hexChars.toString().toUIntStrict(&ok, 16); + ASSERT(ok); + if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF) + return replacementCharacter; + return codePoint; + } + + if (cc == kEndOfFileMarker) + return replacementCharacter; + return cc; +} + +bool CSSTokenizer::nextTwoCharsAreValidEscape() +{ + return twoCharsAreValidEscape(m_input.peekWithoutReplacement(0), m_input.peekWithoutReplacement(1)); +} + +// http://www.w3.org/TR/css3-syntax/#starts-with-a-number +bool CSSTokenizer::nextCharsAreNumber(UChar first) +{ + UChar second = m_input.peekWithoutReplacement(0); + if (isASCIIDigit(first)) + return true; + if (first == '+' || first == '-') + return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1)))); + if (first =='.') + return (isASCIIDigit(second)); + return false; +} + +bool CSSTokenizer::nextCharsAreNumber() +{ + UChar first = consume(); + bool areNumber = nextCharsAreNumber(first); + reconsume(first); + return areNumber; +} + +// http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier +bool CSSTokenizer::nextCharsAreIdentifier(UChar first) +{ + UChar second = m_input.peekWithoutReplacement(0); + if (isNameStartCodePoint(first) || twoCharsAreValidEscape(first, second)) + return true; + + if (first == '-') + return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape(); + + return false; +} + +bool CSSTokenizer::nextCharsAreIdentifier() +{ + UChar first = consume(); + bool areIdentifier = nextCharsAreIdentifier(first); + reconsume(first); + return areIdentifier; +} + +StringView CSSTokenizer::registerString(const String& string) +{ + m_stringPool.append(string); + return string; +} + +} // namespace WebCore |