diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
commit | 1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch) | |
tree | 46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/JavaScriptCore/yarr/YarrParser.h | |
parent | 32761a6cee1d0dee366b885b7b9c777e67885688 (diff) | |
download | WebKitGtk-tarball-master.tar.gz |
webkitgtk-2.16.5HEADwebkitgtk-2.16.5master
Diffstat (limited to 'Source/JavaScriptCore/yarr/YarrParser.h')
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrParser.h | 225 |
1 files changed, 149 insertions, 76 deletions
diff --git a/Source/JavaScriptCore/yarr/YarrParser.h b/Source/JavaScriptCore/yarr/YarrParser.h index 366aa40d3..9c0982017 100644 --- a/Source/JavaScriptCore/yarr/YarrParser.h +++ b/Source/JavaScriptCore/yarr/YarrParser.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,18 +23,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrParser_h -#define YarrParser_h +#pragma once #include "Yarr.h" #include <wtf/ASCIICType.h> #include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { -#define REGEXP_ERROR_PREFIX "Invalid regular expression: " - enum BuiltInCharacterClassID { DigitClassID, SpaceClassID, @@ -47,22 +43,7 @@ template<class Delegate, typename CharType> class Parser { private: template<class FriendDelegate> - friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - QuantifierTooLarge, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; + friend const char* parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); /* * CharacterClassParserDelegate: @@ -75,7 +56,7 @@ private: */ class CharacterClassParserDelegate { public: - CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + CharacterClassParserDelegate(Delegate& delegate, YarrPattern::ErrorCode& err) : m_delegate(delegate) , m_err(err) , m_state(Empty) @@ -102,7 +83,7 @@ private: * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ * is different to /[a\-z]/). */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) { switch (m_state) { case AfterCharacterClass: @@ -137,7 +118,7 @@ private: case CachedCharacterHyphen: if (ch < m_character) { - m_err = CharacterClassOutOfOrder; + m_err = YarrPattern::CharacterClassOutOfOrder; return; } m_delegate.atomCharacterClassRange(m_character, ch); @@ -218,7 +199,7 @@ private: private: Delegate& m_delegate; - ErrorCode& m_err; + YarrPattern::ErrorCode& m_err; enum CharacterClassConstructionState { Empty, CachedCharacter, @@ -226,20 +207,34 @@ private: AfterCharacterClass, AfterCharacterClassHyphen, } m_state; - UChar m_character; + UChar32 m_character; }; - Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) + Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) : m_delegate(delegate) , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.getCharacters<CharType>()) + , m_err(YarrPattern::NoError) + , m_data(pattern.characters<CharType>()) , m_size(pattern.length()) , m_index(0) + , m_isUnicode(isUnicode) , m_parenthesesNestingDepth(0) { } + // The handling of IdentityEscapes is different depending on the unicode flag. + // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. + // For non-unicode patterns, most any character can be escaped. + bool isIdentityEscapeAnError(int ch) + { + if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) { + m_err = YarrPattern::InvalidIdentityEscape; + return true; + } + + return false; + } + /* * parseEscape(): * @@ -268,7 +263,7 @@ private: consume(); if (atEndOfPattern()) { - m_err = EscapeUnterminated; + m_err = YarrPattern::EscapeUnterminated; return false; } @@ -276,18 +271,24 @@ private: // Assertions case 'b': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('b')) + break; + delegate.atomPatternCharacter('\b'); - else { + } else { delegate.assertionWordBoundary(false); return false; } break; case 'B': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('B')) + break; + delegate.atomPatternCharacter('B'); - else { + } else { delegate.assertionWordBoundary(true); return false; } @@ -402,9 +403,12 @@ private: case 'x': { consume(); int x = tryConsumeHex(2); - if (x == -1) + if (x == -1) { + if (isIdentityEscapeAnError('x')) + break; + delegate.atomPatternCharacter('x'); - else + } else delegate.atomPatternCharacter(x); break; } @@ -412,22 +416,101 @@ private: // UnicodeEscape case 'u': { consume(); + if (atEndOfPattern()) { + if (isIdentityEscapeAnError('u')) + break; + + delegate.atomPatternCharacter('u'); + break; + } + + if (m_isUnicode && peek() == '{') { + consume(); + UChar32 codePoint = 0; + do { + if (atEndOfPattern() || !isASCIIHexDigit(peek())) { + m_err = YarrPattern::InvalidUnicodeEscape; + break; + } + + codePoint = (codePoint << 4) | toASCIIHexValue(consume()); + + if (codePoint > UCHAR_MAX_VALUE) + m_err = YarrPattern::InvalidUnicodeEscape; + } while (!atEndOfPattern() && peek() != '}'); + if (!atEndOfPattern() && peek() == '}') + consume(); + else if (!m_err) + m_err = YarrPattern::InvalidUnicodeEscape; + if (m_err) + return false; + + delegate.atomPatternCharacter(codePoint); + break; + } int u = tryConsumeHex(4); - if (u == -1) + if (u == -1) { + if (isIdentityEscapeAnError('u')) + break; + delegate.atomPatternCharacter('u'); - else + } else { + // If we have the first of a surrogate pair, look for the second. + if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { + ParseState state = saveState(); + consume(); + + if (tryConsume('u')) { + int surrogate2 = tryConsumeHex(4); + if (U16_IS_TRAIL(surrogate2)) { + u = U16_GET_SUPPLEMENTARY(u, surrogate2); + delegate.atomPatternCharacter(u); + break; + } + } + + restoreState(state); + } delegate.atomPatternCharacter(u); + } break; } // IdentityEscape default: + int ch = peek(); + + if (ch == '-' && m_isUnicode && inCharacterClass) { + // \- is allowed for ClassEscape with unicode flag. + delegate.atomPatternCharacter(consume()); + break; + } + + if (isIdentityEscapeAnError(ch)) + break; + delegate.atomPatternCharacter(consume()); } return true; } + UChar32 consumePossibleSurrogatePair() + { + UChar32 ch = consume(); + if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { + ParseState state = saveState(); + + UChar32 surrogate2 = consume(); + if (U16_IS_TRAIL(surrogate2)) + ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); + else + restoreState(state); + } + + return ch; + } + /* * parseAtomEscape(), parseCharacterClassEscape(): * @@ -471,14 +554,14 @@ private: break; default: - characterClassConstructor.atomPatternCharacter(consume(), true); + characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); } if (m_err) return; } - m_err = CharacterClassUnmatched; + m_err = YarrPattern::CharacterClassUnmatched; } /* @@ -494,7 +577,7 @@ private: if (tryConsume('?')) { if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; + m_err = YarrPattern::ParenthesesTypeInvalid; return; } @@ -512,7 +595,7 @@ private: break; default: - m_err = ParenthesesTypeInvalid; + m_err = YarrPattern::ParenthesesTypeInvalid; } } else m_delegate.atomParenthesesSubpatternBegin(); @@ -534,7 +617,7 @@ private: if (m_parenthesesNestingDepth > 0) m_delegate.atomParenthesesEnd(); else - m_err = ParenthesesUnmatched; + m_err = YarrPattern::ParenthesesUnmatched; --m_parenthesesNestingDepth; } @@ -550,14 +633,14 @@ private: ASSERT(min <= max); if (min == UINT_MAX) { - m_err = QuantifierTooLarge; + m_err = YarrPattern::QuantifierTooLarge; return; } if (lastTokenWasAnAtom) m_delegate.quantifyAtom(min, max, !tryConsume('?')); else - m_err = QuantifierWithoutAtom; + m_err = YarrPattern::QuantifierWithoutAtom; } /* @@ -651,7 +734,7 @@ private: if (min <= max) parseQuantifier(lastTokenWasAnAtom, min, max); else - m_err = QuantifierOutOfOrder; + m_err = YarrPattern::QuantifierOutOfOrder; lastTokenWasAnAtom = false; break; } @@ -663,7 +746,7 @@ private: FALLTHROUGH; default: - m_delegate.atomPatternCharacter(consume()); + m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); lastTokenWasAnAtom = true; } @@ -672,7 +755,7 @@ private: } if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; + m_err = YarrPattern::MissingParentheses; } /* @@ -684,27 +767,12 @@ private: const char* parse() { if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; + m_err = YarrPattern::PatternTooLarge; else parseTokens(); ASSERT(atEndOfPattern() || m_err); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - REGEXP_ERROR_PREFIX "regular expression too large", - REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", - REGEXP_ERROR_PREFIX "nothing to repeat", - REGEXP_ERROR_PREFIX "number too large in {} quantifier", - REGEXP_ERROR_PREFIX "missing )", - REGEXP_ERROR_PREFIX "unmatched parentheses", - REGEXP_ERROR_PREFIX "unrecognized character after (?", - REGEXP_ERROR_PREFIX "missing terminating ] for character class", - REGEXP_ERROR_PREFIX "range out of order in character class", - REGEXP_ERROR_PREFIX "\\ at end of pattern" - }; - - return errorMessages[m_err]; + + return YarrPattern::errorMessage(m_err); } // Misc helper functions: @@ -727,6 +795,12 @@ private: return m_index == m_size; } + unsigned patternRemaining() + { + ASSERT(m_index <= m_size); + return m_size - m_index; + } + int peek() { ASSERT(m_index < m_size); @@ -802,10 +876,11 @@ private: Delegate& m_delegate; unsigned m_backReferenceLimit; - ErrorCode m_err; + YarrPattern::ErrorCode m_err; const CharType* m_data; unsigned m_size; unsigned m_index; + bool m_isUnicode; unsigned m_parenthesesNestingDepth; // Derived by empirical testing of compile time in PCRE and WREC. @@ -826,11 +901,11 @@ private: * void assertionEOL(); * void assertionWordBoundary(bool invert); * - * void atomPatternCharacter(UChar ch); + * void atomPatternCharacter(UChar32 ch); * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassAtom(UChar32 ch) + * void atomCharacterClassRange(UChar32 begin, UChar32 end) * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) * void atomCharacterClassEnd() * void atomParenthesesSubpatternBegin(bool capture = true); @@ -872,13 +947,11 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) +const char* parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) { if (pattern.is8Bit()) - return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); - return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); + return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); + return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); } } } // namespace JSC::Yarr - -#endif // YarrParser_h |