diff options
Diffstat (limited to 'src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h')
-rw-r--r-- | src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h | 854 |
1 files changed, 0 insertions, 854 deletions
diff --git a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h b/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h deleted file mode 100644 index 64e8463..0000000 --- a/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexParser.h +++ /dev/null @@ -1,854 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef RegexParser_h -#define RegexParser_h - -#include <wtf/Platform.h> - -#if ENABLE(YARR) - -#include <UString.h> -#include <wtf/ASCIICType.h> -#include <wtf/unicode/Unicode.h> -#include <limits.h> - -namespace JSC { namespace Yarr { - -enum BuiltInCharacterClassID { - DigitClassID, - SpaceClassID, - WordClassID, - NewlineClassID, -}; - -// The Parser class should not be used directly - only via the Yarr::parse() method. -template<class Delegate> -class Parser { -private: - template<class FriendDelegate> - friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; - - /* - * CharacterClassParserDelegate: - * - * The class CharacterClassParserDelegate is used in the parsing of character - * classes. This class handles detection of character ranges. This class - * implements enough of the delegate interface such that it can be passed to - * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused - * to perform the parsing of escape characters in character sets. - */ - class CharacterClassParserDelegate { - public: - CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) - : m_delegate(delegate) - , m_err(err) - , m_state(empty) - { - } - - /* - * begin(): - * - * Called at beginning of construction. - */ - void begin(bool invert) - { - m_delegate.atomCharacterClassBegin(invert); - } - - /* - * atomPatternCharacterUnescaped(): - * - * This method is called directly from parseCharacterClass(), to report a new - * pattern character token. This method differs from atomPatternCharacter(), - * which will be called from parseEscape(), since a hypen provided via this - * method may be indicating a character range, but a hyphen parsed by - * parseEscape() cannot be interpreted as doing so. - */ - void atomPatternCharacterUnescaped(UChar ch) - { - switch (m_state) { - case empty: - m_character = ch; - m_state = cachedCharacter; - break; - - case cachedCharacter: - if (ch == '-') - m_state = cachedCharacterHyphen; - else { - m_delegate.atomCharacterClassAtom(m_character); - m_character = ch; - } - break; - - case cachedCharacterHyphen: - if (ch >= m_character) - m_delegate.atomCharacterClassRange(m_character, ch); - else - m_err = CharacterClassOutOfOrder; - m_state = empty; - } - } - - /* - * atomPatternCharacter(): - * - * Adds a pattern character, called by parseEscape(), as such will not - * interpret a hyphen as indicating a character range. - */ - void atomPatternCharacter(UChar ch) - { - // Flush if a character is already pending to prevent the - // hyphen from begin interpreted as indicating a range. - if((ch == '-') && (m_state == cachedCharacter)) - flush(); - - atomPatternCharacterUnescaped(ch); - } - - /* - * atomBuiltInCharacterClass(): - * - * Adds a built-in character class, called by parseEscape(). - */ - void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) - { - flush(); - m_delegate.atomCharacterClassBuiltIn(classID, invert); - } - - /* - * end(): - * - * Called at end of construction. - */ - void end() - { - flush(); - m_delegate.atomCharacterClassEnd(); - } - - // parseEscape() should never call these delegate methods when - // invoked with inCharacterClass set. - void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } - void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } - - private: - void flush() - { - if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen - m_delegate.atomCharacterClassAtom(m_character); - if (m_state == cachedCharacterHyphen) - m_delegate.atomCharacterClassAtom('-'); - m_state = empty; - } - - Delegate& m_delegate; - ErrorCode& m_err; - enum CharacterClassConstructionState { - empty, - cachedCharacter, - cachedCharacterHyphen, - } m_state; - UChar m_character; - }; - - Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) - : m_delegate(delegate) - , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.data()) - , m_size(pattern.size()) - , m_index(0) - , m_parenthesesNestingDepth(0) - { - } - - /* - * parseEscape(): - * - * Helper for parseTokens() AND parseCharacterClass(). - * Unlike the other parser methods, this function does not report tokens - * directly to the member delegate (m_delegate), instead tokens are - * emitted to the delegate provided as an argument. In the case of atom - * escapes, parseTokens() will call parseEscape() passing m_delegate as - * an argument, and as such the escape will be reported to the delegate. - * - * However this method may also be used by parseCharacterClass(), in which - * case a CharacterClassParserDelegate will be passed as the delegate that - * tokens should be added to. A boolean flag is also provided to indicate - * whether that an escape in a CharacterClass is being parsed (some parsing - * rules change in this context). - * - * The boolean value returned by this method indicates whether the token - * parsed was an atom (outside of a characted class \b and \B will be - * interpreted as assertions). - */ - template<bool inCharacterClass, class EscapeDelegate> - bool parseEscape(EscapeDelegate& delegate) - { - ASSERT(!m_err); - ASSERT(peek() == '\\'); - consume(); - - if (atEndOfPattern()) { - m_err = EscapeUnterminated; - return false; - } - - switch (peek()) { - // Assertions - case 'b': - consume(); - if (inCharacterClass) - delegate.atomPatternCharacter('\b'); - else { - delegate.assertionWordBoundary(false); - return false; - } - break; - case 'B': - consume(); - if (inCharacterClass) - delegate.atomPatternCharacter('B'); - else { - delegate.assertionWordBoundary(true); - return false; - } - break; - - // CharacterClassEscape - case 'd': - consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, false); - break; - case 's': - consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, false); - break; - case 'w': - consume(); - delegate.atomBuiltInCharacterClass(WordClassID, false); - break; - case 'D': - consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, true); - break; - case 'S': - consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, true); - break; - case 'W': - consume(); - delegate.atomBuiltInCharacterClass(WordClassID, true); - break; - - // DecimalEscape - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. - // First, try to parse this as backreference. - if (!inCharacterClass) { - ParseState state = saveState(); - - unsigned backReference = consumeNumber(); - if (backReference <= m_backReferenceLimit) { - delegate.atomBackReference(backReference); - break; - } - - restoreState(state); - } - - // Not a backreference, and not octal. - if (peek() >= '8') { - delegate.atomPatternCharacter('\\'); - break; - } - - // Fall-through to handle this as an octal escape. - } - - // Octal escape - case '0': - delegate.atomPatternCharacter(consumeOctal()); - break; - - // ControlEscape - case 'f': - consume(); - delegate.atomPatternCharacter('\f'); - break; - case 'n': - consume(); - delegate.atomPatternCharacter('\n'); - break; - case 'r': - consume(); - delegate.atomPatternCharacter('\r'); - break; - case 't': - consume(); - delegate.atomPatternCharacter('\t'); - break; - case 'v': - consume(); - delegate.atomPatternCharacter('\v'); - break; - - // ControlLetter - case 'c': { - ParseState state = saveState(); - consume(); - if (!atEndOfPattern()) { - int control = consume(); - - // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. - if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { - delegate.atomPatternCharacter(control & 0x1f); - break; - } - } - restoreState(state); - delegate.atomPatternCharacter('\\'); - break; - } - - // HexEscape - case 'x': { - consume(); - int x = tryConsumeHex(2); - if (x == -1) - delegate.atomPatternCharacter('x'); - else - delegate.atomPatternCharacter(x); - break; - } - - // UnicodeEscape - case 'u': { - consume(); - int u = tryConsumeHex(4); - if (u == -1) - delegate.atomPatternCharacter('u'); - else - delegate.atomPatternCharacter(u); - break; - } - - // IdentityEscape - default: - delegate.atomPatternCharacter(consume()); - } - - return true; - } - - /* - * parseAtomEscape(), parseCharacterClassEscape(): - * - * These methods alias to parseEscape(). - */ - bool parseAtomEscape() - { - return parseEscape<false>(m_delegate); - } - void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) - { - parseEscape<true>(delegate); - } - - /* - * parseCharacterClass(): - * - * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) - * to an instance of CharacterClassParserDelegate, to describe the character class to the - * delegate. - */ - void parseCharacterClass() - { - ASSERT(!m_err); - ASSERT(peek() == '['); - consume(); - - CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); - - characterClassConstructor.begin(tryConsume('^')); - - while (!atEndOfPattern()) { - switch (peek()) { - case ']': - consume(); - characterClassConstructor.end(); - return; - - case '\\': - parseCharacterClassEscape(characterClassConstructor); - break; - - default: - characterClassConstructor.atomPatternCharacterUnescaped(consume()); - } - - if (m_err) - return; - } - - m_err = CharacterClassUnmatched; - } - - /* - * parseParenthesesBegin(): - * - * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. - */ - void parseParenthesesBegin() - { - ASSERT(!m_err); - ASSERT(peek() == '('); - consume(); - - if (tryConsume('?')) { - if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; - return; - } - - switch (consume()) { - case ':': - m_delegate.atomParenthesesSubpatternBegin(false); - break; - - case '=': - m_delegate.atomParentheticalAssertionBegin(); - break; - - case '!': - m_delegate.atomParentheticalAssertionBegin(true); - break; - - default: - m_err = ParenthesesTypeInvalid; - } - } else - m_delegate.atomParenthesesSubpatternBegin(); - - ++m_parenthesesNestingDepth; - } - - /* - * parseParenthesesEnd(): - * - * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). - */ - void parseParenthesesEnd() - { - ASSERT(!m_err); - ASSERT(peek() == ')'); - consume(); - - if (m_parenthesesNestingDepth > 0) - m_delegate.atomParenthesesEnd(); - else - m_err = ParenthesesUnmatched; - - --m_parenthesesNestingDepth; - } - - /* - * parseQuantifier(): - * - * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. - */ - void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) - { - ASSERT(!m_err); - ASSERT(min <= max); - - if (lastTokenWasAnAtom) - m_delegate.quantifyAtom(min, max, !tryConsume('?')); - else - m_err = QuantifierWithoutAtom; - } - - /* - * parseTokens(): - * - * This method loops over the input pattern reporting tokens to the delegate. - * The method returns when a parse error is detected, or the end of the pattern - * is reached. One piece of state is tracked around the loop, which is whether - * the last token passed to the delegate was an atom (this is necessary to detect - * a parse error when a quantifier provided without an atom to quantify). - */ - void parseTokens() - { - bool lastTokenWasAnAtom = false; - - while (!atEndOfPattern()) { - switch (peek()) { - case '|': - consume(); - m_delegate.disjunction(); - lastTokenWasAnAtom = false; - break; - - case '(': - parseParenthesesBegin(); - lastTokenWasAnAtom = false; - break; - - case ')': - parseParenthesesEnd(); - lastTokenWasAnAtom = true; - break; - - case '^': - consume(); - m_delegate.assertionBOL(); - lastTokenWasAnAtom = false; - break; - - case '$': - consume(); - m_delegate.assertionEOL(); - lastTokenWasAnAtom = false; - break; - - case '.': - consume(); - m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); - lastTokenWasAnAtom = true; - break; - - case '[': - parseCharacterClass(); - lastTokenWasAnAtom = true; - break; - - case '\\': - lastTokenWasAnAtom = parseAtomEscape(); - break; - - case '*': - consume(); - parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); - lastTokenWasAnAtom = false; - break; - - case '+': - consume(); - parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); - lastTokenWasAnAtom = false; - break; - - case '?': - consume(); - parseQuantifier(lastTokenWasAnAtom, 0, 1); - lastTokenWasAnAtom = false; - break; - - case '{': { - ParseState state = saveState(); - - consume(); - if (peekIsDigit()) { - unsigned min = consumeNumber(); - unsigned max = min; - - if (tryConsume(',')) - max = peekIsDigit() ? consumeNumber() : UINT_MAX; - - if (tryConsume('}')) { - if (min <= max) - parseQuantifier(lastTokenWasAnAtom, min, max); - else - m_err = QuantifierOutOfOrder; - lastTokenWasAnAtom = false; - break; - } - } - - restoreState(state); - } // if we did not find a complete quantifer, fall through to the default case. - - default: - m_delegate.atomPatternCharacter(consume()); - lastTokenWasAnAtom = true; - } - - if (m_err) - return; - } - - if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; - } - - /* - * parse(): - * - * This method calls regexBegin(), calls parseTokens() to parse over the input - * patterns, calls regexEnd() or regexError() as appropriate, and converts any - * error code to a const char* for a result. - */ - const char* parse() - { - m_delegate.regexBegin(); - - if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; - else - parseTokens(); - ASSERT(atEndOfPattern() || m_err); - - if (m_err) - m_delegate.regexError(); - else - m_delegate.regexEnd(); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - "regular expression too large", - "numbers out of order in {} quantifier", - "nothing to repeat", - "missing )", - "unmatched parentheses", - "unrecognized character after (?", - "missing terminating ] for character class", - "range out of order in character class", - "\\ at end of pattern" - }; - - return errorMessages[m_err]; - } - - - // Misc helper functions: - - typedef unsigned ParseState; - - ParseState saveState() - { - return m_index; - } - - void restoreState(ParseState state) - { - m_index = state; - } - - bool atEndOfPattern() - { - ASSERT(m_index <= m_size); - return m_index == m_size; - } - - int peek() - { - ASSERT(m_index < m_size); - return m_data[m_index]; - } - - bool peekIsDigit() - { - return !atEndOfPattern() && WTF::isASCIIDigit(peek()); - } - - unsigned peekDigit() - { - ASSERT(peekIsDigit()); - return peek() - '0'; - } - - int consume() - { - ASSERT(m_index < m_size); - return m_data[m_index++]; - } - - unsigned consumeDigit() - { - ASSERT(peekIsDigit()); - return consume() - '0'; - } - - unsigned consumeNumber() - { - unsigned n = consumeDigit(); - // check for overflow. - for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { - n = newValue; - consume(); - } - return n; - } - - unsigned consumeOctal() - { - ASSERT(WTF::isASCIIOctalDigit(peek())); - - unsigned n = consumeDigit(); - while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) - n = n * 8 + consumeDigit(); - return n; - } - - bool tryConsume(UChar ch) - { - if (atEndOfPattern() || (m_data[m_index] != ch)) - return false; - ++m_index; - return true; - } - - int tryConsumeHex(int count) - { - ParseState state = saveState(); - - int n = 0; - while (count--) { - if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { - restoreState(state); - return -1; - } - n = (n << 4) | WTF::toASCIIHexValue(consume()); - } - return n; - } - - Delegate& m_delegate; - unsigned m_backReferenceLimit; - ErrorCode m_err; - const UChar* m_data; - unsigned m_size; - unsigned m_index; - unsigned m_parenthesesNestingDepth; - - // Derived by empirical testing of compile time in PCRE and WREC. - static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; -}; - -/* - * Yarr::parse(): - * - * The parse method is passed a pattern to be parsed and a delegate upon which - * callbacks will be made to record the parsed tokens forming the regex. - * Yarr::parse() returns null on success, or a const C string providing an error - * message where a parse error occurs. - * - * The Delegate must implement the following interface: - * - * void assertionBOL(); - * void assertionEOL(); - * void assertionWordBoundary(bool invert); - * - * void atomPatternCharacter(UChar ch); - * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); - * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) - * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) - * void atomCharacterClassEnd() - * void atomParenthesesSubpatternBegin(bool capture = true); - * void atomParentheticalAssertionBegin(bool invert = false); - * void atomParenthesesEnd(); - * void atomBackReference(unsigned subpatternId); - * - * void quantifyAtom(unsigned min, unsigned max, bool greedy); - * - * void disjunction(); - * - * void regexBegin(); - * void regexEnd(); - * void regexError(); - * - * Before any call recording tokens are made, regexBegin() will be called on the - * delegate once. Once parsing is complete either regexEnd() or regexError() will - * be called, as appropriate. - * - * The regular expression is described by a sequence of assertion*() and atom*() - * callbacks to the delegate, describing the terms in the regular expression. - * Following an atom a quantifyAtom() call may occur to indicate that the previous - * atom should be quantified. In the case of atoms described across multiple - * calls (parentheses and character classes) the call to quantifyAtom() will come - * after the call to the atom*End() method, never after atom*Begin(). - * - * Character classes may either be described by a single call to - * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. - * In the latter case, ...Begin() will be called, followed by a sequence of - * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). - * - * Sequences of atoms and assertions are broken into alternatives via calls to - * disjunction(). Assertions, atoms, and disjunctions emitted between calls to - * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. - * atomParenthesesBegin() is passed a subpatternId. In the case of a regular - * capturing subpattern, this will be the subpatternId associated with these - * parentheses, and will also by definition be the lowest subpatternId of these - * parentheses and of any nested paretheses. The atomParenthesesEnd() method - * is passed the subpatternId of the last capturing subexpression nested within - * these paretheses. In the case of a capturing subpattern with no nested - * capturing subpatterns, the same subpatternId will be passed to the begin and - * end functions. In the case of non-capturing subpatterns the subpatternId - * passed to the begin method is also the first possible subpatternId that might - * be nested within these paretheses. If a set of non-capturing parentheses does - * not contain any capturing subpatterns, then the subpatternId passed to begin - * will be greater than the subpatternId passed to end. - */ - -template<class Delegate> -const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) -{ - return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); -} - -} } // namespace JSC::Yarr - -#endif - -#endif // RegexParser_h |