/* * Copyright (C) 2009 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "RegexInterpreter.h" #include "RegexCompiler.h" #include "RegexPattern.h" #ifndef NDEBUG #include #endif #if ENABLE(YARR) using namespace WTF; namespace JSC { namespace Yarr { class Interpreter { public: struct ParenthesesDisjunctionContext; struct BackTrackInfoPatternCharacter { uintptr_t matchAmount; }; struct BackTrackInfoCharacterClass { uintptr_t matchAmount; }; struct BackTrackInfoBackReference { uintptr_t begin; // Not really needed for greedy quantifiers. uintptr_t matchAmount; // Not really needed for fixed quantifiers. }; struct BackTrackInfoAlternative { uintptr_t offset; }; struct BackTrackInfoParentheticalAssertion { uintptr_t begin; }; struct BackTrackInfoParenthesesOnce { uintptr_t inParentheses; }; struct BackTrackInfoParentheses { uintptr_t matchAmount; ParenthesesDisjunctionContext* lastContext; uintptr_t prevBegin; uintptr_t prevEnd; }; static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) { context->next = backTrack->lastContext; backTrack->lastContext = context; ++backTrack->matchAmount; } static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) { ASSERT(backTrack->matchAmount); ASSERT(backTrack->lastContext); backTrack->lastContext = backTrack->lastContext->next; --backTrack->matchAmount; } struct DisjunctionContext { DisjunctionContext() : term(0) { } void* operator new(size_t, void* where) { return where; } int term; unsigned matchBegin; unsigned matchEnd; uintptr_t frame[1]; }; DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) { return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); } void freeDisjunctionContext(DisjunctionContext* context) { free(context); } struct ParenthesesDisjunctionContext { ParenthesesDisjunctionContext(int* output, ByteTerm& term) : next(0) { unsigned firstSubpatternId = term.atom.subpatternId; unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; output[(firstSubpatternId << 1) + i] = -1; } new(getDisjunctionContext(term)) DisjunctionContext(); } void* operator new(size_t, void* where) { return where; } void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) { for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; } DisjunctionContext* getDisjunctionContext(ByteTerm& term) { return reinterpret_cast(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); } ParenthesesDisjunctionContext* next; int subpatternBackup[1]; }; ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) { return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); } void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) { free(context); } class InputStream { public: InputStream(const UChar* input, unsigned start, unsigned length) : input(input) , pos(start) , length(length) { } void next() { ++pos; } void rewind(unsigned amount) { ASSERT(pos >= amount); pos -= amount; } int read() { ASSERT(pos < length); if (pos < length) return input[pos]; return -1; } int readChecked(int position) { ASSERT(position < 0); ASSERT((unsigned)-position <= pos); unsigned p = pos + position; ASSERT(p < length); return input[p]; } int reread(unsigned from) { ASSERT(from < length); return input[from]; } int prev() { ASSERT(!(pos > length)); if (pos && length) return input[pos - 1]; return -1; } unsigned getPos() { return pos; } void setPos(unsigned p) { pos = p; } bool atStart() { return pos == 0; } bool atEnd() { return pos == length; } bool checkInput(int count) { if ((pos + count) <= length) { pos += count; return true; } else return false; } void uncheckInput(int count) { pos -= count; } bool atStart(int position) { return (pos + position) == 0; } bool atEnd(int position) { return (pos + position) == length; } private: const UChar* input; unsigned pos; unsigned length; }; bool testCharacterClass(CharacterClass* characterClass, int ch) { if (ch & 0xFF80) { for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) if (ch == characterClass->m_matchesUnicode[i]) return true; for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) return true; } else { for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) if (ch == characterClass->m_matches[i]) return true; for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) return true; } return false; } bool tryConsumeCharacter(int testChar) { if (input.atEnd()) return false; int ch = input.read(); if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) { input.next(); return true; } return false; } bool checkCharacter(int testChar, int inputPosition) { return testChar == input.readChecked(inputPosition); } bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) { int ch = input.readChecked(inputPosition); return (loChar == ch) || (hiChar == ch); } bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) { if (input.atEnd()) return false; bool match = testCharacterClass(characterClass, input.read()); if (invert) match = !match; if (match) { input.next(); return true; } return false; } bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) { bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); return invert ? !match : match; } bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) { int matchSize = matchEnd - matchBegin; if (!input.checkInput(matchSize)) return false; for (int i = 0; i < matchSize; ++i) { if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { input.uncheckInput(matchSize); return false; } } return true; } bool matchAssertionBOL(ByteTerm& term) { return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); } bool matchAssertionEOL(ByteTerm& term) { if (term.inputPosition) return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); else return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); } bool matchAssertionWordBoundary(ByteTerm& term) { bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); bool readIsWordchar; if (term.inputPosition) readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); else readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); bool wordBoundary = prevIsWordchar != readIsWordchar; return term.invert() ? !wordBoundary : wordBoundary; } bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: break; case QuantifierGreedy: if (backTrack->matchAmount) { --backTrack->matchAmount; input.uncheckInput(1); return true; } break; case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) return true; } input.uncheckInput(backTrack->matchAmount); break; } return false; } bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: break; case QuantifierGreedy: if (backTrack->matchAmount) { --backTrack->matchAmount; input.uncheckInput(1); return true; } break; case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) return true; } input.uncheckInput(backTrack->matchAmount); break; } return false; } bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeCharacterClass); BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: { for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) return false; } return true; } case QuantifierGreedy: { unsigned matchAmount = 0; while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { input.uncheckInput(1); break; } ++matchAmount; } backTrack->matchAmount = matchAmount; return true; } case QuantifierNonGreedy: backTrack->matchAmount = 0; return true; } ASSERT_NOT_REACHED(); return false; } bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeCharacterClass); BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: break; case QuantifierGreedy: if (backTrack->matchAmount) { --backTrack->matchAmount; input.uncheckInput(1); return true; } break; case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) return true; } input.uncheckInput(backTrack->matchAmount); break; } return false; } bool matchBackReference(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeBackReference); BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); int matchBegin = output[(term.atom.subpatternId << 1)]; int matchEnd = output[(term.atom.subpatternId << 1) + 1]; ASSERT((matchBegin == -1) == (matchEnd == -1)); ASSERT(matchBegin <= matchEnd); if (matchBegin == matchEnd) return true; switch (term.atom.quantityType) { case QuantifierFixedCount: { backTrack->begin = input.getPos(); for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { input.setPos(backTrack->begin); return false; } } return true; } case QuantifierGreedy: { unsigned matchAmount = 0; while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) ++matchAmount; backTrack->matchAmount = matchAmount; return true; } case QuantifierNonGreedy: backTrack->begin = input.getPos(); backTrack->matchAmount = 0; return true; } ASSERT_NOT_REACHED(); return false; } bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeBackReference); BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); int matchBegin = output[(term.atom.subpatternId << 1)]; int matchEnd = output[(term.atom.subpatternId << 1) + 1]; ASSERT((matchBegin == -1) == (matchEnd == -1)); ASSERT(matchBegin <= matchEnd); if (matchBegin == matchEnd) return false; switch (term.atom.quantityType) { case QuantifierFixedCount: // for quantityCount == 1, could rewind. input.setPos(backTrack->begin); break; case QuantifierGreedy: if (backTrack->matchAmount) { --backTrack->matchAmount; input.rewind(matchEnd - matchBegin); return true; } break; case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { ++backTrack->matchAmount; return true; } else input.setPos(backTrack->begin); break; } return false; } void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) { if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; } } void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) { unsigned firstSubpatternId = term.atom.subpatternId; unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; context->restoreOutput(output, firstSubpatternId, count); } void resetAssertionMatches(ByteTerm& term) { unsigned firstSubpatternId = term.atom.subpatternId; unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; for (unsigned i = 0; i < (count << 1); ++i) output[(firstSubpatternId << 1) + i] = -1; } bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) { while (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) return true; resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); } return false; } bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); ASSERT(term.atom.quantityCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierGreedy: { // set this speculatively; if we get to the parens end this will be true. backTrack->inParentheses = 1; break; } case QuantifierNonGreedy: { backTrack->inParentheses = 0; context->term += term.atom.parenthesesWidth; return true; } case QuantifierFixedCount: break; } if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1)] = input.getPos() + term.inputPosition; } return true; } bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); ASSERT(term.atom.quantityCount == 1); if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; } return true; } bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); ASSERT(term.atom.quantityCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1)] = -1; output[(subpatternId << 1) + 1] = -1; } switch (term.atom.quantityType) { case QuantifierGreedy: // if we backtrack to this point, there is another chance - try matching nothing. ASSERT(backTrack->inParentheses); backTrack->inParentheses = 0; context->term += term.atom.parenthesesWidth; return true; case QuantifierNonGreedy: ASSERT(backTrack->inParentheses); case QuantifierFixedCount: break; } return false; } bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); ASSERT(term.atom.quantityCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierGreedy: if (!backTrack->inParentheses) { context->term -= term.atom.parenthesesWidth; return false; } case QuantifierNonGreedy: if (!backTrack->inParentheses) { // now try to match the parens; set this speculatively. backTrack->inParentheses = 1; if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; } context->term -= term.atom.parenthesesWidth; return true; } case QuantifierFixedCount: break; } return false; } bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); ASSERT(term.atom.quantityCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); backTrack->begin = input.getPos(); return true; } bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); ASSERT(term.atom.quantityCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); input.setPos(backTrack->begin); // We've reached the end of the parens; if they are inverted, this is failure. if (term.invert()) { context->term -= term.atom.parenthesesWidth; return false; } return true; } bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); ASSERT(term.atom.quantityCount == 1); // We've failed to match parens; if they are inverted, this is win! if (term.invert()) { context->term += term.atom.parenthesesWidth; return true; } return false; } bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); ASSERT(term.atom.quantityCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); input.setPos(backTrack->begin); context->term -= term.atom.parenthesesWidth; return false; } bool matchParentheses(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); unsigned subpatternId = term.atom.subpatternId; ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; backTrack->prevBegin = output[(subpatternId << 1)]; backTrack->prevEnd = output[(subpatternId << 1) + 1]; backTrack->matchAmount = 0; backTrack->lastContext = 0; switch (term.atom.quantityType) { case QuantifierFixedCount: { // While we haven't yet reached our fixed limit, while (backTrack->matchAmount < term.atom.quantityCount) { // Try to do a match, and it it succeeds, add it to the list. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); if (!parenthesesDoBacktrack(term, backTrack)) return false; } } ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); return true; } case QuantifierGreedy: { while (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) appendParenthesesDisjunctionContext(backTrack, context); else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); break; } } if (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); } return true; } case QuantifierNonGreedy: return true; } ASSERT_NOT_REACHED(); return false; } // Rules for backtracking differ depending on whether this is greedy or non-greedy. // // Greedy matches never should try just adding more - you should already have done // the 'more' cases. Always backtrack, at least a leetle bit. However cases where // you backtrack an item off the list needs checking, since we'll never have matched // the one less case. Tracking forwards, still add as much as possible. // // Non-greedy, we've already done the one less case, so don't match on popping. // We haven't done the one more case, so always try to add that. // bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; output[(subpatternId << 1)] = backTrack->prevBegin; output[(subpatternId << 1) + 1] = backTrack->prevEnd; } ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; switch (term.atom.quantityType) { case QuantifierFixedCount: { ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = 0; if (!parenthesesDoBacktrack(term, backTrack)) return false; // While we haven't yet reached our fixed limit, while (backTrack->matchAmount < term.atom.quantityCount) { // Try to do a match, and it it succeeds, add it to the list. context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); if (!parenthesesDoBacktrack(term, backTrack)) return false; } } ASSERT(backTrack->matchAmount == term.atom.quantityCount); context = backTrack->lastContext; recordParenthesesMatch(term, context); return true; } case QuantifierGreedy: { if (!backTrack->matchAmount) return false; ParenthesesDisjunctionContext* context = backTrack->lastContext; if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { while (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) appendParenthesesDisjunctionContext(backTrack, context); else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); break; } } } else { resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); } if (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); } return true; } case QuantifierNonGreedy: { // If we've not reached the limit, try to add one more match. if (backTrack->matchAmount < term.atom.quantityCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { appendParenthesesDisjunctionContext(backTrack, context); recordParenthesesMatch(term, context); return true; } else { resetMatches(term, context); freeParenthesesDisjunctionContext(context); } } // Nope - okay backtrack looking for an alternative. while (backTrack->matchAmount) { ParenthesesDisjunctionContext* context = backTrack->lastContext; if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { // successful backtrack! we're back in the game! if (backTrack->matchAmount) { context = backTrack->lastContext; recordParenthesesMatch(term, context); } return true; } // pop a match off the stack resetMatches(term, context); popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); } return false; } } ASSERT_NOT_REACHED(); return false; } #define MATCH_NEXT() { ++context->term; goto matchAgain; } #define BACKTRACK() { --context->term; goto backtrack; } #define currentTerm() (disjunction->terms[context->term]) bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) { if (btrack) BACKTRACK(); context->matchBegin = input.getPos(); context->term = 0; matchAgain: ASSERT(context->term < static_cast(disjunction->terms.size())); switch (currentTerm().type) { case ByteTerm::TypeSubpatternBegin: MATCH_NEXT(); case ByteTerm::TypeSubpatternEnd: context->matchEnd = input.getPos(); return true; case ByteTerm::TypeBodyAlternativeBegin: MATCH_NEXT(); case ByteTerm::TypeBodyAlternativeDisjunction: case ByteTerm::TypeBodyAlternativeEnd: context->matchEnd = input.getPos(); return true; case ByteTerm::TypeAlternativeBegin: MATCH_NEXT(); case ByteTerm::TypeAlternativeDisjunction: case ByteTerm::TypeAlternativeEnd: { int offset = currentTerm().alternative.end; BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); backTrack->offset = offset; context->term += offset; MATCH_NEXT(); } case ByteTerm::TypeAssertionBOL: if (matchAssertionBOL(currentTerm())) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeAssertionEOL: if (matchAssertionEOL(currentTerm())) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeAssertionWordBoundary: if (matchAssertionWordBoundary(currentTerm())) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypePatternCharacterOnce: case ByteTerm::TypePatternCharacterFixed: { for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) BACKTRACK(); } MATCH_NEXT(); } case ByteTerm::TypePatternCharacterGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); unsigned matchAmount = 0; while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { input.uncheckInput(1); break; } ++matchAmount; } backTrack->matchAmount = matchAmount; MATCH_NEXT(); } case ByteTerm::TypePatternCharacterNonGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); backTrack->matchAmount = 0; MATCH_NEXT(); } case ByteTerm::TypePatternCasedCharacterOnce: case ByteTerm::TypePatternCasedCharacterFixed: { for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) BACKTRACK(); } MATCH_NEXT(); } case ByteTerm::TypePatternCasedCharacterGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); unsigned matchAmount = 0; while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { input.uncheckInput(1); break; } ++matchAmount; } backTrack->matchAmount = matchAmount; MATCH_NEXT(); } case ByteTerm::TypePatternCasedCharacterNonGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); backTrack->matchAmount = 0; MATCH_NEXT(); } case ByteTerm::TypeCharacterClass: if (matchCharacterClass(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeBackReference: if (matchBackReference(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParenthesesSubpattern: if (matchParentheses(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParenthesesSubpatternOnceBegin: if (matchParenthesesOnceBegin(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParenthesesSubpatternOnceEnd: if (matchParenthesesOnceEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParentheticalAssertionBegin: if (matchParentheticalAssertionBegin(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParentheticalAssertionEnd: if (matchParentheticalAssertionEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeCheckInput: if (input.checkInput(currentTerm().checkInputCount)) MATCH_NEXT(); BACKTRACK(); } // We should never fall-through to here. ASSERT_NOT_REACHED(); backtrack: ASSERT(context->term < static_cast(disjunction->terms.size())); switch (currentTerm().type) { case ByteTerm::TypeSubpatternBegin: return false; case ByteTerm::TypeSubpatternEnd: ASSERT_NOT_REACHED(); case ByteTerm::TypeBodyAlternativeBegin: case ByteTerm::TypeBodyAlternativeDisjunction: { int offset = currentTerm().alternative.next; context->term += offset; if (offset > 0) MATCH_NEXT(); if (input.atEnd()) return false; input.next(); context->matchBegin = input.getPos(); MATCH_NEXT(); } case ByteTerm::TypeBodyAlternativeEnd: ASSERT_NOT_REACHED(); case ByteTerm::TypeAlternativeBegin: case ByteTerm::TypeAlternativeDisjunction: { int offset = currentTerm().alternative.next; context->term += offset; if (offset > 0) MATCH_NEXT(); BACKTRACK(); } case ByteTerm::TypeAlternativeEnd: { // We should never backtrack back into an alternative of the main body of the regex. BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); unsigned offset = backTrack->offset; context->term -= offset; BACKTRACK(); } case ByteTerm::TypeAssertionBOL: case ByteTerm::TypeAssertionEOL: case ByteTerm::TypeAssertionWordBoundary: BACKTRACK(); case ByteTerm::TypePatternCharacterOnce: case ByteTerm::TypePatternCharacterFixed: case ByteTerm::TypePatternCharacterGreedy: case ByteTerm::TypePatternCharacterNonGreedy: if (backtrackPatternCharacter(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypePatternCasedCharacterOnce: case ByteTerm::TypePatternCasedCharacterFixed: case ByteTerm::TypePatternCasedCharacterGreedy: case ByteTerm::TypePatternCasedCharacterNonGreedy: if (backtrackPatternCasedCharacter(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeCharacterClass: if (backtrackCharacterClass(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeBackReference: if (backtrackBackReference(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParenthesesSubpattern: if (backtrackParentheses(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParenthesesSubpatternOnceBegin: if (backtrackParenthesesOnceBegin(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParenthesesSubpatternOnceEnd: if (backtrackParenthesesOnceEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParentheticalAssertionBegin: if (backtrackParentheticalAssertionBegin(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeParentheticalAssertionEnd: if (backtrackParentheticalAssertionEnd(currentTerm(), context)) MATCH_NEXT(); BACKTRACK(); case ByteTerm::TypeCheckInput: input.uncheckInput(currentTerm().checkInputCount); BACKTRACK(); } ASSERT_NOT_REACHED(); return false; } bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) { if (matchDisjunction(disjunction, context, btrack)) { while (context->matchBegin == context->matchEnd) { if (!matchDisjunction(disjunction, context, true)) return false; } return true; } return false; } int interpret() { for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) output[i] = -1; DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); if (matchDisjunction(pattern->m_body.get(), context)) { output[0] = context->matchBegin; output[1] = context->matchEnd; } freeDisjunctionContext(context); return output[0]; } Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) : pattern(pattern) , output(output) , input(inputChar, start, length) { } private: BytecodePattern *pattern; int* output; InputStream input; }; class ByteCompiler { struct ParenthesesStackEntry { unsigned beginTerm; unsigned savedAlternativeIndex; ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) : beginTerm(beginTerm) , savedAlternativeIndex(savedAlternativeIndex) { } }; public: ByteCompiler(RegexPattern& pattern) : m_pattern(pattern) { m_bodyDisjunction = 0; m_currentAlternativeIndex = 0; } BytecodePattern* compile() { regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); emitDisjunction(m_pattern.m_body); regexEnd(); return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); } void checkInput(unsigned count) { m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); } void assertionBOL(int inputPosition) { m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); } void assertionEOL(int inputPosition) { m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); } void assertionWordBoundary(bool invert, int inputPosition) { m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); } void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) { if (m_pattern.m_ignoreCase) { UChar lo = Unicode::toLower(ch); UChar hi = Unicode::toUpper(ch); if (lo != hi) { m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); return; } } m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); } void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) { m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) { ASSERT(subpatternId); m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); m_currentAlternativeIndex = beginTerm + 1; } void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) { int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); m_currentAlternativeIndex = beginTerm + 1; } unsigned popParenthesesStack() { ASSERT(m_parenthesesStack.size()); int stackEnd = m_parenthesesStack.size() - 1; unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; m_parenthesesStack.shrink(stackEnd); ASSERT(beginTerm < m_bodyDisjunction->terms.size()); ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); return beginTerm; } #ifndef NDEBUG void dumpDisjunction(ByteDisjunction* disjunction) { printf("ByteDisjunction(%p):\n\t", disjunction); for (unsigned i = 0; i < disjunction->terms.size(); ++i) printf("{ %d } ", disjunction->terms[i].type); printf("\n"); } #endif void closeAlternative(int beginTerm) { int origBeginTerm = beginTerm; ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); int endIndex = m_bodyDisjunction->terms.size(); unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; if (!m_bodyDisjunction->terms[beginTerm].alternative.next) m_bodyDisjunction->terms.remove(beginTerm); else { while (m_bodyDisjunction->terms[beginTerm].alternative.next) { beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; } m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } } void closeBodyAlternative() { int beginTerm = 0; int origBeginTerm = 0; ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); int endIndex = m_bodyDisjunction->terms.size(); unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; while (m_bodyDisjunction->terms[beginTerm].alternative.next) { beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; } m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); unsigned endTerm = m_bodyDisjunction->terms.size(); bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; if (doInline) { m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } else { ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); bool invertOrCapture = parenthesesBegin.invertOrCapture; unsigned subpatternId = parenthesesBegin.atom.subpatternId; unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); m_bodyDisjunction->terms.shrink(beginTerm); m_allParenthesesInfo.append(parenthesesDisjunction); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; } } void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) { m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); m_bodyDisjunction->terms[0].frameLocation = 0; m_currentAlternativeIndex = 0; } void regexEnd() { closeBodyAlternative(); } void alternativeBodyDisjunction() { int newAlternativeIndex = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); m_currentAlternativeIndex = newAlternativeIndex; } void alternativeDisjunction() { int newAlternativeIndex = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); m_currentAlternativeIndex = newAlternativeIndex; } void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) { for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; if (alt) { if (disjunction == m_pattern.m_body) alternativeBodyDisjunction(); else alternativeDisjunction(); } PatternAlternative* alternative = disjunction->m_alternatives[alt]; unsigned minimumSize = alternative->m_minimumSize; ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; if (countToCheck) checkInput(countToCheck); currentCountAlreadyChecked += countToCheck; for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { PatternTerm& term = alternative->m_terms[i]; switch (term.type) { case PatternTerm::TypeAssertionBOL: assertionBOL(term.inputPosition - currentCountAlreadyChecked); break; case PatternTerm::TypeAssertionEOL: assertionEOL(term.inputPosition - currentCountAlreadyChecked); break; case PatternTerm::TypeAssertionWordBoundary: assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); break; case PatternTerm::TypePatternCharacter: atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeCharacterClass: atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeBackReference: atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeForwardReference: break; case PatternTerm::TypeParenthesesSubpattern: { unsigned disjunctionAlreadyCheckedCount = 0; if ((term.quantityCount == 1) && !term.parentheses.isCopy) { if (term.quantityType == QuantifierFixedCount) { disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } else { unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } } else { unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } break; } case PatternTerm::TypeParentheticalAssertion: { unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); break; } } } } } private: RegexPattern& m_pattern; ByteDisjunction* m_bodyDisjunction; unsigned m_currentAlternativeIndex; Vector m_parenthesesStack; Vector m_allParenthesesInfo; }; BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) { RegexPattern pattern(ignoreCase, multiline); if ((error = compileRegex(patternString, pattern))) return 0; numSubpatterns = pattern.m_numSubpatterns; return ByteCompiler(pattern).compile(); } int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) { return Interpreter(regex, output, input, start, length).interpret(); } COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); } } #endif