summaryrefslogtreecommitdiff
path: root/Source/JavaScriptCore/yarr/YarrPattern.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Source/JavaScriptCore/yarr/YarrPattern.cpp')
-rw-r--r--Source/JavaScriptCore/yarr/YarrPattern.cpp309
1 files changed, 203 insertions, 106 deletions
diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp
index 7ed9d3c30..386b33728 100644
--- a/Source/JavaScriptCore/yarr/YarrPattern.cpp
+++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
+ * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
@@ -28,9 +28,10 @@
#include "YarrPattern.h"
#include "Yarr.h"
-#include "YarrCanonicalizeUCS2.h"
+#include "YarrCanonicalize.h"
#include "YarrParser.h"
#include <wtf/Vector.h>
+#include <wtf/WTFThreadData.h>
using namespace WTF;
@@ -40,8 +41,9 @@ namespace JSC { namespace Yarr {
class CharacterClassConstructor {
public:
- CharacterClassConstructor(bool isCaseInsensitive = false)
+ CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
: m_isCaseInsensitive(isCaseInsensitive)
+ , m_canonicalMode(canonicalMode)
{
}
@@ -65,11 +67,16 @@ public:
addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
}
- void putChar(UChar ch)
+ void putChar(UChar32 ch)
{
- // Handle ascii cases.
- if (ch <= 0x7f) {
- if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
+ if (!m_isCaseInsensitive) {
+ addSorted(ch);
+ return;
+ }
+
+ if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
+ // Handle ASCII cases.
+ if (isASCIIAlpha(ch)) {
addSorted(m_matches, toASCIIUpper(ch));
addSorted(m_matches, toASCIILower(ch));
} else
@@ -77,40 +84,33 @@ public:
return;
}
- // Simple case, not a case-insensitive match.
- if (!m_isCaseInsensitive) {
- addSorted(m_matchesUnicode, ch);
- return;
- }
-
// Add multiple matches, if necessary.
- const UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
if (info->type == CanonicalizeUnique)
- addSorted(m_matchesUnicode, ch);
+ addSorted(ch);
else
putUnicodeIgnoreCase(ch, info);
}
- void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info)
+ void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
{
ASSERT(m_isCaseInsensitive);
- ASSERT(ch > 0x7f);
ASSERT(ch >= info->begin && ch <= info->end);
ASSERT(info->type != CanonicalizeUnique);
if (info->type == CanonicalizeSet) {
- for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
- addSorted(m_matchesUnicode, ch);
+ for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
+ addSorted(ch);
} else {
- addSorted(m_matchesUnicode, ch);
- addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+ addSorted(ch);
+ addSorted(getCanonicalPair(info, ch));
}
}
- void putRange(UChar lo, UChar hi)
+ void putRange(UChar32 lo, UChar32 hi)
{
- if (lo <= 0x7f) {
+ if (isASCII(lo)) {
char asciiLo = lo;
- char asciiHi = std::min(hi, (UChar)0x7f);
+ char asciiHi = std::min(hi, (UChar32)0x7f);
addSortedRange(m_ranges, lo, asciiHi);
if (m_isCaseInsensitive) {
@@ -120,19 +120,19 @@ public:
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
- if (hi <= 0x7f)
+ if (isASCII(hi))
return;
- lo = std::max(lo, (UChar)0x80);
+ lo = std::max(lo, (UChar32)0x80);
addSortedRange(m_rangesUnicode, lo, hi);
if (!m_isCaseInsensitive)
return;
- const UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
while (true) {
// Handle the range [lo .. end]
- UChar end = std::min<UChar>(info->end, hi);
+ UChar32 end = std::min<UChar32>(info->end, hi);
switch (info->type) {
case CanonicalizeUnique:
@@ -140,7 +140,7 @@ public:
break;
case CanonicalizeSet: {
UChar ch;
- for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
addSorted(m_matchesUnicode, ch);
break;
}
@@ -175,20 +175,25 @@ public:
}
- PassOwnPtr<CharacterClass> charClass()
+ std::unique_ptr<CharacterClass> charClass()
{
- OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass);
+ auto characterClass = std::make_unique<CharacterClass>();
characterClass->m_matches.swap(m_matches);
characterClass->m_ranges.swap(m_ranges);
characterClass->m_matchesUnicode.swap(m_matchesUnicode);
characterClass->m_rangesUnicode.swap(m_rangesUnicode);
- return characterClass.release();
+ return characterClass;
}
private:
- void addSorted(Vector<UChar>& matches, UChar ch)
+ void addSorted(UChar32 ch)
+ {
+ addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
+ }
+
+ void addSorted(Vector<UChar32>& matches, UChar32 ch)
{
unsigned pos = 0;
unsigned range = matches.size();
@@ -214,7 +219,7 @@ private:
matches.insert(pos, ch);
}
- void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
+ void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
{
unsigned end = ranges.size();
@@ -260,24 +265,26 @@ private:
}
bool m_isCaseInsensitive;
+ CanonicalMode m_canonicalMode;
- Vector<UChar> m_matches;
+ Vector<UChar32> m_matches;
Vector<CharacterRange> m_ranges;
- Vector<UChar> m_matchesUnicode;
+ Vector<UChar32> m_matchesUnicode;
Vector<CharacterRange> m_rangesUnicode;
};
class YarrPatternConstructor {
public:
- YarrPatternConstructor(YarrPattern& pattern)
+ YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
: m_pattern(pattern)
- , m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
+ , m_stackLimit(stackLimit)
, m_invertParentheticalAssertion(false)
{
- OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
+ auto body = std::make_unique<PatternDisjunction>();
m_pattern.m_body = body.get();
m_alternative = body->addNewAlternative();
- m_pattern.m_disjunctions.append(body.release());
+ m_pattern.m_disjunctions.append(WTFMove(body));
}
~YarrPatternConstructor()
@@ -289,15 +296,15 @@ public:
m_pattern.reset();
m_characterClassConstructor.reset();
- OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
+ auto body = std::make_unique<PatternDisjunction>();
m_pattern.m_body = body.get();
m_alternative = body->addNewAlternative();
- m_pattern.m_disjunctions.append(body.release());
+ m_pattern.m_disjunctions.append(WTFMove(body));
}
void assertionBOL()
{
- if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
+ if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
m_alternative->m_startsWithBOL = true;
m_alternative->m_containsBOL = true;
m_pattern.m_containsBOL = true;
@@ -313,25 +320,25 @@ public:
m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
}
- void atomPatternCharacter(UChar ch)
+ void atomPatternCharacter(UChar32 ch)
{
// We handle case-insensitive checking of unicode characters which do have both
// cases by handling them as if they were defined using a CharacterClass.
- if (!m_pattern.m_ignoreCase || isASCII(ch)) {
+ if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
m_alternative->m_terms.append(PatternTerm(ch));
return;
}
- const UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
if (info->type == CanonicalizeUnique) {
m_alternative->m_terms.append(PatternTerm(ch));
return;
}
m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
- OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
+ auto newCharacterClass = m_characterClassConstructor.charClass();
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
- m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
+ m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
@@ -344,7 +351,10 @@ public:
m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
break;
case WordClassID:
- m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+ if (m_pattern.unicode() && m_pattern.ignoreCase())
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
+ else
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
break;
case NewlineClassID:
m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
@@ -357,12 +367,12 @@ public:
m_invertCharacterClass = invert;
}
- void atomCharacterClassAtom(UChar ch)
+ void atomCharacterClassAtom(UChar32 ch)
{
m_characterClassConstructor.putChar(ch);
}
- void atomCharacterClassRange(UChar begin, UChar end)
+ void atomCharacterClassRange(UChar32 begin, UChar32 end)
{
m_characterClassConstructor.putRange(begin, end);
}
@@ -381,7 +391,10 @@ public:
break;
case WordClassID:
- m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
+ if (m_pattern.unicode() && m_pattern.ignoreCase())
+ m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
+ else
+ m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
break;
default:
@@ -391,9 +404,9 @@ public:
void atomCharacterClassEnd()
{
- OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
+ auto newCharacterClass = m_characterClassConstructor.charClass();
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
- m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
+ m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
void atomParenthesesSubpatternBegin(bool capture = true)
@@ -402,19 +415,19 @@ public:
if (capture)
m_pattern.m_numSubpatterns++;
- OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
+ auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
m_alternative = parenthesesDisjunction->addNewAlternative();
- m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
+ m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
}
void atomParentheticalAssertionBegin(bool invert = false)
{
- OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
+ auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
m_alternative = parenthesesDisjunction->addNewAlternative();
m_invertParentheticalAssertion = invert;
- m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
+ m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
}
void atomParenthesesEnd()
@@ -479,12 +492,12 @@ public:
// skip alternatives with m_startsWithBOL set true.
PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
{
- OwnPtr<PatternDisjunction> newDisjunction;
+ std::unique_ptr<PatternDisjunction> newDisjunction;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
if (!newDisjunction) {
- newDisjunction = adoptPtr(new PatternDisjunction());
+ newDisjunction = std::make_unique<PatternDisjunction>();
newDisjunction->m_parent = disjunction->m_parent;
}
PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
@@ -498,7 +511,7 @@ public:
return 0;
PatternDisjunction* copiedDisjunction = newDisjunction.get();
- m_pattern.m_disjunctions.append(newDisjunction.release());
+ m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
return copiedDisjunction;
}
@@ -509,6 +522,7 @@ public:
PatternTerm termCopy = term;
termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
+ m_pattern.m_hasCopiedParenSubexpressions = true;
return termCopy;
}
@@ -524,7 +538,7 @@ public:
PatternTerm& term = m_alternative->lastTerm();
ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
- ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
+ ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
if (term.type == PatternTerm::TypeParentheticalAssertion) {
// If an assertion is quantified with a minimum count of zero, it can simply be removed.
@@ -546,12 +560,12 @@ public:
return;
}
- if (min == 0)
- term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
- else if (min == max)
- term.quantify(min, QuantifierFixedCount);
+ if (min == max)
+ term.quantify(min, max, QuantifierFixedCount);
+ else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
+ term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
else {
- term.quantify(min, QuantifierFixedCount);
+ term.quantify(min, min, QuantifierFixedCount);
m_alternative->m_terms.append(copyTerm(term));
// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
@@ -565,10 +579,14 @@ public:
m_alternative = m_alternative->m_parent->addNewAlternative();
}
- unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
+ YarrPattern::ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
{
+ if (UNLIKELY(!isSafeToRecurse()))
+ return YarrPattern::TooManyDisjunctions;
+
+ YarrPattern::ErrorCode error = YarrPattern::NoError;
alternative->m_hasFixedSize = true;
- Checked<unsigned> currentInputPosition = initialInputPosition;
+ Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
PatternTerm& term = alternative->m_terms[i];
@@ -596,8 +614,10 @@ public:
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
alternative->m_hasFixedSize = false;
+ } else if (m_pattern.unicode()) {
+ currentInputPosition += U16_LENGTH(term.patternCharacter) * term.quantityMaxCount;
} else
- currentInputPosition += term.quantityCount;
+ currentInputPosition += term.quantityMaxCount;
break;
case PatternTerm::TypeCharacterClass:
@@ -606,28 +626,40 @@ public:
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
alternative->m_hasFixedSize = false;
+ } else if (m_pattern.unicode()) {
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
+ currentInputPosition += term.quantityMaxCount;
+ alternative->m_hasFixedSize = false;
} else
- currentInputPosition += term.quantityCount;
+ currentInputPosition += term.quantityMaxCount;
break;
case PatternTerm::TypeParenthesesSubpattern:
// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
term.frameLocation = currentCallFrameSize;
- if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
if (term.quantityType != QuantifierFixedCount)
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (error)
+ return error;
// If quantity is fixed, then pre-check its minimum size.
if (term.quantityType == QuantifierFixedCount)
currentInputPosition += term.parentheses.disjunction->m_minimumSize;
term.inputPosition = currentInputPosition.unsafeGet();
} else if (term.parentheses.isTerminal) {
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (error)
+ return error;
term.inputPosition = currentInputPosition.unsafeGet();
} else {
term.inputPosition = currentInputPosition.unsafeGet();
- setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
+ unsigned ignoredCallFrameSize;
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet(), ignoredCallFrameSize);
+ if (error)
+ return error;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
}
// Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
@@ -637,7 +669,9 @@ public:
case PatternTerm::TypeParentheticalAssertion:
term.inputPosition = currentInputPosition.unsafeGet();
term.frameLocation = currentCallFrameSize;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (error)
+ return error;
break;
case PatternTerm::TypeDotStarEnclosure:
@@ -645,27 +679,39 @@ public:
term.inputPosition = initialInputPosition;
break;
}
+ if (currentInputPosition.hasOverflowed())
+ return YarrPattern::OffsetTooLarge;
}
alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
- return currentCallFrameSize;
+ newCallFrameSize = currentCallFrameSize;
+ return error;
}
- unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
+ YarrPattern::ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
{
+ if (UNLIKELY(!isSafeToRecurse()))
+ return YarrPattern::TooManyDisjunctions;
+
if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
unsigned minimumInputSize = UINT_MAX;
unsigned maximumCallFrameSize = 0;
bool hasFixedSize = true;
+ YarrPattern::ErrorCode error = YarrPattern::NoError;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
- unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
+ unsigned currentAlternativeCallFrameSize;
+ error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
+ if (error)
+ return error;
minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
hasFixedSize &= alternative->m_hasFixedSize;
+ if (alternative->m_minimumSize > INT_MAX)
+ m_pattern.m_containsUnsignedLengthPattern = true;
}
ASSERT(minimumInputSize != UINT_MAX);
@@ -674,12 +720,18 @@ public:
disjunction->m_hasFixedSize = hasFixedSize;
disjunction->m_minimumSize = minimumInputSize;
disjunction->m_callFrameSize = maximumCallFrameSize;
- return maximumCallFrameSize;
+ callFrameSize = maximumCallFrameSize;
+ return error;
}
- void setupOffsets()
+ const char* setupOffsets()
{
- setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
+ // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
+ unsigned ignoredCallFrameSize;
+ YarrPattern::ErrorCode error = setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
+ if (error)
+ return YarrPattern::errorMessage(error);
+ return nullptr;
}
// This optimization identifies sets of parentheses that we will never need to backtrack.
@@ -696,14 +748,15 @@ public:
if (m_pattern.m_numSubpatterns)
return;
- Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
+ Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
for (size_t i = 0; i < alternatives.size(); ++i) {
Vector<PatternTerm>& terms = alternatives[i]->m_terms;
if (terms.size()) {
PatternTerm& term = terms.last();
if (term.type == PatternTerm::TypeParenthesesSubpattern
&& term.quantityType == QuantifierGreedy
- && term.quantityCount == quantifyInfinite
+ && term.quantityMinCount == 0
+ && term.quantityMaxCount == quantifyInfinite
&& !term.capture())
term.parentheses.isTerminal = true;
}
@@ -719,7 +772,7 @@ public:
// At this point, this is only valid for non-multiline expressions.
PatternDisjunction* disjunction = m_pattern.m_body;
- if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
+ if (!m_pattern.m_containsBOL || m_pattern.multiline())
return;
PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
@@ -737,11 +790,12 @@ public:
}
}
- bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
+ bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
{
Vector<PatternTerm>& terms = alternative->m_terms;
- for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
+ ASSERT(endIndex <= terms.size());
+ for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
PatternTerm& term = terms[termIndex];
if (term.m_capture)
@@ -750,7 +804,7 @@ public:
if (term.type == PatternTerm::TypeParenthesesSubpattern) {
PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
- if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
+ if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
return true;
}
}
@@ -766,7 +820,7 @@ public:
// beginning and the end of the match.
void optimizeDotStarWrappedExpressions()
{
- Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
+ Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
if (alternatives.size() != 1)
return;
@@ -775,7 +829,7 @@ public:
if (terms.size() >= 3) {
bool startsWithBOL = false;
bool endsWithEOL = false;
- size_t termIndex, firstExpressionTerm, lastExpressionTerm;
+ size_t termIndex, firstExpressionTerm;
termIndex = 0;
if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
@@ -798,14 +852,13 @@ public:
PatternTerm& lastNonAnchorTerm = terms[termIndex];
if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
return;
-
- lastExpressionTerm = termIndex - 1;
- if (firstExpressionTerm > lastExpressionTerm)
+ size_t endIndex = termIndex;
+ if (firstExpressionTerm >= endIndex)
return;
- if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
- for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
+ if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
+ for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
terms.remove(termIndex);
for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
@@ -819,18 +872,58 @@ public:
}
private:
+ bool isSafeToRecurse() const
+ {
+ if (!m_stackLimit)
+ return true;
+ ASSERT(wtfThreadData().stack().isGrowingDownward());
+ int8_t* curr = reinterpret_cast<int8_t*>(&curr);
+ int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
+ return curr >= limit;
+ }
+
YarrPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
+ void* m_stackLimit;
bool m_invertCharacterClass;
bool m_invertParentheticalAssertion;
};
-const char* YarrPattern::compile(const String& patternString)
+const char* YarrPattern::errorMessage(YarrPattern::ErrorCode error)
+{
+#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
+ // The order of this array must match the ErrorCode enum.
+ static const char* errorMessages[NumberOfErrorCodes] = {
+ nullptr, // NoError
+ REGEXP_ERROR_PREFIX "regular expression too large",
+ REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
+ REGEXP_ERROR_PREFIX "nothing to repeat",
+ REGEXP_ERROR_PREFIX "number too large in {} quantifier",
+ REGEXP_ERROR_PREFIX "missing )",
+ REGEXP_ERROR_PREFIX "unmatched parentheses",
+ REGEXP_ERROR_PREFIX "unrecognized character after (?",
+ REGEXP_ERROR_PREFIX "missing terminating ] for character class",
+ REGEXP_ERROR_PREFIX "range out of order in character class",
+ REGEXP_ERROR_PREFIX "\\ at end of pattern",
+ REGEXP_ERROR_PREFIX "invalid unicode {} escape",
+ REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern",
+ REGEXP_ERROR_PREFIX "too many nested disjunctions",
+ REGEXP_ERROR_PREFIX "pattern exceeds string length limits",
+ REGEXP_ERROR_PREFIX "invalid flags"
+ };
+
+ return errorMessages[error];
+}
+
+const char* YarrPattern::compile(const String& patternString, void* stackLimit)
{
- YarrPatternConstructor constructor(*this);
+ YarrPatternConstructor constructor(*this, stackLimit);
+
+ if (m_flags == InvalidFlags)
+ return errorMessage(InvalidRegularExpressionFlags);
- if (const char* error = parse(constructor, patternString))
+ if (const char* error = parse(constructor, patternString, unicode()))
return error;
// If the pattern contains illegal backreferences reset & reparse.
@@ -844,7 +937,7 @@ const char* YarrPattern::compile(const String& patternString)
#if !ASSERT_DISABLED
const char* error =
#endif
- parse(constructor, patternString, numSubpatterns);
+ parse(constructor, patternString, unicode(), numSubpatterns);
ASSERT(!error);
ASSERT(numSubpatterns == m_numSubpatterns);
@@ -854,27 +947,31 @@ const char* YarrPattern::compile(const String& patternString)
constructor.optimizeDotStarWrappedExpressions();
constructor.optimizeBOL();
- constructor.setupOffsets();
+ if (const char* error = constructor.setupOffsets())
+ return error;
- return 0;
+ return nullptr;
}
-YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error)
- : m_ignoreCase(ignoreCase)
- , m_multiline(multiline)
- , m_containsBackreferences(false)
+YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, const char** error, void* stackLimit)
+ : m_containsBackreferences(false)
, m_containsBOL(false)
+ , m_containsUnsignedLengthPattern(false)
+ , m_hasCopiedParenSubexpressions(false)
+ , m_flags(flags)
, m_numSubpatterns(0)
, m_maxBackReference(0)
, newlineCached(0)
, digitsCached(0)
, spacesCached(0)
, wordcharCached(0)
+ , wordUnicodeIgnoreCaseCharCached(0)
, nondigitsCached(0)
, nonspacesCached(0)
, nonwordcharCached(0)
+ , nonwordUnicodeIgnoreCasecharCached(0)
{
- *error = compile(pattern);
+ *error = compile(pattern, stackLimit);
}
} }