summaryrefslogtreecommitdiff
path: root/Source/JavaScriptCore/yarr/YarrParser.h
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
commit1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch)
tree46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/JavaScriptCore/yarr/YarrParser.h
parent32761a6cee1d0dee366b885b7b9c777e67885688 (diff)
downloadWebKitGtk-tarball-master.tar.gz
Diffstat (limited to 'Source/JavaScriptCore/yarr/YarrParser.h')
-rw-r--r--Source/JavaScriptCore/yarr/YarrParser.h225
1 files changed, 149 insertions, 76 deletions
diff --git a/Source/JavaScriptCore/yarr/YarrParser.h b/Source/JavaScriptCore/yarr/YarrParser.h
index 366aa40d3..9c0982017 100644
--- a/Source/JavaScriptCore/yarr/YarrParser.h
+++ b/Source/JavaScriptCore/yarr/YarrParser.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -23,18 +23,14 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef YarrParser_h
-#define YarrParser_h
+#pragma once
#include "Yarr.h"
#include <wtf/ASCIICType.h>
#include <wtf/text/WTFString.h>
-#include <wtf/unicode/Unicode.h>
namespace JSC { namespace Yarr {
-#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
-
enum BuiltInCharacterClassID {
DigitClassID,
SpaceClassID,
@@ -47,22 +43,7 @@ template<class Delegate, typename CharType>
class Parser {
private:
template<class FriendDelegate>
- friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
-
- enum ErrorCode {
- NoError,
- PatternTooLarge,
- QuantifierOutOfOrder,
- QuantifierWithoutAtom,
- QuantifierTooLarge,
- MissingParentheses,
- ParenthesesUnmatched,
- ParenthesesTypeInvalid,
- CharacterClassUnmatched,
- CharacterClassOutOfOrder,
- EscapeUnterminated,
- NumberOfErrorCodes
- };
+ friend const char* parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);
/*
* CharacterClassParserDelegate:
@@ -75,7 +56,7 @@ private:
*/
class CharacterClassParserDelegate {
public:
- CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
+ CharacterClassParserDelegate(Delegate& delegate, YarrPattern::ErrorCode& err)
: m_delegate(delegate)
, m_err(err)
, m_state(Empty)
@@ -102,7 +83,7 @@ private:
* mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
* is different to /[a\-z]/).
*/
- void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+ void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
{
switch (m_state) {
case AfterCharacterClass:
@@ -137,7 +118,7 @@ private:
case CachedCharacterHyphen:
if (ch < m_character) {
- m_err = CharacterClassOutOfOrder;
+ m_err = YarrPattern::CharacterClassOutOfOrder;
return;
}
m_delegate.atomCharacterClassRange(m_character, ch);
@@ -218,7 +199,7 @@ private:
private:
Delegate& m_delegate;
- ErrorCode& m_err;
+ YarrPattern::ErrorCode& m_err;
enum CharacterClassConstructionState {
Empty,
CachedCharacter,
@@ -226,20 +207,34 @@ private:
AfterCharacterClass,
AfterCharacterClassHyphen,
} m_state;
- UChar m_character;
+ UChar32 m_character;
};
- Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
+ Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
: m_delegate(delegate)
, m_backReferenceLimit(backReferenceLimit)
- , m_err(NoError)
- , m_data(pattern.getCharacters<CharType>())
+ , m_err(YarrPattern::NoError)
+ , m_data(pattern.characters<CharType>())
, m_size(pattern.length())
, m_index(0)
+ , m_isUnicode(isUnicode)
, m_parenthesesNestingDepth(0)
{
}
+    // The handling of IdentityEscapes depends on the unicode flag.
+    // For Unicode patterns, an IdentityEscape may only be a SyntaxCharacter or '/'.
+    // For non-Unicode patterns, this check never rejects the escaped character.
+    //
+    // ch is the character following the backslash. Returns true, after setting m_err to
+    // YarrPattern::InvalidIdentityEscape, when the escape is not permitted; returns false
+    // (leaving m_err untouched) otherwise.
+    bool isIdentityEscapeAnError(int ch)
+    {
+        if (!m_isUnicode)
+            return false;
+
+        // strchr() converts its argument to char and treats the literal's terminating NUL as
+        // part of the string, so a NUL or a non-ASCII value whose low byte happens to equal a
+        // syntax character could be reported as found. Only probe with non-NUL ASCII values.
+        bool isSyntaxCharacterOrSlash = ch > 0 && ch < 0x80 && strchr("^$\\.*+?()[]{}|/", ch);
+        if (isSyntaxCharacterOrSlash)
+            return false;
+
+        m_err = YarrPattern::InvalidIdentityEscape;
+        return true;
+    }
+
/*
* parseEscape():
*
@@ -268,7 +263,7 @@ private:
consume();
if (atEndOfPattern()) {
- m_err = EscapeUnterminated;
+ m_err = YarrPattern::EscapeUnterminated;
return false;
}
@@ -276,18 +271,24 @@ private:
// Assertions
case 'b':
consume();
- if (inCharacterClass)
+ if (inCharacterClass) {
+ if (isIdentityEscapeAnError('b'))
+ break;
+
delegate.atomPatternCharacter('\b');
- else {
+ } else {
delegate.assertionWordBoundary(false);
return false;
}
break;
case 'B':
consume();
- if (inCharacterClass)
+ if (inCharacterClass) {
+ if (isIdentityEscapeAnError('B'))
+ break;
+
delegate.atomPatternCharacter('B');
- else {
+ } else {
delegate.assertionWordBoundary(true);
return false;
}
@@ -402,9 +403,12 @@ private:
case 'x': {
consume();
int x = tryConsumeHex(2);
- if (x == -1)
+ if (x == -1) {
+ if (isIdentityEscapeAnError('x'))
+ break;
+
delegate.atomPatternCharacter('x');
- else
+ } else
delegate.atomPatternCharacter(x);
break;
}
@@ -412,22 +416,101 @@ private:
// UnicodeEscape
case 'u': {
consume();
+ if (atEndOfPattern()) {
+ if (isIdentityEscapeAnError('u'))
+ break;
+
+ delegate.atomPatternCharacter('u');
+ break;
+ }
+
+ if (m_isUnicode && peek() == '{') {
+ consume();
+ UChar32 codePoint = 0;
+ do {
+ if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
+ m_err = YarrPattern::InvalidUnicodeEscape;
+ break;
+ }
+
+ codePoint = (codePoint << 4) | toASCIIHexValue(consume());
+
+ if (codePoint > UCHAR_MAX_VALUE)
+ m_err = YarrPattern::InvalidUnicodeEscape;
+ } while (!atEndOfPattern() && peek() != '}');
+ if (!atEndOfPattern() && peek() == '}')
+ consume();
+ else if (!m_err)
+ m_err = YarrPattern::InvalidUnicodeEscape;
+ if (m_err)
+ return false;
+
+ delegate.atomPatternCharacter(codePoint);
+ break;
+ }
int u = tryConsumeHex(4);
- if (u == -1)
+ if (u == -1) {
+ if (isIdentityEscapeAnError('u'))
+ break;
+
delegate.atomPatternCharacter('u');
- else
+ } else {
+ // If we have the first of a surrogate pair, look for the second.
+ if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
+ ParseState state = saveState();
+ consume();
+
+ if (tryConsume('u')) {
+ int surrogate2 = tryConsumeHex(4);
+ if (U16_IS_TRAIL(surrogate2)) {
+ u = U16_GET_SUPPLEMENTARY(u, surrogate2);
+ delegate.atomPatternCharacter(u);
+ break;
+ }
+ }
+
+ restoreState(state);
+ }
delegate.atomPatternCharacter(u);
+ }
break;
}
// IdentityEscape
default:
+ int ch = peek();
+
+ if (ch == '-' && m_isUnicode && inCharacterClass) {
+ // \- is allowed for ClassEscape with unicode flag.
+ delegate.atomPatternCharacter(consume());
+ break;
+ }
+
+ if (isIdentityEscapeAnError(ch))
+ break;
+
delegate.atomPatternCharacter(consume());
}
return true;
}
+    // Reads the next pattern character with consume(). When the unicode flag is set, that
+    // character is a UTF-16 lead surrogate, and the character after it is a trail surrogate,
+    // the trail is consumed too and the combined supplementary code point is returned.
+    // Otherwise only the first character stays consumed and it is returned as-is.
+    UChar32 consumePossibleSurrogatePair()
+    {
+        UChar32 lead = consume();
+        if (!U16_IS_LEAD(lead) || !m_isUnicode || !patternRemaining())
+            return lead;
+
+        // Tentatively take the following character; restore the saved parse state if it
+        // does not complete the pair.
+        ParseState beforeTrail = saveState();
+        UChar32 trail = consume();
+        if (!U16_IS_TRAIL(trail)) {
+            restoreState(beforeTrail);
+            return lead;
+        }
+
+        return U16_GET_SUPPLEMENTARY(lead, trail);
+    }
+
/*
* parseAtomEscape(), parseCharacterClassEscape():
*
@@ -471,14 +554,14 @@ private:
break;
default:
- characterClassConstructor.atomPatternCharacter(consume(), true);
+ characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
}
if (m_err)
return;
}
- m_err = CharacterClassUnmatched;
+ m_err = YarrPattern::CharacterClassUnmatched;
}
/*
@@ -494,7 +577,7 @@ private:
if (tryConsume('?')) {
if (atEndOfPattern()) {
- m_err = ParenthesesTypeInvalid;
+ m_err = YarrPattern::ParenthesesTypeInvalid;
return;
}
@@ -512,7 +595,7 @@ private:
break;
default:
- m_err = ParenthesesTypeInvalid;
+ m_err = YarrPattern::ParenthesesTypeInvalid;
}
} else
m_delegate.atomParenthesesSubpatternBegin();
@@ -534,7 +617,7 @@ private:
if (m_parenthesesNestingDepth > 0)
m_delegate.atomParenthesesEnd();
else
- m_err = ParenthesesUnmatched;
+ m_err = YarrPattern::ParenthesesUnmatched;
--m_parenthesesNestingDepth;
}
@@ -550,14 +633,14 @@ private:
ASSERT(min <= max);
if (min == UINT_MAX) {
- m_err = QuantifierTooLarge;
+ m_err = YarrPattern::QuantifierTooLarge;
return;
}
if (lastTokenWasAnAtom)
m_delegate.quantifyAtom(min, max, !tryConsume('?'));
else
- m_err = QuantifierWithoutAtom;
+ m_err = YarrPattern::QuantifierWithoutAtom;
}
/*
@@ -651,7 +734,7 @@ private:
if (min <= max)
parseQuantifier(lastTokenWasAnAtom, min, max);
else
- m_err = QuantifierOutOfOrder;
+ m_err = YarrPattern::QuantifierOutOfOrder;
lastTokenWasAnAtom = false;
break;
}
@@ -663,7 +746,7 @@ private:
FALLTHROUGH;
default:
- m_delegate.atomPatternCharacter(consume());
+ m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
lastTokenWasAnAtom = true;
}
@@ -672,7 +755,7 @@ private:
}
if (m_parenthesesNestingDepth > 0)
- m_err = MissingParentheses;
+ m_err = YarrPattern::MissingParentheses;
}
/*
@@ -684,27 +767,12 @@ private:
const char* parse()
{
if (m_size > MAX_PATTERN_SIZE)
- m_err = PatternTooLarge;
+ m_err = YarrPattern::PatternTooLarge;
else
parseTokens();
ASSERT(atEndOfPattern() || m_err);
-
- // The order of this array must match the ErrorCode enum.
- static const char* errorMessages[NumberOfErrorCodes] = {
- 0, // NoError
- REGEXP_ERROR_PREFIX "regular expression too large",
- REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
- REGEXP_ERROR_PREFIX "nothing to repeat",
- REGEXP_ERROR_PREFIX "number too large in {} quantifier",
- REGEXP_ERROR_PREFIX "missing )",
- REGEXP_ERROR_PREFIX "unmatched parentheses",
- REGEXP_ERROR_PREFIX "unrecognized character after (?",
- REGEXP_ERROR_PREFIX "missing terminating ] for character class",
- REGEXP_ERROR_PREFIX "range out of order in character class",
- REGEXP_ERROR_PREFIX "\\ at end of pattern"
- };
-
- return errorMessages[m_err];
+
+ return YarrPattern::errorMessage(m_err);
}
// Misc helper functions:
@@ -727,6 +795,12 @@ private:
return m_index == m_size;
}
+    // Returns m_size - m_index, i.e. how much of the pattern lies at or after the current index.
+    unsigned patternRemaining()
+    {
+        ASSERT(m_size >= m_index);
+        unsigned unconsumed = m_size - m_index;
+        return unconsumed;
+    }
+
int peek()
{
ASSERT(m_index < m_size);
@@ -802,10 +876,11 @@ private:
Delegate& m_delegate;
unsigned m_backReferenceLimit;
- ErrorCode m_err;
+ YarrPattern::ErrorCode m_err;
const CharType* m_data;
unsigned m_size;
unsigned m_index;
+ bool m_isUnicode;
unsigned m_parenthesesNestingDepth;
// Derived by empirical testing of compile time in PCRE and WREC.
@@ -826,11 +901,11 @@ private:
* void assertionEOL();
* void assertionWordBoundary(bool invert);
*
- * void atomPatternCharacter(UChar ch);
+ * void atomPatternCharacter(UChar32 ch);
* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
* void atomCharacterClassBegin(bool invert)
- * void atomCharacterClassAtom(UChar ch)
- * void atomCharacterClassRange(UChar begin, UChar end)
+ * void atomCharacterClassAtom(UChar32 ch)
+ * void atomCharacterClassRange(UChar32 begin, UChar32 end)
* void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
* void atomCharacterClassEnd()
* void atomParenthesesSubpatternBegin(bool capture = true);
@@ -872,13 +947,11 @@ private:
*/
template<class Delegate>
-const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
+const char* parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
{
if (pattern.is8Bit())
- return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
- return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
+ return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
+ return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
}
} } // namespace JSC::Yarr
-
-#endif // YarrParser_h