diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
commit | 1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch) | |
tree | 46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/JavaScriptCore/yarr | |
parent | 32761a6cee1d0dee366b885b7b9c777e67885688 (diff) | |
download | WebKitGtk-tarball-master.tar.gz |
webkitgtk-2.16.5HEADwebkitgtk-2.16.5master
Diffstat (limited to 'Source/JavaScriptCore/yarr')
-rw-r--r-- | Source/JavaScriptCore/yarr/RegularExpression.cpp | 25 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/RegularExpression.h | 9 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/Yarr.h | 15 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrCanonicalize.h (renamed from Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h) | 73 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp | 823 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js | 193 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrInterpreter.cpp | 411 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrInterpreter.h | 97 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrJIT.cpp | 534 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrJIT.h | 42 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrParser.h | 225 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrPattern.cpp | 309 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrPattern.h | 189 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp | 10 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrSyntaxChecker.h | 10 |
15 files changed, 1804 insertions, 1161 deletions
diff --git a/Source/JavaScriptCore/yarr/RegularExpression.cpp b/Source/JavaScriptCore/yarr/RegularExpression.cpp index b58ad393c..d87d6031f 100644 --- a/Source/JavaScriptCore/yarr/RegularExpression.cpp +++ b/Source/JavaScriptCore/yarr/RegularExpression.cpp @@ -12,10 +12,10 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -29,6 +29,7 @@ #include "RegularExpression.h" #include "Yarr.h" +#include "YarrInterpreter.h" #include <wtf/Assertions.h> #include <wtf/BumpPointerAllocator.h> @@ -36,15 +37,15 @@ namespace JSC { namespace Yarr { class RegularExpression::Private : public RefCounted<RegularExpression::Private> { public: - static PassRefPtr<Private> create(const String& pattern, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode) + static Ref<Private> create(const String& pattern, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode) { - return adoptRef(new Private(pattern, caseSensitivity, multilineMode)); + return adoptRef(*new Private(pattern, caseSensitivity, multilineMode)); } int lastMatchLength; unsigned m_numSubpatterns; - OwnPtr<JSC::Yarr::BytecodePattern> m_regExpByteCode; + std::unique_ptr<JSC::Yarr::BytecodePattern> m_regExpByteCode; private: Private(const String& pattern, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode) @@ -54,9 +55,17 @@ private: { } - PassOwnPtr<JSC::Yarr::BytecodePattern> compile(const String& patternString, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode) + std::unique_ptr<JSC::Yarr::BytecodePattern> compile(const String& patternString, TextCaseSensitivity caseSensitivity, MultilineMode multilineMode) { - JSC::Yarr::YarrPattern pattern(patternString, (caseSensitivity == TextCaseInsensitive), (multilineMode == MultilineEnabled), &m_constructionError); + RegExpFlags flags = NoFlags; + + if (caseSensitivity == TextCaseInsensitive) + flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase); + + if (multilineMode == MultilineEnabled) + flags = static_cast<RegExpFlags>(flags | FlagMultiline); + + JSC::Yarr::YarrPattern pattern(patternString, flags, &m_constructionError); if (m_constructionError) { LOG_ERROR("RegularExpression: YARR compile failed with '%s'", m_constructionError); return nullptr; @@ -178,7 +187,7 @@ void replace(String& string, const RegularExpression& target, const String& repl bool RegularExpression::isValid() const { - return d->m_regExpByteCode; + return d->m_regExpByteCode.get(); } } } // namespace JSC::Yarr diff --git a/Source/JavaScriptCore/yarr/RegularExpression.h b/Source/JavaScriptCore/yarr/RegularExpression.h index 08aaf353a..28427743c 100644 --- a/Source/JavaScriptCore/yarr/RegularExpression.h +++ b/Source/JavaScriptCore/yarr/RegularExpression.h @@ -10,10 +10,10 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -23,8 +23,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef RegularExpression_h -#define RegularExpression_h +#pragma once #include <wtf/text/WTFString.h> @@ -58,5 +57,3 @@ private: void JS_EXPORT_PRIVATE replace(String&, const RegularExpression&, const String&); } } // namespace JSC::Yarr - -#endif // RegularExpression_h diff --git a/Source/JavaScriptCore/yarr/Yarr.h b/Source/JavaScriptCore/yarr/Yarr.h index d393e9fa9..8e5a3e667 100644 --- a/Source/JavaScriptCore/yarr/Yarr.h +++ b/Source/JavaScriptCore/yarr/Yarr.h @@ -25,16 +25,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef Yarr_h -#define Yarr_h +#pragma once -#include "YarrInterpreter.h" #include "YarrPattern.h" namespace JSC { namespace Yarr { -#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. -#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoPatternCharacter 2 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoCharacterClass 2 // Only for !fixed quantifiers. #define YarrStackSpaceForBackTrackInfoBackReference 2 #define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative. #define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1 @@ -43,7 +41,7 @@ namespace JSC { namespace Yarr { #define YarrStackSpaceForBackTrackInfoParentheses 2 static const unsigned quantifyInfinite = UINT_MAX; -static const unsigned offsetNoMatch = (unsigned)-1; +static const unsigned offsetNoMatch = std::numeric_limits<unsigned>::max(); // The below limit restricts the number of "recursive" match calls in order to // avoid spending exponential time on complex regular expressions. @@ -63,7 +61,6 @@ enum YarrCharSize { Char16 }; -} } // namespace JSC::Yarr - -#endif // Yarr_h +struct BytecodePattern; +} } // namespace JSC::Yarr diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h b/Source/JavaScriptCore/yarr/YarrCanonicalize.h index fcc318673..fb5e0231a 100644 --- a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h +++ b/Source/JavaScriptCore/yarr/YarrCanonicalize.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Apple Inc. All rights reserved. + * Copyright (C) 2012-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,17 +23,18 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrCanonicalizeUCS2_H -#define YarrCanonicalizeUCS2_H +#pragma once #include <stdint.h> -#include <wtf/unicode/Unicode.h> +#include <unicode/utypes.h> namespace JSC { namespace Yarr { -// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp) -// provides information for each UCS2 code point as to the set of code points that it should -// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8. +// This set of data provides information for each UCS2 code point as to the set of code points +// that it should match under the ES6 case insensitive RegExp matching rules, specified in 21.2.2.8.2. +// The non-Unicode tables are autogenerated using YarrCanonicalize.js into YarrCanonicalize.cpp. +// The Unicode tables are autogenerated using the python script generateYarrCanonicalizeUnicode +// which creates YarrCanonicalizeUnicode.cpp. enum UCS2CanonicalizationType { CanonicalizeUnique, // No canonically equal values, e.g. 0x0. CanonicalizeSet, // Value indicates a set in characterSetInfo. @@ -42,32 +43,38 @@ enum UCS2CanonicalizationType { CanonicalizeAlternatingAligned, // Aligned consequtive pair, e.g. 0x1f4,0x1f5. CanonicalizeAlternatingUnaligned, // Unaligned consequtive pair, e.g. 0x241,0x242. }; -struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; +struct CanonicalizationRange { + UChar32 begin; + UChar32 end; + UChar32 value; + UCS2CanonicalizationType type; +}; + extern const size_t UCS2_CANONICALIZATION_RANGES; -extern const uint16_t* characterSetInfo[]; -extern const UCS2CanonicalizationRange rangeInfo[]; +extern const UChar32* const ucs2CharacterSetInfo[]; +extern const CanonicalizationRange ucs2RangeInfo[]; -// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to -// the set of Latin1 codepoints that could match. -enum LatinCanonicalizationType { - CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range. - CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20. - CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is. - CanonicalizeLatinInvalid, // Cannot match against Latin1 input. -}; -struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; -extern const size_t LATIN_CANONICALIZATION_RANGES; -extern LatinCanonicalizationRange latinRangeInfo[]; +extern const size_t UNICODE_CANONICALIZATION_RANGES; +extern const UChar32* const unicodeCharacterSetInfo[]; +extern const CanonicalizationRange unicodeRangeInfo[]; + +enum class CanonicalMode { UCS2, Unicode }; -// This searches in log2 time over ~364 entries, so should typically result in 8 compares. -inline const UCS2CanonicalizationRange* rangeInfoFor(UChar ch) +inline const UChar32* canonicalCharacterSetInfo(unsigned index, CanonicalMode canonicalMode) { - const UCS2CanonicalizationRange* info = rangeInfo; - size_t entries = UCS2_CANONICALIZATION_RANGES; + const UChar32* const* rangeInfo = canonicalMode == CanonicalMode::UCS2 ? ucs2CharacterSetInfo : unicodeCharacterSetInfo; + return rangeInfo[index]; +} + +// This searches in log2 time over ~400-600 entries, so should typically result in 9 compares. +inline const CanonicalizationRange* canonicalRangeInfoFor(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2) +{ + const CanonicalizationRange* info = canonicalMode == CanonicalMode::UCS2 ? ucs2RangeInfo : unicodeRangeInfo; + size_t entries = canonicalMode == CanonicalMode::UCS2 ? UCS2_CANONICALIZATION_RANGES : UNICODE_CANONICALIZATION_RANGES; while (true) { size_t candidate = entries >> 1; - const UCS2CanonicalizationRange* candidateInfo = info + candidate; + const CanonicalizationRange* candidateInfo = info + candidate; if (ch < candidateInfo->begin) entries = candidate; else if (ch <= candidateInfo->end) @@ -80,7 +87,7 @@ inline const UCS2CanonicalizationRange* rangeInfoFor(UChar ch) } // Should only be called for characters that have one canonically matching value. -inline UChar getCanonicalPair(const UCS2CanonicalizationRange* info, UChar ch) +inline UChar32 getCanonicalPair(const CanonicalizationRange* info, UChar32 ch) { ASSERT(ch >= info->begin && ch <= info->end); switch (info->type) { @@ -100,20 +107,20 @@ inline UChar getCanonicalPair(const UCS2CanonicalizationRange* info, UChar ch) } // Returns true if no other UCS2 codepoint can match this value. -inline bool isCanonicallyUnique(UChar ch) +inline bool isCanonicallyUnique(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2) { - return rangeInfoFor(ch)->type == CanonicalizeUnique; + return canonicalRangeInfoFor(ch, canonicalMode)->type == CanonicalizeUnique; } // Returns true if values are equal, under the canonicalization rules. -inline bool areCanonicallyEquivalent(UChar a, UChar b) +inline bool areCanonicallyEquivalent(UChar32 a, UChar32 b, CanonicalMode canonicalMode = CanonicalMode::UCS2) { - const UCS2CanonicalizationRange* info = rangeInfoFor(a); + const CanonicalizationRange* info = canonicalRangeInfoFor(a, canonicalMode); switch (info->type) { case CanonicalizeUnique: return a == b; case CanonicalizeSet: { - for (const uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) { + for (const UChar32* set = canonicalCharacterSetInfo(info->value, canonicalMode); (a = *set); ++set) { if (a == b) return true; } @@ -134,5 +141,3 @@ inline bool areCanonicallyEquivalent(UChar a, UChar b) } } } // JSC::Yarr - -#endif diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp index 777b1cff1..d91c77159 100644 --- a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp +++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Apple Inc. All rights reserved. + * Copyright (C) 2012-2013, 2015-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,33 +23,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js +// DO NOT EDIT! - this file autogenerated by YarrCanonicalize.js #include "config.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" namespace JSC { namespace Yarr { -#include <stdint.h> - -const uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 }; -const uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 }; -const uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 }; -const uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 }; -const uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 }; -const uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 }; -const uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 }; -const uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 }; -const uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 }; -const uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 }; -const uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 }; -const uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 }; -const uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 }; -const uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 }; -const uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 }; +const UChar32 ucs2CharacterSet0[] = { 0x01c4, 0x01c5, 0x01c6, 0 }; +const UChar32 ucs2CharacterSet1[] = { 0x01c7, 0x01c8, 0x01c9, 0 }; +const UChar32 ucs2CharacterSet2[] = { 0x01ca, 0x01cb, 0x01cc, 0 }; +const UChar32 ucs2CharacterSet3[] = { 0x01f1, 0x01f2, 0x01f3, 0 }; +const UChar32 ucs2CharacterSet4[] = { 0x0392, 0x03b2, 0x03d0, 0 }; +const UChar32 ucs2CharacterSet5[] = { 0x0395, 0x03b5, 0x03f5, 0 }; +const UChar32 ucs2CharacterSet6[] = { 0x0398, 0x03b8, 0x03d1, 0 }; +const UChar32 ucs2CharacterSet7[] = { 0x0345, 0x0399, 0x03b9, 0x1fbe, 0 }; +const UChar32 ucs2CharacterSet8[] = { 0x039a, 0x03ba, 0x03f0, 0 }; +const UChar32 ucs2CharacterSet9[] = { 0x00b5, 0x039c, 0x03bc, 0 }; +const UChar32 ucs2CharacterSet10[] = { 0x03a0, 0x03c0, 0x03d6, 0 }; +const UChar32 ucs2CharacterSet11[] = { 0x03a1, 0x03c1, 0x03f1, 0 }; +const UChar32 ucs2CharacterSet12[] = { 0x03a3, 0x03c2, 0x03c3, 0 }; +const UChar32 ucs2CharacterSet13[] = { 0x03a6, 0x03c6, 0x03d5, 0 }; +const UChar32 ucs2CharacterSet14[] = { 0x1e60, 0x1e61, 0x1e9b, 0 }; static const size_t UCS2_CANONICALIZATION_SETS = 15; -const uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { +const UChar32* const ucs2CharacterSetInfo[UCS2_CANONICALIZATION_SETS] = { ucs2CharacterSet0, ucs2CharacterSet1, ucs2CharacterSet2, @@ -67,396 +65,399 @@ const uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { ucs2CharacterSet14, }; -const size_t UCS2_CANONICALIZATION_RANGES = 364; -const UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = { - { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique }, - { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo }, - { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique }, - { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi }, - { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique }, - { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet }, - { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique }, - { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo }, - { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique }, - { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo }, - { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique }, - { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi }, - { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique }, - { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi }, - { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo }, - { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique }, - { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique }, - { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique }, - { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi }, - { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique }, - { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo }, - { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo }, - { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo }, - { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo }, - { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique }, - { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo }, - { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo }, - { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo }, - { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo }, - { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo }, - { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo }, - { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo }, - { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo }, - { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo }, - { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique }, - { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo }, - { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo }, - { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo }, - { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo }, - { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo }, - { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo }, - { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique }, - { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo }, - { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo }, - { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo }, - { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique }, - { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique }, - { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo }, - { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique }, - { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet }, - { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet }, - { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet }, - { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi }, - { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique }, - { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet }, - { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi }, - { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi }, - { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi }, - { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique }, - { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique }, - { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo }, - { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi }, - { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo }, - { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo }, - { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi }, - { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo }, - { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo }, - { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo }, - { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo }, - { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo }, - { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi }, - { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi }, - { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique }, - { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi }, - { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique }, - { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi }, - { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique }, - { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi }, - { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique }, - { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi }, - { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique }, - { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi }, - { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique }, - { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo }, - { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique }, - { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi }, - { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi }, - { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique }, - { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo }, - { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique }, - { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi }, - { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique }, - { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo }, - { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi }, - { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique }, - { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi }, - { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique }, - { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo }, - { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique }, - { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi }, - { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique }, - { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi }, - { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique }, - { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi }, - { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi }, - { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi }, - { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi }, - { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique }, - { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi }, - { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique }, - { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet }, - { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique }, - { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique }, - { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique }, - { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo }, - { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique }, - { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo }, - { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique }, - { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo }, - { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique }, - { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo }, - { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique }, - { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo }, - { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique }, - { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo }, - { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet }, - { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo }, - { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet }, - { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo }, - { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet }, - { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet }, - { 0x039au, 0x039au, 0x0008u, CanonicalizeSet }, - { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo }, - { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet }, - { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo }, - { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet }, - { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet }, - { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique }, - { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet }, - { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo }, - { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet }, - { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo }, - { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi }, - { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi }, - { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique }, - { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet }, - { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet }, - { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet }, - { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet }, - { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet }, - { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi }, - { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet }, - { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi }, - { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet }, - { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet }, - { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet }, - { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi }, - { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet }, - { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi }, - { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi }, - { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi }, - { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo }, - { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet }, - { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet }, - { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique }, - { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet }, - { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet }, - { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi }, - { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet }, - { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet }, - { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo }, - { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique }, - { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet }, - { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique }, - { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi }, - { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique }, - { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi }, - { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo }, - { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo }, - { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi }, - { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi }, - { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique }, - { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo }, - { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi }, - { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique }, - { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo }, - { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique }, - { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi }, - { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique }, - { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo }, - { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique }, - { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo }, - { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique }, - { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo }, - { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique }, - { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet }, - { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique }, - { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet }, - { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique }, - { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique }, - { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique }, - { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique }, - { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique }, - { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique }, - { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique }, - { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique }, - { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique }, - { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique }, - { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique }, - { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique }, - { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo }, - { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo }, - { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo }, - { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo }, - { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo }, - { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo }, - { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique }, - { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique }, - { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi }, - { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique }, - { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet }, - { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique }, - { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi }, - { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique }, - { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique }, - { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi }, - { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique }, - { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique }, - { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo }, - { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique }, - { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi }, - { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi }, - { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique }, - { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi }, - { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi }, - { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique }, - { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo }, - { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique }, - { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi }, - { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique }, - { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo }, - { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi }, - { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique }, - { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique }, - { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo }, - { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi }, - { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique }, - { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo }, - { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique }, - { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi }, - { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique }, - { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi }, - { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi }, - { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi }, - { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi }, - { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi }, - { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi }, - { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi }, - { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi }, - { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi }, - { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique }, - { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique }, - { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique }, - { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi }, - { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique }, - { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique }, - { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi }, - { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique }, - { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique }, - { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique }, - { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique }, - { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique }, - { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi }, - { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique }, - { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi }, - { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique }, - { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique }, - { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique }, - { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo }, - { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique }, - { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi }, - { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique }, -}; - -const size_t LATIN_CANONICALIZATION_RANGES = 20; -LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = { - { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf }, - { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf }, - { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf }, - { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf }, - { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf }, - { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf }, - { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf }, - { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid }, - { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther }, - { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid }, - { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther }, - { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid }, - { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther }, - { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid }, +const size_t UCS2_CANONICALIZATION_RANGES = 391; +const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { + { 0x0000, 0x0040, 0x0000, CanonicalizeUnique }, + { 0x0041, 0x005a, 0x0020, CanonicalizeRangeLo }, + { 0x005b, 0x0060, 0x0000, CanonicalizeUnique }, + { 0x0061, 0x007a, 0x0020, CanonicalizeRangeHi }, + { 0x007b, 0x00b4, 0x0000, CanonicalizeUnique }, + { 0x00b5, 0x00b5, 0x0009, CanonicalizeSet }, + { 0x00b6, 0x00bf, 0x0000, CanonicalizeUnique }, + { 0x00c0, 0x00d6, 0x0020, CanonicalizeRangeLo }, + { 0x00d7, 0x00d7, 0x0000, CanonicalizeUnique }, + { 0x00d8, 0x00de, 0x0020, CanonicalizeRangeLo }, + { 0x00df, 0x00df, 0x0000, CanonicalizeUnique }, + { 0x00e0, 0x00f6, 0x0020, CanonicalizeRangeHi }, + { 0x00f7, 0x00f7, 0x0000, CanonicalizeUnique }, + { 0x00f8, 0x00fe, 0x0020, CanonicalizeRangeHi }, + { 0x00ff, 0x00ff, 0x0079, CanonicalizeRangeLo }, + { 0x0100, 0x012f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0130, 0x0131, 0x0000, CanonicalizeUnique }, + { 0x0132, 0x0137, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0138, 0x0138, 0x0000, CanonicalizeUnique }, + { 0x0139, 0x0148, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0149, 0x0149, 0x0000, CanonicalizeUnique }, + { 0x014a, 0x0177, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0178, 0x0178, 0x0079, CanonicalizeRangeHi }, + { 0x0179, 0x017e, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x017f, 0x017f, 0x0000, CanonicalizeUnique }, + { 0x0180, 0x0180, 0x00c3, CanonicalizeRangeLo }, + { 0x0181, 0x0181, 0x00d2, CanonicalizeRangeLo }, + { 0x0182, 0x0185, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0186, 0x0186, 0x00ce, CanonicalizeRangeLo }, + { 0x0187, 0x0188, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0189, 0x018a, 0x00cd, CanonicalizeRangeLo }, + { 0x018b, 0x018c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x018d, 0x018d, 0x0000, CanonicalizeUnique }, + { 0x018e, 0x018e, 0x004f, CanonicalizeRangeLo }, + { 0x018f, 0x018f, 0x00ca, CanonicalizeRangeLo }, + { 0x0190, 0x0190, 0x00cb, CanonicalizeRangeLo }, + { 0x0191, 0x0192, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0193, 0x0193, 0x00cd, CanonicalizeRangeLo }, + { 0x0194, 0x0194, 0x00cf, CanonicalizeRangeLo }, + { 0x0195, 0x0195, 0x0061, CanonicalizeRangeLo }, + { 0x0196, 0x0196, 0x00d3, CanonicalizeRangeLo }, + { 0x0197, 0x0197, 0x00d1, CanonicalizeRangeLo }, + { 0x0198, 0x0199, 0x0000, CanonicalizeAlternatingAligned }, + { 0x019a, 0x019a, 0x00a3, CanonicalizeRangeLo }, + { 0x019b, 0x019b, 0x0000, CanonicalizeUnique }, + { 0x019c, 0x019c, 0x00d3, CanonicalizeRangeLo }, + { 0x019d, 0x019d, 0x00d5, CanonicalizeRangeLo }, + { 0x019e, 0x019e, 0x0082, CanonicalizeRangeLo }, + { 0x019f, 0x019f, 0x00d6, CanonicalizeRangeLo }, + { 0x01a0, 0x01a5, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01a6, 0x01a6, 0x00da, CanonicalizeRangeLo }, + { 0x01a7, 0x01a8, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01a9, 0x01a9, 0x00da, CanonicalizeRangeLo }, + { 0x01aa, 0x01ab, 0x0000, CanonicalizeUnique }, + { 0x01ac, 0x01ad, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01ae, 0x01ae, 0x00da, CanonicalizeRangeLo }, + { 0x01af, 0x01b0, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01b1, 0x01b2, 0x00d9, CanonicalizeRangeLo }, + { 0x01b3, 0x01b6, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01b7, 0x01b7, 0x00db, CanonicalizeRangeLo }, + { 0x01b8, 0x01b9, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01ba, 0x01bb, 0x0000, CanonicalizeUnique }, + { 0x01bc, 0x01bd, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01be, 0x01be, 0x0000, CanonicalizeUnique }, + { 0x01bf, 0x01bf, 0x0038, CanonicalizeRangeLo }, + { 0x01c0, 0x01c3, 0x0000, CanonicalizeUnique }, + { 0x01c4, 0x01c6, 0x0000, CanonicalizeSet }, + { 0x01c7, 0x01c9, 0x0001, CanonicalizeSet }, + { 0x01ca, 0x01cc, 0x0002, CanonicalizeSet }, + { 0x01cd, 0x01dc, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01dd, 0x01dd, 0x004f, CanonicalizeRangeHi }, + { 0x01de, 0x01ef, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01f0, 0x01f0, 0x0000, CanonicalizeUnique }, + { 0x01f1, 0x01f3, 0x0003, CanonicalizeSet }, + { 0x01f4, 0x01f5, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01f6, 0x01f6, 0x0061, CanonicalizeRangeHi }, + { 0x01f7, 0x01f7, 0x0038, CanonicalizeRangeHi }, + { 0x01f8, 0x021f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0220, 0x0220, 0x0082, CanonicalizeRangeHi }, + { 0x0221, 0x0221, 0x0000, CanonicalizeUnique }, + { 0x0222, 0x0233, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0234, 0x0239, 0x0000, CanonicalizeUnique }, + { 0x023a, 0x023a, 0x2a2b, CanonicalizeRangeLo }, + { 0x023b, 0x023c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x023d, 0x023d, 0x00a3, CanonicalizeRangeHi }, + { 0x023e, 0x023e, 0x2a28, CanonicalizeRangeLo }, + { 0x023f, 0x0240, 0x2a3f, CanonicalizeRangeLo }, + { 0x0241, 0x0242, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0243, 0x0243, 0x00c3, CanonicalizeRangeHi }, + { 0x0244, 0x0244, 0x0045, CanonicalizeRangeLo }, + { 0x0245, 0x0245, 0x0047, CanonicalizeRangeLo }, + { 0x0246, 0x024f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0250, 0x0250, 0x2a1f, CanonicalizeRangeLo }, + { 0x0251, 0x0251, 0x2a1c, CanonicalizeRangeLo }, + { 0x0252, 0x0252, 0x2a1e, CanonicalizeRangeLo }, + { 0x0253, 0x0253, 0x00d2, CanonicalizeRangeHi }, + { 0x0254, 0x0254, 0x00ce, CanonicalizeRangeHi }, + { 0x0255, 0x0255, 0x0000, CanonicalizeUnique }, + { 0x0256, 0x0257, 0x00cd, CanonicalizeRangeHi }, + { 0x0258, 0x0258, 0x0000, CanonicalizeUnique }, + { 0x0259, 0x0259, 0x00ca, CanonicalizeRangeHi }, + { 0x025a, 0x025a, 0x0000, CanonicalizeUnique }, + { 0x025b, 0x025b, 0x00cb, CanonicalizeRangeHi }, + { 0x025c, 0x025c, 0xa54f, CanonicalizeRangeLo }, + { 0x025d, 0x025f, 0x0000, CanonicalizeUnique }, + { 0x0260, 0x0260, 0x00cd, CanonicalizeRangeHi }, + { 0x0261, 0x0261, 0xa54b, CanonicalizeRangeLo }, + { 0x0262, 0x0262, 0x0000, CanonicalizeUnique }, + { 0x0263, 0x0263, 0x00cf, CanonicalizeRangeHi }, + { 0x0264, 0x0264, 0x0000, CanonicalizeUnique }, + { 0x0265, 0x0265, 0xa528, CanonicalizeRangeLo }, + { 0x0266, 0x0266, 0xa544, CanonicalizeRangeLo }, + { 0x0267, 0x0267, 0x0000, CanonicalizeUnique }, + { 0x0268, 0x0268, 0x00d1, CanonicalizeRangeHi }, + { 0x0269, 0x0269, 0x00d3, CanonicalizeRangeHi }, + { 0x026a, 0x026a, 0x0000, CanonicalizeUnique }, + { 0x026b, 0x026b, 0x29f7, CanonicalizeRangeLo }, + { 0x026c, 0x026c, 0xa541, CanonicalizeRangeLo }, + { 0x026d, 0x026e, 0x0000, CanonicalizeUnique }, + { 0x026f, 0x026f, 0x00d3, CanonicalizeRangeHi }, + { 0x0270, 0x0270, 0x0000, CanonicalizeUnique }, + { 0x0271, 0x0271, 0x29fd, CanonicalizeRangeLo }, + { 0x0272, 0x0272, 0x00d5, CanonicalizeRangeHi }, + { 0x0273, 0x0274, 0x0000, CanonicalizeUnique }, + { 0x0275, 0x0275, 0x00d6, CanonicalizeRangeHi }, + { 0x0276, 0x027c, 0x0000, CanonicalizeUnique }, + { 0x027d, 0x027d, 0x29e7, CanonicalizeRangeLo }, + { 0x027e, 0x027f, 0x0000, CanonicalizeUnique }, + { 0x0280, 0x0280, 0x00da, CanonicalizeRangeHi }, + { 0x0281, 0x0282, 0x0000, CanonicalizeUnique }, + { 0x0283, 0x0283, 0x00da, CanonicalizeRangeHi }, + { 0x0284, 0x0286, 0x0000, CanonicalizeUnique }, + { 0x0287, 0x0287, 0xa52a, CanonicalizeRangeLo }, + { 0x0288, 0x0288, 0x00da, CanonicalizeRangeHi }, + { 0x0289, 0x0289, 0x0045, CanonicalizeRangeHi }, + { 0x028a, 0x028b, 0x00d9, CanonicalizeRangeHi }, + { 0x028c, 0x028c, 0x0047, CanonicalizeRangeHi }, + { 0x028d, 0x0291, 0x0000, CanonicalizeUnique }, + { 0x0292, 0x0292, 0x00db, CanonicalizeRangeHi }, + { 0x0293, 0x029d, 0x0000, CanonicalizeUnique }, + { 0x029e, 0x029e, 0xa512, CanonicalizeRangeLo }, + { 0x029f, 0x0344, 0x0000, CanonicalizeUnique }, + { 0x0345, 0x0345, 0x0007, CanonicalizeSet }, + { 0x0346, 0x036f, 0x0000, CanonicalizeUnique }, + { 0x0370, 0x0373, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0374, 0x0375, 0x0000, CanonicalizeUnique }, + { 0x0376, 0x0377, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0378, 0x037a, 0x0000, CanonicalizeUnique }, + { 0x037b, 0x037d, 0x0082, CanonicalizeRangeLo }, + { 0x037e, 0x037e, 0x0000, CanonicalizeUnique }, + { 0x037f, 0x037f, 0x0074, CanonicalizeRangeLo }, + { 0x0380, 0x0385, 0x0000, CanonicalizeUnique }, + { 0x0386, 0x0386, 0x0026, CanonicalizeRangeLo }, + { 0x0387, 0x0387, 0x0000, CanonicalizeUnique }, + { 0x0388, 0x038a, 0x0025, CanonicalizeRangeLo }, + { 0x038b, 0x038b, 0x0000, CanonicalizeUnique }, + { 0x038c, 0x038c, 0x0040, CanonicalizeRangeLo }, + { 0x038d, 0x038d, 0x0000, CanonicalizeUnique }, + { 0x038e, 0x038f, 0x003f, CanonicalizeRangeLo }, + { 0x0390, 0x0390, 0x0000, CanonicalizeUnique }, + { 0x0391, 0x0391, 0x0020, CanonicalizeRangeLo }, + { 0x0392, 0x0392, 0x0004, CanonicalizeSet }, + { 0x0393, 0x0394, 0x0020, CanonicalizeRangeLo }, + { 0x0395, 0x0395, 0x0005, CanonicalizeSet }, + { 0x0396, 0x0397, 0x0020, CanonicalizeRangeLo }, + { 0x0398, 0x0398, 0x0006, CanonicalizeSet }, + { 0x0399, 0x0399, 0x0007, CanonicalizeSet }, + { 0x039a, 0x039a, 0x0008, CanonicalizeSet }, + { 0x039b, 0x039b, 0x0020, CanonicalizeRangeLo }, + { 0x039c, 0x039c, 0x0009, CanonicalizeSet }, + { 0x039d, 0x039f, 0x0020, CanonicalizeRangeLo }, + { 0x03a0, 0x03a0, 0x000a, CanonicalizeSet }, + { 0x03a1, 0x03a1, 0x000b, CanonicalizeSet }, + { 0x03a2, 0x03a2, 0x0000, CanonicalizeUnique }, + { 0x03a3, 0x03a3, 0x000c, CanonicalizeSet }, + { 0x03a4, 0x03a5, 0x0020, CanonicalizeRangeLo }, + { 0x03a6, 0x03a6, 0x000d, CanonicalizeSet }, + { 0x03a7, 0x03ab, 0x0020, CanonicalizeRangeLo }, + { 0x03ac, 0x03ac, 0x0026, CanonicalizeRangeHi }, + { 0x03ad, 0x03af, 0x0025, CanonicalizeRangeHi }, + { 0x03b0, 0x03b0, 0x0000, CanonicalizeUnique }, + { 0x03b1, 0x03b1, 0x0020, CanonicalizeRangeHi }, + { 0x03b2, 0x03b2, 0x0004, CanonicalizeSet }, + { 0x03b3, 0x03b4, 0x0020, CanonicalizeRangeHi }, + { 0x03b5, 0x03b5, 0x0005, CanonicalizeSet }, + { 0x03b6, 0x03b7, 0x0020, CanonicalizeRangeHi }, + { 0x03b8, 0x03b8, 0x0006, CanonicalizeSet }, + { 0x03b9, 0x03b9, 0x0007, CanonicalizeSet }, + { 0x03ba, 0x03ba, 0x0008, CanonicalizeSet }, + { 0x03bb, 0x03bb, 0x0020, CanonicalizeRangeHi }, + { 0x03bc, 0x03bc, 0x0009, CanonicalizeSet }, + { 0x03bd, 0x03bf, 0x0020, CanonicalizeRangeHi }, + { 0x03c0, 0x03c0, 0x000a, CanonicalizeSet }, + { 0x03c1, 0x03c1, 0x000b, CanonicalizeSet }, + { 0x03c2, 0x03c3, 0x000c, CanonicalizeSet }, + { 0x03c4, 0x03c5, 0x0020, CanonicalizeRangeHi }, + { 0x03c6, 0x03c6, 0x000d, CanonicalizeSet }, + { 0x03c7, 0x03cb, 0x0020, CanonicalizeRangeHi }, + { 0x03cc, 0x03cc, 0x0040, CanonicalizeRangeHi }, + { 0x03cd, 0x03ce, 0x003f, CanonicalizeRangeHi }, + { 0x03cf, 0x03cf, 0x0008, CanonicalizeRangeLo }, + { 0x03d0, 0x03d0, 0x0004, CanonicalizeSet }, + { 0x03d1, 0x03d1, 0x0006, CanonicalizeSet }, + { 0x03d2, 0x03d4, 0x0000, CanonicalizeUnique }, + { 0x03d5, 0x03d5, 0x000d, CanonicalizeSet }, + { 0x03d6, 0x03d6, 0x000a, CanonicalizeSet }, + { 0x03d7, 0x03d7, 0x0008, CanonicalizeRangeHi }, + { 0x03d8, 0x03ef, 0x0000, CanonicalizeAlternatingAligned }, + { 0x03f0, 0x03f0, 0x0008, CanonicalizeSet }, + { 0x03f1, 0x03f1, 0x000b, CanonicalizeSet }, + { 0x03f2, 0x03f2, 0x0007, CanonicalizeRangeLo }, + { 0x03f3, 0x03f3, 0x0074, CanonicalizeRangeHi }, + { 0x03f4, 0x03f4, 0x0000, CanonicalizeUnique }, + { 0x03f5, 0x03f5, 0x0005, CanonicalizeSet }, + { 0x03f6, 0x03f6, 0x0000, CanonicalizeUnique }, + { 0x03f7, 0x03f8, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x03f9, 0x03f9, 0x0007, CanonicalizeRangeHi }, + { 0x03fa, 0x03fb, 0x0000, CanonicalizeAlternatingAligned }, + { 0x03fc, 0x03fc, 0x0000, CanonicalizeUnique }, + { 0x03fd, 0x03ff, 0x0082, CanonicalizeRangeHi }, + { 0x0400, 0x040f, 0x0050, CanonicalizeRangeLo }, + { 0x0410, 0x042f, 0x0020, CanonicalizeRangeLo }, + { 0x0430, 0x044f, 0x0020, CanonicalizeRangeHi }, + { 0x0450, 0x045f, 0x0050, CanonicalizeRangeHi }, + { 0x0460, 0x0481, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0482, 0x0489, 0x0000, CanonicalizeUnique }, + { 0x048a, 0x04bf, 0x0000, CanonicalizeAlternatingAligned }, + { 0x04c0, 0x04c0, 0x000f, CanonicalizeRangeLo }, + { 0x04c1, 0x04ce, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x04cf, 0x04cf, 0x000f, CanonicalizeRangeHi }, + { 0x04d0, 0x052f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0530, 0x0530, 0x0000, CanonicalizeUnique }, + { 0x0531, 0x0556, 0x0030, CanonicalizeRangeLo }, + { 0x0557, 0x0560, 0x0000, CanonicalizeUnique }, + { 0x0561, 0x0586, 0x0030, CanonicalizeRangeHi }, + { 0x0587, 0x109f, 0x0000, CanonicalizeUnique }, + { 0x10a0, 0x10c5, 0x1c60, CanonicalizeRangeLo }, + { 0x10c6, 0x10c6, 0x0000, CanonicalizeUnique }, + { 0x10c7, 0x10c7, 0x1c60, CanonicalizeRangeLo }, + { 0x10c8, 0x10cc, 0x0000, CanonicalizeUnique }, + { 0x10cd, 0x10cd, 0x1c60, CanonicalizeRangeLo }, + { 0x10ce, 0x1d78, 0x0000, CanonicalizeUnique }, + { 0x1d79, 0x1d79, 0x8a04, CanonicalizeRangeLo }, + { 0x1d7a, 0x1d7c, 0x0000, CanonicalizeUnique }, + { 0x1d7d, 0x1d7d, 0x0ee6, CanonicalizeRangeLo }, + { 0x1d7e, 0x1dff, 0x0000, CanonicalizeUnique }, + { 0x1e00, 0x1e5f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1e60, 0x1e61, 0x000e, CanonicalizeSet }, + { 0x1e62, 0x1e95, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1e96, 0x1e9a, 0x0000, CanonicalizeUnique }, + { 0x1e9b, 0x1e9b, 0x000e, CanonicalizeSet }, + { 0x1e9c, 0x1e9f, 0x0000, CanonicalizeUnique }, + { 0x1ea0, 0x1eff, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1f00, 0x1f07, 0x0008, CanonicalizeRangeLo }, + { 0x1f08, 0x1f0f, 0x0008, CanonicalizeRangeHi }, + { 0x1f10, 0x1f15, 0x0008, CanonicalizeRangeLo }, + { 0x1f16, 0x1f17, 0x0000, CanonicalizeUnique }, + { 0x1f18, 0x1f1d, 0x0008, CanonicalizeRangeHi }, + { 0x1f1e, 0x1f1f, 0x0000, CanonicalizeUnique }, + { 0x1f20, 0x1f27, 0x0008, CanonicalizeRangeLo }, + { 0x1f28, 0x1f2f, 0x0008, CanonicalizeRangeHi }, + { 0x1f30, 0x1f37, 0x0008, CanonicalizeRangeLo }, + { 0x1f38, 0x1f3f, 0x0008, CanonicalizeRangeHi }, + { 0x1f40, 0x1f45, 0x0008, CanonicalizeRangeLo }, + { 0x1f46, 0x1f47, 0x0000, CanonicalizeUnique }, + { 0x1f48, 0x1f4d, 0x0008, CanonicalizeRangeHi }, + { 0x1f4e, 0x1f50, 0x0000, CanonicalizeUnique }, + { 0x1f51, 0x1f51, 0x0008, CanonicalizeRangeLo }, + { 0x1f52, 0x1f52, 0x0000, CanonicalizeUnique }, + { 0x1f53, 0x1f53, 0x0008, CanonicalizeRangeLo }, + { 0x1f54, 0x1f54, 0x0000, CanonicalizeUnique }, + { 0x1f55, 0x1f55, 0x0008, CanonicalizeRangeLo }, + { 0x1f56, 0x1f56, 0x0000, CanonicalizeUnique }, + { 0x1f57, 0x1f57, 0x0008, CanonicalizeRangeLo }, + { 0x1f58, 0x1f58, 0x0000, CanonicalizeUnique }, + { 0x1f59, 0x1f59, 0x0008, CanonicalizeRangeHi }, + { 0x1f5a, 0x1f5a, 0x0000, CanonicalizeUnique }, + { 0x1f5b, 0x1f5b, 0x0008, CanonicalizeRangeHi }, + { 0x1f5c, 0x1f5c, 0x0000, CanonicalizeUnique }, + { 0x1f5d, 0x1f5d, 0x0008, CanonicalizeRangeHi }, + { 0x1f5e, 0x1f5e, 0x0000, CanonicalizeUnique }, + { 0x1f5f, 0x1f5f, 0x0008, CanonicalizeRangeHi }, + { 0x1f60, 0x1f67, 0x0008, CanonicalizeRangeLo }, + { 0x1f68, 0x1f6f, 0x0008, CanonicalizeRangeHi }, + { 0x1f70, 0x1f71, 0x004a, CanonicalizeRangeLo }, + { 0x1f72, 0x1f75, 0x0056, CanonicalizeRangeLo }, + { 0x1f76, 0x1f77, 0x0064, CanonicalizeRangeLo }, + { 0x1f78, 0x1f79, 0x0080, CanonicalizeRangeLo }, + { 0x1f7a, 0x1f7b, 0x0070, CanonicalizeRangeLo }, + { 0x1f7c, 0x1f7d, 0x007e, CanonicalizeRangeLo }, + { 0x1f7e, 0x1faf, 0x0000, CanonicalizeUnique }, + { 0x1fb0, 0x1fb1, 0x0008, CanonicalizeRangeLo }, + { 0x1fb2, 0x1fb7, 0x0000, CanonicalizeUnique }, + { 0x1fb8, 0x1fb9, 0x0008, CanonicalizeRangeHi }, + { 0x1fba, 0x1fbb, 0x004a, CanonicalizeRangeHi }, + { 0x1fbc, 0x1fbd, 0x0000, CanonicalizeUnique }, + { 0x1fbe, 0x1fbe, 0x0007, CanonicalizeSet }, + { 0x1fbf, 0x1fc7, 0x0000, CanonicalizeUnique }, + { 0x1fc8, 0x1fcb, 0x0056, CanonicalizeRangeHi }, + { 0x1fcc, 0x1fcf, 0x0000, CanonicalizeUnique }, + { 0x1fd0, 0x1fd1, 0x0008, CanonicalizeRangeLo }, + { 0x1fd2, 0x1fd7, 0x0000, CanonicalizeUnique }, + { 0x1fd8, 0x1fd9, 0x0008, CanonicalizeRangeHi }, + { 0x1fda, 0x1fdb, 0x0064, CanonicalizeRangeHi }, + { 0x1fdc, 0x1fdf, 0x0000, CanonicalizeUnique }, + { 0x1fe0, 0x1fe1, 0x0008, CanonicalizeRangeLo }, + { 0x1fe2, 0x1fe4, 0x0000, CanonicalizeUnique }, + { 0x1fe5, 0x1fe5, 0x0007, CanonicalizeRangeLo }, + { 0x1fe6, 0x1fe7, 0x0000, CanonicalizeUnique }, + { 0x1fe8, 0x1fe9, 0x0008, CanonicalizeRangeHi }, + { 0x1fea, 0x1feb, 0x0070, CanonicalizeRangeHi }, + { 0x1fec, 0x1fec, 0x0007, CanonicalizeRangeHi }, + { 0x1fed, 0x1ff7, 0x0000, CanonicalizeUnique }, + { 0x1ff8, 0x1ff9, 0x0080, CanonicalizeRangeHi }, + { 0x1ffa, 0x1ffb, 0x007e, CanonicalizeRangeHi }, + { 0x1ffc, 0x2131, 0x0000, CanonicalizeUnique }, + { 0x2132, 0x2132, 0x001c, CanonicalizeRangeLo }, + { 0x2133, 0x214d, 0x0000, CanonicalizeUnique }, + { 0x214e, 0x214e, 0x001c, CanonicalizeRangeHi }, + { 0x214f, 0x215f, 0x0000, CanonicalizeUnique }, + { 0x2160, 0x216f, 0x0010, CanonicalizeRangeLo }, + { 0x2170, 0x217f, 0x0010, CanonicalizeRangeHi }, + { 0x2180, 0x2182, 0x0000, CanonicalizeUnique }, + { 0x2183, 0x2184, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2185, 0x24b5, 0x0000, CanonicalizeUnique }, + { 0x24b6, 0x24cf, 0x001a, CanonicalizeRangeLo }, + { 0x24d0, 0x24e9, 0x001a, CanonicalizeRangeHi }, + { 0x24ea, 0x2bff, 0x0000, CanonicalizeUnique }, + { 0x2c00, 0x2c2e, 0x0030, CanonicalizeRangeLo }, + { 0x2c2f, 0x2c2f, 0x0000, CanonicalizeUnique }, + { 0x2c30, 0x2c5e, 0x0030, CanonicalizeRangeHi }, + { 0x2c5f, 0x2c5f, 0x0000, CanonicalizeUnique }, + { 0x2c60, 0x2c61, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2c62, 0x2c62, 0x29f7, CanonicalizeRangeHi }, + { 0x2c63, 0x2c63, 0x0ee6, CanonicalizeRangeHi }, + { 0x2c64, 0x2c64, 0x29e7, CanonicalizeRangeHi }, + { 0x2c65, 0x2c65, 0x2a2b, CanonicalizeRangeHi }, + { 0x2c66, 0x2c66, 0x2a28, CanonicalizeRangeHi }, + { 0x2c67, 0x2c6c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2c6d, 0x2c6d, 0x2a1c, CanonicalizeRangeHi }, + { 0x2c6e, 0x2c6e, 0x29fd, CanonicalizeRangeHi }, + { 0x2c6f, 0x2c6f, 0x2a1f, CanonicalizeRangeHi }, + { 0x2c70, 0x2c70, 0x2a1e, CanonicalizeRangeHi }, + { 0x2c71, 0x2c71, 0x0000, CanonicalizeUnique }, + { 0x2c72, 0x2c73, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2c74, 0x2c74, 0x0000, CanonicalizeUnique }, + { 0x2c75, 0x2c76, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2c77, 0x2c7d, 0x0000, CanonicalizeUnique }, + { 0x2c7e, 0x2c7f, 0x2a3f, CanonicalizeRangeHi }, + { 0x2c80, 0x2ce3, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2ce4, 0x2cea, 0x0000, CanonicalizeUnique }, + { 0x2ceb, 0x2cee, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2cef, 0x2cf1, 0x0000, CanonicalizeUnique }, + { 0x2cf2, 0x2cf3, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2cf4, 0x2cff, 0x0000, CanonicalizeUnique }, + { 0x2d00, 0x2d25, 0x1c60, CanonicalizeRangeHi }, + { 0x2d26, 0x2d26, 0x0000, CanonicalizeUnique }, + { 0x2d27, 0x2d27, 0x1c60, CanonicalizeRangeHi }, + { 0x2d28, 0x2d2c, 0x0000, CanonicalizeUnique }, + { 0x2d2d, 0x2d2d, 0x1c60, CanonicalizeRangeHi }, + { 0x2d2e, 0xa63f, 0x0000, CanonicalizeUnique }, + { 0xa640, 0xa66d, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa66e, 0xa67f, 0x0000, CanonicalizeUnique }, + { 0xa680, 0xa69b, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa69c, 0xa721, 0x0000, CanonicalizeUnique }, + { 0xa722, 0xa72f, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa730, 0xa731, 0x0000, CanonicalizeUnique }, + { 0xa732, 0xa76f, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa770, 0xa778, 0x0000, CanonicalizeUnique }, + { 0xa779, 0xa77c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0xa77d, 0xa77d, 0x8a04, CanonicalizeRangeHi }, + { 0xa77e, 0xa787, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa788, 0xa78a, 0x0000, CanonicalizeUnique }, + { 0xa78b, 0xa78c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0xa78d, 0xa78d, 0xa528, CanonicalizeRangeHi }, + { 0xa78e, 0xa78f, 0x0000, CanonicalizeUnique }, + { 0xa790, 0xa793, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa794, 0xa795, 0x0000, CanonicalizeUnique }, + { 0xa796, 0xa7a9, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa7aa, 0xa7aa, 0xa544, CanonicalizeRangeHi }, + { 0xa7ab, 0xa7ab, 0xa54f, CanonicalizeRangeHi }, + { 0xa7ac, 0xa7ac, 0xa54b, CanonicalizeRangeHi }, + { 0xa7ad, 0xa7ad, 0xa541, CanonicalizeRangeHi }, + { 0xa7ae, 0xa7af, 0x0000, CanonicalizeUnique }, + { 0xa7b0, 0xa7b0, 0xa512, CanonicalizeRangeHi }, + { 0xa7b1, 0xa7b1, 0xa52a, CanonicalizeRangeHi }, + { 0xa7b2, 0xff20, 0x0000, CanonicalizeUnique }, + { 0xff21, 0xff3a, 0x0020, CanonicalizeRangeLo }, + { 0xff3b, 0xff40, 0x0000, CanonicalizeUnique }, + { 0xff41, 0xff5a, 0x0020, CanonicalizeRangeHi }, + { 0xff5b, 0xffff, 0x0000, CanonicalizeUnique }, }; } } // JSC::Yarr diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js new file mode 100644 index 000000000..dc578cfec --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2012, 2016 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +function printHeader() +{ + var copyright = ( + "/*" + "\n" + + " * Copyright (C) 2012-2013, 2015-2016 Apple Inc. All rights reserved." + "\n" + + " *" + "\n" + + " * Redistribution and use in source and binary forms, with or without" + "\n" + + " * modification, are permitted provided that the following conditions" + "\n" + + " * are met:" + "\n" + + " * 1. Redistributions of source code must retain the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer." + "\n" + + " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer in the" + "\n" + + " * documentation and/or other materials provided with the distribution." + "\n" + + " *" + "\n" + + " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" + + " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" + + " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" + + " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" + + " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" + + " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" + + " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" + + " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" + + " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" + + " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" + + " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" + + " */"); + + print(copyright); + print(); + print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalize.js"); + print(); + print('#include "config.h"'); + print('#include "YarrCanonicalize.h"'); + print(); + print("namespace JSC { namespace Yarr {"); + print(); +} + +function printFooter() +{ + print("} } // JSC::Yarr"); + print(); +} + +// Helper function to convert a number to a fixed width hex representation of a UChar32. +function hex(x) +{ + var s = Number(x).toString(16); + while (s.length < 4) + s = 0 + s; + return "0x" + s; +} + +// See ES 6.0, 21.2.2.8.2 Steps 3 +function canonicalize(ch) +{ + var u = String.fromCharCode(ch).toUpperCase(); + if (u.length > 1) + return ch; + var cu = u.charCodeAt(0); + if (ch >= 128 && cu < 128) + return ch; + return cu; +} + +var MAX_UCS2 = 0xFFFF; + +function createUCS2CanonicalGroups() +{ + var groupedCanonically = []; + // Pass 1: populate groupedCanonically - this is mapping from canonicalized + // values back to the set of character code that canonicalize to them. + for (var i = 0; i <= MAX_UCS2; ++i) { + var ch = canonicalize(i); + if (!groupedCanonically[ch]) + groupedCanonically[ch] = []; + groupedCanonically[ch].push(i); + } + + return groupedCanonically; +} + +function createTables(prefix, maxValue, canonicalGroups) +{ + var prefixLower = prefix.toLowerCase(); + var prefixUpper = prefix.toUpperCase(); + var typeInfo = []; + var characterSetInfo = []; + // Pass 2: populate typeInfo & characterSetInfo. For every character calculate + // a typeInfo value, described by the types above, and a value payload. + for (cu in canonicalGroups) { + // The set of characters that canonicalize to cu + var characters = canonicalGroups[cu]; + + // If there is only one, it is unique. + if (characters.length == 1) { + typeInfo[characters[0]] = "CanonicalizeUnique:0"; + continue; + } + + // Sort the array. + characters.sort(function(x,y){return x-y;}); + + // If there are more than two characters, create an entry in characterSetInfo. + if (characters.length > 2) { + for (i in characters) + typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; + characterSetInfo.push(characters); + + continue; + } + + // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner. + var lo = characters[0]; + var hi = characters[1]; + var delta = hi - lo; + if (delta == 1) { + var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; + typeInfo[lo] = type; + typeInfo[hi] = type; + } else { + typeInfo[lo] = "CanonicalizeRangeLo:" + delta; + typeInfo[hi] = "CanonicalizeRangeHi:" + delta; + } + } + + var rangeInfo = []; + // Pass 3: coallesce types into ranges. + for (var end = 0; end <= maxValue; ++end) { + var begin = end; + var type = typeInfo[end]; + while (end < maxValue && typeInfo[end + 1] == type) + ++end; + rangeInfo.push({begin:begin, end:end, type:type}); + } + + for (i in characterSetInfo) { + var characters = "" + var set = characterSetInfo[i]; + for (var j in set) + characters += hex(set[j]) + ", "; + print("const UChar32 " + prefixLower + "CharacterSet" + i + "[] = { " + characters + "0 };"); + } + print(); + print("static const size_t " + prefixUpper + "_CANONICALIZATION_SETS = " + characterSetInfo.length + ";"); + print("const UChar32* const " + prefixLower + "CharacterSetInfo[" + prefixUpper + "_CANONICALIZATION_SETS] = {"); + for (i in characterSetInfo) + print(" " + prefixLower + "CharacterSet" + i + ","); + print("};"); + print(); + print("const size_t " + prefixUpper + "_CANONICALIZATION_RANGES = " + rangeInfo.length + ";"); + print("const CanonicalizationRange " + prefixLower + "RangeInfo[" + prefixUpper + "_CANONICALIZATION_RANGES] = {"); + for (i in rangeInfo) { + var info = rangeInfo[i]; + var typeAndValue = info.type.split(':'); + print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); + } + print("};"); + print(); +} + +printHeader(); + +createTables("UCS2", MAX_UCS2, createUCS2CanonicalGroups()); + +printFooter(); + diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp index 8645b5f20..9cf14e702 100644 --- a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013, 2016 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -27,8 +27,9 @@ #include "config.h" #include "YarrInterpreter.h" +#include "SuperSampler.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #include <wtf/BumpPointerAllocator.h> #include <wtf/DataLog.h> #include <wtf/text/CString.h> @@ -44,9 +45,11 @@ public: struct ParenthesesDisjunctionContext; struct BackTrackInfoPatternCharacter { + uintptr_t begin; // Only needed for unicode patterns uintptr_t matchAmount; }; struct BackTrackInfoCharacterClass { + uintptr_t begin; // Only needed for unicode patterns uintptr_t matchAmount; }; struct BackTrackInfoBackReference { @@ -154,7 +157,7 @@ public: ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) { - size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + static_cast<size_t>(disjunction->m_frameSize) * sizeof(uintptr_t); allocatorPool = allocatorPool->ensureCapacity(size); RELEASE_ASSERT(allocatorPool); return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); @@ -167,10 +170,11 @@ public: class InputStream { public: - InputStream(const CharType* input, unsigned start, unsigned length) + InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs) : input(input) , pos(start) , length(length) + , decodeSurrogatePairs(decodeSurrogatePairs) { } @@ -204,13 +208,40 @@ public: RELEASE_ASSERT(pos >= negativePositionOffest); unsigned p = pos - negativePositionOffest; ASSERT(p < length); - return input[p]; + int result = input[p]; + if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) { + if (atEnd()) + return -1; + + result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); + next(); + } + return result; + } + + int readSurrogatePairChecked(unsigned negativePositionOffset) + { + RELEASE_ASSERT(pos >= negativePositionOffset); + unsigned p = pos - negativePositionOffset; + ASSERT(p < length); + if (p + 1 >= length) + return -1; + + int first = input[p]; + int second = input[p + 1]; + if (U16_IS_LEAD(first) && U16_IS_TRAIL(second)) + return U16_GET_SUPPLEMENTARY(first, second); + + return -1; } int reread(unsigned from) { ASSERT(from < length); - return input[from]; + int result = input[from]; + if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1])) + result = U16_GET_SUPPLEMENTARY(result, input[from + 1]); + return result; } int prev() @@ -261,9 +292,9 @@ public: pos -= count; } - bool atStart(unsigned negativePositionOffest) + bool atStart(unsigned negativePositionOffset) { - return pos == negativePositionOffest; + return pos == negativePositionOffset; } bool atEnd(unsigned negativePositionOffest) @@ -281,11 +312,12 @@ public: const CharType* input; unsigned pos; unsigned length; + bool decodeSurrogatePairs; }; bool testCharacterClass(CharacterClass* characterClass, int ch) { - if (ch & 0xFF80) { + if (!isASCII(ch)) { for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) if (ch == characterClass->m_matchesUnicode[i]) return true; @@ -309,6 +341,11 @@ public: return testChar == input.readChecked(negativeInputOffset); } + bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset) + { + return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset); + } + bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) { int ch = input.readChecked(negativeInputOffset); @@ -328,32 +365,31 @@ public: if (!input.checkInput(matchSize)) return false; - if (pattern->m_ignoreCase) { - for (unsigned i = 0; i < matchSize; ++i) { - int oldCh = input.reread(matchBegin + i); - int ch = input.readChecked(negativeInputOffset + matchSize - i); - - if (oldCh == ch) - continue; - - // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that - // unicode values are never allowed to match against ascii ones. - if (isASCII(oldCh) || isASCII(ch)) { + for (unsigned i = 0; i < matchSize; ++i) { + int oldCh = input.reread(matchBegin + i); + int ch; + if (!U_IS_BMP(oldCh)) { + ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i); + ++i; + } else + ch = input.readChecked(negativeInputOffset + matchSize - i); + + if (oldCh == ch) + continue; + + if (pattern->ignoreCase()) { + // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode + // patterns, Unicode values are never allowed to match against ASCII ones. + // For Unicode, we need to check all canonical equivalents of a character. + if (!unicode && (isASCII(oldCh) || isASCII(ch))) { if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) continue; - } else if (areCanonicallyEquivalent(oldCh, ch)) + } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2)) continue; - - input.uncheckInput(matchSize); - return false; - } - } else { - for (unsigned i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { - input.uncheckInput(matchSize); - return false; - } } + + input.uncheckInput(matchSize); + return false; } return true; @@ -361,15 +397,15 @@ public: bool matchAssertionBOL(ByteTerm& term) { - return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); + return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); } bool matchAssertionEOL(ByteTerm& term) { if (term.inputPosition) - return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); - return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read())); } bool matchAssertionWordBoundary(ByteTerm& term) @@ -396,18 +432,18 @@ public: case QuantifierGreedy: if (backTrack->matchAmount) { --backTrack->matchAmount; - input.uncheckInput(1); + input.uncheckInput(U16_LENGTH(term.atom.patternCharacter)); return true; } break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) return true; } - input.uncheckInput(backTrack->matchAmount); + input.setPos(backTrack->begin); break; } @@ -431,7 +467,7 @@ public: break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) return true; @@ -446,11 +482,24 @@ public: bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: { - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (unicode) { + backTrack->begin = input.getPos(); + unsigned matchAmount = 0; + for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) { + input.setPos(backTrack->begin); + return false; + } + } + + return true; + } + + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) return false; } @@ -458,13 +507,16 @@ public: } case QuantifierGreedy: { + unsigned position = input.getPos(); + backTrack->begin = position; unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { - input.uncheckInput(1); + input.setPos(position); break; } ++matchAmount; + position = input.getPos(); } backTrack->matchAmount = matchAmount; @@ -472,6 +524,7 @@ public: } case QuantifierNonGreedy: + backTrack->begin = input.getPos(); backTrack->matchAmount = 0; return true; } @@ -483,14 +536,28 @@ public: bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: + if (unicode) + input.setPos(backTrack->begin); break; case QuantifierGreedy: if (backTrack->matchAmount) { + if (unicode) { + // Rematch one less match + input.setPos(backTrack->begin); + --backTrack->matchAmount; + for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { + input.uncheckInput(1); + break; + } + } + return true; + } --backTrack->matchAmount; input.uncheckInput(1); return true; @@ -498,12 +565,12 @@ public: break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) return true; } - input.uncheckInput(backTrack->matchAmount); + input.setPos(backTrack->begin); break; } @@ -535,7 +602,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: { backTrack->begin = input.getPos(); - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { input.setPos(backTrack->begin); return false; @@ -546,7 +613,7 @@ public: case QuantifierGreedy: { unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) ++matchAmount; backTrack->matchAmount = matchAmount; return true; @@ -580,7 +647,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: - // for quantityCount == 1, could rewind. + // for quantityMaxCount == 1, could rewind. input.setPos(backTrack->begin); break; @@ -593,7 +660,7 @@ public: break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { ++backTrack->matchAmount; return true; } @@ -608,8 +675,8 @@ public: { if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; - output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition; } } void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) @@ -641,7 +708,7 @@ public: bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); @@ -671,11 +738,11 @@ public: bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition; } if (term.atom.quantityType == QuantifierFixedCount) @@ -688,7 +755,7 @@ public: bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); @@ -718,7 +785,7 @@ public: bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); @@ -739,7 +806,7 @@ public: ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); unsigned subpatternId = term.atom.subpatternId; - output[subpatternId << 1] = input.getPos() + term.inputPosition; + output[subpatternId << 1] = input.getPos() - term.inputPosition; } context->term -= term.atom.parenthesesWidth; return true; @@ -756,7 +823,7 @@ public: { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(term.atom.quantityMaxCount == quantifyInfinite); ASSERT(!term.capture()); BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); @@ -773,7 +840,7 @@ public: if (backTrack->begin == input.getPos()) return false; - // Successful match! Okay, what's next? - loop around and try to match moar! + // Successful match! Okay, what's next? - loop around and try to match more! context->term -= (term.atom.parenthesesWidth + 1); return true; } @@ -782,7 +849,7 @@ public: { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(term.atom.quantityMaxCount == quantifyInfinite); ASSERT(!term.capture()); // If we backtrack to this point, we have failed to match this iteration of the parens. @@ -802,7 +869,7 @@ public: bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); @@ -813,7 +880,7 @@ public: bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); @@ -831,7 +898,7 @@ public: bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); // We've failed to match parens; if they are inverted, this is win! if (term.invert()) { @@ -845,7 +912,7 @@ public: bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); @@ -865,36 +932,45 @@ public: backTrack->matchAmount = 0; backTrack->lastContext = 0; - switch (term.atom.quantityType) { - case QuantifierFixedCount: { + ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount); + + unsigned minimumMatchCount = term.atom.quantityMinCount; + JSRegExpResult fixedMatchResult; + + // Handle fixed matches and the minimum part of a variable length match. + if (minimumMatchCount) { // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < minimumMatchCount) { // Try to do a match, and it it succeeds, add it to the list. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) + fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (fixedMatchResult == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. resetMatches(term, context); freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; + + if (fixedMatchResult != JSRegExpNoMatch) + return fixedMatchResult; JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); if (backtrackResult != JSRegExpMatch) return backtrackResult; } } - ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); + } + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); return JSRegExpMatch; } case QuantifierGreedy: { - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < term.atom.quantityMaxCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); if (result == JSRegExpMatch) @@ -944,7 +1020,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: { - ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); ParenthesesDisjunctionContext* context = 0; JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); @@ -953,7 +1029,7 @@ public: return result; // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < term.atom.quantityMaxCount) { // Try to do a match, and it it succeeds, add it to the list. context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); @@ -973,7 +1049,7 @@ public: } } - ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); context = backTrack->lastContext; recordParenthesesMatch(term, context); return JSRegExpMatch; @@ -986,7 +1062,7 @@ public: ParenthesesDisjunctionContext* context = backTrack->lastContext; JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); if (result == JSRegExpMatch) { - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < term.atom.quantityMaxCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); if (parenthesesResult == JSRegExpMatch) @@ -1019,7 +1095,7 @@ public: case QuantifierNonGreedy: { // If we've not reached the limit, try to add one more match. - if (backTrack->matchAmount < term.atom.quantityCount) { + if (backTrack->matchAmount < term.atom.quantityMaxCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); if (result == JSRegExpMatch) { @@ -1089,7 +1165,7 @@ public: if (((matchBegin && term.anchors.m_bol) || ((matchEnd != input.end()) && term.anchors.m_eol)) - && !pattern->m_multiline) + && !pattern->multiline()) return false; context->matchBegin = matchBegin; @@ -1154,21 +1230,37 @@ public: case ByteTerm::TypePatternCharacterOnce: case ByteTerm::TypePatternCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) + if (unicode) { + if (!U_IS_BMP(currentTerm().atom.patternCharacter)) { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { + if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) { + BACKTRACK(); + } + } + MATCH_NEXT(); + } + } + unsigned position = input.getPos(); // May need to back out reading a surrogate pair. + + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) { + input.setPos(position); BACKTRACK(); + } } MATCH_NEXT(); } case ByteTerm::TypePatternCharacterGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + unsigned position = input.getPos(); // May need to back out reading a surrogate pair. + while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { - input.uncheckInput(1); + input.setPos(position); break; } ++matchAmount; + position = input.getPos(); } backTrack->matchAmount = matchAmount; @@ -1176,13 +1268,29 @@ public: } case ByteTerm::TypePatternCharacterNonGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->begin = input.getPos(); backTrack->matchAmount = 0; MATCH_NEXT(); } case ByteTerm::TypePatternCasedCharacterOnce: case ByteTerm::TypePatternCasedCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (unicode) { + // Case insensitive matching of unicode characters is handled as TypeCharacterClass. + ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter)); + + unsigned position = input.getPos(); // May need to back out reading a surrogate pair. + + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) { + input.setPos(position); + BACKTRACK(); + } + } + MATCH_NEXT(); + } + + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) BACKTRACK(); } @@ -1190,8 +1298,12 @@ public: } case ByteTerm::TypePatternCasedCharacterGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + + // Case insensitive matching of unicode characters is handled as TypeCharacterClass. + ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); + unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { input.uncheckInput(1); break; @@ -1204,6 +1316,10 @@ public: } case ByteTerm::TypePatternCasedCharacterNonGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + + // Case insensitive matching of unicode characters is handled as TypeCharacterClass. + ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); + backTrack->matchAmount = 0; MATCH_NEXT(); } @@ -1285,7 +1401,7 @@ public: if (offset > 0) MATCH_NEXT(); - if (input.atEnd()) + if (input.atEnd() || pattern->sticky()) return JSRegExpNoMatch; input.next(); @@ -1415,6 +1531,9 @@ public: if (!input.isAvailableInput(0)) return offsetNoMatch; + if (pattern->m_lock) + pattern->m_lock->lock(); + for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) output[i << 1] = offsetNoMatch; @@ -1434,13 +1553,18 @@ public: pattern->m_allocator->stopAllocator(); ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch)); + + if (pattern->m_lock) + pattern->m_lock->unlock(); + return output[0]; } Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) : pattern(pattern) + , unicode(pattern->unicode()) , output(output) - , input(input, start, length) + , input(input, start, length, pattern->unicode()) , allocatorPool(0) , remainingMatchCount(matchLimit) { @@ -1448,6 +1572,7 @@ public: private: BytecodePattern* pattern; + bool unicode; unsigned* output; InputStream input; BumpPointerPool* allocatorPool; @@ -1472,13 +1597,13 @@ public: m_currentAlternativeIndex = 0; } - PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) + std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock) { regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); emitDisjunction(m_pattern.m_body); regexEnd(); - return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); + return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock); } void checkInput(unsigned count) @@ -1506,40 +1631,37 @@ public: m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); } - void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { - if (m_pattern.m_ignoreCase) { - ASSERT(u_tolower(ch) <= 0xFFFF); - ASSERT(u_toupper(ch) <= 0xFFFF); - - UChar lo = u_tolower(ch); - UChar hi = u_toupper(ch); + if (m_pattern.ignoreCase()) { + UChar32 lo = u_tolower(ch); + UChar32 hi = u_toupper(ch); if (lo != hi) { - m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType)); return; } } - m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType)); } - void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } - void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { ASSERT(subpatternId); m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } @@ -1600,7 +1722,7 @@ public: m_currentAlternativeIndex = beginTerm + 1; } - void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1616,9 +1738,9 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } @@ -1698,7 +1820,7 @@ public: m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } - void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1712,7 +1834,7 @@ public: unsigned subpatternId = parenthesesBegin.atom.subpatternId; unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - OwnPtr<ByteDisjunction> parenthesesDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize); unsigned firstTermInParentheses = beginTerm + 1; parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); @@ -1725,14 +1847,15 @@ public: m_bodyDisjunction->terms.shrink(beginTerm); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); - m_allParenthesesInfo.append(parenthesesDisjunction.release()); + m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction)); - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; } - void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1748,13 +1871,15 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } - void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1770,15 +1895,17 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) { - m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize); m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); m_bodyDisjunction->terms[0].frameLocation = 0; m_currentAlternativeIndex = 0; @@ -1830,9 +1957,7 @@ public: currentCountAlreadyChecked += countToCheck; } - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - + for (auto& term : alternative->m_terms) { switch (term.type) { case PatternTerm::TypeAssertionBOL: assertionBOL(currentCountAlreadyChecked - term.inputPosition); @@ -1847,15 +1972,15 @@ public: break; case PatternTerm::TypePatternCharacter: - atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); break; case PatternTerm::TypeCharacterClass: - atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); break; case PatternTerm::TypeBackReference: - atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); break; case PatternTerm::TypeForwardReference: @@ -1863,27 +1988,30 @@ public: case PatternTerm::TypeParenthesesSubpattern: { unsigned disjunctionAlreadyCheckedCount = 0; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { unsigned alternativeFrameLocation = term.frameLocation; // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. if (term.quantityType == QuantifierFixedCount) disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; else alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); } else if (term.parentheses.isTerminal) { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } break; } @@ -1891,8 +2019,8 @@ public: case PatternTerm::TypeParentheticalAssertion: { unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; - ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); - unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; unsigned uncheckAmount = 0; if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; @@ -1902,7 +2030,7 @@ public: atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount); - atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType); if (uncheckAmount) { checkInput(uncheckAmount); currentCountAlreadyChecked += uncheckAmount; @@ -1920,19 +2048,20 @@ public: private: YarrPattern& m_pattern; - OwnPtr<ByteDisjunction> m_bodyDisjunction; + std::unique_ptr<ByteDisjunction> m_bodyDisjunction; unsigned m_currentAlternativeIndex; Vector<ParenthesesStackEntry> m_parenthesesStack; - Vector<OwnPtr<ByteDisjunction>> m_allParenthesesInfo; + Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; }; -PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) +std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) { - return ByteCompiler(pattern).compile(allocator); + return ByteCompiler(pattern).compile(allocator, lock); } unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output) { + SuperSamplerScope superSamplerScope(false); if (input.is8Bit()) return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); @@ -1940,11 +2069,13 @@ unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned star unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) { + SuperSamplerScope superSamplerScope(false); return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); } unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) { + SuperSamplerScope superSamplerScope(false); return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); } diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.h b/Source/JavaScriptCore/yarr/YarrInterpreter.h index f37309436..43dcb1f40 100644 --- a/Source/JavaScriptCore/yarr/YarrInterpreter.h +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,12 +23,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrInterpreter_h -#define YarrInterpreter_h +#pragma once +#include "ConcurrentJSLock.h" #include "YarrPattern.h" -#include <wtf/PassOwnPtr.h> -#include <wtf/unicode/Unicode.h> namespace WTF { class BumpPointerAllocator; @@ -76,10 +74,10 @@ struct ByteTerm { union { struct { union { - UChar patternCharacter; + UChar32 patternCharacter; struct { - UChar lo; - UChar hi; + UChar32 lo; + UChar32 hi; } casedCharacter; CharacterClass* characterClass; unsigned subpatternId; @@ -89,7 +87,8 @@ struct ByteTerm { unsigned parenthesesWidth; }; QuantifierType quantityType; - unsigned quantityCount; + unsigned quantityMinCount; + unsigned quantityMaxCount; } atom; struct { int next; @@ -107,11 +106,17 @@ struct ByteTerm { bool m_invert : 1; unsigned inputPosition; - ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) : frameLocation(frameLocation) , m_capture(false) , m_invert(false) { + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityMinCount = quantityCount.unsafeGet(); + atom.quantityMaxCount = quantityCount.unsafeGet(); + inputPosition = inputPos; + switch (quantityType) { case QuantifierFixedCount: type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; @@ -123,14 +128,9 @@ struct ByteTerm { type = ByteTerm::TypePatternCharacterNonGreedy; break; } - - atom.patternCharacter = ch; - atom.quantityType = quantityType; - atom.quantityCount = quantityCount.unsafeGet(); - inputPosition = inputPos; } - ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) : frameLocation(frameLocation) , m_capture(false) , m_invert(false) @@ -150,22 +150,24 @@ struct ByteTerm { atom.casedCharacter.lo = lo; atom.casedCharacter.hi = hi; atom.quantityType = quantityType; - atom.quantityCount = quantityCount.unsafeGet(); + atom.quantityMinCount = quantityCount.unsafeGet(); + atom.quantityMaxCount = quantityCount.unsafeGet(); inputPosition = inputPos; } - ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos) : type(ByteTerm::TypeCharacterClass) , m_capture(false) , m_invert(invert) { atom.characterClass = characterClass; atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; inputPosition = inputPos; } - ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos) : type(type) , m_capture(capture) , m_invert(false) @@ -173,7 +175,8 @@ struct ByteTerm { atom.subpatternId = subpatternId; atom.parenthesesDisjunction = parenthesesInfo; atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; inputPosition = inputPos; } @@ -183,21 +186,23 @@ struct ByteTerm { , m_invert(invert) { atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; } - ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos) : type(type) , m_capture(capture) , m_invert(invert) { atom.subpatternId = subpatternId; atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; inputPosition = inputPos; } - static ByteTerm BOL(int inputPos) + static ByteTerm BOL(unsigned inputPos) { ByteTerm term(TypeAssertionBOL); term.inputPosition = inputPos; @@ -218,21 +223,21 @@ struct ByteTerm { return term; } - static ByteTerm EOL(int inputPos) + static ByteTerm EOL(unsigned inputPos) { ByteTerm term(TypeAssertionEOL); term.inputPosition = inputPos; return term; } - static ByteTerm WordBoundary(bool invert, int inputPos) + static ByteTerm WordBoundary(bool invert, unsigned inputPos) { ByteTerm term(TypeAssertionWordBoundary, invert); term.inputPosition = inputPos; return term; } - static ByteTerm BackReference(unsigned subpatternId, int inputPos) + static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos) { return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); } @@ -329,6 +334,8 @@ public: { } + size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); } + Vector<ByteTerm> terms; unsigned m_numSubpatterns; unsigned m_frameSize; @@ -337,16 +344,19 @@ public: struct BytecodePattern { WTF_MAKE_FAST_ALLOCATED; public: - BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<OwnPtr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator) - : m_body(body) - , m_ignoreCase(pattern.m_ignoreCase) - , m_multiline(pattern.m_multiline) + BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) + : m_body(WTFMove(body)) + , m_flags(pattern.m_flags) , m_allocator(allocator) + , m_lock(lock) { m_body->terms.shrinkToFit(); newlineCharacterClass = pattern.newlineCharacterClass(); - wordcharCharacterClass = pattern.wordcharCharacterClass(); + if (unicode() && ignoreCase()) + wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass(); + else + wordcharCharacterClass = pattern.wordcharCharacterClass(); m_allParenthesesInfo.swap(parenthesesInfoToAdopt); m_allParenthesesInfo.shrinkToFit(); @@ -355,26 +365,31 @@ public: m_userCharacterClasses.shrinkToFit(); } - OwnPtr<ByteDisjunction> m_body; - bool m_ignoreCase; - bool m_multiline; + size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); } + + bool ignoreCase() const { return m_flags & FlagIgnoreCase; } + bool multiline() const { return m_flags & FlagMultiline; } + bool sticky() const { return m_flags & FlagSticky; } + bool unicode() const { return m_flags & FlagUnicode; } + + std::unique_ptr<ByteDisjunction> m_body; + RegExpFlags m_flags; // Each BytecodePattern is associated with a RegExp, each RegExp is associated // with a VM. Cache a pointer to out VM's m_regExpAllocator. BumpPointerAllocator* m_allocator; + ConcurrentJSLock* m_lock; CharacterClass* newlineCharacterClass; CharacterClass* wordcharCharacterClass; private: - Vector<OwnPtr<ByteDisjunction>> m_allParenthesesInfo; - Vector<OwnPtr<CharacterClass>> m_userCharacterClasses; + Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; + Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; }; -JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); +JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr); JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); } } // namespace JSC::Yarr - -#endif // YarrInterpreter_h diff --git a/Source/JavaScriptCore/yarr/YarrJIT.cpp b/Source/JavaScriptCore/yarr/YarrJIT.cpp index 364a72dd8..26a22bd57 100644 --- a/Source/JavaScriptCore/yarr/YarrJIT.cpp +++ b/Source/JavaScriptCore/yarr/YarrJIT.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013, 2015-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,7 +30,7 @@ #include "LinkBuffer.h" #include "Options.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #if ENABLE(YARR_JIT) @@ -75,17 +75,6 @@ class YarrGenerator : private MacroAssembler { static const RegisterID returnRegister = MIPSRegisters::v0; static const RegisterID returnRegister2 = MIPSRegisters::v1; -#elif CPU(SH4) - static const RegisterID input = SH4Registers::r4; - static const RegisterID index = SH4Registers::r5; - static const RegisterID length = SH4Registers::r6; - static const RegisterID output = SH4Registers::r7; - - static const RegisterID regT0 = SH4Registers::r0; - static const RegisterID regT1 = SH4Registers::r1; - - static const RegisterID returnRegister = SH4Registers::r0; - static const RegisterID returnRegister2 = SH4Registers::r1; #elif CPU(X86) static const RegisterID input = X86Registers::eax; static const RegisterID index = X86Registers::edx; @@ -140,7 +129,7 @@ class YarrGenerator : private MacroAssembler { } } - void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar32* matches, unsigned matchCount) { do { // pick which range we're going to generate @@ -200,15 +189,15 @@ class YarrGenerator : private MacroAssembler { if (charClass->m_matchesUnicode.size()) { for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { - UChar ch = charClass->m_matchesUnicode[i]; + UChar32 ch = charClass->m_matchesUnicode[i]; matchDest.append(branch32(Equal, character, Imm32(ch))); } } if (charClass->m_rangesUnicode.size()) { for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { - UChar lo = charClass->m_rangesUnicode[i].begin; - UChar hi = charClass->m_rangesUnicode[i].end; + UChar32 lo = charClass->m_rangesUnicode[i].begin; + UChar32 hi = charClass->m_rangesUnicode[i].end; Jump below = branch32(LessThan, character, Imm32(lo)); matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); @@ -234,7 +223,7 @@ class YarrGenerator : private MacroAssembler { for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { char ch = charClass->m_matches[i]; - if (m_pattern.m_ignoreCase) { + if (m_pattern.ignoreCase()) { if (isASCIILower(ch)) { matchesAZaz.append(ch); continue; @@ -285,29 +274,62 @@ class YarrGenerator : private MacroAssembler { return branch32(NotEqual, index, length); } - Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character) + BaseIndex negativeOffsetIndexedAddress(Checked<unsigned> negativeCharacterOffset, RegisterID tempReg, RegisterID indexReg = index) { - readCharacter(inputPosition, character); - - // For case-insesitive compares, non-ascii characters that have different - // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - or32(TrustedImm32(0x20), character); - ch |= 0x20; + RegisterID base = input; + + // BaseIndex() addressing can take a int32_t offset. Given that we can have a regular + // expression that has unsigned character offsets, BaseIndex's signed offset is insufficient + // for addressing in extreme cases where we might underflow. Therefore we check to see if + // negativeCharacterOffset will underflow directly or after converting for 16 bit characters. + // If so, we do our own address calculating by adjusting the base, using the result register + // as a temp address register. + unsigned maximumNegativeOffsetForCharacterSize = m_charSize == Char8 ? 0x7fffffff : 0x3fffffff; + unsigned offsetAdjustAmount = 0x40000000; + if (negativeCharacterOffset.unsafeGet() > maximumNegativeOffsetForCharacterSize) { + base = tempReg; + move(input, base); + while (negativeCharacterOffset.unsafeGet() > maximumNegativeOffsetForCharacterSize) { + subPtr(TrustedImm32(offsetAdjustAmount), base); + if (m_charSize != Char8) + subPtr(TrustedImm32(offsetAdjustAmount), base); + negativeCharacterOffset -= offsetAdjustAmount; + } } - return branch32(NotEqual, character, Imm32(ch)); + Checked<int32_t> characterOffset(-static_cast<int32_t>(negativeCharacterOffset.unsafeGet())); + + if (m_charSize == Char8) + return BaseIndex(input, indexReg, TimesOne, (characterOffset * static_cast<int32_t>(sizeof(char))).unsafeGet()); + + return BaseIndex(input, indexReg, TimesTwo, (characterOffset * static_cast<int32_t>(sizeof(UChar))).unsafeGet()); } - void readCharacter(int inputPosition, RegisterID reg) + void readCharacter(Checked<unsigned> negativeCharacterOffset, RegisterID resultReg, RegisterID indexReg = index) { + BaseIndex address = negativeOffsetIndexedAddress(negativeCharacterOffset, resultReg, indexReg); + if (m_charSize == Char8) - load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg); + load8(address, resultReg); else - load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + load16Unaligned(address, resultReg); } + Jump jumpIfCharNotEquals(UChar32 ch, Checked<unsigned> negativeCharacterOffset, RegisterID character) + { + readCharacter(negativeCharacterOffset, character); + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) { + or32(TrustedImm32(0x20), character); + ch |= 0x20; + } + + return branch32(NotEqual, character, Imm32(ch)); + } + void storeToFrame(RegisterID reg, unsigned frameLocation) { poke(reg, frameLocation); @@ -356,6 +378,13 @@ class YarrGenerator : private MacroAssembler { addPtr(Imm32(alignCallFrameSizeInBytes(callFrameSize)), stackPointerRegister); } + void generateFailReturn() + { + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); + generateReturn(); + } + // Used to record subpatters, should only be called if compileMode is IncludeSubpatterns. void setSubpatternStart(RegisterID reg, unsigned subpattern) { @@ -424,7 +453,7 @@ class YarrGenerator : private MacroAssembler { OpSimpleNestedAlternativeBegin, OpSimpleNestedAlternativeNext, OpSimpleNestedAlternativeEnd, - // Used to wrap 'Once' subpattern matches (quantityCount == 1). + // Used to wrap 'Once' subpattern matches (quantityMaxCount == 1). OpParenthesesSubpatternOnceBegin, OpParenthesesSubpatternOnceEnd, // Used to wrap 'Terminal' subpattern matches (at the end of the regexp). @@ -486,9 +515,9 @@ class YarrGenerator : private MacroAssembler { bool m_isDeadCode; // Currently used in the case of some of the more complex management of - // 'm_checked', to cache the offset used in this alternative, to avoid + // 'm_checkedOffset', to cache the offset used in this alternative, to avoid // recalculating it. - int m_checkAdjust; + Checked<unsigned> m_checkAdjust; // Used by OpNestedAlternativeNext/End to hold the pointer to the // value that will be pushed into the pattern's frame to return to, @@ -633,14 +662,14 @@ class YarrGenerator : private MacroAssembler { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - if (m_pattern.m_multiline) { + if (m_pattern.multiline()) { const RegisterID character = regT0; JumpList matchDest; if (!term->inputPosition) - matchDest.append(branch32(Equal, index, Imm32(m_checked))); + matchDest.append(branch32(Equal, index, Imm32(m_checkedOffset.unsafeGet()))); - readCharacter((term->inputPosition - m_checked) - 1, character); + readCharacter(m_checkedOffset - term->inputPosition + 1, character); matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); op.m_jumps.append(jump()); @@ -650,7 +679,7 @@ class YarrGenerator : private MacroAssembler { if (term->inputPosition) op.m_jumps.append(jump()); else - op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked))); + op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checkedOffset.unsafeGet()))); } } void backtrackAssertionBOL(size_t opIndex) @@ -663,20 +692,20 @@ class YarrGenerator : private MacroAssembler { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - if (m_pattern.m_multiline) { + if (m_pattern.multiline()) { const RegisterID character = regT0; JumpList matchDest; - if (term->inputPosition == m_checked) + if (term->inputPosition == m_checkedOffset.unsafeGet()) matchDest.append(atEndOfInput()); - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); op.m_jumps.append(jump()); matchDest.link(this); } else { - if (term->inputPosition == m_checked) + if (term->inputPosition == m_checkedOffset.unsafeGet()) op.m_jumps.append(notAtEndOfInput()); // Erk, really should poison out these alternatives early. :-/ else @@ -696,10 +725,10 @@ class YarrGenerator : private MacroAssembler { const RegisterID character = regT0; - if (term->inputPosition == m_checked) + if (term->inputPosition == m_checkedOffset.unsafeGet()) nextIsNotWordChar.append(atEndOfInput()); - readCharacter((term->inputPosition - m_checked), character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); } @@ -713,8 +742,8 @@ class YarrGenerator : private MacroAssembler { Jump atBegin; JumpList matchDest; if (!term->inputPosition) - atBegin = branch32(Equal, index, Imm32(m_checked)); - readCharacter((term->inputPosition - m_checked) - 1, character); + atBegin = branch32(Equal, index, Imm32(m_checkedOffset.unsafeGet())); + readCharacter(m_checkedOffset - term->inputPosition + 1, character); matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); if (!term->inputPosition) atBegin.link(this); @@ -766,7 +795,7 @@ class YarrGenerator : private MacroAssembler { YarrOp* nextOp = &m_ops[opIndex + 1]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; if ((ch > 0xff) && (m_charSize == Char8)) { // Have a 16 bit pattern character and an 8 bit string - short circuit @@ -775,21 +804,21 @@ class YarrGenerator : private MacroAssembler { } const RegisterID character = regT0; - int maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; + unsigned maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; unsigned ignoreCaseMask = 0; #if CPU(BIG_ENDIAN) int allCharacters = ch << (m_charSize == Char8 ? 24 : 16); #else int allCharacters = ch; #endif - int numberCharacters; - int startTermPosition = term->inputPosition; + unsigned numberCharacters; + unsigned startTermPosition = term->inputPosition; // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) #if CPU(BIG_ENDIAN) ignoreCaseMask |= 32 << (m_charSize == Char8 ? 24 : 16); #else @@ -801,7 +830,7 @@ class YarrGenerator : private MacroAssembler { if (nextTerm->type != PatternTerm::TypePatternCharacter || nextTerm->quantityType != QuantifierFixedCount - || nextTerm->quantityCount != 1 + || nextTerm->quantityMaxCount != 1 || nextTerm->inputPosition != (startTermPosition + numberCharacters)) break; @@ -813,7 +842,7 @@ class YarrGenerator : private MacroAssembler { int shiftAmount = (m_charSize == Char8 ? 8 : 16) * numberCharacters; #endif - UChar currentCharacter = nextTerm->patternCharacter; + UChar32 currentCharacter = nextTerm->patternCharacter; if ((currentCharacter > 0xff) && (m_charSize == Char8)) { // Have a 16 bit pattern character and an 8 bit string - short circuit @@ -823,47 +852,43 @@ class YarrGenerator : private MacroAssembler { // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter)); + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter)); allCharacters |= (currentCharacter << shiftAmount); - if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(currentCharacter))) + if ((m_pattern.ignoreCase()) && (isASCIIAlpha(currentCharacter))) ignoreCaseMask |= 32 << shiftAmount; } if (m_charSize == Char8) { switch (numberCharacters) { case 1: - op.m_jumps.append(jumpIfCharNotEquals(ch, startTermPosition - m_checked, character)); + op.m_jumps.append(jumpIfCharNotEquals(ch, m_checkedOffset - startTermPosition, character)); return; case 2: { - BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load16Unaligned(address, character); + load16Unaligned(negativeOffsetIndexedAddress(m_checkedOffset - startTermPosition, character), character); break; } case 3: { - BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load16Unaligned(highAddress, character); + load16Unaligned(negativeOffsetIndexedAddress(m_checkedOffset - startTermPosition, character), character); if (ignoreCaseMask) or32(Imm32(ignoreCaseMask), character); op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask))); - op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, startTermPosition + 2 - m_checked, character)); + op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, m_checkedOffset - startTermPosition - 2, character)); return; } case 4: { - BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load32WithUnalignedHalfWords(address, character); + load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(m_checkedOffset- startTermPosition, character), character); break; } } } else { switch (numberCharacters) { case 1: - op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + op.m_jumps.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); return; case 2: - BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar)); - load32WithUnalignedHalfWords(address, character); + load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(m_checkedOffset- term->inputPosition, character), character); break; } } @@ -882,26 +907,20 @@ class YarrGenerator : private MacroAssembler { { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; const RegisterID character = regT0; const RegisterID countRegister = regT1; move(index, countRegister); - sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + sub32(Imm32(term->quantityMaxCount.unsafeGet()), countRegister); Label loop(this); - BaseIndex address(input, countRegister, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(m_charSize == Char8 ? sizeof(char) : sizeof(UChar))).unsafeGet()); - - if (m_charSize == Char8) - load8(address, character); - else - load16(address, character); - + readCharacter(m_checkedOffset - term->inputPosition - term->quantityMaxCount, character, countRegister); // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) { or32(TrustedImm32(0x20), character); ch |= 0x20; } @@ -919,7 +938,7 @@ class YarrGenerator : private MacroAssembler { { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; const RegisterID character = regT0; const RegisterID countRegister = regT1; @@ -931,14 +950,14 @@ class YarrGenerator : private MacroAssembler { JumpList failures; Label loop(this); failures.append(atEndOfInput()); - failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + failures.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); - if (term->quantityCount == quantifyInfinite) + if (term->quantityMaxCount == quantifyInfinite) jump(loop); else - branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + branch32(NotEqual, countRegister, Imm32(term->quantityMaxCount.unsafeGet())).linkTo(loop, this); failures.link(this); } @@ -977,7 +996,7 @@ class YarrGenerator : private MacroAssembler { { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; const RegisterID character = regT0; const RegisterID countRegister = regT1; @@ -990,9 +1009,9 @@ class YarrGenerator : private MacroAssembler { if (!((ch > 0xff) && (m_charSize == Char8))) { JumpList nonGreedyFailures; nonGreedyFailures.append(atEndOfInput()); - if (term->quantityCount != quantifyInfinite) - nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); - nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + if (term->quantityMaxCount != quantifyInfinite) + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityMaxCount.unsafeGet()))); + nonGreedyFailures.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); @@ -1013,7 +1032,7 @@ class YarrGenerator : private MacroAssembler { const RegisterID character = regT0; JumpList matchDest; - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, matchDest, term->characterClass); if (term->invert()) @@ -1037,14 +1056,11 @@ class YarrGenerator : private MacroAssembler { const RegisterID countRegister = regT1; move(index, countRegister); - sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + sub32(Imm32(term->quantityMaxCount.unsafeGet()), countRegister); Label loop(this); JumpList matchDest; - if (m_charSize == Char8) - load8(BaseIndex(input, countRegister, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character); - else - load16(BaseIndex(input, countRegister, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character); + readCharacter(m_checkedOffset - term->inputPosition - term->quantityMaxCount, character, countRegister); matchCharacterClass(character, matchDest, term->characterClass); if (term->invert()) @@ -1077,11 +1093,11 @@ class YarrGenerator : private MacroAssembler { failures.append(atEndOfInput()); if (term->invert()) { - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, failures, term->characterClass); } else { JumpList matchDest; - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, matchDest, term->characterClass); failures.append(jump()); matchDest.link(this); @@ -1089,8 +1105,8 @@ class YarrGenerator : private MacroAssembler { add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); - if (term->quantityCount != quantifyInfinite) { - branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + if (term->quantityMaxCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term->quantityMaxCount.unsafeGet())).linkTo(loop, this); failures.append(jump()); } else jump(loop); @@ -1142,10 +1158,10 @@ class YarrGenerator : private MacroAssembler { loadFromFrame(term->frameLocation, countRegister); nonGreedyFailures.append(atEndOfInput()); - nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityMaxCount.unsafeGet()))); JumpList matchDest; - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, matchDest, term->characterClass); if (term->invert()) @@ -1195,7 +1211,7 @@ class YarrGenerator : private MacroAssembler { add32(TrustedImm32(1), matchPos); // Advance past newline saveStartIndex.link(this); - if (!m_pattern.m_multiline && term->anchors.bolAnchor) + if (!m_pattern.multiline() && term->anchors.bolAnchor) op.m_jumps.append(branchTest32(NonZero, matchPos)); ASSERT(!m_pattern.m_body->m_hasFixedSize); @@ -1215,7 +1231,7 @@ class YarrGenerator : private MacroAssembler { foundEndingNewLine.link(this); - if (!m_pattern.m_multiline && term->anchors.eolAnchor) + if (!m_pattern.multiline() && term->anchors.eolAnchor) op.m_jumps.append(branch32(NotEqual, matchPos, length)); move(matchPos, index); @@ -1238,7 +1254,7 @@ class YarrGenerator : private MacroAssembler { case PatternTerm::TypePatternCharacter: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) generatePatternCharacterOnce(opIndex); else generatePatternCharacterFixed(opIndex); @@ -1255,7 +1271,7 @@ class YarrGenerator : private MacroAssembler { case PatternTerm::TypeCharacterClass: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) generateCharacterClassOnce(opIndex); else generateCharacterClassFixed(opIndex); @@ -1304,7 +1320,7 @@ class YarrGenerator : private MacroAssembler { case PatternTerm::TypePatternCharacter: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) backtrackPatternCharacterOnce(opIndex); else backtrackPatternCharacterFixed(opIndex); @@ -1321,7 +1337,7 @@ class YarrGenerator : private MacroAssembler { case PatternTerm::TypeCharacterClass: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) backtrackCharacterClassOnce(opIndex); else backtrackCharacterClassFixed(opIndex); @@ -1410,7 +1426,7 @@ class YarrGenerator : private MacroAssembler { // set as appropriate to this alternative. op.m_reentry = label(); - m_checked += alternative->m_minimumSize; + m_checkedOffset += alternative->m_minimumSize; break; } case OpBodyAlternativeNext: @@ -1463,8 +1479,8 @@ class YarrGenerator : private MacroAssembler { } if (op.m_op == OpBodyAlternativeNext) - m_checked += alternative->m_minimumSize; - m_checked -= priorAlternative->m_minimumSize; + m_checkedOffset += alternative->m_minimumSize; + m_checkedOffset -= priorAlternative->m_minimumSize; break; } @@ -1491,13 +1507,13 @@ class YarrGenerator : private MacroAssembler { PatternDisjunction* disjunction = term->parentheses.disjunction; // Calculate how much input we need to check for, and if non-zero check. - op.m_checkAdjust = alternative->m_minimumSize; + op.m_checkAdjust = Checked<unsigned>(alternative->m_minimumSize); if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) op.m_checkAdjust -= disjunction->m_minimumSize; if (op.m_checkAdjust) - op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust.unsafeGet())); - m_checked += op.m_checkAdjust; + m_checkedOffset += op.m_checkAdjust; break; } case OpSimpleNestedAlternativeNext: @@ -1545,11 +1561,11 @@ class YarrGenerator : private MacroAssembler { if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) op.m_checkAdjust -= disjunction->m_minimumSize; if (op.m_checkAdjust) - op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust.unsafeGet())); YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked -= lastOp.m_checkAdjust; - m_checked += op.m_checkAdjust; + m_checkedOffset -= lastOp.m_checkAdjust; + m_checkedOffset += op.m_checkAdjust; break; } case OpSimpleNestedAlternativeEnd: @@ -1578,7 +1594,7 @@ class YarrGenerator : private MacroAssembler { op.m_jumps.clear(); YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked -= lastOp.m_checkAdjust; + m_checkedOffset -= lastOp.m_checkAdjust; break; } @@ -1590,7 +1606,7 @@ class YarrGenerator : private MacroAssembler { PatternTerm* term = op.m_term; unsigned parenthesesFrameLocation = term->frameLocation; const RegisterID indexTemporary = regT0; - ASSERT(term->quantityCount == 1); + ASSERT(term->quantityMaxCount == 1); // Upon entry to a Greedy quantified set of parenthese store the index. // We'll use this for two purposes: @@ -1622,12 +1638,12 @@ class YarrGenerator : private MacroAssembler { // offsets only afterwards, at the point the results array is // being accessed. if (term->capture() && compileMode == IncludeSubpatterns) { - int inputOffset = term->inputPosition - m_checked; + unsigned inputOffset = (m_checkedOffset - term->inputPosition).unsafeGet(); if (term->quantityType == QuantifierFixedCount) - inputOffset -= term->parentheses.disjunction->m_minimumSize; + inputOffset += term->parentheses.disjunction->m_minimumSize; if (inputOffset) { move(index, indexTemporary); - add32(Imm32(inputOffset), indexTemporary); + sub32(Imm32(inputOffset), indexTemporary); setSubpatternStart(indexTemporary, term->parentheses.subpatternId); } else setSubpatternStart(index, term->parentheses.subpatternId); @@ -1637,18 +1653,16 @@ class YarrGenerator : private MacroAssembler { case OpParenthesesSubpatternOnceEnd: { PatternTerm* term = op.m_term; const RegisterID indexTemporary = regT0; - ASSERT(term->quantityCount == 1); + ASSERT(term->quantityMaxCount == 1); -#ifndef NDEBUG // Runtime ASSERT to make sure that the nested alternative handled the // "no input consumed" check. - if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) { + if (!ASSERT_DISABLED && term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) { Jump pastBreakpoint; pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); - breakpoint(); + abortWithReason(YARRNoInputConsumed); pastBreakpoint.link(this); } -#endif // If the parenthese are capturing, store the ending index value to the // captures array, offsetting as necessary. @@ -1657,10 +1671,10 @@ class YarrGenerator : private MacroAssembler { // offsets only afterwards, at the point the results array is // being accessed. if (term->capture() && compileMode == IncludeSubpatterns) { - int inputOffset = term->inputPosition - m_checked; + unsigned inputOffset = (m_checkedOffset - term->inputPosition).unsafeGet(); if (inputOffset) { move(index, indexTemporary); - add32(Imm32(inputOffset), indexTemporary); + sub32(Imm32(inputOffset), indexTemporary); setSubpatternEnd(indexTemporary, term->parentheses.subpatternId); } else setSubpatternEnd(index, term->parentheses.subpatternId); @@ -1682,7 +1696,7 @@ class YarrGenerator : private MacroAssembler { case OpParenthesesSubpatternTerminalBegin: { PatternTerm* term = op.m_term; ASSERT(term->quantityType == QuantifierGreedy); - ASSERT(term->quantityCount == quantifyInfinite); + ASSERT(term->quantityMaxCount == quantifyInfinite); ASSERT(!term->capture()); // Upon entry set a label to loop back to. @@ -1695,16 +1709,16 @@ class YarrGenerator : private MacroAssembler { } case OpParenthesesSubpatternTerminalEnd: { YarrOp& beginOp = m_ops[op.m_previousOp]; -#ifndef NDEBUG - PatternTerm* term = op.m_term; - - // Runtime ASSERT to make sure that the nested alternative handled the - // "no input consumed" check. - Jump pastBreakpoint; - pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); - breakpoint(); - pastBreakpoint.link(this); -#endif + if (!ASSERT_DISABLED) { + PatternTerm* term = op.m_term; + + // Runtime ASSERT to make sure that the nested alternative handled the + // "no input consumed" check. + Jump pastBreakpoint; + pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); + abortWithReason(YARRNoInputConsumed); + pastBreakpoint.link(this); + } // We know that the match is non-zero, we can accept it and // loop back up to the head of the subpattern. @@ -1726,11 +1740,11 @@ class YarrGenerator : private MacroAssembler { storeToFrame(index, parenthesesFrameLocation); // Check - op.m_checkAdjust = m_checked - term->inputPosition; + op.m_checkAdjust = m_checkedOffset - term->inputPosition; if (op.m_checkAdjust) - sub32(Imm32(op.m_checkAdjust), index); + sub32(Imm32(op.m_checkAdjust.unsafeGet()), index); - m_checked -= op.m_checkAdjust; + m_checkedOffset -= op.m_checkAdjust; break; } case OpParentheticalAssertionEnd: { @@ -1748,15 +1762,13 @@ class YarrGenerator : private MacroAssembler { } YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked += lastOp.m_checkAdjust; + m_checkedOffset += lastOp.m_checkAdjust; break; } case OpMatchFailed: removeCallFrame(); - move(TrustedImmPtr((void*)WTF::notFound), returnRegister); - move(TrustedImm32(0), returnRegister2); - generateReturn(); + generateFailReturn(); break; } @@ -1808,9 +1820,9 @@ class YarrGenerator : private MacroAssembler { if (op.m_op == OpBodyAlternativeNext) { PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; - m_checked += priorAlternative->m_minimumSize; + m_checkedOffset += priorAlternative->m_minimumSize; } - m_checked -= alternative->m_minimumSize; + m_checkedOffset -= alternative->m_minimumSize; // Is this the last alternative? If not, then if we backtrack to this point we just // need to jump to try to match the next alternative. @@ -1827,6 +1839,8 @@ class YarrGenerator : private MacroAssembler { } bool onceThrough = endOp.m_nextOp == notFound; + + JumpList lastStickyAlternativeFailures; // First, generate code to handle cases where we backtrack out of an attempted match // of the last alternative. If this is a 'once through' set of alternatives then we @@ -1842,43 +1856,49 @@ class YarrGenerator : private MacroAssembler { && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1)) m_backtrackingState.linkTo(beginOp->m_reentry, this); - else { + else if (m_pattern.sticky() && m_ops[op.m_nextOp].m_op == OpBodyAlternativeEnd) { + // It is a sticky pattern and the last alternative failed, jump to the end. + m_backtrackingState.takeBacktracksToJumpList(lastStickyAlternativeFailures, this); + } else { // We need to generate a trampoline of code to execute before looping back // around to the first alternative. m_backtrackingState.link(this); - // If the pattern size is not fixed, then store the start index, for use if we match. - if (!m_pattern.m_body->m_hasFixedSize) { - if (alternative->m_minimumSize == 1) - setMatchStart(index); - else { - move(index, regT0); - if (alternative->m_minimumSize) - sub32(Imm32(alternative->m_minimumSize - 1), regT0); - else - add32(TrustedImm32(1), regT0); - setMatchStart(regT0); + // No need to advance and retry for a sticky pattern. + if (!m_pattern.sticky()) { + // If the pattern size is not fixed, then store the start index for use if we match. + if (!m_pattern.m_body->m_hasFixedSize) { + if (alternative->m_minimumSize == 1) + setMatchStart(index); + else { + move(index, regT0); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize - 1), regT0); + else + add32(TrustedImm32(1), regT0); + setMatchStart(regT0); + } } - } - // Generate code to loop. Check whether the last alternative is longer than the - // first (e.g. /a|xy/ or /a|xyz/). - if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { - // We want to loop, and increment input position. If the delta is 1, it is - // already correctly incremented, if more than one then decrement as appropriate. - unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; - ASSERT(delta); - if (delta != 1) - sub32(Imm32(delta - 1), index); - jump(beginOp->m_reentry); - } else { - // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot - // be sufficent input available to handle this, so just fall through. - unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; - if (delta != 0xFFFFFFFFu) { - // We need to check input because we are incrementing the input. - add32(Imm32(delta + 1), index); - checkInput().linkTo(beginOp->m_reentry, this); + // Generate code to loop. Check whether the last alternative is longer than the + // first (e.g. /a|xy/ or /a|xyz/). + if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { + // We want to loop, and increment input position. If the delta is 1, it is + // already correctly incremented, if more than one then decrement as appropriate. + unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; + ASSERT(delta); + if (delta != 1) + sub32(Imm32(delta - 1), index); + jump(beginOp->m_reentry); + } else { + // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot + // be sufficent input available to handle this, so just fall through. + unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; + if (delta != 0xFFFFFFFFu) { + // We need to check input because we are incrementing the input. + add32(Imm32(delta + 1), index); + checkInput().linkTo(beginOp->m_reentry, this); + } } } } @@ -1887,7 +1907,7 @@ class YarrGenerator : private MacroAssembler { // We can reach this point in the code in two ways: // - Fallthrough from the code above (a repeating alternative backtracked out of its // last alternative, and did not have sufficent input to run the first). - // - We will loop back up to the following label when a releating alternative loops, + // - We will loop back up to the following label when a repeating alternative loops, // following a failed input check. // // Either way, we have just failed the input check for the first alternative. @@ -1947,56 +1967,57 @@ class YarrGenerator : private MacroAssembler { needsToUpdateMatchStart = false; } - // Check whether there is sufficient input to loop. Increment the input position by - // one, and check. Also add in the minimum disjunction size before checking - there - // is no point in looping if we're just going to fail all the input checks around - // the next iteration. - ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); - if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { - // If the last alternative had the same minimum size as the disjunction, - // just simply increment input pos by 1, no adjustment based on minimum size. - add32(TrustedImm32(1), index); - } else { - // If the minumum for the last alternative was one greater than than that - // for the disjunction, we're already progressed by 1, nothing to do! - unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; - if (delta) - sub32(Imm32(delta), index); - } - Jump matchFailed = jumpIfNoAvailableInput(); + if (!m_pattern.sticky()) { + // Check whether there is sufficient input to loop. Increment the input position by + // one, and check. Also add in the minimum disjunction size before checking - there + // is no point in looping if we're just going to fail all the input checks around + // the next iteration. + ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); + if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { + // If the last alternative had the same minimum size as the disjunction, + // just simply increment input pos by 1, no adjustment based on minimum size. + add32(TrustedImm32(1), index); + } else { + // If the minumum for the last alternative was one greater than than that + // for the disjunction, we're already progressed by 1, nothing to do! + unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; + if (delta) + sub32(Imm32(delta), index); + } + Jump matchFailed = jumpIfNoAvailableInput(); + + if (needsToUpdateMatchStart) { + if (!m_pattern.m_body->m_minimumSize) + setMatchStart(index); + else { + move(index, regT0); + sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); + setMatchStart(regT0); + } + } - if (needsToUpdateMatchStart) { - if (!m_pattern.m_body->m_minimumSize) - setMatchStart(index); + // Calculate how much more input the first alternative requires than the minimum + // for the body as a whole. If no more is needed then we dont need an additional + // input check here - jump straight back up to the start of the first alternative. + if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) + jump(beginOp->m_reentry); else { - move(index, regT0); - sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); - setMatchStart(regT0); + if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) + add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); + else + sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); + checkInput().linkTo(beginOp->m_reentry, this); + jump(firstInputCheckFailed); } - } - // Calculate how much more input the first alternative requires than the minimum - // for the body as a whole. If no more is needed then we dont need an additional - // input check here - jump straight back up to the start of the first alternative. - if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) - jump(beginOp->m_reentry); - else { - if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) - add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); - else - sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); - checkInput().linkTo(beginOp->m_reentry, this); - jump(firstInputCheckFailed); + // We jump to here if we iterate to the point that there is insufficient input to + // run any matches, and need to return a failure state from JIT code. + matchFailed.link(this); } - // We jump to here if we iterate to the point that there is insufficient input to - // run any matches, and need to return a failure state from JIT code. - matchFailed.link(this); - + lastStickyAlternativeFailures.link(this); removeCallFrame(); - move(TrustedImmPtr((void*)WTF::notFound), returnRegister); - move(TrustedImm32(0), returnRegister2); - generateReturn(); + generateFailReturn(); break; } case OpBodyAlternativeEnd: { @@ -2004,7 +2025,7 @@ class YarrGenerator : private MacroAssembler { ASSERT(m_backtrackingState.isEmpty()); PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; - m_checked += priorAlternative->m_minimumSize; + m_checkedOffset += priorAlternative->m_minimumSize; break; } @@ -2055,7 +2076,7 @@ class YarrGenerator : private MacroAssembler { if (op.m_checkAdjust) { // Handle the cases where we need to link the backtracks here. m_backtrackingState.link(this); - sub32(Imm32(op.m_checkAdjust), index); + sub32(Imm32(op.m_checkAdjust.unsafeGet()), index); if (!isLastAlternative) { // An alternative that is not the last should jump to its successor. jump(nextOp.m_reentry); @@ -2105,9 +2126,9 @@ class YarrGenerator : private MacroAssembler { if (!isBegin) { YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked += lastOp.m_checkAdjust; + m_checkedOffset += lastOp.m_checkAdjust; } - m_checked -= op.m_checkAdjust; + m_checkedOffset -= op.m_checkAdjust; break; } case OpSimpleNestedAlternativeEnd: @@ -2138,7 +2159,7 @@ class YarrGenerator : private MacroAssembler { } YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked += lastOp.m_checkAdjust; + m_checkedOffset += lastOp.m_checkAdjust; break; } @@ -2159,7 +2180,7 @@ class YarrGenerator : private MacroAssembler { // matching start, depending of whether the match is Greedy or NonGreedy. case OpParenthesesSubpatternOnceBegin: { PatternTerm* term = op.m_term; - ASSERT(term->quantityCount == 1); + ASSERT(term->quantityMaxCount == 1); // We only need to backtrack to thispoint if capturing or greedy. if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) { @@ -2251,7 +2272,7 @@ class YarrGenerator : private MacroAssembler { m_backtrackingState.link(this); if (op.m_checkAdjust) - add32(Imm32(op.m_checkAdjust), index); + add32(Imm32(op.m_checkAdjust.unsafeGet()), index); // In an inverted assertion failure to match the subpattern // is treated as a successful match - jump to the end of the @@ -2268,7 +2289,7 @@ class YarrGenerator : private MacroAssembler { // added the failure caused by a successful match to this. m_backtrackingState.append(endOp.m_jumps); - m_checked += op.m_checkAdjust; + m_checkedOffset += op.m_checkAdjust; break; } case OpParentheticalAssertionEnd: { @@ -2280,7 +2301,7 @@ class YarrGenerator : private MacroAssembler { m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this); YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked -= lastOp.m_checkAdjust; + m_checkedOffset -= lastOp.m_checkAdjust; break; } @@ -2298,7 +2319,7 @@ class YarrGenerator : private MacroAssembler { // Emits ops for a subpattern (set of parentheses). These consist // of a set of alternatives wrapped in an outer set of nodes for // the parentheses. - // Supported types of parentheses are 'Once' (quantityCount == 1) + // Supported types of parentheses are 'Once' (quantityMaxCount == 1) // and 'Terminal' (non-capturing parentheses quantified as greedy // and infinite). // Alternatives will use the 'Simple' set of ops if either the @@ -2319,7 +2340,10 @@ class YarrGenerator : private MacroAssembler { // comes where the subpattern is capturing, in which case we would // need to restore the capture from the first subpattern upon a // failure in the second. - if (term->quantityCount == 1 && !term->parentheses.isCopy) { + if (term->quantityMinCount && term->quantityMinCount != term->quantityMaxCount) { + m_shouldFallBack = true; + return; + } if (term->quantityMaxCount == 1 && !term->parentheses.isCopy) { // Select the 'Once' nodes. parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin; parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd; @@ -2346,7 +2370,7 @@ class YarrGenerator : private MacroAssembler { m_ops.append(alternativeBeginOpCode); m_ops.last().m_previousOp = notFound; m_ops.last().m_term = term; - Vector<OwnPtr<PatternAlternative>>& alternatives = term->parentheses.disjunction->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = term->parentheses.disjunction->m_alternatives; for (unsigned i = 0; i < alternatives.size(); ++i) { size_t lastOpIndex = m_ops.size() - 1; @@ -2397,7 +2421,7 @@ class YarrGenerator : private MacroAssembler { m_ops.append(OpSimpleNestedAlternativeBegin); m_ops.last().m_previousOp = notFound; m_ops.last().m_term = term; - Vector<OwnPtr<PatternAlternative>>& alternatives = term->parentheses.disjunction->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = term->parentheses.disjunction->m_alternatives; for (unsigned i = 0; i < alternatives.size(); ++i) { size_t lastOpIndex = m_ops.size() - 1; @@ -2471,7 +2495,7 @@ class YarrGenerator : private MacroAssembler { // to return the failing result. void opCompileBody(PatternDisjunction* disjunction) { - Vector<OwnPtr<PatternAlternative>>& alternatives = disjunction->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = disjunction->m_alternatives; size_t currentAlternativeIndex = 0; // Emit the 'once through' alternatives. @@ -2578,16 +2602,17 @@ class YarrGenerator : private MacroAssembler { push(ARMRegisters::r4); push(ARMRegisters::r5); push(ARMRegisters::r6); -#elif CPU(SH4) - push(SH4Registers::r11); - push(SH4Registers::r13); #elif CPU(MIPS) // Do nothing. #endif + + store8(TrustedImm32(1), &m_vm->isExecutingInRegExpJIT); } void generateReturn() { + store8(TrustedImm32(0), &m_vm->isExecutingInRegExpJIT); + #if CPU(X86_64) #if OS(WINDOWS) // Store the return value in the allocated space pointed by rcx. @@ -2606,9 +2631,6 @@ class YarrGenerator : private MacroAssembler { pop(ARMRegisters::r6); pop(ARMRegisters::r5); pop(ARMRegisters::r4); -#elif CPU(SH4) - pop(SH4Registers::r13); - pop(SH4Registers::r11); #elif CPU(MIPS) // Do nothing #endif @@ -2616,12 +2638,11 @@ class YarrGenerator : private MacroAssembler { } public: - YarrGenerator(YarrPattern& pattern, YarrCharSize charSize) - : m_pattern(pattern) + YarrGenerator(VM* vm, YarrPattern& pattern, YarrCharSize charSize) + : m_vm(vm) + , m_pattern(pattern) , m_charSize(charSize) - , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo) , m_shouldFallBack(false) - , m_checked(0) { } @@ -2630,9 +2651,7 @@ public: generateEnter(); Jump hasInput = checkInput(); - move(TrustedImmPtr((void*)WTF::notFound), returnRegister); - move(TrustedImm32(0), returnRegister2); - generateReturn(); + generateFailReturn(); hasInput.link(this); if (compileMode == IncludeSubpatterns) { @@ -2645,11 +2664,8 @@ public: initCallFrame(); - // Compile the pattern to the internal 'YarrOp' representation. opCompileBody(m_pattern.m_body); - // If we encountered anything we can't handle in the JIT code - // (e.g. backreferences) then return early. if (m_shouldFallBack) { jitObject.setFallBack(true); return; @@ -2658,8 +2674,12 @@ public: generate(); backtrack(); - // Link & finalize the code. - LinkBuffer linkBuffer(*vm, this, REGEXP_CODE_ID); + LinkBuffer linkBuffer(*vm, *this, REGEXP_CODE_ID, JITCompilationCanFail); + if (linkBuffer.didFailToAllocate()) { + jitObject.setFallBack(true); + return; + } + m_backtrackingState.linkDataLabels(linkBuffer); if (compileMode == MatchOnly) { @@ -2677,12 +2697,12 @@ public: } private: + VM* m_vm; + YarrPattern& m_pattern; YarrCharSize m_charSize; - Scale m_charScale; - // Used to detect regular expression constructs that are not currently // supported in the JIT; fall back to the interpreter when this is detected. bool m_shouldFallBack; @@ -2700,7 +2720,7 @@ private: // FIXME: This should go away. Rather than tracking this value throughout // code generation, we should gather this information up front & store it // on the YarrOp structure. - int m_checked; + Checked<unsigned> m_checkedOffset; // This class records state whilst generating the backtracking path of code. BacktrackingState m_backtrackingState; @@ -2709,9 +2729,9 @@ private: void jitCompile(YarrPattern& pattern, YarrCharSize charSize, VM* vm, YarrCodeBlock& jitObject, YarrJITCompileMode mode) { if (mode == MatchOnly) - YarrGenerator<MatchOnly>(pattern, charSize).compile(vm, jitObject); + YarrGenerator<MatchOnly>(vm, pattern, charSize).compile(vm, jitObject); else - YarrGenerator<IncludeSubpatterns>(pattern, charSize).compile(vm, jitObject); + YarrGenerator<IncludeSubpatterns>(vm, pattern, charSize).compile(vm, jitObject); } }} diff --git a/Source/JavaScriptCore/yarr/YarrJIT.h b/Source/JavaScriptCore/yarr/YarrJIT.h index e7b222a8e..45a6a3d63 100644 --- a/Source/JavaScriptCore/yarr/YarrJIT.h +++ b/Source/JavaScriptCore/yarr/YarrJIT.h @@ -23,8 +23,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrJIT_h -#define YarrJIT_h +#pragma once #if ENABLE(YARR_JIT) @@ -108,9 +107,44 @@ public: } #if ENABLE(REGEXP_TRACING) - void *getAddr() { return m_ref.code().executableAddress(); } + void *get8BitMatchOnlyAddr() + { + if (!has8BitCodeMatchOnly()) + return 0; + + return m_matchOnly8.code().executableAddress(); + } + + void *get16BitMatchOnlyAddr() + { + if (!has16BitCodeMatchOnly()) + return 0; + + return m_matchOnly16.code().executableAddress(); + } + + void *get8BitMatchAddr() + { + if (!has8BitCode()) + return 0; + + return m_ref8.code().executableAddress(); + } + + void *get16BitMatchAddr() + { + if (!has16BitCode()) + return 0; + + return m_ref16.code().executableAddress(); + } #endif + size_t size() const + { + return m_ref8.size() + m_ref16.size() + m_matchOnly8.size() + m_matchOnly16.size(); + } + void clear() { m_ref8 = MacroAssemblerCodeRef(); @@ -137,5 +171,3 @@ void jitCompile(YarrPattern&, YarrCharSize, VM*, YarrCodeBlock& jitObject, YarrJ } } // namespace JSC::Yarr #endif - -#endif // YarrJIT_h diff --git a/Source/JavaScriptCore/yarr/YarrParser.h b/Source/JavaScriptCore/yarr/YarrParser.h index 366aa40d3..9c0982017 100644 --- a/Source/JavaScriptCore/yarr/YarrParser.h +++ b/Source/JavaScriptCore/yarr/YarrParser.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,18 +23,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrParser_h -#define YarrParser_h +#pragma once #include "Yarr.h" #include <wtf/ASCIICType.h> #include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { -#define REGEXP_ERROR_PREFIX "Invalid regular expression: " - enum BuiltInCharacterClassID { DigitClassID, SpaceClassID, @@ -47,22 +43,7 @@ template<class Delegate, typename CharType> class Parser { private: template<class FriendDelegate> - friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - QuantifierTooLarge, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; + friend const char* parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); /* * CharacterClassParserDelegate: @@ -75,7 +56,7 @@ private: */ class CharacterClassParserDelegate { public: - CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + CharacterClassParserDelegate(Delegate& delegate, YarrPattern::ErrorCode& err) : m_delegate(delegate) , m_err(err) , m_state(Empty) @@ -102,7 +83,7 @@ private: * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ * is different to /[a\-z]/). */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) { switch (m_state) { case AfterCharacterClass: @@ -137,7 +118,7 @@ private: case CachedCharacterHyphen: if (ch < m_character) { - m_err = CharacterClassOutOfOrder; + m_err = YarrPattern::CharacterClassOutOfOrder; return; } m_delegate.atomCharacterClassRange(m_character, ch); @@ -218,7 +199,7 @@ private: private: Delegate& m_delegate; - ErrorCode& m_err; + YarrPattern::ErrorCode& m_err; enum CharacterClassConstructionState { Empty, CachedCharacter, @@ -226,20 +207,34 @@ private: AfterCharacterClass, AfterCharacterClassHyphen, } m_state; - UChar m_character; + UChar32 m_character; }; - Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) + Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) : m_delegate(delegate) , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.getCharacters<CharType>()) + , m_err(YarrPattern::NoError) + , m_data(pattern.characters<CharType>()) , m_size(pattern.length()) , m_index(0) + , m_isUnicode(isUnicode) , m_parenthesesNestingDepth(0) { } + // The handling of IdentityEscapes is different depending on the unicode flag. + // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. + // For non-unicode patterns, most any character can be escaped. + bool isIdentityEscapeAnError(int ch) + { + if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) { + m_err = YarrPattern::InvalidIdentityEscape; + return true; + } + + return false; + } + /* * parseEscape(): * @@ -268,7 +263,7 @@ private: consume(); if (atEndOfPattern()) { - m_err = EscapeUnterminated; + m_err = YarrPattern::EscapeUnterminated; return false; } @@ -276,18 +271,24 @@ private: // Assertions case 'b': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('b')) + break; + delegate.atomPatternCharacter('\b'); - else { + } else { delegate.assertionWordBoundary(false); return false; } break; case 'B': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('B')) + break; + delegate.atomPatternCharacter('B'); - else { + } else { delegate.assertionWordBoundary(true); return false; } @@ -402,9 +403,12 @@ private: case 'x': { consume(); int x = tryConsumeHex(2); - if (x == -1) + if (x == -1) { + if (isIdentityEscapeAnError('x')) + break; + delegate.atomPatternCharacter('x'); - else + } else delegate.atomPatternCharacter(x); break; } @@ -412,22 +416,101 @@ private: // UnicodeEscape case 'u': { consume(); + if (atEndOfPattern()) { + if (isIdentityEscapeAnError('u')) + break; + + delegate.atomPatternCharacter('u'); + break; + } + + if (m_isUnicode && peek() == '{') { + consume(); + UChar32 codePoint = 0; + do { + if (atEndOfPattern() || !isASCIIHexDigit(peek())) { + m_err = YarrPattern::InvalidUnicodeEscape; + break; + } + + codePoint = (codePoint << 4) | toASCIIHexValue(consume()); + + if (codePoint > UCHAR_MAX_VALUE) + m_err = YarrPattern::InvalidUnicodeEscape; + } while (!atEndOfPattern() && peek() != '}'); + if (!atEndOfPattern() && peek() == '}') + consume(); + else if (!m_err) + m_err = YarrPattern::InvalidUnicodeEscape; + if (m_err) + return false; + + delegate.atomPatternCharacter(codePoint); + break; + } int u = tryConsumeHex(4); - if (u == -1) + if (u == -1) { + if (isIdentityEscapeAnError('u')) + break; + delegate.atomPatternCharacter('u'); - else + } else { + // If we have the first of a surrogate pair, look for the second. + if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { + ParseState state = saveState(); + consume(); + + if (tryConsume('u')) { + int surrogate2 = tryConsumeHex(4); + if (U16_IS_TRAIL(surrogate2)) { + u = U16_GET_SUPPLEMENTARY(u, surrogate2); + delegate.atomPatternCharacter(u); + break; + } + } + + restoreState(state); + } delegate.atomPatternCharacter(u); + } break; } // IdentityEscape default: + int ch = peek(); + + if (ch == '-' && m_isUnicode && inCharacterClass) { + // \- is allowed for ClassEscape with unicode flag. + delegate.atomPatternCharacter(consume()); + break; + } + + if (isIdentityEscapeAnError(ch)) + break; + delegate.atomPatternCharacter(consume()); } return true; } + UChar32 consumePossibleSurrogatePair() + { + UChar32 ch = consume(); + if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { + ParseState state = saveState(); + + UChar32 surrogate2 = consume(); + if (U16_IS_TRAIL(surrogate2)) + ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); + else + restoreState(state); + } + + return ch; + } + /* * parseAtomEscape(), parseCharacterClassEscape(): * @@ -471,14 +554,14 @@ private: break; default: - characterClassConstructor.atomPatternCharacter(consume(), true); + characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); } if (m_err) return; } - m_err = CharacterClassUnmatched; + m_err = YarrPattern::CharacterClassUnmatched; } /* @@ -494,7 +577,7 @@ private: if (tryConsume('?')) { if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; + m_err = YarrPattern::ParenthesesTypeInvalid; return; } @@ -512,7 +595,7 @@ private: break; default: - m_err = ParenthesesTypeInvalid; + m_err = YarrPattern::ParenthesesTypeInvalid; } } else m_delegate.atomParenthesesSubpatternBegin(); @@ -534,7 +617,7 @@ private: if (m_parenthesesNestingDepth > 0) m_delegate.atomParenthesesEnd(); else - m_err = ParenthesesUnmatched; + m_err = YarrPattern::ParenthesesUnmatched; --m_parenthesesNestingDepth; } @@ -550,14 +633,14 @@ private: ASSERT(min <= max); if (min == UINT_MAX) { - m_err = QuantifierTooLarge; + m_err = YarrPattern::QuantifierTooLarge; return; } if (lastTokenWasAnAtom) m_delegate.quantifyAtom(min, max, !tryConsume('?')); else - m_err = QuantifierWithoutAtom; + m_err = YarrPattern::QuantifierWithoutAtom; } /* @@ -651,7 +734,7 @@ private: if (min <= max) parseQuantifier(lastTokenWasAnAtom, min, max); else - m_err = QuantifierOutOfOrder; + m_err = YarrPattern::QuantifierOutOfOrder; lastTokenWasAnAtom = false; break; } @@ -663,7 +746,7 @@ private: FALLTHROUGH; default: - m_delegate.atomPatternCharacter(consume()); + m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); lastTokenWasAnAtom = true; } @@ -672,7 +755,7 @@ private: } if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; + m_err = YarrPattern::MissingParentheses; } /* @@ -684,27 +767,12 @@ private: const char* parse() { if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; + m_err = YarrPattern::PatternTooLarge; else parseTokens(); ASSERT(atEndOfPattern() || m_err); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - REGEXP_ERROR_PREFIX "regular expression too large", - REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", - REGEXP_ERROR_PREFIX "nothing to repeat", - REGEXP_ERROR_PREFIX "number too large in {} quantifier", - REGEXP_ERROR_PREFIX "missing )", - REGEXP_ERROR_PREFIX "unmatched parentheses", - REGEXP_ERROR_PREFIX "unrecognized character after (?", - REGEXP_ERROR_PREFIX "missing terminating ] for character class", - REGEXP_ERROR_PREFIX "range out of order in character class", - REGEXP_ERROR_PREFIX "\\ at end of pattern" - }; - - return errorMessages[m_err]; + + return YarrPattern::errorMessage(m_err); } // Misc helper functions: @@ -727,6 +795,12 @@ private: return m_index == m_size; } + unsigned patternRemaining() + { + ASSERT(m_index <= m_size); + return m_size - m_index; + } + int peek() { ASSERT(m_index < m_size); @@ -802,10 +876,11 @@ private: Delegate& m_delegate; unsigned m_backReferenceLimit; - ErrorCode m_err; + YarrPattern::ErrorCode m_err; const CharType* m_data; unsigned m_size; unsigned m_index; + bool m_isUnicode; unsigned m_parenthesesNestingDepth; // Derived by empirical testing of compile time in PCRE and WREC. @@ -826,11 +901,11 @@ private: * void assertionEOL(); * void assertionWordBoundary(bool invert); * - * void atomPatternCharacter(UChar ch); + * void atomPatternCharacter(UChar32 ch); * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassAtom(UChar32 ch) + * void atomCharacterClassRange(UChar32 begin, UChar32 end) * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) * void atomCharacterClassEnd() * void atomParenthesesSubpatternBegin(bool capture = true); @@ -872,13 +947,11 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) +const char* parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) { if (pattern.is8Bit()) - return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); - return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); + return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); + return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); } } } // namespace JSC::Yarr - -#endif // YarrParser_h diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp index 7ed9d3c30..386b33728 100644 --- a/Source/JavaScriptCore/yarr/YarrPattern.cpp +++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -28,9 +28,10 @@ #include "YarrPattern.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #include "YarrParser.h" #include <wtf/Vector.h> +#include <wtf/WTFThreadData.h> using namespace WTF; @@ -40,8 +41,9 @@ namespace JSC { namespace Yarr { class CharacterClassConstructor { public: - CharacterClassConstructor(bool isCaseInsensitive = false) + CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode) : m_isCaseInsensitive(isCaseInsensitive) + , m_canonicalMode(canonicalMode) { } @@ -65,11 +67,16 @@ public: addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); } - void putChar(UChar ch) + void putChar(UChar32 ch) { - // Handle ascii cases. - if (ch <= 0x7f) { - if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + if (!m_isCaseInsensitive) { + addSorted(ch); + return; + } + + if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) { + // Handle ASCII cases. + if (isASCIIAlpha(ch)) { addSorted(m_matches, toASCIIUpper(ch)); addSorted(m_matches, toASCIILower(ch)); } else @@ -77,40 +84,33 @@ public: return; } - // Simple case, not a case-insensitive match. - if (!m_isCaseInsensitive) { - addSorted(m_matchesUnicode, ch); - return; - } - // Add multiple matches, if necessary. - const UCS2CanonicalizationRange* info = rangeInfoFor(ch); + const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode); if (info->type == CanonicalizeUnique) - addSorted(m_matchesUnicode, ch); + addSorted(ch); else putUnicodeIgnoreCase(ch, info); } - void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info) + void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info) { ASSERT(m_isCaseInsensitive); - ASSERT(ch > 0x7f); ASSERT(ch >= info->begin && ch <= info->end); ASSERT(info->type != CanonicalizeUnique); if (info->type == CanonicalizeSet) { - for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) - addSorted(m_matchesUnicode, ch); + for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set) + addSorted(ch); } else { - addSorted(m_matchesUnicode, ch); - addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + addSorted(ch); + addSorted(getCanonicalPair(info, ch)); } } - void putRange(UChar lo, UChar hi) + void putRange(UChar32 lo, UChar32 hi) { - if (lo <= 0x7f) { + if (isASCII(lo)) { char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); + char asciiHi = std::min(hi, (UChar32)0x7f); addSortedRange(m_ranges, lo, asciiHi); if (m_isCaseInsensitive) { @@ -120,19 +120,19 @@ public: addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); } } - if (hi <= 0x7f) + if (isASCII(hi)) return; - lo = std::max(lo, (UChar)0x80); + lo = std::max(lo, (UChar32)0x80); addSortedRange(m_rangesUnicode, lo, hi); if (!m_isCaseInsensitive) return; - const UCS2CanonicalizationRange* info = rangeInfoFor(lo); + const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode); while (true) { // Handle the range [lo .. end] - UChar end = std::min<UChar>(info->end, hi); + UChar32 end = std::min<UChar32>(info->end, hi); switch (info->type) { case CanonicalizeUnique: @@ -140,7 +140,7 @@ public: break; case CanonicalizeSet: { UChar ch; - for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set) addSorted(m_matchesUnicode, ch); break; } @@ -175,20 +175,25 @@ public: } - PassOwnPtr<CharacterClass> charClass() + std::unique_ptr<CharacterClass> charClass() { - OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass); + auto characterClass = std::make_unique<CharacterClass>(); characterClass->m_matches.swap(m_matches); characterClass->m_ranges.swap(m_ranges); characterClass->m_matchesUnicode.swap(m_matchesUnicode); characterClass->m_rangesUnicode.swap(m_rangesUnicode); - return characterClass.release(); + return characterClass; } private: - void addSorted(Vector<UChar>& matches, UChar ch) + void addSorted(UChar32 ch) + { + addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch); + } + + void addSorted(Vector<UChar32>& matches, UChar32 ch) { unsigned pos = 0; unsigned range = matches.size(); @@ -214,7 +219,7 @@ private: matches.insert(pos, ch); } - void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi) { unsigned end = ranges.size(); @@ -260,24 +265,26 @@ private: } bool m_isCaseInsensitive; + CanonicalMode m_canonicalMode; - Vector<UChar> m_matches; + Vector<UChar32> m_matches; Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; + Vector<UChar32> m_matchesUnicode; Vector<CharacterRange> m_rangesUnicode; }; class YarrPatternConstructor { public: - YarrPatternConstructor(YarrPattern& pattern) + YarrPatternConstructor(YarrPattern& pattern, void* stackLimit) : m_pattern(pattern) - , m_characterClassConstructor(pattern.m_ignoreCase) + , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2) + , m_stackLimit(stackLimit) , m_invertParentheticalAssertion(false) { - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); + auto body = std::make_unique<PatternDisjunction>(); m_pattern.m_body = body.get(); m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); + m_pattern.m_disjunctions.append(WTFMove(body)); } ~YarrPatternConstructor() @@ -289,15 +296,15 @@ public: m_pattern.reset(); m_characterClassConstructor.reset(); - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); + auto body = std::make_unique<PatternDisjunction>(); m_pattern.m_body = body.get(); m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); + m_pattern.m_disjunctions.append(WTFMove(body)); } void assertionBOL() { - if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) { m_alternative->m_startsWithBOL = true; m_alternative->m_containsBOL = true; m_pattern.m_containsBOL = true; @@ -313,25 +320,25 @@ public: m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); } - void atomPatternCharacter(UChar ch) + void atomPatternCharacter(UChar32 ch) { // We handle case-insensitive checking of unicode characters which do have both // cases by handling them as if they were defined using a CharacterClass. - if (!m_pattern.m_ignoreCase || isASCII(ch)) { + if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) { m_alternative->m_terms.append(PatternTerm(ch)); return; } - const UCS2CanonicalizationRange* info = rangeInfoFor(ch); + const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2); if (info->type == CanonicalizeUnique) { m_alternative->m_terms.append(PatternTerm(ch)); return; } m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); + auto newCharacterClass = m_characterClassConstructor.charClass(); m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); + m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); } void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) @@ -344,7 +351,10 @@ public: m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); break; case WordClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + if (m_pattern.unicode() && m_pattern.ignoreCase()) + m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert)); + else + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); break; case NewlineClassID: m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); @@ -357,12 +367,12 @@ public: m_invertCharacterClass = invert; } - void atomCharacterClassAtom(UChar ch) + void atomCharacterClassAtom(UChar32 ch) { m_characterClassConstructor.putChar(ch); } - void atomCharacterClassRange(UChar begin, UChar end) + void atomCharacterClassRange(UChar32 begin, UChar32 end) { m_characterClassConstructor.putRange(begin, end); } @@ -381,7 +391,10 @@ public: break; case WordClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + if (m_pattern.unicode() && m_pattern.ignoreCase()) + m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass()); + else + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); break; default: @@ -391,9 +404,9 @@ public: void atomCharacterClassEnd() { - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); + auto newCharacterClass = m_characterClassConstructor.charClass(); m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); + m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); } void atomParenthesesSubpatternBegin(bool capture = true) @@ -402,19 +415,19 @@ public: if (capture) m_pattern.m_numSubpatterns++; - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); m_alternative = parenthesesDisjunction->addNewAlternative(); - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); } void atomParentheticalAssertionBegin(bool invert = false) { - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); m_alternative = parenthesesDisjunction->addNewAlternative(); m_invertParentheticalAssertion = invert; - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); } void atomParenthesesEnd() @@ -479,12 +492,12 @@ public: // skip alternatives with m_startsWithBOL set true. PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) { - OwnPtr<PatternDisjunction> newDisjunction; + std::unique_ptr<PatternDisjunction> newDisjunction; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { if (!newDisjunction) { - newDisjunction = adoptPtr(new PatternDisjunction()); + newDisjunction = std::make_unique<PatternDisjunction>(); newDisjunction->m_parent = disjunction->m_parent; } PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); @@ -498,7 +511,7 @@ public: return 0; PatternDisjunction* copiedDisjunction = newDisjunction.get(); - m_pattern.m_disjunctions.append(newDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(newDisjunction)); return copiedDisjunction; } @@ -509,6 +522,7 @@ public: PatternTerm termCopy = term; termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + m_pattern.m_hasCopiedParenSubexpressions = true; return termCopy; } @@ -524,7 +538,7 @@ public: PatternTerm& term = m_alternative->lastTerm(); ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); - ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount); if (term.type == PatternTerm::TypeParentheticalAssertion) { // If an assertion is quantified with a minimum count of zero, it can simply be removed. @@ -546,12 +560,12 @@ public: return; } - if (min == 0) - term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); - else if (min == max) - term.quantify(min, QuantifierFixedCount); + if (min == max) + term.quantify(min, max, QuantifierFixedCount); + else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions)) + term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy); else { - term.quantify(min, QuantifierFixedCount); + term.quantify(min, min, QuantifierFixedCount); m_alternative->m_terms.append(copyTerm(term)); // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); @@ -565,10 +579,14 @@ public: m_alternative = m_alternative->m_parent->addNewAlternative(); } - unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + YarrPattern::ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN { + if (UNLIKELY(!isSafeToRecurse())) + return YarrPattern::TooManyDisjunctions; + + YarrPattern::ErrorCode error = YarrPattern::NoError; alternative->m_hasFixedSize = true; - Checked<unsigned> currentInputPosition = initialInputPosition; + Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition; for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { PatternTerm& term = alternative->m_terms[i]; @@ -596,8 +614,10 @@ public: term.frameLocation = currentCallFrameSize; currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; alternative->m_hasFixedSize = false; + } else if (m_pattern.unicode()) { + currentInputPosition += U16_LENGTH(term.patternCharacter) * term.quantityMaxCount; } else - currentInputPosition += term.quantityCount; + currentInputPosition += term.quantityMaxCount; break; case PatternTerm::TypeCharacterClass: @@ -606,28 +626,40 @@ public: term.frameLocation = currentCallFrameSize; currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; alternative->m_hasFixedSize = false; + } else if (m_pattern.unicode()) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; + currentInputPosition += term.quantityMaxCount; + alternative->m_hasFixedSize = false; } else - currentInputPosition += term.quantityCount; + currentInputPosition += term.quantityMaxCount; break; case PatternTerm::TypeParenthesesSubpattern: // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. term.frameLocation = currentCallFrameSize; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { if (term.quantityType != QuantifierFixedCount) currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (error) + return error; // If quantity is fixed, then pre-check its minimum size. if (term.quantityType == QuantifierFixedCount) currentInputPosition += term.parentheses.disjunction->m_minimumSize; term.inputPosition = currentInputPosition.unsafeGet(); } else if (term.parentheses.isTerminal) { currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (error) + return error; term.inputPosition = currentInputPosition.unsafeGet(); } else { term.inputPosition = currentInputPosition.unsafeGet(); - setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); + unsigned ignoredCallFrameSize; + error = setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet(), ignoredCallFrameSize); + if (error) + return error; currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; } // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. @@ -637,7 +669,9 @@ public: case PatternTerm::TypeParentheticalAssertion: term.inputPosition = currentInputPosition.unsafeGet(); term.frameLocation = currentCallFrameSize; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (error) + return error; break; case PatternTerm::TypeDotStarEnclosure: @@ -645,27 +679,39 @@ public: term.inputPosition = initialInputPosition; break; } + if (currentInputPosition.hasOverflowed()) + return YarrPattern::OffsetTooLarge; } alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); - return currentCallFrameSize; + newCallFrameSize = currentCallFrameSize; + return error; } - unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + YarrPattern::ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize) { + if (UNLIKELY(!isSafeToRecurse())) + return YarrPattern::TooManyDisjunctions; + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; unsigned minimumInputSize = UINT_MAX; unsigned maximumCallFrameSize = 0; bool hasFixedSize = true; + YarrPattern::ErrorCode error = YarrPattern::NoError; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); - unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + unsigned currentAlternativeCallFrameSize; + error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize); + if (error) + return error; minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); hasFixedSize &= alternative->m_hasFixedSize; + if (alternative->m_minimumSize > INT_MAX) + m_pattern.m_containsUnsignedLengthPattern = true; } ASSERT(minimumInputSize != UINT_MAX); @@ -674,12 +720,18 @@ public: disjunction->m_hasFixedSize = hasFixedSize; disjunction->m_minimumSize = minimumInputSize; disjunction->m_callFrameSize = maximumCallFrameSize; - return maximumCallFrameSize; + callFrameSize = maximumCallFrameSize; + return error; } - void setupOffsets() + const char* setupOffsets() { - setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314). + unsigned ignoredCallFrameSize; + YarrPattern::ErrorCode error = setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize); + if (error) + return YarrPattern::errorMessage(error); + return nullptr; } // This optimization identifies sets of parentheses that we will never need to backtrack. @@ -696,14 +748,15 @@ public: if (m_pattern.m_numSubpatterns) return; - Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; for (size_t i = 0; i < alternatives.size(); ++i) { Vector<PatternTerm>& terms = alternatives[i]->m_terms; if (terms.size()) { PatternTerm& term = terms.last(); if (term.type == PatternTerm::TypeParenthesesSubpattern && term.quantityType == QuantifierGreedy - && term.quantityCount == quantifyInfinite + && term.quantityMinCount == 0 + && term.quantityMaxCount == quantifyInfinite && !term.capture()) term.parentheses.isTerminal = true; } @@ -719,7 +772,7 @@ public: // At this point, this is only valid for non-multiline expressions. PatternDisjunction* disjunction = m_pattern.m_body; - if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + if (!m_pattern.m_containsBOL || m_pattern.multiline()) return; PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); @@ -737,11 +790,12 @@ public: } } - bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) + bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex) { Vector<PatternTerm>& terms = alternative->m_terms; - for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { + ASSERT(endIndex <= terms.size()); + for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) { PatternTerm& term = terms[termIndex]; if (term.m_capture) @@ -750,7 +804,7 @@ public: if (term.type == PatternTerm::TypeParenthesesSubpattern) { PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { - if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) + if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size())) return true; } } @@ -766,7 +820,7 @@ public: // beginning and the end of the match. void optimizeDotStarWrappedExpressions() { - Vector<OwnPtr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; if (alternatives.size() != 1) return; @@ -775,7 +829,7 @@ public: if (terms.size() >= 3) { bool startsWithBOL = false; bool endsWithEOL = false; - size_t termIndex, firstExpressionTerm, lastExpressionTerm; + size_t termIndex, firstExpressionTerm; termIndex = 0; if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { @@ -798,14 +852,13 @@ public: PatternTerm& lastNonAnchorTerm = terms[termIndex]; if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) return; - - lastExpressionTerm = termIndex - 1; - if (firstExpressionTerm > lastExpressionTerm) + size_t endIndex = termIndex; + if (firstExpressionTerm >= endIndex) return; - if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { - for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) + if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) { + for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex) terms.remove(termIndex); for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) @@ -819,18 +872,58 @@ public: } private: + bool isSafeToRecurse() const + { + if (!m_stackLimit) + return true; + ASSERT(wtfThreadData().stack().isGrowingDownward()); + int8_t* curr = reinterpret_cast<int8_t*>(&curr); + int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit); + return curr >= limit; + } + YarrPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + void* m_stackLimit; bool m_invertCharacterClass; bool m_invertParentheticalAssertion; }; -const char* YarrPattern::compile(const String& patternString) +const char* YarrPattern::errorMessage(YarrPattern::ErrorCode error) +{ +#define REGEXP_ERROR_PREFIX "Invalid regular expression: " + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[NumberOfErrorCodes] = { + nullptr, // NoError + REGEXP_ERROR_PREFIX "regular expression too large", + REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", + REGEXP_ERROR_PREFIX "nothing to repeat", + REGEXP_ERROR_PREFIX "number too large in {} quantifier", + REGEXP_ERROR_PREFIX "missing )", + REGEXP_ERROR_PREFIX "unmatched parentheses", + REGEXP_ERROR_PREFIX "unrecognized character after (?", + REGEXP_ERROR_PREFIX "missing terminating ] for character class", + REGEXP_ERROR_PREFIX "range out of order in character class", + REGEXP_ERROR_PREFIX "\\ at end of pattern", + REGEXP_ERROR_PREFIX "invalid unicode {} escape", + REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern", + REGEXP_ERROR_PREFIX "too many nested disjunctions", + REGEXP_ERROR_PREFIX "pattern exceeds string length limits", + REGEXP_ERROR_PREFIX "invalid flags" + }; + + return errorMessages[error]; +} + +const char* YarrPattern::compile(const String& patternString, void* stackLimit) { - YarrPatternConstructor constructor(*this); + YarrPatternConstructor constructor(*this, stackLimit); + + if (m_flags == InvalidFlags) + return errorMessage(InvalidRegularExpressionFlags); - if (const char* error = parse(constructor, patternString)) + if (const char* error = parse(constructor, patternString, unicode())) return error; // If the pattern contains illegal backreferences reset & reparse. @@ -844,7 +937,7 @@ const char* YarrPattern::compile(const String& patternString) #if !ASSERT_DISABLED const char* error = #endif - parse(constructor, patternString, numSubpatterns); + parse(constructor, patternString, unicode(), numSubpatterns); ASSERT(!error); ASSERT(numSubpatterns == m_numSubpatterns); @@ -854,27 +947,31 @@ const char* YarrPattern::compile(const String& patternString) constructor.optimizeDotStarWrappedExpressions(); constructor.optimizeBOL(); - constructor.setupOffsets(); + if (const char* error = constructor.setupOffsets()) + return error; - return 0; + return nullptr; } -YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) - : m_ignoreCase(ignoreCase) - , m_multiline(multiline) - , m_containsBackreferences(false) +YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, const char** error, void* stackLimit) + : m_containsBackreferences(false) , m_containsBOL(false) + , m_containsUnsignedLengthPattern(false) + , m_hasCopiedParenSubexpressions(false) + , m_flags(flags) , m_numSubpatterns(0) , m_maxBackReference(0) , newlineCached(0) , digitsCached(0) , spacesCached(0) , wordcharCached(0) + , wordUnicodeIgnoreCaseCharCached(0) , nondigitsCached(0) , nonspacesCached(0) , nonwordcharCached(0) + , nonwordUnicodeIgnoreCasecharCached(0) { - *error = compile(pattern); + *error = compile(pattern, stackLimit); } } } diff --git a/Source/JavaScriptCore/yarr/YarrPattern.h b/Source/JavaScriptCore/yarr/YarrPattern.h index d42b0f979..2db32d94b 100644 --- a/Source/JavaScriptCore/yarr/YarrPattern.h +++ b/Source/JavaScriptCore/yarr/YarrPattern.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013-2014, 2016 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -24,26 +24,22 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrPattern_h -#define YarrPattern_h +#pragma once +#include "RegExpKey.h" #include <wtf/CheckedArithmetic.h> -#include <wtf/OwnPtr.h> -#include <wtf/PassOwnPtr.h> -#include <wtf/RefCounted.h> #include <wtf/Vector.h> #include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { struct PatternDisjunction; struct CharacterRange { - UChar begin; - UChar end; + UChar32 begin; + UChar32 end; - CharacterRange(UChar begin, UChar end) + CharacterRange(UChar32 begin, UChar32 end) : begin(begin) , end(end) { @@ -65,9 +61,9 @@ public: , m_tableInverted(inverted) { } - Vector<UChar> m_matches; + Vector<UChar32> m_matches; Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; + Vector<UChar32> m_matchesUnicode; Vector<CharacterRange> m_rangesUnicode; const char* m_table; @@ -96,7 +92,7 @@ struct PatternTerm { bool m_capture :1; bool m_invert :1; union { - UChar patternCharacter; + UChar32 patternCharacter; CharacterClass* characterClass; unsigned backReferenceSubpatternId; struct { @@ -112,18 +108,19 @@ struct PatternTerm { } anchors; }; QuantifierType quantityType; - Checked<unsigned> quantityCount; - int inputPosition; + Checked<unsigned> quantityMinCount; + Checked<unsigned> quantityMaxCount; + unsigned inputPosition; unsigned frameLocation; - PatternTerm(UChar ch) + PatternTerm(UChar32 ch) : type(PatternTerm::TypePatternCharacter) , m_capture(false) , m_invert(false) { patternCharacter = ch; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(CharacterClass* charClass, bool invert) @@ -133,7 +130,7 @@ struct PatternTerm { { characterClass = charClass; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) @@ -146,7 +143,7 @@ struct PatternTerm { parentheses.isCopy = false; parentheses.isTerminal = false; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(Type type, bool invert = false) @@ -155,7 +152,7 @@ struct PatternTerm { , m_invert(invert) { quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(unsigned spatternId) @@ -165,7 +162,7 @@ struct PatternTerm { { backReferenceSubpatternId = spatternId; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(bool bolAnchor, bool eolAnchor) @@ -176,7 +173,7 @@ struct PatternTerm { anchors.bolAnchor = bolAnchor; anchors.eolAnchor = eolAnchor; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } static PatternTerm ForwardReference() @@ -211,7 +208,18 @@ struct PatternTerm { void quantify(unsigned count, QuantifierType type) { - quantityCount = count; + quantityMinCount = 0; + quantityMaxCount = count; + quantityType = type; + } + + void quantify(unsigned minCount, unsigned maxCount, QuantifierType type) + { + // Currently only Parentheses can specify a non-zero min with a different max. + ASSERT(this->type == TypeParenthesesSubpattern || !minCount || minCount == maxCount); + ASSERT(minCount <= maxCount); + quantityMinCount = minCount; + quantityMaxCount = maxCount; quantityType = type; } }; @@ -270,12 +278,11 @@ public: PatternAlternative* addNewAlternative() { - PatternAlternative* alternative = new PatternAlternative(this); - m_alternatives.append(adoptPtr(alternative)); - return alternative; + m_alternatives.append(std::make_unique<PatternAlternative>(this)); + return static_cast<PatternAlternative*>(m_alternatives.last().get()); } - Vector<OwnPtr<PatternAlternative>> m_alternatives; + Vector<std::unique_ptr<PatternAlternative>> m_alternatives; PatternAlternative* m_parent; unsigned m_minimumSize; unsigned m_callFrameSize; @@ -286,13 +293,15 @@ public: // (please to be calling newlineCharacterClass() et al on your // friendly neighborhood YarrPattern instance to get nicely // cached copies). -CharacterClass* newlineCreate(); -CharacterClass* digitsCreate(); -CharacterClass* spacesCreate(); -CharacterClass* wordcharCreate(); -CharacterClass* nondigitsCreate(); -CharacterClass* nonspacesCreate(); -CharacterClass* nonwordcharCreate(); +std::unique_ptr<CharacterClass> newlineCreate(); +std::unique_ptr<CharacterClass> digitsCreate(); +std::unique_ptr<CharacterClass> spacesCreate(); +std::unique_ptr<CharacterClass> wordcharCreate(); +std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate(); +std::unique_ptr<CharacterClass> nondigitsCreate(); +std::unique_ptr<CharacterClass> nonspacesCreate(); +std::unique_ptr<CharacterClass> nonwordcharCreate(); +std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate(); struct TermChain { TermChain(PatternTerm term) @@ -303,8 +312,31 @@ struct TermChain { Vector<TermChain> hotTerms; }; + struct YarrPattern { - JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error); + JS_EXPORT_PRIVATE YarrPattern(const String& pattern, RegExpFlags, const char** error, void* stackLimit = nullptr); + + enum ErrorCode { + NoError, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + QuantifierTooLarge, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + InvalidUnicodeEscape, + InvalidIdentityEscape, + TooManyDisjunctions, + OffsetTooLarge, + InvalidRegularExpressionFlags, + NumberOfErrorCodes + }; + + JS_EXPORT_PRIVATE static const char* errorMessage(ErrorCode); void reset() { @@ -313,14 +345,18 @@ struct YarrPattern { m_containsBackreferences = false; m_containsBOL = false; + m_containsUnsignedLengthPattern = false; + m_hasCopiedParenSubexpressions = false; newlineCached = 0; digitsCached = 0; spacesCached = 0; wordcharCached = 0; + wordUnicodeIgnoreCaseCharCached = 0; nondigitsCached = 0; nonspacesCached = 0; nonwordcharCached = 0; + nonwordUnicodeIgnoreCasecharCached = 0; m_disjunctions.clear(); m_userCharacterClasses.clear(); @@ -331,71 +367,112 @@ struct YarrPattern { return m_maxBackReference > m_numSubpatterns; } + bool containsUnsignedLengthPattern() + { + return m_containsUnsignedLengthPattern; + } + CharacterClass* newlineCharacterClass() { - if (!newlineCached) - m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate())); + if (!newlineCached) { + m_userCharacterClasses.append(newlineCreate()); + newlineCached = m_userCharacterClasses.last().get(); + } return newlineCached; } CharacterClass* digitsCharacterClass() { - if (!digitsCached) - m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate())); + if (!digitsCached) { + m_userCharacterClasses.append(digitsCreate()); + digitsCached = m_userCharacterClasses.last().get(); + } return digitsCached; } CharacterClass* spacesCharacterClass() { - if (!spacesCached) - m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate())); + if (!spacesCached) { + m_userCharacterClasses.append(spacesCreate()); + spacesCached = m_userCharacterClasses.last().get(); + } return spacesCached; } CharacterClass* wordcharCharacterClass() { - if (!wordcharCached) - m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate())); + if (!wordcharCached) { + m_userCharacterClasses.append(wordcharCreate()); + wordcharCached = m_userCharacterClasses.last().get(); + } return wordcharCached; } + CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass() + { + if (!wordUnicodeIgnoreCaseCharCached) { + m_userCharacterClasses.append(wordUnicodeIgnoreCaseCharCreate()); + wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get(); + } + return wordUnicodeIgnoreCaseCharCached; + } CharacterClass* nondigitsCharacterClass() { - if (!nondigitsCached) - m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate())); + if (!nondigitsCached) { + m_userCharacterClasses.append(nondigitsCreate()); + nondigitsCached = m_userCharacterClasses.last().get(); + } return nondigitsCached; } CharacterClass* nonspacesCharacterClass() { - if (!nonspacesCached) - m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate())); + if (!nonspacesCached) { + m_userCharacterClasses.append(nonspacesCreate()); + nonspacesCached = m_userCharacterClasses.last().get(); + } return nonspacesCached; } CharacterClass* nonwordcharCharacterClass() { - if (!nonwordcharCached) - m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate())); + if (!nonwordcharCached) { + m_userCharacterClasses.append(nonwordcharCreate()); + nonwordcharCached = m_userCharacterClasses.last().get(); + } return nonwordcharCached; } + CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass() + { + if (!nonwordUnicodeIgnoreCasecharCached) { + m_userCharacterClasses.append(nonwordUnicodeIgnoreCaseCharCreate()); + nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get(); + } + return nonwordUnicodeIgnoreCasecharCached; + } + + bool ignoreCase() const { return m_flags & FlagIgnoreCase; } + bool multiline() const { return m_flags & FlagMultiline; } + bool sticky() const { return m_flags & FlagSticky; } + bool unicode() const { return m_flags & FlagUnicode; } - bool m_ignoreCase : 1; - bool m_multiline : 1; bool m_containsBackreferences : 1; bool m_containsBOL : 1; + bool m_containsUnsignedLengthPattern : 1; + bool m_hasCopiedParenSubexpressions : 1; + RegExpFlags m_flags; unsigned m_numSubpatterns; unsigned m_maxBackReference; PatternDisjunction* m_body; - Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions; - Vector<OwnPtr<CharacterClass>> m_userCharacterClasses; + Vector<std::unique_ptr<PatternDisjunction>, 4> m_disjunctions; + Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; private: - const char* compile(const String& patternString); + const char* compile(const String& patternString, void* stackLimit); CharacterClass* newlineCached; CharacterClass* digitsCached; CharacterClass* spacesCached; CharacterClass* wordcharCached; + CharacterClass* wordUnicodeIgnoreCaseCharCached; CharacterClass* nondigitsCached; CharacterClass* nonspacesCached; CharacterClass* nonwordcharCached; + CharacterClass* nonwordUnicodeIgnoreCasecharCached; }; } } // namespace JSC::Yarr - -#endif // YarrPattern_h diff --git a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp index aa98c4a35..218f451a2 100644 --- a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp +++ b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Apple Inc. All rights reserved. + * Copyright (C) 2011, 2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -35,7 +35,7 @@ public: void assertionBOL() {} void assertionEOL() {} void assertionWordBoundary(bool) {} - void atomPatternCharacter(UChar) {} + void atomPatternCharacter(UChar32) {} void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {} void atomCharacterClassBegin(bool = false) {} void atomCharacterClassAtom(UChar) {} @@ -50,10 +50,10 @@ public: void disjunction() {} }; -const char* checkSyntax(const String& pattern) +const char* checkSyntax(const String& pattern, const String& flags) { SyntaxChecker syntaxChecker; - return parse(syntaxChecker, pattern); + return parse(syntaxChecker, pattern, flags.contains('u')); } -}} // JSC::YARR +}} // JSC::Yarr diff --git a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h index 104ced3ab..2e2e9bcfb 100644 --- a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h +++ b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h @@ -23,16 +23,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrSyntaxChecker_h -#define YarrSyntaxChecker_h +#pragma once #include <wtf/text/WTFString.h> namespace JSC { namespace Yarr { -const char* checkSyntax(const String& pattern); - -}} // JSC::YARR - -#endif // YarrSyntaxChecker_h +const char* checkSyntax(const String& pattern, const String& flags); +}} // JSC::Yarr |