diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
commit | 1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch) | |
tree | 46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WebCore/contentextensions/Term.h | |
parent | 32761a6cee1d0dee366b885b7b9c777e67885688 (diff) | |
download | WebKitGtk-tarball-master.tar.gz |
webkitgtk-2.16.5HEADwebkitgtk-2.16.5master
Diffstat (limited to 'Source/WebCore/contentextensions/Term.h')
-rw-r--r-- | Source/WebCore/contentextensions/Term.h | 687 |
1 files changed, 687 insertions, 0 deletions
diff --git a/Source/WebCore/contentextensions/Term.h b/Source/WebCore/contentextensions/Term.h new file mode 100644 index 000000000..0d51833f5 --- /dev/null +++ b/Source/WebCore/contentextensions/Term.h @@ -0,0 +1,687 @@ +/* + * Copyright (C) 2014-2015 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#if ENABLE(CONTENT_EXTENSIONS) + +#include "ContentExtensionsDebugging.h" +#include "NFA.h" +#include <unicode/utypes.h> +#include <wtf/ASCIICType.h> +#include <wtf/HashMap.h> +#include <wtf/Vector.h> +#include <wtf/text/StringBuilder.h> +#include <wtf/text/WTFString.h> + +namespace WebCore { + +namespace ContentExtensions { + +enum class AtomQuantifier : uint8_t { + One, + ZeroOrOne, + ZeroOrMore, + OneOrMore +}; + +class Term { +public: + Term(); + Term(char character, bool isCaseSensitive); + + enum UniversalTransitionTag { UniversalTransition }; + explicit Term(UniversalTransitionTag); + + enum CharacterSetTermTag { CharacterSetTerm }; + explicit Term(CharacterSetTermTag, bool isInverted); + + enum GroupTermTag { GroupTerm }; + explicit Term(GroupTermTag); + + enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm }; + explicit Term(EndOfLineAssertionTermTag); + + Term(const Term& other); + Term(Term&& other); + + enum EmptyValueTag { EmptyValue }; + Term(EmptyValueTag); + + ~Term(); + + bool isValid() const; + + // CharacterSet terms only. + void addCharacter(UChar character, bool isCaseSensitive); + + // Group terms only. + void extendGroupSubpattern(const Term&); + + void quantify(const AtomQuantifier&); + + ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const; + void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const; + + bool isEndOfLineAssertion() const; + + bool matchesAtLeastOneCharacter() const; + + // Matches any string, the empty string included. + // This is very conservative. Patterns matching any string can return false here. + bool isKnownToMatchAnyString() const; + + // Return true if the term can only match input of a single fixed length. + bool hasFixedLength() const; + + Term& operator=(const Term& other); + Term& operator=(Term&& other); + + bool operator==(const Term& other) const; + + unsigned hash() const; + + bool isEmptyValue() const; + +#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING + String toString() const; +#endif + +#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING + size_t memoryUsed() const; +#endif + +private: + // This is exact for character sets but conservative for groups. + // The return value can be false for a group equivalent to a universal transition. + bool isUniversalTransition() const; + + void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const; + void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const; + + void destroy(); + + enum class TermType : uint8_t { + Empty, + CharacterSet, + Group + }; + + TermType m_termType { TermType::Empty }; + AtomQuantifier m_quantifier { AtomQuantifier::One }; + + class CharacterSet { + public: + void set(UChar character) + { + RELEASE_ASSERT(character < 128); + m_characters[character / 64] |= (uint64_t(1) << (character % 64)); + } + + bool get(UChar character) const + { + RELEASE_ASSERT(character < 128); + return m_characters[character / 64] & (uint64_t(1) << (character % 64)); + } + + void invert() + { + ASSERT(!m_inverted); + m_inverted = true; + } + + bool inverted() const { return m_inverted; } + + unsigned bitCount() const + { + return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]); + } + + bool operator==(const CharacterSet& other) const + { + return other.m_inverted == m_inverted + && other.m_characters[0] == m_characters[0] + && other.m_characters[1] == m_characters[1]; + } + + unsigned hash() const + { + return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted); + } + private: + bool m_inverted { false }; + uint64_t m_characters[2] { 0, 0 }; + }; + + struct Group { + Vector<Term> terms; + + bool operator==(const Group& other) const + { + return other.terms == terms; + } + + unsigned hash() const + { + unsigned hash = 6421749; + for (const Term& term : terms) { + unsigned termHash = term.hash(); + hash = (hash << 16) ^ ((termHash << 11) ^ hash); + hash += hash >> 11; + } + return hash; + } + }; + + union AtomData { + AtomData() + : invalidTerm(0) + { + } + ~AtomData() + { + } + + char invalidTerm; + CharacterSet characterSet; + Group group; + } m_atomData; +}; + +#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING +inline String quantifierToString(AtomQuantifier quantifier) +{ + switch (quantifier) { + case AtomQuantifier::One: + return ""; + case AtomQuantifier::ZeroOrOne: + return "?"; + case AtomQuantifier::ZeroOrMore: + return "*"; + case AtomQuantifier::OneOrMore: + return "+"; + } +} + +inline String Term::toString() const +{ + switch (m_termType) { + case TermType::Empty: + return "(Empty)"; + case TermType::CharacterSet: { + StringBuilder builder; + builder.append('['); + for (UChar c = 0; c < 128; c++) { + if (m_atomData.characterSet.get(c)) { + if (isASCIIPrintable(c) && !isASCIISpace(c)) + builder.append(c); + else { + builder.append('\\'); + builder.append('u'); + builder.appendNumber(c); + } + } + } + builder.append(']'); + builder.append(quantifierToString(m_quantifier)); + return builder.toString(); + } + case TermType::Group: { + StringBuilder builder; + builder.append('('); + for (const Term& term : m_atomData.group.terms) + builder.append(term.toString()); + builder.append(')'); + builder.append(quantifierToString(m_quantifier)); + return builder.toString(); + } + } +} +#endif + +inline Term::Term() +{ +} + +inline Term::Term(char character, bool isCaseSensitive) + : m_termType(TermType::CharacterSet) +{ + new (NotNull, &m_atomData.characterSet) CharacterSet(); + addCharacter(character, isCaseSensitive); +} + +inline Term::Term(UniversalTransitionTag) + : m_termType(TermType::CharacterSet) +{ + new (NotNull, &m_atomData.characterSet) CharacterSet(); + for (UChar i = 1; i < 128; ++i) + m_atomData.characterSet.set(i); +} + +inline Term::Term(CharacterSetTermTag, bool isInverted) + : m_termType(TermType::CharacterSet) +{ + new (NotNull, &m_atomData.characterSet) CharacterSet(); + if (isInverted) + m_atomData.characterSet.invert(); +} + +inline Term::Term(GroupTermTag) + : m_termType(TermType::Group) +{ + new (NotNull, &m_atomData.group) Group(); +} + +inline Term::Term(EndOfLineAssertionTermTag) + : Term(CharacterSetTerm, false) +{ + m_atomData.characterSet.set(0); +} + +inline Term::Term(const Term& other) + : m_termType(other.m_termType) + , m_quantifier(other.m_quantifier) +{ + switch (m_termType) { + case TermType::Empty: + break; + case TermType::CharacterSet: + new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet); + break; + case TermType::Group: + new (NotNull, &m_atomData.group) Group(other.m_atomData.group); + break; + } +} + +inline Term::Term(Term&& other) + : m_termType(WTFMove(other.m_termType)) + , m_quantifier(WTFMove(other.m_quantifier)) +{ + switch (m_termType) { + case TermType::Empty: + break; + case TermType::CharacterSet: + new (NotNull, &m_atomData.characterSet) CharacterSet(WTFMove(other.m_atomData.characterSet)); + break; + case TermType::Group: + new (NotNull, &m_atomData.group) Group(WTFMove(other.m_atomData.group)); + break; + } + other.destroy(); +} + +inline Term::Term(EmptyValueTag) + : m_termType(TermType::Empty) +{ +} + +inline Term::~Term() +{ + destroy(); +} + +inline bool Term::isValid() const +{ + return m_termType != TermType::Empty; +} + +inline void Term::addCharacter(UChar character, bool isCaseSensitive) +{ + ASSERT(isASCII(character)); + + ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet); + if (m_termType != TermType::CharacterSet) + return; + + if (isCaseSensitive || !isASCIIAlpha(character)) + m_atomData.characterSet.set(character); + else { + m_atomData.characterSet.set(toASCIIUpper(character)); + m_atomData.characterSet.set(toASCIILower(character)); + } +} + +inline void Term::extendGroupSubpattern(const Term& term) +{ + ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group); + if (m_termType != TermType::Group) + return; + m_atomData.group.terms.append(term); +} + +inline void Term::quantify(const AtomQuantifier& quantifier) +{ + ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once."); + m_quantifier = quantifier; +} + +inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const +{ + ImmutableCharNFANodeBuilder newEnd(nfa); + generateGraph(nfa, source, newEnd.nodeId()); + newEnd.setActions(finalActions.begin(), finalActions.end()); + return newEnd; +} + +inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const +{ + ASSERT(isValid()); + + switch (m_quantifier) { + case AtomQuantifier::One: { + generateSubgraphForAtom(nfa, source, destination); + break; + } + case AtomQuantifier::ZeroOrOne: { + generateSubgraphForAtom(nfa, source, destination); + source.addEpsilonTransition(destination); + break; + } + case AtomQuantifier::ZeroOrMore: { + ImmutableCharNFANodeBuilder repeatStart(nfa); + source.addEpsilonTransition(repeatStart); + + ImmutableCharNFANodeBuilder repeatEnd(nfa); + generateSubgraphForAtom(nfa, repeatStart, repeatEnd); + repeatEnd.addEpsilonTransition(repeatStart); + + repeatEnd.addEpsilonTransition(destination); + source.addEpsilonTransition(destination); + break; + } + case AtomQuantifier::OneOrMore: { + ImmutableCharNFANodeBuilder repeatStart(nfa); + source.addEpsilonTransition(repeatStart); + + ImmutableCharNFANodeBuilder repeatEnd(nfa); + generateSubgraphForAtom(nfa, repeatStart, repeatEnd); + repeatEnd.addEpsilonTransition(repeatStart); + + repeatEnd.addEpsilonTransition(destination); + break; + } + } +} + +inline bool Term::isEndOfLineAssertion() const +{ + return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0); +} + +inline bool Term::matchesAtLeastOneCharacter() const +{ + ASSERT(isValid()); + + if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore) + return false; + if (isEndOfLineAssertion()) + return false; + + if (m_termType == TermType::Group) { + for (const Term& term : m_atomData.group.terms) { + if (term.matchesAtLeastOneCharacter()) + return true; + } + return false; + } + return true; +} + +inline bool Term::isKnownToMatchAnyString() const +{ + ASSERT(isValid()); + + switch (m_termType) { + case TermType::Empty: + ASSERT_NOT_REACHED(); + break; + case TermType::CharacterSet: + // ".*" is the only simple term matching any string. + return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore; + break; + case TermType::Group: { + // There are infinitely many ways to match anything with groups, we just handle simple cases + if (m_atomData.group.terms.size() != 1) + return false; + + const Term& firstTermInGroup = m_atomData.group.terms.first(); + // -(.*) with any quantifier. + if (firstTermInGroup.isKnownToMatchAnyString()) + return true; + + if (firstTermInGroup.isUniversalTransition()) { + // -(.)*, (.+)*, (.?)* etc. + if (m_quantifier == AtomQuantifier::ZeroOrMore) + return true; + + // -(.+)?. + if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore) + return true; + + // -(.?)+. + if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne) + return true; + } + break; + } + } + return false; +} + +inline bool Term::hasFixedLength() const +{ + ASSERT(isValid()); + + switch (m_termType) { + case TermType::Empty: + ASSERT_NOT_REACHED(); + break; + case TermType::CharacterSet: + return m_quantifier == AtomQuantifier::One; + case TermType::Group: { + if (m_quantifier != AtomQuantifier::One) + return false; + for (const Term& term : m_atomData.group.terms) { + if (!term.hasFixedLength()) + return false; + } + return true; + } + } + return false; +} + +inline Term& Term::operator=(const Term& other) +{ + destroy(); + new (NotNull, this) Term(other); + return *this; +} + +inline Term& Term::operator=(Term&& other) +{ + destroy(); + new (NotNull, this) Term(WTFMove(other)); + return *this; +} + +inline bool Term::operator==(const Term& other) const +{ + if (other.m_termType != m_termType || other.m_quantifier != m_quantifier) + return false; + + switch (m_termType) { + case TermType::Empty: + return true; + case TermType::CharacterSet: + return m_atomData.characterSet == other.m_atomData.characterSet; + case TermType::Group: + return m_atomData.group == other.m_atomData.group; + } + ASSERT_NOT_REACHED(); + return false; +} + +inline unsigned Term::hash() const +{ + unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier); + unsigned secondary = 0; + switch (m_termType) { + case TermType::Empty: + secondary = 52184393; + break; + case TermType::CharacterSet: + secondary = m_atomData.characterSet.hash(); + break; + case TermType::Group: + secondary = m_atomData.group.hash(); + break; + } + return WTF::pairIntHash(primary, secondary); +} + +inline bool Term::isEmptyValue() const +{ + return m_termType == TermType::Empty; +} + +inline bool Term::isUniversalTransition() const +{ + ASSERT(isValid()); + + switch (m_termType) { + case TermType::Empty: + ASSERT_NOT_REACHED(); + break; + case TermType::CharacterSet: + return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount()) + || (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 127 && !m_atomData.characterSet.get(0)); + case TermType::Group: + return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition(); + } + return false; +} + +inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const +{ + generateSubgraphForAtom(nfa, source, destination.nodeId()); +} + +inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const +{ + switch (m_termType) { + case TermType::Empty: + ASSERT_NOT_REACHED(); + source.addEpsilonTransition(destination); + break; + case TermType::CharacterSet: { + if (!m_atomData.characterSet.inverted()) { + UChar i = 0; + while (true) { + while (i < 128 && !m_atomData.characterSet.get(i)) + ++i; + if (i == 128) + break; + + UChar start = i; + ++i; + while (i < 128 && m_atomData.characterSet.get(i)) + ++i; + source.addTransition(start, i - 1, destination); + } + } else { + UChar i = 1; + while (true) { + while (i < 128 && m_atomData.characterSet.get(i)) + ++i; + if (i == 128) + break; + + UChar start = i; + ++i; + while (i < 128 && !m_atomData.characterSet.get(i)) + ++i; + source.addTransition(start, i - 1, destination); + } + } + break; + } + case TermType::Group: { + if (m_atomData.group.terms.isEmpty()) { + // FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion. + source.addEpsilonTransition(destination); + return; + } + + if (m_atomData.group.terms.size() == 1) { + m_atomData.group.terms.first().generateGraph(nfa, source, destination); + return; + } + + ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList()); + for (unsigned i = 1; i < m_atomData.group.terms.size() - 1; ++i) { + const Term& currentTerm = m_atomData.group.terms[i]; + ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList()); + lastTarget = WTFMove(newNode); + } + const Term& lastTerm = m_atomData.group.terms.last(); + lastTerm.generateGraph(nfa, lastTarget, destination); + break; + } + } +} + +inline void Term::destroy() +{ + switch (m_termType) { + case TermType::Empty: + break; + case TermType::CharacterSet: + m_atomData.characterSet.~CharacterSet(); + break; + case TermType::Group: + m_atomData.group.~Group(); + break; + } + m_termType = TermType::Empty; +} + +#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING +inline size_t Term::memoryUsed() const +{ + size_t extraMemory = 0; + if (m_termType == TermType::Group) { + for (const Term& term : m_atomData.group.terms) + extraMemory += term.memoryUsed(); + } + return sizeof(Term) + extraMemory; +} +#endif + +} // namespace ContentExtensions +} // namespace WebCore + +#endif // ENABLE(CONTENT_EXTENSIONS) |