summaryrefslogtreecommitdiff
path: root/Source/WebCore/contentextensions/Term.h
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
commit1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch)
tree46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WebCore/contentextensions/Term.h
parent32761a6cee1d0dee366b885b7b9c777e67885688 (diff)
downloadWebKitGtk-tarball-master.tar.gz
Diffstat (limited to 'Source/WebCore/contentextensions/Term.h')
-rw-r--r--Source/WebCore/contentextensions/Term.h687
1 files changed, 687 insertions, 0 deletions
diff --git a/Source/WebCore/contentextensions/Term.h b/Source/WebCore/contentextensions/Term.h
new file mode 100644
index 000000000..0d51833f5
--- /dev/null
+++ b/Source/WebCore/contentextensions/Term.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright (C) 2014-2015 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#if ENABLE(CONTENT_EXTENSIONS)
+
+#include "ContentExtensionsDebugging.h"
+#include "NFA.h"
+#include <unicode/utypes.h>
+#include <wtf/ASCIICType.h>
+#include <wtf/HashMap.h>
+#include <wtf/Vector.h>
+#include <wtf/text/StringBuilder.h>
+#include <wtf/text/WTFString.h>
+
+namespace WebCore {
+
+namespace ContentExtensions {
+
+enum class AtomQuantifier : uint8_t {
+ One,
+ ZeroOrOne,
+ ZeroOrMore,
+ OneOrMore
+};
+
+class Term {
+public:
+ Term();
+ Term(char character, bool isCaseSensitive);
+
+ enum UniversalTransitionTag { UniversalTransition };
+ explicit Term(UniversalTransitionTag);
+
+ enum CharacterSetTermTag { CharacterSetTerm };
+ explicit Term(CharacterSetTermTag, bool isInverted);
+
+ enum GroupTermTag { GroupTerm };
+ explicit Term(GroupTermTag);
+
+ enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm };
+ explicit Term(EndOfLineAssertionTermTag);
+
+ Term(const Term& other);
+ Term(Term&& other);
+
+ enum EmptyValueTag { EmptyValue };
+ Term(EmptyValueTag);
+
+ ~Term();
+
+ bool isValid() const;
+
+ // CharacterSet terms only.
+ void addCharacter(UChar character, bool isCaseSensitive);
+
+ // Group terms only.
+ void extendGroupSubpattern(const Term&);
+
+ void quantify(const AtomQuantifier&);
+
+ ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const;
+ void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
+
+ bool isEndOfLineAssertion() const;
+
+ bool matchesAtLeastOneCharacter() const;
+
+ // Matches any string, the empty string included.
+ // This is very conservative. Patterns matching any string can return false here.
+ bool isKnownToMatchAnyString() const;
+
+ // Return true if the term can only match input of a single fixed length.
+ bool hasFixedLength() const;
+
+ Term& operator=(const Term& other);
+ Term& operator=(Term&& other);
+
+ bool operator==(const Term& other) const;
+
+ unsigned hash() const;
+
+ bool isEmptyValue() const;
+
+#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
+ String toString() const;
+#endif
+
+#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
+ size_t memoryUsed() const;
+#endif
+
+private:
+ // This is exact for character sets but conservative for groups.
+ // The return value can be false for a group equivalent to a universal transition.
+ bool isUniversalTransition() const;
+
+ void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const;
+ void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;
+
+ void destroy();
+
+ enum class TermType : uint8_t {
+ Empty,
+ CharacterSet,
+ Group
+ };
+
+ TermType m_termType { TermType::Empty };
+ AtomQuantifier m_quantifier { AtomQuantifier::One };
+
+ class CharacterSet {
+ public:
+ void set(UChar character)
+ {
+ RELEASE_ASSERT(character < 128);
+ m_characters[character / 64] |= (uint64_t(1) << (character % 64));
+ }
+
+ bool get(UChar character) const
+ {
+ RELEASE_ASSERT(character < 128);
+ return m_characters[character / 64] & (uint64_t(1) << (character % 64));
+ }
+
+ void invert()
+ {
+ ASSERT(!m_inverted);
+ m_inverted = true;
+ }
+
+ bool inverted() const { return m_inverted; }
+
+ unsigned bitCount() const
+ {
+ return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]);
+ }
+
+ bool operator==(const CharacterSet& other) const
+ {
+ return other.m_inverted == m_inverted
+ && other.m_characters[0] == m_characters[0]
+ && other.m_characters[1] == m_characters[1];
+ }
+
+ unsigned hash() const
+ {
+ return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted);
+ }
+ private:
+ bool m_inverted { false };
+ uint64_t m_characters[2] { 0, 0 };
+ };
+
+ struct Group {
+ Vector<Term> terms;
+
+ bool operator==(const Group& other) const
+ {
+ return other.terms == terms;
+ }
+
+ unsigned hash() const
+ {
+ unsigned hash = 6421749;
+ for (const Term& term : terms) {
+ unsigned termHash = term.hash();
+ hash = (hash << 16) ^ ((termHash << 11) ^ hash);
+ hash += hash >> 11;
+ }
+ return hash;
+ }
+ };
+
+ union AtomData {
+ AtomData()
+ : invalidTerm(0)
+ {
+ }
+ ~AtomData()
+ {
+ }
+
+ char invalidTerm;
+ CharacterSet characterSet;
+ Group group;
+ } m_atomData;
+};
+
+#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
+inline String quantifierToString(AtomQuantifier quantifier)
+{
+ switch (quantifier) {
+ case AtomQuantifier::One:
+ return "";
+ case AtomQuantifier::ZeroOrOne:
+ return "?";
+ case AtomQuantifier::ZeroOrMore:
+ return "*";
+ case AtomQuantifier::OneOrMore:
+ return "+";
+ }
+}
+
+inline String Term::toString() const
+{
+ switch (m_termType) {
+ case TermType::Empty:
+ return "(Empty)";
+ case TermType::CharacterSet: {
+ StringBuilder builder;
+ builder.append('[');
+ for (UChar c = 0; c < 128; c++) {
+ if (m_atomData.characterSet.get(c)) {
+ if (isASCIIPrintable(c) && !isASCIISpace(c))
+ builder.append(c);
+ else {
+ builder.append('\\');
+ builder.append('u');
+ builder.appendNumber(c);
+ }
+ }
+ }
+ builder.append(']');
+ builder.append(quantifierToString(m_quantifier));
+ return builder.toString();
+ }
+ case TermType::Group: {
+ StringBuilder builder;
+ builder.append('(');
+ for (const Term& term : m_atomData.group.terms)
+ builder.append(term.toString());
+ builder.append(')');
+ builder.append(quantifierToString(m_quantifier));
+ return builder.toString();
+ }
+ }
+}
+#endif
+
+inline Term::Term()
+{
+}
+
+inline Term::Term(char character, bool isCaseSensitive)
+ : m_termType(TermType::CharacterSet)
+{
+ new (NotNull, &m_atomData.characterSet) CharacterSet();
+ addCharacter(character, isCaseSensitive);
+}
+
+inline Term::Term(UniversalTransitionTag)
+ : m_termType(TermType::CharacterSet)
+{
+ new (NotNull, &m_atomData.characterSet) CharacterSet();
+ for (UChar i = 1; i < 128; ++i)
+ m_atomData.characterSet.set(i);
+}
+
+inline Term::Term(CharacterSetTermTag, bool isInverted)
+ : m_termType(TermType::CharacterSet)
+{
+ new (NotNull, &m_atomData.characterSet) CharacterSet();
+ if (isInverted)
+ m_atomData.characterSet.invert();
+}
+
+inline Term::Term(GroupTermTag)
+ : m_termType(TermType::Group)
+{
+ new (NotNull, &m_atomData.group) Group();
+}
+
+inline Term::Term(EndOfLineAssertionTermTag)
+ : Term(CharacterSetTerm, false)
+{
+ m_atomData.characterSet.set(0);
+}
+
+inline Term::Term(const Term& other)
+ : m_termType(other.m_termType)
+ , m_quantifier(other.m_quantifier)
+{
+ switch (m_termType) {
+ case TermType::Empty:
+ break;
+ case TermType::CharacterSet:
+ new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
+ break;
+ case TermType::Group:
+ new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
+ break;
+ }
+}
+
+inline Term::Term(Term&& other)
+ : m_termType(WTFMove(other.m_termType))
+ , m_quantifier(WTFMove(other.m_quantifier))
+{
+ switch (m_termType) {
+ case TermType::Empty:
+ break;
+ case TermType::CharacterSet:
+ new (NotNull, &m_atomData.characterSet) CharacterSet(WTFMove(other.m_atomData.characterSet));
+ break;
+ case TermType::Group:
+ new (NotNull, &m_atomData.group) Group(WTFMove(other.m_atomData.group));
+ break;
+ }
+ other.destroy();
+}
+
+inline Term::Term(EmptyValueTag)
+ : m_termType(TermType::Empty)
+{
+}
+
+inline Term::~Term()
+{
+ destroy();
+}
+
+inline bool Term::isValid() const
+{
+ return m_termType != TermType::Empty;
+}
+
+inline void Term::addCharacter(UChar character, bool isCaseSensitive)
+{
+ ASSERT(isASCII(character));
+
+ ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
+ if (m_termType != TermType::CharacterSet)
+ return;
+
+ if (isCaseSensitive || !isASCIIAlpha(character))
+ m_atomData.characterSet.set(character);
+ else {
+ m_atomData.characterSet.set(toASCIIUpper(character));
+ m_atomData.characterSet.set(toASCIILower(character));
+ }
+}
+
+inline void Term::extendGroupSubpattern(const Term& term)
+{
+ ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
+ if (m_termType != TermType::Group)
+ return;
+ m_atomData.group.terms.append(term);
+}
+
+inline void Term::quantify(const AtomQuantifier& quantifier)
+{
+ ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
+ m_quantifier = quantifier;
+}
+
+inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const
+{
+ ImmutableCharNFANodeBuilder newEnd(nfa);
+ generateGraph(nfa, source, newEnd.nodeId());
+ newEnd.setActions(finalActions.begin(), finalActions.end());
+ return newEnd;
+}
+
+inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
+{
+ ASSERT(isValid());
+
+ switch (m_quantifier) {
+ case AtomQuantifier::One: {
+ generateSubgraphForAtom(nfa, source, destination);
+ break;
+ }
+ case AtomQuantifier::ZeroOrOne: {
+ generateSubgraphForAtom(nfa, source, destination);
+ source.addEpsilonTransition(destination);
+ break;
+ }
+ case AtomQuantifier::ZeroOrMore: {
+ ImmutableCharNFANodeBuilder repeatStart(nfa);
+ source.addEpsilonTransition(repeatStart);
+
+ ImmutableCharNFANodeBuilder repeatEnd(nfa);
+ generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
+ repeatEnd.addEpsilonTransition(repeatStart);
+
+ repeatEnd.addEpsilonTransition(destination);
+ source.addEpsilonTransition(destination);
+ break;
+ }
+ case AtomQuantifier::OneOrMore: {
+ ImmutableCharNFANodeBuilder repeatStart(nfa);
+ source.addEpsilonTransition(repeatStart);
+
+ ImmutableCharNFANodeBuilder repeatEnd(nfa);
+ generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
+ repeatEnd.addEpsilonTransition(repeatStart);
+
+ repeatEnd.addEpsilonTransition(destination);
+ break;
+ }
+ }
+}
+
+inline bool Term::isEndOfLineAssertion() const
+{
+ return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0);
+}
+
+inline bool Term::matchesAtLeastOneCharacter() const
+{
+ ASSERT(isValid());
+
+ if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore)
+ return false;
+ if (isEndOfLineAssertion())
+ return false;
+
+ if (m_termType == TermType::Group) {
+ for (const Term& term : m_atomData.group.terms) {
+ if (term.matchesAtLeastOneCharacter())
+ return true;
+ }
+ return false;
+ }
+ return true;
+}
+
+inline bool Term::isKnownToMatchAnyString() const
+{
+ ASSERT(isValid());
+
+ switch (m_termType) {
+ case TermType::Empty:
+ ASSERT_NOT_REACHED();
+ break;
+ case TermType::CharacterSet:
+ // ".*" is the only simple term matching any string.
+ return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore;
+ break;
+ case TermType::Group: {
+ // There are infinitely many ways to match anything with groups, we just handle simple cases
+ if (m_atomData.group.terms.size() != 1)
+ return false;
+
+ const Term& firstTermInGroup = m_atomData.group.terms.first();
+ // -(.*) with any quantifier.
+ if (firstTermInGroup.isKnownToMatchAnyString())
+ return true;
+
+ if (firstTermInGroup.isUniversalTransition()) {
+ // -(.)*, (.+)*, (.?)* etc.
+ if (m_quantifier == AtomQuantifier::ZeroOrMore)
+ return true;
+
+ // -(.+)?.
+ if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore)
+ return true;
+
+ // -(.?)+.
+ if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne)
+ return true;
+ }
+ break;
+ }
+ }
+ return false;
+}
+
+inline bool Term::hasFixedLength() const
+{
+ ASSERT(isValid());
+
+ switch (m_termType) {
+ case TermType::Empty:
+ ASSERT_NOT_REACHED();
+ break;
+ case TermType::CharacterSet:
+ return m_quantifier == AtomQuantifier::One;
+ case TermType::Group: {
+ if (m_quantifier != AtomQuantifier::One)
+ return false;
+ for (const Term& term : m_atomData.group.terms) {
+ if (!term.hasFixedLength())
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+inline Term& Term::operator=(const Term& other)
+{
+ destroy();
+ new (NotNull, this) Term(other);
+ return *this;
+}
+
+inline Term& Term::operator=(Term&& other)
+{
+ destroy();
+ new (NotNull, this) Term(WTFMove(other));
+ return *this;
+}
+
+inline bool Term::operator==(const Term& other) const
+{
+ if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
+ return false;
+
+ switch (m_termType) {
+ case TermType::Empty:
+ return true;
+ case TermType::CharacterSet:
+ return m_atomData.characterSet == other.m_atomData.characterSet;
+ case TermType::Group:
+ return m_atomData.group == other.m_atomData.group;
+ }
+ ASSERT_NOT_REACHED();
+ return false;
+}
+
+inline unsigned Term::hash() const
+{
+ unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
+ unsigned secondary = 0;
+ switch (m_termType) {
+ case TermType::Empty:
+ secondary = 52184393;
+ break;
+ case TermType::CharacterSet:
+ secondary = m_atomData.characterSet.hash();
+ break;
+ case TermType::Group:
+ secondary = m_atomData.group.hash();
+ break;
+ }
+ return WTF::pairIntHash(primary, secondary);
+}
+
+inline bool Term::isEmptyValue() const
+{
+ return m_termType == TermType::Empty;
+}
+
+inline bool Term::isUniversalTransition() const
+{
+ ASSERT(isValid());
+
+ switch (m_termType) {
+ case TermType::Empty:
+ ASSERT_NOT_REACHED();
+ break;
+ case TermType::CharacterSet:
+ return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount())
+ || (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 127 && !m_atomData.characterSet.get(0));
+ case TermType::Group:
+ return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition();
+ }
+ return false;
+}
+
+inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const
+{
+ generateSubgraphForAtom(nfa, source, destination.nodeId());
+}
+
+inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
+{
+ switch (m_termType) {
+ case TermType::Empty:
+ ASSERT_NOT_REACHED();
+ source.addEpsilonTransition(destination);
+ break;
+ case TermType::CharacterSet: {
+ if (!m_atomData.characterSet.inverted()) {
+ UChar i = 0;
+ while (true) {
+ while (i < 128 && !m_atomData.characterSet.get(i))
+ ++i;
+ if (i == 128)
+ break;
+
+ UChar start = i;
+ ++i;
+ while (i < 128 && m_atomData.characterSet.get(i))
+ ++i;
+ source.addTransition(start, i - 1, destination);
+ }
+ } else {
+ UChar i = 1;
+ while (true) {
+ while (i < 128 && m_atomData.characterSet.get(i))
+ ++i;
+ if (i == 128)
+ break;
+
+ UChar start = i;
+ ++i;
+ while (i < 128 && !m_atomData.characterSet.get(i))
+ ++i;
+ source.addTransition(start, i - 1, destination);
+ }
+ }
+ break;
+ }
+ case TermType::Group: {
+ if (m_atomData.group.terms.isEmpty()) {
+ // FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion.
+ source.addEpsilonTransition(destination);
+ return;
+ }
+
+ if (m_atomData.group.terms.size() == 1) {
+ m_atomData.group.terms.first().generateGraph(nfa, source, destination);
+ return;
+ }
+
+ ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList());
+ for (unsigned i = 1; i < m_atomData.group.terms.size() - 1; ++i) {
+ const Term& currentTerm = m_atomData.group.terms[i];
+ ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList());
+ lastTarget = WTFMove(newNode);
+ }
+ const Term& lastTerm = m_atomData.group.terms.last();
+ lastTerm.generateGraph(nfa, lastTarget, destination);
+ break;
+ }
+ }
+}
+
+inline void Term::destroy()
+{
+ switch (m_termType) {
+ case TermType::Empty:
+ break;
+ case TermType::CharacterSet:
+ m_atomData.characterSet.~CharacterSet();
+ break;
+ case TermType::Group:
+ m_atomData.group.~Group();
+ break;
+ }
+ m_termType = TermType::Empty;
+}
+
+#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
+inline size_t Term::memoryUsed() const
+{
+ size_t extraMemory = 0;
+ if (m_termType == TermType::Group) {
+ for (const Term& term : m_atomData.group.terms)
+ extraMemory += term.memoryUsed();
+ }
+ return sizeof(Term) + extraMemory;
+}
+#endif
+
+} // namespace ContentExtensions
+} // namespace WebCore
+
+#endif // ENABLE(CONTENT_EXTENSIONS)