1 files changed, 156 insertions, 28 deletions
diff --git a/Source/WTF/wtf/BloomFilter.h b/Source/WTF/wtf/BloomFilter.h
index e14cb280e..afb17b4d6 100644
--- a/Source/WTF/wtf/BloomFilter.h
+++ b/Source/WTF/wtf/BloomFilter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 Apple Inc. All rights reserved.
+ * Copyright (C) 2011, 2015 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -26,31 +26,150 @@
 #ifndef BloomFilter_h
 #define BloomFilter_h
 
+#include <array>
 #include <wtf/text/AtomicString.h>
 
 namespace WTF {
 
-// Counting bloom filter with k=2 and 8 bit counters. Uses 2^keyBits bytes of memory.
+// Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory.
 // False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique 
 // keys and m is the table size (==2^keyBits).
+// See http://en.wikipedia.org/wiki/Bloom_filter
 template <unsigned keyBits>
 class BloomFilter {
     WTF_MAKE_FAST_ALLOCATED;
 public:
-    static_assert(keyBits <= 16, "BloomFilter key size must be less than or equal to 16!");
-
     static const size_t tableSize = 1 << keyBits;
+
+    BloomFilter();
+
+    void add(unsigned hash);
+    // For example SHA1::Digest.
+    template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&);
+
+    void add(const BloomFilter&);
+
+    // The filter may give false positives (claim it may contain a key it doesn't)
+    // but never false negatives (claim it doesn't contain a key it does).
+    bool mayContain(unsigned hash) const;
+    template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const;
+    
+    void clear();
+
+    void add(const AtomicString& string) { add(string.impl()->existingHash()); }
+    void add(const String& string) { add(string.impl()->hash()); }
+    bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); }
+    bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }
+
+private:
+    static const unsigned bitsPerPosition = 8 * sizeof(unsigned);
     static const unsigned keyMask = (1 << keyBits) - 1;
-    static uint8_t maximumCount() { return std::numeric_limits<uint8_t>::max(); }
+    static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; }
+    static unsigned bitMask(unsigned key) { return 1 << (key % bitsPerPosition); }
+    template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&);
+
+    bool isBitSet(unsigned key) const;
+    void setBit(unsigned key);
+
+    std::array<unsigned, tableSize / bitsPerPosition> m_bitArray;
+};
+
+template <unsigned keyBits>
+inline BloomFilter<keyBits>::BloomFilter()
+    : m_bitArray()
+{
+}
+
+template <unsigned keyBits>
+inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const
+{
+    // The top and bottom bits of the incoming hash are treated as independent bloom filter hash functions.
+    // This works well as long as the filter size is not much above 2^16.
+    return isBitSet(hash) && isBitSet(hash >> 16);
+}
+
+template <unsigned keyBits>
+inline void BloomFilter<keyBits>::add(unsigned hash)
+{
+    setBit(hash);
+    setBit(hash >> 16);
+}
+
+template <unsigned keyBits>
+template <size_t hashSize>
+inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash)
+{
+    // We could use larger k value than 2 for long hashes.
+    static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short");
+    return {
+        *reinterpret_cast<const unsigned*>(hash.data()),
+        *reinterpret_cast<const unsigned*>(hash.data() + sizeof(unsigned))
+    };
+}
+
+template <unsigned keyBits>
+template <size_t hashSize>
+inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const
+{
+    auto keys = keysFromHash(hash);
+    return isBitSet(keys.first) && isBitSet(keys.second);
+}
+
+template <unsigned keyBits>
+template <size_t hashSize>
+inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash)
+{
+    auto keys = keysFromHash(hash);
+    setBit(keys.first);
+    setBit(keys.second);
+}
+
+template <unsigned keyBits>
+inline void BloomFilter<keyBits>::add(const BloomFilter& other)
+{
+    for (size_t i = 0; i < m_bitArray.size(); ++i)
+        m_bitArray[i] |= other.m_bitArray[i];
+}
+
+template <unsigned keyBits>
+bool BloomFilter<keyBits>::isBitSet(unsigned key) const
+{
+    unsigned maskedKey = key & keyMask;
+    ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
+    return m_bitArray[arrayIndex(maskedKey)] & bitMask(maskedKey);
+}
+
+template <unsigned keyBits>
+void BloomFilter<keyBits>::setBit(unsigned key)
+{
+    unsigned maskedKey = key & keyMask;
+    ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
+    m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey);
+}
+
+template <unsigned keyBits>
+inline void BloomFilter<keyBits>::clear()
+{
+    m_bitArray.fill(0);
+}
+
+// Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as above.
+// See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
+template <unsigned keyBits>
+class CountingBloomFilter {
+    WTF_MAKE_FAST_ALLOCATED;
+public:
+    static const size_t tableSize = 1 << keyBits;
+    static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); }
     
-    BloomFilter() { clear(); }
+    CountingBloomFilter();
 
     void add(unsigned hash);
     void remove(unsigned hash);
 
     // The filter may give false positives (claim it may contain a key it doesn't)
     // but never false negatives (claim it doesn't contain a key it does).
-    bool mayContain(unsigned hash) const { return firstSlot(hash) && secondSlot(hash); }
+    bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); }
     
     // The filter must be cleared before reuse even if all keys are removed.
     // Otherwise overflowed keys will stick around.
@@ -71,19 +190,27 @@ public:
 #endif
 
 private:
-    uint8_t& firstSlot(unsigned hash) { return m_table[hash & keyMask]; }
-    uint8_t& secondSlot(unsigned hash) { return m_table[(hash >> 16) & keyMask]; }
-    const uint8_t& firstSlot(unsigned hash) const { return m_table[hash & keyMask]; }
-    const uint8_t& secondSlot(unsigned hash) const { return m_table[(hash >> 16) & keyMask]; }
+    static const unsigned keyMask = (1 << keyBits) - 1;
 
-    uint8_t m_table[tableSize];
+    uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; }
+    uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; }
+    const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; }
+    const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; }
+
+    std::array<uint8_t, tableSize> m_buckets;
 };
-    
+
 template <unsigned keyBits>
-inline void BloomFilter<keyBits>::add(unsigned hash)
+inline CountingBloomFilter<keyBits>::CountingBloomFilter()
+    : m_buckets()
 {
-    uint8_t& first = firstSlot(hash);
-    uint8_t& second = secondSlot(hash);
+}
+
+template <unsigned keyBits>
+inline void CountingBloomFilter<keyBits>::add(unsigned hash)
+{
+    auto& first = firstBucket(hash);
+    auto& second = secondBucket(hash);
     if (LIKELY(first < maximumCount()))
         ++first;
     if (LIKELY(second < maximumCount()))
@@ -91,13 +218,13 @@ inline void BloomFilter<keyBits>::add(unsigned hash)
 }
 
 template <unsigned keyBits>
-inline void BloomFilter<keyBits>::remove(unsigned hash)
+inline void CountingBloomFilter<keyBits>::remove(unsigned hash)
 {
-    uint8_t& first = firstSlot(hash);
-    uint8_t& second = secondSlot(hash);
+    auto& first = firstBucket(hash);
+    auto& second = secondBucket(hash);
     ASSERT(first);
     ASSERT(second);
-    // In case of an overflow, the slot sticks in the table until clear().
+    // In case of an overflow, the bucket sticks in the table until clear().
     if (LIKELY(first < maximumCount()))
         --first;
     if (LIKELY(second < maximumCount()))
@@ -105,27 +232,27 @@ inline void BloomFilter<keyBits>::remove(unsigned hash)
 }
     
 template <unsigned keyBits>
-inline void BloomFilter<keyBits>::clear()
+inline void CountingBloomFilter<keyBits>::clear()
 {
-    memset(m_table, 0, tableSize);
+    m_buckets.fill(0);
 }
 
 #if !ASSERT_DISABLED
 template <unsigned keyBits>
-bool BloomFilter<keyBits>::likelyEmpty() const
+bool CountingBloomFilter<keyBits>::likelyEmpty() const
 {
-    for (size_t n = 0; n < tableSize; ++n) {
-        if (m_table[n] && m_table[n] != maximumCount())
+    for (auto& bucket : m_buckets) {
+        if (bucket && bucket != maximumCount())
             return false;
     }
     return true;
 }
 
 template <unsigned keyBits>
-bool BloomFilter<keyBits>::isClear() const
+bool CountingBloomFilter<keyBits>::isClear() const
 {
-    for (size_t n = 0; n < tableSize; ++n) {
-        if (m_table[n])
+    for (auto& bucket : m_buckets) {
+        if (bucket)
             return false;
     }
     return true;
@@ -135,5 +262,6 @@ bool BloomFilter<keyBits>::isClear() const
 }
 
 using WTF::BloomFilter;
+using WTF::CountingBloomFilter;
 
 #endif