summaryrefslogtreecommitdiff
path: root/Source/WTF/wtf/BloomFilter.h
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2017-06-27 06:07:23 +0000
commit1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch)
tree46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WTF/wtf/BloomFilter.h
parent32761a6cee1d0dee366b885b7b9c777e67885688 (diff)
downloadWebKitGtk-tarball-master.tar.gz
Diffstat (limited to 'Source/WTF/wtf/BloomFilter.h')
-rw-r--r--Source/WTF/wtf/BloomFilter.h184
1 files changed, 156 insertions, 28 deletions
diff --git a/Source/WTF/wtf/BloomFilter.h b/Source/WTF/wtf/BloomFilter.h
index e14cb280e..afb17b4d6 100644
--- a/Source/WTF/wtf/BloomFilter.h
+++ b/Source/WTF/wtf/BloomFilter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Apple Inc. All rights reserved.
+ * Copyright (C) 2011, 2015 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,31 +26,150 @@
#ifndef BloomFilter_h
#define BloomFilter_h
+#include <array>
#include <wtf/text/AtomicString.h>
namespace WTF {
-// Counting bloom filter with k=2 and 8 bit counters. Uses 2^keyBits bytes of memory.
+// Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory.
// False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique
// keys and m is the table size (==2^keyBits).
+// See http://en.wikipedia.org/wiki/Bloom_filter
template <unsigned keyBits>
class BloomFilter {
WTF_MAKE_FAST_ALLOCATED;
public:
- static_assert(keyBits <= 16, "BloomFilter key size must be less than or equal to 16!");
-
static const size_t tableSize = 1 << keyBits;
+
+ BloomFilter();
+
+ void add(unsigned hash);
+ // For example SHA1::Digest.
+ template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&);
+
+ void add(const BloomFilter&);
+
+ // The filter may give false positives (claim it may contain a key it doesn't)
+ // but never false negatives (claim it doesn't contain a key it does).
+ bool mayContain(unsigned hash) const;
+ template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const;
+
+ void clear();
+
+ void add(const AtomicString& string) { add(string.impl()->existingHash()); }
+ void add(const String& string) { add(string.impl()->hash()); }
+ bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); }
+ bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }
+
+private:
+ static const unsigned bitsPerPosition = 8 * sizeof(unsigned);
static const unsigned keyMask = (1 << keyBits) - 1;
- static uint8_t maximumCount() { return std::numeric_limits<uint8_t>::max(); }
+ static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; }
+ static unsigned bitMask(unsigned key) { return 1 << (key % bitsPerPosition); }
+ template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&);
+
+ bool isBitSet(unsigned key) const;
+ void setBit(unsigned key);
+
+ std::array<unsigned, tableSize / bitsPerPosition> m_bitArray;
+};
+
+template <unsigned keyBits>
+inline BloomFilter<keyBits>::BloomFilter()
+ : m_bitArray()
+{
+}
+
+template <unsigned keyBits>
+inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const
+{
+ // The top and bottom bits of the incoming hash are treated as independent bloom filter hash functions.
+ // This works well as long as the filter size is not much above 2^16.
+ return isBitSet(hash) && isBitSet(hash >> 16);
+}
+
+template <unsigned keyBits>
+inline void BloomFilter<keyBits>::add(unsigned hash)
+{
+ setBit(hash);
+ setBit(hash >> 16);
+}
+
+template <unsigned keyBits>
+template <size_t hashSize>
+inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash)
+{
+ // We could use larger k value than 2 for long hashes.
+ static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short");
+ return {
+ *reinterpret_cast<const unsigned*>(hash.data()),
+ *reinterpret_cast<const unsigned*>(hash.data() + sizeof(unsigned))
+ };
+}
+
+template <unsigned keyBits>
+template <size_t hashSize>
+inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const
+{
+ auto keys = keysFromHash(hash);
+ return isBitSet(keys.first) && isBitSet(keys.second);
+}
+
+template <unsigned keyBits>
+template <size_t hashSize>
+inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash)
+{
+ auto keys = keysFromHash(hash);
+ setBit(keys.first);
+ setBit(keys.second);
+}
+
+template <unsigned keyBits>
+inline void BloomFilter<keyBits>::add(const BloomFilter& other)
+{
+ for (size_t i = 0; i < m_bitArray.size(); ++i)
+ m_bitArray[i] |= other.m_bitArray[i];
+}
+
+template <unsigned keyBits>
+bool BloomFilter<keyBits>::isBitSet(unsigned key) const
+{
+ unsigned maskedKey = key & keyMask;
+ ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
+ return m_bitArray[arrayIndex(maskedKey)] & bitMask(maskedKey);
+}
+
+template <unsigned keyBits>
+void BloomFilter<keyBits>::setBit(unsigned key)
+{
+ unsigned maskedKey = key & keyMask;
+ ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
+ m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey);
+}
+
+template <unsigned keyBits>
+inline void BloomFilter<keyBits>::clear()
+{
+ m_bitArray.fill(0);
+}
+
+// Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as above.
+// See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
+template <unsigned keyBits>
+class CountingBloomFilter {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ static const size_t tableSize = 1 << keyBits;
+ static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); }
- BloomFilter() { clear(); }
+ CountingBloomFilter();
void add(unsigned hash);
void remove(unsigned hash);
// The filter may give false positives (claim it may contain a key it doesn't)
// but never false negatives (claim it doesn't contain a key it does).
- bool mayContain(unsigned hash) const { return firstSlot(hash) && secondSlot(hash); }
+ bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); }
// The filter must be cleared before reuse even if all keys are removed.
// Otherwise overflowed keys will stick around.
@@ -71,19 +190,27 @@ public:
#endif
private:
- uint8_t& firstSlot(unsigned hash) { return m_table[hash & keyMask]; }
- uint8_t& secondSlot(unsigned hash) { return m_table[(hash >> 16) & keyMask]; }
- const uint8_t& firstSlot(unsigned hash) const { return m_table[hash & keyMask]; }
- const uint8_t& secondSlot(unsigned hash) const { return m_table[(hash >> 16) & keyMask]; }
+ static const unsigned keyMask = (1 << keyBits) - 1;
- uint8_t m_table[tableSize];
+ uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; }
+ uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; }
+ const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; }
+ const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; }
+
+ std::array<uint8_t, tableSize> m_buckets;
};
-
+
template <unsigned keyBits>
-inline void BloomFilter<keyBits>::add(unsigned hash)
+inline CountingBloomFilter<keyBits>::CountingBloomFilter()
+ : m_buckets()
{
- uint8_t& first = firstSlot(hash);
- uint8_t& second = secondSlot(hash);
+}
+
+template <unsigned keyBits>
+inline void CountingBloomFilter<keyBits>::add(unsigned hash)
+{
+ auto& first = firstBucket(hash);
+ auto& second = secondBucket(hash);
if (LIKELY(first < maximumCount()))
++first;
if (LIKELY(second < maximumCount()))
@@ -91,13 +218,13 @@ inline void BloomFilter<keyBits>::add(unsigned hash)
}
template <unsigned keyBits>
-inline void BloomFilter<keyBits>::remove(unsigned hash)
+inline void CountingBloomFilter<keyBits>::remove(unsigned hash)
{
- uint8_t& first = firstSlot(hash);
- uint8_t& second = secondSlot(hash);
+ auto& first = firstBucket(hash);
+ auto& second = secondBucket(hash);
ASSERT(first);
ASSERT(second);
- // In case of an overflow, the slot sticks in the table until clear().
+ // In case of an overflow, the bucket sticks in the table until clear().
if (LIKELY(first < maximumCount()))
--first;
if (LIKELY(second < maximumCount()))
@@ -105,27 +232,27 @@ inline void BloomFilter<keyBits>::remove(unsigned hash)
}
template <unsigned keyBits>
-inline void BloomFilter<keyBits>::clear()
+inline void CountingBloomFilter<keyBits>::clear()
{
- memset(m_table, 0, tableSize);
+ m_buckets.fill(0);
}
#if !ASSERT_DISABLED
template <unsigned keyBits>
-bool BloomFilter<keyBits>::likelyEmpty() const
+bool CountingBloomFilter<keyBits>::likelyEmpty() const
{
- for (size_t n = 0; n < tableSize; ++n) {
- if (m_table[n] && m_table[n] != maximumCount())
+ for (auto& bucket : m_buckets) {
+ if (bucket && bucket != maximumCount())
return false;
}
return true;
}
template <unsigned keyBits>
-bool BloomFilter<keyBits>::isClear() const
+bool CountingBloomFilter<keyBits>::isClear() const
{
- for (size_t n = 0; n < tableSize; ++n) {
- if (m_table[n])
+ for (auto& bucket : m_buckets) {
+ if (bucket)
return false;
}
return true;
@@ -135,5 +262,6 @@ bool BloomFilter<keyBits>::isClear() const
}
using WTF::BloomFilter;
+using WTF::CountingBloomFilter;
#endif