diff options
Diffstat (limited to 'Source/WTF/wtf/BloomFilter.h')
-rw-r--r-- | Source/WTF/wtf/BloomFilter.h | 184 |
1 files changed, 156 insertions, 28 deletions
diff --git a/Source/WTF/wtf/BloomFilter.h b/Source/WTF/wtf/BloomFilter.h index e14cb280e..afb17b4d6 100644 --- a/Source/WTF/wtf/BloomFilter.h +++ b/Source/WTF/wtf/BloomFilter.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Apple Inc. All rights reserved. + * Copyright (C) 2011, 2015 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,31 +26,150 @@ #ifndef BloomFilter_h #define BloomFilter_h +#include <array> #include <wtf/text/AtomicString.h> namespace WTF { -// Counting bloom filter with k=2 and 8 bit counters. Uses 2^keyBits bytes of memory. +// Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory. // False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique // keys and m is the table size (==2^keyBits). +// See http://en.wikipedia.org/wiki/Bloom_filter template <unsigned keyBits> class BloomFilter { WTF_MAKE_FAST_ALLOCATED; public: - static_assert(keyBits <= 16, "BloomFilter key size must be less than or equal to 16!"); - static const size_t tableSize = 1 << keyBits; + + BloomFilter(); + + void add(unsigned hash); + // For example SHA1::Digest. + template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&); + + void add(const BloomFilter&); + + // The filter may give false positives (claim it may contain a key it doesn't) + // but never false negatives (claim it doesn't contain a key it does). + bool mayContain(unsigned hash) const; + template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const; + + void clear(); + + void add(const AtomicString& string) { add(string.impl()->existingHash()); } + void add(const String& string) { add(string.impl()->hash()); } + bool mayContain(const AtomicString& string) const { return mayContain(string.impl()->existingHash()); } + bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); } + +private: + static const unsigned bitsPerPosition = 8 * sizeof(unsigned); static const unsigned keyMask = (1 << keyBits) - 1; - static uint8_t maximumCount() { return std::numeric_limits<uint8_t>::max(); } + static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; } + static unsigned bitMask(unsigned key) { return 1 << (key % bitsPerPosition); } + template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&); + + bool isBitSet(unsigned key) const; + void setBit(unsigned key); + + std::array<unsigned, tableSize / bitsPerPosition> m_bitArray; +}; + +template <unsigned keyBits> +inline BloomFilter<keyBits>::BloomFilter() + : m_bitArray() +{ +} + +template <unsigned keyBits> +inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const +{ + // The top and bottom bits of the incoming hash are treated as independent bloom filter hash functions. + // This works well as long as the filter size is not much above 2^16. + return isBitSet(hash) && isBitSet(hash >> 16); +} + +template <unsigned keyBits> +inline void BloomFilter<keyBits>::add(unsigned hash) +{ + setBit(hash); + setBit(hash >> 16); +} + +template <unsigned keyBits> +template <size_t hashSize> +inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash) +{ + // We could use larger k value than 2 for long hashes. + static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short"); + return { + *reinterpret_cast<const unsigned*>(hash.data()), + *reinterpret_cast<const unsigned*>(hash.data() + sizeof(unsigned)) + }; +} + +template <unsigned keyBits> +template <size_t hashSize> +inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const +{ + auto keys = keysFromHash(hash); + return isBitSet(keys.first) && isBitSet(keys.second); +} + +template <unsigned keyBits> +template <size_t hashSize> +inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash) +{ + auto keys = keysFromHash(hash); + setBit(keys.first); + setBit(keys.second); +} + +template <unsigned keyBits> +inline void BloomFilter<keyBits>::add(const BloomFilter& other) +{ + for (size_t i = 0; i < m_bitArray.size(); ++i) + m_bitArray[i] |= other.m_bitArray[i]; +} + +template <unsigned keyBits> +bool BloomFilter<keyBits>::isBitSet(unsigned key) const +{ + unsigned maskedKey = key & keyMask; + ASSERT(arrayIndex(maskedKey) < m_bitArray.size()); + return m_bitArray[arrayIndex(maskedKey)] & bitMask(maskedKey); +} + +template <unsigned keyBits> +void BloomFilter<keyBits>::setBit(unsigned key) +{ + unsigned maskedKey = key & keyMask; + ASSERT(arrayIndex(maskedKey) < m_bitArray.size()); + m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey); +} + +template <unsigned keyBits> +inline void BloomFilter<keyBits>::clear() +{ + m_bitArray.fill(0); +} + +// Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as above. +// See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters +template <unsigned keyBits> +class CountingBloomFilter { + WTF_MAKE_FAST_ALLOCATED; +public: + static const size_t tableSize = 1 << keyBits; + static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); } - BloomFilter() { clear(); } + CountingBloomFilter(); void add(unsigned hash); void remove(unsigned hash); // The filter may give false positives (claim it may contain a key it doesn't) // but never false negatives (claim it doesn't contain a key it does). - bool mayContain(unsigned hash) const { return firstSlot(hash) && secondSlot(hash); } + bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); } // The filter must be cleared before reuse even if all keys are removed. // Otherwise overflowed keys will stick around. @@ -71,19 +190,27 @@ public: #endif private: - uint8_t& firstSlot(unsigned hash) { return m_table[hash & keyMask]; } - uint8_t& secondSlot(unsigned hash) { return m_table[(hash >> 16) & keyMask]; } - const uint8_t& firstSlot(unsigned hash) const { return m_table[hash & keyMask]; } - const uint8_t& secondSlot(unsigned hash) const { return m_table[(hash >> 16) & keyMask]; } + static const unsigned keyMask = (1 << keyBits) - 1; - uint8_t m_table[tableSize]; + uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; } + uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; } + const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; } + const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; } + + std::array<uint8_t, tableSize> m_buckets; }; - + template <unsigned keyBits> -inline void BloomFilter<keyBits>::add(unsigned hash) +inline CountingBloomFilter<keyBits>::CountingBloomFilter() + : m_buckets() { - uint8_t& first = firstSlot(hash); - uint8_t& second = secondSlot(hash); +} + +template <unsigned keyBits> +inline void CountingBloomFilter<keyBits>::add(unsigned hash) +{ + auto& first = firstBucket(hash); + auto& second = secondBucket(hash); if (LIKELY(first < maximumCount())) ++first; if (LIKELY(second < maximumCount())) @@ -91,13 +218,13 @@ inline void BloomFilter<keyBits>::add(unsigned hash) } template <unsigned keyBits> -inline void BloomFilter<keyBits>::remove(unsigned hash) +inline void CountingBloomFilter<keyBits>::remove(unsigned hash) { - uint8_t& first = firstSlot(hash); - uint8_t& second = secondSlot(hash); + auto& first = firstBucket(hash); + auto& second = secondBucket(hash); ASSERT(first); ASSERT(second); - // In case of an overflow, the slot sticks in the table until clear(). + // In case of an overflow, the bucket sticks in the table until clear(). if (LIKELY(first < maximumCount())) --first; if (LIKELY(second < maximumCount())) @@ -105,27 +232,27 @@ inline void BloomFilter<keyBits>::remove(unsigned hash) } template <unsigned keyBits> -inline void BloomFilter<keyBits>::clear() +inline void CountingBloomFilter<keyBits>::clear() { - memset(m_table, 0, tableSize); + m_buckets.fill(0); } #if !ASSERT_DISABLED template <unsigned keyBits> -bool BloomFilter<keyBits>::likelyEmpty() const +bool CountingBloomFilter<keyBits>::likelyEmpty() const { - for (size_t n = 0; n < tableSize; ++n) { - if (m_table[n] && m_table[n] != maximumCount()) + for (auto& bucket : m_buckets) { + if (bucket && bucket != maximumCount()) return false; } return true; } template <unsigned keyBits> -bool BloomFilter<keyBits>::isClear() const +bool CountingBloomFilter<keyBits>::isClear() const { - for (size_t n = 0; n < tableSize; ++n) { - if (m_table[n]) + for (auto& bucket : m_buckets) { + if (bucket) return false; } return true; @@ -135,5 +262,6 @@ bool BloomFilter<keyBits>::isClear() const } using WTF::BloomFilter; +using WTF::CountingBloomFilter; #endif |