diff options
Diffstat (limited to 'src/test/common/test_bloom_filter.cc')
-rw-r--r-- | src/test/common/test_bloom_filter.cc | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc new file mode 100644 index 00000000000..8e3661b2cc1 --- /dev/null +++ b/src/test/common/test_bloom_filter.cc @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank <info@inktank.com> + * + * LGPL2.1 (see COPYING-LGPL2.1) or later + */ + +#include <iostream> +#include <gtest/gtest.h> + +#include "include/stringify.h" +#include "common/bloom_filter.hpp" + +TEST(BloomFilter, Basic) { + bloom_filter bf(10, .1, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); +} + +TEST(BloomFilter, Sweep) { + std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + for (int ex = 3; ex < 12; ex += 2) { + for (float fpp = .001; fpp < .5; fpp *= 4.0) { + int max = 2 << ex; + bloom_filter bf(max, fpp, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); + + for (int n = 0; n < max; n++) + bf.insert("ok" + stringify(n)); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains("asdf" + stringify(n))) + hit++; + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + ASSERT_TRUE(actual < fpp * 10); + + } + } +} + +TEST(BloomFilter, SweepInt) { + std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + for (int ex = 3; ex < 12; ex += 2) { + for (float fpp = .001; fpp < .5; fpp *= 4.0) { + int max = 2 << ex; + bloom_filter bf(max, fpp, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(123); + ASSERT_TRUE(456); + + for (int n = 0; n < max; n++) + bf.insert(n); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains(100000 + n)) + hit++; + + ASSERT_TRUE(123); + ASSERT_TRUE(456); + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + ASSERT_TRUE(actual < fpp * 10); + ASSERT_TRUE(actual > fpp / 10); + } + } +} + + +TEST(BloomFilter, BinSweep) { + int total_max = 16384; + float total_fpp = .01; + std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; + for (int bins = 1; bins < 16; ++bins) { + int max = total_max / bins; + float fpp = total_fpp / bins;//pow(total_fpp, bins); + + std::vector<bloom_filter*> ls; + bufferlist bl; + for (int i=0; i<bins; i++) { + ls.push_back(new bloom_filter(max, fpp, i)); + for (int j=0; j<max; j++) { + ls.back()->insert(10000 * (i+1) + j); + } + ::encode(*ls.front(), bl); + } + + int hit = 0; + int test = max * 100; + for (int i=0; i<test; ++i) { + for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!! + hit++; + break; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp + << " actual-fpp " << actual + << " total-size " << bl.length() << std::endl; + } +} + +// disable these tests; doing dual insertions in consecutive filters +// appears to be equivalent to doing a single insertion in a bloom +// filter that is twice as big. +#if 0 + +// test the fpp over a sequence of bloom filters, each with unique +// items inserted into it. +// +// we expect: actual_fpp = num_filters * per_filter_fpp +TEST(BloomFilter, Sequence) { + + int max = 1024; + double fpp = .01; + for (int seq = 2; seq <= 128; seq *= 2) { + std::vector<bloom_filter*> ls; + for (int i=0; i<seq; i++) { + ls.push_back(new bloom_filter(max*2, fpp, i)); + for (int j=0; j<max; j++) { + ls.back()->insert("ok" + stringify(j) + "_" + stringify(i)); + if (ls.size() > 1) + ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); + } + } + + int hit = 0; + int test = max * 100; + for (int i=0; i<test; ++i) { + for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains("bad" + stringify(i))) { + hit++; + break; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl; + } +} + +// test the ffp over a sequence of bloom filters, where actual values +// are always inserted into a consecutive pair of filters. in order +// to have a false positive, we need to falsely match two consecutive +// filters. +// +// we expect: actual_fpp = num_filters * per_filter_fpp^2 +TEST(BloomFilter, SequenceDouble) { + int max = 1024; + double fpp = .01; + for (int seq = 2; seq <= 128; seq *= 2) { + std::vector<bloom_filter*> ls; + for (int i=0; i<seq; i++) { + ls.push_back(new bloom_filter(max*2, fpp, i)); + for (int j=0; j<max; j++) { + ls.back()->insert("ok" + stringify(j) + "_" + stringify(i)); + if (ls.size() > 1) + ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); + } + } + + int hit = 0; + int test = max * 100; + int run = 0; + for (int i=0; i<test; ++i) { + for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains("bad" + stringify(i))) { + run++; + if (run >= 2) { + hit++; + break; + } + } else { + run = 0; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual + << " expected " << (fpp*fpp*(double)seq) << std::endl; + } +} + +#endif |