diff options
-rw-r--r--[-rwxr-xr-x] | src/sampler.cc | 71 | ||||
-rw-r--r--[-rwxr-xr-x] | src/sampler.h | 138 | ||||
-rw-r--r-- | src/static_vars.cc | 1 | ||||
-rwxr-xr-x | src/tests/sampler_test.cc | 37 | ||||
-rw-r--r-- | src/thread_cache.h | 2 |
5 files changed, 137 insertions, 112 deletions
diff --git a/src/sampler.cc b/src/sampler.cc index cc71112..f7f2fcc 100755..100644 --- a/src/sampler.cc +++ b/src/sampler.cc @@ -37,6 +37,7 @@ #include <algorithm> // For min() #include <math.h> +#include <limits> #include "base/commandlineflags.h" using std::min; @@ -57,34 +58,16 @@ DEFINE_int64(tcmalloc_sample_parameter, namespace tcmalloc { -// Statics for Sampler -double Sampler::log_table_[1<<kFastlogNumBits]; - -// Populate the lookup table for FastLog2. -// This approximates the log2 curve with a step function. -// Steps have height equal to log2 of the mid-point of the step. -void Sampler::PopulateFastLog2Table() { - for (int i = 0; i < (1<<kFastlogNumBits); i++) { - log_table_[i] = (log(1.0 + static_cast<double>(i+0.5)/(1<<kFastlogNumBits)) - / log(2.0)); - } -} - int Sampler::GetSamplePeriod() { return FLAGS_tcmalloc_sample_parameter; } // Run this before using your sampler -void Sampler::Init(uint32_t seed) { +void Sampler::Init(uint64_t seed) { + ASSERT(seed != 0); + // Initialize PRNG - if (seed != 0) { - rnd_ = seed; - } else { - rnd_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(this)); - if (rnd_ == 0) { - rnd_ = 1; - } - } + rnd_ = seed; // Step it forward 20 times for good measure for (int i = 0; i < 20; i++) { rnd_ = NextRandom(rnd_); @@ -93,10 +76,7 @@ void Sampler::Init(uint32_t seed) { bytes_until_sample_ = PickNextSamplingPoint(); } -// Initialize the Statics for the Sampler class -void Sampler::InitStatics() { - PopulateFastLog2Table(); -} +#define MAX_SSIZE (SIZE_MAX >> 1) // Generates a geometric variable with the specified mean (512K by default). 
// This is done by generating a random number between 0 and 1 and applying @@ -109,7 +89,17 @@ void Sampler::InitStatics() { // -log_e(q)/m = x // log_2(q) * (-log_e(2) * 1/m) = x // In the code, q is actually in the range 1 to 2**26, hence the -26 below -size_t Sampler::PickNextSamplingPoint() { +ssize_t Sampler::PickNextSamplingPoint() { + if (FLAGS_tcmalloc_sample_parameter <= 0) { + // In this case, we don't want to sample ever, and the larger a + // value we put here, the longer until we hit the slow path + // again. However, we have to support the flag changing at + // runtime, so pick something reasonably large (to keep overhead + // low) but small enough that we'll eventually start to sample + // again. + return 16 << 20; + } + rnd_ = NextRandom(rnd_); // Take the top 26 bits as the random number // (This plus the 1<<58 sampling bound give a max possible step of @@ -119,13 +109,26 @@ size_t Sampler::PickNextSamplingPoint() { // under piii debug for some binaries. double q = static_cast<uint32_t>(rnd_ >> (prng_mod_power - 26)) + 1.0; // Put the computed p-value through the CDF of a geometric. - // For faster performance (save ~1/20th exec time), replace - // min(0.0, FastLog2(q) - 26) by (Fastlog2(q) - 26.000705) - // The value 26.000705 is used rather than 26 to compensate - // for inaccuracies in FastLog2 which otherwise result in a - // negative answer. - return static_cast<size_t>(min(0.0, (FastLog2(q) - 26)) * (-log(2.0) - * FLAGS_tcmalloc_sample_parameter) + 1); + double interval = + (log2(q) - 26) * (-log(2.0) * FLAGS_tcmalloc_sample_parameter); + + // Very large values of interval overflow ssize_t. If we happen to + // hit such improbable condition, we simply cheat and clamp interval + // to largest supported value. 
+ return static_cast<ssize_t>(std::min<double>(interval, MAX_SSIZE)); +} + +bool Sampler::RecordAllocationSlow(size_t k) { + if (!initialized_) { + initialized_ = true; + Init(reinterpret_cast<uintptr_t>(this)); + if (static_cast<size_t>(bytes_until_sample_) >= k) { + bytes_until_sample_ -= k; + return true; + } + } + bytes_until_sample_ = PickNextSamplingPoint(); + return FLAGS_tcmalloc_sample_parameter <= 0; } } // namespace tcmalloc diff --git a/src/sampler.h b/src/sampler.h index eb316d7..16d3b09 100755..100644 --- a/src/sampler.h +++ b/src/sampler.h @@ -44,6 +44,7 @@ #include <string.h> // for memcpy #include "base/basictypes.h" // for ASSERT #include "internal_logging.h" // for ASSERT +#include "static_vars.h" namespace tcmalloc { @@ -80,7 +81,7 @@ namespace tcmalloc { // a geometric with mean tcmalloc_sample_parameter. (ie. the // probability that at least one byte in the range is marked). This // is accurately given by the CDF of the corresponding exponential -// distribution : 1 - e^(X/tcmalloc_sample_parameter_) +// distribution : 1 - e^(-X/tcmalloc_sample_parameter_) // Independence of the byte marking ensures independence of // the sampling of each allocation. // @@ -100,52 +101,114 @@ namespace tcmalloc { // only result in a single call to PickNextSamplingPoint. //------------------------------------------------------------------- +class SamplerTest; + class PERFTOOLS_DLL_DECL Sampler { public: // Initialize this sampler. - // Passing a seed of 0 gives a non-deterministic - // seed value given by casting the object ("this") - void Init(uint32_t seed); - void Cleanup(); - - // Record allocation of "k" bytes. Return true iff allocation - // should be sampled - bool SampleAllocation(size_t k); + void Init(uint64_t seed); + + // Record allocation of "k" bytes. Return true if no further work + // is needed, and false if allocation needed to be sampled. 
+ bool RecordAllocation(size_t k); + + // Same as above (but faster), except: + // a) REQUIRES(k < std::numeric_limits<ssize_t>::max()) + // b) if this returns false, you must call RecordAllocation + // to confirm if sampling is truly needed. + // + // The point of this function is to only deal with common case of no + // sampling and let caller (which is in malloc fast-path) + // "escalate" to fuller and slower logic only if necessary. + bool TryRecordAllocationFast(size_t k); // Generate a geometric with mean 512K (or FLAG_tcmalloc_sample_parameter) - size_t PickNextSamplingPoint(); - - // Initialize the statics for the Sampler class - static void InitStatics(); + ssize_t PickNextSamplingPoint(); // Returns the current sample period - int GetSamplePeriod(); + static int GetSamplePeriod(); // The following are public for the purposes of testing static uint64_t NextRandom(uint64_t rnd_); // Returns the next prng value - static double FastLog2(const double & d); // Computes Log2(x) quickly - static void PopulateFastLog2Table(); // Populate the lookup table + + // C++03 requires that types stored in TLS be POD. As a result, you must + // initialize these members to {0, 0, false} before using this class! + // + // TODO(ahh): C++11 support will let us make these private. + + // Bytes until we sample next. + // + // More specifically when bytes_until_sample_ is X, we can allocate + // X bytes without triggering sampling; on the (X+1)th allocated + // byte, the containing allocation will be sampled. + // + // Always non-negative with only very brief exceptions (see + // DecrementFast{,Finish}), so casting to size_t is ok. 
+ ssize_t bytes_until_sample_; + uint64_t rnd_; // Cheap random number generator + bool initialized_; private: - size_t bytes_until_sample_; // Bytes until we sample next - uint64_t rnd_; // Cheap random number generator - - // Statics for the fast log - // Note that this code may not depend on anything in //util - // hence the duplication of functionality here - static const int kFastlogNumBits = 10; - static const int kFastlogMask = (1 << kFastlogNumBits) - 1; - static double log_table_[1<<kFastlogNumBits]; // Constant + friend class SamplerTest; + bool RecordAllocationSlow(size_t k); }; -inline bool Sampler::SampleAllocation(size_t k) { - if (bytes_until_sample_ < k) { - bytes_until_sample_ = PickNextSamplingPoint(); - return true; +inline bool Sampler::RecordAllocation(size_t k) { + // The first time we enter this function we expect bytes_until_sample_ + // to be zero, and we must call SampleAllocationSlow() to ensure + // proper initialization of static vars. + ASSERT(Static::IsInited() || bytes_until_sample_ == 0); + + // Note that we have to deal with arbitrarily large values of k + // here. Thus we're upcasting bytes_until_sample_ to unsigned rather + // than the other way around. And this is why this code cannot be + // merged with DecrementFast code below. + if (static_cast<size_t>(bytes_until_sample_) < k) { + bool result = RecordAllocationSlow(k); + ASSERT(Static::IsInited()); + return result; } else { bytes_until_sample_ -= k; + ASSERT(Static::IsInited()); + return true; + } +} + +inline bool Sampler::TryRecordAllocationFast(size_t k) { + // For efficiency reason, we're testing bytes_until_sample_ after + // decrementing it by k. This allows compiler to do sub <reg>, <mem> + // followed by conditional jump on sign. But it is correct only if k + // is actually smaller than largest ssize_t value. Otherwise + // converting k to signed value overflows. 
+ // + // It would be great for generated code to be sub <reg>, <mem> + // followed by conditional jump on 'carry', which would work for + // arbitrary values of k, but there seems to be no way to express + // that in C++. + // + // Our API contract explicitly states that only small values of k + // are permitted. And thus it makes sense to assert on that. + ASSERT(static_cast<ssize_t>(k) >= 0); + + bytes_until_sample_ -= static_cast<ssize_t>(k); + if (PREDICT_FALSE(bytes_until_sample_ < 0)) { + // Note, we undo sampling counter update, since we're not actually + // handling slow path in the "needs sampling" case (calling + // RecordAllocationSlow to reset counter). And we do that in order + // to avoid non-tail calls in malloc fast-path. See also comments + // on declaration inside Sampler class. + // + // volatile is used here to improve compiler's choice of + // instructions. We know that this path is very rare and that there + // is no need to keep previous value of bytes_until_sample_ in + // register. This helps compiler generate slightly more efficient + // sub <reg>, <mem> instruction for subtraction above. + volatile ssize_t *ptr = + const_cast<volatile ssize_t *>(&bytes_until_sample_); + *ptr += k; return false; } + return true; } // Inline functions which are public for testing purposes @@ -154,27 +217,14 @@ inline bool Sampler::SampleAllocation(size_t k) { // pRNG is: aX+b mod c with a = 0x5DEECE66D, b = 0xB, c = 1<<48 // This is the lrand64 generator. 
inline uint64_t Sampler::NextRandom(uint64_t rnd) { - const uint64_t prng_mult = 0x5DEECE66DLL; + const uint64_t prng_mult = 0x5DEECE66DULL; const uint64_t prng_add = 0xB; const uint64_t prng_mod_power = 48; const uint64_t prng_mod_mask = - ~((~static_cast<uint64_t>(0)) << prng_mod_power); + ~((~static_cast<uint64_t>(0)) << prng_mod_power); return (prng_mult * rnd + prng_add) & prng_mod_mask; } -// Adapted from //util/math/fastmath.[h|cc] by Noam Shazeer -// This mimics the VeryFastLog2 code in those files -inline double Sampler::FastLog2(const double & d) { - ASSERT(d>0); - COMPILE_ASSERT(sizeof(d) == sizeof(uint64_t), DoubleMustBe64Bits); - uint64_t x; - memcpy(&x, &d, sizeof(x)); // we depend on the compiler inlining this - const uint32_t x_high = x >> 32; - const uint32_t y = x_high >> (20 - kFastlogNumBits) & kFastlogMask; - const int32_t exponent = ((x_high >> 20) & 0x7FF) - 1023; - return exponent + log_table_[y]; -} - } // namespace tcmalloc #endif // TCMALLOC_SAMPLER_H_ diff --git a/src/static_vars.cc b/src/static_vars.cc index 6cbd20e..be8d1d2 100644 --- a/src/static_vars.cc +++ b/src/static_vars.cc @@ -102,7 +102,6 @@ void Static::InitStaticVars() { inited_ = true; DLL_Init(&sampled_objects_); - Sampler::InitStatics(); } diff --git a/src/tests/sampler_test.cc b/src/tests/sampler_test.cc index df94ee0..e0d24d4 100755 --- a/src/tests/sampler_test.cc +++ b/src/tests/sampler_test.cc @@ -48,7 +48,7 @@ #include <algorithm> #include <vector> #include <string> -#include <cmath> +#include <math.h> #include "base/logging.h" #include "base/commandlineflags.h" #include "sampler.h" // The Sampler class being tested @@ -325,28 +325,6 @@ void TestLRand64Spread() { } -// Test for Fastlog2 code -// We care about the percentage error because we're using this -// for choosing step sizes, so "close" is relative to the size of -// the step we would get if we used the built-in log function -TEST(Sampler, FastLog2) { - tcmalloc::Sampler sampler; - sampler.Init(1); - double 
max_ratio_error = 0; - for (double d = -1021.9; d < 1; d+= 0.13124235) { - double e = pow(2.0, d); - double truelog = log(e) / log(2.0); // log_2(e) - double fastlog = sampler.FastLog2(e); - max_ratio_error = max(max_ratio_error, - max(truelog/fastlog-1, fastlog/truelog-1)); - CHECK_LE(max_ratio_error, 0.01); - // << StringPrintf("d = %f, e=%f, truelog = %f, fastlog= %f\n", - // d, e, truelog, fastlog); - } - LOG(INFO) << StringPrintf("Fastlog2: max_ratio_error = %f\n", - max_ratio_error); -} - // Futher tests bool CheckMean(size_t mean, int num_samples) { @@ -392,11 +370,11 @@ TEST(Sampler, LargeAndSmallAllocs_CombinedTest) { int num_iters = 128*4*8; // Allocate in mixed chunks for (int i = 0; i < num_iters; i++) { - if (sampler.SampleAllocation(size_big)) { + if (!sampler.RecordAllocation(size_big)) { counter_big += 1; } for (int i = 0; i < 129; i++) { - if (sampler.SampleAllocation(size_small)) { + if (!sampler.RecordAllocation(size_small)) { counter_small += 1; } } @@ -540,12 +518,9 @@ TEST(Sampler, bytes_until_sample_Overflow_Underflow) { uint64_t largest_prng_value = (static_cast<uint64_t>(1)<<48) - 1; double q = (largest_prng_value >> (prng_mod_power - 26)) + 1.0; LOG(INFO) << StringPrintf("q = %f\n", q); - LOG(INFO) << StringPrintf("FastLog2(q) = %f\n", sampler.FastLog2(q)); LOG(INFO) << StringPrintf("log2(q) = %f\n", log(q)/log(2.0)); - // Replace min(sampler.FastLog2(q) - 26, 0.0) with - // (sampler.FastLog2(q) - 26.000705) when using that optimization uint64_t smallest_sample_step - = static_cast<uint64_t>(min(sampler.FastLog2(q) - 26, 0.0) + = static_cast<uint64_t>(min(log2(q) - 26, 0.0) * sample_scaling + 1); LOG(INFO) << "Smallest sample step is " << smallest_sample_step; uint64_t cutoff = static_cast<uint64_t>(10) @@ -558,10 +533,8 @@ TEST(Sampler, bytes_until_sample_Overflow_Underflow) { uint64_t smallest_prng_value = 0; q = (smallest_prng_value >> (prng_mod_power - 26)) + 1.0; LOG(INFO) << StringPrintf("q = %f\n", q); - // Replace 
min(sampler.FastLog2(q) - 26, 0.0) with - // (sampler.FastLog2(q) - 26.000705) when using that optimization uint64_t largest_sample_step - = static_cast<uint64_t>(min(sampler.FastLog2(q) - 26, 0.0) + = static_cast<uint64_t>(min(log2(q) - 26, 0.0) * sample_scaling + 1); LOG(INFO) << "Largest sample step is " << largest_sample_step; CHECK_LE(largest_sample_step, one<<63); diff --git a/src/thread_cache.h b/src/thread_cache.h index ad5ce3e..eda3034 100644 --- a/src/thread_cache.h +++ b/src/thread_cache.h @@ -347,7 +347,7 @@ inline int ThreadCache::HeapsInUse() { inline bool ThreadCache::SampleAllocation(size_t k) { #ifndef NO_TCMALLOC_SAMPLES - return PREDICT_FALSE(FLAGS_tcmalloc_sample_parameter > 0) && sampler_.SampleAllocation(k); + return !sampler_.RecordAllocation(k); #else return false; #endif |