author    Aliaksey Kandratsenka <alkondratenko@gmail.com>  2016-12-18 18:36:57 -0800
committer Aliaksey Kandratsenka <alkondratenko@gmail.com>  2017-05-14 19:04:55 -0700
commit    7d588da7ec4f315ea2d02824d7e8813b0f95171d (patch)
tree      6c4fcb36ad7495611383881f766718767928ed44
parent    27da4ade70d45312bfdf334aa8cf0d63bf78df14 (diff)
download  gperftools-7d588da7ec4f315ea2d02824d7e8813b0f95171d.tar.gz
synchronized Sampler implementation with Google-internal version
This mostly amounts to dropping FastLog2, which was never necessary for performance, and making the sampler be called always, even if sampling is disabled (this mainly benefits the always-sampling case of the Google fork). We also gain TryRecordAllocationFast, which is not used yet but will be used as part of a subsequent fast-path speedup commit.
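For a sense of how TryRecordAllocationFast is meant to be used once that follow-up commit lands, here is a minimal caller sketch. It is not part of this commit, and DoSampledAllocation and do_malloc_fast are hypothetical stand-ins for tcmalloc's real helpers:

// Hypothetical fast-path caller. Per the contract in sampler.h, a
// false return from TryRecordAllocationFast does NOT yet mean
// "sample": we must escalate to RecordAllocation, which re-runs the
// check with full slow-path handling (lazy Init, counter reset,
// runtime flag changes).
void* AllocateWithSampling(tcmalloc::Sampler* sampler, size_t size) {
  if (PREDICT_FALSE(!sampler->TryRecordAllocationFast(size))) {
    if (!sampler->RecordAllocation(size)) {
      return DoSampledAllocation(size);  // hypothetical: sampled path
    }
  }
  return do_malloc_fast(size);  // hypothetical: ordinary fast path
}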
-rw-r--r-- [-rwxr-xr-x]  src/sampler.cc             71
-rw-r--r-- [-rwxr-xr-x]  src/sampler.h             138
-rw-r--r--               src/static_vars.cc          1
-rwxr-xr-x               src/tests/sampler_test.cc  37
-rw-r--r--               src/thread_cache.h          2
5 files changed, 137 insertions, 112 deletions
diff --git a/src/sampler.cc b/src/sampler.cc
index cc71112..f7f2fcc 100755..100644
--- a/src/sampler.cc
+++ b/src/sampler.cc
@@ -37,6 +37,7 @@
#include <algorithm> // For min()
#include <math.h>
+#include <limits>
#include "base/commandlineflags.h"
using std::min;
@@ -57,34 +58,16 @@ DEFINE_int64(tcmalloc_sample_parameter,
namespace tcmalloc {
-// Statics for Sampler
-double Sampler::log_table_[1<<kFastlogNumBits];
-
-// Populate the lookup table for FastLog2.
-// This approximates the log2 curve with a step function.
-// Steps have height equal to log2 of the mid-point of the step.
-void Sampler::PopulateFastLog2Table() {
- for (int i = 0; i < (1<<kFastlogNumBits); i++) {
- log_table_[i] = (log(1.0 + static_cast<double>(i+0.5)/(1<<kFastlogNumBits))
- / log(2.0));
- }
-}
-
int Sampler::GetSamplePeriod() {
return FLAGS_tcmalloc_sample_parameter;
}
// Run this before using your sampler
-void Sampler::Init(uint32_t seed) {
+void Sampler::Init(uint64_t seed) {
+ ASSERT(seed != 0);
+
// Initialize PRNG
- if (seed != 0) {
- rnd_ = seed;
- } else {
- rnd_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(this));
- if (rnd_ == 0) {
- rnd_ = 1;
- }
- }
+ rnd_ = seed;
// Step it forward 20 times for good measure
for (int i = 0; i < 20; i++) {
rnd_ = NextRandom(rnd_);
@@ -93,10 +76,7 @@ void Sampler::Init(uint32_t seed) {
bytes_until_sample_ = PickNextSamplingPoint();
}
-// Initialize the Statics for the Sampler class
-void Sampler::InitStatics() {
- PopulateFastLog2Table();
-}
+#define MAX_SSIZE (SIZE_MAX >> 1)
// Generates a geometric variable with the specified mean (512K by default).
// This is done by generating a random number between 0 and 1 and applying
@@ -109,7 +89,17 @@ void Sampler::InitStatics() {
// -log_e(q)/m = x
// log_2(q) * (-log_e(2) * 1/m) = x
// In the code, q is actually in the range 1 to 2**26, hence the -26 below
-size_t Sampler::PickNextSamplingPoint() {
+ssize_t Sampler::PickNextSamplingPoint() {
+ if (FLAGS_tcmalloc_sample_parameter <= 0) {
+ // In this case, we don't want to sample ever, and the larger a
+ // value we put here, the longer until we hit the slow path
+ // again. However, we have to support the flag changing at
+ // runtime, so pick something reasonably large (to keep overhead
+ // low) but small enough that we'll eventually start to sample
+ // again.
+ return 16 << 20;
+ }
+
rnd_ = NextRandom(rnd_);
// Take the top 26 bits as the random number
// (This plus the 1<<58 sampling bound give a max possible step of
@@ -119,13 +109,26 @@ size_t Sampler::PickNextSamplingPoint() {
// under piii debug for some binaries.
double q = static_cast<uint32_t>(rnd_ >> (prng_mod_power - 26)) + 1.0;
// Put the computed p-value through the CDF of a geometric.
- // For faster performance (save ~1/20th exec time), replace
- // min(0.0, FastLog2(q) - 26) by (Fastlog2(q) - 26.000705)
- // The value 26.000705 is used rather than 26 to compensate
- // for inaccuracies in FastLog2 which otherwise result in a
- // negative answer.
- return static_cast<size_t>(min(0.0, (FastLog2(q) - 26)) * (-log(2.0)
- * FLAGS_tcmalloc_sample_parameter) + 1);
+ double interval =
+ (log2(q) - 26) * (-log(2.0) * FLAGS_tcmalloc_sample_parameter);
+
+ // Very large values of interval overflow ssize_t. If we happen to
+ // hit such an improbable condition, we simply cheat and clamp the
+ // interval to the largest supported value.
+ return static_cast<ssize_t>(std::min<double>(interval, MAX_SSIZE));
+}
+
+bool Sampler::RecordAllocationSlow(size_t k) {
+ if (!initialized_) {
+ initialized_ = true;
+ Init(reinterpret_cast<uintptr_t>(this));
+ if (static_cast<size_t>(bytes_until_sample_) >= k) {
+ bytes_until_sample_ -= k;
+ return true;
+ }
+ }
+ bytes_until_sample_ = PickNextSamplingPoint();
+ return FLAGS_tcmalloc_sample_parameter <= 0;
}
} // namespace tcmalloc
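For reference, the sampling-interval formula in PickNextSamplingPoint above is just the inverse-CDF step from the comment, written out. With mean $m$ = FLAGS_tcmalloc_sample_parameter and $q$ drawn uniformly from $(1, 2^{26}]$, let $U = q/2^{26} \in (0, 1]$ and invert $F(x) = 1 - e^{-x/m}$:

$$x = -m \ln U = -m(\ln q - 26 \ln 2) = (\log_2 q - 26)\,(-\ln 2)\,m,$$

which matches (log2(q) - 26) * (-log(2.0) * FLAGS_tcmalloc_sample_parameter) in the code.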
diff --git a/src/sampler.h b/src/sampler.h
index eb316d7..16d3b09 100755..100644
--- a/src/sampler.h
+++ b/src/sampler.h
@@ -44,6 +44,7 @@
#include <string.h> // for memcpy
#include "base/basictypes.h" // for ASSERT
#include "internal_logging.h" // for ASSERT
+#include "static_vars.h"
namespace tcmalloc {
@@ -80,7 +81,7 @@ namespace tcmalloc {
// a geometric with mean tcmalloc_sample_parameter. (ie. the
// probability that at least one byte in the range is marked). This
// is accurately given by the CDF of the corresponding exponential
-// distribution : 1 - e^(X/tcmalloc_sample_parameter_)
+// distribution : 1 - e^(-X/tcmalloc_sample_parameter_)
// Independence of the byte marking ensures independence of
// the sampling of each allocation.
//
@@ -100,52 +101,114 @@ namespace tcmalloc {
// only result in a single call to PickNextSamplingPoint.
//-------------------------------------------------------------------
+class SamplerTest;
+
class PERFTOOLS_DLL_DECL Sampler {
public:
// Initialize this sampler.
- // Passing a seed of 0 gives a non-deterministic
- // seed value given by casting the object ("this")
- void Init(uint32_t seed);
- void Cleanup();
-
- // Record allocation of "k" bytes. Return true iff allocation
- // should be sampled
- bool SampleAllocation(size_t k);
+ void Init(uint64_t seed);
+
+ // Record allocation of "k" bytes. Return true if no further work
+ // is needed, and false if the allocation needs to be sampled.
+ bool RecordAllocation(size_t k);
+
+ // Same as above (but faster), except:
+ // a) REQUIRES(k < std::numeric_limits<ssize_t>::max())
+ // b) if this returns false, you must call RecordAllocation
+ // to confirm whether sampling is truly needed.
+ //
+ // The point of this function is to handle only the common case of
+ // no sampling, and to let the caller (which is on the malloc fast
+ // path) "escalate" to the fuller and slower logic only if necessary.
+ bool TryRecordAllocationFast(size_t k);
// Generate a geometric with mean 512K (or FLAG_tcmalloc_sample_parameter)
- size_t PickNextSamplingPoint();
-
- // Initialize the statics for the Sampler class
- static void InitStatics();
+ ssize_t PickNextSamplingPoint();
// Returns the current sample period
- int GetSamplePeriod();
+ static int GetSamplePeriod();
// The following are public for the purposes of testing
static uint64_t NextRandom(uint64_t rnd_); // Returns the next prng value
- static double FastLog2(const double & d); // Computes Log2(x) quickly
- static void PopulateFastLog2Table(); // Populate the lookup table
+
+ // C++03 requires that types stored in TLS be POD. As a result, you must
+ // initialize these members to {0, 0, false} before using this class!
+ //
+ // TODO(ahh): C++11 support will let us make these private.
+
+ // Bytes until we sample next.
+ //
+ // More specifically when bytes_until_sample_ is X, we can allocate
+ // X bytes without triggering sampling; on the (X+1)th allocated
+ // byte, the containing allocation will be sampled.
+ //
+ // Always non-negative with only very brief exceptions (see
+ // DecrementFast{,Finish}), so casting to size_t is ok.
+ ssize_t bytes_until_sample_;
+ uint64_t rnd_; // Cheap random number generator
+ bool initialized_;
private:
- size_t bytes_until_sample_; // Bytes until we sample next
- uint64_t rnd_; // Cheap random number generator
-
- // Statics for the fast log
- // Note that this code may not depend on anything in //util
- // hence the duplication of functionality here
- static const int kFastlogNumBits = 10;
- static const int kFastlogMask = (1 << kFastlogNumBits) - 1;
- static double log_table_[1<<kFastlogNumBits]; // Constant
+ friend class SamplerTest;
+ bool RecordAllocationSlow(size_t k);
};
-inline bool Sampler::SampleAllocation(size_t k) {
- if (bytes_until_sample_ < k) {
- bytes_until_sample_ = PickNextSamplingPoint();
- return true;
+inline bool Sampler::RecordAllocation(size_t k) {
+ // The first time we enter this function we expect bytes_until_sample_
+ // to be zero, and we must call RecordAllocationSlow() to ensure
+ // proper initialization of static vars.
+ ASSERT(Static::IsInited() || bytes_until_sample_ == 0);
+
+ // Note that we have to deal with arbitrarily large values of k
+ // here. Thus we're upcasting bytes_until_sample_ to unsigned rather
+ // than the other way around. And this is why this code cannot be
+ // merged with the DecrementFast code below.
+ if (static_cast<size_t>(bytes_until_sample_) < k) {
+ bool result = RecordAllocationSlow(k);
+ ASSERT(Static::IsInited());
+ return result;
} else {
bytes_until_sample_ -= k;
+ ASSERT(Static::IsInited());
+ return true;
+ }
+}
+
+inline bool Sampler::TryRecordAllocationFast(size_t k) {
+ // For efficiency reasons, we test bytes_until_sample_ after
+ // decrementing it by k. This allows the compiler to emit sub
+ // <reg>, <mem> followed by a conditional jump on sign. But this is
+ // correct only if k is actually smaller than the largest ssize_t
+ // value; otherwise converting k to a signed value overflows.
+ //
+ // It would be great for the generated code to be sub <reg>, <mem>
+ // followed by a conditional jump on 'carry', which would work for
+ // arbitrary values of k, but there seems to be no way to express
+ // that in C++.
+ //
+ // Our API contract explicitly states that only small values of k
+ // are permitted, so it makes sense to assert on that.
+ ASSERT(static_cast<ssize_t>(k) >= 0);
+
+ bytes_until_sample_ -= static_cast<ssize_t>(k);
+ if (PREDICT_FALSE(bytes_until_sample_ < 0)) {
+ // Note, we undo the sampling counter update, since we're not
+ // actually handling the slow path in the "needs sampling" case
+ // (i.e. calling RecordAllocationSlow to reset the counter). We do
+ // that to avoid non-tail calls in the malloc fast path. See also
+ // the comments on the declaration inside the Sampler class.
+ //
+ // volatile is used here to improve the compiler's choice of
+ // instructions. We know that this path is very rare and that there
+ // is no need to keep the previous value of bytes_until_sample_ in
+ // a register. This helps the compiler generate a slightly more
+ // efficient sub <reg>, <mem> instruction for the subtraction above.
+ volatile ssize_t *ptr =
+ const_cast<volatile ssize_t *>(&bytes_until_sample_);
+ *ptr += k;
return false;
}
+ return true;
}
// Inline functions which are public for testing purposes
@@ -154,27 +217,14 @@ inline bool Sampler::SampleAllocation(size_t k) {
// pRNG is: aX+b mod c with a = 0x5DEECE66D, b = 0xB, c = 1<<48
// This is the lrand48 generator.
inline uint64_t Sampler::NextRandom(uint64_t rnd) {
- const uint64_t prng_mult = 0x5DEECE66DLL;
+ const uint64_t prng_mult = 0x5DEECE66DULL;
const uint64_t prng_add = 0xB;
const uint64_t prng_mod_power = 48;
const uint64_t prng_mod_mask =
- ~((~static_cast<uint64_t>(0)) << prng_mod_power);
+ ~((~static_cast<uint64_t>(0)) << prng_mod_power);
return (prng_mult * rnd + prng_add) & prng_mod_mask;
}
-// Adapted from //util/math/fastmath.[h|cc] by Noam Shazeer
-// This mimics the VeryFastLog2 code in those files
-inline double Sampler::FastLog2(const double & d) {
- ASSERT(d>0);
- COMPILE_ASSERT(sizeof(d) == sizeof(uint64_t), DoubleMustBe64Bits);
- uint64_t x;
- memcpy(&x, &d, sizeof(x)); // we depend on the compiler inlining this
- const uint32_t x_high = x >> 32;
- const uint32_t y = x_high >> (20 - kFastlogNumBits) & kFastlogMask;
- const int32_t exponent = ((x_high >> 20) & 0x7FF) - 1023;
- return exponent + log_table_[y];
-}
-
} // namespace tcmalloc
#endif // TCMALLOC_SAMPLER_H_
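Because C++03 TLS forces the members to stay public POD (see the class comment above), a thread-local instance has to be zero-initialized by hand before first use. A minimal sketch, assuming a toolchain with the __thread extension:

// Sketch: a zero-initialized, POD thread-local Sampler, as the class
// comment requires ({0, 0, false} fills bytes_until_sample_, rnd_
// and initialized_ in declaration order).
static __thread tcmalloc::Sampler thread_sampler = {0, 0, false};

bool ShouldSample(size_t size) {
  // The first call sees bytes_until_sample_ == 0, falls into
  // RecordAllocationSlow, and lazily runs Init() with a seed derived
  // from the Sampler's own address.
  return !thread_sampler.RecordAllocation(size);
}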
diff --git a/src/static_vars.cc b/src/static_vars.cc
index 6cbd20e..be8d1d2 100644
--- a/src/static_vars.cc
+++ b/src/static_vars.cc
@@ -102,7 +102,6 @@ void Static::InitStaticVars() {
inited_ = true;
DLL_Init(&sampled_objects_);
- Sampler::InitStatics();
}
diff --git a/src/tests/sampler_test.cc b/src/tests/sampler_test.cc
index df94ee0..e0d24d4 100755
--- a/src/tests/sampler_test.cc
+++ b/src/tests/sampler_test.cc
@@ -48,7 +48,7 @@
#include <algorithm>
#include <vector>
#include <string>
-#include <cmath>
+#include <math.h>
#include "base/logging.h"
#include "base/commandlineflags.h"
#include "sampler.h" // The Sampler class being tested
@@ -325,28 +325,6 @@ void TestLRand64Spread() {
}
-// Test for Fastlog2 code
-// We care about the percentage error because we're using this
-// for choosing step sizes, so "close" is relative to the size of
-// the step we would get if we used the built-in log function
-TEST(Sampler, FastLog2) {
- tcmalloc::Sampler sampler;
- sampler.Init(1);
- double max_ratio_error = 0;
- for (double d = -1021.9; d < 1; d+= 0.13124235) {
- double e = pow(2.0, d);
- double truelog = log(e) / log(2.0); // log_2(e)
- double fastlog = sampler.FastLog2(e);
- max_ratio_error = max(max_ratio_error,
- max(truelog/fastlog-1, fastlog/truelog-1));
- CHECK_LE(max_ratio_error, 0.01);
- // << StringPrintf("d = %f, e=%f, truelog = %f, fastlog= %f\n",
- // d, e, truelog, fastlog);
- }
- LOG(INFO) << StringPrintf("Fastlog2: max_ratio_error = %f\n",
- max_ratio_error);
-}
-
// Further tests
bool CheckMean(size_t mean, int num_samples) {
@@ -392,11 +370,11 @@ TEST(Sampler, LargeAndSmallAllocs_CombinedTest) {
int num_iters = 128*4*8;
// Allocate in mixed chunks
for (int i = 0; i < num_iters; i++) {
- if (sampler.SampleAllocation(size_big)) {
+ if (!sampler.RecordAllocation(size_big)) {
counter_big += 1;
}
for (int i = 0; i < 129; i++) {
- if (sampler.SampleAllocation(size_small)) {
+ if (!sampler.RecordAllocation(size_small)) {
counter_small += 1;
}
}
@@ -540,12 +518,9 @@ TEST(Sampler, bytes_until_sample_Overflow_Underflow) {
uint64_t largest_prng_value = (static_cast<uint64_t>(1)<<48) - 1;
double q = (largest_prng_value >> (prng_mod_power - 26)) + 1.0;
LOG(INFO) << StringPrintf("q = %f\n", q);
- LOG(INFO) << StringPrintf("FastLog2(q) = %f\n", sampler.FastLog2(q));
LOG(INFO) << StringPrintf("log2(q) = %f\n", log(q)/log(2.0));
- // Replace min(sampler.FastLog2(q) - 26, 0.0) with
- // (sampler.FastLog2(q) - 26.000705) when using that optimization
uint64_t smallest_sample_step
- = static_cast<uint64_t>(min(sampler.FastLog2(q) - 26, 0.0)
+ = static_cast<uint64_t>(min(log2(q) - 26, 0.0)
* sample_scaling + 1);
LOG(INFO) << "Smallest sample step is " << smallest_sample_step;
uint64_t cutoff = static_cast<uint64_t>(10)
@@ -558,10 +533,8 @@ TEST(Sampler, bytes_until_sample_Overflow_Underflow) {
uint64_t smallest_prng_value = 0;
q = (smallest_prng_value >> (prng_mod_power - 26)) + 1.0;
LOG(INFO) << StringPrintf("q = %f\n", q);
- // Replace min(sampler.FastLog2(q) - 26, 0.0) with
- // (sampler.FastLog2(q) - 26.000705) when using that optimization
uint64_t largest_sample_step
- = static_cast<uint64_t>(min(sampler.FastLog2(q) - 26, 0.0)
+ = static_cast<uint64_t>(min(log2(q) - 26, 0.0)
* sample_scaling + 1);
LOG(INFO) << "Largest sample step is " << largest_sample_step;
CHECK_LE(largest_sample_step, one<<63);
diff --git a/src/thread_cache.h b/src/thread_cache.h
index ad5ce3e..eda3034 100644
--- a/src/thread_cache.h
+++ b/src/thread_cache.h
@@ -347,7 +347,7 @@ inline int ThreadCache::HeapsInUse() {
inline bool ThreadCache::SampleAllocation(size_t k) {
#ifndef NO_TCMALLOC_SAMPLES
- return PREDICT_FALSE(FLAGS_tcmalloc_sample_parameter > 0) && sampler_.SampleAllocation(k);
+ return !sampler_.RecordAllocation(k);
#else
return false;
#endif
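One behavioral note on the disabled case: with FLAGS_tcmalloc_sample_parameter <= 0, PickNextSamplingPoint returns 16MB and RecordAllocationSlow returns true, so RecordAllocation periodically takes the slow path but never reports a sample. A hypothetical test sketch (not in this commit) of that invariant:

TEST(Sampler, DisabledSamplingNeverSamples) {
  // Assumes the flag is reachable from the test, as in sampler_test.cc.
  FLAGS_tcmalloc_sample_parameter = 0;
  // Zero-initialized static storage stands in for the {0, 0, false}
  // TLS initialization the class comment requires.
  static tcmalloc::Sampler sampler;
  for (int i = 0; i < 100; i++) {
    // true means "no further work", i.e. this allocation is not sampled.
    CHECK(sampler.RecordAllocation(1 << 20));
  }
}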