author     chappedm@gmail.com <chappedm@gmail.com@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>   2012-12-22 18:34:43 +0000
committer  chappedm@gmail.com <chappedm@gmail.com@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>   2012-12-22 18:34:43 +0000
commit     6856d1d1b2a807efd84dcafa10b865b6de22bc28 (patch)
tree       393ef824dbf875b697c04521de639957a8967bfa
parent     ad5aa05838121d52ad1fde5463a796c3320fe067 (diff)
download   gperftools-6856d1d1b2a807efd84dcafa10b865b6de22bc28.tar.gz
issue-488: Performance improvement for initialization checks
These came in from the Google-internal version of tcmalloc. They saw some decent speed improvements, both on microbenchmarks and in big programs. The idea is to improve the speed of the "Is everything initialized?" checks that run at the start of all allocations and deallocations.

git-svn-id: http://gperftools.googlecode.com/svn/trunk@184 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
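For context, a minimal standalone sketch of the pattern this patch introduces, separate from the actual diff below (kMaxSize, CacheAlloc, CacheFree, and AllocSlow are illustrative stand-ins, not the tcmalloc identifiers): a thread-local min_size_for_slow_path that is 0 until the thread cache exists and kMaxSize + 1 afterwards, so a single comparison covers both the "is everything initialized?" and the "is this a small allocation?" questions.

#include <cstddef>
#include <cstdlib>

namespace fastpath_sketch {

// Largest request served from the per-thread size-class freelists.
static const size_t kMaxSize = 256 * 1024;

struct ThreadLocalData {
  void*  heap;                    // per-thread cache; NULL until first use
  size_t min_size_for_slow_path;  // 0 while heap is NULL, kMaxSize + 1 after
};

static __thread ThreadLocalData tls = {NULL, 0};

// Stand-ins for the real freelist pop/push and the page-level allocator.
static void* CacheAlloc(void* /*heap*/, size_t size) { return malloc(size); }
static void  CacheFree(void* /*heap*/, void* ptr)    { free(ptr); }

static void* AllocSlow(size_t size) {
  if (tls.heap == NULL) {
    // First allocation on this thread: set up the cache and arm the
    // fast path by making min_size_for_slow_path non-zero.
    tls.heap = &tls;                       // placeholder "cache" object
    tls.min_size_for_slow_path = kMaxSize + 1;
  }
  if (size <= kMaxSize) return CacheAlloc(tls.heap, size);
  return malloc(size);                     // large objects: page allocator
}

static void* Alloc(size_t size) {
  // One branch answers two questions at once:
  //   - is this thread's cache initialized? (min_size_for_slow_path != 0)
  //   - is the request "small"?             (size <= kMaxSize)
  if (size < tls.min_size_for_slow_path) {
    return CacheAlloc(tls.heap, size);
  }
  return AllocSlow(size);
}

static void Free(void* ptr) {
  // Same trick on the free path: a non-zero min_size_for_slow_path means
  // the thread cache must be present, so no NULL check is needed here.
  if (tls.min_size_for_slow_path != 0) {
    CacheFree(tls.heap, ptr);
  } else {
    free(ptr);                             // fall back before initialization
  }
}

}  // namespace fastpath_sketch

In the actual patch this state lives in ThreadCache::ThreadLocalData together with MinSizeForSlowPath(), SetMinSizeForSlowPath(), and IsFastPathAllowed(); do_malloc_no_errno() uses it on the allocation side and do_free_with_callback() on the deallocation side, as shown in the diff below.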
-rw-r--r--  src/static_vars.h   |   3
-rw-r--r--  src/tcmalloc.cc     | 137
-rw-r--r--  src/thread_cache.cc |  17
-rw-r--r--  src/thread_cache.h  |  60
4 files changed, 160 insertions(+), 57 deletions(-)
diff --git a/src/static_vars.h b/src/static_vars.h
index 185a1d4..abcb314 100644
--- a/src/static_vars.h
+++ b/src/static_vars.h
@@ -82,6 +82,9 @@ class Static {
return &bucket_allocator_;
}
+ // Check if InitStaticVars() has been run.
+ static bool IsInited() { return pageheap() != NULL; }
+
private:
static SpinLock pageheap_lock_;
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 9823230..eea76b7 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -1005,6 +1005,7 @@ static void ReportLargeAlloc(Length num_pages, void* result) {
inline void* cpp_alloc(size_t size, bool nothrow);
inline void* do_malloc(size_t size);
+inline void* do_malloc_no_errno(size_t size);
// TODO(willchan): Investigate whether or not lining this much is harmful to
// performance.
@@ -1014,6 +1015,10 @@ inline void* do_malloc_or_cpp_alloc(size_t size) {
return tc_new_mode ? cpp_alloc(size, true) : do_malloc(size);
}
+inline void* do_malloc_no_errno_or_cpp_alloc(size_t size) {
+ return tc_new_mode ? cpp_alloc(size, true) : do_malloc_no_errno(size);
+}
+
void* cpp_memalign(size_t align, size_t size);
void* do_memalign(size_t align, size_t size);
@@ -1060,26 +1065,35 @@ inline void* do_malloc_pages(ThreadCache* heap, size_t size) {
return result;
}
-inline void* do_malloc(size_t size) {
- void* ret = NULL;
+inline void* do_malloc_small(ThreadCache* heap, size_t size) {
+ ASSERT(Static::IsInited());
+ ASSERT(heap != NULL);
+ size_t cl = Static::sizemap()->SizeClass(size);
+ size = Static::sizemap()->class_to_size(cl);
- // The following call forces module initialization
- ThreadCache* heap = ThreadCache::GetCache();
- if (size <= kMaxSize) {
- size_t cl = Static::sizemap()->SizeClass(size);
- size = Static::sizemap()->class_to_size(cl);
+ if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
+ return DoSampledAllocation(size);
+ } else {
+ // The common case, and also the simplest. This just pops the
+ // size-appropriate freelist, after replenishing it if it's empty.
+ return CheckedMallocResult(heap->Allocate(size, cl));
+ }
+}
- if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
- ret = DoSampledAllocation(size);
- } else {
- // The common case, and also the simplest. This just pops the
- // size-appropriate freelist, after replenishing it if it's empty.
- ret = CheckedMallocResult(heap->Allocate(size, cl));
- }
+inline void* do_malloc_no_errno(size_t size) {
+ if (ThreadCache::have_tls &&
+ LIKELY(size < ThreadCache::MinSizeForSlowPath())) {
+ return do_malloc_small(ThreadCache::GetCacheWhichMustBePresent(), size);
+ } else if (size <= kMaxSize) {
+ return do_malloc_small(ThreadCache::GetCache(), size);
} else {
- ret = do_malloc_pages(heap, size);
+ return do_malloc_pages(ThreadCache::GetCache(), size);
}
- if (ret == NULL) errno = ENOMEM;
+}
+
+inline void* do_malloc(size_t size) {
+ void* ret = do_malloc_no_errno(size);
+ if (UNLIKELY(ret == NULL)) errno = ENOMEM;
return ret;
}
@@ -1088,55 +1102,72 @@ inline void* do_calloc(size_t n, size_t elem_size) {
const size_t size = n * elem_size;
if (elem_size != 0 && size / elem_size != n) return NULL;
- void* result = do_malloc_or_cpp_alloc(size);
- if (result != NULL) {
+ void* result = do_malloc_no_errno_or_cpp_alloc(size);
+ if (result == NULL) {
+ errno = ENOMEM;
+ } else {
memset(result, 0, size);
}
return result;
}
-static inline ThreadCache* GetCacheIfPresent() {
- void* const p = ThreadCache::GetCacheIfPresent();
- return reinterpret_cast<ThreadCache*>(p);
+// If ptr is NULL, do nothing. Otherwise invoke the given function.
+inline void free_null_or_invalid(void* ptr, void (*invalid_free_fn)(void*)) {
+ if (ptr != NULL) {
+ (*invalid_free_fn)(ptr);
+ }
}
-// This lets you call back to a given function pointer if ptr is invalid.
-// It is used primarily by windows code which wants a specialized callback.
-inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
- if (ptr == NULL) return;
- if (Static::pageheap() == NULL) {
+// Helper for do_free_with_callback(), below. Inputs:
+// ptr is object to be freed
+// invalid_free_fn is a function that gets invoked on certain "bad frees"
+// heap is the ThreadCache for this thread, or NULL if it isn't known
+// heap_must_be_valid is whether heap is known to be non-NULL
+//
+// This function may only be used after Static::IsInited() is true.
+//
+// We can usually detect the case where ptr is not pointing to a page that
+// tcmalloc is using, and in those cases we invoke invalid_free_fn.
+//
+// To maximize speed in the common case, we usually get here with
+// heap_must_be_valid being a manifest constant equal to true.
+inline void do_free_helper(void* ptr,
+ void (*invalid_free_fn)(void*),
+ ThreadCache* heap,
+ bool heap_must_be_valid) {
+ ASSERT((Static::IsInited() && heap != NULL) || !heap_must_be_valid);
+ if (!heap_must_be_valid && !Static::IsInited()) {
// We called free() before malloc(). This can occur if the
// (system) malloc() is called before tcmalloc is loaded, and then
// free() is called after tcmalloc is loaded (and tc_free has
// replaced free), but before the global constructor has run that
// sets up the tcmalloc data structures.
- (*invalid_free_fn)(ptr); // Decide how to handle the bad free request
+ free_null_or_invalid(ptr, invalid_free_fn);
return;
}
- const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
Span* span = NULL;
+ const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
-
- if (cl == 0) {
+ if (UNLIKELY(cl == 0)) {
span = Static::pageheap()->GetDescriptor(p);
- if (!span) {
- // span can be NULL because the pointer passed in is invalid
+ if (UNLIKELY(!span)) {
+ // span can be NULL because the pointer passed in is NULL or invalid
// (not something returned by malloc or friends), or because the
// pointer was allocated with some other allocator besides
// tcmalloc. The latter can happen if tcmalloc is linked in via
// a dynamic library, but is not listed last on the link line.
// In that case, libraries after it on the link line will
// allocate with libc malloc, but free with tcmalloc's free.
- (*invalid_free_fn)(ptr); // Decide how to handle the bad free request
+ free_null_or_invalid(ptr, invalid_free_fn);
return;
}
cl = span->sizeclass;
Static::pageheap()->CacheSizeClass(p, cl);
}
- if (cl != 0) {
+ ASSERT(ptr != NULL);
+ if (LIKELY(cl != 0)) {
ASSERT(!Static::pageheap()->GetDescriptor(p)->sample);
- ThreadCache* heap = GetCacheIfPresent();
- if (heap != NULL) {
+ if (heap_must_be_valid || heap != NULL) {
heap->Deallocate(ptr, cl);
} else {
// Delete directly into central cache
@@ -1157,6 +1188,23 @@ inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
}
}
+// Helper for the object deletion (free, delete, etc.). Inputs:
+// ptr is object to be freed
+// invalid_free_fn is a function that gets invoked on certain "bad frees"
+//
+// We can usually detect the case where ptr is not pointing to a page that
+// tcmalloc is using, and in those cases we invoke invalid_free_fn.
+inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
+ ThreadCache* heap = NULL;
+ if (LIKELY(ThreadCache::IsFastPathAllowed())) {
+ heap = ThreadCache::GetCacheWhichMustBePresent();
+ do_free_helper(ptr, invalid_free_fn, heap, true);
+ } else {
+ heap = ThreadCache::GetCacheIfPresent();
+ do_free_helper(ptr, invalid_free_fn, heap, false);
+ }
+}
+
// The default "do_free" that uses the default callback.
inline void do_free(void* ptr) {
return do_free_with_callback(ptr, &InvalidFree);
@@ -1207,7 +1255,7 @@ inline void* do_realloc_with_callback(
void* new_ptr = NULL;
if (new_size > old_size && new_size < lower_bound_to_grow) {
- new_ptr = do_malloc_or_cpp_alloc(lower_bound_to_grow);
+ new_ptr = do_malloc_no_errno_or_cpp_alloc(lower_bound_to_grow);
}
if (new_ptr == NULL) {
// Either new_size is not a tiny increment, or last do_malloc failed.
@@ -1359,11 +1407,11 @@ inline struct mallinfo do_mallinfo() {
static SpinLock set_new_handler_lock(SpinLock::LINKER_INITIALIZED);
inline void* cpp_alloc(size_t size, bool nothrow) {
- for (;;) {
- void* p = do_malloc(size);
#ifdef PREANSINEW
- return p;
+ return do_malloc(size);
#else
+ for (;;) {
+ void* p = do_malloc_no_errno(size);
if (p == NULL) { // allocation failed
// Get the current new handler. NB: this function is not
// thread-safe. We make a feeble stab at making it so here, but
@@ -1382,11 +1430,11 @@ inline void* cpp_alloc(size_t size, bool nothrow) {
(*nh)();
continue;
}
- return 0;
+ goto fail;
#else
// If no new_handler is established, the allocation failed.
if (!nh) {
- if (nothrow) return 0;
+ if (nothrow) goto fail;
throw std::bad_alloc();
}
// Otherwise, try the new_handler. If it returns, retry the
@@ -1396,7 +1444,7 @@ inline void* cpp_alloc(size_t size, bool nothrow) {
(*nh)();
} catch (const std::bad_alloc&) {
if (!nothrow) throw;
- return p;
+ goto fail;
}
#endif // (defined(__GNUC__) && !defined(__EXCEPTIONS)) || (defined(_HAS_EXCEPTIONS) && !_HAS_EXCEPTIONS)
} else { // allocation success
@@ -1404,6 +1452,9 @@ inline void* cpp_alloc(size_t size, bool nothrow) {
}
#endif // PREANSINEW
}
+fail:
+ errno = ENOMEM;
+ return 0;
}
void* cpp_memalign(size_t align, size_t size) {
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
index d6dead3..1764d14 100644
--- a/src/thread_cache.cc
+++ b/src/thread_cache.cc
@@ -63,11 +63,9 @@ ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
-__thread ThreadCache* ThreadCache::threadlocal_heap_
-# ifdef HAVE___ATTRIBUTE__
- __attribute__ ((tls_model ("initial-exec")))
-# endif
- ;
+__thread ThreadCache::ThreadLocalData ThreadCache::threadlocal_data_
+ ATTR_INITIAL_EXEC
+ = {0, 0};
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;
@@ -379,7 +377,8 @@ ThreadCache* ThreadCache::CreateCacheIfNecessary() {
perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
// Also keep a copy in __thread for faster retrieval
- threadlocal_heap_ = heap;
+ threadlocal_data_.heap = heap;
+ SetMinSizeForSlowPath(kMaxSize + 1);
#endif
heap->in_setspecific_ = false;
}
@@ -414,7 +413,8 @@ void ThreadCache::BecomeIdle() {
perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
// Also update the copy in __thread
- threadlocal_heap_ = NULL;
+ threadlocal_data_.heap = NULL;
+ SetMinSizeForSlowPath(0);
#endif
heap->in_setspecific_ = false;
if (GetThreadHeap() == heap) {
@@ -434,7 +434,8 @@ void ThreadCache::DestroyThreadCache(void* ptr) {
if (ptr == NULL) return;
#ifdef HAVE_TLS
// Prevent fast path of GetThreadHeap() from returning heap.
- threadlocal_heap_ = NULL;
+ threadlocal_data_.heap = NULL;
+ SetMinSizeForSlowPath(0);
#endif
DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}
diff --git a/src/thread_cache.h b/src/thread_cache.h
index 1d0413b..8644a4d 100644
--- a/src/thread_cache.h
+++ b/src/thread_cache.h
@@ -75,6 +75,12 @@ inline bool KernelSupportsTLS() {
class ThreadCache {
public:
+#ifdef HAVE_TLS
+ enum { have_tls = true };
+#else
+ enum { have_tls = false };
+#endif
+
// All ThreadCache objects are kept in a linked list (for stats collection)
ThreadCache* next_;
ThreadCache* prev_;
@@ -106,8 +112,13 @@ class ThreadCache {
static ThreadCache* GetThreadHeap();
static ThreadCache* GetCache();
static ThreadCache* GetCacheIfPresent();
+ static ThreadCache* GetCacheWhichMustBePresent();
static ThreadCache* CreateCacheIfNecessary();
static void BecomeIdle();
+ static size_t MinSizeForSlowPath();
+ static void SetMinSizeForSlowPath(size_t size);
+
+ static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; }
// Return the number of thread heaps in use.
static inline int HeapsInUse();
@@ -251,12 +262,24 @@ class ThreadCache {
// Since we don't really use dlopen in google code -- and using dlopen
// on a malloc replacement is asking for trouble in any case -- that's
// a good tradeoff for us.
+#ifdef HAVE___ATTRIBUTE__
+#define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec")))
+#else
+#define ATTR_INITIAL_EXEC
+#endif
+
#ifdef HAVE_TLS
- static __thread ThreadCache* threadlocal_heap_
-# ifdef HAVE___ATTRIBUTE__
- __attribute__ ((tls_model ("initial-exec")))
-# endif
- ;
+ struct ThreadLocalData {
+ ThreadCache* heap;
+ // min_size_for_slow_path is 0 if heap is NULL or kMaxSize + 1 otherwise.
+ // The latter is the common case and allows allocation to be faster
+ // than it would be otherwise: typically a single branch will
+ // determine that the requested allocation is no more than kMaxSize
+ // and we can then proceed, knowing that global and thread-local tcmalloc
+ // state is initialized.
+ size_t min_size_for_slow_path;
+ };
+ static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC;
#endif
// Thread-specific key. Initialization here is somewhat tricky
@@ -373,12 +396,23 @@ inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
// __thread is faster, but only when the kernel supports it
if (KernelSupportsTLS())
- return threadlocal_heap_;
+ return threadlocal_data_.heap;
#endif
return reinterpret_cast<ThreadCache *>(
perftools_pthread_getspecific(heap_key_));
}
+inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() {
+#ifdef HAVE_TLS
+ ASSERT(threadlocal_data_.heap);
+ return threadlocal_data_.heap;
+#else
+ ASSERT(perftools_pthread_getspecific(heap_key_));
+ return reinterpret_cast<ThreadCache *>(
+ perftools_pthread_getspecific(heap_key_));
+#endif
+}
+
inline ThreadCache* ThreadCache::GetCache() {
ThreadCache* ptr = NULL;
if (!tsd_inited_) {
@@ -398,6 +432,20 @@ inline ThreadCache* ThreadCache::GetCacheIfPresent() {
return GetThreadHeap();
}
+inline size_t ThreadCache::MinSizeForSlowPath() {
+#ifdef HAVE_TLS
+ return threadlocal_data_.min_size_for_slow_path;
+#else
+ return 0;
+#endif
+}
+
+inline void ThreadCache::SetMinSizeForSlowPath(size_t size) {
+#ifdef HAVE_TLS
+ threadlocal_data_.min_size_for_slow_path = size;
+#endif
+}
+
} // namespace tcmalloc
#endif // TCMALLOC_THREAD_CACHE_H_