From 6fa2a2574ce1c15ac12293e24691d69a41972e54 Mon Sep 17 00:00:00 2001
From: csilvers
Date: Sat, 13 Dec 2008 01:35:42 +0000
Subject: Thu Dec 11 16:01:32 2008  Google Inc.

* google-perftools: version 1.0rc1 release
* Replace API for selectively disabling heap-checker in code (sanjay)
* Add a pre-mmap hook (daven, adlr)
* Add MallocExtension interface to set memory-releasing rate (fikes)
* Augment pprof to allow any string ending in /pprof/profile (csilvers)
* PORTING: Rewrite -- and fix -- malloc patching for windows (dvitek)
* PORTING: Add nm-pdb and addr2line-pdb for use by pprof (dvitek)
* PORTING: Improve cygwin and mingw support (jperkins, csilvers)
* PORTING: Fix pprof for mac os x, other pprof improvements (csilvers)
* PORTING: Fix some PPC bugs in our locking code (anton.blanchard)
* A new unittest, sampling_test, to verify tcmalloc-profiles (csilvers)
* Turn off TLS for gcc < 4.1.2, due to a TLS + -fPIC bug (csilvers)
* Prefer __builtin_frame_address to assembly for stacktraces (nlewycky)
* Separate tcmalloc.cc out into multiple files -- finally! (kash)
* Make our locking code work with -fPIC on 32-bit x86 (aruns)
* Fix an initialization-ordering bug for tcmalloc/profiling (csilvers)
* Use "initial exec" model of TLS to speed up tcmalloc (csilvers)
* Enforce 16-byte alignment for tcmalloc, for SSE (sanjay)

git-svn-id: http://gperftools.googlecode.com/svn/trunk@60 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
---
 src/thread_cache.cc | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 src/thread_cache.cc

diff --git a/src/thread_cache.cc b/src/thread_cache.cc
new file mode 100644
index 0000000..ba23a0d
--- /dev/null
+++ b/src/thread_cache.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ken Ashcraft
+
+#include "thread_cache.h"
+#include "maybe_threads.h"
+
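+// Per-thread free lists for small objects.  Each thread refills its
+// lists from the shared central cache and releases idle objects back
+// to it, so most small allocations and frees are handled without
+// touching the central cache.
+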
+// Twice the approximate gap between sampling actions.
+// I.e., we take one sample approximately once every
+//      tcmalloc_sample_parameter/2
+// bytes of allocation, i.e., ~ once every 128KB.
+// Must be a prime number.
+#ifdef NO_TCMALLOC_SAMPLES
+DEFINE_int64(tcmalloc_sample_parameter, 0,
+             "Unused: code is compiled with NO_TCMALLOC_SAMPLES");
+static size_t sample_period = 0;
+#else
+DEFINE_int64(tcmalloc_sample_parameter,
+             EnvToInt("TCMALLOC_SAMPLE_PARAMETER", 262147),
+             "Twice the approximate gap between sampling actions."
+             " Must be a prime number. Otherwise will be rounded up to a "
+             " larger prime number");
+static size_t sample_period = EnvToInt("TCMALLOC_SAMPLE_PARAMETER", 262147);
+#endif
+// Protects sample_period above
+static SpinLock sample_period_lock(SpinLock::LINKER_INITIALIZED);
+
+namespace tcmalloc {
+
+/* The smallest prime > 2^n */
+static unsigned int primes_list[] = {
+    // Small values might cause high rates of sampling
+    // and hence commented out.
+    // 2, 5, 11, 17, 37, 67, 131, 257,
+    // 521, 1031, 2053, 4099, 8209, 16411,
+    32771, 65537, 131101, 262147, 524309, 1048583,
+    2097169, 4194319, 8388617, 16777259, 33554467 };
+
+static bool phinited = false;
+
+volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
+size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
+PageHeapAllocator<ThreadCache> threadcache_allocator;
+ThreadCache* ThreadCache::thread_heaps_ = NULL;
+int ThreadCache::thread_heap_count_ = 0;
+#ifdef HAVE_TLS
+__thread ThreadCache* ThreadCache::threadlocal_heap_
+# ifdef HAVE___ATTRIBUTE__
+   __attribute__ ((tls_model ("initial-exec")))
+# endif
+   ;
+#endif
+bool ThreadCache::tsd_inited_ = false;
+pthread_key_t ThreadCache::heap_key_;
+
+#if defined(HAVE_TLS)
+bool kernel_supports_tls = false;      // be conservative
+# if !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
+  void CheckIfKernelSupportsTLS() {
+    kernel_supports_tls = false;
+  }
+# else
+#   include <sys/utsname.h>    // DECL_UNAME checked for too
+  void CheckIfKernelSupportsTLS() {
+    struct utsname buf;
+    if (uname(&buf) != 0) {   // should be impossible
+      MESSAGE("uname failed assuming no TLS support (errno=%d)\n", errno);
+      kernel_supports_tls = false;
+    } else if (strcasecmp(buf.sysname, "linux") == 0) {
+      // The linux case: the first kernel to support TLS was 2.6.0
+      if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
+        kernel_supports_tls = false;
+      else if (buf.release[0] == '2' && buf.release[1] == '.' &&
+               buf.release[2] >= '0' && buf.release[2] < '6' &&
+               buf.release[3] == '.')                          // 2.0 - 2.5
+        kernel_supports_tls = false;
+      else
+        kernel_supports_tls = true;
+    } else {        // some other kernel, we'll be optimistic
+      kernel_supports_tls = true;
+    }
+    // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
+  }
+# endif  // HAVE_DECL_UNAME
+#endif   // HAVE_TLS
+
+void ThreadCache::Init(pthread_t tid) {
+  size_ = 0;
+  next_ = NULL;
+  prev_ = NULL;
+  tid_  = tid;
+  in_setspecific_ = false;
+  for (size_t cl = 0; cl < kNumClasses; ++cl) {
+    list_[cl].Init();
+  }
+
+  // Initialize RNG -- run it for a bit to get to good values
+  bytes_until_sample_ = 0;
+  rnd_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(this));
+  for (int i = 0; i < 100; i++) {
+    PickNextSample(FLAGS_tcmalloc_sample_parameter * 2);
+  }
+}
+
+void ThreadCache::Cleanup() {
+  // Put unused memory back into central cache
+  for (int cl = 0; cl < kNumClasses; ++cl) {
+    if (list_[cl].length() > 0) {
+      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
+    }
+  }
+}
+
+// Remove some objects of class "cl" from central cache and add to thread heap.
+// On success, return the first object for immediate use; otherwise return NULL.
+void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
+  void *start, *end;
+  int fetch_count = Static::central_cache()[cl].RemoveRange(
+      &start, &end,
+      Static::sizemap()->num_objects_to_move(cl));
+  ASSERT((start == NULL) == (fetch_count == 0));
+  if (--fetch_count >= 0) {
+    size_ += byte_size * fetch_count;
+    list_[cl].PushRange(fetch_count, SLL_Next(start), end);
+  }
+  return start;
+}
+
+// Remove some objects of class "cl" from thread heap and add to central cache
+size_t ThreadCache::ReleaseToCentralCache(FreeList* src,
+                                          size_t cl, int N) {
+  ASSERT(src == &list_[cl]);
+  if (N > src->length()) N = src->length();
+  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);
+
+  // We return prepackaged chains of the correct size to the central cache.
+  // TODO: Use the same format internally in the thread caches?
+  int batch_size = Static::sizemap()->num_objects_to_move(cl);
+  while (N > batch_size) {
+    void *tail, *head;
+    src->PopRange(batch_size, &head, &tail);
+    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
+    N -= batch_size;
+  }
+  void *tail, *head;
+  src->PopRange(N, &head, &tail);
+  Static::central_cache()[cl].InsertRange(head, tail, N);
+  return size_ -= delta_bytes;
+}
+
+// Release idle memory to the central cache
+void ThreadCache::Scavenge() {
+  // If the low-water mark for the free list is L, it means we would
+  // not have had to allocate anything from the central cache even if
+  // we had reduced the free list size by L.  We aim to get closer to
+  // that situation by dropping L/2 nodes from the free list.  This
+  // may not release much memory, but if so we will call scavenge again
+  // pretty soon and the low-water marks will be high on that call.
+  //int64 start = CycleClock::Now();
+
+  for (int cl = 0; cl < kNumClasses; cl++) {
+    FreeList* list = &list_[cl];
+    const int lowmark = list->lowwatermark();
+    if (lowmark > 0) {
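+      // Drop half of the low-water mark's worth of objects (at least one).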
+      const int drop = (lowmark > 1) ? lowmark/2 : 1;
+      ReleaseToCentralCache(list, cl, drop);
+    }
+    list->clear_lowwatermark();
+  }
+
+  //int64 finish = CycleClock::Now();
+  //CycleTimer ct;
+  //MESSAGE("GC: %.0f ns\n", ct.CyclesToUsec(finish-start)*1000.0);
+}
+
+void ThreadCache::PickNextSample(size_t k) {
+  // Copied from "base/synchronization.cc" (written by Mike Burrows)
+  // Make next "random" number
+  // x^32+x^22+x^2+x^1+1 is a primitive polynomial for random numbers
+  static const uint32_t kPoly = (1 << 22) | (1 << 2) | (1 << 1) | (1 << 0);
+  uint32_t r = rnd_;
+  rnd_ = (r << 1) ^ ((static_cast<int32_t>(r) >> 31) & kPoly);
+
+  // Next point is "rnd_ % (sample_period)".  I.e., average
+  // increment is "sample_period/2".
+  const int flag_value = FLAGS_tcmalloc_sample_parameter;
+  static int last_flag_value = -1;
+
+  if (flag_value != last_flag_value) {
+    SpinLockHolder h(&sample_period_lock);
+    int i;
+    for (i = 0; i < (sizeof(primes_list)/sizeof(primes_list[0]) - 1); i++) {
+      if (primes_list[i] >= flag_value) {
+        break;
+      }
+    }
+    sample_period = primes_list[i];
+    last_flag_value = flag_value;
+  }
+
+  bytes_until_sample_ += rnd_ % sample_period;
+
+  if (k > (static_cast<size_t>(-1) >> 2)) {
+    // If the user has asked for a huge allocation then it is possible
+    // for the code below to loop infinitely.  Just return (note that
+    // this throws off the sampling accuracy somewhat, but a user who
+    // is allocating more than 1G of memory at a time can live with a
+    // minor inaccuracy in profiling of small allocations, and also
+    // would rather not wait for the loop below to terminate).
+    return;
+  }
+
+  while (bytes_until_sample_ < k) {
+    // Increase bytes_until_sample_ by enough average sampling periods
+    // (sample_period >> 1) to allow us to sample past the current
+    // allocation.
+    bytes_until_sample_ += (sample_period >> 1);
+  }
+
+  bytes_until_sample_ -= k;
+}
+
+void ThreadCache::InitModule() {
+  // There is a slight potential race here because of double-checked
+  // locking idiom.  However, as long as the program does a small
+  // allocation before switching to multi-threaded mode, we will be
+  // fine.  We increase the chances of doing such a small allocation
+  // by doing one in the constructor of the module_enter_exit_hook
+  // object declared below.
+  SpinLockHolder h(Static::pageheap_lock());
+  if (!phinited) {
+    Static::InitStaticVars();
+    threadcache_allocator.Init();
+    phinited = 1;
+  }
+}
+
+void ThreadCache::InitTSD() {
+  ASSERT(!tsd_inited_);
+  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
+  tsd_inited_ = true;
+
+  // We may have used a fake pthread_t for the main thread.  Fix it.
+  pthread_t zero;
+  memset(&zero, 0, sizeof(zero));
+  SpinLockHolder h(Static::pageheap_lock());
+  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+    if (h->tid_ == zero) {
+      h->tid_ = pthread_self();
+    }
+  }
+}
+
+ThreadCache* ThreadCache::CreateCacheIfNecessary() {
+  // Initialize per-thread data if necessary
+  ThreadCache* heap = NULL;
+  {
+    SpinLockHolder h(Static::pageheap_lock());
+
+    // Early on in glibc's life, we cannot even call pthread_self()
+    pthread_t me;
+    if (!tsd_inited_) {
+      memset(&me, 0, sizeof(me));
+    } else {
+      me = pthread_self();
+    }
+
+    // This may be a recursive malloc call from pthread_setspecific()
+    // In that case, the heap for this thread has already been created
+    // and added to the linked list.  So we search for that first.
+    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+      if (h->tid_ == me) {
+        heap = h;
+        break;
+      }
+    }
+
+    if (heap == NULL) heap = NewHeap(me);
+  }
+
+  // We call pthread_setspecific() outside the lock because it may
+  // call malloc() recursively.  We check for the recursive call using
+  // the "in_setspecific_" flag so that we can avoid calling
+  // pthread_setspecific() if we are already inside pthread_setspecific().
+  if (!heap->in_setspecific_ && tsd_inited_) {
+    heap->in_setspecific_ = true;
+    perftools_pthread_setspecific(heap_key_, heap);
+#ifdef HAVE_TLS
+    // Also keep a copy in __thread for faster retrieval
+    threadlocal_heap_ = heap;
+#endif
+    heap->in_setspecific_ = false;
+  }
+  return heap;
+}
+
+void ThreadCache::BecomeIdle() {
+  if (!tsd_inited_) return;              // No caches yet
+  ThreadCache* heap = GetThreadHeap();
+  if (heap == NULL) return;              // No thread cache to remove
+  if (heap->in_setspecific_) return;     // Do not disturb the active caller
+
+  heap->in_setspecific_ = true;
+  perftools_pthread_setspecific(heap_key_, NULL);
+#ifdef HAVE_TLS
+  // Also update the copy in __thread
+  threadlocal_heap_ = NULL;
+#endif
+  heap->in_setspecific_ = false;
+  if (GetThreadHeap() == heap) {
+    // Somehow heap got reinstated by a recursive call to malloc
+    // from pthread_setspecific.  We give up in this case.
+    return;
+  }
+
+  // We can now get rid of the heap
+  DeleteCache(heap);
+}
+
+void ThreadCache::DestroyThreadCache(void* ptr) {
+  // Note that "ptr" cannot be NULL since pthread promises not
+  // to invoke the destructor on NULL values, but for safety,
+  // we check anyway.
+  if (ptr == NULL) return;
+#ifdef HAVE_TLS
+  // Prevent fast path of GetThreadHeap() from returning heap.
+  threadlocal_heap_ = NULL;
+#endif
+  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
+}
+
+void ThreadCache::DeleteCache(ThreadCache* heap) {
+  // Remove all memory from heap
+  heap->Cleanup();
+
+  // Remove from linked list
+  SpinLockHolder h(Static::pageheap_lock());
+  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
+  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
+  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
+  thread_heap_count_--;
+  RecomputeThreadCacheSize();
+
+  threadcache_allocator.Delete(heap);
+}
+
+void ThreadCache::RecomputeThreadCacheSize() {
+  // Divide available space across threads
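+  // Use 1 when no thread caches exist yet, to avoid dividing by zero.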
+  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
+  size_t space = overall_thread_cache_size_ / n;
+
+  // Limit to allowed range
+  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
+  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;
+
+  per_thread_cache_size_ = space;
+  //MESSAGE("Threads %d => cache size %8d\n", n, int(space));
+}
+
+void ThreadCache::Print() const {
+  for (int cl = 0; cl < kNumClasses; ++cl) {
+    MESSAGE(" %5" PRIuS " : %4" PRIuS " len; %4d lo\n",
+            Static::sizemap()->ByteSizeForClass(cl),
+            list_[cl].length(),
+            list_[cl].lowwatermark());
+  }
+}
+
+void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
+  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+    *total_bytes += h->Size();
+    if (class_count) {
+      for (int cl = 0; cl < kNumClasses; ++cl) {
+        class_count[cl] += h->freelist_length(cl);
+      }
+    }
+  }
+}
+
+void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
+  // Clip the value to a reasonable range
+  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
+  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
+
+  overall_thread_cache_size_ = new_size;
+  ThreadCache::RecomputeThreadCacheSize();
+}
+
+}  // namespace tcmalloc