Diffstat (limited to 'src/tcmalloc.cc')
-rw-r--r-- | src/tcmalloc.cc | 1050 |
1 file changed, 742 insertions, 308 deletions
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index bf45dfb..a23449b 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -55,7 +55,6 @@
 // TODO: Bias reclamation to larger addresses
 // TODO: implement mallinfo/mallopt
 // TODO: Better testing
-// TODO: Return memory to system
 //
 // 9/28/2003 (new page-level allocator replaces ptmalloc2):
 // * malloc/free of small objects goes from ~300 ns to ~50 ns.
@@ -73,28 +72,68 @@
 #else
 #include <sys/types.h>
 #endif
-#include <malloc.h>
+#ifdef HAVE_STRUCT_MALLINFO
+#include <malloc.h>                // for struct mallinfo
+#endif
 #include <string.h>
 #include <pthread.h>
 #include <unistd.h>
 #include <errno.h>
 #include <stdarg.h>
 #include "base/commandlineflags.h"
-#include "google/malloc_hook.h"
-#include "google/malloc_extension.h"
-#include "google/stacktrace.h"
+#include "base/basictypes.h"       // gets us PRIu64
+#include "base/sysinfo.h"
+#include "base/spinlock.h"
+#include <google/malloc_hook.h>
+#include <google/malloc_extension.h>
+#include <google/stacktrace.h>
 #include "internal_logging.h"
-#include "internal_spinlock.h"
 #include "pagemap.h"
 #include "system-alloc.h"
 #include "maybe_threads.h"
-#if defined HAVE_INTTYPES_H
-#define __STDC_FORMAT_MACROS
-#include <inttypes.h>
-#define LLU PRIu64
-#else
-#define LLU "llu"                  // hope for the best
+// Even if we have support for thread-local storage in the compiler
+// and linker, the OS may not support it.  We need to check that at
+// runtime.  Right now, we have to keep a manual set of "bad" OSes.
+#if defined(HAVE_TLS)
+  static bool kernel_supports_tls = false;      // be conservative
+  static inline bool KernelSupportsTLS() {
+    return kernel_supports_tls;
+  }
+# if !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
+  static void CheckIfKernelSupportsTLS() {
+    kernel_supports_tls = false;
+  }
+# else
+#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
+  static void CheckIfKernelSupportsTLS() {
+    struct utsname buf;
+    if (uname(&buf) != 0) {   // should be impossible
+      MESSAGE("uname failed; assuming no TLS support (errno=%d)\n", errno);
+      kernel_supports_tls = false;
+    } else if (strcasecmp(buf.sysname, "linux") == 0) {
+      // The linux case: the first kernel to support TLS was 2.6.0
+      if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
+        kernel_supports_tls = false;
+      else if (buf.release[0] == '2' && buf.release[1] == '.' &&
+               buf.release[2] >= '0' && buf.release[2] < '6' &&
+               buf.release[3] == '.')                       // 2.0 - 2.5
+        kernel_supports_tls = false;
+      else
+        kernel_supports_tls = true;
+    } else {        // some other kernel, we'll be optimistic
+      kernel_supports_tls = true;
+    }
+    // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
+  }
+# endif  // HAVE_DECL_UNAME
+#endif   // HAVE_TLS
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
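Stepping back to the TLS probe above: the version test is easy to exercise on its own. Below is a minimal standalone sketch that copies the same character comparisons (only the "first TLS-capable Linux kernel is 2.6.0" rule from the comment); the helper name and the sample release strings are made up for illustration.

#include <cstddef>
#include <cstdio>

// Mirrors the release-string test in CheckIfKernelSupportsTLS() above:
// anything older than Linux 2.6.0 is treated as having no TLS support.
static bool LinuxReleaseSupportsTLS(const char* release) {
  if (release[0] < '2' && release[1] == '.')            // 0.x or 1.x
    return false;
  if (release[0] == '2' && release[1] == '.' &&
      release[2] >= '0' && release[2] < '6' &&
      release[3] == '.')                                // 2.0 - 2.5
    return false;
  return true;
}

int main() {
  const char* samples[] = { "2.4.20", "2.6.9", "3.2.0" };   // illustrative only
  for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
    printf("%-8s -> TLS %s\n", samples[i],
           LinuxReleaseSupportsTLS(samples[i]) ? "assumed supported"
                                               : "assumed unsupported");
  }
  return 0;
}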
+#ifndef __THROW // I guess we're not on a glibc system +# define __THROW // __THROW is just an optimization, so ok to make it "" #endif //------------------------------------------------------------------- @@ -109,7 +148,7 @@ static const size_t kPageSize = 1 << kPageShift; static const size_t kMaxSize = 8u * kPageSize; static const size_t kAlignShift = 3; static const size_t kAlignment = 1 << kAlignShift; -static const size_t kNumClasses = 170; +static const size_t kNumClasses = 68; // Allocates a big block of memory for the pagemap once we reach more than // 128MB @@ -168,26 +207,63 @@ DEFINE_int64(tcmalloc_sample_parameter, 262147, " larger prime number"); static size_t sample_period = 262147; // Protects sample_period above -static SpinLock sample_period_lock = SPINLOCK_INITIALIZER; +static SpinLock sample_period_lock(SpinLock::LINKER_INITIALIZED); + +// Parameters for controlling how fast memory is returned to the OS. + +DEFINE_double(tcmalloc_release_rate, 1, + "Rate at which we release unused memory to the system. " + "Zero means we never release memory back to the system. " + "Increase this flag to return memory faster; decrease it " + "to return memory slower. Reasonable rates are in the " + "range [0,10]"); //------------------------------------------------------------------- // Mapping from size to size_class and vice versa //------------------------------------------------------------------- -// A pair of arrays we use for implementing the mapping from a size to -// its size class. Indexed by "floor(lg(size))". -static const int kSizeBits = 8 * sizeof(size_t); -static unsigned char size_base[kSizeBits]; -static unsigned char size_shift[kSizeBits]; - -// Mapping from size class to size +// Sizes <= 1024 have an alignment >= 8. So for such sizes we have an +// array indexed by ceil(size/8). Sizes > 1024 have an alignment >= 128. +// So for these larger sizes we have an array indexed by ceil(size/128). +// +// We flatten both logical arrays into one physical array and use +// arithmetic to compute an appropriate index. The "base_index[]" +// array contains the bases of the two logical arrays. +// +// base_index[] contains non-obvious values. We always add 127 to the +// size before dividing it by either 8 or 128 to implement ceil() +// efficiently. Therefore base_index[0] is -15 to compensate for the +// extra 127/8 we added to small sizes. Similarly base_index[1] is +// 120, so that the first index used by the second logical array is +// just past the last index used by the first logical array. +// +// Examples: +// Size Expression Index +// ------------------------------------------------------- +// 0 -15 + ((0+127) / 8) 0 +// 1 -15 + ((1+127) / 8) 1 +// ... +// 1024 -15 + ((1024+127) / 8) 128 +// 1025 120 + ((1025+127) / 128) 129 +// ... 
+// 32768 120 + ((32768+127) / 128) 376 +static const int kMaxSmallSize = 1024; +static const int shift_amount[2] = { 3, 7 }; // For divides by 8 or 128 +static const int base_index[2] = { -15, 120 }; // For finding array bases +static unsigned char class_array[377]; + +// Compute index of the class_array[] entry for a given size +static inline int ClassIndex(size_t s) { + const int i = (s > kMaxSmallSize); + return base_index[i] + ((s+127) >> shift_amount[i]); +} + +// Mapping from size class to max size storable in that class static size_t class_to_size[kNumClasses]; // Mapping from size class to number of pages to allocate at a time static size_t class_to_pages[kNumClasses]; - - // TransferCache is used to cache transfers of num_objects_to_move[size_class] // back and forth between thread caches and the central cache for a given size // class. @@ -202,20 +278,6 @@ struct TCEntry { // one class can have is kNumClasses. static const int kNumTransferEntries = kNumClasses; -// Return floor(log2(n)) for n > 0. -#if (defined __i386__ || defined __x86_64__) && defined __GNUC__ -static inline int LgFloor(size_t n) { - // "ro" for the input spec means the input can come from either a - // register ("r") or offsetable memory ("o"). - size_t result; - __asm__("bsr %1, %0" - : "=r" (result) // Output spec - : "ro" (n) // Input spec - : "cc" // Clobbers condition-codes - ); - return result; -} -#else // Note: the following only works for "n"s that fit in 32-bits, but // that is fine since we only use it for small sizes. static inline int LgFloor(size_t n) { @@ -231,8 +293,6 @@ static inline int LgFloor(size_t n) { ASSERT(n == 1); return log; } -#endif - // Some very basic linked list functions for dealing with using void * as // storage. @@ -298,10 +358,7 @@ static inline size_t SLL_Size(void *head) { // Setup helper functions. static inline int SizeClass(size_t size) { - if (size == 0) size = 1; - const int lg = LgFloor(size); - const int align = size_shift[lg]; - return static_cast<int>(size_base[lg]) + ((size-1) >> align); + return class_array[ClassIndex(size)]; } // Get the byte-size for a specified class @@ -335,13 +392,18 @@ static int NumMoveSize(size_t size) { // Initialize the mapping arrays static void InitSizeClasses() { - // Special initialization for small sizes - for (int lg = 0; lg < kAlignShift; lg++) { - size_base[lg] = 1; - size_shift[lg] = kAlignShift; + // Do some sanity checking on base_index[]/shift_amount[]/class_array[] + if (ClassIndex(0) < 0) { + MESSAGE("Invalid class index %d for size 0\n", ClassIndex(0)); + abort(); + } + if (ClassIndex(kMaxSize) >= sizeof(class_array)) { + MESSAGE("Invalid class index %d for kMaxSize\n", ClassIndex(kMaxSize)); + abort(); } - int next_class = 1; + // Compute the size classes we want to use + int sc = 1; // Next size class to assign int alignshift = kAlignShift; int last_lg = -1; for (size_t size = kAlignment; size <= kMaxSize; size += (1 << alignshift)) { @@ -357,31 +419,49 @@ static void InitSizeClasses() { if ((lg >= 7) && (alignshift < 8)) { alignshift++; } - size_base[lg] = next_class - ((size-1) >> alignshift); - size_shift[lg] = alignshift; + last_lg = lg; } - class_to_size[next_class] = size; - last_lg = lg; + // Allocate enough pages so leftover is less than 1/8 of total. + // This bounds wasted space to at most 12.5%. 
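The ClassIndex() arithmetic above can be sanity-checked directly against the example table in the comment. A minimal sketch, assuming kMaxSize is 32768 (the value it takes with the 4K pages that the scavenging comments later mention); only the three constants are copied from the patch.

#include <cassert>
#include <cstddef>
#include <cstdio>

static const int kMaxSmallSize = 1024;
static const int shift_amount[2] = { 3, 7 };      // divide by 8 or by 128
static const int base_index[2]   = { -15, 120 };  // bases of the two logical arrays

static inline int ClassIndex(size_t s) {
  const int i = (s > kMaxSmallSize);
  return base_index[i] + ((s + 127) >> shift_amount[i]);
}

int main() {
  // Values taken from the example table in the comment.
  assert(ClassIndex(0) == 0);
  assert(ClassIndex(1) == 1);
  assert(ClassIndex(1024) == 128);
  assert(ClassIndex(1025) == 129);
  assert(ClassIndex(32768) == 376);   // last slot of class_array[377]
  printf("ClassIndex matches the documented examples\n");
  return 0;
}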
+ size_t psize = kPageSize; + while ((psize % size) > (psize >> 3)) { + psize += kPageSize; + } + const size_t my_pages = psize >> kPageShift; + + if (sc > 1 && my_pages == class_to_pages[sc-1]) { + // See if we can merge this into the previous class without + // increasing the fragmentation of the previous class. + const size_t my_objects = (my_pages << kPageShift) / size; + const size_t prev_objects = (class_to_pages[sc-1] << kPageShift) + / class_to_size[sc-1]; + if (my_objects == prev_objects) { + // Adjust last class to include this size + class_to_size[sc-1] = size; + continue; + } + } - next_class++; + // Add new class + class_to_pages[sc] = my_pages; + class_to_size[sc] = size; + sc++; } - if (next_class >= kNumClasses) { - MESSAGE("used up too many size classes: %d\n", next_class); + if (sc != kNumClasses) { + MESSAGE("wrong number of size classes: found %d instead of %d\n", + sc, int(kNumClasses)); abort(); } - // Initialize the number of pages we should allocate to split into - // small objects for a given class. - for (size_t cl = 1; cl < next_class; cl++) { - // Allocate enough pages so leftover is less than 1/8 of total. - // This bounds wasted space to at most 12.5%. - size_t psize = kPageSize; - const size_t s = class_to_size[cl]; - while ((psize % s) > (psize >> 3)) { - psize += kPageSize; + // Initialize the mapping arrays + int next_size = 0; + for (int c = 1; c < kNumClasses; c++) { + const int max_size_in_class = class_to_size[c]; + for (int s = next_size; s <= max_size_in_class; s += kAlignment) { + class_array[ClassIndex(s)] = c; } - class_to_pages[cl] = psize >> kPageShift; + next_size = max_size_in_class + kAlignment; } // Double-check sizes just to be safe @@ -415,6 +495,23 @@ static void InitSizeClasses() { for (size_t cl = 1; cl < kNumClasses; ++cl) { num_objects_to_move[cl] = NumMoveSize(ByteSizeForClass(cl)); } + + if (false) { + // Dump class sizes and maximum external wastage per size class + for (size_t cl = 1; cl < kNumClasses; ++cl) { + const int alloc_size = class_to_pages[cl] << kPageShift; + const int alloc_objs = alloc_size / class_to_size[cl]; + const int min_used = (class_to_size[cl-1] + 1) * alloc_objs; + const int max_waste = alloc_size - min_used; + MESSAGE("SC %3d [ %8d .. %8d ] from %8d ; %2.0f%% maxwaste\n", + int(cl), + int(class_to_size[cl-1] + 1), + int(class_to_size[cl]), + int(class_to_pages[cl] << kPageShift), + max_waste * 100.0 / alloc_size + ); + } + } } // ------------------------------------------------------------------------- @@ -620,20 +717,6 @@ static void DLL_Prepend(Span* list, Span* span) { list->next = span; } -static void DLL_InsertOrdered(Span* list, Span* span) { - ASSERT(span->next == NULL); - ASSERT(span->prev == NULL); - // Look for appropriate place to insert - Span* x = list; - while ((x->next != list) && (x->next->start < span->start)) { - x = x->next; - } - span->next = x->next; - span->prev = x; - x->next->prev = span; - x->next = span; -} - // ------------------------------------------------------------------------- // Stack traces kept for sampled allocations // The following state is protected by pageheap_lock_. 
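The "leftover is less than 1/8 of total" rule above is small enough to try by hand. A sketch, again assuming 4K pages; the 1792-byte size below is just a hypothetical object size, not one of the classes the patch computes.

#include <cstddef>
#include <cstdio>

static const size_t kPageSize = 4096;   // assumed page size

// Same rule as InitSizeClasses(): grow the span until the leftover after
// carving out whole objects is at most 1/8 of the span.
static size_t PagesForSize(size_t size) {
  size_t psize = kPageSize;
  while ((psize % size) > (psize >> 3)) {
    psize += kPageSize;
  }
  return psize / kPageSize;
}

int main() {
  const size_t size  = 1792;                 // hypothetical object size
  const size_t pages = PagesForSize(size);
  const size_t span  = pages * kPageSize;
  printf("%zu-byte objects -> %zu-page spans, %zu bytes left over (%.1f%%)\n",
         size, pages, span % size, 100.0 * (span % size) / span);
  // Prints: 1792-byte objects -> 1-page spans, 512 bytes left over (12.5%)
  return 0;
}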
@@ -729,16 +812,27 @@ class TCMalloc_PageHeap { bool Check(); bool CheckList(Span* list, Length min_pages, Length max_pages); + // Release all pages on the free list for reuse by the OS: + void ReleaseFreePages(); + private: // Pick the appropriate map type based on pointer size typedef MapSelector<8*sizeof(uintptr_t)>::Type PageMap; PageMap pagemap_; + // We segregate spans of a given size into two circular linked + // lists: one for normal spans, and one for spans whose memory + // has been returned to the system. + struct SpanList { + Span normal; + Span returned; + }; + // List of free spans of length >= kMaxPages - Span large_; + SpanList large_; // Array mapping from span length to a doubly linked list of free spans - Span free_[kMaxPages]; + SpanList free_[kMaxPages]; // Number of pages kept in free lists uintptr_t free_pages_; @@ -753,7 +847,9 @@ class TCMalloc_PageHeap { // span into appropriate free lists. Also update "span" to have // length exactly "n" and mark it as non-free so it can be returned // to the client. - void Carve(Span* span, Length n); + // + // "released" is true iff "span" was found on a "returned" list. + void Carve(Span* span, Length n, bool released); void RecordSpan(Span* span) { pagemap_.set(span->start, span); @@ -761,14 +857,34 @@ class TCMalloc_PageHeap { pagemap_.set(span->start + span->length - 1, span); } } + + // Allocate a large span of length == n. If successful, returns a + // span of exactly the specified length. Else, returns NULL. + Span* AllocLarge(Length n); + + // Incrementally release some memory to the system. + // IncrementalScavenge(n) is called whenever n pages are freed. + void IncrementalScavenge(Length n); + + // Number of pages to deallocate before doing more scavenging + int64_t scavenge_counter_; + + // Index of last free list we scavenged + int scavenge_index_; }; -TCMalloc_PageHeap::TCMalloc_PageHeap() : pagemap_(MetaDataAlloc), - free_pages_(0), - system_bytes_(0) { - DLL_Init(&large_); +TCMalloc_PageHeap::TCMalloc_PageHeap() + : pagemap_(MetaDataAlloc), + free_pages_(0), + system_bytes_(0), + scavenge_counter_(0), + // Start scavenging at kMaxPages list + scavenge_index_(kMaxPages-1) { + DLL_Init(&large_.normal); + DLL_Init(&large_.returned); for (int i = 0; i < kMaxPages; i++) { - DLL_Init(&free_[i]); + DLL_Init(&free_[i].normal); + DLL_Init(&free_[i].returned); } } @@ -780,40 +896,79 @@ Span* TCMalloc_PageHeap::New(Length n) { // Find first size >= n that has a non-empty list for (Length s = n; s < kMaxPages; s++) { - if (!DLL_IsEmpty(&free_[s])) { - Span* result = free_[s].next; - Carve(result, n); - ASSERT(Check()); - free_pages_ -= n; - return result; + Span* ll = NULL; + bool released = false; + if (!DLL_IsEmpty(&free_[s].normal)) { + // Found normal span + ll = &free_[s].normal; + } else if (!DLL_IsEmpty(&free_[s].returned)) { + // Found returned span; reallocate it + ll = &free_[s].returned; + released = true; + } else { + // Keep looking in larger classes + continue; } + + Span* result = ll->next; + Carve(result, n, released); + ASSERT(Check()); + free_pages_ -= n; + return result; } - // Look in large list. If we first do not find something, we try to - // grow the heap and try again. 
- for (int i = 0; i < 2; i++) { - // find the best span (closest to n in size) - Span *best = NULL; - for (Span* span = large_.next; span != &large_; span = span->next) { - if (span->length >= n && - (best == NULL || span->length < best->length)) { + Span* result = AllocLarge(n); + if (result != NULL) return result; + + // Grow the heap and try again + if (!GrowHeap(n)) { + ASSERT(Check()); + return NULL; + } + + return AllocLarge(n); +} + +Span* TCMalloc_PageHeap::AllocLarge(Length n) { + // find the best span (closest to n in size). + // The following loops implements address-ordered best-fit. + bool from_released = false; + Span *best = NULL; + + // Search through normal list + for (Span* span = large_.normal.next; + span != &large_.normal; + span = span->next) { + if (span->length >= n) { + if ((best == NULL) + || (span->length < best->length) + || ((span->length == best->length) && (span->start < best->start))) { best = span; + from_released = false; } } - if (best != NULL) { - Carve(best, n); - ASSERT(Check()); - free_pages_ -= n; - return best; - } - if (i == 0) { - // Nothing suitable in large list. Grow the heap and look again. - if (!GrowHeap(n)) { - ASSERT(Check()); - return NULL; + } + + // Search through released list in case it has a better fit + for (Span* span = large_.returned.next; + span != &large_.returned; + span = span->next) { + if (span->length >= n) { + if ((best == NULL) + || (span->length < best->length) + || ((span->length == best->length) && (span->start < best->start))) { + best = span; + from_released = true; } } } + + if (best != NULL) { + Carve(best, n, from_released); + ASSERT(Check()); + free_pages_ -= n; + return best; + } return NULL; } @@ -834,7 +989,7 @@ Span* TCMalloc_PageHeap::Split(Span* span, Length n) { return leftover; } -void TCMalloc_PageHeap::Carve(Span* span, Length n) { +void TCMalloc_PageHeap::Carve(Span* span, Length n, bool released) { ASSERT(n > 0); DLL_Remove(span); span->free = 0; @@ -847,11 +1002,12 @@ void TCMalloc_PageHeap::Carve(Span* span, Length n) { leftover->free = 1; Event(leftover, 'S', extra); RecordSpan(leftover); - if (extra < kMaxPages) { - DLL_Prepend(&free_[extra], leftover); - } else { - DLL_InsertOrdered(&large_, leftover); - } + + // Place leftover span on appropriate free list + SpanList* listpair = (extra < kMaxPages) ? &free_[extra] : &large_; + Span* dst = released ? &listpair->returned : &listpair->normal; + DLL_Prepend(dst, leftover); + span->length = n; pagemap_.set(span->start + n - 1, span); } @@ -870,6 +1026,10 @@ void TCMalloc_PageHeap::Delete(Span* span) { // necessary. We do not bother resetting the stale pagemap // entries for the pieces we are merging together because we only // care about the pagemap entries for the boundaries. + // + // Note that the spans we merge into "span" may come out of + // a "returned" list. For simplicity, we move these into the + // "normal" list of the appropriate size class. 
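The span-selection rule in AllocLarge() above (smallest span that fits, ties broken by lower start address) can be sketched apart from the Span/DLL machinery. ToySpan and BestFit are illustrative stand-ins, not names from the patch; the real code runs the same comparison over both the "normal" and "returned" lists and remembers which list the winner came from.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

typedef uintptr_t PageID;
typedef uintptr_t Length;

struct ToySpan {        // stand-in for Span: only the fields the fit test needs
  PageID start;
  Length length;
};

// Same ordering as AllocLarge(): among spans with length >= n, prefer the
// shortest, and among equally short spans prefer the lowest start address.
static const ToySpan* BestFit(const std::vector<ToySpan>& spans, Length n) {
  const ToySpan* best = NULL;
  for (size_t i = 0; i < spans.size(); i++) {
    const ToySpan* s = &spans[i];
    if (s->length < n) continue;
    if (best == NULL ||
        s->length < best->length ||
        (s->length == best->length && s->start < best->start)) {
      best = s;
    }
  }
  return best;
}

int main() {
  std::vector<ToySpan> spans;
  ToySpan a = { 900, 300 };  spans.push_back(a);
  ToySpan b = { 100, 280 };  spans.push_back(b);
  ToySpan c = { 500, 280 };  spans.push_back(c);
  const ToySpan* best = BestFit(spans, 260);
  // Picks start=100: 280 pages beats 300, and 100 beats 500 on the tie.
  printf("best fit: start=%lu, length=%lu\n",
         static_cast<unsigned long>(best->start),
         static_cast<unsigned long>(best->length));
  return 0;
}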
const PageID p = span->start; const Length n = span->length; Span* prev = GetDescriptor(p-1); @@ -899,15 +1059,71 @@ void TCMalloc_PageHeap::Delete(Span* span) { Event(span, 'D', span->length); span->free = 1; if (span->length < kMaxPages) { - DLL_Prepend(&free_[span->length], span); + DLL_Prepend(&free_[span->length].normal, span); } else { - DLL_InsertOrdered(&large_, span); + DLL_Prepend(&large_.normal, span); } free_pages_ += n; + IncrementalScavenge(n); ASSERT(Check()); } +void TCMalloc_PageHeap::IncrementalScavenge(Length n) { + // Fast path; not yet time to release memory + scavenge_counter_ -= n; + if (scavenge_counter_ >= 0) return; // Not yet time to scavenge + + // Never delay scavenging for more than the following number of + // deallocated pages. With 4K pages, this comes to 4GB of + // deallocation. + static const int kMaxReleaseDelay = 1 << 20; + + // If there is nothing to release, wait for so many pages before + // scavenging again. With 4K pages, this comes to 1GB of memory. + static const int kDefaultReleaseDelay = 1 << 18; + + const double rate = FLAGS_tcmalloc_release_rate; + if (rate <= 1e-6) { + // Tiny release rate means that releasing is disabled. + scavenge_counter_ = kDefaultReleaseDelay; + return; + } + + // Find index of free list to scavenge + int index = scavenge_index_ + 1; + for (int i = 0; i < kMaxPages+1; i++) { + if (index > kMaxPages) index = 0; + SpanList* slist = (index == kMaxPages) ? &large_ : &free_[index]; + if (!DLL_IsEmpty(&slist->normal)) { + // Release the last span on the normal portion of this list + Span* s = slist->normal.prev; + DLL_Remove(s); + TCMalloc_SystemRelease(reinterpret_cast<void*>(s->start << kPageShift), + static_cast<size_t>(s->length << kPageShift)); + DLL_Prepend(&slist->returned, s); + + // Compute how long to wait until we return memory. + // FLAGS_tcmalloc_release_rate==1 means wait for 1000 pages + // after releasing one page. 
+ const double mult = 1000.0 / rate; + double wait = mult * static_cast<double>(s->length); + if (wait > kMaxReleaseDelay) { + // Avoid overflow and bound to reasonable range + wait = kMaxReleaseDelay; + } + scavenge_counter_ = static_cast<int64_t>(wait); + + scavenge_index_ = index; // Scavenge at index+1 next time + return; + } + index++; + } + + // Nothing to scavenge, delay for a while + scavenge_counter_ = kDefaultReleaseDelay; +} + void TCMalloc_PageHeap::RegisterSizeClass(Span* span, size_t sc) { // Associate span object with all interior pages as well ASSERT(!span->free); @@ -920,40 +1136,69 @@ void TCMalloc_PageHeap::RegisterSizeClass(Span* span, size_t sc) { } } +static double PagesToMB(uint64_t pages) { + return (pages << kPageShift) / 1048576.0; +} + void TCMalloc_PageHeap::Dump(TCMalloc_Printer* out) { int nonempty_sizes = 0; for (int s = 0; s < kMaxPages; s++) { - if (!DLL_IsEmpty(&free_[s])) nonempty_sizes++; + if (!DLL_IsEmpty(&free_[s].normal) || !DLL_IsEmpty(&free_[s].returned)) { + nonempty_sizes++; + } } out->printf("------------------------------------------------\n"); - out->printf("PageHeap: %d sizes; %6.1f MB free\n", nonempty_sizes, - (static_cast<double>(free_pages_) * kPageSize) / 1048576.0); + out->printf("PageHeap: %d sizes; %6.1f MB free\n", + nonempty_sizes, PagesToMB(free_pages_)); out->printf("------------------------------------------------\n"); - uint64_t cumulative = 0; + uint64_t total_normal = 0; + uint64_t total_returned = 0; for (int s = 0; s < kMaxPages; s++) { - if (!DLL_IsEmpty(&free_[s])) { - const int list_length = DLL_Length(&free_[s]); - uint64_t s_pages = s * list_length; - cumulative += s_pages; - out->printf("%6u pages * %6u spans ~ %6.1f MB; %6.1f MB cum\n", - s, list_length, - (s_pages << kPageShift) / 1048576.0, - (cumulative << kPageShift) / 1048576.0); + const int n_length = DLL_Length(&free_[s].normal); + const int r_length = DLL_Length(&free_[s].returned); + if (n_length + r_length > 0) { + uint64_t n_pages = s * n_length; + uint64_t r_pages = s * r_length; + total_normal += n_pages; + total_returned += r_pages; + out->printf("%6u pages * %6u spans ~ %6.1f MB; %6.1f MB cum" + "; unmapped: %6.1f MB; %6.1f MB cum\n", + s, + (n_length + r_length), + PagesToMB(n_pages + r_pages), + PagesToMB(total_normal + total_returned), + PagesToMB(r_pages), + PagesToMB(total_returned)); } } - uint64_t large_pages = 0; - int large_spans = 0; - for (Span* s = large_.next; s != &large_; s = s->next) { - out->printf(" [ %6" PRIuS " pages ]\n", s->length); - large_pages += s->length; - large_spans++; - } - cumulative += large_pages; - out->printf(">255 large * %6u spans ~ %6.1f MB; %6.1f MB cum\n", - large_spans, - (large_pages << kPageShift) / 1048576.0, - (cumulative << kPageShift) / 1048576.0); + uint64_t n_pages = 0; + uint64_t r_pages = 0; + int n_spans = 0; + int r_spans = 0; + out->printf("Normal large spans:\n"); + for (Span* s = large_.normal.next; s != &large_.normal; s = s->next) { + out->printf(" [ %6" PRIuS " pages ] %6.1f MB\n", + s->length, PagesToMB(s->length)); + n_pages += s->length; + n_spans++; + } + out->printf("Unmapped large spans:\n"); + for (Span* s = large_.returned.next; s != &large_.returned; s = s->next) { + out->printf(" [ %6" PRIuS " pages ] %6.1f MB\n", + s->length, PagesToMB(s->length)); + r_pages += s->length; + r_spans++; + } + total_normal += n_pages; + total_returned += r_pages; + out->printf(">255 large * %6u spans ~ %6.1f MB; %6.1f MB cum" + "; unmapped: %6.1f MB; %6.1f MB cum\n", + (n_spans + r_spans), + 
PagesToMB(n_pages + r_pages), + PagesToMB(total_normal + total_returned), + PagesToMB(r_pages), + PagesToMB(total_returned)); } static void RecordGrowth(size_t growth) { @@ -1013,10 +1258,13 @@ bool TCMalloc_PageHeap::GrowHeap(Length n) { } bool TCMalloc_PageHeap::Check() { - ASSERT(free_[0].next == &free_[0]); - CheckList(&large_, kMaxPages, 1000000000); + ASSERT(free_[0].normal.next == &free_[0].normal); + ASSERT(free_[0].returned.next == &free_[0].returned); + CheckList(&large_.normal, kMaxPages, 1000000000); + CheckList(&large_.returned, kMaxPages, 1000000000); for (Length s = 1; s < kMaxPages; s++) { - CheckList(&free_[s], s, s); + CheckList(&free_[s].normal, s, s); + CheckList(&free_[s].returned, s, s); } return true; } @@ -1032,6 +1280,26 @@ bool TCMalloc_PageHeap::CheckList(Span* list, Length min_pages, Length max_pages return true; } +static void ReleaseFreeList(Span* list, Span* returned) { + // Walk backwards through list so that when we push these + // spans on the "returned" list, we preserve the order. + while (!DLL_IsEmpty(list)) { + Span* s = list->prev; + DLL_Remove(s); + DLL_Prepend(returned, s); + TCMalloc_SystemRelease(reinterpret_cast<void*>(s->start << kPageShift), + static_cast<size_t>(s->length << kPageShift)); + } +} + +void TCMalloc_PageHeap::ReleaseFreePages() { + for (Length s = 0; s < kMaxPages; s++) { + ReleaseFreeList(&free_[s].normal, &free_[s].returned); + } + ReleaseFreeList(&large_.normal, &large_.returned); + ASSERT(Check()); +} + //------------------------------------------------------------------- // Free list //------------------------------------------------------------------- @@ -1105,6 +1373,11 @@ class TCMalloc_ThreadCache { uint32_t rnd_; // Cheap random number generator size_t bytes_until_sample_; // Bytes until we sample next + // Allocate a new heap. REQUIRES: pageheap_lock is held. + static inline TCMalloc_ThreadCache* NewHeap(pthread_t tid); + + // Use only as pthread thread-specific destructor function. + static void DestroyThreadCache(void* ptr); public: // All ThreadCache objects are kept in a linked list (for stats collection) TCMalloc_ThreadCache* next_; @@ -1132,14 +1405,16 @@ class TCMalloc_ThreadCache { bool SampleAllocation(size_t k); // Pick next sampling point - void PickNextSample(); + void PickNextSample(size_t k); static void InitModule(); static void InitTSD(); + static TCMalloc_ThreadCache* GetThreadHeap(); static TCMalloc_ThreadCache* GetCache(); static TCMalloc_ThreadCache* GetCacheIfPresent(); - static void* CreateCacheIfNecessary(); - static void DeleteCache(void* ptr); + static TCMalloc_ThreadCache* CreateCacheIfNecessary(); + static void DeleteCache(TCMalloc_ThreadCache* heap); + static void BecomeIdle(); static void RecomputeThreadCacheSize(); }; @@ -1260,7 +1535,7 @@ class TCMalloc_Central_FreeListPadded : public TCMalloc_Central_FreeList { static TCMalloc_Central_FreeListPadded central_cache[kNumClasses]; // Page-level allocator -static SpinLock pageheap_lock = SPINLOCK_INITIALIZER; +static SpinLock pageheap_lock(SpinLock::LINKER_INITIALIZED); static char pageheap_memory[sizeof(TCMalloc_PageHeap)]; static bool phinited = false; @@ -1268,6 +1543,16 @@ static bool phinited = false; // of pageheap_memory. #define pageheap ((TCMalloc_PageHeap*) pageheap_memory) +// If TLS is available, we also store a copy +// of the per-thread object in a __thread variable +// since __thread variables are faster to read +// than pthread_getspecific(). 
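ReleaseFreeList() above walks the normal list backwards precisely so that prepending each span onto the "returned" list preserves the original order. The effect is easy to see with a plain std::list standing in for the two span lists (a toy sketch, not the patch's DLL code):

#include <cstdio>
#include <list>

int main() {
  std::list<int> normal;      // stands in for the "normal" span list
  std::list<int> returned;    // stands in for the "returned" span list
  for (int i = 1; i <= 4; i++) normal.push_back(i);     // 1 2 3 4

  // Pop from the back and prepend, the way ReleaseFreeList() does.
  while (!normal.empty()) {
    returned.push_front(normal.back());
    normal.pop_back();
  }

  // Order is preserved: prints "1 2 3 4".
  for (std::list<int>::const_iterator it = returned.begin();
       it != returned.end(); ++it) {
    printf("%d ", *it);
  }
  printf("\n");
  return 0;
}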
We still need +// pthread_setspecific() because __thread +// variables provide no way to run cleanup +// code when a thread is destroyed. +#ifdef HAVE_TLS +static __thread TCMalloc_ThreadCache *threadlocal_heap; +#endif // Thread-specific key. Initialization here is somewhat tricky // because some Linux startup code invokes malloc() before it // is in a good enough state to handle pthread_keycreate(). @@ -1297,7 +1582,6 @@ static volatile size_t per_thread_cache_size = kMaxThreadCacheSize; //------------------------------------------------------------------- void TCMalloc_Central_FreeList::Init(size_t cl) { - lock_.Init(); size_class_ = cl; DLL_Init(&empty_); DLL_Init(&nonempty_); @@ -1396,9 +1680,9 @@ bool TCMalloc_Central_FreeList::MakeCacheSpace() { namespace { class LockInverter { private: - TCMalloc_SpinLock *held_, *temp_; + SpinLock *held_, *temp_; public: - inline explicit LockInverter(TCMalloc_SpinLock* held, TCMalloc_SpinLock *temp) + inline explicit LockInverter(SpinLock* held, SpinLock *temp) : held_(held), temp_(temp) { held_->Unlock(); temp_->Lock(); } inline ~LockInverter() { temp_->Unlock(); held_->Lock(); } }; @@ -1558,7 +1842,7 @@ void TCMalloc_Central_FreeList::Populate() { inline bool TCMalloc_ThreadCache::SampleAllocation(size_t k) { if (bytes_until_sample_ < k) { - PickNextSample(); + PickNextSample(k); return true; } else { bytes_until_sample_ -= k; @@ -1577,9 +1861,10 @@ void TCMalloc_ThreadCache::Init(pthread_t tid) { } // Initialize RNG -- run it for a bit to get to good values + bytes_until_sample_ = 0; rnd_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(this)); for (int i = 0; i < 100; i++) { - PickNextSample(); + PickNextSample(FLAGS_tcmalloc_sample_parameter * 2); } } @@ -1670,27 +1955,7 @@ void TCMalloc_ThreadCache::Scavenge() { //MESSAGE("GC: %.0f ns\n", ct.CyclesToUsec(finish-start)*1000.0); } -inline TCMalloc_ThreadCache* TCMalloc_ThreadCache::GetCache() { - void* ptr = NULL; - if (!tsd_inited) { - InitModule(); - } else { - ptr = perftools_pthread_getspecific(heap_key); - } - if (ptr == NULL) ptr = CreateCacheIfNecessary(); - return reinterpret_cast<TCMalloc_ThreadCache*>(ptr); -} - -// In deletion paths, we do not try to create a thread-cache. This is -// because we may be in the thread destruction code and may have -// already cleaned up the cache for this thread. -inline TCMalloc_ThreadCache* TCMalloc_ThreadCache::GetCacheIfPresent() { - if (!tsd_inited) return NULL; - return reinterpret_cast<TCMalloc_ThreadCache*> - (perftools_pthread_getspecific(heap_key)); -} - -void TCMalloc_ThreadCache::PickNextSample() { +void TCMalloc_ThreadCache::PickNextSample(size_t k) { // Make next "random" number // x^32+x^22+x^2+x^1+1 is a primitive polynomial for random numbers static const uint32_t kPoly = (1 << 22) | (1 << 2) | (1 << 1) | (1 << 0); @@ -1713,7 +1978,27 @@ void TCMalloc_ThreadCache::PickNextSample() { sample_period = primes_list[i]; last_flag_value = flag_value; } - bytes_until_sample_ = rnd_ % sample_period; + + bytes_until_sample_ += rnd_ % sample_period; + + if (k > (static_cast<size_t>(-1) >> 2)) { + // If the user has asked for a huge allocation then it is possible + // for the code below to loop infinitely. Just return (note that + // this throws off the sampling accuracy somewhat, but a user who + // is allocating more than 1G of memory at a time can live with a + // minor inaccuracy in profiling of small allocations, and also + // would rather not wait for the loop below to terminate). 
+ return; + } + + while (bytes_until_sample_ < k) { + // Increase bytes_until_sample_ by enough average sampling periods + // (sample_period >> 1) to allow us to sample past the current + // allocation. + bytes_until_sample_ += (sample_period >> 1); + } + + bytes_until_sample_ -= k; } void TCMalloc_ThreadCache::InitModule() { @@ -1740,9 +2025,52 @@ void TCMalloc_ThreadCache::InitModule() { } } +inline TCMalloc_ThreadCache* TCMalloc_ThreadCache::NewHeap(pthread_t tid) { + // Create the heap and add it to the linked list + TCMalloc_ThreadCache *heap = threadheap_allocator.New(); + heap->Init(tid); + heap->next_ = thread_heaps; + heap->prev_ = NULL; + if (thread_heaps != NULL) thread_heaps->prev_ = heap; + thread_heaps = heap; + thread_heap_count++; + RecomputeThreadCacheSize(); + return heap; +} + +inline TCMalloc_ThreadCache* TCMalloc_ThreadCache::GetThreadHeap() { +#ifdef HAVE_TLS + // __thread is faster, but only when the kernel supports it + if (KernelSupportsTLS()) + return threadlocal_heap; +#endif + return + reinterpret_cast<TCMalloc_ThreadCache *>(perftools_pthread_getspecific(heap_key)); +} + +inline TCMalloc_ThreadCache* TCMalloc_ThreadCache::GetCache() { + TCMalloc_ThreadCache* ptr = NULL; + if (!tsd_inited) { + InitModule(); + } else { + ptr = GetThreadHeap(); + } + if (ptr == NULL) ptr = CreateCacheIfNecessary(); + return ptr; +} + +// In deletion paths, we do not try to create a thread-cache. This is +// because we may be in the thread destruction code and may have +// already cleaned up the cache for this thread. +inline TCMalloc_ThreadCache* TCMalloc_ThreadCache::GetCacheIfPresent() { + if (!tsd_inited) return NULL; + void* const p = GetThreadHeap(); + return reinterpret_cast<TCMalloc_ThreadCache*>(p); +} + void TCMalloc_ThreadCache::InitTSD() { ASSERT(!tsd_inited); - perftools_pthread_key_create(&heap_key, DeleteCache); + perftools_pthread_key_create(&heap_key, DestroyThreadCache); tsd_inited = true; // We may have used a fake pthread_t for the main thread. Fix it. 
@@ -1756,7 +2084,7 @@ void TCMalloc_ThreadCache::InitTSD() { } } -void* TCMalloc_ThreadCache::CreateCacheIfNecessary() { +TCMalloc_ThreadCache* TCMalloc_ThreadCache::CreateCacheIfNecessary() { // Initialize per-thread data if necessary TCMalloc_ThreadCache* heap = NULL; { @@ -1780,17 +2108,7 @@ void* TCMalloc_ThreadCache::CreateCacheIfNecessary() { } } - if (heap == NULL) { - // Create the heap and add it to the linked list - heap = threadheap_allocator.New(); - heap->Init(me); - heap->next_ = thread_heaps; - heap->prev_ = NULL; - if (thread_heaps != NULL) thread_heaps->prev_ = heap; - thread_heaps = heap; - thread_heap_count++; - RecomputeThreadCacheSize(); - } + if (heap == NULL) heap = NewHeap(me); } // We call pthread_setspecific() outside the lock because it may @@ -1800,15 +2118,52 @@ void* TCMalloc_ThreadCache::CreateCacheIfNecessary() { if (!heap->in_setspecific_ && tsd_inited) { heap->in_setspecific_ = true; perftools_pthread_setspecific(heap_key, heap); +#ifdef HAVE_TLS + // Also keep a copy in __thread for faster retrieval + threadlocal_heap = heap; +#endif heap->in_setspecific_ = false; } return heap; } -void TCMalloc_ThreadCache::DeleteCache(void* ptr) { +void TCMalloc_ThreadCache::BecomeIdle() { + if (!tsd_inited) return; // No caches yet + TCMalloc_ThreadCache* heap = GetThreadHeap(); + if (heap == NULL) return; // No thread cache to remove + if (heap->in_setspecific_) return; // Do not disturb the active caller + + heap->in_setspecific_ = true; + perftools_pthread_setspecific(heap_key, NULL); +#ifdef HAVE_TLS + // Also update the copy in __thread + threadlocal_heap = NULL; +#endif + heap->in_setspecific_ = false; + if (GetThreadHeap() == heap) { + // Somehow heap got reinstated by a recursive call to malloc + // from pthread_setspecific. We give up in this case. + return; + } + + // We can now get rid of the heap + DeleteCache(heap); +} + +void TCMalloc_ThreadCache::DestroyThreadCache(void* ptr) { + // Note that "ptr" cannot be NULL since pthread promises not + // to invoke the destructor on NULL values, but for safety, + // we check anyway. + if (ptr == NULL) return; +#ifdef HAVE_TLS + // Prevent fast path of GetThreadHeap() from returning heap. 
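The per-thread cache lookup added in this patch pairs a __thread pointer (cheap to read) with pthread_setspecific(), which is kept only so that a destructor runs at thread exit; DestroyThreadCache, continued just below, clears the __thread copy before tearing the cache down. A minimal sketch of that pattern, with illustrative names (Cache, cache_key, and the helpers are not from the patch):

#include <pthread.h>
#include <cstdlib>

struct Cache { int dummy; };               // stand-in for the real thread cache

static pthread_key_t cache_key;
#ifdef HAVE_TLS
static __thread Cache* tls_cache = NULL;   // fast path: a single load
#endif

static void DestroyCache(void* ptr) {      // pthread runs this at thread exit
#ifdef HAVE_TLS
  tls_cache = NULL;    // keep the fast path from returning a dead cache
#endif
  free(ptr);
}

static void InitCacheKey() {
  pthread_key_create(&cache_key, DestroyCache);
}

static Cache* GetCache() {
#ifdef HAVE_TLS
  if (tls_cache != NULL) return tls_cache;
#endif
  Cache* c = static_cast<Cache*>(pthread_getspecific(cache_key));
  if (c == NULL) {
    c = static_cast<Cache*>(calloc(1, sizeof(Cache)));
    pthread_setspecific(cache_key, c);     // registers DestroyCache for this thread
#ifdef HAVE_TLS
    tls_cache = c;
#endif
  }
  return c;
}

int main() {
  InitCacheKey();
  return GetCache() != NULL ? 0 : 1;
}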
+ threadlocal_heap = NULL; +#endif + DeleteCache(reinterpret_cast<TCMalloc_ThreadCache*>(ptr)); +} + +void TCMalloc_ThreadCache::DeleteCache(TCMalloc_ThreadCache* heap) { // Remove all memory from heap - TCMalloc_ThreadCache* heap; - heap = reinterpret_cast<TCMalloc_ThreadCache*>(ptr); heap->Cleanup(); // Remove from linked list @@ -1832,6 +2187,7 @@ void TCMalloc_ThreadCache::RecomputeThreadCacheSize() { if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize; per_thread_cache_size = space; + //MESSAGE("Threads %d => cache size %8d\n", n, int(space)); } void TCMalloc_ThreadCache::Print() const { @@ -1902,7 +2258,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) { uint64_t class_bytes = class_count[cl] * ByteSizeForClass(cl); cumulative += class_bytes; out->printf("class %3d [ %8" PRIuS " bytes ] : " - "%8" LLU " objs; %5.1f MB; %5.1f cum MB\n", + "%8" PRIu64 " objs; %5.1f MB; %5.1f cum MB\n", cl, ByteSizeForClass(cl), class_count[cl], class_bytes / 1048576.0, @@ -1921,15 +2277,15 @@ static void DumpStats(TCMalloc_Printer* out, int level) { - stats.thread_bytes; out->printf("------------------------------------------------\n" - "MALLOC: %12" LLU " Heap size\n" - "MALLOC: %12" LLU " Bytes in use by application\n" - "MALLOC: %12" LLU " Bytes free in page heap\n" - "MALLOC: %12" LLU " Bytes free in central cache\n" - "MALLOC: %12" LLU " Bytes free in transfer cache\n" - "MALLOC: %12" LLU " Bytes free in thread caches\n" - "MALLOC: %12" LLU " Spans in use\n" - "MALLOC: %12" LLU " Thread heaps in use\n" - "MALLOC: %12" LLU " Metadata allocated\n" + "MALLOC: %12" PRIu64 " Heap size\n" + "MALLOC: %12" PRIu64 " Bytes in use by application\n" + "MALLOC: %12" PRIu64 " Bytes free in page heap\n" + "MALLOC: %12" PRIu64 " Bytes free in central cache\n" + "MALLOC: %12" PRIu64 " Bytes free in transfer cache\n" + "MALLOC: %12" PRIu64 " Bytes free in thread caches\n" + "MALLOC: %12" PRIu64 " Spans in use\n" + "MALLOC: %12" PRIu64 " Thread heaps in use\n" + "MALLOC: %12" PRIu64 " Metadata allocated\n" "------------------------------------------------\n", stats.system_bytes, bytes_in_use, @@ -2120,31 +2476,80 @@ class TCMallocImplementation : public MallocExtension { return false; } + + virtual void MarkThreadIdle() { + TCMalloc_ThreadCache::BecomeIdle(); + } + + virtual void ReleaseFreeMemory() { + SpinLockHolder h(&pageheap_lock); + pageheap->ReleaseFreePages(); + } +}; + +// The constructor allocates an object to ensure that initialization +// runs before main(), and therefore we do not have a chance to become +// multi-threaded before initialization. We also create the TSD key +// here. Presumably by the time this constructor runs, glibc is in +// good enough shape to handle pthread_key_create(). +// +// The constructor also takes the opportunity to tell STL to use +// tcmalloc. We want to do this early, before construct time, so +// all user STL allocations go through tcmalloc (which works really +// well for STL). +// +// The destructor prints stats when the program exits. 
+class TCMallocGuard { + public: + + TCMallocGuard() { +#ifdef HAVE_TLS // this is true if the cc/ld/libc combo support TLS + // Check whether the kernel also supports TLS (needs to happen at runtime) + CheckIfKernelSupportsTLS(); +#endif + free(malloc(1)); + TCMalloc_ThreadCache::InitTSD(); + free(malloc(1)); + MallocExtension::Register(new TCMallocImplementation); + } + + ~TCMallocGuard() { + const char* env = getenv("MALLOCSTATS"); + if (env != NULL) { + int level = atoi(env); + if (level < 1) level = 1; + PrintStats(level); + } + } }; +static TCMallocGuard module_enter_exit_hook; //------------------------------------------------------------------- // Helpers for the exported routines below //------------------------------------------------------------------- static Span* DoSampledAllocation(size_t size) { - SpinLockHolder h(&pageheap_lock); + // Grab the stack trace outside the heap lock + StackTrace tmp; + tmp.depth = GetStackTrace(tmp.stack, kMaxStackDepth, 1); + tmp.size = size; + + SpinLockHolder h(&pageheap_lock); // Allocate span - Span* span = pageheap->New(pages(size == 0 ? 1 : size)); + Span *span = pageheap->New(pages(size == 0 ? 1 : size)); if (span == NULL) { return NULL; } - + // Allocate stack trace - StackTrace* stack = stacktrace_allocator.New(); + StackTrace *stack = stacktrace_allocator.New(); if (stack == NULL) { // Sampling failed because of lack of memory return span; } - // Fill stack trace and record properly - stack->depth = GetStackTrace(stack->stack, kMaxStackDepth, 1); - stack->size = size; + *stack = tmp; span->sample = 1; span->objects = stack; DLL_Prepend(&sampled_objects, span); @@ -2155,9 +2560,6 @@ static Span* DoSampledAllocation(size_t size) { static inline void* do_malloc(size_t size) { void* ret = NULL; - if (TCMallocDebug::level >= TCMallocDebug::kVerbose) { - MESSAGE("In tcmalloc do_malloc(%" PRIuS")\n", size); - } // The following call forces module initialization TCMalloc_ThreadCache* heap = TCMalloc_ThreadCache::GetCache(); if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) { @@ -2180,8 +2582,6 @@ static inline void* do_malloc(size_t size) { } static inline void do_free(void* ptr) { - if (TCMallocDebug::level >= TCMallocDebug::kVerbose) - MESSAGE("In tcmalloc do_free(%p)\n", ptr); if (ptr == NULL) return; ASSERT(pageheap != NULL); // Should not call free() before malloc() const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift; @@ -2286,47 +2686,41 @@ static void* do_memalign(size_t align, size_t size) { return reinterpret_cast<void*>(span->start << kPageShift); } +// Helpers for use by exported routines below: +static inline void do_malloc_stats() { + PrintStats(1); +} -// The constructor allocates an object to ensure that initialization -// runs before main(), and therefore we do not have a chance to become -// multi-threaded before initialization. We also create the TSD key -// here. Presumably by the time this constructor runs, glibc is in -// good enough shape to handle pthread_key_create(). -// -// The constructor also takes the opportunity to tell STL to use -// tcmalloc. We want to do this early, before construct time, so -// all user STL allocations go through tcmalloc (which works really -// well for STL). -// -// The destructor prints stats when the program exits. 
+static inline int do_mallopt(int cmd, int value) { + return 1; // Indicates error +} -class TCMallocGuard { - public: - TCMallocGuard() { - char *envval; - if ((envval = getenv("TCMALLOC_DEBUG"))) { - TCMallocDebug::level = atoi(envval); - MESSAGE("Set tcmalloc debugging level to %d\n", TCMallocDebug::level); - } - do_free(do_malloc(1)); - TCMalloc_ThreadCache::InitTSD(); - do_free(do_malloc(1)); - MallocExtension::Register(new TCMallocImplementation); - } +#ifdef HAVE_STRUCT_MALLINFO // mallinfo isn't defined on freebsd, for instance +static inline struct mallinfo do_mallinfo() { + TCMallocStats stats; + ExtractStats(&stats, NULL); - ~TCMallocGuard() { - const char* env = getenv("MALLOCSTATS"); - if (env != NULL) { - int level = atoi(env); - if (level < 1) level = 1; - PrintStats(level); - } - } -}; + // Just some of the fields are filled in. + struct mallinfo info; + memset(&info, 0, sizeof(info)); -static TCMallocGuard module_enter_exit_hook; + // Unfortunately, the struct contains "int" field, so some of the + // size values will be truncated. + info.arena = static_cast<int>(stats.system_bytes); + info.fsmblks = static_cast<int>(stats.thread_bytes + + stats.central_bytes + + stats.transfer_bytes); + info.fordblks = static_cast<int>(stats.pageheap_bytes); + info.uordblks = static_cast<int>(stats.system_bytes + - stats.thread_bytes + - stats.central_bytes + - stats.transfer_bytes + - stats.pageheap_bytes); + return info; +} +#endif //------------------------------------------------------------------- // Exported routines @@ -2337,18 +2731,67 @@ static TCMallocGuard module_enter_exit_hook; // heap-checker.cc depends on this to start a stack trace from // the call to the (de)allocation function. -extern "C" void* malloc(size_t size) { +// Put all callers of MallocHook::Invoke* in this module into +// ATTRIBUTE_SECTION(google_malloc_allocators) section, +// so that MallocHook::GetCallerStackTrace can function accurately: + +// NOTE: __THROW expands to 'throw()', which means 'never throws.' Urgh. 
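On a glibc system (the HAVE_STRUCT_MALLINFO case), the fields that do_mallinfo() above fills in can be read back through the standard mallinfo() call. A small illustrative consumer; note the struct uses int fields, so large totals are truncated, as the comment in the patch points out.

#include <malloc.h>    // struct mallinfo; glibc only
#include <cstdio>
#include <cstdlib>

int main() {
  void* p = malloc(1 << 20);            // make sure something is allocated
  struct mallinfo info = mallinfo();
  printf("arena    (heap size):          %d\n", info.arena);
  printf("uordblks (in use by app):      %d\n", info.uordblks);
  printf("fordblks (free in page heap):  %d\n", info.fordblks);
  printf("fsmblks  (free in caches):     %d\n", info.fsmblks);
  free(p);
  return 0;
}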
+extern "C" { + void* malloc(size_t size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + void free(void* ptr) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + void* realloc(void* ptr, size_t size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + void* calloc(size_t nmemb, size_t size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + void cfree(void* ptr) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + + void* memalign(size_t __alignment, size_t __size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + int posix_memalign(void** ptr, size_t align, size_t size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + void* valloc(size_t __size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + void* pvalloc(size_t __size) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); +} + +static void *MemalignOverride(size_t align, size_t size, const void *caller) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + +void* operator new(size_t size) + ATTRIBUTE_SECTION(google_malloc_allocators); +void operator delete(void* p) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); +void* operator new[](size_t size) + ATTRIBUTE_SECTION(google_malloc_allocators); +void operator delete[](void* p) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + +// And the nothrow variants of these: +void* operator new(size_t size, const std::nothrow_t&) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); +void operator delete(void* p, const std::nothrow_t&) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); +void* operator new[](size_t size, const std::nothrow_t&) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); +void operator delete[](void* p, const std::nothrow_t&) + __THROW ATTRIBUTE_SECTION(google_malloc_allocators); + +extern "C" void* malloc(size_t size) __THROW { void* result = do_malloc(size); MallocHook::InvokeNewHook(result, size); return result; } -extern "C" void free(void* ptr) { +extern "C" void free(void* ptr) __THROW { MallocHook::InvokeDeleteHook(ptr); do_free(ptr); } -extern "C" void* calloc(size_t n, size_t elem_size) { +extern "C" void* calloc(size_t n, size_t elem_size) __THROW { // Overflow check const size_t size = n * elem_size; if (elem_size != 0 && size / elem_size != n) return NULL; @@ -2361,12 +2804,12 @@ extern "C" void* calloc(size_t n, size_t elem_size) { return result; } -extern "C" void cfree(void* ptr) { +extern "C" void cfree(void* ptr) __THROW { MallocHook::InvokeDeleteHook(ptr); do_free(ptr); } -extern "C" void* realloc(void* old_ptr, size_t new_size) { +extern "C" void* realloc(void* old_ptr, size_t new_size) __THROW { if (old_ptr == NULL) { void* result = do_malloc(new_size); MallocHook::InvokeNewHook(result, new_size); @@ -2406,21 +2849,12 @@ extern "C" void* realloc(void* old_ptr, size_t new_size) { } } -#ifndef COMPILER_INTEL -#define OP_THROWNOTHING -#define OP_THROWBADALLOC -#else -#define OP_THROWNOTHING throw() -#define OP_THROWBADALLOC throw(std::bad_alloc) -#endif - -static SpinLock set_new_handler_lock = SPINLOCK_INITIALIZER; +static SpinLock set_new_handler_lock(SpinLock::LINKER_INITIALIZED); static inline void* cpp_alloc(size_t size, bool nothrow) { for (;;) { void* p = do_malloc(size); #ifdef PREANSINEW - MallocHook::InvokeNewHook(p, size); return p; #else if (p == NULL) { // allocation failed @@ -2446,60 +2880,77 @@ static inline void* cpp_alloc(size_t size, bool nothrow) { (*nh)(); } catch (const std::bad_alloc&) { if (!nothrow) throw; - MallocHook::InvokeNewHook(p, size); return p; } } 
else { // allocation success - MallocHook::InvokeNewHook(p, size); return p; } #endif } } -void* operator new(size_t size) OP_THROWBADALLOC { - return cpp_alloc(size, false); +void* operator new(size_t size) { + void* p = cpp_alloc(size, false); + // We keep this next instruction out of cpp_alloc for a reason: when + // it's in, and new just calls cpp_alloc, the optimizer may fold the + // new call into cpp_alloc, which messes up our whole section-based + // stacktracing (see ATTRIBUTE_SECTION, above). This ensures cpp_alloc + // isn't the last thing this fn calls, and prevents the folding. + MallocHook::InvokeNewHook(p, size); + return p; } -void* operator new(size_t size, const std::nothrow_t&) OP_THROWNOTHING { - return cpp_alloc(size, true); +void* operator new(size_t size, const std::nothrow_t&) __THROW { + void* p = cpp_alloc(size, true); + MallocHook::InvokeNewHook(p, size); + return p; } -void operator delete(void* p) OP_THROWNOTHING { +void operator delete(void* p) __THROW { MallocHook::InvokeDeleteHook(p); do_free(p); } -void operator delete(void* p, const std::nothrow_t&) OP_THROWNOTHING { +void operator delete(void* p, const std::nothrow_t&) __THROW { MallocHook::InvokeDeleteHook(p); do_free(p); } -void* operator new[](size_t size) OP_THROWBADALLOC { - return cpp_alloc(size, false); +void* operator new[](size_t size) { + void* p = cpp_alloc(size, false); + // We keep this next instruction out of cpp_alloc for a reason: when + // it's in, and new just calls cpp_alloc, the optimizer may fold the + // new call into cpp_alloc, which messes up our whole section-based + // stacktracing (see ATTRIBUTE_SECTION, above). This ensures cpp_alloc + // isn't the last thing this fn calls, and prevents the folding. + MallocHook::InvokeNewHook(p, size); + return p; } -void* operator new[](size_t size, const std::nothrow_t&) OP_THROWNOTHING { - return cpp_alloc(size, true); +void* operator new[](size_t size, const std::nothrow_t&) __THROW { + void* p = cpp_alloc(size, true); + MallocHook::InvokeNewHook(p, size); + return p; } -void operator delete[](void* p) OP_THROWNOTHING { +void operator delete[](void* p) __THROW { MallocHook::InvokeDeleteHook(p); do_free(p); } -void operator delete[](void* p, const std::nothrow_t&) OP_THROWNOTHING { +void operator delete[](void* p, const std::nothrow_t&) __THROW { MallocHook::InvokeDeleteHook(p); do_free(p); } -extern "C" void* memalign(size_t align, size_t size) { +extern "C" void* memalign(size_t align, size_t size) __THROW { void* result = do_memalign(align, size); MallocHook::InvokeNewHook(result, size); return result; } -extern "C" int posix_memalign(void** result_ptr, size_t align, size_t size) { +extern "C" int posix_memalign(void** result_ptr, size_t align, size_t size) + __THROW { if (((align % sizeof(void*)) != 0) || ((align & (align - 1)) != 0) || (align == 0)) { @@ -2518,7 +2969,7 @@ extern "C" int posix_memalign(void** result_ptr, size_t align, size_t size) { static size_t pagesize = 0; -extern "C" void* valloc(size_t size) { +extern "C" void* valloc(size_t size) __THROW { // Allocate page-aligned object of length >= size bytes if (pagesize == 0) pagesize = getpagesize(); void* result = do_memalign(pagesize, size); @@ -2526,7 +2977,7 @@ extern "C" void* valloc(size_t size) { return result; } -extern "C" void* pvalloc(size_t size) { +extern "C" void* pvalloc(size_t size) __THROW { // Round up size to a multiple of pagesize if (pagesize == 0) pagesize = getpagesize(); size = (size + pagesize - 1) & ~(pagesize - 1); @@ -2536,36 +2987,18 @@ extern 
"C" void* pvalloc(size_t size) { } extern "C" void malloc_stats(void) { - PrintStats(1); + do_malloc_stats(); } extern "C" int mallopt(int cmd, int value) { - return 1; // Indicates error + return do_mallopt(cmd, value); } +#ifdef HAVE_STRUCT_MALLINFO extern "C" struct mallinfo mallinfo(void) { - TCMallocStats stats; - ExtractStats(&stats, NULL); - - // Just some of the fields are filled in. - struct mallinfo info; - memset(&info, 0, sizeof(info)); - - // Unfortunately, the struct contains "int" field, so some of the - // size values will be truncated. - info.arena = static_cast<int>(stats.system_bytes); - info.fsmblks = static_cast<int>(stats.thread_bytes - + stats.central_bytes - + stats.transfer_bytes); - info.fordblks = static_cast<int>(stats.pageheap_bytes); - info.uordblks = static_cast<int>(stats.system_bytes - - stats.thread_bytes - - stats.central_bytes - - stats.transfer_bytes - - stats.pageheap_bytes); - - return info; + return do_mallinfo(); } +#endif //------------------------------------------------------------------- // Some library routines on RedHat 9 allocate memory using malloc() @@ -2611,7 +3044,8 @@ extern "C" { // This function is an exception to the rule of calling MallocHook method // from the stack frame of the allocation function; // heap-checker handles this special case explicitly. -static void *MemalignOverride(size_t align, size_t size, const void *caller) { +static void *MemalignOverride(size_t align, size_t size, const void *caller) + __THROW { void* result = do_memalign(align, size); MallocHook::InvokeNewHook(result, size); return result; |