author     csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>  2008-02-13 00:55:09 +0000
committer  csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>  2008-02-13 00:55:09 +0000
commit     8a0a3101bc6a7d56ac04b278f28bdf3f95b00a3c (patch)
tree       46f871a3160a4023201d72b1b04a9a88e3d88b78 /src/tcmalloc.cc
parent     b43ba444fcd74fa7c3260f6b2494dcbaa3fdb296 (diff)
download   gperftools-8a0a3101bc6a7d56ac04b278f28bdf3f95b00a3c.tar.gz
Tue Feb 12 12:28:32 2008 Google Inc. <opensource@google.com>
* google-perftools: version 0.95 release
* Better -- not perfect -- support for linux-ppc (csilvers)
* Fix race condition in libunwind stacktrace (aruns)
* Speed up x86 spinlock locking (m3b)
* Improve heap-checker performance (maxim)
* Heap checker traverses more ptrs inside heap-alloced objects (maxim)
* Remove deprecated ProfilerThreadState function (cgd)
* Update libunwind documentation for statically linked binaries (aruns)
git-svn-id: http://gperftools.googlecode.com/svn/trunk@44 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
Diffstat (limited to 'src/tcmalloc.cc')
-rw-r--r--  src/tcmalloc.cc  225
1 file changed, 128 insertions, 97 deletions
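
Before the diff: the first hunk below adds range assertions to ClassIndex(), which maps a requested size to an index into the 377-entry class_array[]. For reference, here is a minimal standalone sketch of that mapping; add_amount[] is copied from the hunk itself, while kMaxSmallSize, kMaxSize, and shift_amount[] are assumed values, chosen only because they are consistent with a 377-entry table.

#include <cassert>

// Sketch of the size -> class_array[] index mapping from the first hunk.
// add_amount[] is copied from the diff; kMaxSmallSize, kMaxSize, and
// shift_amount[] are assumptions consistent with class_array[377].
static const int kMaxSmallSize = 1024;
static const int kMaxSize = 32768;
static const int shift_amount[2] = { 3, 7 };  // 8-byte, then 128-byte steps
static const int add_amount[2] = { 7, 127 + (120 << 7) };

static inline int ClassIndex(int s) {
  assert(0 <= s && s <= kMaxSize);
  const int i = (s > kMaxSmallSize);
  return (s + add_amount[i]) >> shift_amount[i];
}

int main() {
  assert(ClassIndex(1) == 1);           // 1..8 byte requests share index 1
  assert(ClassIndex(1024) == 128);      // last "small" index, 8-byte steps
  assert(ClassIndex(1025) == 129);      // first "large" index, 128-byte steps
  assert(ClassIndex(kMaxSize) == 376);  // last valid index of class_array[377]
  return 0;
}

The two-granularity scheme keeps the table small: sizes up to 1024 bytes map in 8-byte steps (indices 0 through 128), and larger sizes map in 128-byte steps (indices 129 through 376).
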
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 76435a7..a3f8733 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -277,7 +277,9 @@ static const int add_amount[2] = { 7, 127 + (120 << 7) };
 static unsigned char class_array[377];
 
 // Compute index of the class_array[] entry for a given size
-static inline int ClassIndex(size_t s) {
+static inline int ClassIndex(int s) {
+  ASSERT(0 <= s);
+  ASSERT(s <= kMaxSize);
   const int i = (s > kMaxSmallSize);
   return (s + add_amount[i]) >> shift_amount[i];
 }
@@ -381,7 +383,7 @@ static inline size_t SLL_Size(void *head) {
 
 // Setup helper functions.
 
-static inline int SizeClass(size_t size) {
+static inline int SizeClass(int size) {
   return class_array[ClassIndex(size)];
 }
 
@@ -780,14 +782,14 @@ static StackTrace* growth_stacks = NULL;
 
 template <int BITS> class MapSelector {
  public:
   typedef TCMalloc_PageMap3<BITS-kPageShift> Type;
-  typedef PackedCache<BITS, uint64> CacheType;
+  typedef PackedCache<BITS-kPageShift, uint64_t> CacheType;
 };
 
 // A two-level map for 32-bit machines
 template <> class MapSelector<32> {
  public:
   typedef TCMalloc_PageMap2<32-kPageShift> Type;
-  typedef PackedCache<32-kPageShift, uint16> CacheType;
+  typedef PackedCache<32-kPageShift, uint16_t> CacheType;
 };
 
@@ -893,10 +895,11 @@ class TCMalloc_PageHeap {
   // Remove span from its free list, and move any leftover part of
   // span into appropriate free lists.  Also update "span" to have
   // length exactly "n" and mark it as non-free so it can be returned
-  // to the client.
+  // to the client.  After all that, decrease free_pages_ by n and
+  // return span.
   //
   // "released" is true iff "span" was found on a "returned" list.
-  void Carve(Span* span, Length n, bool released);
+  Span* Carve(Span* span, Length n, bool released);
 
   void RecordSpan(Span* span) {
     pagemap_.set(span->start, span);
@@ -943,25 +946,19 @@ Span* TCMalloc_PageHeap::New(Length n) {
 
   // Find first size >= n that has a non-empty list
   for (Length s = n; s < kMaxPages; s++) {
-    Span* ll = NULL;
+    Span* ll = &free_[s].normal;
     bool released = false;
-    if (!DLL_IsEmpty(&free_[s].normal)) {
-      // Found normal span
-      ll = &free_[s].normal;
-    } else if (!DLL_IsEmpty(&free_[s].returned)) {
-      // Found returned span; reallocate it
+    // If we're lucky, ll is non-empty, meaning it has a suitable span.
+    if (DLL_IsEmpty(ll)) {
+      // Alternatively, maybe there's a usable returned span.
       ll = &free_[s].returned;
       released = true;
-    } else {
-      // Keep looking in larger classes
-      continue;
+      if (DLL_IsEmpty(ll)) {
+        // Still no luck, so keep looking in larger classes.
+        continue;
+      }
     }
-
-    Span* result = ll->next;
-    Carve(result, n, released);
-    ASSERT(Check());
-    free_pages_ -= n;
-    return result;
+    return Carve(ll->next, n, released);
   }
 
   Span* result = AllocLarge(n);
@@ -1010,13 +1007,7 @@ Span* TCMalloc_PageHeap::AllocLarge(Length n) {
     }
   }
 
-  if (best != NULL) {
-    Carve(best, n, from_released);
-    ASSERT(Check());
-    free_pages_ -= n;
-    return best;
-  }
-  return NULL;
+  return best == NULL ? NULL : Carve(best, n, from_released);
 }
 
 Span* TCMalloc_PageHeap::Split(Span* span, Length n) {
@@ -1036,7 +1027,7 @@
   return leftover;
 }
 
-void TCMalloc_PageHeap::Carve(Span* span, Length n, bool released) {
+Span* TCMalloc_PageHeap::Carve(Span* span, Length n, bool released) {
   ASSERT(n > 0);
   DLL_Remove(span);
   span->free = 0;
@@ -1058,6 +1049,9 @@
     span->length = n;
     pagemap_.set(span->start + n - 1, span);
   }
+  ASSERT(Check());
+  free_pages_ -= n;
+  return span;
 }
 
 void TCMalloc_PageHeap::Delete(Span* span) {
@@ -1357,8 +1351,17 @@ void TCMalloc_PageHeap::ReleaseFreePages() {
 class TCMalloc_ThreadCache_FreeList {
  private:
   void*    list_;       // Linked list of nodes
-  uint16_t length_;     // Current length
-  uint16_t lowater_;    // Low water mark for list length
+
+#ifdef _LP64
+  // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
+  // Since it won't cost any space, let's make these fields 32 bits each.
+  uint32_t length_;     // Current length
+  uint32_t lowater_;    // Low water mark for list length
+#else
+  // If we aren't using 64-bit pointers then pack these into less space.
+  uint16_t length_;
+  uint16_t lowater_;
+#endif
 
  public:
   void Init() {
@@ -1368,7 +1371,7 @@
   }
 
   // Return current length of list
-  int length() const {
+  size_t length() const {
     return length_;
   }
 
@@ -1414,14 +1417,19 @@ class TCMalloc_ThreadCache {
  private:
   typedef TCMalloc_ThreadCache_FreeList FreeList;
 
-  size_t        size_;                  // Combined size of data
-  pthread_t     tid_;                   // Which thread owns it
-  bool          in_setspecific_;        // In call to pthread_setspecific?
-  FreeList      list_[kNumClasses];     // Array indexed by size-class
+  // Warning: the offset of list_ affects performance.  On general
+  // principles, we don't like list_[x] to span multiple L1 cache
+  // lines.  However, merely placing list_ at offset 0 here seems to
+  // cause cache conflicts.
 
   // We sample allocations, biased by the size of the allocation
-  uint32_t      rnd_;                   // Cheap random number generator
   size_t        bytes_until_sample_;    // Bytes until we sample next
+  uint32_t      rnd_;                   // Cheap random number generator
+
+  size_t        size_;                  // Combined size of data
+  pthread_t     tid_;                   // Which thread owns it
+  FreeList      list_[kNumClasses];     // Array indexed by size-class
+  bool          in_setspecific_;        // In call to pthread_setspecific?
 
   // Allocate a new heap.  REQUIRES: pageheap_lock is held.
   static inline TCMalloc_ThreadCache* NewHeap(pthread_t tid);
@@ -1445,8 +1453,13 @@
   void* Allocate(size_t size);
   void Deallocate(void* ptr, size_t size_class);
 
-  void FetchFromCentralCache(size_t cl);
-  void ReleaseToCentralCache(size_t cl, int N);
+  // Gets and returns an object from the central cache, and, if possible,
+  // also adds some objects of that size class to this thread cache.
+  void* FetchFromCentralCache(size_t cl, size_t byte_size);
+
+  // Releases N items from this thread cache.  Returns size_.
+  size_t ReleaseToCentralCache(FreeList* src, size_t cl, int N);
+
   void Scavenge();
   void Print() const;
 
@@ -1479,11 +1492,11 @@ class TCMalloc_Central_FreeList {
   // These methods all do internal locking.
 
   // Insert the specified range into the central freelist.  N is the number of
-  // elements in the range.
+  // elements in the range.  RemoveRange() is the opposite operation.
   void InsertRange(void *start, void *end, int N);
 
-  // Returns the actual number of fetched elements into N.
-  void RemoveRange(void **start, void **end, int *N);
+  // Returns the actual number of fetched elements and sets *start and *end.
+  int RemoveRange(void **start, void **end, int N);
 
   // Returns the number of free objects in cache.
   int length() {
@@ -1542,7 +1555,7 @@
   // spans if it allows it to shrink the cache.  Return false if it failed to
   // shrink the cache.  Decrements cache_size_ on succeess.
   // May temporarily take lock_.  If it takes lock_, the locked_size_class
-  // lock is released to the thread from holding two size class locks
+  // lock is released to keep the thread from holding two size class locks
   // concurrently which could lead to a deadlock.
   bool ShrinkCache(int locked_size_class, bool force);
 
@@ -1780,41 +1793,39 @@ void TCMalloc_Central_FreeList::InsertRange(void *start, void *end, int N) {
   ReleaseListToSpans(start);
 }
 
-void TCMalloc_Central_FreeList::RemoveRange(void **start, void **end, int *N) {
-  int num = *N;
-  ASSERT(num > 0);
-
-  SpinLockHolder h(&lock_);
-  if (num == num_objects_to_move[size_class_] && used_slots_ > 0) {
+int TCMalloc_Central_FreeList::RemoveRange(void **start, void **end, int N) {
+  ASSERT(N > 0);
+  lock_.Lock();
+  if (N == num_objects_to_move[size_class_] && used_slots_ > 0) {
     int slot = --used_slots_;
     ASSERT(slot >= 0);
     TCEntry *entry = &tc_slots_[slot];
     *start = entry->head;
     *end = entry->tail;
-    return;
+    lock_.Unlock();
+    return N;
   }
 
+  int result = 0;
+  void* head = NULL;
+  void* tail = NULL;
   // TODO: Prefetch multiple TCEntries?
-  void *tail = FetchFromSpansSafe();
-  if (!tail) {
-    // We are completely out of memory.
-    *start = *end = NULL;
-    *N = 0;
-    return;
-  }
-
-  SLL_SetNext(tail, NULL);
-  void *head = tail;
-  int count = 1;
-  while (count < num) {
-    void *t = FetchFromSpans();
-    if (!t) break;
-    SLL_Push(&head, t);
-    count++;
+  tail = FetchFromSpansSafe();
+  if (tail != NULL) {
+    SLL_SetNext(tail, NULL);
+    head = tail;
+    result = 1;
+    while (result < N) {
+      void *t = FetchFromSpans();
+      if (!t) break;
+      SLL_Push(&head, t);
+      result++;
    }
  }
+  lock_.Unlock();
   *start = head;
   *end = tail;
-  *N = count;
+  return result;
 }
 
@@ -1929,7 +1940,7 @@ void TCMalloc_ThreadCache::Cleanup() {
   // Put unused memory back into central cache
   for (int cl = 0; cl < kNumClasses; ++cl) {
     if (list_[cl].length() > 0) {
-      ReleaseToCentralCache(cl, list_[cl].length());
+      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
     }
   }
 }
@@ -1937,41 +1948,55 @@
 inline void* TCMalloc_ThreadCache::Allocate(size_t size) {
   ASSERT(size <= kMaxSize);
   const size_t cl = SizeClass(size);
+  const size_t alloc_size = ByteSizeForClass(cl);
   FreeList* list = &list_[cl];
   if (list->empty()) {
-    FetchFromCentralCache(cl);
-    if (list->empty()) return NULL;
+    return FetchFromCentralCache(cl, alloc_size);
   }
-  size_ -= ByteSizeForClass(cl);
+  size_ -= alloc_size;
   return list->Pop();
 }
 
 inline void TCMalloc_ThreadCache::Deallocate(void* ptr, size_t cl) {
-  size_ += ByteSizeForClass(cl);
   FreeList* list = &list_[cl];
+  ssize_t list_headroom =
+      static_cast<ssize_t>(kMaxFreeListLength - 1) - list->length();
+  size_ += ByteSizeForClass(cl);
+  size_t cache_size = size_;
+  ssize_t size_headroom = per_thread_cache_size - cache_size - 1;
   list->Push(ptr);
-  // If enough data is free, put back into central cache
-  if (list->length() > kMaxFreeListLength) {
-    ReleaseToCentralCache(cl, num_objects_to_move[cl]);
+
+  // There are two relatively uncommon things that require further work.
+  // In the common case we're done, and in that case we need a single branch
+  // because of the bitwise-or trick that follows.
+  if ((list_headroom | size_headroom) < 0) {
+    if (list_headroom < 0) {
+      cache_size = ReleaseToCentralCache(list, cl, num_objects_to_move[cl]);
+    }
+    if (cache_size >= per_thread_cache_size) Scavenge();
   }
-  if (size_ >= per_thread_cache_size) Scavenge();
 }
 
-// Remove some objects of class "cl" from central cache and add to thread heap
-void TCMalloc_ThreadCache::FetchFromCentralCache(size_t cl) {
-  int fetch_count = num_objects_to_move[cl];
+// Remove some objects of class "cl" from central cache and add to thread heap.
+// On success, return the first object for immediate use; otherwise return NULL.
+void* TCMalloc_ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
   void *start, *end;
-  central_cache[cl].RemoveRange(&start, &end, &fetch_count);
-  list_[cl].PushRange(fetch_count, start, end);
-  size_ += ByteSizeForClass(cl) * fetch_count;
+  int fetch_count = central_cache[cl].RemoveRange(&start, &end,
                                                   num_objects_to_move[cl]);
+  ASSERT((start == NULL) == (fetch_count == 0));
+  if (--fetch_count >= 0) {
+    size_ += byte_size * fetch_count;
+    list_[cl].PushRange(fetch_count, SLL_Next(start), end);
+  }
+  return start;
 }
 
 // Remove some objects of class "cl" from thread heap and add to central cache
-void TCMalloc_ThreadCache::ReleaseToCentralCache(size_t cl, int N) {
-  ASSERT(N > 0);
-  FreeList* src = &list_[cl];
+size_t TCMalloc_ThreadCache::ReleaseToCentralCache(FreeList* src,
+                                                   size_t cl, int N) {
+  ASSERT(src == &list_[cl]);
   if (N > src->length()) N = src->length();
-  size_ -= N*ByteSizeForClass(cl);
+  size_t delta_bytes = N * ByteSizeForClass(cl);
 
   // We return prepackaged chains of the correct size to the central cache.
   // TODO: Use the same format internally in the thread caches?
@@ -1985,6 +2010,7 @@ void TCMalloc_ThreadCache::ReleaseToCentralCache(size_t cl, int N) {
   void *tail, *head;
   src->PopRange(N, &head, &tail);
   central_cache[cl].InsertRange(head, tail, N);
+  return size_ -= delta_bytes;
 }
 
 // Release idle memory to the central cache
@@ -2002,7 +2028,7 @@
     const int lowmark = list->lowwatermark();
     if (lowmark > 0) {
       const int drop = (lowmark > 1) ? lowmark/2 : 1;
-      ReleaseToCentralCache(cl, drop);
+      ReleaseToCentralCache(list, cl, drop);
    }
     list->clear_lowwatermark();
  }
@@ -2249,7 +2275,7 @@ void TCMalloc_ThreadCache::RecomputeThreadCacheSize() {
 
 void TCMalloc_ThreadCache::Print() const {
   for (int cl = 0; cl < kNumClasses; ++cl) {
-    MESSAGE("      %5" PRIuS " : %4d len; %4d lo\n",
+    MESSAGE("      %5" PRIuS " : %4" PRIuS " len; %4d lo\n",
             ByteSizeForClass(cl),
             list_[cl].length(),
             list_[cl].lowwatermark());
@@ -2642,6 +2668,16 @@ static inline void* SpanToMallocResult(Span *span) {
     CheckedMallocResult(reinterpret_cast<void*>(span->start << kPageShift));
 }
 
+// Helper for do_malloc().
+static inline void* do_malloc_pages(Length num_pages) {
+  Span *span;
+  {
+    SpinLockHolder h(&pageheap_lock);
+    span = pageheap->New(num_pages);
+  }
+  return span == NULL ? NULL : SpanToMallocResult(span);
+}
+
 static inline void* do_malloc(size_t size) {
   void* ret = NULL;
 
@@ -2652,17 +2688,12 @@ static inline void* do_malloc(size_t size) {
     if (span != NULL) {
       ret = SpanToMallocResult(span);
     }
-  } else if (size > kMaxSize) {
-    // Use page-level allocator
-    SpinLockHolder h(&pageheap_lock);
-    Span* span = pageheap->New(pages(size));
-    if (span != NULL) {
-      ret = SpanToMallocResult(span);
-    }
-  } else {
+  } else if (size <= kMaxSize) {
     // The common case, and also the simplest.  This just pops the
-    // size-appropriate freelist, afer replenishing it if it's empty.
+    // size-appropriate freelist, after replenishing it if it's empty.
     ret = CheckedMallocResult(heap->Allocate(size));
+  } else {
+    ret = do_malloc_pages(pages(size));
   }
   if (ret == NULL) errno = ENOMEM;
   return ret;
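
A note on the new Deallocate() fast path above: it precomputes two signed "headroom" values and tests both with a single branch, relying on the fact that the bitwise-or of two two's-complement integers is negative exactly when at least one of them is. A minimal standalone sketch of the trick (the names are illustrative, not the tcmalloc identifiers):

#include <cassert>

// Demonstration of the "bitwise-or trick" used by the new Deallocate():
// (a | b) < 0 exactly when a < 0 or b < 0, because bitwise-or preserves
// the sign bit of two's-complement integers.  One comparison therefore
// covers both limits in the common case.
static bool needs_slow_path(long list_headroom, long size_headroom) {
  return (list_headroom | size_headroom) < 0;
}

int main() {
  assert(!needs_slow_path(5, 7));   // both limits respected: fast path
  assert(needs_slow_path(-1, 7));   // free list over its length cap
  assert(needs_slow_path(5, -3));   // cache over its byte budget
  assert(needs_slow_path(-1, -3));  // both exceeded
  return 0;
}

The payoff is that the common case -- an object freed while both the per-list cap and the per-thread byte budget still have room -- costs one well-predicted branch instead of two.
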