path: root/src/tcmalloc.cc
author    csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>  2008-02-13 00:55:09 +0000
committer csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>  2008-02-13 00:55:09 +0000
commit    8a0a3101bc6a7d56ac04b278f28bdf3f95b00a3c (patch)
tree      46f871a3160a4023201d72b1b04a9a88e3d88b78 /src/tcmalloc.cc
parent    b43ba444fcd74fa7c3260f6b2494dcbaa3fdb296 (diff)
Tue Feb 12 12:28:32 2008 Google Inc. <opensource@google.com>
	* google-perftools: version 0.95 release
	* Better -- not perfect -- support for linux-ppc (csilvers)
	* Fix race condition in libunwind stacktrace (aruns)
	* Speed up x86 spinlock locking (m3b)
	* Improve heap-checker performance (maxim)
	* Heap checker traverses more ptrs inside heap-alloced objects (maxim)
	* Remove deprecated ProfilerThreadState function (cgd)
	* Update libunwind documentation for statically linked binaries (aruns)

git-svn-id: http://gperftools.googlecode.com/svn/trunk@44 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
Diffstat (limited to 'src/tcmalloc.cc')
-rw-r--r--  src/tcmalloc.cc  225
1 file changed, 128 insertions(+), 97 deletions(-)
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 76435a7..a3f8733 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -277,7 +277,9 @@ static const int add_amount[2] = { 7, 127 + (120 << 7) };
static unsigned char class_array[377];
// Compute index of the class_array[] entry for a given size
-static inline int ClassIndex(size_t s) {
+static inline int ClassIndex(int s) {
+ ASSERT(0 <= s);
+ ASSERT(s <= kMaxSize);
const int i = (s > kMaxSmallSize);
return (s + add_amount[i]) >> shift_amount[i];
}
@@ -381,7 +383,7 @@ static inline size_t SLL_Size(void *head) {
// Setup helper functions.
-static inline int SizeClass(size_t size) {
+static inline int SizeClass(int size) {
return class_array[ClassIndex(size)];
}
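
The two hunks above also tighten the contract: ClassIndex() now takes a plain int and asserts its bounds, which the class_array[] lookup silently relied on before. A minimal standalone sketch of the branch-free computation; add_amount and the 377-entry class_array are mirrored from the hunk, while kMaxSmallSize, kMaxSize and shift_amount are assumptions from context:

    #include <cassert>
    #include <cstdio>

    static const int kMaxSmallSize = 1024;        // assumed
    static const int kMaxSize = 32768;            // assumed
    static const int shift_amount[2] = { 3, 7 };  // assumed: 8- vs 128-byte steps
    static const int add_amount[2] = { 7, 127 + (120 << 7) };

    static inline int ClassIndex(int s) {
      assert(0 <= s && s <= kMaxSize);
      const int i = (s > kMaxSmallSize);  // branch-free selector: 0 or 1
      return (s + add_amount[i]) >> shift_amount[i];
    }

    int main() {
      // Small sizes index in 8-byte steps, large ones in 128-byte steps,
      // and the two ranges meet seamlessly at kMaxSmallSize:
      printf("%d %d %d\n", ClassIndex(1024), ClassIndex(1025), ClassIndex(kMaxSize));
      // Prints "128 129 376" -- the largest index fits class_array[377].
      return 0;
    }
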
@@ -780,14 +782,14 @@ static StackTrace* growth_stacks = NULL;
template <int BITS> class MapSelector {
public:
typedef TCMalloc_PageMap3<BITS-kPageShift> Type;
- typedef PackedCache<BITS, uint64> CacheType;
+ typedef PackedCache<BITS-kPageShift, uint64_t> CacheType;
};
// A two-level map for 32-bit machines
template <> class MapSelector<32> {
public:
typedef TCMalloc_PageMap2<32-kPageShift> Type;
- typedef PackedCache<32-kPageShift, uint16> CacheType;
+ typedef PackedCache<32-kPageShift, uint16_t> CacheType;
};
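
For context, here is a compilable sketch of the selection pattern this hunk fixes, with the tcmalloc templates reduced to empty stand-ins (shapes assumed). The substance of the change: the generic 64-bit case now keys its PackedCache on page numbers (BITS - kPageShift bits) like the 32-bit specialization always did, and spells the value type with the standard uint64_t:

    #include <cstdint>

    static const int kPageShift = 12;  // assumed 4 KiB pages

    template <int BITS, typename V> struct PackedCache {};
    template <int BITS> struct TCMalloc_PageMap2 {};
    template <int BITS> struct TCMalloc_PageMap3 {};

    // Primary template covers 64-bit builds.
    template <int BITS> class MapSelector {
     public:
      typedef TCMalloc_PageMap3<BITS - kPageShift> Type;
      typedef PackedCache<BITS - kPageShift, uint64_t> CacheType;
    };

    // 32-bit machines get a two-level map and a narrower cache value.
    template <> class MapSelector<32> {
     public:
      typedef TCMalloc_PageMap2<32 - kPageShift> Type;
      typedef PackedCache<32 - kPageShift, uint16_t> CacheType;
    };

    // The address width picks both types at compile time.
    typedef MapSelector<8 * sizeof(void*)>::CacheType PageCache;
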
// -------------------------------------------------------------------------
@@ -893,10 +895,11 @@ class TCMalloc_PageHeap {
// Remove span from its free list, and move any leftover part of
// span into appropriate free lists. Also update "span" to have
// length exactly "n" and mark it as non-free so it can be returned
- // to the client.
+ // to the client. After all that, decrease free_pages_ by n and
+ // return span.
//
// "released" is true iff "span" was found on a "returned" list.
- void Carve(Span* span, Length n, bool released);
+ Span* Carve(Span* span, Length n, bool released);
void RecordSpan(Span* span) {
pagemap_.set(span->start, span);
@@ -943,25 +946,19 @@ Span* TCMalloc_PageHeap::New(Length n) {
// Find first size >= n that has a non-empty list
for (Length s = n; s < kMaxPages; s++) {
- Span* ll = NULL;
+ Span* ll = &free_[s].normal;
bool released = false;
- if (!DLL_IsEmpty(&free_[s].normal)) {
- // Found normal span
- ll = &free_[s].normal;
- } else if (!DLL_IsEmpty(&free_[s].returned)) {
- // Found returned span; reallocate it
+ // If we're lucky, ll is non-empty, meaning it has a suitable span.
+ if (DLL_IsEmpty(ll)) {
+ // Alternatively, maybe there's a usable returned span.
ll = &free_[s].returned;
released = true;
- } else {
- // Keep looking in larger classes
- continue;
+ if (DLL_IsEmpty(ll)) {
+ // Still no luck, so keep looking in larger classes.
+ continue;
+ }
}
-
- Span* result = ll->next;
- Carve(result, n, released);
- ASSERT(Check());
- free_pages_ -= n;
- return result;
+ return Carve(ll->next, n, released);
}
Span* result = AllocLarge(n);
@@ -1010,13 +1007,7 @@ Span* TCMalloc_PageHeap::AllocLarge(Length n) {
}
}
- if (best != NULL) {
- Carve(best, n, from_released);
- ASSERT(Check());
- free_pages_ -= n;
- return best;
- }
- return NULL;
+ return best == NULL ? NULL : Carve(best, n, from_released);
}
Span* TCMalloc_PageHeap::Split(Span* span, Length n) {
@@ -1036,7 +1027,7 @@ Span* TCMalloc_PageHeap::Split(Span* span, Length n) {
return leftover;
}
-void TCMalloc_PageHeap::Carve(Span* span, Length n, bool released) {
+Span* TCMalloc_PageHeap::Carve(Span* span, Length n, bool released) {
ASSERT(n > 0);
DLL_Remove(span);
span->free = 0;
@@ -1058,6 +1049,9 @@ void TCMalloc_PageHeap::Carve(Span* span, Length n, bool released) {
span->length = n;
pagemap_.set(span->start + n - 1, span);
}
+ ASSERT(Check());
+ free_pages_ -= n;
+ return span;
}
void TCMalloc_PageHeap::Delete(Span* span) {
@@ -1357,8 +1351,17 @@ void TCMalloc_PageHeap::ReleaseFreePages() {
class TCMalloc_ThreadCache_FreeList {
private:
void* list_; // Linked list of nodes
- uint16_t length_; // Current length
- uint16_t lowater_; // Low water mark for list length
+
+#ifdef _LP64
+ // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
+ // Since it won't cost any space, let's make these fields 32 bits each.
+ uint32_t length_; // Current length
+ uint32_t lowater_; // Low water mark for list length
+#else
+ // If we aren't using 64-bit pointers then pack these into less space.
+ uint16_t length_;
+ uint16_t lowater_;
+#endif
public:
void Init() {
@@ -1368,7 +1371,7 @@ class TCMalloc_ThreadCache_FreeList {
}
// Return current length of list
- int length() const {
+ size_t length() const {
return length_;
}
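
The "won't cost any space" claim is easy to check with a standalone sketch (field names mirror the hunk; the surrounding class is reduced to its data members):

    #include <cstdint>
    #include <cstdio>

    struct Narrow { void* list_; uint16_t length_; uint16_t lowater_; };
    struct Wide   { void* list_; uint32_t length_; uint32_t lowater_; };

    int main() {
      // On an LP64 target both print 16: the 8-byte pointer's alignment
      // pads Narrow's tail anyway, so the wider counters are free. On
      // ILP32, Narrow really is smaller (8 vs. 12 bytes) -- hence #else.
      printf("sizeof(Narrow)=%zu sizeof(Wide)=%zu\n",
             sizeof(Narrow), sizeof(Wide));
      return 0;
    }
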
@@ -1414,14 +1417,19 @@ class TCMalloc_ThreadCache {
private:
typedef TCMalloc_ThreadCache_FreeList FreeList;
- size_t size_; // Combined size of data
- pthread_t tid_; // Which thread owns it
- bool in_setspecific_; // In call to pthread_setspecific?
- FreeList list_[kNumClasses]; // Array indexed by size-class
+ // Warning: the offset of list_ affects performance. On general
+ // principles, we don't like list_[x] to span multiple L1 cache
+ // lines. However, merely placing list_ at offset 0 here seems to
+ // cause cache conflicts.
// We sample allocations, biased by the size of the allocation
- uint32_t rnd_; // Cheap random number generator
size_t bytes_until_sample_; // Bytes until we sample next
+ uint32_t rnd_; // Cheap random number generator
+
+ size_t size_; // Combined size of data
+ pthread_t tid_; // Which thread owns it
+ FreeList list_[kNumClasses]; // Array indexed by size-class
+ bool in_setspecific_; // In call to pthread_setspecific?
// Allocate a new heap. REQUIRES: pageheap_lock is held.
static inline TCMalloc_ThreadCache* NewHeap(pthread_t tid);
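
The layout warning above is a claim about offsets, so here is a hypothetical way to inspect it: a reduced stand-in with the same field order, examined with offsetof (kNumClasses and the exact field widths are assumptions):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <pthread.h>

    static const int kNumClasses = 88;  // assumed; config-dependent in reality

    struct FreeList { void* list_; uint32_t length_; uint32_t lowater_; };

    struct ThreadCache {
      size_t bytes_until_sample_;
      uint32_t rnd_;
      size_t size_;
      pthread_t tid_;
      FreeList list_[kNumClasses];
      bool in_setspecific_;
    };

    int main() {
      // The concern is where list_[x] falls relative to 64-byte L1 lines:
      // with 16-byte elements, a 16-byte-aligned array start keeps each
      // element inside a single line.
      printf("offsetof(list_) = %zu\n", offsetof(ThreadCache, list_));
      return 0;
    }
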
@@ -1445,8 +1453,13 @@ class TCMalloc_ThreadCache {
void* Allocate(size_t size);
void Deallocate(void* ptr, size_t size_class);
- void FetchFromCentralCache(size_t cl);
- void ReleaseToCentralCache(size_t cl, int N);
+ // Gets and returns an object from the central cache, and, if possible,
+ // also adds some objects of that size class to this thread cache.
+ void* FetchFromCentralCache(size_t cl, size_t byte_size);
+
+ // Releases N items from this thread cache. Returns size_.
+ size_t ReleaseToCentralCache(FreeList* src, size_t cl, int N);
+
void Scavenge();
void Print() const;
@@ -1479,11 +1492,11 @@ class TCMalloc_Central_FreeList {
// These methods all do internal locking.
// Insert the specified range into the central freelist. N is the number of
- // elements in the range.
+ // elements in the range. RemoveRange() is the opposite operation.
void InsertRange(void *start, void *end, int N);
- // Returns the actual number of fetched elements into N.
- void RemoveRange(void **start, void **end, int *N);
+ // Returns the actual number of fetched elements and sets *start and *end.
+ int RemoveRange(void **start, void **end, int N);
// Returns the number of free objects in cache.
int length() {
@@ -1542,7 +1555,7 @@ class TCMalloc_Central_FreeList {
// spans if it allows it to shrink the cache. Return false if it failed to
// shrink the cache. Decrements cache_size_ on success.
// May temporarily take lock_. If it takes lock_, the locked_size_class
- // lock is released to the thread from holding two size class locks
+ // lock is released to keep the thread from holding two size class locks
// concurrently which could lead to a deadlock.
bool ShrinkCache(int locked_size_class, bool force);
@@ -1780,41 +1793,39 @@ void TCMalloc_Central_FreeList::InsertRange(void *start, void *end, int N) {
ReleaseListToSpans(start);
}
-void TCMalloc_Central_FreeList::RemoveRange(void **start, void **end, int *N) {
- int num = *N;
- ASSERT(num > 0);
-
- SpinLockHolder h(&lock_);
- if (num == num_objects_to_move[size_class_] && used_slots_ > 0) {
+int TCMalloc_Central_FreeList::RemoveRange(void **start, void **end, int N) {
+ ASSERT(N > 0);
+ lock_.Lock();
+ if (N == num_objects_to_move[size_class_] && used_slots_ > 0) {
int slot = --used_slots_;
ASSERT(slot >= 0);
TCEntry *entry = &tc_slots_[slot];
*start = entry->head;
*end = entry->tail;
- return;
+ lock_.Unlock();
+ return N;
}
+ int result = 0;
+ void* head = NULL;
+ void* tail = NULL;
// TODO: Prefetch multiple TCEntries?
- void *tail = FetchFromSpansSafe();
- if (!tail) {
- // We are completely out of memory.
- *start = *end = NULL;
- *N = 0;
- return;
- }
-
- SLL_SetNext(tail, NULL);
- void *head = tail;
- int count = 1;
- while (count < num) {
- void *t = FetchFromSpans();
- if (!t) break;
- SLL_Push(&head, t);
- count++;
+ tail = FetchFromSpansSafe();
+ if (tail != NULL) {
+ SLL_SetNext(tail, NULL);
+ head = tail;
+ result = 1;
+ while (result < N) {
+ void *t = FetchFromSpans();
+ if (!t) break;
+ SLL_Push(&head, t);
+ result++;
+ }
}
+ lock_.Unlock();
*start = head;
*end = tail;
- *N = count;
+ return result;
}
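
A toy model of the new contract and the transfer-cache fast path: the request count goes in by value, the fetched count comes back as the return value, and *start/*end bracket the chain (both NULL when nothing was fetched). The reduced structure and kTransferSize are assumptions standing in for tc_slots_ and num_objects_to_move[cl]:

    #include <cassert>
    #include <cstddef>

    struct TCEntry { void* head; void* tail; };

    static const int kTransferSize = 32;  // stand-in for num_objects_to_move[cl]
    static TCEntry tc_slots_[64];
    static int used_slots_ = 0;

    int RemoveRange(void** start, void** end, int N) {
      // Fast path: a prepackaged chain of exactly N objects is handed
      // over in O(1), without touching any spans.
      if (N == kTransferSize && used_slots_ > 0) {
        TCEntry* entry = &tc_slots_[--used_slots_];
        *start = entry->head;
        *end = entry->tail;
        return N;
      }
      // Slow path (elided in this sketch): fetch up to N from the spans.
      *start = *end = NULL;
      return 0;
    }

    int main() {
      void* start;
      void* end;
      int fetched = RemoveRange(&start, &end, kTransferSize);
      assert(fetched == 0 && start == NULL && end == NULL);  // cache empty
      return 0;
    }
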
@@ -1929,7 +1940,7 @@ void TCMalloc_ThreadCache::Cleanup() {
// Put unused memory back into central cache
for (int cl = 0; cl < kNumClasses; ++cl) {
if (list_[cl].length() > 0) {
- ReleaseToCentralCache(cl, list_[cl].length());
+ ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
}
}
}
@@ -1937,41 +1948,55 @@ void TCMalloc_ThreadCache::Cleanup() {
inline void* TCMalloc_ThreadCache::Allocate(size_t size) {
ASSERT(size <= kMaxSize);
const size_t cl = SizeClass(size);
+ const size_t alloc_size = ByteSizeForClass(cl);
FreeList* list = &list_[cl];
if (list->empty()) {
- FetchFromCentralCache(cl);
- if (list->empty()) return NULL;
+ return FetchFromCentralCache(cl, alloc_size);
}
- size_ -= ByteSizeForClass(cl);
+ size_ -= alloc_size;
return list->Pop();
}
inline void TCMalloc_ThreadCache::Deallocate(void* ptr, size_t cl) {
- size_ += ByteSizeForClass(cl);
FreeList* list = &list_[cl];
+ ssize_t list_headroom =
+ static_cast<ssize_t>(kMaxFreeListLength - 1) - list->length();
+ size_ += ByteSizeForClass(cl);
+ size_t cache_size = size_;
+ ssize_t size_headroom = per_thread_cache_size - cache_size - 1;
list->Push(ptr);
- // If enough data is free, put back into central cache
- if (list->length() > kMaxFreeListLength) {
- ReleaseToCentralCache(cl, num_objects_to_move[cl]);
+
+ // There are two relatively uncommon things that require further work.
+ // In the common case we're done, and in that case we need a single branch
+ // because of the bitwise-or trick that follows.
+ if ((list_headroom | size_headroom) < 0) {
+ if (list_headroom < 0) {
+ cache_size = ReleaseToCentralCache(list, cl, num_objects_to_move[cl]);
+ }
+ if (cache_size >= per_thread_cache_size) Scavenge();
}
- if (size_ >= per_thread_cache_size) Scavenge();
}
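
The single branch promised in the comment comes from the sign bit: for two's-complement signed integers, (a | b) < 0 exactly when a < 0 or b < 0, because OR preserves the sign bit. A minimal sketch of the test:

    #include <cassert>

    static bool needs_slow_path(long list_headroom, long size_headroom) {
      // Negative iff at least one headroom is negative; one cheap test
      // guards both rare conditions, and the slow path re-tests each.
      return (list_headroom | size_headroom) < 0;
    }

    int main() {
      assert(needs_slow_path(-1, 100));  // free list over kMaxFreeListLength
      assert(needs_slow_path(100, -1));  // cache over per_thread_cache_size
      assert(!needs_slow_path(0, 0));    // both exactly at the limit: fast path
      return 0;
    }
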
-// Remove some objects of class "cl" from central cache and add to thread heap
-void TCMalloc_ThreadCache::FetchFromCentralCache(size_t cl) {
- int fetch_count = num_objects_to_move[cl];
+// Remove some objects of class "cl" from central cache and add to thread heap.
+// On success, return the first object for immediate use; otherwise return NULL.
+void* TCMalloc_ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
void *start, *end;
- central_cache[cl].RemoveRange(&start, &end, &fetch_count);
- list_[cl].PushRange(fetch_count, start, end);
- size_ += ByteSizeForClass(cl) * fetch_count;
+ int fetch_count = central_cache[cl].RemoveRange(&start, &end,
+ num_objects_to_move[cl]);
+ ASSERT((start == NULL) == (fetch_count == 0));
+ if (--fetch_count >= 0) {
+ size_ += byte_size * fetch_count;
+ list_[cl].PushRange(fetch_count, SLL_Next(start), end);
+ }
+ return start;
}
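
The "--fetch_count >= 0" line does double duty, and a toy model (an assumed simplification, not the real helpers) makes both cases explicit: the head object goes straight back to the caller, and only the remaining fetch_count - 1 objects, starting at SLL_Next(start), are banked in the thread cache:

    #include <cassert>
    #include <cstddef>

    void* hand_off(void* start, int fetch_count, int* banked_out) {
      *banked_out = 0;
      if (--fetch_count >= 0) {      // -1 when nothing was fetched: skip
        *banked_out = fetch_count;   // objects pushed onto the free list
      }
      return start;                  // first object (or NULL) for the caller
    }

    int main() {
      int banked, dummy;
      assert(hand_off(NULL, 0, &banked) == NULL && banked == 0);
      assert(hand_off(&dummy, 4, &banked) == &dummy && banked == 3);
      return 0;
    }
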
// Remove some objects of class "cl" from thread heap and add to central cache
-void TCMalloc_ThreadCache::ReleaseToCentralCache(size_t cl, int N) {
- ASSERT(N > 0);
- FreeList* src = &list_[cl];
+size_t TCMalloc_ThreadCache::ReleaseToCentralCache(FreeList* src,
+ size_t cl, int N) {
+ ASSERT(src == &list_[cl]);
if (N > src->length()) N = src->length();
- size_ -= N*ByteSizeForClass(cl);
+ size_t delta_bytes = N * ByteSizeForClass(cl);
// We return prepackaged chains of the correct size to the central cache.
// TODO: Use the same format internally in the thread caches?
@@ -1985,6 +2010,7 @@ void TCMalloc_ThreadCache::ReleaseToCentralCache(size_t cl, int N) {
void *tail, *head;
src->PopRange(N, &head, &tail);
central_cache[cl].InsertRange(head, tail, N);
+ return size_ -= delta_bytes;
}
// Release idle memory to the central cache
@@ -2002,7 +2028,7 @@ void TCMalloc_ThreadCache::Scavenge() {
const int lowmark = list->lowwatermark();
if (lowmark > 0) {
const int drop = (lowmark > 1) ? lowmark/2 : 1;
- ReleaseToCentralCache(cl, drop);
+ ReleaseToCentralCache(list, cl, drop);
}
list->clear_lowwatermark();
}
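
The drop heuristic above rewards a one-line model: everything that sat unused since the last scavenge (the low-water mark) is roughly halved away, at least one object at a time, so an idle list drains geometrically over successive passes:

    #include <cassert>

    static int objects_to_drop(int lowmark) {
      return (lowmark > 1) ? lowmark / 2 : 1;
    }

    int main() {
      // An idle list whose low-water mark stays at its length drains
      // 32, 16, 8, ... objects per pass until it is empty.
      assert(objects_to_drop(64) == 32);
      assert(objects_to_drop(1) == 1);
      return 0;
    }
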
@@ -2249,7 +2275,7 @@ void TCMalloc_ThreadCache::RecomputeThreadCacheSize() {
void TCMalloc_ThreadCache::Print() const {
for (int cl = 0; cl < kNumClasses; ++cl) {
- MESSAGE(" %5" PRIuS " : %4d len; %4d lo\n",
+ MESSAGE(" %5" PRIuS " : %4" PRIuS " len; %4d lo\n",
ByteSizeForClass(cl),
list_[cl].length(),
list_[cl].lowwatermark());
@@ -2642,6 +2668,16 @@ static inline void* SpanToMallocResult(Span *span) {
CheckedMallocResult(reinterpret_cast<void*>(span->start << kPageShift));
}
+// Helper for do_malloc().
+static inline void* do_malloc_pages(Length num_pages) {
+ Span *span;
+ {
+ SpinLockHolder h(&pageheap_lock);
+ span = pageheap->New(num_pages);
+ }
+ return span == NULL ? NULL : SpanToMallocResult(span);
+}
+
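
The inner braces in do_malloc_pages() are deliberate: they end the SpinLockHolder's lifetime right after pageheap->New(), so SpanToMallocResult() runs with the page-heap lock already released. A generic sketch of the scoping pattern, with standard-library stand-ins for the tcmalloc lock types:

    #include <cstddef>
    #include <mutex>

    std::mutex pageheap_lock;  // stand-in for tcmalloc's SpinLock

    void* allocate_pages(std::size_t num_pages) {
      void* span;
      {
        std::lock_guard<std::mutex> h(pageheap_lock);  // held only here
        span = ::operator new(num_pages * 4096);       // pageheap->New() stand-in
      }
      return span;  // post-processing happens unlocked
    }
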
static inline void* do_malloc(size_t size) {
void* ret = NULL;
@@ -2652,17 +2688,12 @@ static inline void* do_malloc(size_t size) {
if (span != NULL) {
ret = SpanToMallocResult(span);
}
- } else if (size > kMaxSize) {
- // Use page-level allocator
- SpinLockHolder h(&pageheap_lock);
- Span* span = pageheap->New(pages(size));
- if (span != NULL) {
- ret = SpanToMallocResult(span);
- }
- } else {
+ } else if (size <= kMaxSize) {
// The common case, and also the simplest. This just pops the
- // size-appropriate freelist, afer replenishing it if it's empty.
+ // size-appropriate freelist, after replenishing it if it's empty.
ret = CheckedMallocResult(heap->Allocate(size));
+ } else {
+ ret = do_malloc_pages(pages(size));
}
if (ret == NULL) errno = ENOMEM;
return ret;