author     Aliaksey Kandratsenka <alk@tut.by>  2015-08-02 19:28:03 -0700
committer  Aliaksey Kandratsenka <alk@tut.by>  2015-08-02 19:36:27 -0700
commit     54505f1d50c2d1f4676f5e87090b64a117fd980e (patch)
tree       135f4ea4b4c31e809bdbaba6221da5cffb29fd88
parent     73c0c8c61b84e268bafd961bf304b2e4d296142f (diff)
download   gperftools-54505f1d50c2d1f4676f5e87090b64a117fd980e.tar.gz
help clang with inlining important fast-path functions
Clang's recent focus on code size doesn't help us on the malloc fast path, because clang somehow ignores plain inline directives even on the functions where inlining matters most. To get clang's generated code closer to what the original authors intended, we add the always_inline attribute to the key fast-path functions. Clang also guessed a couple of likely branches "wrong"; those places are now annotated with UNLIKELY.
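For reference, a rough, self-contained sketch of the two annotations this change leans on: the forced-inline attribute added below, and the LIKELY/UNLIKELY hints that gperftools already defines elsewhere in the tree (the usual __builtin_expect form is assumed here; the exact definitions are not part of this patch, and the allocator below is hypothetical, not gperftools code):

    #include <cstddef>
    #include <cstdlib>

    #if defined(__GNUC__) || defined(__clang__)
    # define LIKELY(x)   (__builtin_expect(!!(x), 1))   // branch expected taken
    # define UNLIKELY(x) (__builtin_expect(!!(x), 0))   // branch expected not taken
    # define ALWAYS_INLINE inline __attribute__((always_inline))
    #else
    # define LIKELY(x)   (x)
    # define UNLIKELY(x) (x)
    # define ALWAYS_INLINE inline
    #endif

    // Hypothetical fast path: UNLIKELY pushes the cold error branch out of
    // the hot code layout, and ALWAYS_INLINE keeps the helper from being
    // left out-of-line under size-oriented optimization.
    ALWAYS_INLINE void* my_alloc(std::size_t size) {
      void* p = std::malloc(size);
      if (UNLIKELY(p == nullptr)) {
        std::abort();
      }
      return p;
    }

    int main() {
      void* p = my_alloc(64);
      std::free(p);
      return 0;
    }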
-rw-r--r--  src/tcmalloc.cc     36
-rw-r--r--  src/thread_cache.h   4
2 files changed, 26 insertions, 14 deletions
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 3e8e233..b7d1913 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -132,6 +132,17 @@
#include "tcmalloc_guard.h" // for TCMallocGuard
#include "thread_cache.h" // for ThreadCache
+#ifdef __clang__
+// clang's apparent focus on code size somehow causes it to ignore
+// normal inline directives, even for the few functions whose inlining
+// is key for performance. In order to get the performance of clang's
+// generated code closer to normal, we're forcing inlining via this
+// attribute.
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#define ALWAYS_INLINE inline
+#endif
+
#if (defined(_WIN32) && !defined(__CYGWIN__) && !defined(__CYGWIN32__)) && !defined(WIN32_OVERRIDE_ALLOCATORS)
# define WIN32_DO_PATCHING 1
#endif
@@ -1154,7 +1165,7 @@ inline void* do_malloc_pages(ThreadCache* heap, size_t size) {
return result;
}
-inline void* do_malloc_small(ThreadCache* heap, size_t size) {
+ALWAYS_INLINE void* do_malloc_small(ThreadCache* heap, size_t size) {
ASSERT(Static::IsInited());
ASSERT(heap != NULL);
size_t cl = Static::sizemap()->SizeClass(size);
@@ -1169,7 +1180,7 @@ inline void* do_malloc_small(ThreadCache* heap, size_t size) {
}
}
-inline void* do_malloc(size_t size) {
+ALWAYS_INLINE void* do_malloc(size_t size) {
if (ThreadCache::have_tls &&
LIKELY(size < ThreadCache::MinSizeForSlowPath())) {
return do_malloc_small(ThreadCache::GetCacheWhichMustBePresent(), size);
@@ -1184,7 +1195,7 @@ static void *retry_malloc(void* size) {
return do_malloc(reinterpret_cast<size_t>(size));
}
-inline void* do_malloc_or_cpp_alloc(size_t size) {
+ALWAYS_INLINE void* do_malloc_or_cpp_alloc(size_t size) {
void *rv = do_malloc(size);
if (LIKELY(rv != NULL)) {
return rv;
@@ -1193,7 +1204,7 @@ inline void* do_malloc_or_cpp_alloc(size_t size) {
false, true);
}
-inline void* do_calloc(size_t n, size_t elem_size) {
+ALWAYS_INLINE void* do_calloc(size_t n, size_t elem_size) {
// Overflow check
const size_t size = n * elem_size;
if (elem_size != 0 && size / elem_size != n) return NULL;
@@ -1225,10 +1236,10 @@ inline void free_null_or_invalid(void* ptr, void (*invalid_free_fn)(void*)) {
//
// To maximize speed in the common case, we usually get here with
// heap_must_be_valid being a manifest constant equal to true.
-inline void do_free_helper(void* ptr,
- void (*invalid_free_fn)(void*),
- ThreadCache* heap,
- bool heap_must_be_valid) {
+ALWAYS_INLINE void do_free_helper(void* ptr,
+ void (*invalid_free_fn)(void*),
+ ThreadCache* heap,
+ bool heap_must_be_valid) {
ASSERT((Static::IsInited() && heap != NULL) || !heap_must_be_valid);
if (!heap_must_be_valid && !Static::IsInited()) {
// We called free() before malloc(). This can occur if the
@@ -1288,7 +1299,8 @@ inline void do_free_helper(void* ptr,
//
// We can usually detect the case where ptr is not pointing to a page that
// tcmalloc is using, and in those cases we invoke invalid_free_fn.
-inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
+ALWAYS_INLINE void do_free_with_callback(void* ptr,
+ void (*invalid_free_fn)(void*)) {
ThreadCache* heap = NULL;
if (LIKELY(ThreadCache::IsFastPathAllowed())) {
heap = ThreadCache::GetCacheWhichMustBePresent();
@@ -1300,7 +1312,7 @@ inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
}
// The default "do_free" that uses the default callback.
-inline void do_free(void* ptr) {
+ALWAYS_INLINE void do_free(void* ptr) {
return do_free_with_callback(ptr, &InvalidFree);
}
@@ -1329,7 +1341,7 @@ inline size_t GetSizeWithCallback(const void* ptr,
// This lets you call back to a given function pointer if ptr is invalid.
// It is used primarily by windows code which wants a specialized callback.
-inline void* do_realloc_with_callback(
+ALWAYS_INLINE void* do_realloc_with_callback(
void* old_ptr, size_t new_size,
void (*invalid_free_fn)(void*),
size_t (*invalid_get_size_fn)(const void*)) {
@@ -1374,7 +1386,7 @@ inline void* do_realloc_with_callback(
}
}
-inline void* do_realloc(void* old_ptr, size_t new_size) {
+ALWAYS_INLINE void* do_realloc(void* old_ptr, size_t new_size) {
return do_realloc_with_callback(old_ptr, new_size,
&InvalidFree, &InvalidGetSizeForRealloc);
}
diff --git a/src/thread_cache.h b/src/thread_cache.h
index 5edcdfb..81a020e 100644
--- a/src/thread_cache.h
+++ b/src/thread_cache.h
@@ -350,7 +350,7 @@ inline void* ThreadCache::Allocate(size_t size, size_t cl) {
ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));
FreeList* list = &list_[cl];
- if (list->empty()) {
+ if (UNLIKELY(list->empty())) {
return FetchFromCentralCache(cl, size);
}
size_ -= size;
@@ -374,7 +374,7 @@ inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
// There are two relatively uncommon things that require further work.
// In the common case we're done, and in that case we need a single branch
// because of the bitwise-or trick that follows.
- if ((list_headroom | size_headroom) < 0) {
+ if (UNLIKELY((list_headroom | size_headroom) < 0)) {
if (list_headroom < 0) {
ListTooLong(list, cl);
}
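
As an aside on the bitwise-or trick referenced in the Deallocate() comment above: for two's-complement signed integers, (a | b) has its sign bit set exactly when a or b does, so the two "headroom went negative" checks collapse into the single UNLIKELY-annotated branch. A small self-contained illustration (not gperftools code; the real headroom variables are ssize_t):

    #include <cassert>
    #include <cstdint>

    // True when either headroom value has gone negative: one compare
    // instead of two, because the sign bit of the OR is the OR of the
    // sign bits.
    static bool either_negative(std::int32_t list_headroom,
                                std::int32_t size_headroom) {
      return (list_headroom | size_headroom) < 0;
    }

    int main() {
      assert(!either_negative(5, 7));   // both fine             -> fast path
      assert(either_negative(-1, 7));   // free list over limit  -> slow path
      assert(either_negative(5, -3));   // cache size over limit -> slow path
      assert(either_negative(-1, -3));  // both over limit       -> slow path
      return 0;
    }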