From 3014cf142e5a2409c88ab4559f3274434ed9a29b Mon Sep 17 00:00:00 2001 From: csilvers Date: Thu, 18 Nov 2010 01:07:25 +0000 Subject: * Suppress all large allocs when report threshold==0 * Clarified meaning of various malloc stats * Change from ATTRIBUTED_DEPRECATED to comments * Make array-size a var to compile under clang * Reduce page map key size under x86_64 by 4.4MB * Added full qualification to MemoryBarrier * Support systems that capitalize /proc weirdly * Avoid gcc warning: exporting type in unnamed ns * Add some dynamic annotations for gcc attributes * Add support for census profiler in pprof * Speed up pprof's ExtractSymbols * Speed up GoogleOnce * Add pkg-config (.pc) files * Detect when __environ exists but is NULL * Improve spinlock contention performance * Add GetFreeListSizes * Improve sampling_test, eg by adding no-inline * Relax malloc_extension test-check for big pages * Add proper library version number information * Update from autoconf 2.64 to 2.65 * Better document how to write a server that works with pprof * Change FillProcSelfMaps to better handle out-of-space * No longer hook _aligned_malloc/free in windows * Handle function-forwarding in DLLs when patching (in windows) * Update .vcproj files that had wrong .cc files in them (!) * get rid of unnecessary 'size < 0' * fix comments a bit in sysinfo.cc * another go at improving malloc-stats output * fix comment typo in profiler.cc * Add a few more thread annotations * Try to read TSC frequency from 'tsc_freq_khz' * Fix annotalysis/TSAN incompatibility * Add pprof --evince to go along with --gv * Document need for sampling to use GetHeapSample * Fix flakiness in malloc_extension_test * Separate out synchronization profiling routines git-svn-id: http://gperftools.googlecode.com/svn/trunk@99 6b5cf1ce-ec42-a296-1ba9-69fdba395a50 --- src/base/atomicops-internals-arm-gcc.h | 234 +++++++++++++++++++++++++++++++++ src/base/basictypes.h | 2 +- src/base/dynamic_annotations.h | 117 ++++++++++++++++- src/base/logging.h | 5 +- src/base/low_level_alloc.cc | 6 +- src/base/spinlock.cc | 166 +++++++++++++++-------- src/base/spinlock.h | 65 ++++----- src/base/spinlock_internal.cc | 77 +++++++++++ src/base/spinlock_internal.h | 64 +++++++++ src/base/spinlock_linux-inl.h | 52 ++++---- src/base/spinlock_posix-inl.h | 32 +++-- src/base/spinlock_win32-inl.h | 18 ++- src/base/synchronization_profiling.h | 50 +++++++ src/base/sysinfo.cc | 107 ++++++++++----- src/base/sysinfo.h | 2 +- src/base/thread_annotations.h | 4 +- src/base/vdso_support.cc | 2 +- src/common.h | 13 ++ src/config.h.in | 7 + src/debugallocation.cc | 16 ++- src/google/heap-checker.h | 2 +- src/google/malloc_extension.h | 47 ++++++- src/heap-profile-table.cc | 3 +- src/malloc_extension.cc | 12 +- src/memory_region_map.cc | 1 - src/page_heap.cc | 29 ++++ src/page_heap.h | 15 ++- src/pprof | 85 ++++++++---- src/profiler.cc | 2 +- src/system-alloc.cc | 30 ++++- src/tcmalloc.cc | 190 +++++++++++++++++++++----- src/tests/debugallocation_test.cc | 10 +- src/tests/malloc_extension_test.cc | 26 ++++ src/tests/sampling_test.cc | 2 + src/tests/sampling_test.sh | 4 +- src/tests/system-alloc_unittest.cc | 14 ++ src/tests/tcmalloc_unittest.cc | 23 ++-- src/windows/config.h | 2 +- src/windows/patch_functions.cc | 46 +++---- src/windows/port.cc | 12 ++ src/windows/port.h | 2 + 41 files changed, 1299 insertions(+), 297 deletions(-) create mode 100644 src/base/atomicops-internals-arm-gcc.h create mode 100644 src/base/spinlock_internal.cc create mode 100644 src/base/spinlock_internal.h create 
mode 100644 src/base/synchronization_profiling.h diff --git a/src/base/atomicops-internals-arm-gcc.h b/src/base/atomicops-internals-arm-gcc.h new file mode 100644 index 0000000..423e993 --- /dev/null +++ b/src/base/atomicops-internals-arm-gcc.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Lei Zhang, Sasha Levitskiy + */ + +// This file is an internal atomic implementation, use base/atomicops.h instead. + +// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. + +#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ +#define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ + +#include <stdio.h> +#include "base/basictypes.h" // For COMPILE_ASSERT + +typedef int32_t Atomic32; + +namespace base { +namespace subtle { + +typedef int64_t Atomic64; + +// 0xffff0fc0 is the hard coded address of a function provided by +// the kernel which implements an atomic compare-exchange. On older +// ARM architecture revisions (pre-v6) this may be implemented using +// a syscall. This address is stable, and in active use (hard coded) +// by at least glibc-2.7 and the Android C library. +// pLinuxKernelCmpxchg has both acquire and release barrier semantics.
+typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value, + Atomic32 new_value, + volatile Atomic32* ptr); +LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) = + (LinuxKernelCmpxchgFunc) 0xffff0fc0; + +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = + (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; + + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value = *ptr; + do { + if (!pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 old_value; + do { + old_value = *ptr; + } while (pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + for (;;) { + // Atomic exchange the old value with an incremented one. + Atomic32 old_value = *ptr; + Atomic32 new_value = old_value + increment; + if (pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr)) == 0) { + // The exchange took place as expected. + return new_value; + } + // Otherwise, *ptr changed mid-loop and we need to retry. + } +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return Barrier_AtomicIncrement(ptr, increment); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void MemoryBarrier() { + pLinuxKernelMemoryBarrier(); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + + +// 64-bit versions are not implemented yet. 
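For readers skimming the new ARM atomics, the acquire/release pairing above is easiest to see in a publish/consume sketch. This is illustration only, not part of the patch; the payload/ready names are hypothetical.

// Publish a value from one thread and consume it from another using the
// primitives defined in this header.
static int payload;                        // plain, non-atomic data
static Atomic32 ready = 0;                 // publication flag

void Publisher() {
  payload = 42;                            // 1. write the data
  base::subtle::Release_Store(&ready, 1);  // 2. barrier, then set the flag
}

void Consumer() {
  if (base::subtle::Acquire_Load(&ready)) {  // 3. read the flag, then barrier
    int v = payload;  // 4. safe: ordered after the flag read
  }
}

On this ARM port both halves reduce to a call through pLinuxKernelMemoryBarrier plus a plain load or store, which is why Release_Store issues its barrier before the store and Acquire_Load issues it after the load.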
+ +inline void NotImplementedFatalError(const char *function_name) { + fprintf(stderr, "64-bit %s() not implemented on this platform\n", + function_name); + abort(); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_CompareAndSwap"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_AtomicExchange"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("NoBarrier_AtomicIncrement"); + return 0; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("Barrier_AtomicIncrement"); + return 0; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("NoBarrier_Store"); +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NoBarrier_AtomicExchange(ptr, value); + // acts as a barrier in this implementation +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("Release_Store"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("NoBarrier_Load"); + return 0; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = NoBarrier_Load(ptr); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +} // namespace base::subtle +} // namespace base + +#endif // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ diff --git a/src/base/basictypes.h b/src/base/basictypes.h index ab9cdab..0f21fca 100644 --- a/src/base/basictypes.h +++ b/src/base/basictypes.h @@ -109,7 +109,7 @@ const int64 kint64min = ( ((( int64) kint32min) << 32) | 0 ); // Also allow for printing of a pthread_t. #define GPRIuPTHREAD "lu" #define GPRIxPTHREAD "lx" -#if defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__APPLE__) +#if defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__APPLE__) || defined(__FreeBSD__) #define PRINTABLE_PTHREAD(pthreadt) reinterpret_cast(pthreadt) #else #define PRINTABLE_PTHREAD(pthreadt) pthreadt diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h index 10642fd..6283f7e 100644 --- a/src/base/dynamic_annotations.h +++ b/src/base/dynamic_annotations.h @@ -370,6 +370,41 @@ #endif /* DYNAMIC_ANNOTATIONS_ENABLED */ +/* Macro definitions for GCC attributes that allow static thread safety + analysis to recognize and use some of the dynamic annotations as + escape hatches. + TODO(lcwu): remove the check for __SUPPORT_DYN_ANNOTATION__ once the + default crosstool/GCC supports these GCC attributes. 
*/ + +#define ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY ; + +#if defined(__GNUC__) && defined(__SUPPORT_TS_ANNOTATION__) \ + && (!defined(SWIG)) && defined(__SUPPORT_DYN_ANNOTATION__) + +#if DYNAMIC_ANNOTATIONS_ENABLED == 0 +#define ANNOTALYSIS_ONLY 1 +#undef ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_STATIC_INLINE static inline +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY {} +#endif +#define ANNOTALYSIS_IGNORE_READS_BEGIN __attribute__ ((ignore_reads_begin)) +#define ANNOTALYSIS_IGNORE_READS_END __attribute__ ((ignore_reads_end)) +#define ANNOTALYSIS_IGNORE_WRITES_BEGIN __attribute__ ((ignore_writes_begin)) +#define ANNOTALYSIS_IGNORE_WRITES_END __attribute__ ((ignore_writes_end)) +#define ANNOTALYSIS_UNPROTECTED_READ __attribute__ ((unprotected_read)) + +#else + +#define ANNOTALYSIS_IGNORE_READS_BEGIN +#define ANNOTALYSIS_IGNORE_READS_END +#define ANNOTALYSIS_IGNORE_WRITES_BEGIN +#define ANNOTALYSIS_IGNORE_WRITES_END +#define ANNOTALYSIS_UNPROTECTED_READ + +#endif + /* Use the macros above rather than using these functions directly. */ #ifdef __cplusplus extern "C" { @@ -431,10 +466,18 @@ void AnnotateTraceMemory(const char *file, int line, const volatile void *arg); void AnnotateThreadName(const char *file, int line, const char *name); -void AnnotateIgnoreReadsBegin(const char *file, int line); -void AnnotateIgnoreReadsEnd(const char *file, int line); -void AnnotateIgnoreWritesBegin(const char *file, int line); -void AnnotateIgnoreWritesEnd(const char *file, int line); +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreReadsBegin(const char *file, int line) + ANNOTALYSIS_IGNORE_READS_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreReadsEnd(const char *file, int line) + ANNOTALYSIS_IGNORE_READS_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreWritesBegin(const char *file, int line) + ANNOTALYSIS_IGNORE_WRITES_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreWritesEnd(const char *file, int line) + ANNOTALYSIS_IGNORE_WRITES_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY void AnnotateEnableRaceDetection(const char *file, int line, int enable); void AnnotateNoOp(const char *file, int line, const volatile void *arg); @@ -485,7 +528,8 @@ double ValgrindSlowdown(void); one can use ... = ANNOTATE_UNPROTECTED_READ(x); */ template - inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) { + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) + ANNOTALYSIS_UNPROTECTED_READ { ANNOTATE_IGNORE_READS_BEGIN(); T res = x; ANNOTATE_IGNORE_READS_END(); @@ -511,4 +555,67 @@ double ValgrindSlowdown(void); #endif /* DYNAMIC_ANNOTATIONS_ENABLED */ +/* Annotalysis, a GCC based static analyzer, is able to understand and use + some of the dynamic annotations defined in this file. However, dynamic + annotations are usually disabled in the opt mode (to avoid additional + runtime overheads) while Annotalysis only works in the opt mode. + In order for Annotalysis to use these dynamic annotations when they + are disabled, we re-define these annotations here. Note that unlike the + original macro definitions above, these macros are expanded to calls to + static inline functions so that the compiler will be able to remove the + calls after the analysis. */ + +#ifdef ANNOTALYSIS_ONLY + + #undef ANNOTALYSIS_ONLY + + /* Undefine and re-define the macros that the static analyzer understands. 
*/ + #undef ANNOTATE_IGNORE_READS_BEGIN + #define ANNOTATE_IGNORE_READS_BEGIN() \ + AnnotateIgnoreReadsBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_END + #define ANNOTATE_IGNORE_READS_END() \ + AnnotateIgnoreReadsEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_WRITES_BEGIN + #define ANNOTATE_IGNORE_WRITES_BEGIN() \ + AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_WRITES_END + #define ANNOTATE_IGNORE_WRITES_END() \ + AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN + #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \ + do { \ + ANNOTATE_IGNORE_READS_BEGIN(); \ + ANNOTATE_IGNORE_WRITES_BEGIN(); \ + }while(0) \ + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_END + #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \ + do { \ + ANNOTATE_IGNORE_WRITES_END(); \ + ANNOTATE_IGNORE_READS_END(); \ + }while(0) \ + + #if defined(__cplusplus) + #undef ANNOTATE_UNPROTECTED_READ + template + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) + __attribute__ ((unprotected_read)) { + ANNOTATE_IGNORE_READS_BEGIN(); + T res = x; + ANNOTATE_IGNORE_READS_END(); + return res; + } + #endif /* __cplusplus */ + +#endif /* ANNOTALYSIS_ONLY */ + +/* Undefine the macros intended only in this file. */ +#undef ANNOTALYSIS_STATIC_INLINE +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY + #endif /* BASE_DYNAMIC_ANNOTATIONS_H_ */ diff --git a/src/base/logging.h b/src/base/logging.h index 6aa5c3f..fe25acf 100644 --- a/src/base/logging.h +++ b/src/base/logging.h @@ -49,10 +49,13 @@ // On some systems (like freebsd), we can't call write() at all in a // global constructor, perhaps because errno hasn't been set up. +// (In windows, we can't call it because it might call malloc.) // Calling the write syscall is safer (it doesn't set errno), so we // prefer that. Note we don't care about errno for logging: we just // do logging on a best-effort basis. -#ifdef HAVE_SYS_SYSCALL_H +#if defined(_MSC_VER) +#define WRITE_TO_STDERR(buf, len) WriteToStderr(buf, len); // in port.cc +#elif defined(HAVE_SYS_SYSCALL_H) #include #define WRITE_TO_STDERR(buf, len) syscall(SYS_write, STDERR_FILENO, buf, len) #else diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc index 7ca3953..8864629 100644 --- a/src/base/low_level_alloc.cc +++ b/src/base/low_level_alloc.cc @@ -59,7 +59,9 @@ // --------------------------------------------------------------------------- static const int kMaxLevel = 30; -namespace { +// We put this class-only struct in a namespace to avoid polluting the +// global namespace with this struct name (thus risking an ODR violation). +namespace low_level_alloc_internal { // This struct describes one allocated block, or one free block. struct AllocList { struct Header { @@ -79,6 +81,8 @@ namespace { // LLA_SkiplistLevels() }; } +using low_level_alloc_internal::AllocList; + // --------------------------------------------------------------------------- // A trivial skiplist implementation. 
This is used to keep the freelist diff --git a/src/base/spinlock.cc b/src/base/spinlock.cc index 48cdc89..1413923 100644 --- a/src/base/spinlock.cc +++ b/src/base/spinlock.cc @@ -32,47 +32,28 @@ */ #include -#include /* For nanosleep() */ -#ifdef HAVE_SCHED_H -#include /* For sched_yield() */ -#endif -#ifdef HAVE_UNISTD_H -#include /* For read() */ -#endif -#include /* for open(), O_RDONLY */ -#include /* for strncmp */ -#include #include "base/spinlock.h" +#include "base/synchronization_profiling.h" +#include "base/spinlock_internal.h" #include "base/cycleclock.h" #include "base/sysinfo.h" /* for NumCPUs() */ -// We can do contention-profiling of SpinLocks, but the code is in -// mutex.cc, which is not always linked in with spinlock. Hence we -// provide this weak definition, which is used if mutex.cc isn't linked in. -ATTRIBUTE_WEAK extern void SubmitSpinLockProfileData(const void *, int64); -void SubmitSpinLockProfileData(const void *, int64) {} +// NOTE on the Lock-state values: +// +// kSpinLockFree represents the unlocked state +// kSpinLockHeld represents the locked state with no waiters +// +// Values greater than kSpinLockHeld represent the locked state with waiters, +// where the value is the time the current lock holder had to +// wait before obtaining the lock. The kSpinLockSleeper state is a special +// "locked with waiters" state that indicates that a sleeper needs to +// be woken, but the thread that just released the lock didn't wait. static int adaptive_spin_count = 0; const base::LinkerInitialized SpinLock::LINKER_INITIALIZED = base::LINKER_INITIALIZED; -// The OS-specific header included below must provide two calls: -// Wait until *w becomes zero, atomically set it to 1 and return. -// static void SpinLockWait(volatile Atomic32 *w); -// -// Hint that a thread waiting in SpinLockWait() could now make progress. May -// do nothing. This call may not read or write *w; it must use only the -// address. -// static void SpinLockWake(volatile Atomic32 *w); -#if defined(_WIN32) -#include "base/spinlock_win32-inl.h" -#elif defined(__linux__) -#include "base/spinlock_linux-inl.h" -#else -#include "base/spinlock_posix-inl.h" -#endif - namespace { struct SpinLock_InitHelper { SpinLock_InitHelper() { @@ -91,36 +72,111 @@ static SpinLock_InitHelper init_helper; } // unnamed namespace +// Monitor the lock to see if its value changes within some time period +// (adaptive_spin_count loop iterations). A timestamp indicating +// when the thread initially started waiting for the lock is passed in via +// the initial_wait_timestamp value. The total wait time in cycles for the +// lock is returned in the wait_cycles parameter. The last value read +// from the lock is returned from the method. +Atomic32 SpinLock::SpinLoop(int64 initial_wait_timestamp, + Atomic32* wait_cycles) { + int c = adaptive_spin_count; + while (base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree && --c > 0) { + } + Atomic32 spin_loop_wait_cycles = CalculateWaitCycles(initial_wait_timestamp); + Atomic32 lock_value = + base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + spin_loop_wait_cycles); + *wait_cycles = spin_loop_wait_cycles; + return lock_value; +} void SpinLock::SlowLock() { - int c = adaptive_spin_count; + // The lock was not obtained initially, so this thread needs to wait for + // it. Record the current timestamp in the local variable wait_start_time + // so the total wait time can be stored in the lockword once this thread + // obtains the lock. 
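As a quick reference for the lockword_ encoding the comment above relies on, here is a decoding sketch. It is illustrative only; the constants mirror the kSpinLockFree/kSpinLockHeld/kSpinLockSleeper enums added to spinlock.h later in this patch.

// How to read a lockword_ value under the new scheme.
const char* DescribeLockword(Atomic32 v) {
  if (v == 0) return "free";              // kSpinLockFree
  if (v == 1) return "held, no waiters";  // kSpinLockHeld
  // Anything else means "held, with waiters": the value is the waiter's
  // spin time in cycles, right-shifted by PROFILE_TIMESTAMP_SHIFT (7),
  // with kSpinLockSleeper (2) OR'd in so it can never collide with the
  // free/held encodings.
  return "held, with waiters (value encodes wait cycles)";
}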
+ int64 wait_start_time = CycleClock::Now(); + Atomic32 wait_cycles; + Atomic32 lock_value = SpinLoop(wait_start_time, &wait_cycles); - // Spin a few times in the hope that the lock holder releases the lock - while ((c > 0) && (lockword_ != 0)) { - c--; - } + int lock_wait_call_count = 0; + while (lock_value != kSpinLockFree) { + // If the lock is currently held, but not marked as having a sleeper, mark + // it as having a sleeper. + if (lock_value == kSpinLockHeld) { + // Here, just "mark" that the thread is going to sleep. Don't store the + // lock wait time in the lock as that will cause the current lock + // owner to think it experienced contention. + lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_, + kSpinLockHeld, + kSpinLockSleeper); + if (lock_value == kSpinLockHeld) { + // Successfully transitioned to kSpinLockSleeper. Pass + // kSpinLockSleeper to the SpinLockWait routine to properly indicate + // the last lock_value observed. + lock_value = kSpinLockSleeper; + } else if (lock_value == kSpinLockFree) { + // Lock is free again, so try and acquire it before sleeping. The + // new lock state will be the number of cycles this thread waited if + // this thread obtains the lock. + lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_, + kSpinLockFree, + wait_cycles); + continue; // skip the delay at the end of the loop + } + } - if (lockword_ == 1) { - int32 now = (CycleClock::Now() >> PROFILE_TIMESTAMP_SHIFT); - // Don't loose the lock: make absolutely sure "now" is not zero - now |= 1; - // Atomically replace the value of lockword_ with "now" if - // lockword_ is 1, thereby remembering the first timestamp to - // be recorded. - base::subtle::NoBarrier_CompareAndSwap(&lockword_, 1, now); - // base::subtle::NoBarrier_CompareAndSwap() returns: - // 0: the lock is/was available; nothing stored - // 1: our timestamp was stored - // > 1: an older timestamp is already in lockword_; nothing stored + // Wait for an OS specific delay. + base::internal::SpinLockDelay(&lockword_, lock_value, + ++lock_wait_call_count); + // Spin again after returning from the wait routine to give this thread + // some chance of obtaining the lock. + lock_value = SpinLoop(wait_start_time, &wait_cycles); } - - SpinLockWait(&lockword_); // wait until lock acquired; OS specific } -void SpinLock::SlowUnlock(int64 wait_timestamp) { - SpinLockWake(&lockword_); // wake waiter if necessary; OS specific +// The wait time for contentionz lock profiling must fit into 32 bits. +// However, the lower 32-bits of the cycle counter wrap around too quickly +// with high frequency processors, so a right-shift by 7 is performed to +// quickly divide the cycles by 128. Using these 32 bits reduces the +// granularity of time measurement to 128 cycles, and loses track +// of wait time for waits greater than 109 seconds on a 5 GHz machine +// [(2^32 cycles/5 GHz)*128 = 109.95 seconds]. Waits this long should be +// very rare and the reduced granularity should not be an issue given +// processors in the Google fleet operate at a minimum of one billion +// cycles/sec. +enum { PROFILE_TIMESTAMP_SHIFT = 7 }; + +void SpinLock::SlowUnlock(uint64 wait_cycles) { + base::internal::SpinLockWake(&lockword_, false); // wake waiter if necessary + + // Collect contentionz profile info, expanding the wait_cycles back out to + // the full value. If wait_cycles is <= kSpinLockSleeper, then no wait + // was actually performed, so don't record the wait time.
Note that the + // CalculateWaitCycles method adds in kSpinLockSleeper cycles + // unconditionally to guarantee the wait time is not kSpinLockFree or + // kSpinLockHeld. Adding in this small number of cycles may + // overestimate the contention by a slight amount 50% of the time. However, + // if this code tried to correct for that addition by subtracting out the + // kSpinLockSleeper amount that would underestimate the contention slightly + // 50% of the time. Both ways get the wrong answer, so the code + // overestimates to be more conservative. Overestimating also makes the code + // a little simpler. + // + if (wait_cycles > kSpinLockSleeper) { + base::SubmitSpinLockProfileData(this, + wait_cycles << PROFILE_TIMESTAMP_SHIFT); + } +} - // Collect contentionz profile info. Subtract one from wait_timestamp as - // antidote to "now |= 1;" in SlowLock(). - SubmitSpinLockProfileData(this, wait_timestamp - 1); +inline int32 SpinLock::CalculateWaitCycles(int64 wait_start_time) { + int32 wait_cycles = ((CycleClock::Now() - wait_start_time) >> + PROFILE_TIMESTAMP_SHIFT); + // The number of cycles waiting for the lock is used as both the + // wait_cycles and lock value, so it can't be kSpinLockFree or + // kSpinLockHeld. Make sure the value returned is at least + // kSpinLockSleeper. + wait_cycles |= kSpinLockSleeper; + return wait_cycles; } diff --git a/src/base/spinlock.h b/src/base/spinlock.h index 9e633c4..c2be4fd 100644 --- a/src/base/spinlock.h +++ b/src/base/spinlock.h @@ -44,14 +44,14 @@ #define BASE_SPINLOCK_H_ #include <config.h> -#include "base/basictypes.h" #include "base/atomicops.h" +#include "base/basictypes.h" #include "base/dynamic_annotations.h" #include "base/thread_annotations.h" class LOCKABLE SpinLock { public: - SpinLock() : lockword_(0) { } + SpinLock() : lockword_(kSpinLockFree) { } // Special constructor for use with static SpinLock objects. E.g., // @@ -70,18 +70,21 @@ class LOCKABLE SpinLock { // TODO(csilvers): uncomment the annotation when we figure out how to // support this macro with 0 args (see thread_annotations.h) inline void Lock() /*EXCLUSIVE_LOCK_FUNCTION()*/ { - if (Acquire_CompareAndSwap(&lockword_, 0, 1) != 0) { + if (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + kSpinLockHeld) != kSpinLockFree) { SlowLock(); } ANNOTATE_RWLOCK_ACQUIRED(this, 1); } - // Acquire this SpinLock and return true if the acquisition can be - // done without blocking, else return false. If this SpinLock is - // free at the time of the call, TryLock will return true with high - // probability. + // Try to acquire this SpinLock without blocking and return true if the + // acquisition was successful. If the lock was not acquired, false is + // returned. If this SpinLock is free at the time of the call, TryLock + // will return true with high probability. inline bool TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(true) { - bool res = (Acquire_CompareAndSwap(&lockword_, 0, 1) == 0); + bool res = + (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + kSpinLockHeld) == kSpinLockFree); if (res) { ANNOTATE_RWLOCK_ACQUIRED(this, 1); } @@ -92,47 +95,37 @@ class LOCKABLE SpinLock { // TODO(csilvers): uncomment the annotation when we figure out how to // support this macro with 0 args (see thread_annotations.h) inline void Unlock() /*UNLOCK_FUNCTION()*/ { - // This is defined in mutex.cc.
- extern void SubmitSpinLockProfileData(const void *, int64); - - int64 wait_timestamp = static_cast(lockword_); + uint64 wait_cycles = + static_cast(base::subtle::NoBarrier_Load(&lockword_)); ANNOTATE_RWLOCK_RELEASED(this, 1); - Release_Store(&lockword_, 0); - if (wait_timestamp != 1) { + base::subtle::Release_Store(&lockword_, kSpinLockFree); + if (wait_cycles != kSpinLockHeld) { // Collect contentionz profile info, and speed the wakeup of any waiter. - // The lockword_ value indicates when the waiter started waiting. - SlowUnlock(wait_timestamp); + // The wait_cycles value indicates how long this thread spent waiting + // for the lock. + SlowUnlock(wait_cycles); } } - // Report if we think the lock can be held by this thread. - // When the lock is truly held by the invoking thread - // we will always return true. - // Indended to be used as CHECK(lock.IsHeld()); + // Determine if the lock is held. When the lock is held by the invoking + // thread, true will always be returned. Intended to be used as + // CHECK(lock.IsHeld()). inline bool IsHeld() const { - return lockword_ != 0; + return base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree; } - // The timestamp for contention lock profiling must fit into 31 bits. - // as lockword_ is 32 bits and we loose an additional low-order bit due - // to the statement "now |= 1" in SlowLock(). - // To select 31 bits from the 64-bit cycle counter, we shift right by - // PROFILE_TIMESTAMP_SHIFT = 7. - // Using these 31 bits, we reduce granularity of time measurement to - // 256 cycles, and will loose track of wait time for waits greater than - // 109 seconds on a 5 GHz machine, longer for faster clock cycles. - // Waits this long should be very rare. - enum { PROFILE_TIMESTAMP_SHIFT = 7 }; - static const base::LinkerInitialized LINKER_INITIALIZED; // backwards compat private: - // Lock-state: 0 means unlocked; 1 means locked with no waiters; values - // greater than 1 indicate locked with waiters, where the value is the time - // the first waiter started waiting and is used for contention profiling. + enum { kSpinLockFree = 0 }; + enum { kSpinLockHeld = 1 }; + enum { kSpinLockSleeper = 2 }; + volatile Atomic32 lockword_; void SlowLock(); - void SlowUnlock(int64 wait_timestamp); + void SlowUnlock(uint64 wait_cycles); + Atomic32 SpinLoop(int64 initial_wait_timestamp, Atomic32* wait_cycles); + inline int32 CalculateWaitCycles(int64 wait_start_time); DISALLOW_COPY_AND_ASSIGN(SpinLock); }; diff --git a/src/base/spinlock_internal.cc b/src/base/spinlock_internal.cc new file mode 100644 index 0000000..b5b6ca4 --- /dev/null +++ b/src/base/spinlock_internal.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// The OS-specific header included below must provide two calls: +// base::internal::SpinLockDelay() and base::internal::SpinLockWake(). +// See spinlock_internal.h for the spec of SpinLockWake(). + +// void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) +// SpinLockDelay() generates an appropriate spin delay on iteration "loop" of a +// spin loop on location *w, whose previously observed value was "value". +// SpinLockDelay() may do nothing, may yield the CPU, may sleep a clock tick, +// or may wait for a delay that can be truncated by a call to SpinLockWake(w). +// In all cases, it must return in bounded time even if SpinLockWake() is not +// called. + +#include "base/spinlock_internal.h" + +#if defined(_WIN32) +#include "base/spinlock_win32-inl.h" +#elif defined(__linux__) +#include "base/spinlock_linux-inl.h" +#else +#include "base/spinlock_posix-inl.h" +#endif + +namespace base { +namespace internal { + +// See spinlock_internal.h for spec. +int32 SpinLockWait(volatile Atomic32 *w, int n, + const SpinLockWaitTransition trans[]) { + int32 v; + bool done = false; + for (int loop = 0; !done; loop++) { + v = base::subtle::Acquire_Load(w); + int i; + for (i = 0; i != n && v != trans[i].from; i++) { + } + if (i == n) { + SpinLockDelay(w, v, loop); // no matching transition + } else if (trans[i].to == v || // null transition + base::subtle::Acquire_CompareAndSwap(w, v, trans[i].to) == v) { + done = trans[i].done; + } + } + return v; +} + +} // namespace internal +} // namespace base diff --git a/src/base/spinlock_internal.h b/src/base/spinlock_internal.h new file mode 100644 index 0000000..4494260 --- /dev/null +++ b/src/base/spinlock_internal.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * This file is an internal part of spinlock.cc and once.cc. + * It may not be used directly by code outside of //base. + */ + +#ifndef BASE_SPINLOCK_INTERNAL_H_ +#define BASE_SPINLOCK_INTERNAL_H_ + +#include <config.h> +#include "base/basictypes.h" +#include "base/atomicops.h" + +namespace base { +namespace internal { + +// SpinLockWait() waits until it can perform one of several transitions from +// "from" to "to". It returns when it performs a transition where done==true. +struct SpinLockWaitTransition { + int32 from; + int32 to; + bool done; +}; + +// Wait until *w can transition from trans[i].from to trans[i].to for some i +// satisfying 0 <= i < n && trans[i].done, atomically make that transition, +// and then return the old value of *w. +int32 SpinLockWait(volatile Atomic32 *w, int n, + const SpinLockWaitTransition trans[]); + +// If possible, wake some thread that has called SpinLockDelay(w, ...). +void SpinLockWake(volatile Atomic32 *w, bool all); + +// Wait for an OS-specific delay; see the spec in spinlock_internal.cc. +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop); + +} // namespace internal +} // namespace base + +#endif // BASE_SPINLOCK_INTERNAL_H_ diff --git a/src/base/spinlock_linux-inl.h b/src/base/spinlock_linux-inl.h --- a/src/base/spinlock_linux-inl.h +++ b/src/base/spinlock_linux-inl.h #include <time.h> +#include <limits.h> #include "base/linux_syscall_support.h" #define FUTEX_WAIT 0 @@ -48,7 +49,7 @@ static struct InitModule { int x = 0; // futexes are ints, so we can use them only when // that's the same size as the lockword_ in SpinLock. - have_futex = (sizeof (Atomic32) == sizeof (int) && + have_futex = (sizeof (Atomic32) == sizeof (int) && sys_futex(&x, FUTEX_WAKE, 1, 0) >= 0); if (have_futex && sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, 0) < 0) { @@ -56,36 +57,41 @@ } } } init_module; + } // anonymous namespace -static void SpinLockWait(volatile Atomic32 *w) { - int save_errno = errno; - struct timespec tm; - tm.tv_sec = 0; - if (have_futex) { - int value; - tm.tv_nsec = 1000000; // 1ms; really we're trying to sleep for one kernel - // clock tick - while ((value = base::subtle::Acquire_CompareAndSwap(w, 0, 1)) != 0) { - sys_futex(reinterpret_cast<int*>(const_cast<Atomic32*>(w)), - FUTEX_WAIT | futex_private_flag, - value, reinterpret_cast<struct kernel_timespec*>(&tm)); - } - } else { - tm.tv_nsec = 2000001; // above 2ms so linux 2.4 doesn't spin - if (base::subtle::NoBarrier_Load(w) != 0) { - sched_yield(); + +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + if (loop != 0) { + int save_errno = errno; + struct timespec tm; + tm.tv_sec = 0; + if (have_futex) { + tm.tv_nsec = 1000000; // 1ms; really we're trying to sleep for one + // kernel clock tick + } else { + tm.tv_nsec = 2000001; // above 2ms so linux 2.4 doesn't spin } - while (base::subtle::Acquire_CompareAndSwap(w, 0, 1) != 0) { + if (have_futex) { + sys_futex(reinterpret_cast<int*>(const_cast<Atomic32*>(w)), + FUTEX_WAIT | futex_private_flag, + value, reinterpret_cast<struct kernel_timespec*>(&tm)); + } else { nanosleep(&tm, NULL); } + errno = save_errno; } - errno = save_errno; } -static void SpinLockWake(volatile Atomic32 *w) { +void SpinLockWake(volatile Atomic32 *w, bool all) { if (have_futex) { sys_futex(reinterpret_cast<int*>(const_cast<Atomic32*>(w)), - FUTEX_WAKE | futex_private_flag, 1, 0); + FUTEX_WAKE | futex_private_flag, all?
INT_MAX : 1, 0); } } + +} // namespace internal +} // namespace base diff --git a/src/base/spinlock_posix-inl.h b/src/base/spinlock_posix-inl.h index 0d933c0..d188ebd 100644 --- a/src/base/spinlock_posix-inl.h +++ b/src/base/spinlock_posix-inl.h @@ -28,25 +28,35 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * --- - * This file is a Posix-specific part of spinlock.cc + * This file is a Posix-specific part of spinlock_internal.cc */ -#include -#include +#include +#include +#ifdef HAVE_SCHED_H +#include /* For sched_yield() */ +#endif +#include /* For nanosleep() */ -static void SpinLockWait(volatile Atomic32 *w) { +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { int save_errno = errno; - struct timespec tm; - tm.tv_sec = 0; - tm.tv_nsec = 1000000; - if (base::subtle::NoBarrier_Load(w) != 0) { + if (loop == 0) { + } else if (loop == 1) { sched_yield(); - } - while (base::subtle::Acquire_CompareAndSwap(w, 0, 1) != 0) { + } else { + struct timespec tm; + tm.tv_sec = 0; + tm.tv_nsec = 1000000; nanosleep(&tm, NULL); } errno = save_errno; } -static void SpinLockWake(volatile Atomic32 *w) { +void SpinLockWake(volatile Atomic32 *w, bool all) { } + +} // namespace internal +} // namespace base diff --git a/src/base/spinlock_win32-inl.h b/src/base/spinlock_win32-inl.h index 9058939..ee23541 100644 --- a/src/base/spinlock_win32-inl.h +++ b/src/base/spinlock_win32-inl.h @@ -28,20 +28,26 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * --- - * This file is a Win32-specific part of spinlock.cc + * This file is a Win32-specific part of spinlock_internal.cc */ #include -static void SpinLockWait(volatile Atomic32 *w) { - if (base::subtle::NoBarrier_Load(w) != 0) { +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + if (loop == 0) { + } else if (loop == 1) { Sleep(0); - } - while (base::subtle::Acquire_CompareAndSwap(w, 0, 1) != 0) { + } else { Sleep(1); } } -static void SpinLockWake(volatile Atomic32 *w) { +void SpinLockWake(volatile Atomic32 *w, bool all) { } + +} // namespace internal +} // namespace base diff --git a/src/base/synchronization_profiling.h b/src/base/synchronization_profiling.h new file mode 100644 index 0000000..cf02c21 --- /dev/null +++ b/src/base/synchronization_profiling.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Chris Ruemmler + */ + +#ifndef BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ +#define BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ + +#include "base/basictypes.h" + +namespace base { + +// We can do contention-profiling of SpinLocks, but the code is in +// mutex.cc, which is not always linked in with spinlock. Hence we +// provide a weak definition, which is used if mutex.cc isn't linked in. + +// Submit the number of cycles the spinlock spent contending. +ATTRIBUTE_WEAK extern void SubmitSpinLockProfileData(const void *, int64); +extern void SubmitSpinLockProfileData(const void *contendedlock, + int64 wait_cycles) {} +} +#endif // BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ diff --git a/src/base/sysinfo.cc b/src/base/sysinfo.cc index 7cfa051..c1e2aef 100644 --- a/src/base/sysinfo.cc +++ b/src/base/sysinfo.cc @@ -111,20 +111,23 @@ // 8K), so it's not an ideal solution. const char* GetenvBeforeMain(const char* name) { #if defined(HAVE___ENVIRON) // if we have it, it's declared in unistd.h - const int namelen = strlen(name); - for (char** p = __environ; *p; p++) { - if (!memcmp(*p, name, namelen) && (*p)[namelen] == '=') // it's a match - return *p + namelen+1; // point after = + if (__environ) { // can exist but be NULL, if statically linked + const int namelen = strlen(name); + for (char** p = __environ; *p; p++) { + if (!memcmp(*p, name, namelen) && (*p)[namelen] == '=') // it's a match + return *p + namelen+1; // point after = + } + return NULL; } - return NULL; -#elif defined(PLATFORM_WINDOWS) +#endif +#if defined(PLATFORM_WINDOWS) // TODO(mbelshe) - repeated calls to this function will overwrite the // contents of the static buffer. - static char envbuf[1024]; // enough to hold any envvar we care about - if (!GetEnvironmentVariableA(name, envbuf, sizeof(envbuf)-1)) + static char envvar_buf[1024]; // enough to hold any envvar we care about + if (!GetEnvironmentVariableA(name, envvar_buf, sizeof(envvar_buf)-1)) return NULL; - return envbuf; -#else + return envvar_buf; +#endif // static is ok because this function should only be called before // main(), when we're single-threaded. static char envbuf[16<<10]; @@ -152,7 +155,6 @@ const char* GetenvBeforeMain(const char* name) { p = endp + 1; } return NULL; // env var never found -#endif } // This takes as an argument an environment-variable name (like @@ -229,6 +231,26 @@ static int64 EstimateCyclesPerSecond(const int estimate_time_ms) { return guess; } +// Helper function for reading an int from a file. Returns true if successful +// and the memory location pointed to by value is set to the value read.
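A usage sketch for the new ReadIntFromFile helper described above (hypothetical caller; the real call sites appear in InitializeSystemInfo() just below):

int freq_khz = 0;
if (ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq_khz)) {
  // A file containing "2000000\n" parses as 2000000 (kHz): a trailing
  // newline is accepted, while any other trailing junk makes the call
  // return false and leave freq_khz untouched.
  double cycles_per_second = freq_khz * 1000.0;
}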
+static bool ReadIntFromFile(const char *file, int *value) { + bool ret = false; + int fd = open(file, O_RDONLY); + if (fd != -1) { + char line[1024]; + char* err; + memset(line, '\0', sizeof(line)); + read(fd, line, sizeof(line) - 1); + const int temp_value = strtol(line, &err, 10); + if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { + *value = temp_value; + ret = true; + } + close(fd); + } + return ret; +} + // WARNING: logging calls back to InitializeSystemInfo() so it must // not invoke any logging code. Also, InitializeSystemInfo() can be // called before main() -- in fact it *must* be since already_called @@ -254,26 +276,31 @@ static void InitializeSystemInfo() { #if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__) char line[1024]; char* err; + int freq; + + // If the kernel is exporting the tsc frequency, use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporting an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } // If CPU scaling is in effect, we want to use the *maximum* frequency, // not whatever CPU speed some random processor happens to be using now. - if (!saw_mhz) { - const char* pname0 = - "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"; - int fd0 = open(pname0, O_RDONLY); - if (fd0 != -1) { - memset(line, '\0', sizeof(line)); - read(fd0, line, sizeof(line)); - const int max_freq = strtol(line, &err, 10); - if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { - // The value is in kHz. For example, on a 2GHz machine, the file - // contains the value "2000000". Historically this file contained no - // newline, but at some point the kernel started appending a newline. - cpuinfo_cycles_per_second = max_freq * 1000.0; - saw_mhz = true; - } - close(fd0); - } + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz. For example, on a 2GHz machine, the file + // contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; } // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
@@ -311,20 +338,20 @@ if (newline != NULL) *newline = '\0'; - if (!saw_mhz && strncmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { + if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { const char* freqstr = strchr(line, ':'); if (freqstr) { cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; if (freqstr[1] != '\0' && *err == '\0') saw_mhz = true; } - } else if (strncmp(line, "bogomips", sizeof("bogomips")-1) == 0) { + } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) { const char* freqstr = strchr(line, ':'); if (freqstr) bogo_clock = strtod(freqstr+1, &err) * 1000000.0; if (freqstr == NULL || freqstr[1] == '\0' || *err != '\0') bogo_clock = 1.0; - } else if (strncmp(line, "processor", sizeof("processor")-1) == 0) { + } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) { num_cpus++; // count up every time we see a "processor :" entry } } while (chars_read > 0); @@ -888,9 +915,10 @@ namespace tcmalloc { // Helper to add the list of mapped shared libraries to a profile. // Fill formatted "/proc/self/maps" contents into buffer 'buf' of size 'size' -// and return the actual size occupied in 'buf'. +// and return the actual size occupied in 'buf'. We set wrote_all to true +// if we successfully wrote all proc lines to buf, and to false otherwise. // We do not provision for 0-terminating 'buf'. -int FillProcSelfMaps(char buf[], int size) { +int FillProcSelfMaps(char buf[], int size, bool* wrote_all) { ProcMapsIterator::Buffer iterbuf; ProcMapsIterator it(0, &iterbuf); // 0 means "current pid" @@ -898,10 +926,17 @@ int FillProcSelfMaps(char buf[], int size) { int64 inode; char *flags, *filename; int bytes_written = 0; + *wrote_all = true; while (it.Next(&start, &end, &flags, &offset, &inode, &filename)) { - bytes_written += it.FormatLine(buf + bytes_written, size - bytes_written, - start, end, flags, offset, inode, filename, - 0); + const int line_length = it.FormatLine(buf + bytes_written, + size - bytes_written, + start, end, flags, offset, + inode, filename, 0); + if (line_length == 0) + *wrote_all = false; // failed to write this line out + else + bytes_written += line_length; + } return bytes_written; } diff --git a/src/base/sysinfo.h b/src/base/sysinfo.h index 0bcc1f5..8bae5e3 100644 --- a/src/base/sysinfo.h +++ b/src/base/sysinfo.h @@ -226,7 +226,7 @@ class ProcMapsIterator { // Helper routines namespace tcmalloc { -int FillProcSelfMaps(char buf[], int size); +int FillProcSelfMaps(char buf[], int size, bool* wrote_all); void DumpProcSelfMaps(RawFD fd); } diff --git a/src/base/thread_annotations.h b/src/base/thread_annotations.h index f1b3593..f57b299 100644 --- a/src/base/thread_annotations.h +++ b/src/base/thread_annotations.h @@ -46,7 +46,9 @@ #define BASE_THREAD_ANNOTATIONS_H_ -#if defined(__GNUC__) && defined(__SUPPORT_TS_ANNOTATION__) && (!defined(SWIG)) +#if defined(__GNUC__) \ + && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) \ + && defined(__SUPPORT_TS_ANNOTATION__) && (!defined(SWIG)) #define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) #else #define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op #endif diff --git a/src/base/vdso_support.cc b/src/base/vdso_support.cc index fce7c2c..73c6545 100644 --- a/src/base/vdso_support.cc +++ b/src/base/vdso_support.cc @@ -395,7 +395,7 @@ const void *VDSOSupport::Init() { } // Subtle: this code runs outside of any locks; prevent compiler // from assigning to getcpu_fn_ more than once.
- MemoryBarrier(); + base::subtle::MemoryBarrier(); getcpu_fn_ = fn; return vdso_base_; } diff --git a/src/common.h b/src/common.h index e2906d6..53050ca 100644 --- a/src/common.h +++ b/src/common.h @@ -77,6 +77,8 @@ static const size_t kPageSize = 1 << kPageShift; static const size_t kMaxSize = 8u * kPageSize; static const size_t kAlignment = 8; static const size_t kLargeSizeClass = 0; +// For all span-lengths < kMaxPages we keep an exact-size list. +static const size_t kMaxPages = 1 << (20 - kPageShift); // Default bound on the total amount of thread caches. static const size_t kDefaultOverallThreadCacheSize = 8u * kMaxThreadCacheSize; @@ -102,6 +104,17 @@ static const int kMaxDynamicFreeListLength = 8192; static const Length kMaxValidPages = (~static_cast(0)) >> kPageShift; +#ifdef __x86_64__ +// All current and planned x86_64 processors only look at the lower 48 bits +// in virtual to physical address translation. The top 16 are thus unused. +// TODO(rus): Under what operating systems can we increase it safely to 17? +// This lets us use smaller page maps. On first allocation, a 36-bit page map +// uses only 96 KB instead of the 4.5 MB used by a 52-bit page map. +static const int kAddressBits = 48; +#else +static const int kAddressBits = 8 * sizeof(void*); +#endif + namespace tcmalloc { // Convert byte size into pages. This won't overflow, but may return diff --git a/src/config.h.in b/src/config.h.in index a1d5c68..6ee2db0 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -119,6 +119,9 @@ /* Define to 1 if the system has the type `struct mallinfo'. */ #undef HAVE_STRUCT_MALLINFO +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PARAM_H + /* Define to 1 if you have the header file. */ #undef HAVE_SYS_PRCTL_H @@ -173,6 +176,10 @@ /* Define to 1 if int32_t is equivalent to intptr_t */ #undef INT32_EQUALS_INTPTR +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#undef LT_OBJDIR + /* Define to 1 if your C compiler doesn't accept -c and -o together. */ #undef NO_MINUS_C_MINUS_O diff --git a/src/debugallocation.cc b/src/debugallocation.cc index 949fbe9..3b34c8c 100644 --- a/src/debugallocation.cc +++ b/src/debugallocation.cc @@ -497,7 +497,7 @@ class MallocBlock { // practical effect is that allocations are limited to 4Gb or so, even if // the address space could take more. static size_t max_size_t = ~0; - if (size < 0 || size > max_size_t - sizeof(MallocBlock)) { + if (size > max_size_t - sizeof(MallocBlock)) { RAW_LOG(ERROR, "Massive size passed to malloc: %"PRIuS"", size); return NULL; } @@ -1356,6 +1356,20 @@ class DebugMallocImplementation : public ParentImplementation { virtual size_t GetEstimatedAllocatedSize(size_t size) { return size; } + + virtual void GetFreeListSizes(vector* v) { + static const char* kDebugFreeQueue = "debug.free_queue"; + + ParentImplementation::GetFreeListSizes(v); + + MallocExtension::FreeListInfo i; + i.type = kDebugFreeQueue; + i.min_object_size = 0; + i.max_object_size = numeric_limits::max(); + i.total_bytes_free = MallocBlock::FreeQueueSize(); + v->push_back(i); + } + }; static DebugMallocImplementation debug_malloc_implementation; diff --git a/src/google/heap-checker.h b/src/google/heap-checker.h index c0ee8a8..f46f353 100644 --- a/src/google/heap-checker.h +++ b/src/google/heap-checker.h @@ -136,7 +136,7 @@ class PERFTOOLS_DLL_DECL HeapLeakChecker { bool NoLeaks() { return DoNoLeaks(DO_NOT_SYMBOLIZE); } // These forms are obsolete; use NoLeaks() instead. 
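The page-map numbers in the common.h comment above (96 KB vs 4.5 MB on first allocation) check out if one assumes tcmalloc's three-level TCMalloc_PageMap3 with 4 KB pages (kPageShift = 12), 8-byte pointers, and its INTERIOR_BITS = (BITS + 2)/3 split; a worked sketch of that arithmetic:

// BITS = kAddressBits - kPageShift = bits of page number to map.
//
// 64-bit pointers: BITS = 64 - 12 = 52
//   INTERIOR_BITS = (52 + 2) / 3 = 18, LEAF_BITS = 52 - 2*18 = 16
//   First allocation touches one root, one interior, and one leaf node:
//   2^18 * 8 + 2^18 * 8 + 2^16 * 8 = 2 MB + 2 MB + 0.5 MB = 4.5 MB
//
// 48-bit addresses: BITS = 48 - 12 = 36
//   INTERIOR_BITS = (36 + 2) / 3 = 12, LEAF_BITS = 36 - 2*12 = 12
//   2^12 * 8 + 2^12 * 8 + 2^12 * 8 = 32 KB + 32 KB + 32 KB = 96 KB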
- // TODO(csilvers): mark with ATTRIBUTE_DEPRECATED. + // TODO(csilvers): mark as DEPRECATED. bool QuickNoLeaks() { return NoLeaks(); } bool BriefNoLeaks() { return NoLeaks(); } bool SameHeap() { return NoLeaks(); } diff --git a/src/google/malloc_extension.h b/src/google/malloc_extension.h index 3fbefc9..a2e956e 100644 --- a/src/google/malloc_extension.h +++ b/src/google/malloc_extension.h @@ -50,6 +50,7 @@ #include #endif #include +#include // Annoying stuff for windows -- makes sure clients can import these functions #ifndef PERFTOOLS_DLL_DECL @@ -102,12 +103,17 @@ class PERFTOOLS_DLL_DECL MallocExtension { // that allocated these objects. The format of the returned output // is equivalent to the output of the heap profiler and can // therefore be passed to "pprof". + // NOTE: by default, tcmalloc does not do any heap sampling, and this + // function will always return an empty sample. To get useful + // data from GetHeapSample, you must also set the environment + // variable TCMALLOC_SAMPLE_PARAMETER to a value such as 524288. virtual void GetHeapSample(MallocExtensionWriter* writer); // Outputs to "writer" the stack traces that caused growth in the // address space size. The format of the returned output is // equivalent to the output of the heap profiler and can therefore - // be passed to "pprof". + // be passed to "pprof". (This does not depend on, or require, + // TCMALLOC_SAMPLE_PARAMETER.) virtual void GetHeapGrowthStacks(MallocExtensionWriter* writer); // Invokes func(arg, range) for every controlled memory @@ -244,6 +250,45 @@ class PERFTOOLS_DLL_DECL MallocExtension { // malloc implementation during initialization. static void Register(MallocExtension* implementation); + // Returns detailed information about malloc's freelists. For each list, + // return a FreeListInfo: + struct FreeListInfo { + size_t min_object_size; + size_t max_object_size; + size_t total_bytes_free; + const char* type; + }; + // Each item in the vector refers to a different freelist. The lists + // are identified by the range of allocations that objects in the + // list can satisfy ([min_object_size, max_object_size]) and the + // type of freelist (see below). The current size of the list is + // returned in total_bytes_free (which count against a processes + // resident and virtual size). + // + // Currently supported types are: + // + // "tcmalloc.page{_unmapped}" - tcmalloc's page heap. An entry for each size + // class in the page heap is returned. Bytes in "page_unmapped" + // are no longer backed by physical memory and do not count against + // the resident size of a process. + // + // "tcmalloc.large{_unmapped}" - tcmalloc's list of objects larger + // than the largest page heap size class. Only one "large" + // entry is returned. There is no upper-bound on the size + // of objects in the large free list; this call returns + // kint64max for max_object_size. Bytes in + // "large_unmapped" are no longer backed by physical memory + // and do not count against the resident size of a process. + // + // "tcmalloc.central" - tcmalloc's central free-list. One entry per + // size-class is returned. Never unmapped. + // + // "debug.free_queue" - free objects queued by the debug allocator + // and not returned to tcmalloc. + // + // "tcmalloc.thread" - tcmalloc's per-thread caches. Never unmapped. + virtual void GetFreeListSizes(std::vector* v); + protected: // Get a list of stack traces of sampled allocation points. 
diff --git a/src/heap-profile-table.cc b/src/heap-profile-table.cc
index ecaf75f..6d75c4a 100644
--- a/src/heap-profile-table.cc
+++ b/src/heap-profile-table.cc
@@ -342,7 +342,8 @@ int HeapProfileTable::FillOrderedProfile(char buf[], int size) const {
   // any gaps.  Whew!
   int map_length = snprintf(buf, size, "%s", kProcSelfMapsHeader);
   if (map_length < 0 || map_length >= size) return 0;
-  map_length += FillProcSelfMaps(buf + map_length, size - map_length);
+  bool dummy;   // "wrote_all" -- did /proc/self/maps fit in its entirety?
+  map_length += FillProcSelfMaps(buf + map_length, size - map_length, &dummy);
   RAW_DCHECK(map_length <= size, "");
   char* const map_start = buf + size - map_length;  // move to end
   memmove(map_start, buf, map_length);
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
index c2f8b54..1272068 100644
--- a/src/malloc_extension.cc
+++ b/src/malloc_extension.cc
@@ -52,6 +52,7 @@
 #include "maybe_threads.h"
 
 using STL_NAMESPACE::string;
+using STL_NAMESPACE::vector;
 
 static void DumpAddressMap(string* result) {
   *result += "\nMAPPED_LIBRARIES:\n";
@@ -59,9 +60,11 @@ static void DumpAddressMap(string* result) {
   const size_t old_resultlen = result->size();
   for (int amap_size = 10240; amap_size < 10000000; amap_size *= 2) {
     result->resize(old_resultlen + amap_size);
+    bool wrote_all = false;
     const int bytes_written =
-        tcmalloc::FillProcSelfMaps(&((*result)[old_resultlen]), amap_size);
-    if (bytes_written < amap_size - 1) {   // we fit!
+        tcmalloc::FillProcSelfMaps(&((*result)[old_resultlen]), amap_size,
+                                   &wrote_all);
+    if (wrote_all) {   // we fit!
       (*result)[old_resultlen + bytes_written] = '\0';
       result->resize(old_resultlen + bytes_written);
       return;
@@ -167,6 +170,11 @@ size_t MallocExtension::GetAllocatedSize(void* p) {
   return 0;
 }
 
+void MallocExtension::GetFreeListSizes(
+    vector<MallocExtension::FreeListInfo>* v) {
+  v->clear();
+}
+
 // The current malloc extension object.
 
 static pthread_once_t module_init = PTHREAD_ONCE_INIT;
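The explicit wrote_all out-parameter lets DumpAddressMap keep its doubling loop without the old "bytes_written < amap_size - 1" heuristic, which could misfire when the output exactly filled the buffer. The grow-and-retry pattern in isolation (a sketch; Fill() is a hypothetical stand-in for FillProcSelfMaps):

    #include <string.h>
    #include <string>

    // Hypothetical stand-in for FillProcSelfMaps: copies some source text,
    // truncating at "size", and reports via *wrote_all whether it all fit.
    static const char kSource[] = "...pretend this is /proc/self/maps...";
    static int Fill(char* buf, int size, bool* wrote_all) {
      int n = (int)(sizeof(kSource) - 1);
      *wrote_all = (n <= size);
      if (n > size) n = size;
      memcpy(buf, kSource, n);
      return n;
    }

    static std::string ReadAll() {
      std::string result;
      for (int size = 10240; size < 10000000; size *= 2) {
        result.resize(size);
        bool wrote_all = false;
        const int written = Fill(&result[0], size, &wrote_all);
        if (wrote_all) {            // we fit!
          result.resize(written);
          break;
        }
      }
      return result;
    }

    int main() { return ReadAll().empty(); }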
diff --git a/src/memory_region_map.cc b/src/memory_region_map.cc
index f6bed45..3f8509f 100644
--- a/src/memory_region_map.cc
+++ b/src/memory_region_map.cc
@@ -117,7 +117,6 @@
 #include "memory_region_map.h"
 
-#include "base/linux_syscall_support.h"
 #include "base/logging.h"
 #include "base/low_level_alloc.h"
 #include "malloc_hook-inl.h"
diff --git a/src/page_heap.cc b/src/page_heap.cc
index 1e63cb9..c92e16b 100644
--- a/src/page_heap.cc
+++ b/src/page_heap.cc
@@ -338,6 +338,35 @@
 static double PagesToMB(uint64_t pages) {
   return (pages << kPageShift) / 1048576.0;
 }
 
+void PageHeap::GetClassSizes(int64 class_sizes_normal[kMaxPages],
+                             int64 class_sizes_returned[kMaxPages],
+                             int64* normal_pages_in_spans,
+                             int64* returned_pages_in_spans) {
+
+  for (int s = 0; s < kMaxPages; s++) {
+    if (class_sizes_normal != NULL) {
+      class_sizes_normal[s] = DLL_Length(&free_[s].normal);
+    }
+    if (class_sizes_returned != NULL) {
+      class_sizes_returned[s] = DLL_Length(&free_[s].returned);
+    }
+  }
+
+  if (normal_pages_in_spans != NULL) {
+    *normal_pages_in_spans = 0;
+    for (Span* s = large_.normal.next; s != &large_.normal; s = s->next) {
+      *normal_pages_in_spans += s->length;
+    }
+  }
+
+  if (returned_pages_in_spans != NULL) {
+    *returned_pages_in_spans = 0;
+    for (Span* s = large_.returned.next; s != &large_.returned; s = s->next) {
+      *returned_pages_in_spans += s->length;
+    }
+  }
+}
+
 void PageHeap::Dump(TCMalloc_Printer* out) {
   int nonempty_sizes = 0;
   for (int s = 0; s < kMaxPages; s++) {
diff --git a/src/page_heap.h b/src/page_heap.h
index 74030d2..545bdda 100644
--- a/src/page_heap.h
+++ b/src/page_heap.h
@@ -140,6 +140,10 @@ class PERFTOOLS_DLL_DECL PageHeap {
     uint64_t unmapped_bytes;  // Total bytes on returned freelists
   };
   inline Stats stats() const { return stats_; }
+  void GetClassSizes(int64 class_sizes_normal[kMaxPages],
+                     int64 class_sizes_returned[kMaxPages],
+                     int64* normal_pages_in_spans,
+                     int64* returned_pages_in_spans);
 
   bool Check();
   // Like Check() but does some more comprehensive checking.
@@ -176,11 +180,8 @@ class PERFTOOLS_DLL_DECL PageHeap {
   // should keep this value big because various incarnations of Linux
   // have small limits on the number of mmap() regions per
   // address-space.
-  static const int kMinSystemAlloc = 1 << (20 - kPageShift);
-
-  // For all span-lengths < kMaxPages we keep an exact-size list.
-  // REQUIRED: kMaxPages >= kMinSystemAlloc;
-  static const size_t kMaxPages = kMinSystemAlloc;
+  // REQUIRED: kMinSystemAlloc <= kMaxPages;
+  static const int kMinSystemAlloc = kMaxPages;
 
   // Never delay scavenging for more than the following number of
   // deallocated pages.
   // With 4K pages, this comes to 4GB of
@@ -192,8 +193,8 @@ class PERFTOOLS_DLL_DECL PageHeap {
   static const int kDefaultReleaseDelay = 1 << 18;
 
   // Pick the appropriate map and cache types based on pointer size
-  typedef MapSelector<8*sizeof(uintptr_t)>::Type PageMap;
-  typedef MapSelector<8*sizeof(uintptr_t)>::CacheType PageMapCache;
+  typedef MapSelector<kAddressBits>::Type PageMap;
+  typedef MapSelector<kAddressBits>::CacheType PageMapCache;
   PageMap pagemap_;
   mutable PageMapCache pagemap_cache_;
diff --git a/src/pprof b/src/pprof
index e67e42e..a503964 100755
--- a/src/pprof
+++ b/src/pprof
@@ -89,6 +89,7 @@ my %obj_tool_map = (
 );
 my $DOT = "dot";       # leave non-absolute, since it may be in /usr/local
 my $GV = "gv";
+my $EVINCE = "evince";  # could also be xpdf or perhaps acroread
 my $KCACHEGRIND = "kcachegrind";
 my $PS2PDF = "ps2pdf";
 # These are used for dynamic profiles
@@ -103,6 +104,7 @@
 my $GROWTH_PAGE = "/pprof/growth";
 my $CONTENTION_PAGE = "/pprof/contention";
 my $WALL_PAGE = "/pprof/wall(?:\\?.*)?";  # accepts options like namefilter
 my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?";
+my $CENSUSPROFILE_PAGE = "/pprof/censusprofile";  # must support "?seconds=#"
 my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
 my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
@@ -110,7 +112,7 @@
 # All the alternatives must begin with /.
 my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" .
                "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
-               "$FILTEREDPROFILE_PAGE)";
+               "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)";
 
 # default binary name
 my $UNKNOWN_BINARY = "(unknown)";
@@ -148,7 +150,7 @@
pprof [options] <host>:<port>[/<service>]
   The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile,
                         $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall,
-                        or /pprof/filteredprofile.
+                        $CENSUSPROFILE_PAGE, or /pprof/filteredprofile.
   For instance: "pprof http://myserver.com:80$HEAP_PAGE".
   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
pprof --symbols <program>
@@ -180,6 +182,7 @@
Output type:
   --text              Generate text report
   --callgrind         Generate callgrind format to stdout
   --gv                Generate Postscript and display
+  --evince            Generate PDF and display
   --web               Generate SVG and display
   --list=<regexp>     Generate source listing of matching routines
   --disasm=<regexp>   Generate disassembly of matching routines
@@ -304,6 +307,7 @@ sub Init() {
  $main::opt_disasm = "";
  $main::opt_symbols = 0;
  $main::opt_gv = 0;
+ $main::opt_evince = 0;
  $main::opt_web = 0;
  $main::opt_dot = 0;
  $main::opt_ps = 0;
@@ -372,6 +376,7 @@
      "disasm=s"       => \$main::opt_disasm,
      "symbols!"       => \$main::opt_symbols,
      "gv!"            => \$main::opt_gv,
+     "evince!"        => \$main::opt_evince,
      "web!"           => \$main::opt_web,
      "dot!"           => \$main::opt_dot,
      "ps!"            => \$main::opt_ps,
@@ -452,6 +457,7 @@
      ($main::opt_disasm eq '' ? 0 : 1) +
      ($main::opt_symbols == 0 ? 0 : 1) +
      $main::opt_gv +
+     $main::opt_evince +
      $main::opt_web +
      $main::opt_dot +
      $main::opt_ps +
@@ -646,6 +652,8 @@ sub Main() {
   if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
     if ($main::opt_gv) {
       RunGV(TempName($main::next_tmpfile, "ps"), "");
+    } elsif ($main::opt_evince) {
+      RunEvince(TempName($main::next_tmpfile, "pdf"), "");
     } elsif ($main::opt_web) {
       my $tmp = TempName($main::next_tmpfile, "svg");
       RunWeb($tmp);
@@ -708,6 +716,12 @@ sub RunGV {
   }
 }
 
+sub RunEvince {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  system("$EVINCE " . $fname .
+         $bg);
+}
+
 sub RunWeb {
   my $fname = shift;
   print STDERR "Loading web page file:///$fname\n";
@@ -805,6 +819,7 @@ sub InteractiveCommand {
     $main::opt_disasm = 0;
     $main::opt_list = 0;
     $main::opt_gv = 0;
+    $main::opt_evince = 0;
     $main::opt_cum = 0;
 
     if (m/^\s*(text|top)(\d*)\s*(.*)/) {
@@ -878,11 +893,14 @@
       PrintDisassembly($libs, $flat, $cumulative, $routine, $total);
       return 1;
     }
-    if (m/^\s*(gv|web)\s*(.*)/) {
+    if (m/^\s*(gv|web|evince)\s*(.*)/) {
       $main::opt_gv = 0;
+      $main::opt_evince = 0;
       $main::opt_web = 0;
       if ($1 eq "gv") {
         $main::opt_gv = 1;
+      } elsif ($1 eq "evince") {
+        $main::opt_evince = 1;
       } elsif ($1 eq "web") {
         $main::opt_web = 1;
       }
@@ -902,6 +920,8 @@
       if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
         if ($main::opt_gv) {
           RunGV(TempName($main::next_tmpfile, "ps"), " &");
+        } elsif ($main::opt_evince) {
+          RunEvince(TempName($main::next_tmpfile, "pdf"), " &");
         } elsif ($main::opt_web) {
           RunWeb(TempName($main::next_tmpfile, "svg"));
         }
@@ -1685,6 +1705,8 @@ sub PrintDot {
   my $output;
   if ($main::opt_gv) {
     $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps");
+  } elsif ($main::opt_evince) {
+    $output = "| $DOT -Tps2 | $PS2PDF - " . TempName($main::next_tmpfile, "pdf");
   } elsif ($main::opt_ps) {
     $output = "| $DOT -Tps2";
   } elsif ($main::opt_pdf) {
@@ -2955,7 +2977,7 @@ sub FetchDynamicProfile {
     my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout);
     my $cmd = "$fetcher '$url' > '$tmp_profile'";
 
-    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){
      print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n  ${real_profile}\n";
      if ($encourage_patience) {
        print STDERR "Be patient...\n";
@@ -3531,16 +3553,18 @@ sub ReadHeapProfile {
       # The sampling frequency is the rate of a Poisson process.
       # This means that the probability of sampling an allocation of
       # size X with sampling rate Y is 1 - exp(-X/Y)
-      my $ratio;
-      $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-      my $scale_factor;
-      $scale_factor = 1/(1 - exp(-$ratio));
-      $n1 *= $scale_factor;
-      $s1 *= $scale_factor;
-      $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-      $scale_factor = 1/(1 - exp(-$ratio));
-      $n2 *= $scale_factor;
-      $s2 *= $scale_factor;
+      if ($n1 != 0) {
+        my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n1 *= $scale_factor;
+        $s1 *= $scale_factor;
+      }
+      if ($n2 != 0) {
+        my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n2 *= $scale_factor;
+        $s2 *= $scale_factor;
+      }
     } else {
       # Remote-heap version 1
       my $ratio;
@@ -4091,9 +4115,15 @@ sub ExtractSymbols {
   my $symbols = {};
 
-  # Map each PC value to the containing library
-  my %seen = ();
-  foreach my $lib (@{$libs}) {
+  # Map each PC value to the containing library.  To make this faster,
+  # we sort libraries by their starting pc value (highest first), and
+  # advance through the libraries as we advance the pc.  Sometimes the
+  # addresses of libraries may overlap with the addresses of the main
+  # binary, so to make sure the libraries 'win', we iterate over the
+  # libraries in reverse order (which assumes the binary doesn't start
+  # in the middle of a library, which seems a fair assumption).
+  my @pcs = (sort { $a cmp $b } keys(%{$pcset}));  # pcset is 0-extended strings
+  foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) {
     my $libname = $lib->[0];
     my $start = $lib->[1];
     my $finish = $lib->[2];
@@ -4101,12 +4131,21 @@
     # Get list of pcs that belong in this library.
     my $contained = [];
-    foreach my $pc (keys(%{$pcset})) {
-      if (!$seen{$pc} && ($pc ge $start) && ($pc le $finish)) {
-        $seen{$pc} = 1;
-        push(@{$contained}, $pc);
-      }
-    }
+    my ($start_pc_index, $finish_pc_index);
+    # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index].
+    for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0;
+         $finish_pc_index--) {
+      last if $pcs[$finish_pc_index - 1] le $finish;
+    }
+    # Find smallest start_pc_index such that $start <= $pc[$start_pc_index].
+    for ($start_pc_index = $finish_pc_index; $start_pc_index > 0;
+         $start_pc_index--) {
+      last if $pcs[$start_pc_index - 1] lt $start;
+    }
+    # This keeps PC values higher than $pc[$finish_pc_index] in @pcs,
+    # in case there are overlaps in libraries and the main binary.
+    @{$contained} = splice(@pcs, $start_pc_index,
+                           $finish_pc_index - $start_pc_index);
     # Map to symbols
     MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
   }
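This is what speeds up ExtractSymbols: instead of testing every pc against every library (O(pcs x libs)), pprof now sorts the pcs once and carves each library's [start, finish] range out with two binary searches. The same idea rendered in C++ (hypothetical types; the patch itself is Perl):

    #include <algorithm>
    #include <string>
    #include <vector>

    typedef unsigned long long PC;
    struct Lib { std::string name; PC start, finish; };

    static bool ByStartDescending(const Lib& a, const Lib& b) {
      return a.start > b.start;   // highest start first, so libraries "win"
    }

    // For each library, pull the sorted pcs inside [start, finish] out of
    // *pcs with two binary searches; erased pcs can never match a later
    // (lower-addressed) range, mirroring pprof's splice().
    void Bucket(std::vector<PC>* pcs, std::vector<Lib> libs) {
      std::sort(pcs->begin(), pcs->end());
      std::sort(libs.begin(), libs.end(), ByStartDescending);
      for (size_t i = 0; i < libs.size(); ++i) {
        std::vector<PC>::iterator lo =
            std::lower_bound(pcs->begin(), pcs->end(), libs[i].start);
        std::vector<PC>::iterator hi =
            std::upper_bound(lo, pcs->end(), libs[i].finish);
        std::vector<PC> contained(lo, hi);  // would go to MapToSymbols()
        pcs->erase(lo, hi);
      }
    }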
diff --git a/src/profiler.cc b/src/profiler.cc
index 3ac51d4..38fbb93 100644
--- a/src/profiler.cc
+++ b/src/profiler.cc
@@ -111,7 +111,7 @@ class CpuProfiler {
   int           (*filter_)(void*);
   void*         filter_arg_;
 
-  // Opague token returned by the profile handler. To be used when calling
+  // Opaque token returned by the profile handler. To be used when calling
   // ProfileHandlerUnregisterCallback.
   ProfileHandlerToken* prof_handler_token_;
diff --git a/src/system-alloc.cc b/src/system-alloc.cc
index 2505959..e589469 100644
--- a/src/system-alloc.cc
+++ b/src/system-alloc.cc
@@ -46,6 +46,7 @@
 #include <sys/mman.h>
 #endif
 #include <fcntl.h>
+#include "common.h"
 #include "system-alloc.h"
 #include "internal_logging.h"
 #include "base/logging.h"
@@ -73,6 +74,24 @@
 static const bool kDebugMode = false;
 #else
 static const bool kDebugMode = true;
 #endif
 
+// Anonymous namespace to avoid name conflicts on "CheckAddressBits".
+namespace {
+
+// Check that no bit is set at position ADDRESS_BITS or higher.
+template <int ADDRESS_BITS> bool CheckAddressBits(uintptr_t ptr) {
+  return (ptr >> ADDRESS_BITS) == 0;
+}
+
+// Specialize for the bit width of a pointer to avoid undefined shift.
+template <> bool CheckAddressBits<8 * sizeof(void*)>(uintptr_t ptr) {
+  return true;
+}
+
+}  // Anonymous namespace to avoid name conflicts on "CheckAddressBits".
+
+COMPILE_ASSERT(kAddressBits <= 8 * sizeof(void*),
+               address_bits_larger_than_pointer_size);
+
 // Structure for discovering alignment
 union MemoryAligner {
   void* p;
@@ -443,7 +462,16 @@ void* TCMalloc_SystemAlloc(size_t size, size_t *actual_size,
     if (a == NULL) continue;
     if (a->usable_ && !a->failed_) {
       void* result = a->Alloc(size, actual_size, alignment);
-      if (result != NULL) return result;
+      if (result != NULL) {
+        if (actual_size) {
+          CheckAddressBits<kAddressBits>(
+              reinterpret_cast<uintptr_t>(result) + *actual_size - 1);
+        } else {
+          CheckAddressBits<kAddressBits>(
+              reinterpret_cast<uintptr_t>(result) + size - 1);
+        }
+        return result;
+      }
     }
   }
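The template dance above exists because "ptr >> ADDRESS_BITS" is undefined behavior when ADDRESS_BITS equals the bit width of uintptr_t, so the full-width case must be peeled off into a specialization that never shifts. A standalone rendition of the pattern (renamed so it doesn't collide with the patch's function):

    #include <stdint.h>

    template <int ADDRESS_BITS>
    bool FitsInAddressBits(uintptr_t ptr) {
      return (ptr >> ADDRESS_BITS) == 0;  // ADDRESS_BITS < width: well defined
    }

    // Shifting by the full width of the type would be undefined, so the
    // full-width instantiation short-circuits to "always fits".
    template <>
    bool FitsInAddressBits<8 * sizeof(void*)>(uintptr_t ptr) {
      return true;
    }

    // e.g., with kAddressBits == 48 on x86_64:
    //   FitsInAddressBits<48>((uintptr_t)1 << 47)  -> true
    //   FitsInAddressBits<48>((uintptr_t)1 << 48)  -> false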
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 93bdd1d..245407d 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -110,6 +110,7 @@
 #include <errno.h>
 #include <stdarg.h>
 #include <algorithm>
+#include <limits>
 #include <string>
 #include "base/commandlineflags.h"
 #include "base/basictypes.h"               // gets us PRIu64
@@ -136,7 +137,9 @@
 # define WIN32_DO_PATCHING 1
 #endif
 
-using std::max;
+using STL_NAMESPACE::max;
+using STL_NAMESPACE::numeric_limits;
+using STL_NAMESPACE::vector;
 using tcmalloc::AlignmentForSize;
 using tcmalloc::PageHeap;
 using tcmalloc::PageHeapAllocator;
@@ -439,6 +442,52 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
 
   static const double MB = 1048576.0;
 
+  const uint64_t virtual_memory_used = (stats.pageheap.system_bytes
+                                        + stats.metadata_bytes);
+  const uint64_t physical_memory_used = (virtual_memory_used
+                                         - stats.pageheap.unmapped_bytes);
+  const uint64_t bytes_in_use_by_app = (physical_memory_used
+                                        - stats.metadata_bytes
+                                        - stats.pageheap.free_bytes
+                                        - stats.central_bytes
+                                        - stats.transfer_bytes
+                                        - stats.thread_bytes);
+
+  out->printf(
+      "------------------------------------------------\n"
+      "MALLOC:   %12" PRIu64 " (%7.1f MB) Bytes in use by application\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in page heap freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in central cache freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in transfer cache freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in thread cache freelists\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in malloc metadata\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12" PRIu64 " (%7.1f MB) Actual memory used (physical + swap)\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes released to OS (aka unmapped)\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12" PRIu64 " (%7.1f MB) Virtual address space used\n"
+      "MALLOC:\n"
+      "MALLOC:   %12" PRIu64 " Spans in use\n"
+      "MALLOC:   %12" PRIu64 " Thread heaps in use\n"
+      "MALLOC:   %12" PRIu64 " Tcmalloc page size\n"
+      "------------------------------------------------\n"
+      "Call ReleaseFreeMemory() to release freelist memory to the OS"
+      " (via madvise()).\n"
+      "Bytes released to the OS take up virtual address space"
+      " but no physical memory.\n",
+      bytes_in_use_by_app, bytes_in_use_by_app / MB,
+      stats.pageheap.free_bytes, stats.pageheap.free_bytes / MB,
+      stats.central_bytes, stats.central_bytes / MB,
+      stats.transfer_bytes, stats.transfer_bytes / MB,
+      stats.thread_bytes, stats.thread_bytes / MB,
+      stats.metadata_bytes, stats.metadata_bytes / MB,
+      physical_memory_used, physical_memory_used / MB,
+      stats.pageheap.unmapped_bytes, stats.pageheap.unmapped_bytes / MB,
+      virtual_memory_used, virtual_memory_used / MB,
+      uint64_t(Static::span_allocator()->inuse()),
+      uint64_t(ThreadCache::HeapsInUse()),
+      uint64_t(kPageSize));
+
   if (level >= 2) {
     out->printf("------------------------------------------------\n");
     out->printf("Size class breakdown\n");
@@ -464,38 +513,6 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
     out->printf("------------------------------------------------\n");
     DumpSystemAllocatorStats(out);
   }
-
-  const uint64_t bytes_in_use = stats.pageheap.system_bytes
-                                - stats.pageheap.free_bytes
-                                - stats.pageheap.unmapped_bytes
-                                - stats.central_bytes
-                                - stats.transfer_bytes
-                                - stats.thread_bytes;
-
-  out->printf("------------------------------------------------\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Heap size\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes in use by application\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in page heap\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes unmapped in page heap\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in central cache\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in transfer cache\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in thread caches\n"
-              "MALLOC: %12" PRIu64 " Spans in use\n"
-              "MALLOC: %12" PRIu64 " Thread heaps in use\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Metadata allocated\n"
-              "MALLOC: %12" PRIu64 " Tcmalloc page size\n"
-              "------------------------------------------------\n",
-              stats.pageheap.system_bytes, stats.pageheap.system_bytes / MB,
-              bytes_in_use, bytes_in_use / MB,
-              stats.pageheap.free_bytes, stats.pageheap.free_bytes / MB,
-              stats.pageheap.unmapped_bytes, stats.pageheap.unmapped_bytes / MB,
-              stats.central_bytes, stats.central_bytes / MB,
-              stats.transfer_bytes, stats.transfer_bytes / MB,
-              stats.thread_bytes, stats.thread_bytes / MB,
-              uint64_t(Static::span_allocator()->inuse()),
-              uint64_t(ThreadCache::HeapsInUse()),
-              stats.metadata_bytes, stats.metadata_bytes / MB,
-              uint64_t(kPageSize));
 }
 
 static void PrintStats(int level) {
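The reworked table is built so the columns telescope: the application bytes plus the freelists plus metadata add back up to physical memory, and physical plus unmapped gives virtual address space. Made-up numbers to show the arithmetic (illustration only, not patch code):

    #include <assert.h>
    #include <stdint.h>

    int main() {
      // Invented byte counts standing in for the TCMallocStats fields:
      const uint64_t system_bytes   = 100 << 20;  // pageheap.system_bytes
      const uint64_t unmapped_bytes =  10 << 20;  // pageheap.unmapped_bytes
      const uint64_t free_bytes     =  20 << 20;  // pageheap.free_bytes
      const uint64_t central_bytes  =   5 << 20;
      const uint64_t transfer_bytes =   1 << 20;
      const uint64_t thread_bytes   =   4 << 20;
      const uint64_t metadata_bytes =   2 << 20;

      const uint64_t virt = system_bytes + metadata_bytes;
      const uint64_t phys = virt - unmapped_bytes;
      const uint64_t app  = phys - metadata_bytes - free_bytes
                          - central_bytes - transfer_bytes - thread_bytes;

      // The printed lines sum back up, top to bottom:
      assert(app + free_bytes + central_bytes + transfer_bytes
                 + thread_bytes + metadata_bytes == phys);
      assert(phys + unmapped_bytes == virt);
      return 0;
    }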
@@ -609,6 +626,20 @@ class TCMallocImplementation : public MallocExtension {
     }
   }
 
+  // We may print an extra, tcmalloc-specific warning message here.
+  virtual void GetHeapSample(MallocExtensionWriter* writer) {
+    if (FLAGS_tcmalloc_sample_parameter == 0) {
+      const char* const kWarningMsg =
+          "#\n# WARNING: This heap profile does not have any data in it,\n"
+          "# because the application was run with heap sampling turned off.\n"
+          "# To get useful data from GetHeapSample(), you must first\n"
+          "# set the environment variable TCMALLOC_SAMPLE_PARAMETER to a\n"
+          "# positive sampling period, such as 524288.\n#\n";
+      writer->append(kWarningMsg, strlen(kWarningMsg));
+    }
+    MallocExtension::GetHeapSample(writer);
+  }
+
   virtual void** ReadStackTraces(int* sample_period) {
     tcmalloc::StackTraceTable table;
     {
@@ -753,6 +784,99 @@ class TCMallocImplementation : public MallocExtension {
   // unnamed namespace, we need to move the definition below it in the
   // file.
   virtual size_t GetAllocatedSize(void* ptr);
+
+  virtual void GetFreeListSizes(vector<MallocExtension::FreeListInfo>* v) {
+    static const char* kCentralCacheType = "tcmalloc.central";
+    static const char* kTransferCacheType = "tcmalloc.transfer";
+    static const char* kThreadCacheType = "tcmalloc.thread";
+    static const char* kPageHeapType = "tcmalloc.page";
+    static const char* kPageHeapUnmappedType = "tcmalloc.page_unmapped";
+    static const char* kLargeSpanType = "tcmalloc.large";
+    static const char* kLargeUnmappedSpanType = "tcmalloc.large_unmapped";
+
+    v->clear();
+
+    // central class information
+    int64 prev_class_size = 0;
+    for (int cl = 1; cl < kNumClasses; ++cl) {
+      size_t class_size = Static::sizemap()->ByteSizeForClass(cl);
+      MallocExtension::FreeListInfo i;
+      i.min_object_size = prev_class_size + 1;
+      i.max_object_size = class_size;
+      i.total_bytes_free =
+          Static::central_cache()[cl].length() * class_size;
+      i.type = kCentralCacheType;
+      v->push_back(i);
+
+      // transfer cache
+      i.total_bytes_free =
+          Static::central_cache()[cl].tc_length() * class_size;
+      i.type = kTransferCacheType;
+      v->push_back(i);
+
+      prev_class_size = Static::sizemap()->ByteSizeForClass(cl);
+    }
+
+    // Add stats from per-thread heaps
+    uint64_t class_count[kNumClasses];
+    memset(class_count, 0, sizeof(class_count));
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      uint64_t thread_bytes = 0;
+      ThreadCache::GetThreadStats(&thread_bytes, class_count);
+    }
+
+    prev_class_size = 0;
+    for (int cl = 1; cl < kNumClasses; ++cl) {
+      MallocExtension::FreeListInfo i;
+      i.min_object_size = prev_class_size + 1;
+      i.max_object_size = Static::sizemap()->ByteSizeForClass(cl);
+      i.total_bytes_free =
+          class_count[cl] * Static::sizemap()->ByteSizeForClass(cl);
+      i.type = kThreadCacheType;
+      v->push_back(i);
+    }
+
+    // append page heap info
+    int64 page_count_normal[kMaxPages];
+    int64 page_count_returned[kMaxPages];
+    int64 span_count_normal;
+    int64 span_count_returned;
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      Static::pageheap()->GetClassSizes(page_count_normal,
+                                        page_count_returned,
+                                        &span_count_normal,
+                                        &span_count_returned);
+    }
+
+    // spans: mapped
+    MallocExtension::FreeListInfo span_info;
+    span_info.type = kLargeSpanType;
+    span_info.max_object_size = (numeric_limits<size_t>::max)();
+    span_info.min_object_size = kMaxPages << kPageShift;
+    span_info.total_bytes_free = span_count_normal << kPageShift;
+    v->push_back(span_info);
+
+    // spans: unmapped
+    span_info.type = kLargeUnmappedSpanType;
+    span_info.total_bytes_free = span_count_returned << kPageShift;
+    v->push_back(span_info);
+
+    for (int s = 1; s < kMaxPages; s++) {
+      MallocExtension::FreeListInfo i;
+      i.max_object_size = (s << kPageShift);
+      i.min_object_size = ((s - 1) << kPageShift);
+
+      i.type = kPageHeapType;
+      i.total_bytes_free = (s << kPageShift) * page_count_normal[s];
+      v->push_back(i);
+
+      i.type = kPageHeapUnmappedType;
+      i.total_bytes_free = (s << kPageShift) * page_count_returned[s];
+      v->push_back(i);
+    }
+  }
 };
 
 // The constructor allocates an object to ensure that initialization
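With the warning in place, a client that forgets to enable sampling gets a self-describing profile instead of a silently empty one. Grabbing a sample is then a two-liner, since MallocExtensionWriter is a typedef for std::string; run the process with TCMALLOC_SAMPLE_PARAMETER=524288 in its environment to get real data (a minimal sketch):

    #include <stdio.h>
    #include <string>
    #include <google/malloc_extension.h>

    int main() {
      std::string profile;   // MallocExtensionWriter is std::string
      MallocExtension::instance()->GetHeapSample(&profile);
      // "profile" is in heap-profile format: save it and point pprof at it.
      fwrite(profile.data(), 1, profile.size(), stdout);
      return 0;
    }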
diff --git a/src/tests/debugallocation_test.cc b/src/tests/debugallocation_test.cc
index c482187..f10e2dc 100644
--- a/src/tests/debugallocation_test.cc
+++ b/src/tests/debugallocation_test.cc
@@ -259,7 +259,10 @@ TEST(DebugAllocationTest, GetAllocatedSizeTest) {
 }
 
 TEST(DebugAllocationTest, HugeAlloc) {
-  const size_t kTooBig = ~static_cast<size_t>(0);
+  // This must not be a const variable so it doesn't form an
+  // integral-constant-expression which can be *statically* rejected by the
+  // compiler as too large for the allocation.
+  size_t kTooBig = ~static_cast<size_t>(0);
   void* a = NULL;
   char* b = NULL;
@@ -273,8 +276,9 @@
   EXPECT_EQ(NULL, b);
 
   // kAlsoTooBig is small enough not to get caught by debugallocation's check,
-  // but will still fall through to tcmalloc's check.
-  const size_t kAlsoTooBig = kTooBig - 1024;
+  // but will still fall through to tcmalloc's check.  This must also be
+  // a non-const variable.  See kTooBig for more details.
+  size_t kAlsoTooBig = kTooBig - 1024;
 
   a = malloc(kAlsoTooBig);
   EXPECT_EQ(NULL, a);
diff --git a/src/tests/malloc_extension_test.cc b/src/tests/malloc_extension_test.cc
index ef76766..60f4919 100644
--- a/src/tests/malloc_extension_test.cc
+++ b/src/tests/malloc_extension_test.cc
@@ -39,6 +39,8 @@
 #include <google/malloc_extension.h>
 #include <google/malloc_extension_c.h>
+
+using STL_NAMESPACE::vector;
 
 int main(int argc, char** argv) {
   void* a = malloc(1000);
@@ -70,6 +72,30 @@
   ASSERT_LE(MallocExtension_GetAllocatedSize(a), 5000);
   ASSERT_GE(MallocExtension_GetEstimatedAllocatedSize(1000), 1000);
 
+  // test invariant: size of freelist = heap_size - allocated_bytes
+  free(malloc(32000));
+  size_t heap_size = 0;
+  size_t allocated = 0;
+  ASSERT_TRUE(MallocExtension::instance()->GetNumericProperty(
+      "generic.current_allocated_bytes", &allocated));
+  ASSERT_TRUE(MallocExtension::instance()->GetNumericProperty(
+      "generic.heap_size", &heap_size));
+  vector<MallocExtension::FreeListInfo> info;
+  MallocExtension::instance()->GetFreeListSizes(&info);
+
+  ASSERT_GE(info.size(), 0);
+  int64 free_bytes = 0;
+  for (vector<MallocExtension::FreeListInfo>::const_iterator it = info.begin();
+       it != info.end();
+       ++it) {
+    free_bytes += it->total_bytes_free;
+  }
+
+  // don't expect an exact equality since the calls to query the heap
+  // themselves free and allocate memory
+  size_t error = abs((heap_size - allocated) - free_bytes);
+  ASSERT_LT(error, 0.15 * heap_size);
+
   free(a);
   printf("DONE\n");
diff --git a/src/tests/sampling_test.cc b/src/tests/sampling_test.cc
index b75e70e..c1bd693 100644
--- a/src/tests/sampling_test.cc
+++ b/src/tests/sampling_test.cc
@@ -45,6 +45,8 @@
 
 using std::string;
 
+extern "C" void* AllocateAllocate() ATTRIBUTE_NOINLINE;
+
 extern "C" void* AllocateAllocate() {
   // The VLOG's are mostly to discourage inlining
   VLOG(1, "Allocating some more");
diff --git a/src/tests/sampling_test.sh b/src/tests/sampling_test.sh
index 8c96bc1..2a58426 100755
--- a/src/tests/sampling_test.sh
+++ b/src/tests/sampling_test.sh
@@ -81,13 +81,13 @@ mkdir "$OUTDIR" || die "Unable to create $OUTDIR"
 
 echo "Testing heap output..."
 "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap" \
-   | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
+   | grep '[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
   || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap"
 echo "OK"
 
 echo "Testing growth output..."
"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" \ - | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \ + | grep '[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \ || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" echo "OK" diff --git a/src/tests/system-alloc_unittest.cc b/src/tests/system-alloc_unittest.cc index a160a34..da76285 100644 --- a/src/tests/system-alloc_unittest.cc +++ b/src/tests/system-alloc_unittest.cc @@ -38,7 +38,9 @@ #include // another place uintptr_t might be defined #endif #include +#include #include "base/logging.h" +#include "common.h" #include "system-alloc.h" class ArraySysAllocator : public SysAllocator { @@ -98,6 +100,18 @@ static void TestBasicInvoked() { CHECK(a.invoked_); } +#if 0 // could port this to various OSs, but won't bother for now +TEST(AddressBits, CpuVirtualBits) { + // Check that kAddressBits is as least as large as either the number of bits + // in a pointer or as the number of virtual bits handled by the processor. + // To be effective this test must be run on each processor model. + const int kPointerBits = 8 * sizeof(void*); + const int kImplementedVirtualBits = NumImplementedVirtualBits(); + + CHECK_GE(kAddressBits, min(kImplementedVirtualBits, kPointerBits)); +} +#endif + int main(int argc, char** argv) { TestBasicInvoked(); diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc index 522c0d9..c528846 100644 --- a/src/tests/tcmalloc_unittest.cc +++ b/src/tests/tcmalloc_unittest.cc @@ -100,17 +100,12 @@ # define cfree free // don't bother to try to test these obsolete fns # define valloc malloc # define pvalloc malloc -# ifdef PERFTOOLS_NO_ALIGNED_MALLOC -# define _aligned_malloc(size, alignment) malloc(size) -# else -# include // for _aligned_malloc -# endif -# define memalign(alignment, size) _aligned_malloc(size, alignment) -// Assume if we fail, it's because of out-of-memory. -// Note, this isn't a perfect analogue: we don't enforce constraints on "align" +// I'd like to map posix_memalign to _aligned_malloc, but _aligned_malloc +// must be paired with _aligned_free (not normal free), which is too +// invasive a change to how we allocate memory here. So just bail # include -# define posix_memalign(pptr, align, size) \ - ((*(pptr)=_aligned_malloc(size, align)) ? 0 : ENOMEM) +# define memalign(alignment, size) malloc(size) +# define posix_memalign(pptr, align, size) ((*(pptr)=malloc(size)) ? 0 : ENOMEM) #endif // On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old @@ -1033,6 +1028,14 @@ static int RunAllTests(int argc, char** argv) { free(p1); VerifyDeleteHookWasCalled(); + // Windows has _aligned_malloc. Let's test that that's captured too. +#if (defined(_MSC_VER) || defined(__MINGW32__)) && !defined(PERFTOOLS_NO_ALIGNED_MALLOC) + p1 = _aligned_malloc(sizeof(p1) * 2, 64); + VerifyNewHookWasCalled(); + _aligned_free(p1); + VerifyDeleteHookWasCalled(); +#endif + p1 = valloc(60); VerifyNewHookWasCalled(); free(p1); diff --git a/src/windows/config.h b/src/windows/config.h index 6d6f771..0b91031 100644 --- a/src/windows/config.h +++ b/src/windows/config.h @@ -92,7 +92,7 @@ #undef HAVE_LINUX_PTRACE_H /* Define to 1 if you have the header file. */ -#undef HAVE_MALLOC_H +#define HAVE_MALLOC_H 1 /* Define to 1 if you have the header file. 
 #undef HAVE_MEMORY_H
diff --git a/src/windows/patch_functions.cc b/src/windows/patch_functions.cc
index deb841b..fc57c82 100644
--- a/src/windows/patch_functions.cc
+++ b/src/windows/patch_functions.cc
@@ -175,7 +175,7 @@ class LibcInfo {
     kNew, kNewArray, kDelete, kDeleteArray,
     kNewNothrow, kNewArrayNothrow, kDeleteNothrow, kDeleteArrayNothrow,
     // These are windows-only functions from malloc.h
-    k_Msize, k_Expand, k_Aligned_malloc, k_Aligned_free,
+    k_Msize, k_Expand,
     kNumFunctions
   };
@@ -274,12 +274,12 @@ template <int T> class LibcInfoWithPatchFunctions : public LibcInfo {
                                      const std::nothrow_t&) __THROW;
   static size_t Perftools__msize(void *ptr) __THROW;
   static void* Perftools__expand(void *ptr, size_t size) __THROW;
-  static void* Perftools__aligned_malloc(size_t size, size_t alignment) __THROW;
-  static void Perftools__aligned_free(void *ptr) __THROW;
   // malloc.h also defines these functions:
+  //   _aligned_malloc, _aligned_free,
   //   _recalloc, _aligned_offset_malloc, _aligned_realloc, _aligned_recalloc
   //   _aligned_offset_realloc, _aligned_offset_recalloc, _malloca, _freea
   // But they seem pretty obscure, and I'm fine not overriding them for now.
+  // It may be they all call into malloc/free anyway.
 };
 
 // This is a subset of MODULEENTRY32 that we need for patching.
@@ -300,10 +300,19 @@ struct ModuleEntryCopy {
   ModuleEntryCopy(const MODULEINFO& mi) {
     this->modBaseAddr = mi.lpBaseOfDll;
     this->modBaseSize = mi.SizeOfImage;
-    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
-      rgProcAddresses[i] = (GenericFnPtr)::GetProcAddress(
+    LPVOID modEndAddr = (char*)mi.lpBaseOfDll + mi.SizeOfImage;
+    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++) {
+      FARPROC target = ::GetProcAddress(
           reinterpret_cast<const HMODULE>(mi.lpBaseOfDll),
           LibcInfo::function_name(i));
+      // Sometimes a DLL forwards a function to a function in another
+      // DLL.  We don't want to patch those forwarded functions --
+      // they'll get patched when the other DLL is processed.
+      if (target >= modBaseAddr && target < modEndAddr)
+        rgProcAddresses[i] = (GenericFnPtr)target;
+      else
+        rgProcAddresses[i] = (GenericFnPtr)NULL;
+    }
   }
 };
@@ -390,7 +399,7 @@ const char* const LibcInfo::function_name_[] = {
   NULL,  // kMangledNewArrayNothrow,
   NULL,  // kMangledDeleteNothrow,
   NULL,  // kMangledDeleteArrayNothrow,
-  "_msize", "_expand", "_aligned_malloc", "_aligned_free",
+  "_msize", "_expand",
 };
 
 // For mingw, I can't patch the new/delete here, because the
@@ -421,14 +430,6 @@ const GenericFnPtr LibcInfo::static_fn_[] = {
 #endif
   (GenericFnPtr)&::_msize,
   (GenericFnPtr)&::_expand,
-#ifdef PERFTOOLS_NO_ALIGNED_MALLOC   // for older versions of mingw
-  // _aligned_malloc isn't always available in mingw, so don't try to patch.
-  (GenericFnPtr)NULL,
-  (GenericFnPtr)NULL,
-#else
-  (GenericFnPtr)&::_aligned_malloc,
-  (GenericFnPtr)&::_aligned_free,
-#endif
 };
 
 template <int T> GenericFnPtr LibcInfoWithPatchFunctions<T>::origstub_fn_[] = {
@@ -451,8 +452,6 @@ const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = {
   (GenericFnPtr)&Perftools_deletearray_nothrow,
   (GenericFnPtr)&Perftools__msize,
   (GenericFnPtr)&Perftools__expand,
-  (GenericFnPtr)&Perftools__aligned_malloc,
-  (GenericFnPtr)&Perftools__aligned_free,
 };
 
 /*static*/ WindowsInfo::FunctionInfo WindowsInfo::function_info_[] = {
@@ -908,21 +907,6 @@ void* LibcInfoWithPatchFunctions<T>::Perftools__expand(void *ptr,
   return NULL;
 }
 
-template <int T>
-void* LibcInfoWithPatchFunctions<T>::Perftools__aligned_malloc(size_t size,
-                                                               size_t alignment)
-    __THROW {
-  void* result = do_memalign_or_cpp_memalign(alignment, size);
-  MallocHook::InvokeNewHook(result, size);
-  return result;
-}
-
-template <int T>
-void LibcInfoWithPatchFunctions<T>::Perftools__aligned_free(void *ptr) __THROW {
-  MallocHook::InvokeDeleteHook(ptr);
-  do_free_with_callback(ptr, (void (*)(void*))origstub_fn_[k_Aligned_free]);
-}
-
 LPVOID WINAPI WindowsInfo::Perftools_HeapAlloc(HANDLE hHeap, DWORD dwFlags,
                                                DWORD_PTR dwBytes) {
   LPVOID result = ((LPVOID (WINAPI *)(HANDLE, DWORD, DWORD_PTR))
diff --git a/src/windows/port.cc b/src/windows/port.cc
index d62fa9d..32f3c31 100644
--- a/src/windows/port.cc
+++ b/src/windows/port.cc
@@ -83,6 +83,18 @@ extern "C" PERFTOOLS_DLL_DECL void* __sbrk(ptrdiff_t increment) {
   return NULL;
 }
 
+// We need to write to 'stderr' without having windows allocate memory.
+// The safest way is via a low-level call like WriteConsoleA().  But
+// even then we need to be sure to print in small bursts so as to not
+// require memory allocation.
+extern "C" PERFTOOLS_DLL_DECL void WriteToStderr(const char* buf, int len) {
+  // Looks like windows allocates for writes of >80 bytes
+  for (int i = 0; i < len; i += 80) {
+    write(STDERR_FILENO, buf + i, std::min(80, len - i));
+  }
+}
+
+
 // -----------------------------------------------------------------------
 // Threads code
diff --git a/src/windows/port.h b/src/windows/port.h
index 66745d1..81a68e6 100644
--- a/src/windows/port.h
+++ b/src/windows/port.h
@@ -277,6 +277,8 @@ enum { STDIN_FILENO = 0, STDOUT_FILENO = 1, STDERR_FILENO = 2 };
 #define O_RDONLY  _O_RDONLY
 #endif
 
+extern "C" PERFTOOLS_DLL_DECL void WriteToStderr(const char* buf, int len);
+
 // ----------------------------------- SYSTEM/PROCESS
 typedef int pid_t;
 #define getpid  _getpid
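A standalone sketch of the chunked-write idea above, for a context where the heap must not be touched (say, inside a malloc hook); on Windows the STDERR_FILENO and write() equivalents come from the port header:

    #include <string.h>
    #include <unistd.h>   // write(); windows/port.h provides the equivalent

    static void ChunkedWriteToStderr(const char* buf, int len) {
      // Keep each write at or under 80 bytes so the runtime never buffers
      // (and therefore never allocates) on our behalf.
      for (int i = 0; i < len; i += 80) {
        int n = len - i;
        if (n > 80) n = 80;
        write(2 /* STDERR_FILENO */, buf + i, n);
      }
    }

    int main() {
      const char kMsg[] = "tcmalloc: reporting without touching the heap\n";
      ChunkedWriteToStderr(kMsg, (int)(sizeof(kMsg) - 1));
      return 0;
    }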