diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/base/low_level_alloc.cc | 89 | ||||
-rw-r--r-- | src/base/low_level_alloc.h | 22 | ||||
-rw-r--r-- | src/google/malloc_hook.h | 6 | ||||
-rw-r--r-- | src/malloc_hook.cc | 28 | ||||
-rwxr-xr-x | src/pprof | 7 | ||||
-rw-r--r-- | src/stacktrace_x86-inl.h | 5 | ||||
-rw-r--r-- | src/tcmalloc.cc | 4 | ||||
-rwxr-xr-x | src/tests/sampling_test.sh | 16 | ||||
-rw-r--r-- | src/thread_cache.cc | 6 | ||||
-rw-r--r-- | src/windows/patch_functions.cc | 264 |
10 files changed, 333 insertions, 114 deletions
diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc index 900ae4e..2bbce54 100644 --- a/src/base/low_level_alloc.cc +++ b/src/base/low_level_alloc.cc @@ -67,7 +67,7 @@ namespace { // first. Valid in both allocated and unallocated blocks intptr_t magic; // kMagicAllocated or kMagicUnallocated xor this LowLevelAlloc::Arena *arena; // pointer to parent arena - void *dummy_for_alignment; // aligns regions to 0 mod 2*sizeof(void*) + void *dummy_for_alignment; // aligns regions to 0 mod 2*sizeof(void*) } header; // Next two fields: in unallocated blocks: freelist skiplist data @@ -197,15 +197,57 @@ struct LowLevelAlloc::Arena { // pointer. static struct LowLevelAlloc::Arena default_arena; -// A non-malloc-hooked arena: used only to allocate metadata for arenas that +// Non-malloc-hooked arenas: used only to allocate metadata for arenas that // do not want malloc hook reporting, so that for them there's no malloc hook // reporting even during arena creation. static struct LowLevelAlloc::Arena unhooked_arena; +static struct LowLevelAlloc::Arena unhooked_async_sig_safe_arena; // magic numbers to identify allocated and unallocated blocks static const intptr_t kMagicAllocated = 0x4c833e95; static const intptr_t kMagicUnallocated = ~kMagicAllocated; +namespace { + class ArenaLock { + public: + explicit ArenaLock(LowLevelAlloc::Arena *arena) : + left_(false), mask_valid_(false), arena_(arena) { + if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) { + // We've decided not to support async-signal-safe arena use until + // there a demonstrated need. Here's how one could do it though + // (would need to be made more portable). +#if 0 + sigset_t all; + sigfillset(&all); + this->mask_valid_ = + (pthread_sigmask(SIG_BLOCK, &all, &this->mask_) == 0); +#else + RAW_CHECK(false, "We do not yet support async-signal-safe arena."); +#endif + } + this->arena_->mu.Lock(); + } + ~ArenaLock() { RAW_CHECK(this->left_, "haven't left Arena region"); } + void Leave() { + this->arena_->mu.Unlock(); +#if 0 + if (this->mask_valid_) { + pthread_sigmask(SIG_SETMASK, &this->mask_, 0); + } +#endif + this->left_ = true; + } + private: + bool left_; // whether left region + bool mask_valid_; +#if 0 + sigset_t mask_; // old mask of blocked signals +#endif + LowLevelAlloc::Arena *arena_; + DISALLOW_COPY_AND_ASSIGN(ArenaLock); + }; +} // anonymous namespace + // create an appropriate magic number for an object at "ptr" // "magic" should be kMagicAllocated or kMagicUnallocated inline static intptr_t Magic(intptr_t magic, AllocList::Header *ptr) { @@ -235,6 +277,8 @@ static void ArenaInit(LowLevelAlloc::Arena *arena) { // Default arena should be hooked, e.g. for heap-checker to trace // pointer chains through objects in the default arena. arena->flags = LowLevelAlloc::kCallMallocHook; + } else if (arena == &unhooked_async_sig_safe_arena) { + arena->flags = LowLevelAlloc::kAsyncSignalSafe; } else { arena->flags = 0; // other arenas' flags may be overridden by client, // but unhooked_arena will have 0 in 'flags'. @@ -246,9 +290,12 @@ static void ArenaInit(LowLevelAlloc::Arena *arena) { LowLevelAlloc::Arena *LowLevelAlloc::NewArena(int32 flags, Arena *meta_data_arena) { RAW_CHECK(meta_data_arena != 0, "must pass a valid arena"); - if (meta_data_arena == &default_arena && - (flags & LowLevelAlloc::kCallMallocHook) == 0) { - meta_data_arena = &unhooked_arena; + if (meta_data_arena == &default_arena) { + if ((flags & LowLevelAlloc::kAsyncSignalSafe) != 0) { + meta_data_arena = &unhooked_async_sig_safe_arena; + } else if ((flags & LowLevelAlloc::kCallMallocHook) == 0) { + meta_data_arena = &unhooked_arena; + } } // Arena(0) uses the constructor for non-static contexts Arena *result = @@ -262,9 +309,9 @@ LowLevelAlloc::Arena *LowLevelAlloc::NewArena(int32 flags, bool LowLevelAlloc::DeleteArena(Arena *arena) { RAW_CHECK(arena != 0 && arena != &default_arena && arena != &unhooked_arena, "may not delete default arena"); - arena->mu.Lock(); + ArenaLock section(arena); bool empty = (arena->allocation_count == 0); - arena->mu.Unlock(); + section.Leave(); if (empty) { while (arena->freelist.next[0] != 0) { AllocList *region = arena->freelist.next[0]; @@ -279,7 +326,13 @@ bool LowLevelAlloc::DeleteArena(Arena *arena) { "empty arena has non-page-aligned block size"); RAW_CHECK(reinterpret_cast<intptr_t>(region) % arena->pagesize == 0, "empty arena has non-page-aligned block"); - RAW_CHECK(munmap(region, size) == 0, + int munmap_result; + if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) == 0) { + munmap_result = munmap(region, size); + } else { + munmap_result = MallocHook::UnhookedMUnmap(region, size); + } + RAW_CHECK(munmap_result == 0, "LowLevelAlloc::DeleteArena: munmap failed address"); } Free(arena); @@ -363,21 +416,21 @@ void LowLevelAlloc::Free(void *v) { if ((arena->flags & kCallMallocHook) != 0) { MallocHook::InvokeDeleteHook(v); } - arena->mu.Lock(); + ArenaLock section(arena); AddToFreelist(v, arena); RAW_CHECK(arena->allocation_count > 0, "nothing in arena to free"); arena->allocation_count--; - arena->mu.Unlock(); + section.Leave(); } } // allocates and returns a block of size bytes, to be freed with Free() // L < arena->mu -void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) { +static void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) { void *result = 0; if (request != 0) { AllocList *s; // will point to region that satisfies request - arena->mu.Lock(); + ArenaLock section(arena); ArenaInit(arena); // round up with header size_t req_rnd = RoundUp(request + sizeof (s->header), arena->roundup); @@ -399,8 +452,14 @@ void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) { // mmap generous 64K chunks to decrease // the chances/impact of fragmentation: size_t new_pages_size = RoundUp(req_rnd, arena->pagesize * 16); - void *new_pages = mmap(0, new_pages_size, - PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + void *new_pages; + if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) { + new_pages = MallocHook::UnhookedMMap(0, new_pages_size, + PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + } else { + new_pages = mmap(0, new_pages_size, + PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + } RAW_CHECK(new_pages != MAP_FAILED, "mmap error"); arena->mu.Lock(); s = reinterpret_cast<AllocList *>(new_pages); @@ -425,7 +484,7 @@ void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) { s->header.magic = Magic(kMagicAllocated, &s->header); RAW_CHECK(s->header.arena == arena, ""); arena->allocation_count++; - arena->mu.Unlock(); + section.Leave(); result = &s->levels; } ANNOTATE_NEW_MEMORY(result, request); diff --git a/src/base/low_level_alloc.h b/src/base/low_level_alloc.h index 0df1298..393b3d2 100644 --- a/src/base/low_level_alloc.h +++ b/src/base/low_level_alloc.h @@ -72,14 +72,20 @@ class LowLevelAlloc { // meta_data_arena; the DefaultArena() can be passed for meta_data_arena. // These values may be ored into flags: enum { - // Calls to Alloc() and Free() will be reported - // via the MallocHook interface. - // The DefaultArena() has this flag on. - // NewArena(flags, DefaultArena()) w/o this bit in 'flags', - // on the other hand, will not cause any MallocHook - // calls even during the NewArena call itself - // (it will in fact use meta_data_arena different from DefaultArena()). - kCallMallocHook = 0x0001 + // Report calls to Alloc() and Free() via the MallocHook interface. + // Set in the DefaultArena. + kCallMallocHook = 0x0001, + + // Make calls to Alloc(), Free() be async-signal-safe. Not set in + // DefaultArena(). + kAsyncSignalSafe = 0x0002, + + // When used with DefaultArena(), the NewArena() and DeleteArena() calls + // obey the flags given explicitly in the NewArena() call, even if those + // flags differ from the settings in DefaultArena(). So the call + // NewArena(kAsyncSignalSafe, DefaultArena()) is itself async-signal-safe, + // as well as generatating an arena that provides async-signal-safe + // Alloc/Free. }; static Arena *NewArena(int32 flags, Arena *meta_data_arena); diff --git a/src/google/malloc_hook.h b/src/google/malloc_hook.h index f2713e9..48d92da 100644 --- a/src/google/malloc_hook.h +++ b/src/google/malloc_hook.h @@ -191,6 +191,12 @@ class PERFTOOLS_DLL_DECL MallocHook { int skip_count) { return MallocHook_GetCallerStackTrace(result, max_depth, skip_count); } + + // Unhooked versions of mmap() and munmap(). These should be used + // only by experts, since they bypass heapchecking, etc. + static void* UnhookedMMap(void *start, size_t length, int prot, int flags, + int fd, off_t offset); + static int UnhookedMUnmap(void *start, size_t length); }; #endif /* _MALLOC_HOOK_H_ */ diff --git a/src/malloc_hook.cc b/src/malloc_hook.cc index d1ad12a..2a7f542 100644 --- a/src/malloc_hook.cc +++ b/src/malloc_hook.cc @@ -423,7 +423,7 @@ static inline void* do_mmap64(void *start, size_t length, return result; } -# endif +# endif // defined(__x86_64__) // We use do_mmap64 abstraction to put MallocHook::InvokeMmapHook // calls right into mmap and mmap64, so that the stack frames in the caller's @@ -472,7 +472,7 @@ extern "C" void* mmap(void *start, size_t length, int prot, int flags, return result; } -#endif +#endif // !defined(__USE_FILE_OFFSET64) || !defined(__REDIRECT_NTH) extern "C" int munmap(void* start, size_t length) __THROW { MallocHook::InvokeMunmapHook(start, length); @@ -501,4 +501,26 @@ extern "C" void* sbrk(ptrdiff_t increment) __THROW { return result; } -#endif +/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot, + int flags, int fd, off_t offset) { + return do_mmap64(start, length, prot, flags, fd, offset); +} + +/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) { + return sys_munmap(start, length); +} + +#else // defined(__linux) && + // (defined(__i386__) || defined(__x86_64__) || defined(__PPC__)) + +/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot, + int flags, int fd, off_t offset) { + return mmap(start, length, prot, flags, fd, offset); +} + +/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) { + return munmap(start, length); +} + +#endif // defined(__linux) && + // (defined(__i386__) || defined(__x86_64__) || defined(__PPC__)) @@ -1977,7 +1977,12 @@ sub RemoveUninterestingFrames { '__builtin_vec_delete', '__builtin_vec_new', 'operator new', - 'operator new[]') { + 'operator new[]', + # These mark the beginning/end of our custom sections + '__start_google_malloc', + '__stop_google_malloc', + '__start_malloc_hook', + '__stop_malloc_hook') { $skip{$name} = 1; $skip{"_" . $name} = 1; # Mach (OS X) adds a _ prefix to everything } diff --git a/src/stacktrace_x86-inl.h b/src/stacktrace_x86-inl.h index 5650c86..05701e7 100644 --- a/src/stacktrace_x86-inl.h +++ b/src/stacktrace_x86-inl.h @@ -49,6 +49,11 @@ #elif defined(HAVE_UCONTEXT_H) #include <ucontext.h> // for ucontext_t #elif defined(HAVE_CYGWIN_SIGNAL_H) +// cygwin/signal.h has a buglet where it uses pthread_attr_t without +// #including <pthread.h> itself. So we have to do it. +# ifdef HAVE_PTHREAD +# include <pthread.h> +# endif #include <cygwin/signal.h> typedef ucontext ucontext_t; #endif diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc index 70dd8c6..625301e 100644 --- a/src/tcmalloc.cc +++ b/src/tcmalloc.cc @@ -762,9 +762,9 @@ TCMallocGuard::TCMallocGuard() { // patch the windows VirtualAlloc, etc. PatchWindowsFunctions(); // defined in windows/patch_functions.cc #endif - free(malloc(1)); + tc_free(tc_malloc(1)); ThreadCache::InitTSD(); - free(malloc(1)); + tc_free(tc_malloc(1)); MallocExtension::Register(new TCMallocImplementation); } } diff --git a/src/tests/sampling_test.sh b/src/tests/sampling_test.sh index 149d27b..8c96bc1 100755 --- a/src/tests/sampling_test.sh +++ b/src/tests/sampling_test.sh @@ -62,10 +62,10 @@ if ! cat "$SAMPLING_TEST_BINARY" >/dev/null 2>&1; then SAMPLING_TEST_BINARY="$SAMPLING_TEST_BINARY".exe fi -die() { - echo "FAILED" - echo "reason:" - echo "$@" +die() { # runs the command given as arguments, and then dies. + echo "FAILED. Output from $@" + echo "----" + "$@" echo "----" exit 1 } @@ -79,16 +79,16 @@ mkdir "$OUTDIR" || die "Unable to create $OUTDIR" # 50M to 99M. "$SAMPLING_TEST" "$OUTDIR/out" -echo -n "Testing heap output..." +echo "Testing heap output..." "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap" \ | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \ - || die `"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap"` + || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap" echo "OK" -echo -n "Testing growth output..." +echo "Testing growth output..." "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" \ | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \ - || die `"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth"` + || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" echo "OK" echo "PASS" diff --git a/src/thread_cache.cc b/src/thread_cache.cc index fd44a70..64f4deb 100644 --- a/src/thread_cache.cc +++ b/src/thread_cache.cc @@ -299,12 +299,6 @@ int ThreadCache::GetSamplePeriod() { } void ThreadCache::InitModule() { - // There is a slight potential race here because of double-checked - // locking idiom. However, as long as the program does a small - // allocation before switching to multi-threaded mode, we will be - // fine. We increase the chances of doing such a small allocation - // by doing one in the constructor of the module_enter_exit_hook - // object declared below. SpinLockHolder h(Static::pageheap_lock()); if (!phinited) { Static::InitStaticVars(); diff --git a/src/windows/patch_functions.cc b/src/windows/patch_functions.cc index 992161f..d13b7b8 100644 --- a/src/windows/patch_functions.cc +++ b/src/windows/patch_functions.cc @@ -72,6 +72,11 @@ #error This file is intended for patching allocators - use override_functions.cc instead. #endif +// We use psapi. Non-MSVC systems will have to link this in themselves. +#ifdef _MSC_VER +#pragma comment(lib, "Psapi.lib") +#endif + // Make sure we always use the 'old' names of the psapi functions. #ifndef PSAPI_VERSION #define PSAPI_VERSION 1 @@ -80,6 +85,8 @@ #include <windows.h> #include <malloc.h> // for _msize and _expand #include <Psapi.h> // for EnumProcessModules, GetModuleInformation, etc. +#include <set> +#include <map> #include <vector> #include <base/logging.h> #include "base/spinlock.h" @@ -122,29 +129,7 @@ typedef void (*GenericFnPtr)(); using sidestep::PreamblePatcher; -// This is a subset of MODDULEENTRY32, that we need for patching -struct ModuleEntryCopy { - LPVOID modBaseAddr; - DWORD modBaseSize; - HMODULE hModule; - TCHAR szModule[kMaxModuleNameSize]; - - ModuleEntryCopy() { - modBaseAddr = NULL; - modBaseSize = 0; - hModule = NULL; - strcpy(szModule, "<executable>"); - } - ModuleEntryCopy(HANDLE hprocess, HMODULE hmodule, const MODULEINFO& mi) { - this->modBaseAddr = mi.lpBaseOfDll; - this->modBaseSize = mi.SizeOfImage; - this->hModule = hmodule; - // TODO(csilvers): we could make more efficient by calling this - // lazily (not until this->szModule is needed, which is often never). - GetModuleBaseNameA(hprocess, hmodule, - this->szModule, sizeof(this->szModule)); - } -}; +struct ModuleEntryCopy; // defined below // These functions are how we override the memory allocation // functions, just like tcmalloc.cc and malloc_hook.cc do. @@ -160,33 +145,19 @@ class LibcInfo { LibcInfo() { memset(this, 0, sizeof(*this)); // easiest way to initialize the array } - bool SameAs(const LibcInfo& that) const { - return (is_valid() && - module_base_address_ == that.module_base_address_ && - module_base_size_ == that.module_base_size_); - } - bool SameAsME32(const ModuleEntryCopy& me32) const { - return (is_valid() && - module_base_address_ == me32.modBaseAddr && - module_base_size_ == me32.modBaseSize); - } + bool SameAs(const LibcInfo& that) const; + bool SameAsModuleEntry(const ModuleEntryCopy& module_entry) const; + bool patched() const { return is_valid() && module_name_[0] != '\0'; } const char* module_name() const { return is_valid() ? module_name_ : ""; } void set_is_valid(bool b) { is_valid_ = b; } - // These shouldn't have to be public, since only subclasses of - // LibcInfo need it, but they do. Maybe something to do with - // templates. Shrug. - bool is_valid() const { return is_valid_; } - GenericFnPtr windows_fn(int ifunction) const { - return windows_fn_[ifunction]; - } - // Populates all the windows_fn_[] vars based on our module info. // Returns false if windows_fn_ is all NULL's, because there's - // nothing to patch. Also populates the me32 info. - bool PopulateWindowsFn(const ModuleEntryCopy& me32); + // nothing to patch. Also populates the rest of the module_entry + // info, such as the module's name. + bool PopulateWindowsFn(const ModuleEntryCopy& module_entry); protected: void CopyFrom(const LibcInfo& that) { @@ -237,6 +208,24 @@ class LibcInfo { const void *module_base_address_; size_t module_base_size_; char module_name_[kMaxModuleNameSize]; + + public: + // These shouldn't have to be public, since only subclasses of + // LibcInfo need it, but they do. Maybe something to do with + // templates. Shrug. I hide them down here so users won't see + // them. :-) (OK, I also need to define ctrgProcAddress late.) + bool is_valid() const { return is_valid_; } + GenericFnPtr windows_fn(int ifunction) const { + return windows_fn_[ifunction]; + } + // These three are needed by ModuleEntryCopy. + static const int ctrgProcAddress = kNumFunctions; + static GenericFnPtr static_fn(int ifunction) { + return static_fn_[ifunction]; + } + static const char* const function_name(int ifunction) { + return function_name_[ifunction]; + } }; // Template trickiness: logically, a LibcInfo would include @@ -294,6 +283,43 @@ template<int> class LibcInfoWithPatchFunctions : public LibcInfo { // But they seem pretty obscure, and I'm fine not overriding them for now. }; +// This is a subset of MODDULEENTRY32, that we need for patching. +struct ModuleEntryCopy { + LPVOID modBaseAddr; + DWORD modBaseSize; + HMODULE hModule; + TCHAR szModule[kMaxModuleNameSize]; + // This is not part of MODDULEENTRY32, but is needed to avoid making + // windows syscalls while we're holding patch_all_modules_lock (see + // lock-inversion comments at patch_all_modules_lock definition, below). + GenericFnPtr rgProcAddresses[LibcInfo::ctrgProcAddress]; + + ModuleEntryCopy() { + modBaseAddr = NULL; + modBaseSize = 0; + hModule = NULL; + strcpy(szModule, "<executable>"); + for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++) + rgProcAddresses[i] = LibcInfo::static_fn(i); + } + ModuleEntryCopy(HANDLE hprocess, HMODULE hmodule, const MODULEINFO& mi) { + this->modBaseAddr = mi.lpBaseOfDll; + this->modBaseSize = mi.SizeOfImage; + this->hModule = hmodule; + // TODO(csilvers): we could make more efficient by calling these + // lazily (not until the vars are needed, which is often never). + // However, there's tricky business with calling windows functions + // inside the patch_all_modules_lock (see the lock inversion + // comments with the patch_all_modules_lock definition, below), so + // it's safest to do it all here, where no lock is needed. + ::GetModuleBaseNameA(hprocess, hmodule, + this->szModule, sizeof(this->szModule)); + for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++) + rgProcAddresses[i] = + (GenericFnPtr)::GetProcAddress(hModule, LibcInfo::function_name(i)); + } +}; + // This class is easier because there's only one of them. class WindowsInfo { public: @@ -347,6 +373,11 @@ class WindowsInfo { // If you run out, just add a few more to the array. You'll also need // to update the switch statement in PatchOneModule(), and the list in // UnpatchWindowsFunctions(). +// main_executable and main_executable_windows are two windows into +// the same executable. One is responsible for patching the libc +// routines that live in the main executable (if any) to use tcmalloc; +// the other is responsible for patching the windows routines like +// HeapAlloc/etc to use tcmalloc. static LibcInfoWithPatchFunctions<0> main_executable; static LibcInfoWithPatchFunctions<1> libc1; static LibcInfoWithPatchFunctions<2> libc2; @@ -448,20 +479,28 @@ const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = { { "FreeLibrary", NULL, NULL, (GenericFnPtr)&Perftools_FreeLibrary }, }; -bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& me32) { +bool LibcInfo::SameAs(const LibcInfo& that) const { + return (is_valid() && + module_base_address_ == that.module_base_address_ && + module_base_size_ == that.module_base_size_); +} + +bool LibcInfo::SameAsModuleEntry(const ModuleEntryCopy& module_entry) const { + return (is_valid() && + module_base_address_ == module_entry.modBaseAddr && + module_base_size_ == module_entry.modBaseSize); +} + +bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) { // First, store the location of the function to patch before // patching it. If none of these functions are found in the module, // then this module has no libc in it, and we just return false. for (int i = 0; i < kNumFunctions; i++) { if (!function_name_[i]) // we can turn off patching by unsetting name continue; - GenericFnPtr fn = NULL; - if (me32.hModule == NULL) { // used for the main executable - // This is used only for a statically-linked-in libc. - fn = static_fn_[i]; - } else { - fn = (GenericFnPtr)::GetProcAddress(me32.hModule, function_name_[i]); - } + // The ::GetProcAddress calls were done in the ModuleEntryCopy + // constructor, so we don't have to make any windows calls here. + const GenericFnPtr fn = module_entry.rgProcAddresses[i]; if (fn) { windows_fn_[i] = PreamblePatcher::ResolveTarget(fn); } @@ -514,15 +553,15 @@ bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& me32) { CHECK(windows_fn_[kRealloc]); // OK, we successfully patched. Let's store our member information. - module_base_address_ = me32.modBaseAddr; - module_base_size_ = me32.modBaseSize; - strcpy(module_name_, me32.szModule); + module_base_address_ = module_entry.modBaseAddr; + module_base_size_ = module_entry.modBaseSize; + strcpy(module_name_, module_entry.szModule); return true; } template<int T> bool LibcInfoWithPatchFunctions<T>::Patch(const LibcInfo& me_info) { - CopyFrom(me_info); // copies the me32 and the windows_fn_ array + CopyFrom(me_info); // copies the module_entry and the windows_fn_ array for (int i = 0; i < kNumFunctions; i++) { if (windows_fn_[i] && windows_fn_[i] != perftools_fn_[i]) { // if origstub_fn_ is not NULL, it's left around from a previous @@ -530,7 +569,10 @@ bool LibcInfoWithPatchFunctions<T>::Patch(const LibcInfo& me_info) { // Since we've patched Unpatch() not to delete origstub_fn_ (it // causes problems in some contexts, though obviously not this // one), we should delete it now, before setting it to NULL. - delete[] reinterpret_cast<char*>(origstub_fn_[i]); + // NOTE: casting from a function to a pointer is contra the C++ + // spec. It's not safe on IA64, but is on i386. We use + // a C-style cast here to emphasize this is not legal C++. + delete[] (char*)(origstub_fn_[i]); origstub_fn_[i] = NULL; // Patch() will fill this in CHECK_EQ(sidestep::SIDESTEP_SUCCESS, PreamblePatcher::Patch(windows_fn_[i], perftools_fn_[i], @@ -569,7 +611,10 @@ void WindowsInfo::Patch() { // Since we've patched Unpatch() not to delete origstub_fn_ (it // causes problems in some contexts, though obviously not this // one), we should delete it now, before setting it to NULL. - delete[] reinterpret_cast<char*>(function_info_[i].origstub_fn); + // NOTE: casting from a function to a pointer is contra the C++ + // spec. It's not safe on IA64, but is on i386. We use + // a C-style cast here to emphasize this is not legal C++. + delete[] (char*)(function_info_[i].origstub_fn); function_info_[i].origstub_fn = NULL; // Patch() will fill this in CHECK_EQ(sidestep::SIDESTEP_SUCCESS, PreamblePatcher::Patch(function_info_[i].windows_fn, @@ -594,7 +639,7 @@ void PatchOneModuleLocked(const LibcInfo& me_info) { // Double-check we haven't seen this module before. for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) { if (g_module_libcs[i]->SameAs(me_info)) { - fprintf(stderr, "%s:%d: FATAL ERROR: %s double-patched somehow.\n", + fprintf(stderr, "%s:%d: FATAL PERFTOOLS ERROR: %s double-patched somehow.\n", __FILE__, __LINE__, g_module_libcs[i]->module_name()); CHECK(false); } @@ -617,34 +662,53 @@ void PatchOneModuleLocked(const LibcInfo& me_info) { } } } - printf("ERROR: Too many modules containing libc in this executable\n"); + printf("PERFTOOLS ERROR: Too many modules containing libc in this executable\n"); } void PatchMainExecutableLocked() { if (main_executable.patched()) return; // main executable has already been patched - ModuleEntryCopy fake_me32; // we make a fake one to pass into Patch() - main_executable.PopulateWindowsFn(fake_me32); + ModuleEntryCopy fake_module_entry; // make a fake one to pass into Patch() + // No need to call PopulateModuleEntryProcAddresses on the main executable. + main_executable.PopulateWindowsFn(fake_module_entry); main_executable.Patch(main_executable); } +// This lock is subject to a subtle and annoying lock inversion +// problem: it may interact badly with unknown internal windows locks. +// In particular, windows may be holding a lock when it calls +// LoadLibraryExW and FreeLibrary, which we've patched. We have those +// routines call PatchAllModules, which acquires this lock. If we +// make windows system calls while holding this lock, those system +// calls may need the internal windows locks that are being held in +// the call to LoadLibraryExW, resulting in deadlock. The solution is +// to be very careful not to call *any* windows routines while holding +// patch_all_modules_lock, inside PatchAllModules(). static SpinLock patch_all_modules_lock(SpinLock::LINKER_INITIALIZED); // Iterates over all the modules currently loaded by the executable, // and makes sure they're all patched. For ones that aren't, we patch // them in. We also check that every module we had patched in the // past is still loaded, and update internal data structures if so. -void PatchAllModules() { +// We return true if this PatchAllModules did any work, false else. +bool PatchAllModules() { std::vector<ModuleEntryCopy> modules; + bool made_changes = false; const HANDLE hCurrentProcess = GetCurrentProcess(); MODULEINFO mi; DWORD cbNeeded = 0; HMODULE hModules[kMaxModules]; // max # of modules we support in one process - if (EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules), - &cbNeeded)) { + if (::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules), + &cbNeeded)) { for (int i = 0; i < cbNeeded / sizeof(*hModules); ++i) { - if (GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi))) + if (i >= kMaxModules) { + printf("PERFTOOLS ERROR: Too many modules in this executable to try" + " to patch them all (if you need to, raise kMaxModules in" + " patch_functions.cc).\n"); + break; + } + if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi))) modules.push_back(ModuleEntryCopy(hCurrentProcess, hModules[i], mi)); } } @@ -658,7 +722,7 @@ void PatchAllModules() { bool still_loaded = false; for (std::vector<ModuleEntryCopy>::iterator it = modules.begin(); it != modules.end(); ++it) { - if (g_module_libcs[i]->SameAsME32(*it)) { + if (g_module_libcs[i]->SameAsModuleEntry(*it)) { // Both g_module_libcs[i] and it are still valid. Mark it by // removing it from the vector; mark g_module_libcs[i] by // setting a bool. @@ -672,23 +736,30 @@ void PatchAllModules() { // We could call Unpatch() here, but why bother? The module // has gone away, so nobody is going to call into it anyway. g_module_libcs[i]->set_is_valid(false); + made_changes = true; } } // We've handled all the g_module_libcs. Now let's handle the rest - // of the me32's: those that haven't already been loaded. + // of the module-entries: those that haven't already been loaded. for (std::vector<ModuleEntryCopy>::const_iterator it = modules.begin(); it != modules.end(); ++it) { LibcInfo libc_info; - if (libc_info.PopulateWindowsFn(*it)) // true==module has libc routines - PatchOneModuleLocked(libc_info); // updates num_patched_modules + if (libc_info.PopulateWindowsFn(*it)) { // true==module has libc routines + PatchOneModuleLocked(libc_info); // updates num_patched_modules + made_changes = true; + } } // Now that we've dealt with the modules (dlls), update the main // executable. We do this last because PatchMainExecutableLocked // wants to look at how other modules were patched. - PatchMainExecutableLocked(); + if (!main_executable.patched()) { + PatchMainExecutableLocked(); + made_changes = true; + } } + return made_changes; } @@ -697,6 +768,8 @@ void PatchAllModules() { // --------------------------------------------------------------------- // PatchWindowsFunctions() // This is the function that is exposed to the outside world. +// It should be called before the program becomes multi-threaded, +// since main_executable_windows.Patch() is not thread-safe. // --------------------------------------------------------------------- void PatchWindowsFunctions() { @@ -956,19 +1029,68 @@ BOOL WINAPI WindowsInfo::Perftools_UnmapViewOfFile(LPCVOID lpBaseAddress) { lpBaseAddress); } +// * nopatching_set holds modules (specified by full file-path) that +// don't contain any libc routines, nor depend on any DLLs that do: +// that is, files where PatchAllModules ended up being a no-op. We +// know we don't need to patch those files if we see them again in the +// future (which can happen in windows apps, which can load+unload a +// given module many times during a single program run). +// * patching_set holds modules that *do* require us to do patching +// work when we load them. (This map is keyed by handle, so we need +// to be careful to clear each entry in FreeLibrary; the map data is +// the necessary refcount.) This set is used as an optimization in +// FreeLibrary to tell if we had done any patching in LoadLibraryExW. +// If we didn't, FreeLibrary doesn't need to clear up the +// patching-work, but if we did, it does. +static std::map<HMODULE, int>* patching_map = NULL; +static std::set<std::string>* nopatching_set = NULL; + HMODULE WINAPI WindowsInfo::Perftools_LoadLibraryExW(LPCWSTR lpFileName, HANDLE hFile, DWORD dwFlags) { HMODULE rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD)) function_info_[kLoadLibraryExW].origstub_fn)( lpFileName, hFile, dwFlags); - PatchAllModules(); // this will patch any newly loaded libraries + // This will patch any newly loaded libraries, if patching needs to + // be done. If so, PatchAllModules() will return true. If no + // patching needs to be done (this library didn't contain a libc), + // we'll mark it in a set, so we can skip patching next time. + char szFullPath[PATH_MAX]; + if (::GetModuleFileNameA(rv, szFullPath, sizeof(szFullPath))) { + SpinLockHolder h(&patch_all_modules_lock); + if (nopatching_set && nopatching_set->count(szFullPath) > 0) + return rv; + } + const bool made_changes = PatchAllModules(); + { + SpinLockHolder h(&patch_all_modules_lock); + if (made_changes) { + if (!patching_map) patching_map = new std::map<HMODULE, int>; + ++(*patching_map)[rv]; + } else { + if (!nopatching_set) nopatching_set = new std::set<std::string>; + nopatching_set->insert(szFullPath); + } + } return rv; } BOOL WINAPI WindowsInfo::Perftools_FreeLibrary(HMODULE hLibModule) { BOOL rv = ((BOOL (WINAPI *)(HMODULE)) function_info_[kFreeLibrary].origstub_fn)(hLibModule); + { + // This is an optimization: if we know this module never did any work + // in LoadLibraryEx (because it didn't get put into patching_map), we + // know we don't need to revisit the patching here. + SpinLockHolder h(&patch_all_modules_lock); + if (patching_map) { + std::map<HMODULE, int>::iterator it = patching_map->find(hLibModule); + if (it == patching_map->end()) + return rv; + if (--it->second <= 0) // decrement the refcount + patching_map->erase(it); // hLibModule is now a dangling handle + } + } PatchAllModules(); // this will fix up the list of patched libraries return rv; } |