10 files changed, 333 insertions, 114 deletions
diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc
index 900ae4e..2bbce54 100644
--- a/src/base/low_level_alloc.cc
+++ b/src/base/low_level_alloc.cc
@@ -67,7 +67,7 @@ namespace {
                       // first.  Valid in both allocated and unallocated blocks
       intptr_t magic; // kMagicAllocated or kMagicUnallocated xor this
       LowLevelAlloc::Arena *arena; // pointer to parent arena
-      void *dummy_for_alignment;   // aligns regions to 0 mod 2*sizeof(void*) 
+      void *dummy_for_alignment;   // aligns regions to 0 mod 2*sizeof(void*)
     } header;
 
     // Next two fields: in unallocated blocks: freelist skiplist data
@@ -197,15 +197,57 @@ struct LowLevelAlloc::Arena {
 // pointer.
 static struct LowLevelAlloc::Arena default_arena;
 
-// A non-malloc-hooked arena: used only to allocate metadata for arenas that
+// Non-malloc-hooked arenas: used only to allocate metadata for arenas that
 // do not want malloc hook reporting, so that for them there's no malloc hook
 // reporting even during arena creation.
 static struct LowLevelAlloc::Arena unhooked_arena;
+static struct LowLevelAlloc::Arena unhooked_async_sig_safe_arena;
 
 // magic numbers to identify allocated and unallocated blocks
 static const intptr_t kMagicAllocated = 0x4c833e95;
 static const intptr_t kMagicUnallocated = ~kMagicAllocated;
 
+namespace {
+  class ArenaLock {
+   public:
+    explicit ArenaLock(LowLevelAlloc::Arena *arena) :
+        left_(false), mask_valid_(false), arena_(arena) {
+      if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
+      // We've decided not to support async-signal-safe arena use until
+      // there a demonstrated need.  Here's how one could do it though
+      // (would need to be made more portable).
+#if 0
+        sigset_t all;
+        sigfillset(&all);
+        this->mask_valid_ =
+            (pthread_sigmask(SIG_BLOCK, &all, &this->mask_) == 0);
+#else
+        RAW_CHECK(false, "We do not yet support async-signal-safe arena.");
+#endif
+      }
+      this->arena_->mu.Lock();
+    }
+    ~ArenaLock() { RAW_CHECK(this->left_, "haven't left Arena region"); }
+    void Leave() {
+      this->arena_->mu.Unlock();
+#if 0
+      if (this->mask_valid_) {
+        pthread_sigmask(SIG_SETMASK, &this->mask_, 0);
+      }
+#endif
+      this->left_ = true;
+    }
+   private:
+    bool left_;       // whether left region
+    bool mask_valid_;
+#if 0
+    sigset_t mask_;   // old mask of blocked signals
+#endif
+    LowLevelAlloc::Arena *arena_;
+    DISALLOW_COPY_AND_ASSIGN(ArenaLock);
+  };
+} // anonymous namespace
+
 // create an appropriate magic number for an object at "ptr"
 // "magic" should be kMagicAllocated or kMagicUnallocated
 inline static intptr_t Magic(intptr_t magic, AllocList::Header *ptr) {
@@ -235,6 +277,8 @@ static void ArenaInit(LowLevelAlloc::Arena *arena) {
       // Default arena should be hooked, e.g. for heap-checker to trace
       // pointer chains through objects in the default arena.
       arena->flags = LowLevelAlloc::kCallMallocHook;
+    } else if (arena == &unhooked_async_sig_safe_arena) {
+      arena->flags = LowLevelAlloc::kAsyncSignalSafe;
     } else {
       arena->flags = 0;   // other arenas' flags may be overridden by client,
                           // but unhooked_arena will have 0 in 'flags'.
@@ -246,9 +290,12 @@ static void ArenaInit(LowLevelAlloc::Arena *arena) {
 LowLevelAlloc::Arena *LowLevelAlloc::NewArena(int32 flags,
                                               Arena *meta_data_arena) {
   RAW_CHECK(meta_data_arena != 0, "must pass a valid arena");
-  if (meta_data_arena == &default_arena  &&
-      (flags & LowLevelAlloc::kCallMallocHook) == 0) {
-    meta_data_arena = &unhooked_arena;
+  if (meta_data_arena == &default_arena) {
+    if ((flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
+      meta_data_arena = &unhooked_async_sig_safe_arena;
+    } else if ((flags & LowLevelAlloc::kCallMallocHook) == 0) {
+      meta_data_arena = &unhooked_arena;
+    }
   }
   // Arena(0) uses the constructor for non-static contexts
   Arena *result =
@@ -262,9 +309,9 @@ LowLevelAlloc::Arena *LowLevelAlloc::NewArena(int32 flags,
 bool LowLevelAlloc::DeleteArena(Arena *arena) {
   RAW_CHECK(arena != 0 && arena != &default_arena && arena != &unhooked_arena,
             "may not delete default arena");
-  arena->mu.Lock();
+  ArenaLock section(arena);
   bool empty = (arena->allocation_count == 0);
-  arena->mu.Unlock();
+  section.Leave();
   if (empty) {
     while (arena->freelist.next[0] != 0) {
       AllocList *region = arena->freelist.next[0];
@@ -279,7 +326,13 @@ bool LowLevelAlloc::DeleteArena(Arena *arena) {
                 "empty arena has non-page-aligned block size");
       RAW_CHECK(reinterpret_cast<intptr_t>(region) % arena->pagesize == 0,
                 "empty arena has non-page-aligned block");
-      RAW_CHECK(munmap(region, size) == 0,
+      int munmap_result;
+      if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) == 0) {
+        munmap_result = munmap(region, size);
+      } else {
+        munmap_result = MallocHook::UnhookedMUnmap(region, size);
+      }
+      RAW_CHECK(munmap_result == 0,
                 "LowLevelAlloc::DeleteArena:  munmap failed address");
     }
     Free(arena);
@@ -363,21 +416,21 @@ void LowLevelAlloc::Free(void *v) {
     if ((arena->flags & kCallMallocHook) != 0) {
       MallocHook::InvokeDeleteHook(v);
     }
-    arena->mu.Lock();
+    ArenaLock section(arena);
     AddToFreelist(v, arena);
     RAW_CHECK(arena->allocation_count > 0, "nothing in arena to free");
     arena->allocation_count--;
-    arena->mu.Unlock();
+    section.Leave();
   }
 }
 
 // allocates and returns a block of size bytes, to be freed with Free()
 // L < arena->mu
-void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) {
+static void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) {
   void *result = 0;
   if (request != 0) {
     AllocList *s;       // will point to region that satisfies request
-    arena->mu.Lock();
+    ArenaLock section(arena);
     ArenaInit(arena);
     // round up with header
     size_t req_rnd = RoundUp(request + sizeof (s->header), arena->roundup);
@@ -399,8 +452,14 @@ void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) {
       // mmap generous 64K chunks to decrease
       // the chances/impact of fragmentation:
       size_t new_pages_size = RoundUp(req_rnd, arena->pagesize * 16);
-      void *new_pages = mmap(0, new_pages_size,
-                     PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+      void *new_pages;
+      if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
+        new_pages = MallocHook::UnhookedMMap(0, new_pages_size,
+            PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+      } else {
+        new_pages = mmap(0, new_pages_size,
+            PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+      }
       RAW_CHECK(new_pages != MAP_FAILED, "mmap error");
       arena->mu.Lock();
       s = reinterpret_cast<AllocList *>(new_pages);
@@ -425,7 +484,7 @@ void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) {
     s->header.magic = Magic(kMagicAllocated, &s->header);
     RAW_CHECK(s->header.arena == arena, "");
     arena->allocation_count++;
-    arena->mu.Unlock();
+    section.Leave();
     result = &s->levels;
   }
   ANNOTATE_NEW_MEMORY(result, request);
diff --git a/src/base/low_level_alloc.h b/src/base/low_level_alloc.h
index 0df1298..393b3d2 100644
--- a/src/base/low_level_alloc.h
+++ b/src/base/low_level_alloc.h
@@ -72,14 +72,20 @@ class LowLevelAlloc {
   // meta_data_arena; the DefaultArena() can be passed for meta_data_arena.
   // These values may be ored into flags:
   enum {
-    // Calls to Alloc() and Free() will be reported
-    // via the MallocHook interface.
-    // The DefaultArena() has this flag on.
-    // NewArena(flags, DefaultArena()) w/o this bit in 'flags',
-    // on the other hand, will not cause any MallocHook
-    // calls even during the NewArena call itself
-    // (it will in fact use meta_data_arena different from DefaultArena()).
-    kCallMallocHook = 0x0001
+    // Report calls to Alloc() and Free() via the MallocHook interface.
+    // Set in the DefaultArena.
+    kCallMallocHook = 0x0001,
+
+    // Make calls to Alloc(), Free() be async-signal-safe.  Not set in
+    // DefaultArena().
+    kAsyncSignalSafe = 0x0002,
+
+    // When used with DefaultArena(), the NewArena() and DeleteArena() calls
+    // obey the flags given explicitly in the NewArena() call, even if those
+    // flags differ from the settings in DefaultArena().  So the call
+    // NewArena(kAsyncSignalSafe, DefaultArena()) is itself async-signal-safe,
+    // as well as generatating an arena that provides async-signal-safe
+    // Alloc/Free.
   };
   static Arena *NewArena(int32 flags, Arena *meta_data_arena);
 
diff --git a/src/google/malloc_hook.h b/src/google/malloc_hook.h
index f2713e9..48d92da 100644
--- a/src/google/malloc_hook.h
+++ b/src/google/malloc_hook.h
@@ -191,6 +191,12 @@ class PERFTOOLS_DLL_DECL MallocHook {
                                         int skip_count) {
     return MallocHook_GetCallerStackTrace(result, max_depth, skip_count);
   }
+
+  // Unhooked versions of mmap() and munmap().   These should be used
+  // only by experts, since they bypass heapchecking, etc.
+  static void* UnhookedMMap(void *start, size_t length, int prot, int flags,
+                            int fd, off_t offset);
+  static int UnhookedMUnmap(void *start, size_t length);
 };
 
 #endif /* _MALLOC_HOOK_H_ */
diff --git a/src/malloc_hook.cc b/src/malloc_hook.cc
index d1ad12a..2a7f542 100644
--- a/src/malloc_hook.cc
+++ b/src/malloc_hook.cc
@@ -423,7 +423,7 @@ static inline void* do_mmap64(void *start, size_t length,
   return result;
 }
 
-# endif
+# endif  // defined(__x86_64__)
 
 // We use do_mmap64 abstraction to put MallocHook::InvokeMmapHook
 // calls right into mmap and mmap64, so that the stack frames in the caller's
@@ -472,7 +472,7 @@ extern "C" void* mmap(void *start, size_t length, int prot, int flags,
   return result;
 }
 
-#endif
+#endif  // !defined(__USE_FILE_OFFSET64) || !defined(__REDIRECT_NTH)
 
 extern "C" int munmap(void* start, size_t length) __THROW {
   MallocHook::InvokeMunmapHook(start, length);
@@ -501,4 +501,26 @@ extern "C" void* sbrk(ptrdiff_t increment) __THROW {
   return result;
 }
 
-#endif
+/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot,
+                                         int flags, int fd, off_t offset) {
+  return do_mmap64(start, length, prot, flags, fd, offset);
+}
+
+/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) {
+  return sys_munmap(start, length);
+}
+
+#else   // defined(__linux) &&
+        // (defined(__i386__) || defined(__x86_64__) || defined(__PPC__))
+
+/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot,
+                                         int flags, int fd, off_t offset) {
+  return mmap(start, length, prot, flags, fd, offset);
+}
+
+/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) {
+  return munmap(start, length);
+}
+
+#endif  // defined(__linux) &&
+        // (defined(__i386__) || defined(__x86_64__) || defined(__PPC__))
diff --git a/src/pprof b/src/pprof
index b7c7d07..62e19c7 100755
--- a/src/pprof
+++ b/src/pprof
@@ -1977,7 +1977,12 @@ sub RemoveUninterestingFrames {
                       '__builtin_vec_delete',
                       '__builtin_vec_new',
                       'operator new',
-                      'operator new[]') {
+                      'operator new[]',
+		      # These mark the beginning/end of our custom sections
+		      '__start_google_malloc',
+		      '__stop_google_malloc',
+		      '__start_malloc_hook',
+		      '__stop_malloc_hook') {
       $skip{$name} = 1;
       $skip{"_" . $name} = 1;   # Mach (OS X) adds a _ prefix to everything
     }
diff --git a/src/stacktrace_x86-inl.h b/src/stacktrace_x86-inl.h
index 5650c86..05701e7 100644
--- a/src/stacktrace_x86-inl.h
+++ b/src/stacktrace_x86-inl.h
@@ -49,6 +49,11 @@
 #elif defined(HAVE_UCONTEXT_H)
 #include <ucontext.h>  // for ucontext_t
 #elif defined(HAVE_CYGWIN_SIGNAL_H)
+// cygwin/signal.h has a buglet where it uses pthread_attr_t without
+// #including <pthread.h> itself.  So we have to do it.
+# ifdef HAVE_PTHREAD
+# include <pthread.h>
+# endif
 #include <cygwin/signal.h>
 typedef ucontext ucontext_t;
 #endif
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 70dd8c6..625301e 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -762,9 +762,9 @@ TCMallocGuard::TCMallocGuard() {
     // patch the windows VirtualAlloc, etc.
     PatchWindowsFunctions();    // defined in windows/patch_functions.cc
 #endif
-    free(malloc(1));
+    tc_free(tc_malloc(1));
     ThreadCache::InitTSD();
-    free(malloc(1));
+    tc_free(tc_malloc(1));
     MallocExtension::Register(new TCMallocImplementation);
   }
 }
diff --git a/src/tests/sampling_test.sh b/src/tests/sampling_test.sh
index 149d27b..8c96bc1 100755
--- a/src/tests/sampling_test.sh
+++ b/src/tests/sampling_test.sh
@@ -62,10 +62,10 @@ if ! cat "$SAMPLING_TEST_BINARY" >/dev/null 2>&1; then
   SAMPLING_TEST_BINARY="$SAMPLING_TEST_BINARY".exe
 fi
 
-die() {
-    echo "FAILED"
-    echo "reason:"
-    echo "$@"
+die() {    # runs the command given as arguments, and then dies.
+    echo "FAILED.  Output from $@"
+    echo "----"
+    "$@"
     echo "----"
     exit 1
 }
@@ -79,16 +79,16 @@ mkdir "$OUTDIR" || die "Unable to create $OUTDIR"
 # 50M to 99M.
 "$SAMPLING_TEST" "$OUTDIR/out"
 
-echo -n "Testing heap output..."
+echo "Testing heap output..."
 "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap" \
    | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
-   || die `"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap"`
+   || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap"
 echo "OK"
 
-echo -n "Testing growth output..."
+echo "Testing growth output..."
 "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" \
    | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
-   || die `"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth"`
+   || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth"
 echo "OK"
 
 echo "PASS"
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
index fd44a70..64f4deb 100644
--- a/src/thread_cache.cc
+++ b/src/thread_cache.cc
@@ -299,12 +299,6 @@ int ThreadCache::GetSamplePeriod() {
 }
 
 void ThreadCache::InitModule() {
-  // There is a slight potential race here because of double-checked
-  // locking idiom.  However, as long as the program does a small
-  // allocation before switching to multi-threaded mode, we will be
-  // fine.  We increase the chances of doing such a small allocation
-  // by doing one in the constructor of the module_enter_exit_hook
-  // object declared below.
   SpinLockHolder h(Static::pageheap_lock());
   if (!phinited) {
     Static::InitStaticVars();
diff --git a/src/windows/patch_functions.cc b/src/windows/patch_functions.cc
index 992161f..d13b7b8 100644
--- a/src/windows/patch_functions.cc
+++ b/src/windows/patch_functions.cc
@@ -72,6 +72,11 @@
 #error This file is intended for patching allocators - use override_functions.cc instead.
 #endif
 
+// We use psapi.  Non-MSVC systems will have to link this in themselves.
+#ifdef _MSC_VER
+#pragma comment(lib, "Psapi.lib")
+#endif
+
 // Make sure we always use the 'old' names of the psapi functions.
 #ifndef PSAPI_VERSION
 #define PSAPI_VERSION 1
@@ -80,6 +85,8 @@
 #include <windows.h>
 #include <malloc.h>       // for _msize and _expand
 #include <Psapi.h>        // for EnumProcessModules, GetModuleInformation, etc.
+#include <set>
+#include <map>
 #include <vector>
 #include <base/logging.h>
 #include "base/spinlock.h"
@@ -122,29 +129,7 @@ typedef void (*GenericFnPtr)();
 
 using sidestep::PreamblePatcher;
 
-// This is a subset of MODDULEENTRY32, that we need for patching
-struct ModuleEntryCopy {
-  LPVOID  modBaseAddr;
-  DWORD   modBaseSize;
-  HMODULE hModule;
-  TCHAR   szModule[kMaxModuleNameSize];
-
-  ModuleEntryCopy() {
-    modBaseAddr = NULL;
-    modBaseSize = 0;
-    hModule = NULL;
-    strcpy(szModule, "<executable>");
-  }
-  ModuleEntryCopy(HANDLE hprocess, HMODULE hmodule, const MODULEINFO& mi) {
-    this->modBaseAddr = mi.lpBaseOfDll;
-    this->modBaseSize = mi.SizeOfImage;
-    this->hModule = hmodule;
-    // TODO(csilvers): we could make more efficient by calling this
-    // lazily (not until this->szModule is needed, which is often never).
-    GetModuleBaseNameA(hprocess, hmodule,
-                       this->szModule, sizeof(this->szModule));
-  }
-};
+struct ModuleEntryCopy;   // defined below
 
 // These functions are how we override the memory allocation
 // functions, just like tcmalloc.cc and malloc_hook.cc do.
@@ -160,33 +145,19 @@ class LibcInfo {
   LibcInfo() {
     memset(this, 0, sizeof(*this));  // easiest way to initialize the array
   }
-  bool SameAs(const LibcInfo& that) const {
-    return (is_valid() &&
-            module_base_address_ == that.module_base_address_ &&
-            module_base_size_ == that.module_base_size_);
-  }
-  bool SameAsME32(const ModuleEntryCopy& me32) const {
-    return (is_valid() &&
-            module_base_address_ == me32.modBaseAddr &&
-            module_base_size_ == me32.modBaseSize);
-  }
+  bool SameAs(const LibcInfo& that) const;
+  bool SameAsModuleEntry(const ModuleEntryCopy& module_entry) const;
+
   bool patched() const { return is_valid() && module_name_[0] != '\0'; }
   const char* module_name() const { return is_valid() ? module_name_ : ""; }
 
   void set_is_valid(bool b) { is_valid_ = b; }
 
-  // These shouldn't have to be public, since only subclasses of
-  // LibcInfo need it, but they do.  Maybe something to do with
-  // templates.  Shrug.
-  bool is_valid() const { return is_valid_; }
-  GenericFnPtr windows_fn(int ifunction) const {
-    return windows_fn_[ifunction];
-  }
-
   // Populates all the windows_fn_[] vars based on our module info.
   // Returns false if windows_fn_ is all NULL's, because there's
-  // nothing to patch.  Also populates the me32 info.
-  bool PopulateWindowsFn(const ModuleEntryCopy& me32);
+  // nothing to patch.  Also populates the rest of the module_entry
+  // info, such as the module's name.
+  bool PopulateWindowsFn(const ModuleEntryCopy& module_entry);
 
  protected:
   void CopyFrom(const LibcInfo& that) {
@@ -237,6 +208,24 @@ class LibcInfo {
   const void *module_base_address_;
   size_t module_base_size_;
   char module_name_[kMaxModuleNameSize];
+
+ public:
+  // These shouldn't have to be public, since only subclasses of
+  // LibcInfo need it, but they do.  Maybe something to do with
+  // templates.  Shrug.  I hide them down here so users won't see
+  // them. :-)  (OK, I also need to define ctrgProcAddress late.)
+  bool is_valid() const { return is_valid_; }
+  GenericFnPtr windows_fn(int ifunction) const {
+    return windows_fn_[ifunction];
+  }
+  // These three are needed by ModuleEntryCopy.
+  static const int ctrgProcAddress = kNumFunctions;
+  static GenericFnPtr static_fn(int ifunction) {
+    return static_fn_[ifunction];
+  }
+  static const char* const function_name(int ifunction) {
+    return function_name_[ifunction];
+  }
 };
 
 // Template trickiness: logically, a LibcInfo would include
@@ -294,6 +283,43 @@ template<int> class LibcInfoWithPatchFunctions : public LibcInfo {
   // But they seem pretty obscure, and I'm fine not overriding them for now.
 };
 
+// This is a subset of MODDULEENTRY32, that we need for patching.
+struct ModuleEntryCopy {
+  LPVOID  modBaseAddr;
+  DWORD   modBaseSize;
+  HMODULE hModule;
+  TCHAR   szModule[kMaxModuleNameSize];
+  // This is not part of MODDULEENTRY32, but is needed to avoid making
+  // windows syscalls while we're holding patch_all_modules_lock (see
+  // lock-inversion comments at patch_all_modules_lock definition, below).
+  GenericFnPtr rgProcAddresses[LibcInfo::ctrgProcAddress];
+
+  ModuleEntryCopy() {
+    modBaseAddr = NULL;
+    modBaseSize = 0;
+    hModule = NULL;
+    strcpy(szModule, "<executable>");
+    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
+      rgProcAddresses[i] = LibcInfo::static_fn(i);
+  }
+  ModuleEntryCopy(HANDLE hprocess, HMODULE hmodule, const MODULEINFO& mi) {
+    this->modBaseAddr = mi.lpBaseOfDll;
+    this->modBaseSize = mi.SizeOfImage;
+    this->hModule = hmodule;
+    // TODO(csilvers): we could make more efficient by calling these
+    // lazily (not until the vars are needed, which is often never).
+    // However, there's tricky business with calling windows functions
+    // inside the patch_all_modules_lock (see the lock inversion
+    // comments with the patch_all_modules_lock definition, below), so
+    // it's safest to do it all here, where no lock is needed.
+    ::GetModuleBaseNameA(hprocess, hmodule,
+                         this->szModule, sizeof(this->szModule));
+    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
+      rgProcAddresses[i] =
+          (GenericFnPtr)::GetProcAddress(hModule, LibcInfo::function_name(i));
+  }
+};
+
 // This class is easier because there's only one of them.
 class WindowsInfo {
  public:
@@ -347,6 +373,11 @@ class WindowsInfo {
 // If you run out, just add a few more to the array.  You'll also need
 // to update the switch statement in PatchOneModule(), and the list in
 // UnpatchWindowsFunctions().
+// main_executable and main_executable_windows are two windows into
+// the same executable.  One is responsible for patching the libc
+// routines that live in the main executable (if any) to use tcmalloc;
+// the other is responsible for patching the windows routines like
+// HeapAlloc/etc to use tcmalloc.
 static LibcInfoWithPatchFunctions<0> main_executable;
 static LibcInfoWithPatchFunctions<1> libc1;
 static LibcInfoWithPatchFunctions<2> libc2;
@@ -448,20 +479,28 @@ const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = {
   { "FreeLibrary", NULL, NULL, (GenericFnPtr)&Perftools_FreeLibrary },
 };
 
-bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& me32) {
+bool LibcInfo::SameAs(const LibcInfo& that) const {
+  return (is_valid() &&
+          module_base_address_ == that.module_base_address_ &&
+          module_base_size_ == that.module_base_size_);
+}
+
+bool LibcInfo::SameAsModuleEntry(const ModuleEntryCopy& module_entry) const {
+  return (is_valid() &&
+          module_base_address_ == module_entry.modBaseAddr &&
+          module_base_size_ == module_entry.modBaseSize);
+}
+
+bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) {
   // First, store the location of the function to patch before
   // patching it.  If none of these functions are found in the module,
   // then this module has no libc in it, and we just return false.
   for (int i = 0; i < kNumFunctions; i++) {
     if (!function_name_[i])     // we can turn off patching by unsetting name
       continue;
-    GenericFnPtr fn = NULL;
-    if (me32.hModule == NULL) { // used for the main executable
-      // This is used only for a statically-linked-in libc.
-      fn = static_fn_[i];
-    } else {
-      fn = (GenericFnPtr)::GetProcAddress(me32.hModule, function_name_[i]);
-    }
+    // The ::GetProcAddress calls were done in the ModuleEntryCopy
+    // constructor, so we don't have to make any windows calls here.
+    const GenericFnPtr fn = module_entry.rgProcAddresses[i];
     if (fn) {
       windows_fn_[i] = PreamblePatcher::ResolveTarget(fn);
     }
@@ -514,15 +553,15 @@ bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& me32) {
   CHECK(windows_fn_[kRealloc]);
 
   // OK, we successfully patched.  Let's store our member information.
-  module_base_address_ = me32.modBaseAddr;
-  module_base_size_ = me32.modBaseSize;
-  strcpy(module_name_, me32.szModule);
+  module_base_address_ = module_entry.modBaseAddr;
+  module_base_size_ = module_entry.modBaseSize;
+  strcpy(module_name_, module_entry.szModule);
   return true;
 }
 
 template<int T>
 bool LibcInfoWithPatchFunctions<T>::Patch(const LibcInfo& me_info) {
-  CopyFrom(me_info);   // copies the me32 and the windows_fn_ array
+  CopyFrom(me_info);   // copies the module_entry and the windows_fn_ array
   for (int i = 0; i < kNumFunctions; i++) {
     if (windows_fn_[i] && windows_fn_[i] != perftools_fn_[i]) {
       // if origstub_fn_ is not NULL, it's left around from a previous
@@ -530,7 +569,10 @@ bool LibcInfoWithPatchFunctions<T>::Patch(const LibcInfo& me_info) {
       // Since we've patched Unpatch() not to delete origstub_fn_ (it
       // causes problems in some contexts, though obviously not this
       // one), we should delete it now, before setting it to NULL.
-      delete[] reinterpret_cast<char*>(origstub_fn_[i]);
+      // NOTE: casting from a function to a pointer is contra the C++
+      //       spec.  It's not safe on IA64, but is on i386.  We use
+      //       a C-style cast here to emphasize this is not legal C++.
+      delete[] (char*)(origstub_fn_[i]);
       origstub_fn_[i] = NULL;   // Patch() will fill this in
       CHECK_EQ(sidestep::SIDESTEP_SUCCESS,
                PreamblePatcher::Patch(windows_fn_[i], perftools_fn_[i],
@@ -569,7 +611,10 @@ void WindowsInfo::Patch() {
     // Since we've patched Unpatch() not to delete origstub_fn_ (it
     // causes problems in some contexts, though obviously not this
     // one), we should delete it now, before setting it to NULL.
-    delete[] reinterpret_cast<char*>(function_info_[i].origstub_fn);
+    // NOTE: casting from a function to a pointer is contra the C++
+    //       spec.  It's not safe on IA64, but is on i386.  We use
+    //       a C-style cast here to emphasize this is not legal C++.
+    delete[] (char*)(function_info_[i].origstub_fn);
     function_info_[i].origstub_fn = NULL;  // Patch() will fill this in
     CHECK_EQ(sidestep::SIDESTEP_SUCCESS,
              PreamblePatcher::Patch(function_info_[i].windows_fn,
@@ -594,7 +639,7 @@ void PatchOneModuleLocked(const LibcInfo& me_info) {
   // Double-check we haven't seen this module before.
   for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
     if (g_module_libcs[i]->SameAs(me_info)) {
-      fprintf(stderr, "%s:%d: FATAL ERROR: %s double-patched somehow.\n",
+      fprintf(stderr, "%s:%d: FATAL PERFTOOLS ERROR: %s double-patched somehow.\n",
               __FILE__, __LINE__, g_module_libcs[i]->module_name());
       CHECK(false);
     }
@@ -617,34 +662,53 @@ void PatchOneModuleLocked(const LibcInfo& me_info) {
       }
     }
   }
-  printf("ERROR: Too many modules containing libc in this executable\n");
+  printf("PERFTOOLS ERROR: Too many modules containing libc in this executable\n");
 }
 
 void PatchMainExecutableLocked() {
   if (main_executable.patched())
     return;    // main executable has already been patched
-  ModuleEntryCopy fake_me32;   // we make a fake one to pass into Patch()
-  main_executable.PopulateWindowsFn(fake_me32);
+  ModuleEntryCopy fake_module_entry;   // make a fake one to pass into Patch()
+  // No need to call PopulateModuleEntryProcAddresses on the main executable.
+  main_executable.PopulateWindowsFn(fake_module_entry);
   main_executable.Patch(main_executable);
 }
 
+// This lock is subject to a subtle and annoying lock inversion
+// problem: it may interact badly with unknown internal windows locks.
+// In particular, windows may be holding a lock when it calls
+// LoadLibraryExW and FreeLibrary, which we've patched.  We have those
+// routines call PatchAllModules, which acquires this lock.  If we
+// make windows system calls while holding this lock, those system
+// calls may need the internal windows locks that are being held in
+// the call to LoadLibraryExW, resulting in deadlock.  The solution is
+// to be very careful not to call *any* windows routines while holding
+// patch_all_modules_lock, inside PatchAllModules().
 static SpinLock patch_all_modules_lock(SpinLock::LINKER_INITIALIZED);
 
 // Iterates over all the modules currently loaded by the executable,
 // and makes sure they're all patched.  For ones that aren't, we patch
 // them in.  We also check that every module we had patched in the
 // past is still loaded, and update internal data structures if so.
-void PatchAllModules() {
+// We return true if this PatchAllModules did any work, false else.
+bool PatchAllModules() {
   std::vector<ModuleEntryCopy> modules;
+  bool made_changes = false;
 
   const HANDLE hCurrentProcess = GetCurrentProcess();
   MODULEINFO mi;
   DWORD cbNeeded = 0;
   HMODULE hModules[kMaxModules];  // max # of modules we support in one process
-  if (EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
-                         &cbNeeded)) {
+  if (::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
+                           &cbNeeded)) {
     for (int i = 0; i < cbNeeded / sizeof(*hModules); ++i) {
-      if (GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
+      if (i >= kMaxModules) {
+        printf("PERFTOOLS ERROR: Too many modules in this executable to try"
+               " to patch them all (if you need to, raise kMaxModules in"
+               " patch_functions.cc).\n");
+        break;
+      }
+      if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
         modules.push_back(ModuleEntryCopy(hCurrentProcess, hModules[i], mi));
     }
   }
@@ -658,7 +722,7 @@ void PatchAllModules() {
       bool still_loaded = false;
       for (std::vector<ModuleEntryCopy>::iterator it = modules.begin();
            it != modules.end(); ++it) {
-        if (g_module_libcs[i]->SameAsME32(*it)) {
+        if (g_module_libcs[i]->SameAsModuleEntry(*it)) {
           // Both g_module_libcs[i] and it are still valid.  Mark it by
           // removing it from the vector; mark g_module_libcs[i] by
           // setting a bool.
@@ -672,23 +736,30 @@ void PatchAllModules() {
         // We could call Unpatch() here, but why bother?  The module
         // has gone away, so nobody is going to call into it anyway.
         g_module_libcs[i]->set_is_valid(false);
+        made_changes = true;
       }
     }
 
     // We've handled all the g_module_libcs.  Now let's handle the rest
-    // of the me32's: those that haven't already been loaded.
+    // of the module-entries: those that haven't already been loaded.
     for (std::vector<ModuleEntryCopy>::const_iterator it = modules.begin();
          it != modules.end(); ++it) {
       LibcInfo libc_info;
-      if (libc_info.PopulateWindowsFn(*it))  // true==module has libc routines
-        PatchOneModuleLocked(libc_info);     // updates num_patched_modules
+      if (libc_info.PopulateWindowsFn(*it)) { // true==module has libc routines
+        PatchOneModuleLocked(libc_info);      // updates num_patched_modules
+        made_changes = true;
+      }
     }
 
     // Now that we've dealt with the modules (dlls), update the main
     // executable.  We do this last because PatchMainExecutableLocked
     // wants to look at how other modules were patched.
-    PatchMainExecutableLocked();
+    if (!main_executable.patched()) {
+      PatchMainExecutableLocked();
+      made_changes = true;
+    }
   }
+  return made_changes;
 }
 
 
@@ -697,6 +768,8 @@ void PatchAllModules() {
 // ---------------------------------------------------------------------
 // PatchWindowsFunctions()
 //    This is the function that is exposed to the outside world.
+//    It should be called before the program becomes multi-threaded,
+//    since main_executable_windows.Patch() is not thread-safe.
 // ---------------------------------------------------------------------
 
 void PatchWindowsFunctions() {
@@ -956,19 +1029,68 @@ BOOL WINAPI WindowsInfo::Perftools_UnmapViewOfFile(LPCVOID lpBaseAddress) {
               lpBaseAddress);
 }
 
+// * nopatching_set holds modules (specified by full file-path) that
+// don't contain any libc routines, nor depend on any DLLs that do:
+// that is, files where PatchAllModules ended up being a no-op.  We
+// know we don't need to patch those files if we see them again in the
+// future (which can happen in windows apps, which can load+unload a
+// given module many times during a single program run).
+// * patching_set holds modules that *do* require us to do patching
+// work when we load them.  (This map is keyed by handle, so we need
+// to be careful to clear each entry in FreeLibrary; the map data is
+// the necessary refcount.)  This set is used as an optimization in
+// FreeLibrary to tell if we had done any patching in LoadLibraryExW.
+// If we didn't, FreeLibrary doesn't need to clear up the
+// patching-work, but if we did, it does.
+static std::map<HMODULE, int>* patching_map = NULL;
+static std::set<std::string>* nopatching_set = NULL;
+
 HMODULE WINAPI WindowsInfo::Perftools_LoadLibraryExW(LPCWSTR lpFileName,
                                                      HANDLE hFile,
                                                      DWORD dwFlags) {
   HMODULE rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD))
                 function_info_[kLoadLibraryExW].origstub_fn)(
                     lpFileName, hFile, dwFlags);
-  PatchAllModules();   // this will patch any newly loaded libraries
+  // This will patch any newly loaded libraries, if patching needs to
+  // be done.  If so, PatchAllModules() will return true.  If no
+  // patching needs to be done (this library didn't contain a libc),
+  // we'll mark it in a set, so we can skip patching next time.
+  char szFullPath[PATH_MAX];
+  if (::GetModuleFileNameA(rv, szFullPath, sizeof(szFullPath))) {
+    SpinLockHolder h(&patch_all_modules_lock);
+    if (nopatching_set && nopatching_set->count(szFullPath) > 0)
+      return rv;
+  }
+  const bool made_changes = PatchAllModules();
+  {
+    SpinLockHolder h(&patch_all_modules_lock);
+    if (made_changes) {
+      if (!patching_map) patching_map = new std::map<HMODULE, int>;
+      ++(*patching_map)[rv];
+    } else {
+      if (!nopatching_set) nopatching_set = new std::set<std::string>;
+      nopatching_set->insert(szFullPath);
+    }
+  }
   return rv;
 }
 
 BOOL WINAPI WindowsInfo::Perftools_FreeLibrary(HMODULE hLibModule) {
   BOOL rv = ((BOOL (WINAPI *)(HMODULE))
              function_info_[kFreeLibrary].origstub_fn)(hLibModule);
+  {
+    // This is an optimization: if we know this module never did any work
+    // in LoadLibraryEx (because it didn't get put into patching_map), we
+    // know we don't need to revisit the patching here.
+    SpinLockHolder h(&patch_all_modules_lock);
+    if (patching_map) {
+      std::map<HMODULE, int>::iterator it = patching_map->find(hLibModule);
+      if (it == patching_map->end())
+        return rv;
+      if (--it->second <= 0)      // decrement the refcount
+        patching_map->erase(it);  // hLibModule is now a dangling handle
+    }
+  }
   PatchAllModules();    // this will fix up the list of patched libraries
   return rv;
 }