From 3014cf142e5a2409c88ab4559f3274434ed9a29b Mon Sep 17 00:00:00 2001 From: csilvers Date: Thu, 18 Nov 2010 01:07:25 +0000 Subject: * Suppress all large allocs when report threshold==0 * Clarified meaning of various malloc stats * Change from ATTRIBUTED_DEPRECATED to comments * Make array-size a var to compile under clang * Reduce page map key size under x86_64 by 4.4MB * Added full qualification to MemoryBarrier * Support systems that capitalize /proc weirdly * Avoid gcc warning: exporting type in unnamed ns * Add some dynamic annotations for gcc attributes * Add support for census profiler in pprof * Speed up pprof's ExtractSymbols * Speed up GoogleOnce * Add pkg-config (.pc) files * Detect when __environ exists but is NULL * Improve spinlock contention performance * Add GetFreeListSizes * Improve sampling_test, eg by adding no-inline * Relax malloc_extension test-check for big pages * Add proper library version number information * Update from autoconf 2.64 to 2.65 * Better document how to write a server that works with pprof * Change FillProcSelfMaps to better handle out-of-space * No longer hook _aligned_malloc/free in windows * Handle function-forwarding in DLLs when patching (in windows) * Update .vcproj files that had wrong .cc files in them (!) * get rid of unnecessary 'size < 0' * fix comments a bit in sysinfo.cc * another go at improving malloc-stats output * fix comment typo in profiler.cc * Add a few more thread annotations * Try to read TSC frequency from 'tsc_freq_khz' * Fix annotalysis/TSAN incompatibility * Add pprof --evince to go along with --gv * Document need for sampling to use GetHeapSample * Fix flakiness in malloc_extension_test * Separate out synchronization profiling routines git-svn-id: http://gperftools.googlecode.com/svn/trunk@99 6b5cf1ce-ec42-a296-1ba9-69fdba395a50 --- src/base/atomicops-internals-arm-gcc.h | 234 +++++++++++++++++++++++++++++++++ src/base/basictypes.h | 2 +- src/base/dynamic_annotations.h | 117 ++++++++++++++++- src/base/logging.h | 5 +- src/base/low_level_alloc.cc | 6 +- src/base/spinlock.cc | 166 +++++++++++++++-------- src/base/spinlock.h | 65 ++++----- src/base/spinlock_internal.cc | 77 +++++++++++ src/base/spinlock_internal.h | 64 +++++++++ src/base/spinlock_linux-inl.h | 52 ++++---- src/base/spinlock_posix-inl.h | 32 +++-- src/base/spinlock_win32-inl.h | 18 ++- src/base/synchronization_profiling.h | 50 +++++++ src/base/sysinfo.cc | 107 ++++++++++----- src/base/sysinfo.h | 2 +- src/base/thread_annotations.h | 4 +- src/base/vdso_support.cc | 2 +- src/common.h | 13 ++ src/config.h.in | 7 + src/debugallocation.cc | 16 ++- src/google/heap-checker.h | 2 +- src/google/malloc_extension.h | 47 ++++++- src/heap-profile-table.cc | 3 +- src/malloc_extension.cc | 12 +- src/memory_region_map.cc | 1 - src/page_heap.cc | 29 ++++ src/page_heap.h | 15 ++- src/pprof | 85 ++++++++---- src/profiler.cc | 2 +- src/system-alloc.cc | 30 ++++- src/tcmalloc.cc | 190 +++++++++++++++++++++----- src/tests/debugallocation_test.cc | 10 +- src/tests/malloc_extension_test.cc | 26 ++++ src/tests/sampling_test.cc | 2 + src/tests/sampling_test.sh | 4 +- src/tests/system-alloc_unittest.cc | 14 ++ src/tests/tcmalloc_unittest.cc | 23 ++-- src/windows/config.h | 2 +- src/windows/patch_functions.cc | 46 +++---- src/windows/port.cc | 12 ++ src/windows/port.h | 2 + 41 files changed, 1299 insertions(+), 297 deletions(-) create mode 100644 src/base/atomicops-internals-arm-gcc.h create mode 100644 src/base/spinlock_internal.cc create mode 100644 src/base/spinlock_internal.h create 
mode 100644 src/base/synchronization_profiling.h diff --git a/src/base/atomicops-internals-arm-gcc.h b/src/base/atomicops-internals-arm-gcc.h new file mode 100644 index 0000000..423e993 --- /dev/null +++ b/src/base/atomicops-internals-arm-gcc.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Lei Zhang, Sasha Levitskiy + */ + +// This file is an internal atomic implementation, use base/atomicops.h instead. + +// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. + +#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ +#define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ + +#include <stdio.h> +#include "base/basictypes.h" // For COMPILE_ASSERT + +typedef int32_t Atomic32; + +namespace base { +namespace subtle { + +typedef int64_t Atomic64; + +// 0xffff0fc0 is the hard coded address of a function provided by +// the kernel which implements an atomic compare-exchange. On older +// ARM architecture revisions (pre-v6) this may be implemented using +// a syscall. This address is stable, and in active use (hard coded) +// by at least glibc-2.7 and the Android C library. +// pLinuxKernelCmpxchg has both acquire and release barrier semantics.
+typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value, + Atomic32 new_value, + volatile Atomic32* ptr); +LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) = + (LinuxKernelCmpxchgFunc) 0xffff0fc0; + +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) = + (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; + + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value = *ptr; + do { + if (!pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 old_value; + do { + old_value = *ptr; + } while (pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + for (;;) { + // Atomic exchange the old value with an incremented one. + Atomic32 old_value = *ptr; + Atomic32 new_value = old_value + increment; + if (pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr)) == 0) { + // The exchange took place as expected. + return new_value; + } + // Otherwise, *ptr changed mid-loop and we need to retry. + } +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return Barrier_AtomicIncrement(ptr, increment); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void MemoryBarrier() { + pLinuxKernelMemoryBarrier(); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + + +// 64-bit versions are not implemented yet. 
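For readers skimming the new ARM atomics, the acquire/release pairing above is easiest to see in a publish/consume sketch. This is illustration only, not part of the patch; the payload/ready names are hypothetical.

// Publish a value from one thread and consume it from another using the
// primitives defined in this header.
static int payload;                        // plain, non-atomic data
static Atomic32 ready = 0;                 // publication flag

void Publisher() {
  payload = 42;                            // 1. write the data
  base::subtle::Release_Store(&ready, 1);  // 2. barrier, then set the flag
}

void Consumer() {
  if (base::subtle::Acquire_Load(&ready)) {  // 3. read the flag, then barrier
    int v = payload;  // 4. safe: ordered after the flag read
  }
}

On this ARM port both halves reduce to a call through pLinuxKernelMemoryBarrier plus a plain load or store, which is why Release_Store issues its barrier before the store and Acquire_Load issues it after the load.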
+ +inline void NotImplementedFatalError(const char *function_name) { + fprintf(stderr, "64-bit %s() not implemented on this platform\n", + function_name); + abort(); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_CompareAndSwap"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_AtomicExchange"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("NoBarrier_AtomicIncrement"); + return 0; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("Barrier_AtomicIncrement"); + return 0; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("NoBarrier_Store"); +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NoBarrier_AtomicExchange(ptr, value); + // acts as a barrier in this implementation +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("Release_Store"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("NoBarrier_Load"); + return 0; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = NoBarrier_Load(ptr); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +} // namespace base::subtle +} // namespace base + +#endif // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ diff --git a/src/base/basictypes.h b/src/base/basictypes.h index ab9cdab..0f21fca 100644 --- a/src/base/basictypes.h +++ b/src/base/basictypes.h @@ -109,7 +109,7 @@ const int64 kint64min = ( ((( int64) kint32min) << 32) | 0 ); // Also allow for printing of a pthread_t. #define GPRIuPTHREAD "lu" #define GPRIxPTHREAD "lx" -#if defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__APPLE__) +#if defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__APPLE__) || defined(__FreeBSD__) #define PRINTABLE_PTHREAD(pthreadt) reinterpret_cast(pthreadt) #else #define PRINTABLE_PTHREAD(pthreadt) pthreadt diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h index 10642fd..6283f7e 100644 --- a/src/base/dynamic_annotations.h +++ b/src/base/dynamic_annotations.h @@ -370,6 +370,41 @@ #endif /* DYNAMIC_ANNOTATIONS_ENABLED */ +/* Macro definitions for GCC attributes that allow static thread safety + analysis to recognize and use some of the dynamic annotations as + escape hatches. + TODO(lcwu): remove the check for __SUPPORT_DYN_ANNOTATION__ once the + default crosstool/GCC supports these GCC attributes. 
*/ + +#define ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY ; + +#if defined(__GNUC__) && defined(__SUPPORT_TS_ANNOTATION__) \ + && (!defined(SWIG)) && defined(__SUPPORT_DYN_ANNOTATION__) + +#if DYNAMIC_ANNOTATIONS_ENABLED == 0 +#define ANNOTALYSIS_ONLY 1 +#undef ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_STATIC_INLINE static inline +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY {} +#endif +#define ANNOTALYSIS_IGNORE_READS_BEGIN __attribute__ ((ignore_reads_begin)) +#define ANNOTALYSIS_IGNORE_READS_END __attribute__ ((ignore_reads_end)) +#define ANNOTALYSIS_IGNORE_WRITES_BEGIN __attribute__ ((ignore_writes_begin)) +#define ANNOTALYSIS_IGNORE_WRITES_END __attribute__ ((ignore_writes_end)) +#define ANNOTALYSIS_UNPROTECTED_READ __attribute__ ((unprotected_read)) + +#else + +#define ANNOTALYSIS_IGNORE_READS_BEGIN +#define ANNOTALYSIS_IGNORE_READS_END +#define ANNOTALYSIS_IGNORE_WRITES_BEGIN +#define ANNOTALYSIS_IGNORE_WRITES_END +#define ANNOTALYSIS_UNPROTECTED_READ + +#endif + /* Use the macros above rather than using these functions directly. */ #ifdef __cplusplus extern "C" { @@ -431,10 +466,18 @@ void AnnotateTraceMemory(const char *file, int line, const volatile void *arg); void AnnotateThreadName(const char *file, int line, const char *name); -void AnnotateIgnoreReadsBegin(const char *file, int line); -void AnnotateIgnoreReadsEnd(const char *file, int line); -void AnnotateIgnoreWritesBegin(const char *file, int line); -void AnnotateIgnoreWritesEnd(const char *file, int line); +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreReadsBegin(const char *file, int line) + ANNOTALYSIS_IGNORE_READS_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreReadsEnd(const char *file, int line) + ANNOTALYSIS_IGNORE_READS_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreWritesBegin(const char *file, int line) + ANNOTALYSIS_IGNORE_WRITES_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreWritesEnd(const char *file, int line) + ANNOTALYSIS_IGNORE_WRITES_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY void AnnotateEnableRaceDetection(const char *file, int line, int enable); void AnnotateNoOp(const char *file, int line, const volatile void *arg); @@ -485,7 +528,8 @@ double ValgrindSlowdown(void); one can use ... = ANNOTATE_UNPROTECTED_READ(x); */ template - inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) { + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) + ANNOTALYSIS_UNPROTECTED_READ { ANNOTATE_IGNORE_READS_BEGIN(); T res = x; ANNOTATE_IGNORE_READS_END(); @@ -511,4 +555,67 @@ double ValgrindSlowdown(void); #endif /* DYNAMIC_ANNOTATIONS_ENABLED */ +/* Annotalysis, a GCC based static analyzer, is able to understand and use + some of the dynamic annotations defined in this file. However, dynamic + annotations are usually disabled in the opt mode (to avoid additional + runtime overheads) while Annotalysis only works in the opt mode. + In order for Annotalysis to use these dynamic annotations when they + are disabled, we re-define these annotations here. Note that unlike the + original macro definitions above, these macros are expanded to calls to + static inline functions so that the compiler will be able to remove the + calls after the analysis. */ + +#ifdef ANNOTALYSIS_ONLY + + #undef ANNOTALYSIS_ONLY + + /* Undefine and re-define the macros that the static analyzer understands. 
*/ + #undef ANNOTATE_IGNORE_READS_BEGIN + #define ANNOTATE_IGNORE_READS_BEGIN() \ + AnnotateIgnoreReadsBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_END + #define ANNOTATE_IGNORE_READS_END() \ + AnnotateIgnoreReadsEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_WRITES_BEGIN + #define ANNOTATE_IGNORE_WRITES_BEGIN() \ + AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_WRITES_END + #define ANNOTATE_IGNORE_WRITES_END() \ + AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN + #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \ + do { \ + ANNOTATE_IGNORE_READS_BEGIN(); \ + ANNOTATE_IGNORE_WRITES_BEGIN(); \ + }while(0) \ + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_END + #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \ + do { \ + ANNOTATE_IGNORE_WRITES_END(); \ + ANNOTATE_IGNORE_READS_END(); \ + }while(0) \ + + #if defined(__cplusplus) + #undef ANNOTATE_UNPROTECTED_READ + template + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) + __attribute__ ((unprotected_read)) { + ANNOTATE_IGNORE_READS_BEGIN(); + T res = x; + ANNOTATE_IGNORE_READS_END(); + return res; + } + #endif /* __cplusplus */ + +#endif /* ANNOTALYSIS_ONLY */ + +/* Undefine the macros intended only in this file. */ +#undef ANNOTALYSIS_STATIC_INLINE +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY + #endif /* BASE_DYNAMIC_ANNOTATIONS_H_ */ diff --git a/src/base/logging.h b/src/base/logging.h index 6aa5c3f..fe25acf 100644 --- a/src/base/logging.h +++ b/src/base/logging.h @@ -49,10 +49,13 @@ // On some systems (like freebsd), we can't call write() at all in a // global constructor, perhaps because errno hasn't been set up. +// (In windows, we can't call it because it might call malloc.) // Calling the write syscall is safer (it doesn't set errno), so we // prefer that. Note we don't care about errno for logging: we just // do logging on a best-effort basis. -#ifdef HAVE_SYS_SYSCALL_H +#if defined(_MSC_VER) +#define WRITE_TO_STDERR(buf, len) WriteToStderr(buf, len); // in port.cc +#elif defined(HAVE_SYS_SYSCALL_H) #include #define WRITE_TO_STDERR(buf, len) syscall(SYS_write, STDERR_FILENO, buf, len) #else diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc index 7ca3953..8864629 100644 --- a/src/base/low_level_alloc.cc +++ b/src/base/low_level_alloc.cc @@ -59,7 +59,9 @@ // --------------------------------------------------------------------------- static const int kMaxLevel = 30; -namespace { +// We put this class-only struct in a namespace to avoid polluting the +// global namespace with this struct name (thus risking an ODR violation). +namespace low_level_alloc_internal { // This struct describes one allocated block, or one free block. struct AllocList { struct Header { @@ -79,6 +81,8 @@ namespace { // LLA_SkiplistLevels() }; } +using low_level_alloc_internal::AllocList; + // --------------------------------------------------------------------------- // A trivial skiplist implementation. 
This is used to keep the freelist diff --git a/src/base/spinlock.cc b/src/base/spinlock.cc index 48cdc89..1413923 100644 --- a/src/base/spinlock.cc +++ b/src/base/spinlock.cc @@ -32,47 +32,28 @@ */ #include -#include /* For nanosleep() */ -#ifdef HAVE_SCHED_H -#include /* For sched_yield() */ -#endif -#ifdef HAVE_UNISTD_H -#include /* For read() */ -#endif -#include /* for open(), O_RDONLY */ -#include /* for strncmp */ -#include #include "base/spinlock.h" +#include "base/synchronization_profiling.h" +#include "base/spinlock_internal.h" #include "base/cycleclock.h" #include "base/sysinfo.h" /* for NumCPUs() */ -// We can do contention-profiling of SpinLocks, but the code is in -// mutex.cc, which is not always linked in with spinlock. Hence we -// provide this weak definition, which is used if mutex.cc isn't linked in. -ATTRIBUTE_WEAK extern void SubmitSpinLockProfileData(const void *, int64); -void SubmitSpinLockProfileData(const void *, int64) {} +// NOTE on the Lock-state values: +// +// kSpinLockFree represents the unlocked state +// kSpinLockHeld represents the locked state with no waiters +// +// Values greater than kSpinLockHeld represent the locked state with waiters, +// where the value is the time the current lock holder had to +// wait before obtaining the lock. The kSpinLockSleeper state is a special +// "locked with waiters" state that indicates that a sleeper needs to +// be woken, but the thread that just released the lock didn't wait. static int adaptive_spin_count = 0; const base::LinkerInitialized SpinLock::LINKER_INITIALIZED = base::LINKER_INITIALIZED; -// The OS-specific header included below must provide two calls: -// Wait until *w becomes zero, atomically set it to 1 and return. -// static void SpinLockWait(volatile Atomic32 *w); -// -// Hint that a thread waiting in SpinLockWait() could now make progress. May -// do nothing. This call may not read or write *w; it must use only the -// address. -// static void SpinLockWake(volatile Atomic32 *w); -#if defined(_WIN32) -#include "base/spinlock_win32-inl.h" -#elif defined(__linux__) -#include "base/spinlock_linux-inl.h" -#else -#include "base/spinlock_posix-inl.h" -#endif - namespace { struct SpinLock_InitHelper { SpinLock_InitHelper() { @@ -91,36 +72,111 @@ static SpinLock_InitHelper init_helper; } // unnamed namespace +// Monitor the lock to see if its value changes within some time period +// (adaptive_spin_count loop iterations). A timestamp indicating +// when the thread initially started waiting for the lock is passed in via +// the initial_wait_timestamp value. The total wait time in cycles for the +// lock is returned in the wait_cycles parameter. The last value read +// from the lock is returned from the method. +Atomic32 SpinLock::SpinLoop(int64 initial_wait_timestamp, + Atomic32* wait_cycles) { + int c = adaptive_spin_count; + while (base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree && --c > 0) { + } + Atomic32 spin_loop_wait_cycles = CalculateWaitCycles(initial_wait_timestamp); + Atomic32 lock_value = + base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + spin_loop_wait_cycles); + *wait_cycles = spin_loop_wait_cycles; + return lock_value; +} void SpinLock::SlowLock() { - int c = adaptive_spin_count; + // The lock was not obtained initially, so this thread needs to wait for + // it. Record the current timestamp in the local variable wait_start_time + // so the total wait time can be stored in the lockword once this thread + // obtains the lock. 
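As a quick reference for the lockword_ encoding the comment above relies on, here is a decoding sketch. It is illustrative only; the constants mirror the kSpinLockFree/kSpinLockHeld/kSpinLockSleeper enums added to spinlock.h later in this patch.

// How to read a lockword_ value under the new scheme.
const char* DescribeLockword(Atomic32 v) {
  if (v == 0) return "free";              // kSpinLockFree
  if (v == 1) return "held, no waiters";  // kSpinLockHeld
  // Anything else means "held, with waiters": the value is the waiter's
  // spin time in cycles, right-shifted by PROFILE_TIMESTAMP_SHIFT (7),
  // with kSpinLockSleeper (2) OR'd in so it can never collide with the
  // free/held encodings.
  return "held, with waiters (value encodes wait cycles)";
}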
+ int64 wait_start_time = CycleClock::Now(); + Atomic32 wait_cycles; + Atomic32 lock_value = SpinLoop(wait_start_time, &wait_cycles); - // Spin a few times in the hope that the lock holder releases the lock - while ((c > 0) && (lockword_ != 0)) { - c--; - } + int lock_wait_call_count = 0; + while (lock_value != kSpinLockFree) { + // If the lock is currently held, but not marked as having a sleeper, mark + // it as having a sleeper. + if (lock_value == kSpinLockHeld) { + // Here, just "mark" that the thread is going to sleep. Don't store the + // lock wait time in the lock as that will cause the current lock + // owner to think it experienced contention. + lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_, + kSpinLockHeld, + kSpinLockSleeper); + if (lock_value == kSpinLockHeld) { + // Successfully transitioned to kSpinLockSleeper. Pass + // kSpinLockSleeper to the SpinLockWait routine to properly indicate + // the last lock_value observed. + lock_value = kSpinLockSleeper; + } else if (lock_value == kSpinLockFree) { + // Lock is free again, so try and acquire it before sleeping. The + // new lock state will be the number of cycles this thread waited if + // this thread obtains the lock. + lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_, + kSpinLockFree, + wait_cycles); + continue; // skip the delay at the end of the loop + } + } - if (lockword_ == 1) { - int32 now = (CycleClock::Now() >> PROFILE_TIMESTAMP_SHIFT); - // Don't loose the lock: make absolutely sure "now" is not zero - now |= 1; - // Atomically replace the value of lockword_ with "now" if - // lockword_ is 1, thereby remembering the first timestamp to - // be recorded. - base::subtle::NoBarrier_CompareAndSwap(&lockword_, 1, now); - // base::subtle::NoBarrier_CompareAndSwap() returns: - // 0: the lock is/was available; nothing stored - // 1: our timestamp was stored - // > 1: an older timestamp is already in lockword_; nothing stored + // Wait for an OS specific delay. + base::internal::SpinLockDelay(&lockword_, lock_value, + ++lock_wait_call_count); + // Spin again after returning from the wait routine to give this thread + // some chance of obtaining the lock. + lock_value = SpinLoop(wait_start_time, &wait_cycles); } - - SpinLockWait(&lockword_); // wait until lock acquired; OS specific } -void SpinLock::SlowUnlock(int64 wait_timestamp) { - SpinLockWake(&lockword_); // wake waiter if necessary; OS specific +// The wait time for contentionz lock profiling must fit into 32 bits. +// However, the lower 32-bits of the cycle counter wrap around too quickly +// with high frequency processors, so a right-shift by 7 is performed to +// quickly divide the cycles by 128. Using these 32 bits reduces the +// granularity of time measurement to 128 cycles, and loses track +// of wait time for waits greater than 109 seconds on a 5 GHz machine +// [(2^32 cycles/5 GHz)*128 = 109.95 seconds]. Waits this long should be +// very rare and the reduced granularity should not be an issue given +// processors in the Google fleet operate at a minimum of one billion +// cycles/sec. +enum { PROFILE_TIMESTAMP_SHIFT = 7 }; + +void SpinLock::SlowUnlock(uint64 wait_cycles) { + base::internal::SpinLockWake(&lockword_, false); // wake waiter if necessary + + // Collect contentionz profile info, expanding the wait_cycles back out to + // the full value. If wait_cycles is <= kSpinLockSleeper, then no wait + // was actually performed, so don't record the wait time.
Note that the + // CalculateWaitCycles method adds in kSpinLockSleeper cycles + // unconditionally to guarantee the wait time is not kSpinLockFree or + // kSpinLockHeld. Adding in this small number of cycles may + // overestimate the contention by a slight amount 50% of the time. However, + // if this code tried to correct for that addition by subtracting out the + // kSpinLockSleeper amount that would underestimate the contention slightly + // 50% of the time. Both ways get the wrong answer, so the code + // overestimates to be more conservative. Overestimating also makes the code + // a little simpler. + // + if (wait_cycles > kSpinLockSleeper) { + base::SubmitSpinLockProfileData(this, + wait_cycles << PROFILE_TIMESTAMP_SHIFT); + } +} - // Collect contentionz profile info. Subtract one from wait_timestamp as - // antidote to "now |= 1;" in SlowLock(). - SubmitSpinLockProfileData(this, wait_timestamp - 1); +inline int32 SpinLock::CalculateWaitCycles(int64 wait_start_time) { + int32 wait_cycles = ((CycleClock::Now() - wait_start_time) >> + PROFILE_TIMESTAMP_SHIFT); + // The number of cycles waiting for the lock is used as both the + // wait_cycles and lock value, so it can't be kSpinLockFree or + // kSpinLockHeld. Make sure the value returned is at least + // kSpinLockSleeper. + wait_cycles |= kSpinLockSleeper; + return wait_cycles; } diff --git a/src/base/spinlock.h b/src/base/spinlock.h index 9e633c4..c2be4fd 100644 --- a/src/base/spinlock.h +++ b/src/base/spinlock.h @@ -44,14 +44,14 @@ #define BASE_SPINLOCK_H_ #include <config.h> -#include "base/basictypes.h" #include "base/atomicops.h" +#include "base/basictypes.h" #include "base/dynamic_annotations.h" #include "base/thread_annotations.h" class LOCKABLE SpinLock { public: - SpinLock() : lockword_(0) { } + SpinLock() : lockword_(kSpinLockFree) { } // Special constructor for use with static SpinLock objects. E.g., // @@ -70,18 +70,21 @@ class LOCKABLE SpinLock { // TODO(csilvers): uncomment the annotation when we figure out how to // support this macro with 0 args (see thread_annotations.h) inline void Lock() /*EXCLUSIVE_LOCK_FUNCTION()*/ { - if (Acquire_CompareAndSwap(&lockword_, 0, 1) != 0) { + if (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + kSpinLockHeld) != kSpinLockFree) { SlowLock(); } ANNOTATE_RWLOCK_ACQUIRED(this, 1); } - // Acquire this SpinLock and return true if the acquisition can be - // done without blocking, else return false. If this SpinLock is - // free at the time of the call, TryLock will return true with high - // probability. + // Try to acquire this SpinLock without blocking and return true if the + // acquisition was successful. If the lock was not acquired, false is + // returned. If this SpinLock is free at the time of the call, TryLock + // will return true with high probability. inline bool TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(true) { - bool res = (Acquire_CompareAndSwap(&lockword_, 0, 1) == 0); + bool res = + (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + kSpinLockHeld) == kSpinLockFree); if (res) { ANNOTATE_RWLOCK_ACQUIRED(this, 1); } @@ -92,47 +95,37 @@ class LOCKABLE SpinLock { // TODO(csilvers): uncomment the annotation when we figure out how to // support this macro with 0 args (see thread_annotations.h) inline void Unlock() /*UNLOCK_FUNCTION()*/ { - // This is defined in mutex.cc.
- extern void SubmitSpinLockProfileData(const void *, int64); - - int64 wait_timestamp = static_cast(lockword_); + uint64 wait_cycles = + static_cast(base::subtle::NoBarrier_Load(&lockword_)); ANNOTATE_RWLOCK_RELEASED(this, 1); - Release_Store(&lockword_, 0); - if (wait_timestamp != 1) { + base::subtle::Release_Store(&lockword_, kSpinLockFree); + if (wait_cycles != kSpinLockHeld) { // Collect contentionz profile info, and speed the wakeup of any waiter. - // The lockword_ value indicates when the waiter started waiting. - SlowUnlock(wait_timestamp); + // The wait_cycles value indicates how long this thread spent waiting + // for the lock. + SlowUnlock(wait_cycles); } } - // Report if we think the lock can be held by this thread. - // When the lock is truly held by the invoking thread - // we will always return true. - // Indended to be used as CHECK(lock.IsHeld()); + // Determine if the lock is held. When the lock is held by the invoking + // thread, true will always be returned. Intended to be used as + // CHECK(lock.IsHeld()). inline bool IsHeld() const { - return lockword_ != 0; + return base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree; } - // The timestamp for contention lock profiling must fit into 31 bits. - // as lockword_ is 32 bits and we loose an additional low-order bit due - // to the statement "now |= 1" in SlowLock(). - // To select 31 bits from the 64-bit cycle counter, we shift right by - // PROFILE_TIMESTAMP_SHIFT = 7. - // Using these 31 bits, we reduce granularity of time measurement to - // 256 cycles, and will loose track of wait time for waits greater than - // 109 seconds on a 5 GHz machine, longer for faster clock cycles. - // Waits this long should be very rare. - enum { PROFILE_TIMESTAMP_SHIFT = 7 }; - static const base::LinkerInitialized LINKER_INITIALIZED; // backwards compat private: - // Lock-state: 0 means unlocked; 1 means locked with no waiters; values - // greater than 1 indicate locked with waiters, where the value is the time - // the first waiter started waiting and is used for contention profiling. + enum { kSpinLockFree = 0 }; + enum { kSpinLockHeld = 1 }; + enum { kSpinLockSleeper = 2 }; + volatile Atomic32 lockword_; void SlowLock(); - void SlowUnlock(int64 wait_timestamp); + void SlowUnlock(uint64 wait_cycles); + Atomic32 SpinLoop(int64 initial_wait_timestamp, Atomic32* wait_cycles); + inline int32 CalculateWaitCycles(int64 wait_start_time); DISALLOW_COPY_AND_ASSIGN(SpinLock); }; diff --git a/src/base/spinlock_internal.cc b/src/base/spinlock_internal.cc new file mode 100644 index 0000000..b5b6ca4 --- /dev/null +++ b/src/base/spinlock_internal.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// The OS-specific header included below must provide two calls: +// base::internal::SpinLockDelay() and base::internal::SpinLockWake(). +// See spinlock_internal.h for the spec of SpinLockWake(). + +// void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) +// SpinLockDelay() generates an appropriate spin delay on iteration "loop" of a +// spin loop on location *w, whose previously observed value was "value". +// SpinLockDelay() may do nothing, may yield the CPU, may sleep a clock tick, +// or may wait for a delay that can be truncated by a call to SpinLockWake(w). +// In all cases, it must return in bounded time even if SpinLockWake() is not +// called. + +#include "base/spinlock_internal.h" + +#if defined(_WIN32) +#include "base/spinlock_win32-inl.h" +#elif defined(__linux__) +#include "base/spinlock_linux-inl.h" +#else +#include "base/spinlock_posix-inl.h" +#endif + +namespace base { +namespace internal { + +// See spinlock_internal.h for spec. +int32 SpinLockWait(volatile Atomic32 *w, int n, + const SpinLockWaitTransition trans[]) { + int32 v; + bool done = false; + for (int loop = 0; !done; loop++) { + v = base::subtle::Acquire_Load(w); + int i; + for (i = 0; i != n && v != trans[i].from; i++) { + } + if (i == n) { + SpinLockDelay(w, v, loop); // no matching transition + } else if (trans[i].to == v || // null transition + base::subtle::Acquire_CompareAndSwap(w, v, trans[i].to) == v) { + done = trans[i].done; + } + } + return v; +} + +} // namespace internal +} // namespace base diff --git a/src/base/spinlock_internal.h b/src/base/spinlock_internal.h new file mode 100644 index 0000000..4494260 --- /dev/null +++ b/src/base/spinlock_internal.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * This file is an internal part of spinlock.cc and once.cc. + * It may not be used directly by code outside of //base. + */ + +#ifndef BASE_SPINLOCK_INTERNAL_H_ +#define BASE_SPINLOCK_INTERNAL_H_ + +#include <config.h> +#include "base/basictypes.h" +#include "base/atomicops.h" + +namespace base { +namespace internal { + +// SpinLockWait() waits until it can perform one of several transitions from +// "from" to "to". It returns when it performs a transition where done==true. +struct SpinLockWaitTransition { + int32 from; + int32 to; + bool done; +}; + +// Wait until *w can transition from trans[i].from to trans[i].to for some i +// satisfying 0 <= i < n && trans[i].done, atomically make that transition, +// and then return the old value of *w. +int32 SpinLockWait(volatile Atomic32 *w, int n, + const SpinLockWaitTransition trans[]); + +// If possible, wake some thread that has called SpinLockDelay(w, ...). +void SpinLockWake(volatile Atomic32 *w, bool all); + +// Wait for an OS-specific delay; see the spec in spinlock_internal.cc. +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop); + +} // namespace internal +} // namespace base + +#endif // BASE_SPINLOCK_INTERNAL_H_ diff --git a/src/base/spinlock_linux-inl.h b/src/base/spinlock_linux-inl.h --- a/src/base/spinlock_linux-inl.h +++ b/src/base/spinlock_linux-inl.h #include <time.h> +#include <limits.h> #include "base/linux_syscall_support.h" #define FUTEX_WAIT 0 @@ -48,7 +49,7 @@ static struct InitModule { int x = 0; // futexes are ints, so we can use them only when // that's the same size as the lockword_ in SpinLock. - have_futex = (sizeof (Atomic32) == sizeof (int) && + have_futex = (sizeof (Atomic32) == sizeof (int) && sys_futex(&x, FUTEX_WAKE, 1, 0) >= 0); if (have_futex && sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, 0) < 0) { @@ -56,36 +57,41 @@ } } } init_module; + } // anonymous namespace -static void SpinLockWait(volatile Atomic32 *w) { - int save_errno = errno; - struct timespec tm; - tm.tv_sec = 0; - if (have_futex) { - int value; - tm.tv_nsec = 1000000; // 1ms; really we're trying to sleep for one kernel - // clock tick - while ((value = base::subtle::Acquire_CompareAndSwap(w, 0, 1)) != 0) { - sys_futex(reinterpret_cast<int*>(const_cast<Atomic32*>(w)), - FUTEX_WAIT | futex_private_flag, - value, reinterpret_cast<struct kernel_timespec*>(&tm)); - } - } else { - tm.tv_nsec = 2000001; // above 2ms so linux 2.4 doesn't spin - if (base::subtle::NoBarrier_Load(w) != 0) { - sched_yield(); + +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + if (loop != 0) { + int save_errno = errno; + struct timespec tm; + tm.tv_sec = 0; + if (have_futex) { + tm.tv_nsec = 1000000; // 1ms; really we're trying to sleep for one + // kernel clock tick + } else { + tm.tv_nsec = 2000001; // above 2ms so linux 2.4 doesn't spin } - while (base::subtle::Acquire_CompareAndSwap(w, 0, 1) != 0) { + if (have_futex) { + sys_futex(reinterpret_cast<int*>(const_cast<Atomic32*>(w)), + FUTEX_WAIT | futex_private_flag, + value, reinterpret_cast<struct kernel_timespec*>(&tm)); + } else { nanosleep(&tm, NULL); } + errno = save_errno; } - errno = save_errno; } -static void SpinLockWake(volatile Atomic32 *w) { +void SpinLockWake(volatile Atomic32 *w, bool all) { if (have_futex) { sys_futex(reinterpret_cast<int*>(const_cast<Atomic32*>(w)), - FUTEX_WAKE | futex_private_flag, 1, 0); + FUTEX_WAKE | futex_private_flag, all?
INT_MAX : 1, 0); } } + +} // namespace internal +} // namespace base diff --git a/src/base/spinlock_posix-inl.h b/src/base/spinlock_posix-inl.h index 0d933c0..d188ebd 100644 --- a/src/base/spinlock_posix-inl.h +++ b/src/base/spinlock_posix-inl.h @@ -28,25 +28,35 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * --- - * This file is a Posix-specific part of spinlock.cc + * This file is a Posix-specific part of spinlock_internal.cc */ -#include -#include +#include +#include +#ifdef HAVE_SCHED_H +#include /* For sched_yield() */ +#endif +#include /* For nanosleep() */ -static void SpinLockWait(volatile Atomic32 *w) { +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { int save_errno = errno; - struct timespec tm; - tm.tv_sec = 0; - tm.tv_nsec = 1000000; - if (base::subtle::NoBarrier_Load(w) != 0) { + if (loop == 0) { + } else if (loop == 1) { sched_yield(); - } - while (base::subtle::Acquire_CompareAndSwap(w, 0, 1) != 0) { + } else { + struct timespec tm; + tm.tv_sec = 0; + tm.tv_nsec = 1000000; nanosleep(&tm, NULL); } errno = save_errno; } -static void SpinLockWake(volatile Atomic32 *w) { +void SpinLockWake(volatile Atomic32 *w, bool all) { } + +} // namespace internal +} // namespace base diff --git a/src/base/spinlock_win32-inl.h b/src/base/spinlock_win32-inl.h index 9058939..ee23541 100644 --- a/src/base/spinlock_win32-inl.h +++ b/src/base/spinlock_win32-inl.h @@ -28,20 +28,26 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * --- - * This file is a Win32-specific part of spinlock.cc + * This file is a Win32-specific part of spinlock_internal.cc */ #include -static void SpinLockWait(volatile Atomic32 *w) { - if (base::subtle::NoBarrier_Load(w) != 0) { +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + if (loop == 0) { + } else if (loop == 1) { Sleep(0); - } - while (base::subtle::Acquire_CompareAndSwap(w, 0, 1) != 0) { + } else { Sleep(1); } } -static void SpinLockWake(volatile Atomic32 *w) { +void SpinLockWake(volatile Atomic32 *w, bool all) { } + +} // namespace internal +} // namespace base diff --git a/src/base/synchronization_profiling.h b/src/base/synchronization_profiling.h new file mode 100644 index 0000000..cf02c21 --- /dev/null +++ b/src/base/synchronization_profiling.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Chris Ruemmler + */ + +#ifndef BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ +#define BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ + +#include "base/basictypes.h" + +namespace base { + +// We can do contention-profiling of SpinLocks, but the code is in +// mutex.cc, which is not always linked in with spinlock. Hence we +// provide a weak definition, which is used if mutex.cc isn't linked in. + +// Submit the number of cycles the spinlock spent contending. +ATTRIBUTE_WEAK extern void SubmitSpinLockProfileData(const void *, int64); +extern void SubmitSpinLockProfileData(const void *contendedlock, + int64 wait_cycles) {} +} +#endif // BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ diff --git a/src/base/sysinfo.cc b/src/base/sysinfo.cc index 7cfa051..c1e2aef 100644 --- a/src/base/sysinfo.cc +++ b/src/base/sysinfo.cc @@ -111,20 +111,23 @@ // 8K), so it's not an ideal solution. const char* GetenvBeforeMain(const char* name) { #if defined(HAVE___ENVIRON) // if we have it, it's declared in unistd.h - const int namelen = strlen(name); - for (char** p = __environ; *p; p++) { - if (!memcmp(*p, name, namelen) && (*p)[namelen] == '=') // it's a match - return *p + namelen+1; // point after = + if (__environ) { // can exist but be NULL, if statically linked + const int namelen = strlen(name); + for (char** p = __environ; *p; p++) { + if (!memcmp(*p, name, namelen) && (*p)[namelen] == '=') // it's a match + return *p + namelen+1; // point after = + } + return NULL; } - return NULL; -#elif defined(PLATFORM_WINDOWS) +#endif +#if defined(PLATFORM_WINDOWS) // TODO(mbelshe) - repeated calls to this function will overwrite the // contents of the static buffer. - static char envbuf[1024]; // enough to hold any envvar we care about - if (!GetEnvironmentVariableA(name, envbuf, sizeof(envbuf)-1)) + static char envvar_buf[1024]; // enough to hold any envvar we care about + if (!GetEnvironmentVariableA(name, envvar_buf, sizeof(envvar_buf)-1)) return NULL; - return envbuf; -#else + return envvar_buf; +#endif // static is ok because this function should only be called before // main(), when we're single-threaded. static char envbuf[16<<10]; @@ -152,7 +155,6 @@ const char* GetenvBeforeMain(const char* name) { p = endp + 1; } return NULL; // env var never found -#endif } // This takes as an argument an environment-variable name (like @@ -229,6 +231,26 @@ static int64 EstimateCyclesPerSecond(const int estimate_time_ms) { return guess; } +// Helper function for reading an int from a file. Returns true if successful +// and the memory location pointed to by value is set to the value read.
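A usage sketch for the new ReadIntFromFile helper described above (hypothetical caller; the real call sites appear in InitializeSystemInfo() just below):

int freq_khz = 0;
if (ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq_khz)) {
  // A file containing "2000000\n" parses as 2000000 (kHz): a trailing
  // newline is accepted, while any other trailing junk makes the call
  // return false and leave freq_khz untouched.
  double cycles_per_second = freq_khz * 1000.0;
}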
+static bool ReadIntFromFile(const char *file, int *value) { + bool ret = false; + int fd = open(file, O_RDONLY); + if (fd != -1) { + char line[1024]; + char* err; + memset(line, '\0', sizeof(line)); + read(fd, line, sizeof(line) - 1); + const int temp_value = strtol(line, &err, 10); + if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { + *value = temp_value; + ret = true; + } + close(fd); + } + return ret; +} + // WARNING: logging calls back to InitializeSystemInfo() so it must // not invoke any logging code. Also, InitializeSystemInfo() can be // called before main() -- in fact it *must* be since already_called @@ -254,26 +276,31 @@ static void InitializeSystemInfo() { #if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__) char line[1024]; char* err; + int freq; + + // If the kernel is exporting the tsc frequency, use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporting an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } // If CPU scaling is in effect, we want to use the *maximum* frequency, // not whatever CPU speed some random processor happens to be using now. - if (!saw_mhz) { - const char* pname0 = - "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"; - int fd0 = open(pname0, O_RDONLY); - if (fd0 != -1) { - memset(line, '\0', sizeof(line)); - read(fd0, line, sizeof(line)); - const int max_freq = strtol(line, &err, 10); - if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { - // The value is in kHz. For example, on a 2GHz machine, the file - // contains the value "2000000". Historically this file contained no - // newline, but at some point the kernel started appending a newline. - cpuinfo_cycles_per_second = max_freq * 1000.0; - saw_mhz = true; - } - close(fd0); - } + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz. For example, on a 2GHz machine, the file + // contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; } // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
@@ -311,20 +338,20 @@ if (newline != NULL) *newline = '\0'; - if (!saw_mhz && strncmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { + if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { const char* freqstr = strchr(line, ':'); if (freqstr) { cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; if (freqstr[1] != '\0' && *err == '\0') saw_mhz = true; } - } else if (strncmp(line, "bogomips", sizeof("bogomips")-1) == 0) { + } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) { const char* freqstr = strchr(line, ':'); if (freqstr) bogo_clock = strtod(freqstr+1, &err) * 1000000.0; if (freqstr == NULL || freqstr[1] == '\0' || *err != '\0') bogo_clock = 1.0; - } else if (strncmp(line, "processor", sizeof("processor")-1) == 0) { + } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) { num_cpus++; // count up every time we see a "processor :" entry } } while (chars_read > 0); @@ -888,9 +915,10 @@ namespace tcmalloc { // Helper to add the list of mapped shared libraries to a profile. // Fill formatted "/proc/self/maps" contents into buffer 'buf' of size 'size' -// and return the actual size occupied in 'buf'. +// and return the actual size occupied in 'buf'. We set wrote_all to true +// if we successfully wrote all proc lines to buf, and to false otherwise. // We do not provision for 0-terminating 'buf'. -int FillProcSelfMaps(char buf[], int size) { +int FillProcSelfMaps(char buf[], int size, bool* wrote_all) { ProcMapsIterator::Buffer iterbuf; ProcMapsIterator it(0, &iterbuf); // 0 means "current pid" @@ -898,10 +926,17 @@ int FillProcSelfMaps(char buf[], int size) { int64 inode; char *flags, *filename; int bytes_written = 0; + *wrote_all = true; while (it.Next(&start, &end, &flags, &offset, &inode, &filename)) { - bytes_written += it.FormatLine(buf + bytes_written, size - bytes_written, - start, end, flags, offset, inode, filename, - 0); + const int line_length = it.FormatLine(buf + bytes_written, + size - bytes_written, + start, end, flags, offset, + inode, filename, 0); + if (line_length == 0) + *wrote_all = false; // failed to write this line out + else + bytes_written += line_length; + } return bytes_written; } diff --git a/src/base/sysinfo.h b/src/base/sysinfo.h index 0bcc1f5..8bae5e3 100644 --- a/src/base/sysinfo.h +++ b/src/base/sysinfo.h @@ -226,7 +226,7 @@ class ProcMapsIterator { // Helper routines namespace tcmalloc { -int FillProcSelfMaps(char buf[], int size); +int FillProcSelfMaps(char buf[], int size, bool* wrote_all); void DumpProcSelfMaps(RawFD fd); } diff --git a/src/base/thread_annotations.h b/src/base/thread_annotations.h index f1b3593..f57b299 100644 --- a/src/base/thread_annotations.h +++ b/src/base/thread_annotations.h @@ -46,7 +46,9 @@ #define BASE_THREAD_ANNOTATIONS_H_ -#if defined(__GNUC__) && defined(__SUPPORT_TS_ANNOTATION__) && (!defined(SWIG)) +#if defined(__GNUC__) \ + && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) \ + && defined(__SUPPORT_TS_ANNOTATION__) && (!defined(SWIG)) #define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) #else #define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op #endif diff --git a/src/base/vdso_support.cc b/src/base/vdso_support.cc index fce7c2c..73c6545 100644 --- a/src/base/vdso_support.cc +++ b/src/base/vdso_support.cc @@ -395,7 +395,7 @@ const void *VDSOSupport::Init() { } // Subtle: this code runs outside of any locks; prevent compiler // from assigning to getcpu_fn_ more than once.
- MemoryBarrier(); + base::subtle::MemoryBarrier(); getcpu_fn_ = fn; return vdso_base_; } diff --git a/src/common.h b/src/common.h index e2906d6..53050ca 100644 --- a/src/common.h +++ b/src/common.h @@ -77,6 +77,8 @@ static const size_t kPageSize = 1 << kPageShift; static const size_t kMaxSize = 8u * kPageSize; static const size_t kAlignment = 8; static const size_t kLargeSizeClass = 0; +// For all span-lengths < kMaxPages we keep an exact-size list. +static const size_t kMaxPages = 1 << (20 - kPageShift); // Default bound on the total amount of thread caches. static const size_t kDefaultOverallThreadCacheSize = 8u * kMaxThreadCacheSize; @@ -102,6 +104,17 @@ static const int kMaxDynamicFreeListLength = 8192; static const Length kMaxValidPages = (~static_cast(0)) >> kPageShift; +#ifdef __x86_64__ +// All current and planned x86_64 processors only look at the lower 48 bits +// in virtual to physical address translation. The top 16 are thus unused. +// TODO(rus): Under what operating systems can we increase it safely to 17? +// This lets us use smaller page maps. On first allocation, a 36-bit page map +// uses only 96 KB instead of the 4.5 MB used by a 52-bit page map. +static const int kAddressBits = 48; +#else +static const int kAddressBits = 8 * sizeof(void*); +#endif + namespace tcmalloc { // Convert byte size into pages. This won't overflow, but may return diff --git a/src/config.h.in b/src/config.h.in index a1d5c68..6ee2db0 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -119,6 +119,9 @@ /* Define to 1 if the system has the type `struct mallinfo'. */ #undef HAVE_STRUCT_MALLINFO +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PARAM_H + /* Define to 1 if you have the header file. */ #undef HAVE_SYS_PRCTL_H @@ -173,6 +176,10 @@ /* Define to 1 if int32_t is equivalent to intptr_t */ #undef INT32_EQUALS_INTPTR +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#undef LT_OBJDIR + /* Define to 1 if your C compiler doesn't accept -c and -o together. */ #undef NO_MINUS_C_MINUS_O diff --git a/src/debugallocation.cc b/src/debugallocation.cc index 949fbe9..3b34c8c 100644 --- a/src/debugallocation.cc +++ b/src/debugallocation.cc @@ -497,7 +497,7 @@ class MallocBlock { // practical effect is that allocations are limited to 4Gb or so, even if // the address space could take more. static size_t max_size_t = ~0; - if (size < 0 || size > max_size_t - sizeof(MallocBlock)) { + if (size > max_size_t - sizeof(MallocBlock)) { RAW_LOG(ERROR, "Massive size passed to malloc: %"PRIuS"", size); return NULL; } @@ -1356,6 +1356,20 @@ class DebugMallocImplementation : public ParentImplementation { virtual size_t GetEstimatedAllocatedSize(size_t size) { return size; } + + virtual void GetFreeListSizes(vector* v) { + static const char* kDebugFreeQueue = "debug.free_queue"; + + ParentImplementation::GetFreeListSizes(v); + + MallocExtension::FreeListInfo i; + i.type = kDebugFreeQueue; + i.min_object_size = 0; + i.max_object_size = numeric_limits::max(); + i.total_bytes_free = MallocBlock::FreeQueueSize(); + v->push_back(i); + } + }; static DebugMallocImplementation debug_malloc_implementation; diff --git a/src/google/heap-checker.h b/src/google/heap-checker.h index c0ee8a8..f46f353 100644 --- a/src/google/heap-checker.h +++ b/src/google/heap-checker.h @@ -136,7 +136,7 @@ class PERFTOOLS_DLL_DECL HeapLeakChecker { bool NoLeaks() { return DoNoLeaks(DO_NOT_SYMBOLIZE); } // These forms are obsolete; use NoLeaks() instead. 
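The page-map numbers in the common.h comment above (96 KB vs 4.5 MB on first allocation) check out if one assumes tcmalloc's three-level TCMalloc_PageMap3 with 4 KB pages (kPageShift = 12), 8-byte pointers, and its INTERIOR_BITS = (BITS + 2)/3 split; a worked sketch of that arithmetic:

// BITS = kAddressBits - kPageShift = bits of page number to map.
//
// 64-bit pointers: BITS = 64 - 12 = 52
//   INTERIOR_BITS = (52 + 2) / 3 = 18, LEAF_BITS = 52 - 2*18 = 16
//   First allocation touches one root, one interior, and one leaf node:
//   2^18 * 8 + 2^18 * 8 + 2^16 * 8 = 2 MB + 2 MB + 0.5 MB = 4.5 MB
//
// 48-bit addresses: BITS = 48 - 12 = 36
//   INTERIOR_BITS = (36 + 2) / 3 = 12, LEAF_BITS = 36 - 2*12 = 12
//   2^12 * 8 + 2^12 * 8 + 2^12 * 8 = 32 KB + 32 KB + 32 KB = 96 KB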
- // TODO(csilvers): mark with ATTRIBUTE_DEPRECATED. + // TODO(csilvers): mark as DEPRECATED. bool QuickNoLeaks() { return NoLeaks(); } bool BriefNoLeaks() { return NoLeaks(); } bool SameHeap() { return NoLeaks(); } diff --git a/src/google/malloc_extension.h b/src/google/malloc_extension.h index 3fbefc9..a2e956e 100644 --- a/src/google/malloc_extension.h +++ b/src/google/malloc_extension.h @@ -50,6 +50,7 @@ #include #endif #include +#include // Annoying stuff for windows -- makes sure clients can import these functions #ifndef PERFTOOLS_DLL_DECL @@ -102,12 +103,17 @@ class PERFTOOLS_DLL_DECL MallocExtension { // that allocated these objects. The format of the returned output // is equivalent to the output of the heap profiler and can // therefore be passed to "pprof". + // NOTE: by default, tcmalloc does not do any heap sampling, and this + // function will always return an empty sample. To get useful + // data from GetHeapSample, you must also set the environment + // variable TCMALLOC_SAMPLE_PARAMETER to a value such as 524288. virtual void GetHeapSample(MallocExtensionWriter* writer); // Outputs to "writer" the stack traces that caused growth in the // address space size. The format of the returned output is // equivalent to the output of the heap profiler and can therefore - // be passed to "pprof". + // be passed to "pprof". (This does not depend on, or require, + // TCMALLOC_SAMPLE_PARAMETER.) virtual void GetHeapGrowthStacks(MallocExtensionWriter* writer); // Invokes func(arg, range) for every controlled memory @@ -244,6 +250,45 @@ class PERFTOOLS_DLL_DECL MallocExtension { // malloc implementation during initialization. static void Register(MallocExtension* implementation); + // Returns detailed information about malloc's freelists. For each list, + // return a FreeListInfo: + struct FreeListInfo { + size_t min_object_size; + size_t max_object_size; + size_t total_bytes_free; + const char* type; + }; + // Each item in the vector refers to a different freelist. The lists + // are identified by the range of allocations that objects in the + // list can satisfy ([min_object_size, max_object_size]) and the + // type of freelist (see below). The current size of the list is + // returned in total_bytes_free (which count against a processes + // resident and virtual size). + // + // Currently supported types are: + // + // "tcmalloc.page{_unmapped}" - tcmalloc's page heap. An entry for each size + // class in the page heap is returned. Bytes in "page_unmapped" + // are no longer backed by physical memory and do not count against + // the resident size of a process. + // + // "tcmalloc.large{_unmapped}" - tcmalloc's list of objects larger + // than the largest page heap size class. Only one "large" + // entry is returned. There is no upper-bound on the size + // of objects in the large free list; this call returns + // kint64max for max_object_size. Bytes in + // "large_unmapped" are no longer backed by physical memory + // and do not count against the resident size of a process. + // + // "tcmalloc.central" - tcmalloc's central free-list. One entry per + // size-class is returned. Never unmapped. + // + // "debug.free_queue" - free objects queued by the debug allocator + // and not returned to tcmalloc. + // + // "tcmalloc.thread" - tcmalloc's per-thread caches. Never unmapped. + virtual void GetFreeListSizes(std::vector* v); + protected: // Get a list of stack traces of sampled allocation points. 
diff --git a/src/heap-profile-table.cc b/src/heap-profile-table.cc
index ecaf75f..6d75c4a 100644
--- a/src/heap-profile-table.cc
+++ b/src/heap-profile-table.cc
@@ -342,7 +342,8 @@ int HeapProfileTable::FillOrderedProfile(char buf[], int size) const {
   // any gaps.  Whew!
   int map_length = snprintf(buf, size, "%s", kProcSelfMapsHeader);
   if (map_length < 0 || map_length >= size) return 0;
-  map_length += FillProcSelfMaps(buf + map_length, size - map_length);
+  bool dummy;   // "wrote_all" -- did /proc/self/maps fit in its entirety?
+  map_length += FillProcSelfMaps(buf + map_length, size - map_length, &dummy);
   RAW_DCHECK(map_length <= size, "");
   char* const map_start = buf + size - map_length;  // move to end
   memmove(map_start, buf, map_length);
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
index c2f8b54..1272068 100644
--- a/src/malloc_extension.cc
+++ b/src/malloc_extension.cc
@@ -52,6 +52,7 @@
 #include "maybe_threads.h"
 
 using STL_NAMESPACE::string;
+using STL_NAMESPACE::vector;
 
 static void DumpAddressMap(string* result) {
   *result += "\nMAPPED_LIBRARIES:\n";
@@ -59,9 +60,11 @@ static void DumpAddressMap(string* result) {
   const size_t old_resultlen = result->size();
   for (int amap_size = 10240; amap_size < 10000000; amap_size *= 2) {
     result->resize(old_resultlen + amap_size);
+    bool wrote_all = false;
     const int bytes_written =
-        tcmalloc::FillProcSelfMaps(&((*result)[old_resultlen]), amap_size);
-    if (bytes_written < amap_size - 1) {   // we fit!
+        tcmalloc::FillProcSelfMaps(&((*result)[old_resultlen]), amap_size,
+                                   &wrote_all);
+    if (wrote_all) {   // we fit!
       (*result)[old_resultlen + bytes_written] = '\0';
       result->resize(old_resultlen + bytes_written);
       return;
@@ -167,6 +170,11 @@ size_t MallocExtension::GetAllocatedSize(void* p) {
   return 0;
 }
 
+void MallocExtension::GetFreeListSizes(
+    vector<MallocExtension::FreeListInfo>* v) {
+  v->clear();
+}
+
 // The current malloc extension object.
 
 static pthread_once_t module_init = PTHREAD_ONCE_INIT;
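The explicit wrote_all out-parameter lets DumpAddressMap keep its doubling loop without the old "bytes_written < amap_size - 1" heuristic, which could misfire when the output exactly filled the buffer. The grow-and-retry pattern in isolation (a sketch; Fill() is a hypothetical stand-in for FillProcSelfMaps):

    #include <string.h>
    #include <string>

    // Hypothetical stand-in for FillProcSelfMaps: copies some source text,
    // truncating at "size", and reports via *wrote_all whether it all fit.
    static const char kSource[] = "...pretend this is /proc/self/maps...";
    static int Fill(char* buf, int size, bool* wrote_all) {
      int n = (int)(sizeof(kSource) - 1);
      *wrote_all = (n <= size);
      if (n > size) n = size;
      memcpy(buf, kSource, n);
      return n;
    }

    static std::string ReadAll() {
      std::string result;
      for (int size = 10240; size < 10000000; size *= 2) {
        result.resize(size);
        bool wrote_all = false;
        const int written = Fill(&result[0], size, &wrote_all);
        if (wrote_all) {            // we fit!
          result.resize(written);
          break;
        }
      }
      return result;
    }

    int main() { return ReadAll().empty(); }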
diff --git a/src/memory_region_map.cc b/src/memory_region_map.cc
index f6bed45..3f8509f 100644
--- a/src/memory_region_map.cc
+++ b/src/memory_region_map.cc
@@ -117,7 +117,6 @@
 #include "memory_region_map.h"
 
-#include "base/linux_syscall_support.h"
 #include "base/logging.h"
 #include "base/low_level_alloc.h"
 #include "malloc_hook-inl.h"
diff --git a/src/page_heap.cc b/src/page_heap.cc
index 1e63cb9..c92e16b 100644
--- a/src/page_heap.cc
+++ b/src/page_heap.cc
@@ -338,6 +338,35 @@
 static double PagesToMB(uint64_t pages) {
   return (pages << kPageShift) / 1048576.0;
 }
 
+void PageHeap::GetClassSizes(int64 class_sizes_normal[kMaxPages],
+                             int64 class_sizes_returned[kMaxPages],
+                             int64* normal_pages_in_spans,
+                             int64* returned_pages_in_spans) {
+
+  for (int s = 0; s < kMaxPages; s++) {
+    if (class_sizes_normal != NULL) {
+      class_sizes_normal[s] = DLL_Length(&free_[s].normal);
+    }
+    if (class_sizes_returned != NULL) {
+      class_sizes_returned[s] = DLL_Length(&free_[s].returned);
+    }
+  }
+
+  if (normal_pages_in_spans != NULL) {
+    *normal_pages_in_spans = 0;
+    for (Span* s = large_.normal.next; s != &large_.normal; s = s->next) {
+      *normal_pages_in_spans += s->length;
+    }
+  }
+
+  if (returned_pages_in_spans != NULL) {
+    *returned_pages_in_spans = 0;
+    for (Span* s = large_.returned.next; s != &large_.returned; s = s->next) {
+      *returned_pages_in_spans += s->length;
+    }
+  }
+}
+
 void PageHeap::Dump(TCMalloc_Printer* out) {
   int nonempty_sizes = 0;
   for (int s = 0; s < kMaxPages; s++) {
diff --git a/src/page_heap.h b/src/page_heap.h
index 74030d2..545bdda 100644
--- a/src/page_heap.h
+++ b/src/page_heap.h
@@ -140,6 +140,10 @@ class PERFTOOLS_DLL_DECL PageHeap {
     uint64_t unmapped_bytes;  // Total bytes on returned freelists
   };
   inline Stats stats() const { return stats_; }
+  void GetClassSizes(int64 class_sizes_normal[kMaxPages],
+                     int64 class_sizes_returned[kMaxPages],
+                     int64* normal_pages_in_spans,
+                     int64* returned_pages_in_spans);
 
   bool Check();
   // Like Check() but does some more comprehensive checking.
@@ -176,11 +180,8 @@ class PERFTOOLS_DLL_DECL PageHeap {
   // should keep this value big because various incarnations of Linux
   // have small limits on the number of mmap() regions per
   // address-space.
-  static const int kMinSystemAlloc = 1 << (20 - kPageShift);
-
-  // For all span-lengths < kMaxPages we keep an exact-size list.
-  // REQUIRED: kMaxPages >= kMinSystemAlloc;
-  static const size_t kMaxPages = kMinSystemAlloc;
+  // REQUIRED: kMinSystemAlloc <= kMaxPages;
+  static const int kMinSystemAlloc = kMaxPages;
 
   // Never delay scavenging for more than the following number of
   // deallocated pages.
   // With 4K pages, this comes to 4GB of
@@ -192,8 +193,8 @@ class PERFTOOLS_DLL_DECL PageHeap {
   static const int kDefaultReleaseDelay = 1 << 18;
 
   // Pick the appropriate map and cache types based on pointer size
-  typedef MapSelector<8*sizeof(uintptr_t)>::Type PageMap;
-  typedef MapSelector<8*sizeof(uintptr_t)>::CacheType PageMapCache;
+  typedef MapSelector<kAddressBits>::Type PageMap;
+  typedef MapSelector<kAddressBits>::CacheType PageMapCache;
   PageMap pagemap_;
   mutable PageMapCache pagemap_cache_;
diff --git a/src/pprof b/src/pprof
index e67e42e..a503964 100755
--- a/src/pprof
+++ b/src/pprof
@@ -89,6 +89,7 @@ my %obj_tool_map = (
 );
 my $DOT = "dot";       # leave non-absolute, since it may be in /usr/local
 my $GV = "gv";
+my $EVINCE = "evince";  # could also be xpdf or perhaps acroread
 my $KCACHEGRIND = "kcachegrind";
 my $PS2PDF = "ps2pdf";
 # These are used for dynamic profiles
@@ -103,6 +104,7 @@
 my $GROWTH_PAGE = "/pprof/growth";
 my $CONTENTION_PAGE = "/pprof/contention";
 my $WALL_PAGE = "/pprof/wall(?:\\?.*)?";  # accepts options like namefilter
 my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?";
+my $CENSUSPROFILE_PAGE = "/pprof/censusprofile";  # must support "?seconds=#"
 my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
 my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
@@ -110,7 +112,7 @@
 # All the alternatives must begin with /.
 my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" .
                "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
-               "$FILTEREDPROFILE_PAGE)";
+               "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)";
 
 # default binary name
 my $UNKNOWN_BINARY = "(unknown)";
@@ -148,7 +150,7 @@
pprof [options] <host>:<port>[/<service>]
   The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile,
                         $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall,
-                        or /pprof/filteredprofile.
+                        $CENSUSPROFILE_PAGE, or /pprof/filteredprofile.
   For instance: "pprof http://myserver.com:80$HEAP_PAGE".
   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
pprof --symbols <program>
@@ -180,6 +182,7 @@
Output type:
   --text              Generate text report
   --callgrind         Generate callgrind format to stdout
   --gv                Generate Postscript and display
+  --evince            Generate PDF and display
   --web               Generate SVG and display
   --list=<regexp>     Generate source listing of matching routines
   --disasm=<regexp>   Generate disassembly of matching routines
@@ -304,6 +307,7 @@ sub Init() {
  $main::opt_disasm = "";
  $main::opt_symbols = 0;
  $main::opt_gv = 0;
+ $main::opt_evince = 0;
  $main::opt_web = 0;
  $main::opt_dot = 0;
  $main::opt_ps = 0;
@@ -372,6 +376,7 @@
      "disasm=s"       => \$main::opt_disasm,
      "symbols!"       => \$main::opt_symbols,
      "gv!"            => \$main::opt_gv,
+     "evince!"        => \$main::opt_evince,
      "web!"           => \$main::opt_web,
      "dot!"           => \$main::opt_dot,
      "ps!"            => \$main::opt_ps,
@@ -452,6 +457,7 @@
      ($main::opt_disasm eq '' ? 0 : 1) +
      ($main::opt_symbols == 0 ? 0 : 1) +
      $main::opt_gv +
+     $main::opt_evince +
      $main::opt_web +
      $main::opt_dot +
      $main::opt_ps +
@@ -646,6 +652,8 @@ sub Main() {
   if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
     if ($main::opt_gv) {
       RunGV(TempName($main::next_tmpfile, "ps"), "");
+    } elsif ($main::opt_evince) {
+      RunEvince(TempName($main::next_tmpfile, "pdf"), "");
     } elsif ($main::opt_web) {
       my $tmp = TempName($main::next_tmpfile, "svg");
       RunWeb($tmp);
@@ -708,6 +716,12 @@ sub RunGV {
   }
 }
 
+sub RunEvince {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  system("$EVINCE " . $fname .
+         $bg);
+}
+
 sub RunWeb {
   my $fname = shift;
   print STDERR "Loading web page file:///$fname\n";
@@ -805,6 +819,7 @@ sub InteractiveCommand {
     $main::opt_disasm = 0;
     $main::opt_list = 0;
     $main::opt_gv = 0;
+    $main::opt_evince = 0;
     $main::opt_cum = 0;
 
     if (m/^\s*(text|top)(\d*)\s*(.*)/) {
@@ -878,11 +893,14 @@
       PrintDisassembly($libs, $flat, $cumulative, $routine, $total);
       return 1;
     }
-    if (m/^\s*(gv|web)\s*(.*)/) {
+    if (m/^\s*(gv|web|evince)\s*(.*)/) {
       $main::opt_gv = 0;
+      $main::opt_evince = 0;
       $main::opt_web = 0;
       if ($1 eq "gv") {
         $main::opt_gv = 1;
+      } elsif ($1 eq "evince") {
+        $main::opt_evince = 1;
       } elsif ($1 eq "web") {
         $main::opt_web = 1;
       }
@@ -902,6 +920,8 @@
       if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
         if ($main::opt_gv) {
           RunGV(TempName($main::next_tmpfile, "ps"), " &");
+        } elsif ($main::opt_evince) {
+          RunEvince(TempName($main::next_tmpfile, "pdf"), " &");
         } elsif ($main::opt_web) {
           RunWeb(TempName($main::next_tmpfile, "svg"));
         }
@@ -1685,6 +1705,8 @@ sub PrintDot {
   my $output;
   if ($main::opt_gv) {
     $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps");
+  } elsif ($main::opt_evince) {
+    $output = "| $DOT -Tps2 | $PS2PDF - " . TempName($main::next_tmpfile, "pdf");
   } elsif ($main::opt_ps) {
     $output = "| $DOT -Tps2";
   } elsif ($main::opt_pdf) {
@@ -2955,7 +2977,7 @@ sub FetchDynamicProfile {
     my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout);
     my $cmd = "$fetcher '$url' > '$tmp_profile'";
 
-    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){
      print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n  ${real_profile}\n";
      if ($encourage_patience) {
        print STDERR "Be patient...\n";
@@ -3531,16 +3553,18 @@ sub ReadHeapProfile {
       # The sampling frequency is the rate of a Poisson process.
       # This means that the probability of sampling an allocation of
       # size X with sampling rate Y is 1 - exp(-X/Y)
-      my $ratio;
-      $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-      my $scale_factor;
-      $scale_factor = 1/(1 - exp(-$ratio));
-      $n1 *= $scale_factor;
-      $s1 *= $scale_factor;
-      $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-      $scale_factor = 1/(1 - exp(-$ratio));
-      $n2 *= $scale_factor;
-      $s2 *= $scale_factor;
+      if ($n1 != 0) {
+        my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n1 *= $scale_factor;
+        $s1 *= $scale_factor;
+      }
+      if ($n2 != 0) {
+        my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n2 *= $scale_factor;
+        $s2 *= $scale_factor;
+      }
     } else {
       # Remote-heap version 1
       my $ratio;
@@ -4091,9 +4115,15 @@ sub ExtractSymbols {
   my $symbols = {};
 
-  # Map each PC value to the containing library
-  my %seen = ();
-  foreach my $lib (@{$libs}) {
+  # Map each PC value to the containing library.  To make this faster,
+  # we sort libraries by their starting pc value (highest first), and
+  # advance through the libraries as we advance the pc.  Sometimes the
+  # addresses of libraries may overlap with the addresses of the main
+  # binary, so to make sure the libraries 'win', we iterate over the
+  # libraries in reverse order (which assumes the binary doesn't start
+  # in the middle of a library, which seems a fair assumption).
+  my @pcs = (sort { $a cmp $b } keys(%{$pcset}));  # pcset is 0-extended strings
+  foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) {
     my $libname = $lib->[0];
     my $start = $lib->[1];
     my $finish = $lib->[2];
@@ -4101,12 +4131,21 @@
     # Get list of pcs that belong in this library.
     my $contained = [];
-    foreach my $pc (keys(%{$pcset})) {
-      if (!$seen{$pc} && ($pc ge $start) && ($pc le $finish)) {
-        $seen{$pc} = 1;
-        push(@{$contained}, $pc);
-      }
-    }
+    my ($start_pc_index, $finish_pc_index);
+    # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index].
+    for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0;
+         $finish_pc_index--) {
+      last if $pcs[$finish_pc_index - 1] le $finish;
+    }
+    # Find smallest start_pc_index such that $start <= $pc[$start_pc_index].
+    for ($start_pc_index = $finish_pc_index; $start_pc_index > 0;
+         $start_pc_index--) {
+      last if $pcs[$start_pc_index - 1] lt $start;
+    }
+    # This keeps PC values higher than $pc[$finish_pc_index] in @pcs,
+    # in case there are overlaps in libraries and the main binary.
+    @{$contained} = splice(@pcs, $start_pc_index,
+                           $finish_pc_index - $start_pc_index);
     # Map to symbols
     MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
   }
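This is what speeds up ExtractSymbols: instead of testing every pc against every library (O(pcs x libs)), pprof now sorts the pcs once and carves each library's [start, finish] range out with two binary searches. The same idea rendered in C++ (hypothetical types; the patch itself is Perl):

    #include <algorithm>
    #include <string>
    #include <vector>

    typedef unsigned long long PC;
    struct Lib { std::string name; PC start, finish; };

    static bool ByStartDescending(const Lib& a, const Lib& b) {
      return a.start > b.start;   // highest start first, so libraries "win"
    }

    // For each library, pull the sorted pcs inside [start, finish] out of
    // *pcs with two binary searches; erased pcs can never match a later
    // (lower-addressed) range, mirroring pprof's splice().
    void Bucket(std::vector<PC>* pcs, std::vector<Lib> libs) {
      std::sort(pcs->begin(), pcs->end());
      std::sort(libs.begin(), libs.end(), ByStartDescending);
      for (size_t i = 0; i < libs.size(); ++i) {
        std::vector<PC>::iterator lo =
            std::lower_bound(pcs->begin(), pcs->end(), libs[i].start);
        std::vector<PC>::iterator hi =
            std::upper_bound(lo, pcs->end(), libs[i].finish);
        std::vector<PC> contained(lo, hi);  // would go to MapToSymbols()
        pcs->erase(lo, hi);
      }
    }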
diff --git a/src/profiler.cc b/src/profiler.cc
index 3ac51d4..38fbb93 100644
--- a/src/profiler.cc
+++ b/src/profiler.cc
@@ -111,7 +111,7 @@ class CpuProfiler {
   int           (*filter_)(void*);
   void*         filter_arg_;
 
-  // Opague token returned by the profile handler. To be used when calling
+  // Opaque token returned by the profile handler. To be used when calling
   // ProfileHandlerUnregisterCallback.
   ProfileHandlerToken* prof_handler_token_;
diff --git a/src/system-alloc.cc b/src/system-alloc.cc
index 2505959..e589469 100644
--- a/src/system-alloc.cc
+++ b/src/system-alloc.cc
@@ -46,6 +46,7 @@
 #include <sys/mman.h>
 #endif
 #include <fcntl.h>
+#include "common.h"
 #include "system-alloc.h"
 #include "internal_logging.h"
 #include "base/logging.h"
@@ -73,6 +74,24 @@
 static const bool kDebugMode = false;
 #else
 static const bool kDebugMode = true;
 #endif
 
+// Anonymous namespace to avoid name conflicts on "CheckAddressBits".
+namespace {
+
+// Check that no bit is set at position ADDRESS_BITS or higher.
+template <int ADDRESS_BITS> bool CheckAddressBits(uintptr_t ptr) {
+  return (ptr >> ADDRESS_BITS) == 0;
+}
+
+// Specialize for the bit width of a pointer to avoid undefined shift.
+template <> bool CheckAddressBits<8 * sizeof(void*)>(uintptr_t ptr) {
+  return true;
+}
+
+}  // Anonymous namespace to avoid name conflicts on "CheckAddressBits".
+
+COMPILE_ASSERT(kAddressBits <= 8 * sizeof(void*),
+               address_bits_larger_than_pointer_size);
+
 // Structure for discovering alignment
 union MemoryAligner {
   void* p;
@@ -443,7 +462,16 @@ void* TCMalloc_SystemAlloc(size_t size, size_t *actual_size,
     if (a == NULL) continue;
     if (a->usable_ && !a->failed_) {
       void* result = a->Alloc(size, actual_size, alignment);
-      if (result != NULL) return result;
+      if (result != NULL) {
+        if (actual_size) {
+          CheckAddressBits<kAddressBits>(
+              reinterpret_cast<uintptr_t>(result) + *actual_size - 1);
+        } else {
+          CheckAddressBits<kAddressBits>(
+              reinterpret_cast<uintptr_t>(result) + size - 1);
+        }
+        return result;
+      }
     }
   }
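The template dance above exists because "ptr >> ADDRESS_BITS" is undefined behavior when ADDRESS_BITS equals the bit width of uintptr_t, so the full-width case must be peeled off into a specialization that never shifts. A standalone rendition of the pattern (renamed so it doesn't collide with the patch's function):

    #include <stdint.h>

    template <int ADDRESS_BITS>
    bool FitsInAddressBits(uintptr_t ptr) {
      return (ptr >> ADDRESS_BITS) == 0;  // ADDRESS_BITS < width: well defined
    }

    // Shifting by the full width of the type would be undefined, so the
    // full-width instantiation short-circuits to "always fits".
    template <>
    bool FitsInAddressBits<8 * sizeof(void*)>(uintptr_t ptr) {
      return true;
    }

    // e.g., with kAddressBits == 48 on x86_64:
    //   FitsInAddressBits<48>((uintptr_t)1 << 47)  -> true
    //   FitsInAddressBits<48>((uintptr_t)1 << 48)  -> false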
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 93bdd1d..245407d 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -110,6 +110,7 @@
 #include <errno.h>
 #include <stdarg.h>
 #include <algorithm>
+#include <limits>
 #include <string>
 #include "base/commandlineflags.h"
 #include "base/basictypes.h"               // gets us PRIu64
@@ -136,7 +137,9 @@
 # define WIN32_DO_PATCHING 1
 #endif
 
-using std::max;
+using STL_NAMESPACE::max;
+using STL_NAMESPACE::numeric_limits;
+using STL_NAMESPACE::vector;
 using tcmalloc::AlignmentForSize;
 using tcmalloc::PageHeap;
 using tcmalloc::PageHeapAllocator;
@@ -439,6 +442,52 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
 
   static const double MB = 1048576.0;
 
+  const uint64_t virtual_memory_used = (stats.pageheap.system_bytes
+                                        + stats.metadata_bytes);
+  const uint64_t physical_memory_used = (virtual_memory_used
+                                         - stats.pageheap.unmapped_bytes);
+  const uint64_t bytes_in_use_by_app = (physical_memory_used
+                                        - stats.metadata_bytes
+                                        - stats.pageheap.free_bytes
+                                        - stats.central_bytes
+                                        - stats.transfer_bytes
+                                        - stats.thread_bytes);
+
+  out->printf(
+      "------------------------------------------------\n"
+      "MALLOC:   %12" PRIu64 " (%7.1f MB) Bytes in use by application\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in page heap freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in central cache freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in transfer cache freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in thread cache freelists\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes in malloc metadata\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12" PRIu64 " (%7.1f MB) Actual memory used (physical + swap)\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MB) Bytes released to OS (aka unmapped)\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12" PRIu64 " (%7.1f MB) Virtual address space used\n"
+      "MALLOC:\n"
+      "MALLOC:   %12" PRIu64 " Spans in use\n"
+      "MALLOC:   %12" PRIu64 " Thread heaps in use\n"
+      "MALLOC:   %12" PRIu64 " Tcmalloc page size\n"
+      "------------------------------------------------\n"
+      "Call ReleaseFreeMemory() to release freelist memory to the OS"
+      " (via madvise()).\n"
+      "Bytes released to the OS take up virtual address space"
+      " but no physical memory.\n",
+      bytes_in_use_by_app, bytes_in_use_by_app / MB,
+      stats.pageheap.free_bytes, stats.pageheap.free_bytes / MB,
+      stats.central_bytes, stats.central_bytes / MB,
+      stats.transfer_bytes, stats.transfer_bytes / MB,
+      stats.thread_bytes, stats.thread_bytes / MB,
+      stats.metadata_bytes, stats.metadata_bytes / MB,
+      physical_memory_used, physical_memory_used / MB,
+      stats.pageheap.unmapped_bytes, stats.pageheap.unmapped_bytes / MB,
+      virtual_memory_used, virtual_memory_used / MB,
+      uint64_t(Static::span_allocator()->inuse()),
+      uint64_t(ThreadCache::HeapsInUse()),
+      uint64_t(kPageSize));
+
   if (level >= 2) {
     out->printf("------------------------------------------------\n");
     out->printf("Size class breakdown\n");
@@ -464,38 +513,6 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
     out->printf("------------------------------------------------\n");
     DumpSystemAllocatorStats(out);
   }
-
-  const uint64_t bytes_in_use = stats.pageheap.system_bytes
-                                - stats.pageheap.free_bytes
-                                - stats.pageheap.unmapped_bytes
-                                - stats.central_bytes
-                                - stats.transfer_bytes
-                                - stats.thread_bytes;
-
-  out->printf("------------------------------------------------\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Heap size\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes in use by application\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in page heap\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes unmapped in page heap\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in central cache\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in transfer cache\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Bytes free in thread caches\n"
-              "MALLOC: %12" PRIu64 " Spans in use\n"
-              "MALLOC: %12" PRIu64 " Thread heaps in use\n"
-              "MALLOC: %12" PRIu64 " (%7.1f MB) Metadata allocated\n"
-              "MALLOC: %12" PRIu64 " Tcmalloc page size\n"
-              "------------------------------------------------\n",
-              stats.pageheap.system_bytes, stats.pageheap.system_bytes / MB,
-              bytes_in_use, bytes_in_use / MB,
-              stats.pageheap.free_bytes, stats.pageheap.free_bytes / MB,
-              stats.pageheap.unmapped_bytes, stats.pageheap.unmapped_bytes / MB,
-              stats.central_bytes, stats.central_bytes / MB,
-              stats.transfer_bytes, stats.transfer_bytes / MB,
-              stats.thread_bytes, stats.thread_bytes / MB,
-              uint64_t(Static::span_allocator()->inuse()),
-              uint64_t(ThreadCache::HeapsInUse()),
-              stats.metadata_bytes, stats.metadata_bytes / MB,
-              uint64_t(kPageSize));
 }
 
 static void PrintStats(int level) {
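The reworked table is built so the columns telescope: the application bytes plus the freelists plus metadata add back up to physical memory, and physical plus unmapped gives virtual address space. Made-up numbers to show the arithmetic (illustration only, not patch code):

    #include <assert.h>
    #include <stdint.h>

    int main() {
      // Invented byte counts standing in for the TCMallocStats fields:
      const uint64_t system_bytes   = 100 << 20;  // pageheap.system_bytes
      const uint64_t unmapped_bytes =  10 << 20;  // pageheap.unmapped_bytes
      const uint64_t free_bytes     =  20 << 20;  // pageheap.free_bytes
      const uint64_t central_bytes  =   5 << 20;
      const uint64_t transfer_bytes =   1 << 20;
      const uint64_t thread_bytes   =   4 << 20;
      const uint64_t metadata_bytes =   2 << 20;

      const uint64_t virt = system_bytes + metadata_bytes;
      const uint64_t phys = virt - unmapped_bytes;
      const uint64_t app  = phys - metadata_bytes - free_bytes
                          - central_bytes - transfer_bytes - thread_bytes;

      // The printed lines sum back up, top to bottom:
      assert(app + free_bytes + central_bytes + transfer_bytes
                 + thread_bytes + metadata_bytes == phys);
      assert(phys + unmapped_bytes == virt);
      return 0;
    }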
@@ -609,6 +626,20 @@ class TCMallocImplementation : public MallocExtension {
     }
   }
 
+  // We may print an extra, tcmalloc-specific warning message here.
+  virtual void GetHeapSample(MallocExtensionWriter* writer) {
+    if (FLAGS_tcmalloc_sample_parameter == 0) {
+      const char* const kWarningMsg =
+          "#\n# WARNING: This heap profile does not have any data in it,\n"
+          "# because the application was run with heap sampling turned off.\n"
+          "# To get useful data from GetHeapSample(), you must first\n"
+          "# set the environment variable TCMALLOC_SAMPLE_PARAMETER to a\n"
+          "# positive sampling period, such as 524288.\n#\n";
+      writer->append(kWarningMsg, strlen(kWarningMsg));
+    }
+    MallocExtension::GetHeapSample(writer);
+  }
+
   virtual void** ReadStackTraces(int* sample_period) {
     tcmalloc::StackTraceTable table;
     {
@@ -753,6 +784,99 @@ class TCMallocImplementation : public MallocExtension {
   // unnamed namespace, we need to move the definition below it in the
   // file.
   virtual size_t GetAllocatedSize(void* ptr);
+
+  virtual void GetFreeListSizes(vector<MallocExtension::FreeListInfo>* v) {
+    static const char* kCentralCacheType = "tcmalloc.central";
+    static const char* kTransferCacheType = "tcmalloc.transfer";
+    static const char* kThreadCacheType = "tcmalloc.thread";
+    static const char* kPageHeapType = "tcmalloc.page";
+    static const char* kPageHeapUnmappedType = "tcmalloc.page_unmapped";
+    static const char* kLargeSpanType = "tcmalloc.large";
+    static const char* kLargeUnmappedSpanType = "tcmalloc.large_unmapped";
+
+    v->clear();
+
+    // central class information
+    int64 prev_class_size = 0;
+    for (int cl = 1; cl < kNumClasses; ++cl) {
+      size_t class_size = Static::sizemap()->ByteSizeForClass(cl);
+      MallocExtension::FreeListInfo i;
+      i.min_object_size = prev_class_size + 1;
+      i.max_object_size = class_size;
+      i.total_bytes_free =
+          Static::central_cache()[cl].length() * class_size;
+      i.type = kCentralCacheType;
+      v->push_back(i);
+
+      // transfer cache
+      i.total_bytes_free =
+          Static::central_cache()[cl].tc_length() * class_size;
+      i.type = kTransferCacheType;
+      v->push_back(i);
+
+      prev_class_size = Static::sizemap()->ByteSizeForClass(cl);
+    }
+
+    // Add stats from per-thread heaps
+    uint64_t class_count[kNumClasses];
+    memset(class_count, 0, sizeof(class_count));
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      uint64_t thread_bytes = 0;
+      ThreadCache::GetThreadStats(&thread_bytes, class_count);
+    }
+
+    prev_class_size = 0;
+    for (int cl = 1; cl < kNumClasses; ++cl) {
+      MallocExtension::FreeListInfo i;
+      i.min_object_size = prev_class_size + 1;
+      i.max_object_size = Static::sizemap()->ByteSizeForClass(cl);
+      i.total_bytes_free =
+          class_count[cl] * Static::sizemap()->ByteSizeForClass(cl);
+      i.type = kThreadCacheType;
+      v->push_back(i);
+    }
+
+    // append page heap info
+    int64 page_count_normal[kMaxPages];
+    int64 page_count_returned[kMaxPages];
+    int64 span_count_normal;
+    int64 span_count_returned;
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      Static::pageheap()->GetClassSizes(page_count_normal,
+                                        page_count_returned,
+                                        &span_count_normal,
+                                        &span_count_returned);
+    }
+
+    // spans: mapped
+    MallocExtension::FreeListInfo span_info;
+    span_info.type = kLargeSpanType;
+    span_info.max_object_size = (numeric_limits<size_t>::max)();
+    span_info.min_object_size = kMaxPages << kPageShift;
+    span_info.total_bytes_free = span_count_normal << kPageShift;
+    v->push_back(span_info);
+
+    // spans: unmapped
+    span_info.type = kLargeUnmappedSpanType;
+    span_info.total_bytes_free = span_count_returned << kPageShift;
+    v->push_back(span_info);
+
+    for (int s = 1; s < kMaxPages; s++) {
+      MallocExtension::FreeListInfo i;
+      i.max_object_size = (s << kPageShift);
+      i.min_object_size = ((s - 1) << kPageShift);
+
+      i.type = kPageHeapType;
+      i.total_bytes_free = (s << kPageShift) * page_count_normal[s];
+      v->push_back(i);
+
+      i.type = kPageHeapUnmappedType;
+      i.total_bytes_free = (s << kPageShift) * page_count_returned[s];
+      v->push_back(i);
+    }
+  }
 };
 
 // The constructor allocates an object to ensure that initialization
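With the warning in place, a client that forgets to enable sampling gets a self-describing profile instead of a silently empty one. Grabbing a sample is then a two-liner, since MallocExtensionWriter is a typedef for std::string; run the process with TCMALLOC_SAMPLE_PARAMETER=524288 in its environment to get real data (a minimal sketch):

    #include <stdio.h>
    #include <string>
    #include <google/malloc_extension.h>

    int main() {
      std::string profile;   // MallocExtensionWriter is std::string
      MallocExtension::instance()->GetHeapSample(&profile);
      // "profile" is in heap-profile format: save it and point pprof at it.
      fwrite(profile.data(), 1, profile.size(), stdout);
      return 0;
    }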
diff --git a/src/tests/debugallocation_test.cc b/src/tests/debugallocation_test.cc
index c482187..f10e2dc 100644
--- a/src/tests/debugallocation_test.cc
+++ b/src/tests/debugallocation_test.cc
@@ -259,7 +259,10 @@ TEST(DebugAllocationTest, GetAllocatedSizeTest) {
 }
 
 TEST(DebugAllocationTest, HugeAlloc) {
-  const size_t kTooBig = ~static_cast<size_t>(0);
+  // This must not be a const variable so it doesn't form an
+  // integral-constant-expression which can be *statically* rejected by the
+  // compiler as too large for the allocation.
+  size_t kTooBig = ~static_cast<size_t>(0);
   void* a = NULL;
   char* b = NULL;
@@ -273,8 +276,9 @@
   EXPECT_EQ(NULL, b);
 
   // kAlsoTooBig is small enough not to get caught by debugallocation's check,
-  // but will still fall through to tcmalloc's check.
-  const size_t kAlsoTooBig = kTooBig - 1024;
+  // but will still fall through to tcmalloc's check.  This must also be
+  // a non-const variable.  See kTooBig for more details.
+  size_t kAlsoTooBig = kTooBig - 1024;
 
   a = malloc(kAlsoTooBig);
   EXPECT_EQ(NULL, a);
diff --git a/src/tests/malloc_extension_test.cc b/src/tests/malloc_extension_test.cc
index ef76766..60f4919 100644
--- a/src/tests/malloc_extension_test.cc
+++ b/src/tests/malloc_extension_test.cc
@@ -39,6 +39,8 @@
 #include <google/malloc_extension.h>
 #include <google/malloc_extension_c.h>
+
+using STL_NAMESPACE::vector;
 
 int main(int argc, char** argv) {
   void* a = malloc(1000);
@@ -70,6 +72,30 @@
   ASSERT_LE(MallocExtension_GetAllocatedSize(a), 5000);
   ASSERT_GE(MallocExtension_GetEstimatedAllocatedSize(1000), 1000);
 
+  // test invariant: size of freelist = heap_size - allocated_bytes
+  free(malloc(32000));
+  size_t heap_size = 0;
+  size_t allocated = 0;
+  ASSERT_TRUE(MallocExtension::instance()->GetNumericProperty(
+      "generic.current_allocated_bytes", &allocated));
+  ASSERT_TRUE(MallocExtension::instance()->GetNumericProperty(
+      "generic.heap_size", &heap_size));
+  vector<MallocExtension::FreeListInfo> info;
+  MallocExtension::instance()->GetFreeListSizes(&info);
+
+  ASSERT_GE(info.size(), 0);
+  int64 free_bytes = 0;
+  for (vector<MallocExtension::FreeListInfo>::const_iterator it = info.begin();
+       it != info.end();
+       ++it) {
+    free_bytes += it->total_bytes_free;
+  }
+
+  // don't expect an exact equality since the calls to query the heap
+  // themselves free and allocate memory
+  size_t error = abs((heap_size - allocated) - free_bytes);
+  ASSERT_LT(error, 0.15 * heap_size);
+
   free(a);
   printf("DONE\n");
diff --git a/src/tests/sampling_test.cc b/src/tests/sampling_test.cc
index b75e70e..c1bd693 100644
--- a/src/tests/sampling_test.cc
+++ b/src/tests/sampling_test.cc
@@ -45,6 +45,8 @@
 
 using std::string;
 
+extern "C" void* AllocateAllocate() ATTRIBUTE_NOINLINE;
+
 extern "C" void* AllocateAllocate() {
   // The VLOG's are mostly to discourage inlining
   VLOG(1, "Allocating some more");
diff --git a/src/tests/sampling_test.sh b/src/tests/sampling_test.sh
index 8c96bc1..2a58426 100755
--- a/src/tests/sampling_test.sh
+++ b/src/tests/sampling_test.sh
@@ -81,13 +81,13 @@ mkdir "$OUTDIR" || die "Unable to create $OUTDIR"
 
 echo "Testing heap output..."
 "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap" \
-   | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
+   | grep '[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
   || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap"
 echo "OK"
 
 echo "Testing growth output..."
"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" \ - | grep '^ *[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \ + | grep '[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \ || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" echo "OK" diff --git a/src/tests/system-alloc_unittest.cc b/src/tests/system-alloc_unittest.cc index a160a34..da76285 100644 --- a/src/tests/system-alloc_unittest.cc +++ b/src/tests/system-alloc_unittest.cc @@ -38,7 +38,9 @@ #include // another place uintptr_t might be defined #endif #include +#include #include "base/logging.h" +#include "common.h" #include "system-alloc.h" class ArraySysAllocator : public SysAllocator { @@ -98,6 +100,18 @@ static void TestBasicInvoked() { CHECK(a.invoked_); } +#if 0 // could port this to various OSs, but won't bother for now +TEST(AddressBits, CpuVirtualBits) { + // Check that kAddressBits is as least as large as either the number of bits + // in a pointer or as the number of virtual bits handled by the processor. + // To be effective this test must be run on each processor model. + const int kPointerBits = 8 * sizeof(void*); + const int kImplementedVirtualBits = NumImplementedVirtualBits(); + + CHECK_GE(kAddressBits, min(kImplementedVirtualBits, kPointerBits)); +} +#endif + int main(int argc, char** argv) { TestBasicInvoked(); diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc index 522c0d9..c528846 100644 --- a/src/tests/tcmalloc_unittest.cc +++ b/src/tests/tcmalloc_unittest.cc @@ -100,17 +100,12 @@ # define cfree free // don't bother to try to test these obsolete fns # define valloc malloc # define pvalloc malloc -# ifdef PERFTOOLS_NO_ALIGNED_MALLOC -# define _aligned_malloc(size, alignment) malloc(size) -# else -# include // for _aligned_malloc -# endif -# define memalign(alignment, size) _aligned_malloc(size, alignment) -// Assume if we fail, it's because of out-of-memory. -// Note, this isn't a perfect analogue: we don't enforce constraints on "align" +// I'd like to map posix_memalign to _aligned_malloc, but _aligned_malloc +// must be paired with _aligned_free (not normal free), which is too +// invasive a change to how we allocate memory here. So just bail # include -# define posix_memalign(pptr, align, size) \ - ((*(pptr)=_aligned_malloc(size, align)) ? 0 : ENOMEM) +# define memalign(alignment, size) malloc(size) +# define posix_memalign(pptr, align, size) ((*(pptr)=malloc(size)) ? 0 : ENOMEM) #endif // On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old @@ -1033,6 +1028,14 @@ static int RunAllTests(int argc, char** argv) { free(p1); VerifyDeleteHookWasCalled(); + // Windows has _aligned_malloc. Let's test that that's captured too. +#if (defined(_MSC_VER) || defined(__MINGW32__)) && !defined(PERFTOOLS_NO_ALIGNED_MALLOC) + p1 = _aligned_malloc(sizeof(p1) * 2, 64); + VerifyNewHookWasCalled(); + _aligned_free(p1); + VerifyDeleteHookWasCalled(); +#endif + p1 = valloc(60); VerifyNewHookWasCalled(); free(p1); diff --git a/src/windows/config.h b/src/windows/config.h index 6d6f771..0b91031 100644 --- a/src/windows/config.h +++ b/src/windows/config.h @@ -92,7 +92,7 @@ #undef HAVE_LINUX_PTRACE_H /* Define to 1 if you have the header file. */ -#undef HAVE_MALLOC_H +#define HAVE_MALLOC_H 1 /* Define to 1 if you have the header file. 
 #undef HAVE_MEMORY_H
diff --git a/src/windows/patch_functions.cc b/src/windows/patch_functions.cc
index deb841b..fc57c82 100644
--- a/src/windows/patch_functions.cc
+++ b/src/windows/patch_functions.cc
@@ -175,7 +175,7 @@ class LibcInfo {
     kNew, kNewArray, kDelete, kDeleteArray,
     kNewNothrow, kNewArrayNothrow, kDeleteNothrow, kDeleteArrayNothrow,
     // These are windows-only functions from malloc.h
-    k_Msize, k_Expand, k_Aligned_malloc, k_Aligned_free,
+    k_Msize, k_Expand,
     kNumFunctions
   };
@@ -274,12 +274,12 @@ template <int T> class LibcInfoWithPatchFunctions : public LibcInfo {
                                      const std::nothrow_t&) __THROW;
   static size_t Perftools__msize(void *ptr) __THROW;
   static void* Perftools__expand(void *ptr, size_t size) __THROW;
-  static void* Perftools__aligned_malloc(size_t size, size_t alignment) __THROW;
-  static void Perftools__aligned_free(void *ptr) __THROW;
   // malloc.h also defines these functions:
+  //   _aligned_malloc, _aligned_free,
   //   _recalloc, _aligned_offset_malloc, _aligned_realloc, _aligned_recalloc
   //   _aligned_offset_realloc, _aligned_offset_recalloc, _malloca, _freea
   // But they seem pretty obscure, and I'm fine not overriding them for now.
+  // It may be they all call into malloc/free anyway.
 };
 
 // This is a subset of MODULEENTRY32 that we need for patching.
@@ -300,10 +300,19 @@ struct ModuleEntryCopy {
   ModuleEntryCopy(const MODULEINFO& mi) {
     this->modBaseAddr = mi.lpBaseOfDll;
     this->modBaseSize = mi.SizeOfImage;
-    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
-      rgProcAddresses[i] = (GenericFnPtr)::GetProcAddress(
+    LPVOID modEndAddr = (char*)mi.lpBaseOfDll + mi.SizeOfImage;
+    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++) {
+      FARPROC target = ::GetProcAddress(
           reinterpret_cast<const HMODULE>(mi.lpBaseOfDll),
           LibcInfo::function_name(i));
+      // Sometimes a DLL forwards a function to a function in another
+      // DLL.  We don't want to patch those forwarded functions --
+      // they'll get patched when the other DLL is processed.
+      if (target >= modBaseAddr && target < modEndAddr)
+        rgProcAddresses[i] = (GenericFnPtr)target;
+      else
+        rgProcAddresses[i] = (GenericFnPtr)NULL;
+    }
   }
 };
@@ -390,7 +399,7 @@ const char* const LibcInfo::function_name_[] = {
   NULL,  // kMangledNewArrayNothrow,
   NULL,  // kMangledDeleteNothrow,
   NULL,  // kMangledDeleteArrayNothrow,
-  "_msize", "_expand", "_aligned_malloc", "_aligned_free",
+  "_msize", "_expand",
 };
 
 // For mingw, I can't patch the new/delete here, because the
@@ -421,14 +430,6 @@ const GenericFnPtr LibcInfo::static_fn_[] = {
 #endif
   (GenericFnPtr)&::_msize,
   (GenericFnPtr)&::_expand,
-#ifdef PERFTOOLS_NO_ALIGNED_MALLOC   // for older versions of mingw
-  // _aligned_malloc isn't always available in mingw, so don't try to patch.
-  (GenericFnPtr)NULL,
-  (GenericFnPtr)NULL,
-#else
-  (GenericFnPtr)&::_aligned_malloc,
-  (GenericFnPtr)&::_aligned_free,
-#endif
 };
 
 template <int T> GenericFnPtr LibcInfoWithPatchFunctions<T>::origstub_fn_[] = {
@@ -451,8 +452,6 @@ const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = {
   (GenericFnPtr)&Perftools_deletearray_nothrow,
   (GenericFnPtr)&Perftools__msize,
   (GenericFnPtr)&Perftools__expand,
-  (GenericFnPtr)&Perftools__aligned_malloc,
-  (GenericFnPtr)&Perftools__aligned_free,
 };
 
 /*static*/ WindowsInfo::FunctionInfo WindowsInfo::function_info_[] = {
@@ -908,21 +907,6 @@ void* LibcInfoWithPatchFunctions<T>::Perftools__expand(void *ptr,
   return NULL;
 }
 
-template <int T>
-void* LibcInfoWithPatchFunctions<T>::Perftools__aligned_malloc(size_t size,
-                                                               size_t alignment)
-    __THROW {
-  void* result = do_memalign_or_cpp_memalign(alignment, size);
-  MallocHook::InvokeNewHook(result, size);
-  return result;
-}
-
-template <int T>
-void LibcInfoWithPatchFunctions<T>::Perftools__aligned_free(void *ptr) __THROW {
-  MallocHook::InvokeDeleteHook(ptr);
-  do_free_with_callback(ptr, (void (*)(void*))origstub_fn_[k_Aligned_free]);
-}
-
 LPVOID WINAPI WindowsInfo::Perftools_HeapAlloc(HANDLE hHeap, DWORD dwFlags,
                                                DWORD_PTR dwBytes) {
   LPVOID result = ((LPVOID (WINAPI *)(HANDLE, DWORD, DWORD_PTR))
diff --git a/src/windows/port.cc b/src/windows/port.cc
index d62fa9d..32f3c31 100644
--- a/src/windows/port.cc
+++ b/src/windows/port.cc
@@ -83,6 +83,18 @@ extern "C" PERFTOOLS_DLL_DECL void* __sbrk(ptrdiff_t increment) {
   return NULL;
 }
 
+// We need to write to 'stderr' without having windows allocate memory.
+// The safest way is via a low-level call like WriteConsoleA().  But
+// even then we need to be sure to print in small bursts so as to not
+// require memory allocation.
+extern "C" PERFTOOLS_DLL_DECL void WriteToStderr(const char* buf, int len) {
+  // Looks like windows allocates for writes of >80 bytes
+  for (int i = 0; i < len; i += 80) {
+    write(STDERR_FILENO, buf + i, std::min(80, len - i));
+  }
+}
+
+
 // -----------------------------------------------------------------------
 // Threads code
diff --git a/src/windows/port.h b/src/windows/port.h
index 66745d1..81a68e6 100644
--- a/src/windows/port.h
+++ b/src/windows/port.h
@@ -277,6 +277,8 @@ enum { STDIN_FILENO = 0, STDOUT_FILENO = 1, STDERR_FILENO = 2 };
 #define O_RDONLY  _O_RDONLY
 #endif
 
+extern "C" PERFTOOLS_DLL_DECL void WriteToStderr(const char* buf, int len);
+
 // ----------------------------------- SYSTEM/PROCESS
 typedef int pid_t;
 #define getpid  _getpid
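A standalone sketch of the chunked-write idea above, for a context where the heap must not be touched (say, inside a malloc hook); on Windows the STDERR_FILENO and write() equivalents come from the port header:

    #include <string.h>
    #include <unistd.h>   // write(); windows/port.h provides the equivalent

    static void ChunkedWriteToStderr(const char* buf, int len) {
      // Keep each write at or under 80 bytes so the runtime never buffers
      // (and therefore never allocates) on our behalf.
      for (int i = 0; i < len; i += 80) {
        int n = len - i;
        if (n > 80) n = 80;
        write(2 /* STDERR_FILENO */, buf + i, n);
      }
    }

    int main() {
      const char kMsg[] = "tcmalloc: reporting without touching the heap\n";
      ChunkedWriteToStderr(kMsg, (int)(sizeof(kMsg) - 1));
      return 0;
    }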