Fri Jan 27 14:04:27 2006 Google Inc. <opensource@google.com>

* google-perftools: version 0.6 release * More sophisticated stacktrace usage, possibly using libunwind (aruns) * Update pprof to handle 64-bit profiles (dehnert) * Fix GetStackTrace to correctly return top stackframe (sanjay) * Add ANSI compliance for new and new[], including new_handler (jkearney) * More accuracy by reading ELF files directly rather than objdump (mec) * Add readline support for pprof (addi) * Add #includes for PPC (csilvers) * New PC-detection routine for ibook powerpc (asbestoshead) * Vastly improved tcmalloc unittest (csilvers) * Move documentation from /usr/doc to /usr/share/doc git-svn-id: http://gperftools.googlecode.com/svn/trunk@19 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
author: csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50> 2007-03-22 04:46:29 +0000
committer: csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50> 2007-03-22 04:46:29 +0000
commit: 60a3a2ce77ed2713b2eedd20952d9cfc56ff7ccf (patch)
tree: 234b959ff2d934386ecfc08a7477f5c94290b709
parent: 298274f8d4f474d2b16a35c8babc58817088c59e (diff)
download: gperftools-60a3a2ce77ed2713b2eedd20952d9cfc56ff7ccf.tar.gz
20 files changed, 2843 insertions, 597 deletions
diff --git a/ChangeLog b/ChangeLog
index d52a139..dd3827d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -61,3 +61,17 @@ Mon Nov 14 17:28:59 2005  Google Inc. <opensource@google.com>
 	* Add va_start/va_end calls around vsnprintf() (csilvers)
 	* Write our own __syscall_return(), since it's not defined
 	  consistently on all 64-bit linux distros (markus)
+
+Fri Jan 27 14:04:27 2006  Google Inc. <opensource@google.com>
+
+	* google-perftools: version 0.6 release
+	* More sophisticated stacktrace usage, possibly using libunwind (aruns)
+	* Update pprof to handle 64-bit profiles (dehnert)
+	* Fix GetStackTrace to correctly return top stackframe (sanjay)
+	* Add ANSI compliance for new and new[], including new_handler (jkearney)
+	* More accuracy by reading ELF files directly rather than objdump (mec)
+	* Add readline support for pprof (addi)
+	* Add #includes for PPC (csilvers)
+	* New PC-detection routine for ibook powerpc (asbestoshead)
+	* Vastly improved tcmalloc unittest (csilvers)
+	* Move documentation from /usr/doc to /usr/share/doc
diff --git a/Makefile.am b/Makefile.am
index cab257c..8d39bbb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,7 +21,7 @@ perftoolsincludedir = $(googleincludedir)/perftools
 # We'll add to this later, on a library-by-library basis
 perftoolsinclude_HEADERS =
 
-docdir = $(prefix)/doc/$(PACKAGE)-$(VERSION)
+docdir = $(prefix)/share/doc/$(PACKAGE)-$(VERSION)
 # This is for HTML and other documentation you want to install.
 # Add your documentation files (in doc/) in addition to these
 # top-level boilerplate files.  Also add a TODO file if you have one.
@@ -53,7 +53,11 @@ dist_doc_DATA += doc/index.html
 ### The header files we use.  We divide into categories based on directory
 S_STACKTRACE_INCLUDES =
 SG_STACKTRACE_INCLUDES = src/google/stacktrace.h
-SGP_STACKTRACE_INCLUDES = src/config.h
+SGP_STACKTRACE_INCLUDES = src/config.h \
+			  src/stacktrace_generic-inl.h \
+			  src/stacktrace_libunwind-inl.h \
+			  src/stacktrace_x86_64-inl.h \
+			  src/stacktrace_x86-inl.h
 STACKTRACE_INCLUDES = $(S_STACKTRACE_INCLUDES) $(SG_STACKTRACE_INCLUDES) $(SGP_STACKTRACE_INCLUDES)
 googleinclude_HEADERS += $(SG_STACKTRACE_INCLUDES)
 perftoolsinclude_HEADERS += $(SGP_STACKTRACE_INCLUDES)
@@ -324,7 +328,7 @@ CPU_PROFILER_SYMBOLS = '(ProfilerStart|ProfilerStop|ProfilerEnable|ProfilerDisab
 libprofiler_la_LDFLAGS = -export-symbols-regex $(CPU_PROFILER_SYMBOLS)
 
 ### Unittests
-check_SCRIPTS += profiler_unittest_sh
+check_SCRIPTS += profiler_unittest_sh pprof_unittest
 noinst_SCRIPTS += src/tests/profiler_unittest.sh
 
 # These are sub-programs used by profiler_unittest.sh
@@ -352,6 +356,9 @@ profiler4_unittest_CXXFLAGS = -g $(PTHREAD_CFLAGS) -DWITH_THREADS
 profiler4_unittest_LDFLAGS = $(PTHREAD_CFLAGS)
 profiler4_unittest_LDADD = -lprofiler $(PTHREAD_LIBS)
 
+pprof_unittest: $(top_srcdir)/src/pprof
+	$< -test
+
 ### Documentation
 dist_man_MANS = doc/pprof.1
 dist_doc_DATA += doc/cpu_profiler.html \
diff --git a/configure b/configure
index 6db11b6..5b99e4d 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.57 for google-perftools 0.5.
+# Generated by GNU Autoconf 2.57 for google-perftools 0.6.
 #
 # Report bugs to <opensource@google.com>.
 #
@@ -422,8 +422,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='google-perftools'
 PACKAGE_TARNAME='google-perftools'
-PACKAGE_VERSION='0.5'
-PACKAGE_STRING='google-perftools 0.5'
+PACKAGE_VERSION='0.6'
+PACKAGE_STRING='google-perftools 0.6'
 PACKAGE_BUGREPORT='opensource@google.com'
 
 ac_unique_file="README"
@@ -953,7 +953,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures google-perftools 0.5 to adapt to many kinds of systems.
+\`configure' configures google-perftools 0.6 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1019,7 +1019,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of google-perftools 0.5:";;
+     short | recursive ) echo "Configuration of google-perftools 0.6:";;
    esac
   cat <<\_ACEOF
 
@@ -1125,7 +1125,7 @@ fi
 test -n "$ac_init_help" && exit 0
 if $ac_init_version; then
   cat <<\_ACEOF
-google-perftools configure 0.5
+google-perftools configure 0.6
 generated by GNU Autoconf 2.57
 
 Copyright 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, 2002
@@ -1140,7 +1140,7 @@ cat >&5 <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by google-perftools $as_me 0.5, which was
+It was created by google-perftools $as_me 0.6, which was
 generated by GNU Autoconf 2.57.  Invocation command line was
 
   $ $0 $@
@@ -1733,7 +1733,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE=google-perftools
- VERSION=0.5
+ VERSION=0.6
 
 
 cat >>confdefs.h <<_ACEOF
@@ -20799,6 +20799,99 @@ _ACEOF
 
 
 fi
+echo "$as_me:$LINENO: checking for struct sigcontext.regs->nip" >&5
+echo $ECHO_N "checking for struct sigcontext.regs->nip... $ECHO_C" >&6
+if test "${ac_cv_member_struct_sigcontext_regs__nip+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+#line $LINENO "configure"
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <signal.h>
+
+int
+main ()
+{
+static struct sigcontext ac_aggr;
+if (ac_aggr.regs->nip)
+return 0;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+         { ac_try='test -s conftest.$ac_objext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_member_struct_sigcontext_regs__nip=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+cat >conftest.$ac_ext <<_ACEOF
+#line $LINENO "configure"
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <signal.h>
+
+int
+main ()
+{
+static struct sigcontext ac_aggr;
+if (sizeof ac_aggr.regs->nip)
+return 0;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
+  (eval $ac_compile) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+         { ac_try='test -s conftest.$ac_objext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_member_struct_sigcontext_regs__nip=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ac_cv_member_struct_sigcontext_regs__nip=no
+fi
+rm -f conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f conftest.$ac_objext conftest.$ac_ext
+fi
+echo "$as_me:$LINENO: result: $ac_cv_member_struct_sigcontext_regs__nip" >&5
+echo "${ECHO_T}$ac_cv_member_struct_sigcontext_regs__nip" >&6
+if test $ac_cv_member_struct_sigcontext_regs__nip = yes; then
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_STRUCT_SIGCONTEXT_REGS__NIP 1
+_ACEOF
+
+
+fi
 
 
 # Defines PRIuS
@@ -22300,7 +22393,7 @@ _ASBOX
 } >&5
 cat >&5 <<_CSEOF
 
-This file was extended by google-perftools $as_me 0.5, which was
+This file was extended by google-perftools $as_me 0.6, which was
 generated by GNU Autoconf 2.57.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22363,7 +22456,7 @@ _ACEOF
 
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-google-perftools config.status 0.5
+google-perftools config.status 0.6
 configured by $0, generated by GNU Autoconf 2.57,
   with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/configure.ac b/configure.ac
index c908272..aa1826b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5,7 +5,7 @@
 # make sure we're interpreted by some minimal autoconf
 AC_PREREQ(2.57)
 
-AC_INIT(google-perftools, 0.5, opensource@google.com)
+AC_INIT(google-perftools, 0.6, opensource@google.com)
 # The argument here is just something that should be in the current directory
 # (for sanity checking)
 AC_CONFIG_SRCDIR(README)
@@ -43,7 +43,8 @@ AC_CHECK_MEMBERS([struct sigcontext.sc_eip,
                   struct sigcontext.eip,
                   struct sigcontext.rip,
                   struct sigcontext.sc_ip,
-                  struct siginfo.si_faddr],,,
+                  struct siginfo.si_faddr,
+		  struct sigcontext.regs->nip],,,
                  [#include <signal.h>])
 
 # Defines PRIuS
diff --git a/packages/rpm/rpm.spec b/packages/rpm/rpm.spec
index 24c0432..60e903a 100644
--- a/packages/rpm/rpm.spec
+++ b/packages/rpm/rpm.spec
@@ -15,7 +15,6 @@ Packager: Google <opensource@google.com>
 Source: http://goog-perftools.sourceforge.net/%{NAME}-%{PACKAGE_VERSION}.tar.gz
 Distribution: Redhat 7 and above.
 Buildroot: %{_tmppath}/%{name}-root
-Docdir: %prefix/doc
 Prefix: %prefix
 
 %description
diff --git a/src/base/thread_lister.c b/src/base/thread_lister.c
index 7eca594..6def758 100644
--- a/src/base/thread_lister.c
+++ b/src/base/thread_lister.c
@@ -31,6 +31,7 @@
  * Author: Markus Gutschke
  */
 
+#include <stdio.h>         // needed for NULL on some powerpc platforms (?!)
 #include "base/thread_lister.h"
 #include "base/linuxthreads.h"
 /* Include other thread listers here that define THREADS macro
diff --git a/src/config.h.in b/src/config.h.in
index 365c4f8..2abb5bd 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -78,6 +78,9 @@
 /* Define to 1 if `eip' is member of `struct sigcontext'. */
 #undef HAVE_STRUCT_SIGCONTEXT_EIP
 
+/* Define to 1 if `regs->nip' is member of `struct sigcontext'. */
+#undef HAVE_STRUCT_SIGCONTEXT_REGS__NIP
+
 /* Define to 1 if `rip' is member of `struct sigcontext'. */
 #undef HAVE_STRUCT_SIGCONTEXT_RIP
 
diff --git a/src/heap-checker.cc b/src/heap-checker.cc
index f081f97..b34eea2 100644
--- a/src/heap-checker.cc
+++ b/src/heap-checker.cc
@@ -45,13 +45,14 @@
 #include <google/perftools/hash_set.h>
 #include <algorithm>
 
+#include <fcntl.h>
+#include <string.h>
 #include <errno.h>
 #include <unistd.h>
-#include <string.h>
-#include <sys/stat.h>
+#include <sys/mman.h>
 #include <sys/poll.h>
+#include <sys/stat.h>
 #include <sys/types.h>
-#include <fcntl.h>
 #include <assert.h>
 
 #ifdef HAVE_LINUX_PTRACE_H
@@ -61,6 +62,8 @@
 #include <syscall.h>
 #endif
 
+#include <elf.h>
+
 #include <google/stacktrace.h>
 #include <google/heap-profiler.h>
 #include <google/heap-checker.h>
@@ -393,51 +396,219 @@ static int GetStatusOutput(const char*  command, string* output) {
   return pclose(f);
 }
 
-// A ProcMapsTask to record global data to ignore later
-// that belongs to 'library' mapped at 'start_address' with 'file_offset'.
-static void RecordGlobalDataLocked(const char* library,
-                                   uint64 start_address,
-                                   uint64 file_offset) {
-  HeapProfiler::MESSAGE(2, "HeapChecker: Looking into %s\n", library);
-  string command("/usr/bin/objdump -hw ");
-  command.append(library);
-  string output;
-  if (GetStatusOutput(command.c_str(), &output) != 0) {
-    HeapProfiler::MESSAGE(-1, "HeapChecker: "
-                          "Failed executing %s\n", command.c_str());
-    abort();
-  }
-  const char* output_start = output.c_str();
+// RAII class for a file descriptor.
+
+class FileDescriptor {
+  public:
+   FileDescriptor(int fd) : fd_(fd) { ; }
+   ~FileDescriptor() { if (fd_ >= 0) close(fd_); }
+   int Close() { int fd = fd_; fd_ = -1; return close(fd); }
+   operator int() { return fd_; }
+  private:
+   int fd_;
+};
+
+// This function takes the fields from a /proc/self/maps line:
+//
+//   start_address  start address of a memory region.
+//   end_address    end address of a memory region
+//   permissions    rwx + private/shared bit
+//   file_offset    file offset within the mapped file
+//   (major:minor)  major and minor device number of the mapped file
+//   inode          inode number of the mapped file
+//   filename       filename of the mapped file
+//
+// First, if the region is not writeable, then it cannot have any heap
+// pointers in it.
+//
+// It would be simple to just mark every writeable memory region as live.
+// However, that would pick up unused bottom pieces of thread stacks
+// and other rot.  So we need more complexity:
+//
+// Second, if the region is anonymous, we ignore it.  That skips the
+// main stack and the thread stacks which are picked up elsewhere.
+// Unfortunately that also skips the BSS portions of shared library
+// segments, and we need to pick those up.
+//
+// So, for each writable memory region that is mapped to a file,
+// we recover the original segment information from that file
+// (segment headers, not section headers).  We pick out the segment
+// that contains the given "file_offset", figure out where that
+// segment was loaded into memory, and register all of the memory
+// addresses for that segment.
+//
+// Picking out the right segment is a bit of a mess because the
+// same file offset can appear in multiple segments!  For example:
+//
+//   LOAD  0x000000 0x08048000 0x08048000 0x231a8 0x231a8 R E 0x1000
+//   LOAD  0x0231a8 0x0806c1a8 0x0806c1a8 0x01360 0x014e8 RW  0x1000
+//
+// File offset 0x23000 appears in both segments.  The second segment
+// stars at 0x23000 because of rounding.  Fortunately, we skip the
+// first segment because it is not writeable.  Most of the shared
+// objects we see have one read-executable segment and one read-write
+// segment so we really skate.
+//
+// If a shared library has no initialized data, only BSS, then the
+// size of the read-write LOAD segment will be zero, the dynamic loader
+// will create an anonymous memory region for the BSS but no named
+// segment [I think -- gotta check ld-linux.so -- mec].  We will
+// overlook that segment and never get here.  This is a bug.
+
+static bool RecordGlobalDataLocked(uint64 start_address,
+                                   uint64 end_address,
+                                   const char* permissions,
+                                   uint64 file_offset,
+                                   int64 inode,
+                                   const char* filename) {
+  // Ignore non-writeable regions.
+  if (strchr(permissions, 'w') == NULL)
+    return true;
+
+  // Ignore anonymous regions.
+  // This drops BSS regions which causes us much work later.
+  if (inode == 0)
+    return true;
+
+  // Grab some ELF types.
+#ifdef _LP64
+  typedef Elf64_Ehdr ElfFileHeader;
+  typedef Elf64_Phdr ElfProgramHeader;
+#else
+  typedef Elf32_Ehdr ElfFileHeader;
+  typedef Elf32_Phdr ElfProgramHeader;
+#endif
 
-  if (FLAGS_heap_profile_log >= 5) {
-    HeapProfiler::MESSAGE(5, "HeapChecker: Looking at objdump\n");
-    write(STDERR_FILENO, output.data(), output.size());
+  // Cannot mmap the ELF file because of the live ProcMapsIterator.
+  // Have to read little pieces.  Fortunately there are just two.
+  HeapProfiler::MESSAGE(2, "HeapChecker: Looking into %s\n", filename);
+  FileDescriptor fd_elf(open(filename, O_RDONLY));
+  if (fd_elf < 0)
+    return false;
+
+  // Read and validate the file header.
+  ElfFileHeader efh;
+  if (read(fd_elf, &efh, sizeof(efh)) != sizeof(efh))
+    return false;
+  if (memcmp(&efh.e_ident[0], ELFMAG, SELFMAG) != 0)
+    return false;
+  if (efh.e_version != EV_CURRENT)
+    return false;
+  if (efh.e_type != ET_EXEC && efh.e_type != ET_DYN)
+    return false;
+
+  // Read the segment headers.
+  if (efh.e_phentsize != sizeof(ElfProgramHeader))
+    return false;
+  if (lseek(fd_elf, efh.e_phoff, SEEK_SET) != efh.e_phoff)
+    return false;
+  const size_t phsize = efh.e_phnum * efh.e_phentsize;
+  ElfProgramHeader* eph = new ElfProgramHeader[efh.e_phnum];
+  if (read(fd_elf, eph, phsize) != phsize) {
+    delete[] eph;
+    return false;
   }
 
-  while (1) {
-    char sec_name[11];
-    uint64 sec_size, sec_vmaddr, sec_lmaddr, sec_offset;
-    if (sscanf(output_start, "%*d .%10s "LLX" "LLX" "LLX" "LLX" ",
-               sec_name, &sec_size, &sec_vmaddr,
-               &sec_lmaddr, &sec_offset) == 5) {
-      if (strcmp(sec_name, "data") == 0 ||
-          strcmp(sec_name, "bss") == 0) {
-        uint64 real_start = start_address + sec_offset - file_offset;
-        HeapProfiler::MESSAGE(4, "HeapChecker: "
-                              "Got section %s: %p of "LLX" bytes\n",
-                              sec_name,
-                              reinterpret_cast<void*>(real_start),
-                              sec_size);
-        (*library_live_objects)[library].
-          push_back(AllocObject(reinterpret_cast<void*>(real_start),
-                                sec_size, IN_GLOBAL_DATA));
+  // Gonna need this page size for page boundary considerations.
+  // Better be a power of 2.
+  const int int_page_size = getpagesize();
+  if (int_page_size <= 0 || int_page_size & (int_page_size-1))
+    abort();
+  const uint64 page_size = int_page_size;
+
+  // Walk the segment headers.
+  // Find the segment header that contains the given file offset.
+  bool found_load_segment = false;
+  for (int iph = 0; iph < efh.e_phnum; ++iph) {
+    HeapProfiler::MESSAGE(3, "HeapChecker: %s %d: p_type: %d p_flags: %x",
+                          filename, iph, eph[iph].p_type, eph[iph].p_flags);
+    if (eph[iph].p_type == PT_LOAD && eph[iph].p_flags & PF_W) {
+      // Sanity check the segment header.
+      if (eph[iph].p_vaddr != eph[iph].p_paddr) {
+        delete[] eph;
+        return false;
+      }
+      if ((eph[iph].p_vaddr  & (page_size-1)) !=
+          (eph[iph].p_offset & (page_size-1))) {
+        delete[] eph;
+        return false;
+      }
+
+      // The segment is not aligned in the ELF file, but will be
+      // aligned in memory.  So round it to page boundaries.
+      // Note: we lose if p_end is on the last page.
+      const uint64 p_start = eph[iph].p_offset &~ (page_size-1);
+      const uint64 p_end = ((eph[iph].p_offset + eph[iph].p_memsz)
+                         + (page_size-1)) &~ (page_size-1);
+      if (p_end < p_start) {
+        delete[] eph;
+        return false;
+      }
+      if (file_offset >= p_start && file_offset < p_end) {
+        // Found it.
+        if (found_load_segment) {
+          delete[] eph;
+          return false;
+        }
+        found_load_segment = true;
+
+        // [p_start, p_end) is the file segment, where p_end is extended
+        // for BSS (which does not actually come from the file).
+        //
+        // [start_address, end_address) is the memory region from
+        // /proc/self/maps.
+        //
+        // The point of correspondence is:
+        //   file_offset in filespace <-> start_address in memoryspace
+        //
+        // This point of correspondence is reliable because the kernel
+        // virtual memory system actually uses this information for
+        // demand-paging the file.
+        //
+        // A single file segment can get loaded into several contiguous
+        // memory regions with different permissions.  I have seen as
+        // many as four regions: no-permission alignment region +
+        // read-only-after-relocation region + read-write data region +
+        // read-write anonymous bss region.  So file_offset from the
+        // memory region may not equal to p_start from the file segement;
+        // file_offset can be anywhere in [p_start, p_end).
+
+        // Check that [start_address, end_address) is wholly contained
+        // in [p_start, p_end).
+        if (end_address < start_address ||
+            end_address - start_address > p_end - file_offset) {
+          delete[] eph;
+          return false;
+        }
+
+        // Calculate corresponding address and length for [p_start, p_end).
+        if (file_offset - p_start > start_address) {
+          delete[] eph;
+          return false;
+        }
+        void* addr = reinterpret_cast<void*>(start_address -
+                                             (file_offset - p_start));
+        const uintptr_t length = p_end - p_start;
+
+        // That is what we need.
+        (*library_live_objects)[filename].
+          push_back(AllocObject(addr, length, IN_GLOBAL_DATA));
       }
     }
-    // skip to the next line
-    const char* next = strpbrk(output_start, "\n\r");
-    if (next == NULL) break;
-    output_start = next + 1;
   }
+  delete[] eph;
+
+  if (!found_load_segment) {
+    HeapProfiler::MESSAGE(-1,
+      "HeapChecker: no LOAD segment found in %s\n",
+      filename);
+    return false;
+  }
+
+  if (fd_elf.Close() < 0)
+    return false;
+
+  return true;
 }
 
 // See if 'library' from /proc/self/maps has base name 'library_base'
@@ -522,26 +693,32 @@ HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) {
     HeapProfiler::MESSAGE(4, "HeapChecker: "
                           "Looking at /proc/self/maps line:\n  %s\n",
                           proc_map_line);
-    if (proc_maps_task == DISABLE_LIBRARY_ALLOCS  &&
-        strncmp(permissions, "r-xp", 4) == 0  &&  inode != 0) {
-      if (start_address >= end_address)  abort();
-      DisableLibraryAllocs(filename,
-                           reinterpret_cast<void*>(start_address),
-                           reinterpret_cast<void*>(end_address));
-    }
-    if (proc_maps_task == RECORD_GLOBAL_DATA_LOCKED  &&
-        // grhat based on Red Hat Linux 9
-        (strncmp(permissions, "rw-p", 4) == 0  ||
-         // Fedora Core 3
-         strncmp(permissions, "rwxp", 4) == 0)  &&
-        inode != 0) {
-      if (start_address >= end_address)  abort();
-      RecordGlobalDataLocked(proc_map_line + size, start_address, file_offset);
-    }
+
+    if (start_address >= end_address)
+      abort();
+
     // Determine if any shared libraries are present.
-    if (strstr(filename, "lib") && strstr(filename, ".so")) {
+    if (inode != 0 && strstr(filename, "lib") && strstr(filename, ".so")) {
       saw_shared_lib = true;
     }
+
+    if (proc_maps_task == DISABLE_LIBRARY_ALLOCS) {
+      if (inode != 0 && strncmp(permissions, "r-xp", 4) == 0) {
+        DisableLibraryAllocs(filename,
+                             reinterpret_cast<void*>(start_address),
+                             reinterpret_cast<void*>(end_address));
+      }
+    }
+
+    if (proc_maps_task == RECORD_GLOBAL_DATA_LOCKED) {
+      if (!RecordGlobalDataLocked(start_address, end_address, permissions,
+                                  file_offset, inode, filename)) {
+        HeapProfiler::MESSAGE(
+          -1, "HeapChecker: failed RECORD_GLOBAL_DATA_LOCKED on %s\n",
+          filename);
+        abort();
+      }
+    }
   }
   fclose(fp);
 
@@ -825,7 +1002,7 @@ void HeapLeakChecker::DisableChecksUp(int stack_frames) {
   if (!heap_checker_on) return;
   if (stack_frames < 1)  abort();
   void* stack[1];
-  if (GetStackTrace(stack, 1, stack_frames) != 1)  abort();
+  if (GetStackTrace(stack, 1, stack_frames+1) != 1)  abort();
   DisableChecksAt(stack[0]);
 }
 
@@ -840,7 +1017,7 @@ bool HeapLeakChecker::HaveDisabledChecksUp(int stack_frames) {
   if (!heap_checker_on) return false;
   if (stack_frames < 1)  abort();
   void* stack[1];
-  if (GetStackTrace(stack, 1, stack_frames) != 1)  abort();
+  if (GetStackTrace(stack, 1, stack_frames+1) != 1)  abort();
   return HaveDisabledChecksAt(stack[0]);
 }
 
@@ -865,14 +1042,14 @@ void HeapLeakChecker::DisableChecksIn(const char* pattern) {
 void* HeapLeakChecker::GetDisableChecksStart() {
   if (!heap_checker_on) return NULL;
   void* start_address;
-  if (GetStackTrace(&start_address, 1, 0) != 1)  abort();
+  if (GetStackTrace(&start_address, 1, 1) != 1)  abort();
   return start_address;
 }
 
 void HeapLeakChecker::DisableChecksToHereFrom(void* start_address) {
   if (!heap_checker_on) return;
   void* end_address;
-  if (GetStackTrace(&end_address, 1, 0) != 1)  abort();
+  if (GetStackTrace(&end_address, 1, 1) != 1)  abort();
   if (start_address > end_address)  swap(start_address, end_address);
   DisableChecksFromTo(start_address, end_address,
                       10000);  // practically no stack depth limit:
diff --git a/src/heap-profiler.cc b/src/heap-profiler.cc
index 45eb908..7cb7d56 100644
--- a/src/heap-profiler.cc
+++ b/src/heap-profiler.cc
@@ -869,10 +869,14 @@ void HeapProfiler::EarlyStartLocked() {
   // Our first allocation after registering our hook is treated specially by
   // RecordAlloc();  It looks at the stack and counts how many frames up we
   // are.  First we record the current stack pointer.
-  void* here[1];
-  GetStackTrace(here, 1, 0);
-  // This actually records the frame above this one.  We take this into account
-  // in RecordAlloc.
+  // Note: The stacktrace implementations differ about how many args they
+  // fill when skip is non-zero.  Safest just to reserve maxdepth space.
+  void* here[2];
+  GetStackTrace(here, 2, 1);
+  // Skip the first frame.  It points to the current offset within this
+  // function, which will have changed by the time we get to the malloc()
+  // call which triggers.  Instead, we store our parent function's offset,
+  // which is shared by both us and by the malloc() call below.
   recordalloc_reference_stack_position_ = here[0];
   done_first_alloc_ = false; // Initialization has not occured yet
   void* first_alloc = malloc(kFirstAllocationNumBytes);
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
index 1a42e6e..0260a34 100644
--- a/src/malloc_extension.cc
+++ b/src/malloc_extension.cc
@@ -249,6 +249,7 @@ void MallocExtension::GetHeapSample(string* result) {
 
   // TODO(menage) Get this working in google-perftools
   //DumpAddressMap(DebugStringWriter, result);
+  delete[] entries;
 }
 
 void MallocExtension::GetHeapGrowthStacks(std::string* result) {
diff --git a/src/pprof b/src/pprof
index 46366bc..e80063c 100755
--- a/src/pprof
+++ b/src/pprof
@@ -1,6 +1,6 @@
 #! /usr/bin/perl -w
 
-# Copyright (c) 2005, Google Inc.
+# Copyright (c) 1998-2006, Google Inc.
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without
@@ -68,21 +68,31 @@
 use strict;
 use Getopt::Long;
 
-# These are the external binaries we use.  We hard-code in the path
-# because some people have colorizing versions of these binaries which
-# can cause trouble.
-my $OBJDUMP = "/usr/bin/objdump";
-my $NM = "/usr/bin/nm";
-my $ADDR2LINE = "/usr/bin/addr2line";
+# These are the object tools we use, which come from various sources.
+# We want to invoke them directly, rather than via users' aliases and/or
+# search paths, because some people have colorizing versions of them that
+# can cause trouble, or because the default versions on a system may not
+# deal with 64-bit objects if 64-bit profiles are being analyzed.
+# (However, we will default to the user's defaults if we can't find better.)
+# See ConfigureObjTools near the bottom of this script.
+my %obj_tool_map = (
+  "objdump" => "objdump",
+  "nm" => "nm",
+  "addr2line" => "addr2line",
+);
 my $DOT = "dot";          # leave non-absolute, since it may be in /usr/local
 my $GV = "gv";
 
+# There is a pervasive dependency on the length (in hex characters, i.e.,
+# nibbles) of an address, distinguishing between 32-bit and 64-bit profiles:
+my $address_length = 8;   # Hope for 32-bit, reset if 64-bit detected.
+
 ##### Argument parsing #####
 
 sub usage_string {
   return <<'EOF';
-Usage: pprof [options] <program> <profile>
-   Prints specified cpu- or heap-profile 
+Usage: pprof [options] <program> <profile> ...
+   Prints specified cpu- or heap-profile
    
 Options:
    --cum               Sort by cumulative data
@@ -111,7 +121,7 @@ Heap-Profile Options:
    --alloc_space       Display allocated (mega)bytes
    --alloc_objects     Display allocated objects
    --show_bytes        Display space in bytes
-   --drop_negative     Ignore negaive differences
+   --drop_negative     Ignore negative differences
    
 Call-graph Options:
    --nodecount=<n>     Show at most so many nodes [default=80]
@@ -121,6 +131,12 @@ Call-graph Options:
    --ignore=<regexp>   Ignore nodes matching <regexp>
    --scale=<n>         Set GV scaling [default=0]
    
+Miscellaneous:
+   --tools=<prefix>    Prefix for object tool pathnames
+   --test              Run unit tests
+   --help              This message
+   --version           Version information
+
 Examples:
    
 pprof /bin/ls ls.prof
@@ -140,9 +156,9 @@ EOF
 
 sub version_string {
   return <<'EOF'
-pprof (part of google-perftools 0.1)
+pprof (part of google-perftools 0.6)
 
-Copyright 1998-2005 Google Inc.
+Copyright 1998-2006 Google Inc.
 
 This is BSD licensed software; see the source for copying conditions
 and license information.
@@ -151,14 +167,14 @@ PARTICULAR PURPOSE.
 EOF
 }
 
-sub fatal {
+sub usage {
   my $msg = shift;
   print STDERR "$msg\n\n";
   print STDERR usage_string();
   print STDERR "\nFATAL ERROR: $msg\n";    # just as a reminder
   exit(1);
 }
-  
+
 
 $main::opt_help = 0;
 $main::opt_version = 0;
@@ -194,6 +210,10 @@ $main::opt_show_bytes    = 0;
 $main::opt_drop_negative = 0;
 $main::opt_interactive   = 0;
 
+$main::opt_tools   = "";
+$main::opt_debug   = 0;
+$main::opt_test    = 0;
+
 # Are we printing a heap profile?
 $main::heap_profile = 0;
 
@@ -229,7 +249,10 @@ GetOptions("help!"          => \$main::opt_help,
 	   "alloc_objects!" => \$main::opt_alloc_objects,
 	   "show_bytes!"    => \$main::opt_show_bytes,
 	   "drop_negative!" => \$main::opt_drop_negative,
-	   ) || fatal("Invalid option(s)");
+           "tools=s"        => \$main::opt_tools,
+           "test!"          => \$main::opt_test,
+           "debug!"         => \$main::opt_debug,
+	   ) || usage("Invalid option(s)");
 
 # Deal with the standard --help and --version
 if ($main::opt_help) {
@@ -255,7 +278,7 @@ if ($main::opt_inuse_space +
     $main::opt_inuse_objects +
     $main::opt_alloc_space +
     $main::opt_alloc_objects > 1) {
-  fatal("Specify at most on of --inuse/--alloc options");
+  usage("Specify at most on of --inuse/--alloc options");
 }
 
 # Check output granularities
@@ -266,7 +289,7 @@ my $grains =
   $main::opt_files +
   0;
 if ($grains > 1) {
-  fatal("Only specify one output granularity option");
+  usage("Only specify one output granularity option");
 }
 if ($grains == 0) {
   $main::opt_functions = 1;
@@ -282,14 +305,30 @@ my $modes =
   $main::opt_gif +
   0;
 if ($modes > 1) {
-  fatal("Only specify one output mode");
+  usage("Only specify one output mode");
 }
 if ($modes == 0) {
   $main::opt_text = 1;
 }
 
-my $prog = shift || fatal("Did not specify program");
-my $pfile_arg = shift || fatal("Did not specify profile file");
+if ($main::opt_test) {
+  RunUnitTests();
+  # Should not return
+  exit(1);
+}
+
+# Binary name and profile arguments list
+$main::prog = "";
+@main::pfile_args = ();
+
+$main::prog = shift || usage("Did not specify program");
+scalar(@ARGV) || usage("Did not specify profile file");
+
+# Parse profile file/location arguments
+foreach my $farg (@ARGV) {
+  unshift(@main::pfile_args, $farg);
+}
+ConfigureObjTools($main::prog);
 
 ##### Main section #####
 
@@ -298,16 +337,21 @@ $main::tmpfile_sym = "/tmp/pprof$$.sym";
 $main::tmpfile_ps = "/tmp/pprof$$";
 $main::next_tmpfile = 0;
 $main::collected_profile = undef;
+@main::profile_files = ();
+#$main::op_time = time();
 $SIG{'INT'} = \&sighandler;
 
-# Read profile data
-my $pfile = FetchDynamicProfile($prog, $pfile_arg);
-my $data = ReadProfile($prog, $pfile);
+# Fetch all profile data
+FetchDynamicProfiles();
+
+# Read one profile, pick the last item on the list
+my $data = ReadProfile($main::prog, pop(@main::profile_files));
 my $profile = $data->{profile};
 my $libs = $data->{libs};       # Info about main program and shared libraries
 
 # List of function names to skip
 $main::skip = ();
+$main::skip_regexp = 'NOMATCH';
 if ($main::heap_profile) {
   foreach my $name ('calloc',
                     'cfree',
@@ -317,22 +361,33 @@ if ($main::heap_profile) {
                     'pvalloc',
                     'valloc',
                     'realloc',
+                    'do_malloc',
+		    'DoSampledAllocation',
                     '__builtin_delete',
                     '__builtin_new',
                     '__builtin_vec_delete',
                     '__builtin_vec_new') {
     $main::skip{$name} = 1;
   }
+  $main::skip_regexp = "TCMalloc";
 }
 if ($main::lock_profile) {
-  foreach my $name ('Mutex::Unlock') {
-    $main::skip{$name} = 1;
+  foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') {
+    $main::skip{$vname} = 1;
+  }
+}
+
+# Add additional profiles, if available.
+if (scalar(@main::profile_files) > 0) {
+  foreach my $pname (@main::profile_files) {
+    my $p = ReadProfile($main::prog, $pname)->{profile};
+    $profile = AddProfile($profile, $p);
   }
 }
 
 # Subtract base from profile, if specified
 if ($main::opt_base ne '') {
-  my $base = ReadProfile($prog, $main::opt_base)->{profile};
+  my $base = ReadProfile($main::prog, $main::opt_base)->{profile};
   $profile = SubtractProfile($profile, $base);
 }
 
@@ -369,13 +424,17 @@ if (!$main::opt_interactive) {
   } elsif ($main::opt_text) {
     PrintText($symbols, $flat, $cumulative, $total, -1);
   } else {
-    if (PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total)) {
+    if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
       if ($main::opt_gv) {
-	# Some versions of gv use -scale, and some use --scale.  *sigh*
-	# We use --help to determine if gv expects one dash or two.
-	system("$GV --help >/dev/null 2>&1 " .
-	       "&& $GV --scale=$main::opt_scale $main::tmpfile_ps " .
-	       "|| $GV -scale $main::opt_scale $main::tmpfile_ps")
+        if (!system("$GV --version >/dev/null 2>&1")) {
+	  # Options using double dash are supported by this gv version.
+	  system("$GV --scale=$main::opt_scale " .
+		 PsTempName($main::next_tmpfile));
+        } else {
+          # Old gv version - only supports options that use single dash.
+	  system("$GV -scale $main::opt_scale " .
+		 PsTempName($main::next_tmpfile));
+        }
       }
     } else {
       exit(1);
@@ -390,108 +449,139 @@ exit(0);
 
 
 ##### Interactive helper routines #####
+
 sub InteractiveMode {
   $| = 1;	# Make output unbuffered for interactive mode
   my $orig_profile = $profile;
-  while (1) {
-    print "(pprof) ";
-    $_ = <STDIN>;
-    if (!defined($_)) {
-      print "\n";
-      last;
-    }
-    if (m/^ *quit/) {
-      last;
+
+  # Use ReadLine if it's installed.
+  if ( defined(eval {require Term::ReadLine}) ) {
+    my $term = new Term::ReadLine 'pprof';
+    while ( defined ($_ = $term->readline('(pprof) '))) {
+      $term->addhistory($_) if /\S/;
+      if (!InteractiveCommand($orig_profile, $_)) {
+	last;    # exit when we get an interactive command to quit
+      }
     }
-    if (m/^ *help/) {
-      InteractiveHelpMessage();
-      next;
+  } else {       # don't have readline
+    while (1) {
+      print "(pprof) ";
+      $_ = <STDIN>;
+      if (!InteractiveCommand($orig_profile, $_)) {
+	last;    # exit when we get an interactive command to quit
+      }
     }
-    # Clear all the options
-    $main::opt_lines = 0;
-    $main::opt_text = 0;
-    $main::opt_disasm = 0;
-    $main::opt_list = 0;
-    $main::opt_gv = 0;
-    $main::opt_cum = 0;
+  }
+}
+
+# Takes two args: orig profile, and command to run.
+# Returns 1 if we should keep going, or 0 if we were asked to quit
+sub InteractiveCommand {
+  my($orig_profile, $command) = @_;
+  $_ = $command;                # just to make future m//'s easier
+  if (!defined($_)) {
+    print "\n";
+    return 0;
+  }
+  if (m/^ *quit/) {
+    return 0;
+  }
+  if (m/^ *help/) {
+    InteractiveHelpMessage();
+    return 1;
+  }
+  # Clear all the options
+  $main::opt_lines = 0;
+  $main::opt_text = 0;
+  $main::opt_disasm = 0;
+  $main::opt_list = 0;
+  $main::opt_gv = 0;
+  $main::opt_cum = 0;
 
-    if (m/^ *(text|top)(\d*) *(.*)/) {
-      $main::opt_text = 1;
+  if (m/^ *(text|top)(\d*) *(.*)/) {
+    $main::opt_text = 1;
 
-      my $line_limit = ($2 ne "") ? int($2) : 10;
+    my $line_limit = ($2 ne "") ? int($2) : 10;
 
-      my $routine;
-      my $ignore;
-      ($routine, $ignore) = ParseInteractiveArgs($3);
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($3);
 
-      my $profile = ProcessProfile($orig_profile, "", $ignore);
-      my $reduced = ReduceProfile($symbols, $profile);
+    my $profile = ProcessProfile($orig_profile, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
 
-      # Get derived profiles
-      my $flat = FlatProfile($reduced);
-      my $cumulative = CumulativeProfile($reduced);
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
 
-      PrintText($symbols, $flat, $cumulative, $total, $line_limit);
-      next;
-    }
-    if (m/^ *list *(.+)/) {
-      $main::opt_list = 1;
+    PrintText($symbols, $flat, $cumulative, $total, $line_limit);
+    return 1;
+  }
+  if (m/^ *list *(.+)/) {
+    $main::opt_list = 1;
 
-      my $routine;
-      my $ignore;
-      ($routine, $ignore) = ParseInteractiveArgs($1);
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($1);
 
-      my $profile = ProcessProfile($orig_profile, "", $ignore);
-      my $reduced = ReduceProfile($symbols, $profile);
+    my $profile = ProcessProfile($orig_profile, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
 
-      # Get derived profiles
-      my $flat = FlatProfile($reduced);
-      my $cumulative = CumulativeProfile($reduced);
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
 
-      PrintListing($libs, $flat, $cumulative, $routine);
-      next;
-    }
-    if (m/^ *disasm *(.+)/) {
-      $main::opt_disasm = 1;
+    PrintListing($libs, $flat, $cumulative, $routine);
+    return 1;
+  }
+  if (m/^ *disasm *(.+)/) {
+    $main::opt_disasm = 1;
 
-      my $routine;
-      my $ignore;
-      ($routine, $ignore) = ParseInteractiveArgs($1);
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($1);
 
-      # Process current profile to account for various settings
-      my $profile = ProcessProfile($orig_profile, "", $ignore);
-      my $reduced = ReduceProfile($symbols, $profile);
+    # Process current profile to account for various settings
+    my $profile = ProcessProfile($orig_profile, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
 
-      # Get derived profiles
-      my $flat = FlatProfile($reduced);
-      my $cumulative = CumulativeProfile($reduced);
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
 
-      PrintDisassembly($libs, $flat, $cumulative, $routine);
-      next;
-    }
-    if (m/^ *gv *(.*)/) {
-      $main::opt_gv = 1;
+    PrintDisassembly($libs, $flat, $cumulative, $routine);
+    return 1;
+  }
+  if (m/^ *gv *(.*)/) {
+    $main::opt_gv = 1;
 
-      my $focus;
-      my $ignore;
-      ($focus, $ignore) = ParseInteractiveArgs($1);
+    my $focus;
+    my $ignore;
+    ($focus, $ignore) = ParseInteractiveArgs($1);
 
-      # Process current profile to account for various settings
-      my $profile = ProcessProfile($orig_profile, $focus, $ignore);
-      my $reduced = ReduceProfile($symbols, $profile);
+    # Process current profile to account for various settings
+    my $profile = ProcessProfile($orig_profile, $focus, $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
 
-      # Get derived profiles
-      my $flat = FlatProfile($reduced);
-      my $cumulative = CumulativeProfile($reduced);
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
 
-      if (PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total)) {
-	system("gv -scale $main::opt_scale -noresize " . 
-	       PsTempName($main::next_tmpfile) . " &");
-	$main::next_tmpfile++;
+    if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+      if (!system("$GV --version >/dev/null 2>&1")) {
+        # Options using double dash are supported by this gv version.
+        system("$GV --scale=$main::opt_scale --noresize " .
+               PsTempName($main::next_tmpfile) . " &");
+      } else {
+        # Old gv version - only supports options that use single dash.
+        system("$GV -scale $main::opt_scale -noresize " .
+               PsTempName($main::next_tmpfile) . " &");
       }
-      next;
+      $main::next_tmpfile++;
     }
+    return 1;
   }
+  return 1;
 }
 
 
@@ -503,19 +593,23 @@ sub ProcessProfile {
   # Process current profile to account for various settings
   my $profile = $orig_profile;
   my $total_count = TotalProfile($profile);
-  print "Total: ", $total_count, " samples\n";
+  printf("Total: %s %s\n", Unparse($total_count), Units());
   if ($focus ne '') {
     $profile = FocusProfile($symbols, $profile, $focus);
     my $focus_count = TotalProfile($profile);
-    printf "After focusing on '%s': %d samples of %d (%0.1f%%)\n", 
-      $focus, $focus_count, $total_count, ($focus_count*100.0) / $total_count;
+    printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n",
+	   $focus,
+	   Unparse($focus_count), Units(),
+	   Unparse($total_count), ($focus_count*100.0) / $total_count);
   }
   if ($ignore ne '') {
     $profile = IgnoreProfile($symbols, $profile, $ignore);
     my $ignore_count = TotalProfile($profile);
-    printf "After ignoring '%s': %d samples of %d (%0.1f%%)\n", 
-      $ignore, $ignore_count, $total_count, 
-	($ignore_count*100.0) / $total_count;
+    printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n",
+	   $ignore,
+	   Unparse($ignore_count), Units(),
+	   Unparse($total_count),
+	   ($ignore_count*100.0) / $total_count);
   }
 
   return $profile;
@@ -531,12 +625,12 @@ Commands:
       Show graphical hierarchical display of current profile.  Without
       any arguments, shows all samples in the profile.  With the optional
       "focus" argument, restricts the samples shown to just those where
-      the "focus" regular expression matches a routine name on the stack 
+      the "focus" regular expression matches a routine name on the stack
       trace.
 
   list [routine_regexp] [-ignore1] [-ignore2]
       Show source listing of routines whose names match "routine_regexp"
-  
+
   top [--cum] [-ignore1] [-ignore2]
   top20 [--cum] [-ignore1] [-ignore2]
   top37 [--cum] [-ignore1] [-ignore2]
@@ -585,7 +679,7 @@ sub PsTempName {
   my $fnum = shift;
   return "$main::tmpfile_ps" . "." . "$fnum" . ".ps";
 }
-  
+
 # Print text output
 sub PrintText {
   my $symbols = shift;
@@ -636,24 +730,27 @@ sub PrintDisassembly {
 
   foreach my $lib (@{$libs}) {
     my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts);
-    my $offset = $lib->[1] - $lib->[3];
-    foreach my $routine (keys(%{$symbol_table})) {
+    my $offset = AddressSub($lib->[1], $lib->[3]);
+    foreach my $routine (sort ByName keys(%{$symbol_table})) {
       my $start_addr = $symbol_table->{$routine}->[0];
       my $end_addr = $symbol_table->{$routine}->[1];
       # See if there are any samples in this routine
       my $total_flat = 0;
       my $total_cum = 0;
-      for (my $addr = $start_addr; $addr < $end_addr; $addr++) {
-        $total_flat += GetEntry($flat, sprintf("0x%x", $addr+$offset));
-        $total_cum += GetEntry($cumulative, sprintf("0x%x", $addr+$offset));
+      my $length = hex(AddressSub($end_addr, $start_addr));
+      my $addr = AddressAdd($start_addr, $offset);
+      for (my $i = 0; $i < $length; $i++) {
+        $total_cum += GetEntry($cumulative, $addr);
+        $total_flat += GetEntry($flat, $addr);
+        $addr = AddressInc($addr);
       }
 
       # Skip disassembly if there are no samples in routine
       next if ($total_cum == 0);
 
       print "ROUTINE ====================== $routine\n";
-      printf "%6s %6s Total samples (flat / cumulative)\n",
-        Unparse($total_flat), Unparse($total_cum);
+      printf "%6s %6s Total %s (flat / cumulative)\n",
+        Unparse($total_flat), Unparse($total_cum), Units();
 
       my @instructions = Disassemble($lib->[0], $offset,
                                      $start_addr, $end_addr);
@@ -666,7 +763,8 @@ sub PrintDisassembly {
         }
         my $f = GetEntry($flat, $e->[0]);
         my $c = GetEntry($cumulative, $e->[0]);
-        my $address = $e->[0];  $address =~ s/^0x//;
+        my $address = $e->[0];
+        $address =~ s/^0x//;
         printf("%6s %6s %-20s %8s: %6s\n",
                UnparseAlt($f),
                UnparseAlt($c),
@@ -689,10 +787,11 @@ sub Disassemble {
   my $start_addr = shift;
   my $end_addr = shift;
 
-  my $cmd = sprintf("$OBJDUMP -d -l --no-show-raw-insn " .
-                    "--start-address=%d --stop-address=%d $prog",
-                    $start_addr, $end_addr);
-  open(OBJDUMP, "$cmd |") || error("$OBJDUMP: $!\n");
+  my $objdump = $obj_tool_map{"objdump"};
+  my $cmd = sprintf("$objdump -d -l --no-show-raw-insn " .
+                    "--start-address=0x$start_addr " .
+                    "--stop-address=0x$end_addr $prog");
+  open(OBJDUMP, "$cmd |") || error("$objdump: $!\n");
   my @result = ();
   my $filename = "";
   my $linenumber = -1;
@@ -704,8 +803,9 @@ sub Disassemble {
       $filename = $1;
       $linenumber = $2;
     } elsif (m/^ +([0-9a-f]+):\s*(.*)/) {
-      # Disassembly line
-      my $k = sprintf("0x%x", hex($1) + $offset);
+      # Disassembly line -- zero-extend address to full length
+      my $addr = HexExtend($1);
+      my $k = AddressAdd($addr, $offset);
       push(@result, [$k, $filename, $linenumber, $2]);
     }
   }
@@ -727,18 +827,21 @@ sub PrintListing {
 
   foreach my $lib (@{$libs}) {
     my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts);
-    my $offset = $lib->[1] - $lib->[3];
+    my $offset = AddressSub($lib->[1], $lib->[3]);
     foreach my $routine (sort ByName keys(%{$symbol_table})) {
       # Print if there are any samples in this routine
       my $start_addr = $symbol_table->{$routine}->[0];
       my $end_addr = $symbol_table->{$routine}->[1];
-      for (my $addr = $start_addr; $addr < $end_addr; $addr++) {
-        if (defined($cumulative->{sprintf("0x%x", $addr+$offset)})) {
+      my $length = hex(AddressSub($end_addr, $start_addr));
+      my $addr = AddressAdd($start_addr, $offset);
+      for (my $i = 0; $i < $length; $i++) {
+        if (defined($cumulative->{$addr})) {
           PrintSource($lib->[0], $offset,
                       $routine, $flat, $cumulative,
                       $start_addr, $end_addr);
           last;
         }
+        $addr = AddressInc($addr);
       }
     }
   }
@@ -794,7 +897,7 @@ sub PrintSource {
     }
   }
 
-  # Hack 3: Extend last line forward until its indentation is less than 
+  # Hack 3: Extend last line forward until its indentation is less than
   # the indentation we saw on $firstline
   my $oldlastline = $lastline;
   {
@@ -808,17 +911,17 @@ sub PrintSource {
       $l++;
       my $indent = Indentation($_);
       if ($l >= $firstline) {
-	if ($first_indentation < 0 && $indent >= 0) {
-	  $first_indentation = $indent;
-	  last if ($first_indentation == 0);
-	}
+        if ($first_indentation < 0 && $indent >= 0) {
+          $first_indentation = $indent;
+          last if ($first_indentation == 0);
+        }
       }
       if ($l >= $lastline && $indent >= 0) {
-	if ($indent >= $first_indentation) {
-	  $lastline = $l+1;
-	} else {
-	  last;
-	}
+        if ($indent >= $first_indentation) {
+          $lastline = $l+1;
+        } else {
+          last;
+        }
       }
     }
     close(FILE);
@@ -871,8 +974,8 @@ sub PrintSource {
   my $l = 0;
   while (<FILE>) {
     $l++;
-    if ($l >= $firstline - 5 && 
-	(($l <= $oldlastline + 5) || ($l <= $lastline))) {
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
       chop;
       my $text = $_;
       printf("%6s %6s %4d: %s\n",
@@ -1000,20 +1103,21 @@ sub PrintDot {
 
   # Get edges and counts per edge
   my %edge = ();
+  my $n;
   foreach my $k (keys(%{$raw})) {
     # TODO: omit low %age edges
-    my $n = $raw->{$k};
+    $n = $raw->{$k};
     my @addrs = split(/\n/, $k);
     for (my $i = 1; $i <= $#addrs; $i++) {
       my $src = OutputKey($symbols, $addrs[$i]);
       my $dst = OutputKey($symbols, $addrs[$i-1]);
       #next if ($src eq $dst);  # Avoid self-edges?
       if (exists($node{$src}) && exists($node{$dst})) {
-        my $e = "$src\001$dst";
-        if (!exists($edge{$e})) {
-          $edge{$e} = 0;
+        my $edge_label = "$src\001$dst";
+        if (!exists($edge{$edge_label})) {
+          $edge{$edge_label} = 0;
         }
-        $edge{$e} += $n;
+        $edge{$edge_label} += $n;
       }
     }
   }
@@ -1021,7 +1125,7 @@ sub PrintDot {
   # Print edges
   foreach my $e (keys(%edge)) {
     my @x = split(/\001/, $e);
-    my $n = $edge{$e};
+    $n = $edge{$e};
 
     if (abs($n) > $edgelimit) {
       # Compute line width based on edge count
@@ -1054,8 +1158,10 @@ sub OutputKey {
   my $a = shift;
 
   # Skip large addresses since they sometimes show up as fake entries on RH9
-  if (hex($a) > 0x7fffffff) {
-    return '';
+  if (length($a) > 8) {
+    if ($a gt "7fffffffffffffff") { return ''; }
+  } else {
+    if (hex($a) > 0x7fffffff) { return ''; }
   }
 
   # Extract symbolic info for address
@@ -1069,7 +1175,7 @@ sub OutputKey {
   }
 
   # We drop a few well-known names
-  if ($main::skip{$func}) {
+  if ($main::skip{$func} || ($func =~ m/$main::skip_regexp/)) {
     return '';
   }
 
@@ -1114,6 +1220,8 @@ sub Unparse {
         return sprintf("%.1f", $num / 1048576.0);
       }
     }
+  } elsif ($main::lock_profile) {
+    return sprintf("%.3f", $num / 1e9);	# Convert nanoseconds to seconds
   } else {
     return sprintf("%d", $num);
   }
@@ -1141,6 +1249,8 @@ sub Units {
         return "MB";
       }
     }
+  } elsif ($main::lock_profile) {
+    return "seconds";
   } else {
     return "samples";
   }
@@ -1264,6 +1374,25 @@ sub TotalProfile {
   return $result;
 }
 
+# Add A to B
+sub AddProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k};
+    AddEntry($R, $k, $v);
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    my $v = $B->{$k};
+    AddEntry($R, $k, $v);
+  }
+  return $R;
+}
+
 # Subtract B from A
 sub SubtractProfile {
   my $A = shift;
@@ -1278,7 +1407,7 @@ sub SubtractProfile {
     AddEntry($R, $k, $v);
   }
   if (!$main::opt_drop_negative) {
-    # take care of when substracted profile has more things
+    # Take care of when subtracted profile has more entries
     foreach my $k (keys(%{$B})) {
       if (!exists($A->{$k})) {
         AddEntry($R, $k, 0 - $B->{$k});
@@ -1310,16 +1439,90 @@ sub AddEntry {
   $profile->{$k} += $n;
 }
 
+# Add a stack of entries to specified profile, and add them to the $pcs
+# list.
+sub AddEntries {
+  my $profile = shift;
+  my $pcs = shift;
+  my $stack = shift;
+  my $count = shift;
+  my @k = ();
+
+  foreach my $e (split(/\s+/, $stack)) {
+    my $pc = HexExtend($e);
+    $pcs->{$pc} = 1;
+    push @k, $pc;
+  }
+  AddEntry($profile, (join "\n", @k), $count);
+}
+
 ##### Code to profile a server dynamically #####
 
 sub FetchDynamicProfile {
   my $binary_name = shift;
   my $profile_name = shift;
+  my $fetch_name_only = shift;
+  my $encourage_patience = shift;
 
   # TODO: Add support for fetching profiles dynamically from a server
   return $profile_name;
 }
 
+# Collect profiles in parallel
+sub FetchDynamicProfiles {
+  my $items = scalar(@main::pfile_args);
+  my $levels = log($items) / log(2);
+
+  if ($items == 1) {
+    $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1);
+  } else {
+    # math rounding issues
+    if ((2 ** $levels) < $items) {
+     $levels++;
+    }
+    my $count = scalar(@main::pfile_args);
+    for (my $i = 0; $i < $count; $i++) {
+      $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0);
+    }
+    print STDERR "Fetching $count profiles, Be patient...\n";
+    FetchDynamicProfilesRecurse($levels, 0, 0);
+    $main::collected_profile = join(" \\\n    ", @main::profile_files);
+  }
+}
+
+# Recursively fork a process to get enough processes
+# collecting profiles
+sub FetchDynamicProfilesRecurse {
+  my $maxlevel = shift;
+  my $level = shift;
+  my $position = shift;
+
+  if (my $pid = fork()) {
+    $position = 0 | ($position << 1);
+    TryCollectProfile($maxlevel, $level, $position);
+    wait;
+  } else {
+    $position = 1 | ($position << 1);
+    TryCollectProfile($maxlevel, $level, $position);
+    exit(0);
+  }
+}
+
+# Collect a single profile
+sub TryCollectProfile {
+  my $maxlevel = shift;
+  my $level = shift;
+  my $position = shift;
+
+  if ($level >= ($maxlevel - 1)) {
+    if ($position < scalar(@main::pfile_args)) {
+      FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0);
+    }
+  } else {
+    FetchDynamicProfilesRecurse($maxlevel, $level+1, $position);
+  }
+}
+
 ##### Parsing code #####
 
 # Parse profile generated by common/profiler.cc and return a reference
@@ -1328,7 +1531,7 @@ sub FetchDynamicProfile {
 #      $result->{period}      Sampling period (in microseconds)
 #      $result->{profile}     Profile object
 #      $result->{map}         Memory map info from profile
-#      $result->{pcs}         List of all PC values seen
+#      $result->{pcs}         Hash of all PC values seen, key is hex address
 sub ReadProfile {
   my $prog = shift;
   my $fname = shift;
@@ -1371,6 +1574,12 @@ sub ReadCPUProfile {
   my $nbytes = read(PROFILE, $str, (stat PROFILE)[7]);   # read entire file
   close(PROFILE);
 
+  my $version;
+  my $period;
+  my $i;
+  my $profile = {};
+  my $pcs = {};
+
   # Parse string into array of slots.
   # L! is needed for 64-bit # platforms, but not supported on 5.005
   # (despite the manpage claims)
@@ -1384,37 +1593,95 @@ sub ReadCPUProfile {
 
   my @slots = unpack($format, $str);
 
-  # Read header
-  if ($#slots < 1 || $slots[0] != 0 || $slots[1] < 3) {
+  # Read header.  The current header version is a 5-element structure
+  # containing:
+  #   0: header count (always 0)
+  #   1: header "words" (after this one: 3)
+  #   2: format version (0)
+  #   3: sampling period (usec)
+  #   4: unused padding (always 0)
+  # The header words are 32-bit or 64-bit depending on the ABI of the program
+  # that generated the profile.  In the 64-bit case, since our x86-architecture
+  # machines are little-endian, the actual value of each of these elements is
+  # in the first 32-bit word, and the second is always zero.  The @slots array
+  # above was read as a sequence of 32-bit words in both cases, so we need to
+  # explicitly check for both cases.  A typical slot sequence for each is:
+  #   32-bit:  0 3 0 100 0
+  #   64-bit:  0 0  3 0  0 0  100 0  0 0
+  #
+  if ($#slots < 4 || $slots[0] != 0 ) {
     error("$fname: not a profile file, or old format profile file\n");
   }
-  my $version = $slots[2];
-  my $period = $slots[3];
-  my $i = 2 + $slots[1];
+  if ($slots[1] >= 3) {
+    # Normal 32-bit header:
+    $version = $slots[2];
+    $period = $slots[3];
+    $i = 2 + $slots[1];
+    $address_length = 8;
+
+    # Parse profile
+    while ($i <= $#slots) {
+      my $n = $slots[$i++];
+      my $d = $slots[$i++];
+      if ($slots[$i] == 0) {
+        # End of profile data marker
+        $i += $d;
+        last;
+      }
 
-  # Parse profile
-  my $profile = {};
-  my $pcs = {};
-  while ($i <= $#slots) {
-    my $n = $slots[$i++];
-    my $d = $slots[$i++];
-    if ($slots[$i] == 0) {
-      # End of profile data marker
+      # Make key out of the stack entries
+      my @k = ();
+      for (my $j = 0; $j < $d; $j++) {
+        my $pc = sprintf("%08x", $slots[$i+$j]);
+        $pcs->{$pc} = 1;
+        push @k, $pc;
+      }
+
+      AddEntry($profile, (join "\n", @k), $n);
       $i += $d;
-      last;
     }
 
-    # Make key out of the stack entries
-    my $k = "";
-    for (my $j = 0; $j < $d; $j++) {
-      my $pc = $slots[$i+$j];
-      $pcs->{$pc} = 1;
-      $k .= sprintf("\n0x%x", $pc);
-    }
-    $k =~ s/^\n//;
+  # Normal 64-bit header:  All entries are doubled in size.  The first
+  # word (little-endian) should contain the real value, the second should
+  # be zero.
+  } elsif ($#slots < 9 || $slots[1] != 0 || $slots[2] < 3 || $slots[3] != 0
+        || $slots[5] != 0 || $slots[7] != 0) {
+      error("$fname: not a profile file, or old format profile file\n");
+  } else {
+    $version = $slots[4];
+    $period = $slots[6];
+    $i = 4 + 2 * $slots[2];
+    $address_length = 16;
+
+    # Parse profile
+    while ($i <= $#slots) {
+      my $n = $slots[$i++];
+      my $nhi = $slots[$i++];
+      # Huge counts may coerce to floating point, keeping scale, not precision
+      if ($nhi != 0) { $n += $nhi*(2**32); }
+      my $d = $slots[$i++];
+      if ($slots[$i++] != 0) {
+        my $addr = sprintf("%o", 4 * $i);
+        print STDERR "At index $i ($addr):\n";
+        error("$fname: stack trace depth >= 2**32\n");
+      }
+      if ($slots[$i] == 0 && $slots[$i+1] == 0) {
+        # End of profile data marker
+        $i += 2 * $d;
+        last;
+      }
 
-    AddEntry($profile, $k, $n);
-    $i += $d;
+      # Make key out of the stack entries
+      my @k = ();
+      for (my $j = $d; $j--; ) {
+        my $pclo = $slots[$i++];
+        my $pchi = $slots[$i++];
+        my $pc = sprintf("%08x%08x", $pchi, $pclo);
+        $pcs->{$pc} = 1;
+        push @k, $pc;
+      }
+      AddEntry($profile, (join "\n", @k), $n);
+    }
   }
 
   # Parse map
@@ -1458,6 +1725,7 @@ sub ReadHeapProfile {
   my $profile = {};
   my $pcs = {};
   my $map = "";
+
   while (<PROFILE>) {
     if (/^MAPPED_LIBRARIES:/) {
       # Read the /proc/self/maps data
@@ -1493,15 +1761,7 @@ sub ReadHeapProfile {
       my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
 
       my @counts = ($n1, $s1, $n2, $s2);
-      my $n = $counts[$index];
-      my $k = "";
-      foreach my $e (split(/\s+/, $stack)) {
-        my $pc = hex($e);
-        $pcs->{$pc} = 1;
-        $k .= sprintf("\n0x%x", $pc);
-      }
-      $k =~ s/^\n//;
-      AddEntry($profile, $k, $n);
+      AddEntries($profile, $pcs, $stack, $counts[$index]);
     }
   }
 
@@ -1516,28 +1776,33 @@ sub ReadHeapProfile {
 
 sub ReadSynchProfile {
   my ($prog, $fname, $header) = @_;
-  my ($line, $map, $pc, @k, $count, $stack);
 
-  $map = '';
+  my $map = '';
   my $profile = {};
   my $pcs = {};
   my $sampling_period = 1;
+  my $cyclespernanosec = 2.8;	# Default assumption for old binaries
+  my $seen_clockrate = 0;
+  my $line;
 
   while ( $line = <PROFILE> ) {
     if ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
          $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
-      # Sample entry
-      ($count, $stack) = ($1, $2);
-      $count = 1 if $count !~ /^\d+$/;
-
-      @k = ();
-      foreach $pc (split /\s+/, $stack) {
-        $pcs->{hex($pc)} = 1;
-        push @k, $pc;
+      my ($count, $stack) = ($1, $2);
+      if ($count !~ /^\d+$/) {
+        next;
       }
-      AddEntry($profile, (join "\n", @k), $count);
+
+      # Convert cycles to nanoseconds
+      $count /= $cyclespernanosec;
+      AddEntries($profile, $pcs, $stack, $count);
+
+    } elsif ( $line =~ m|cycles/second = (\d+)|) {
+      $cyclespernanosec = $1 / 1e9;
+      $seen_clockrate = 1;
     } elsif ( $line =~ /sampling period = (\d+)/ ) {
       $sampling_period = $1;
+
     } else {
       # Memory map entry
       $map .= $line;
@@ -1545,15 +1810,30 @@ sub ReadSynchProfile {
   }
   close PROFILE;
 
+  if (!$seen_clockrate) {
+    printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n",
+                   $cyclespernanosec);
+  }
+
   my $r = {};
-  $r->{'version'} = 0;
-  $r->{'period'} = $sampling_period;
-  $r->{'profile'} = $profile;
-  $r->{'libs'} = ParseLibraries($prog, $map, $pcs);
-  $r->{'pcs'} = $pcs;
+  $r->{version} = 0;
+  $r->{period} = $sampling_period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
   return $r;
 }
 
+# Given a hex value in the form "0x1abcd" return "0001abcd" or
+# "000000000001abcd", depending on the current address length.
+# There's probably a more idiomatic (or faster) way to do this...
+sub HexExtend {
+  my $addr = shift;
+
+  $addr =~ s/^0x//;
+  return substr("000000000000000".$addr, -$address_length);
+}
+
 ##### Symbol extraction #####
 
 # Split /proc/pid/maps dump into a list of libraries
@@ -1564,7 +1844,14 @@ sub ParseLibraries {
 
   my $result = [];
   my $h = "[a-f0-9]+";
+  my $zero_offset = HexExtend("0");
+
+  my $buildvar = "";
   foreach my $l (split("\n", $map)) {
+    if ($l =~ m/^\s*build=(.*)$/) {
+      $buildvar = $1;
+    }
+
     my $start;
     my $finish;
     my $offset;
@@ -1572,32 +1859,40 @@ sub ParseLibraries {
     if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.so(\.\d+)*\w*)/) {
       # Full line from /proc/self/maps.  Example:
       #   40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so
-      $start = hex($1);
-      $finish = hex($2);
-      $offset = hex($3);
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = HexExtend($3);
       $lib = $4;
     } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
       # Cooked line from DumpAddressMap.  Example:
       #   40000000-40015000: /lib/ld-2.3.2.so
-      $start = hex($1);
-      $finish = hex($2);
-      $offset = 0;
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = $zero_offset;
       $lib = $3;
     } else {
       next;
     }
 
+    # Expand "$build" variable if available
+    $lib =~ s/\$build\b/$buildvar/g;
+
     # Get objdump output from the library file to figure out how to
     # map between mapped addresses and addresses in the library.
-    open(OBJDUMP, "$OBJDUMP -h $lib |") || error("$OBJDUMP $lib: $!\n");
+    my $objdump = $obj_tool_map{"objdump"};
+    open(OBJDUMP, "$objdump -h $lib |")
+                  || error("$objdump $lib: $!\n");
     while (<OBJDUMP>) {
       # Idx Name          Size      VMA       LMA       File off  Algn
       #  10 .text         00104b2c  420156f0  420156f0  000156f0  2**4
+      # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file
+      # offset may still be 8.  But AddressSub below will still handle that.
       my @x = split;
       if (($#x >= 6) && ($x[1] eq '.text')) {
-        my $vma = hex($x[3]);
-        my $file_offset = hex($x[5]);
-        $offset += $vma - $file_offset;
+        my $vma = $x[3];
+        my $file_offset = $x[5];
+        my $vma_offset = AddressSub($vma, $file_offset);
+        $offset = AddressAdd($offset, $vma_offset);
         last;
       }
     }
@@ -1607,15 +1902,173 @@ sub ParseLibraries {
   }
 
   # Append special entry for the main program
-  my $max_pc = 0;
+  my $max_pc = "0";
   foreach my $pc (keys(%{$pcs})) {
-    if ($pc > $max_pc) { $max_pc = $pc; }
+    if ($pc gt $max_pc) { $max_pc = $pc; }
   }
-  push(@{$result}, [$prog, 0, $max_pc, 0]);
+  push(@{$result}, [$prog, $zero_offset, $max_pc, $zero_offset]);
 
   return $result;
 }
 
+# Add two hex addresses of length $address_length.
+# Run pprof --test for unit test if this is changed.
+sub AddressAdd {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
+
+    if ($main::opt_debug and $main::opt_test) {
+      print STDERR "AddressAdd $addr1 + $addr2 = ";
+    }
+
+    my $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2);
+    my $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    my $r = sprintf("%07x", $sum);
+
+    $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2) + $c;
+    $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    $r = sprintf("%07x", $sum) . $r;
+
+    $sum = hex($addr1) + hex($addr2) + $c;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
+
+
+# Subtract two hex addresses of length $address_length.
+# Run pprof --test for unit test if this is changed.
+sub AddressSub {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $diff;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $diff);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize borrow handling.
+    # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; }
+
+    my $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = hex(substr($addr2,-7));
+    $addr2 = substr($addr2,0,-7);
+    my $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    my $r = sprintf("%07x", $diff);
+
+    $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    $a2 = hex(substr($addr2,-7)) + $b;
+    $addr2 = substr($addr2,0,-7);
+    $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    $r = sprintf("%07x", $diff) . $r;
+
+    $a1 = hex($addr1);
+    $a2 = hex($addr2) + $b;
+    if ($a2 > $a1) { $a1 += 0x100; }
+    $diff = $a1 - $a2;
+    $r = sprintf("%02x", $diff) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
+
+# Increment a hex addresses of length $address_length.
+# Run pprof --test for unit test if this is changed.
+sub AddressInc {
+  my $addr = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr)+1) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
+    # We are always doing this to step through the addresses in a function,
+    # and will almost never overflow the first chunk, so we check for this
+    # case and exit early.
+
+    # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; }
+
+    my $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    my $r = sprintf("%07x", $sum);
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "0000000";
+    }
+
+    $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    $r = sprintf("%07x", $sum) . $r;
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "00000000000000";
+    }
+
+    $sum = hex($addr) + 1;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+    return $r;
+  }
+}
+
 # Extract symbols for all PC values found in profile
 sub ExtractSymbols {
   my $libs = shift;
@@ -1635,13 +2088,13 @@ sub ExtractSymbols {
     # Get list of pcs that belong in this library.
     my $contained = [];
     foreach my $pc (keys(%{$pcset})) {
-      if (!$seen{$pc} && ($pc >= $start) && ($pc <= $finish)) {
+      if (!$seen{$pc} && ($pc ge $start) && ($pc le $finish)) {
         $seen{$pc} = 1;
         push(@{$contained}, $pc);
       }
     }
     # Map to symbols
-    MapToSymbols($libname, $start - $offset, $contained, $symbols);
+    MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
   }
 
   return $symbols;
@@ -1675,13 +2128,15 @@ sub GetLineNumbers {
   # Make file with all PC values
   open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n");
   for (my $i = 0; $i <= $#{$pclist}; $i++) {
-    printf ADDRESSES ("0x%x\n", $pclist->[$i] - $offset);
+    # addr2line always reads hex addresses, and does not need '0x' prefix.
+    printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
   }
   close(ADDRESSES);
 
   # Pass to addr2line
-  open(SYMBOLS, "$ADDR2LINE -f -C -e $image <$main::tmpfile_sym |")
-    || error("$ADDR2LINE: $!\n");
+  my $addr2line = $obj_tool_map{"addr2line"};
+  open(SYMBOLS, "$addr2line -f -C -e $image <$main::tmpfile_sym |")
+    || error("$addr2line: $!\n");
   my $count = 0;
   while (<SYMBOLS>) {
     chop;
@@ -1694,7 +2149,7 @@ sub GetLineNumbers {
       $filelinenum =~ s|^.*/([^/]+:\d+)$|$1|;    # Remove directory name
     }
 
-    my $pcstr = sprintf("0x%x", $pclist->[$count]);
+    my $pcstr = $pclist->[$count];
     if (defined($symbols->{$pcstr})) {
       # Override just the line-number portion.  The function name portion
       # is less buggy when computed using nm instead of addr2line.
@@ -1708,7 +2163,7 @@ sub GetLineNumbers {
   close(SYMBOLS);
 }
 
-# Alternate implementation
+# Use nm to map the list of referenced PCs to symbols
 sub MapSymbolsWithNM {
   my $image = shift;
   my $offset = shift;
@@ -1717,14 +2172,15 @@ sub MapSymbolsWithNM {
 
   # Get nm output sorted by increasing address
   my $symbol_table = GetProcedureBoundaries($image, ".");
-  my @names = sort { $symbol_table->{$a}->[0] <=> $symbol_table->{$b}->[0] }
+  # Start addresses are already the right length (8 or 16 hex digits).
+  my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] }
     keys(%{$symbol_table});
 
   if ($#names < 0) {
-    # No symbols: just use address
+    # No symbols: just use addresses
     foreach my $pc (@{$pclist}) {
-      my $pcstr = sprintf("0x%x", $pc);
-      $symbols->{$pcstr} = [$pcstr, "?", $pcstr];
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
     }
     return;
   }
@@ -1733,22 +2189,21 @@ sub MapSymbolsWithNM {
   my $index = 0;
   my $fullname = $names[0];
   my $name = ShortFunctionName($fullname);
-  foreach my $pc (sort { $a <=> $b } @{$pclist}) {
+  foreach my $pc (sort { $a cmp $b } @{$pclist}) {
     # Adjust for mapped offset
-    my $mpc = $pc - $offset;
-    while (($index < $#names) && ($mpc >= $symbol_table->{$fullname}->[1])){
+    my $mpc = AddressSub($pc, $offset);
+    while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){
       $index++;
       $fullname = $names[$index];
       $name = ShortFunctionName($fullname);
     }
-    my $pcstr = sprintf("0x%x", $pc);
-    $symbols->{$pcstr} = [$name, "?", $fullname];
+    $symbols->{$pc} = [$name, "?", $fullname];
   }
 }
 
 sub ShortFunctionName {
   my $function = shift;
-  while ($function =~ s/\([^()]*\)//g) { }   # Remove argument types
+  while ($function =~ s/\([^()]*\)(\s*const)?//g) { }   # Argument types
   while ($function =~ s/<[^<>]*>//g)  { }    # Remove template arguments
   $function =~ s/^.*\s+(\w+::)/$1/;          # Remove leading type
   return $function;
@@ -1756,6 +2211,70 @@ sub ShortFunctionName {
 
 ##### Miscellaneous #####
 
+# Find the right versions of the above object tools to use.  The argument
+# is the program file being analyzed, and should be an ELF 32-bit or ELF
+# 64-bit executable file.  If it is, we select the default tools location
+# based on that; otherwise, we'll emit a warning and take the user's
+# defaults.
+sub ConfigureObjTools {
+  my $prog_file = shift;
+
+  # Figure out the right default pathname prefix based on the program file
+  # type:
+  my $default_prefix = "/usr/bin/";
+  my $file_type = `/usr/bin/file $prog_file`;
+  if ($file_type =~ /ELF 32-bit/) {
+    $default_prefix = "/usr/bin/";
+  } elsif ($file_type =~ /ELF 64-bit/) {
+    # Change $address_length to 16 if the program file is ELF 64-bit.
+    # We can't detect this from many (most?) heap or lock contention
+    # profiles, since the actual addresses referenced are generally in low
+    # memory even for 64-bit programs.
+    $address_length = 16;
+  } else {
+    print STDERR "WARNING: program $prog_file is apparently not an ELF file\n";
+    # Don't change the default prefix.
+  }
+
+  # Go fill in %obj_tool_map with the pathnames to use:
+  foreach my $tool (keys %obj_tool_map) {
+    $obj_tool_map{$tool} = ConfigureTool($tool, $default_prefix);
+  }
+}
+
+sub ConfigureTool {
+  my $tool = shift;
+  my $prefix = shift;
+  my $path;
+
+  if ($main::opt_debug) {
+    print STDERR "Configuring '$tool' with default prefix '$prefix'\n";
+  }
+
+  # Try a specific prefix specified by the user:
+  if ($main::opt_tools ne "") {
+    $path = $main::opt_tools . $tool;
+    if ($main::opt_debug) { print STDERR "  (a) Trying '$path'\n"; }
+    if (-x $path) { return $path; }
+  }
+
+  # Try the default prefix passed to us:
+  $path = $prefix . $tool;
+  if ($main::opt_debug) { print STDERR "  (b) Trying '$path'\n"; }
+  if (-x $path) { return $path; }
+
+  # Try the normal system default (/usr/bin/):
+  if ($prefix ne "/usr/bin") {
+    $path = "/usr/bin/$tool";
+    if ($main::opt_debug) { print STDERR "  (c) Trying '$path'\n"; }
+    if (-x $path) { return $path; }
+  }
+
+  # If all else fails, hope the bare toolname works:
+  if ($main::opt_debug) { print STDERR "  Returning '$tool'\n"; }
+  return $tool;
+}
+
 sub cleanup {
   unlink($main::tmpfile_sym);
   for (my $i = 0; $i < $main::next_tmpfile; $i++) {
@@ -1763,12 +2282,15 @@ sub cleanup {
   }
   # We leave any collected profiles in $HOME/pprof in case the user wants
   # to look at them later.  We print a message informing them of this.
-  if (defined($main::collected_profile)) {
-    print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
+  if ((scalar(@main::profile_files) > 0) &&
+      defined($main::collected_profile)) {
+    if (scalar(@main::profile_files) == 1) {
+      print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
+    }
     print STDERR "If you want to investigate this profile further, you can do:\n";
     print STDERR "\n";
     print STDERR "  pprof \\\n";
-    print STDERR "    $prog \\\n";
+    print STDERR "    $main::prog \\\n";
     print STDERR "    $main::collected_profile\n";
     print STDERR "\n";
   }
@@ -1786,16 +2308,6 @@ sub error {
   exit(1);
 }
 
-# Return a list of all routines that match $regexp.
-# For each routine, the following list is returned:
-#       $result->[i]->[0]       Routine name
-#       $result->[i]->[1]       Start address
-#       $result->[i]->[2]       Finish address
-#       $result->[i]->[3]       Image file name (program or shared library)
-#       $result->[i]->[4]       Offset for image in address space
-sub GetMatchingRoutines {
-}
-
 
 # Gets the procedure boundaries for all routines in "$image" whose names
 # match "$regexp" and returns them in a hashtable mapping from procedure
@@ -1805,15 +2317,16 @@ sub GetProcedureBoundaries {
   my $regexp = shift;
 
   my $symbol_table = {};
-  open(NM, "$NM -C -n $image |") || error("$NM: $!\n");
-  my $last_start = "0x0";
+  my $nm = $obj_tool_map{"nm"};
+  open(NM, "$nm -C -n $image |") || error("$nm: $!\n");
+  my $last_start = "0";
   my $routine = "";
   while (<NM>) {
     if (m/^([0-9a-f]+) . (..*)/) {
       my $start_val = $1;
       my $this_routine = $2;
       if (defined($routine) && $routine =~ m/$regexp/) {
-        $symbol_table->{$routine} = [hex($last_start), hex($start_val)];
+        $symbol_table->{$routine} = [$last_start, $start_val];
       }
       $last_start = $start_val;
       $routine = $this_routine;
@@ -1823,3 +2336,218 @@ sub GetProcedureBoundaries {
 
   return $symbol_table;
 }
+
+
+# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings.
+# To make them more readable, we add underscores at interesting places.
+# This routine removes the underscores, producing the canonical representation
+# used by pprof to represent addresses, particularly in the tested routines.
+sub CanonicalHex {
+  my $arg = shift;
+  return join '', (split '_',$arg);
+}
+
+
+# Unit test for AddressAdd:
+sub AddressAddUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so a quick sanity check along with verifying what
+  # happens to overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressAdd ($row->[0], $row->[1]);
+    if ($sum ne $row->[2]) {
+      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[2];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
+    my $expected = join '', (split '_',$row->[2]);
+    if ($sum ne CanonicalHex($row->[2])) {
+      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[2];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Unit test for AddressSub:
+sub AddressSubUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so a quick sanity check along with verifying what
+  # happens to overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressSub ($row->[0], $row->[1]);
+    if ($sum ne $row->[3]) {
+      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[3];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
+    if ($sum ne CanonicalHex($row->[3])) {
+      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[3];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Unit test for AddressInc:
+sub AddressIncUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so a quick sanity check along with verifying what
+  # happens to overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressInc ($row->[0]);
+    if ($sum ne $row->[4]) {
+      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
+             $row->[0], $row->[4];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressInc (CanonicalHex($row->[0]));
+    if ($sum ne CanonicalHex($row->[4])) {
+      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
+             $row->[0], $row->[4];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Driver for unit tests.
+# Currently just the address add/subtract/increment routines for 64-bit.
+sub RunUnitTests {
+  my $error_count = 0;
+
+  # This is a list of tuples [a, b, a+b, a-b, a+1]
+  my $unit_test_data_8 = [
+    [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)],
+    [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)],
+    [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)],
+    [qw(00000001 ffffffff 00000000 00000002 00000002)],
+    [qw(00000001 fffffff0 fffffff1 00000011 00000002)],
+  ];
+  my $unit_test_data_16 = [
+    # The implementation handles data in 7-nibble chunks, so those are the
+    # interesting boundaries.
+    [qw(aaaaaaaa 50505050
+        00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)],
+    [qw(50505050 aaaaaaaa
+        00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)],
+    [qw(ffffffff aaaaaaaa
+        00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)],
+    [qw(00000001 ffffffff
+        00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)],
+    [qw(00000001 fffffff0
+        00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)],
+
+    [qw(00_a00000a_aaaaaaa 50505050
+        00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)],
+    [qw(0f_fff0005_0505050 aaaaaaaa
+        0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)],
+    [qw(00_000000f_fffffff 01_800000a_aaaaaaa
+        01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)],
+    [qw(00_0000000_0000001 ff_fffffff_fffffff
+        00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)],
+    [qw(00_0000000_0000001 ff_fffffff_ffffff0
+        ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)],
+  ];
+
+  $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16);
+  if ($error_count > 0) {
+    print STDERR $error_count, " errors: FAILED\n";
+  } else {
+    print STDERR "PASS\n";
+  }
+  exit ($error_count);
+}
diff --git a/src/profiler.cc b/src/profiler.cc
index 7197d42..3633f6f 100644
--- a/src/profiler.cc
+++ b/src/profiler.cc
@@ -1,10 +1,10 @@
 // Copyright (c) 2005, Google Inc.
 // All rights reserved.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
@@ -14,7 +14,7 @@
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -96,7 +96,7 @@ DEFINE_string(cpu_profile, "",
 typedef struct siginfo SigStructure;
 inline void* GetPC(const SigStructure& sig_structure ) {
   return (void*)sig_structure.si_faddr; // maybe not correct
-} 
+}
 
 #elif defined HAVE_STRUCT_SIGCONTEXT_SC_EIP
 typedef struct sigcontext SigStructure;
@@ -126,6 +126,13 @@ inline void* GetPC(const SigStructure& sig_structure ) {
 typedef struct ucontext SigStructure;
 inline void* GetPC(const SigStructure& sig_structure ) {
   return (void*)sig_structure.uc_mcontext.gregs[REG_RIP];
+}
+
+#elif defined HAVE_STRUCT_SIGCONTEXT_REGS__NIP
+typedef struct sigcontext SigStructure;
+inline void* GetPC(const SigStructure& sig_structure ) {
+  return (void*)sig_structure.regs->nip;
+}
 
 #else
 #error I dont know what your PC is
@@ -141,7 +148,7 @@ class ProfileData {
 
   // Is profiling turned on at all
   inline bool enabled() { return out_ >= 0; }
-    
+
   // What is the frequency of interrupts (ticks per second)
   inline int frequency() { return frequency_; }
 
@@ -156,7 +163,7 @@ class ProfileData {
   void Stop();
 
   void GetCurrentState(ProfilerState* state);
-  
+
  private:
   static const int kMaxStackDepth = 64;         // Max stack depth profiled
   static const int kMaxFrequency = 4000;        // Largest allowed frequency
@@ -189,7 +196,7 @@ class ProfileData {
   pthread_mutex_t table_lock_;  // Cannot use "Mutex" in signal handlers
 #endif
   Bucket*       hash_;          // hash table
-  
+
   Slot*         evict_;         // evicted entries
   int           num_evicted_;   // how many evicted entries?
   int           out_;           // fd for output file
@@ -297,8 +304,8 @@ ProfileData::ProfileData() :
 
   if (!Start(fname)) {
     fprintf(stderr, "Can't turn on cpu profiling: ");
-    perror(fname); 
-    exit(1); 
+    perror(fname);
+    exit(1);
   }
 }
 
@@ -322,8 +329,8 @@ bool ProfileData::Start(const char* fname) {
   fname_ = strdup(fname);
 
   LOCK(&table_lock_);
-  
-  // Reset counters 
+
+  // Reset counters
   num_evicted_ = 0;
   count_       = 0;
   evictions_   = 0;
@@ -484,8 +491,8 @@ void ProfileData::FlushTable() {
 void ProfileData::Add(unsigned long pc) {
   void* stack[kMaxStackDepth];
   stack[0] = (void*)pc;
-  int depth = GetStackTrace(stack+1, kMaxStackDepth-1, 
-                            3/*Removes sighandlers*/);
+  int depth = GetStackTrace(stack+1, kMaxStackDepth-1,
+                            4/*Removes Add,prof_handler,sighandlers*/);
   depth++;              // To account for pc value
 
   // Make hash-value
@@ -519,7 +526,7 @@ void ProfileData::Add(unsigned long pc) {
       }
     }
   }
-  
+
   if (!done) {
     // Evict entry with smallest count
     Entry* e = &bucket->entry[0];
@@ -532,7 +539,7 @@ void ProfileData::Add(unsigned long pc) {
       evictions_++;
       Evict(*e);
     }
-    
+
     // Use the newly evicted entry
     e->depth = depth;
     e->count = 1;
@@ -594,7 +601,7 @@ void ProfilerFlush() {
   pdata.FlushTable();
 }
 
-bool ProfilingIsEnabledForAllThreads() { 
+bool ProfilingIsEnabledForAllThreads() {
   return pdata.enabled();
 }
 
diff --git a/src/stacktrace.cc b/src/stacktrace.cc
index 708d6ce..859d52a 100644
--- a/src/stacktrace.cc
+++ b/src/stacktrace.cc
@@ -40,145 +40,31 @@
 
 // Linux/x86 implementation (requires the binary to be compiled with
 // frame pointers)
-#if (defined(__i386__) || defined(__x86_64__)) && \
-    defined(__linux) && !defined(NO_FRAME_POINTER)
+#if defined(__i386__) && defined(__linux) && !defined(NO_FRAME_POINTER)
 #define IMPLEMENTED_STACK_TRACE
-
-#include <stdint.h>   // for uintptr_t
-
-// Given a pointer to a stack frame, locate and return the calling
-// stackframe, or return NULL if no stackframe can be found. Perform
-// sanity checks to reduce the chance that a bad pointer is returned.
-static void **NextStackFrame(void **old_sp) {
-  void **new_sp = (void **) *old_sp;
-
-  // Check that the transition from frame pointer old_sp to frame
-  // pointer new_sp isn't clearly bogus
-  if (new_sp <= old_sp) return NULL;
-  if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL;
-#ifdef __i386__
-  // On 64-bit machines, the stack pointer can be very close to
-  // 0xffffffff, so we explicitly check for a pointer into the
-  // last two pages in the address space
-  if ((uintptr_t)new_sp >= 0xffffe000) return NULL;
-#endif
-  if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL;
-  return new_sp;
-}
-
-// Note: the code for GetStackExtent below is pretty similar to this one;
-//       change both if chaning one.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
-  void **sp;
-#ifdef __i386__
-  // Stack frame format:
-  //    sp[0]   pointer to previous frame
-  //    sp[1]   caller address
-  //    sp[2]   first argument
-  //    ...
-  sp = (void **)&result - 2;
-#endif
-
-#ifdef __x86_64__
-  // Arguments are passed in registers on x86-64, so we can't just
-  // offset from &result
-  sp = (void **) __builtin_frame_address(0);
-#endif
-
-  int n = 0;
-  skip_count++;         // Do not include the "GetStackTrace" frame
-  while (sp && n < max_depth) {
-    if (*(sp+1) == (void *)0) {
-      // In 64-bit code, we often see a frame that
-      // points to itself and has a return address of 0.
-      break;
-    }
-    if (skip_count > 0) {
-      skip_count--;
-    } else {
-      result[n++] = *(sp+1);
-    }
-    void** new_sp = NextStackFrame(sp);
-    if (!new_sp) break;
-    sp = new_sp;
-  }
-  return n;
-}
-
-// Note: the code is pretty similar to GetStackTrace above;
-//       change both if chaning one.
-bool GetStackExtent(void* sp,  void** stack_top, void** stack_bottom) {
-  void** cur_sp;
-
-  if (sp != NULL) {
-    cur_sp = (void**)sp;
-    *stack_top = sp;
-  } else {
-#ifdef __i386__
-    // Stack frame format:
-    //    sp[0]   pointer to previous frame
-    //    sp[1]   caller address
-    //    sp[2]   first argument
-    //    ...
-    cur_sp = (void**)&sp - 2;
+#include "stacktrace_x86-inl.h"
 #endif
 
-#ifdef __x86_64__
-    // Arguments are passed in registers on x86-64, so we can't just
-    // offset from &result
-    cur_sp = (void**)__builtin_frame_address(0);
+#if !defined(IMPLEMENTED_STACK_TRACE) && defined(USE_LIBUNWIND) && HAVE_LIBUNWIND_H
+#define IMPLEMENTED_STACK_TRACE
+// This is turned off by default. Possible reasons for turning on in the
+// future:
+// 1. Compiler independence
+// 2. Architecture independence
+// 3. A more liberal MIT license, which allows use with multiple compilers
+#include "stacktrace_libunwind-inl.h"
 #endif
-    *stack_top = NULL;
-  }
-
-  while (cur_sp) {
-    void** new_sp = NextStackFrame(cur_sp);
-    if (!new_sp) {
-      *stack_bottom = (void*)cur_sp;
-      return true;
-    }
-    cur_sp = new_sp;
-    if (*stack_top == NULL)  *stack_top = (void*)cur_sp;
-    // get out of the stack frame for this call
-  }
-  return false;
-}
 
+#if !defined(IMPLEMENTED_STACK_TRACE) && defined(__x86_64__) && HAVE_UNWIND_H
+#define IMPLEMENTED_STACK_TRACE
+#include "stacktrace_x86_64-inl.h"
 #endif
 
-// Portable implementation - just use glibc
-// 
-// Note:  The glibc implementation may cause a call to malloc.
-// This can cause a deadlock in HeapProfiler.
-#if !defined(IMPLEMENTED_STACK_TRACE) && defined(HAVE_EXECINFO_H)
-#include <stdlib.h>
-#include <execinfo.h>
-
-int GetStackTrace(void** result, int max_depth, int skip_count) {
-  static const int kStackLength = 64;
-  void * stack[kStackLength];
-  int size;
-
-  size = backtrace(stack, kStackLength);
-  skip_count++;  // we want to skip the current frame as well
-  int result_count = size - skip_count;
-  if ( result_count < 0 )
-    result_count = 0;
-  else if ( result_count > max_depth )
-    result_count = max_depth;
-
-  for (int i = 0; i < result_count; i++)
-    result[i] = stack[i + skip_count];
-
-  return result_count;
-}
-
-bool GetStackExtent(void* sp,  void** stack_bottom, void** stack_top) {
-  return false;  // can't climb up
-}
-
+#if !defined(IMPLEMENTED_STACK_TRACE) && !defined(__x86_64__) && HAVE_EXECINFO_H
+#define IMPLEMENTED_STACK_TRACE
+#include "stacktrace_generic-inl.h"
 #endif
 
-#if !defined(IMPLEMENTED_STACK_TRACE) && !defined(HAVE_EXECINFO_H)
+#ifndef IMPLEMENTED_STACK_TRACE
 #error Cannot calculate stack trace: will need to write for your environment
 #endif
diff --git a/src/stacktrace_generic-inl.h b/src/stacktrace_generic-inl.h
new file mode 100644
index 0000000..28a539a
--- /dev/null
+++ b/src/stacktrace_generic-inl.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Portable implementation - just use glibc
+//
+// Note:  The glibc implementation may cause a call to malloc.
+// This can cause a deadlock in HeapProfiler.
+#include <execinfo.h>
+#include <stdlib.h>
+#include "google/stacktrace.h"
+
+int GetStackTrace(void** result, int max_depth, int skip_count) {
+  static const int kStackLength = 64;
+  void * stack[kStackLength];
+  int size;
+
+  size = backtrace(stack, kStackLength);
+  skip_count++;  // we want to skip the current frame as well
+  int result_count = size - skip_count;
+  if (result_count < 0)
+    result_count = 0;
+  if (result_count > max_depth)
+    result_count = max_depth;
+  for (int i = 0; i < result_count; i++)
+    result[i] = stack[i + skip_count];
+
+  return result_count;
+}
+
+bool GetStackExtent(void* sp,  void** stack_top, void** stack_bottom) {
+  return false;  // can't climb up
+}
diff --git a/src/stacktrace_libunwind-inl.h b/src/stacktrace_libunwind-inl.h
new file mode 100644
index 0000000..42c28d3
--- /dev/null
+++ b/src/stacktrace_libunwind-inl.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Arun Sharma
+//
+// Produce stack trace using libunwind
+
+extern "C" {
+#include <assert.h>
+#include <libunwind.h>
+}
+#include "google/stacktrace.h"
+
+int GetStackTrace(void** result, int max_depth, int skip_count) {
+  void *ip;
+  int ret, n = 0;
+  unw_cursor_t cursor;
+  unw_context_t uc;
+
+  unw_getcontext(&uc);
+  ret = unw_init_local(&cursor, &uc);
+  assert(ret >= 0);
+  skip_count++;         // Do not include the "GetStackTrace" frame
+
+  do {
+    ret = unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip);
+    assert(ret == 0);
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      result[n++] = ip;
+    }
+    ret = unw_step(&cursor);
+    assert(ret >= 0);
+  } while ((n < max_depth) && (ret > 0));
+
+  return n;
+}
+
+bool GetStackExtent(void* sp,  void** stack_top, void** stack_bottom) {
+  return false;  // Not implemented yet
+}
diff --git a/src/stacktrace_x86-inl.h b/src/stacktrace_x86-inl.h
new file mode 100644
index 0000000..cdab19b
--- /dev/null
+++ b/src/stacktrace_x86-inl.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Produce stack trace
+
+#include <stdint.h>   // for uintptr_t
+#include <stdlib.h>   // for NULL
+#include "google/stacktrace.h"
+
+// Given a pointer to a stack frame, locate and return the calling
+// stackframe, or return NULL if no stackframe can be found. Perform
+// sanity checks to reduce the chance that a bad pointer is returned.
+static void **NextStackFrame(void **old_sp) {
+  void **new_sp = (void **) *old_sp;
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (new_sp <= old_sp) return NULL;
+  if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL;
+#ifdef __i386__
+  // On 64-bit machines, the stack pointer can be very close to
+  // 0xffffffff, so we explicitly check for a pointer into the
+  // last two pages in the address space
+  if ((uintptr_t)new_sp >= 0xffffe000) return NULL;
+#endif
+  if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL;
+  return new_sp;
+}
+
+// Note: the code for GetStackExtent below is pretty similar to this one;
+//       change both if chaning one.
+int GetStackTrace(void** result, int max_depth, int skip_count) {
+  void **sp;
+  // Stack frame format:
+  //    sp[0]   pointer to previous frame
+  //    sp[1]   caller address
+  //    sp[2]   first argument
+  //    ...
+  sp = (void **)&result - 2;
+
+  int n = 0;
+  while (sp && n < max_depth) {
+    if (*(sp+1) == (void *)0) {
+      // In 64-bit code, we often see a frame that
+      // points to itself and has a return address of 0.
+      break;
+    }
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      result[n++] = *(sp+1);
+    }
+    void** new_sp = NextStackFrame(sp);
+    if (!new_sp) break;
+    sp = new_sp;
+  }
+  return n;
+}
+
+// Note: the code is pretty similar to GetStackTrace above;
+//       change both if changing one.
+bool GetStackExtent(void* sp,  void** stack_top, void** stack_bottom) {
+  void** cur_sp;
+
+  if (sp != NULL) {
+    cur_sp = (void**)sp;
+    *stack_top = sp;
+  } else {
+    // Stack frame format:
+    //    sp[0]   pointer to previous frame
+    //    sp[1]   caller address
+    //    sp[2]   first argument
+    //    ...
+    cur_sp = (void**)&sp - 2;
+
+    *stack_top = NULL;
+  }
+
+  while (cur_sp) {
+    void** new_sp = NextStackFrame(cur_sp);
+    if (!new_sp) {
+      *stack_bottom = (void*)cur_sp;
+      return true;
+    }
+    cur_sp = new_sp;
+    if (*stack_top == NULL)  *stack_top = (void*)cur_sp;
+    // get out of the stack frame for this call
+  }
+  return false;
+}
diff --git a/src/stacktrace_x86_64-inl.h b/src/stacktrace_x86_64-inl.h
new file mode 100644
index 0000000..3cdfe13
--- /dev/null
+++ b/src/stacktrace_x86_64-inl.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Arun Sharma
+//
+// Produce stack trace using libgcc
+
+extern "C" {
+#include <stdlib.h> // for NULL
+#include <unwind.h> // ABI defined unwinder
+}
+#include "google/stacktrace.h"
+
+typedef struct {
+  void **result;
+  int max_depth;
+  int skip_count;
+  int count;
+  void *top;
+  void *bottom;
+} trace_arg_t;
+
+
+// Workaround for the malloc() in _Unwind_Backtrace() issue.
+static _Unwind_Reason_Code nop_backtrace(struct _Unwind_Context *uc, void *opq) {
+  return _URC_NO_REASON;
+}
+
+
+// This code is not considered ready to run until
+// static initializers run so that we are guaranteed
+// that any malloc-related initialization is done.
+static bool ready_to_run = false;
+class StackTraceInit {
+ public:
+   StackTraceInit() {
+     // Extra call to force initialization
+     _Unwind_Backtrace(nop_backtrace, NULL);
+     ready_to_run = true;
+   }
+};
+
+static StackTraceInit module_initializer;  // Force initialization
+
+static _Unwind_Reason_Code GetOneFrame(struct _Unwind_Context *uc, void *opq) {
+  trace_arg_t *targ = (trace_arg_t *) opq;
+
+  if (targ->skip_count > 0) {
+    targ->skip_count--;
+  } else {
+    targ->result[targ->count++] = (void *) _Unwind_GetIP(uc);
+  }
+
+  if (targ->count == targ->max_depth)
+    return _URC_END_OF_STACK;
+
+  return _URC_NO_REASON;
+}
+
+int GetStackTrace(void** result, int max_depth, int skip_count) {
+  if (!ready_to_run)
+    return 0;
+
+  trace_arg_t targ;
+
+  skip_count += 1;         // Do not include the "GetStackTrace" frame
+
+  targ.result = result;
+  targ.max_depth = max_depth;
+  targ.skip_count = skip_count;
+  targ.count = 0;
+
+  _Unwind_Backtrace(GetOneFrame, &targ);
+
+  return targ.count;
+}
+
+static _Unwind_Reason_Code SearchExtents(struct _Unwind_Context *uc, void *opq) {
+  trace_arg_t *targ = (trace_arg_t *) opq;
+
+  targ->bottom = (void *) _Unwind_GetCFA(uc);
+
+  if (targ->top == NULL)
+    targ->top = targ->bottom;
+
+  return _URC_NO_REASON;
+}
+
+bool GetStackExtent(void* sp,  void** stack_top, void** stack_bottom) {
+  if (!ready_to_run)
+    return false;
+
+  trace_arg_t targ;
+
+  targ.top = NULL;
+  targ.bottom = NULL;
+
+  _Unwind_Backtrace(SearchExtents, &targ);
+
+  if ((targ.top != NULL) && (targ.bottom != NULL)) {
+    *stack_bottom = targ.bottom;
+    *stack_top = targ.top;
+    return true;
+  }
+
+  return false;
+}
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 2eb9ef4..5dc062e 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -1,10 +1,10 @@
 // Copyright (c) 2005, Google Inc.
 // All rights reserved.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
@@ -14,7 +14,7 @@
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -127,7 +127,7 @@ static const int kMinSystemAlloc = 1 << (20 - kPageShift);
 // amortize the lock overhead for accessing the central list.  Making
 // it too big may temporarily cause unnecessary memory wastage in the
 // per-thread free list until the scavenger cleans up the list.
-static const int kNumObjectsToMove = 32;
+static int num_objects_to_move[kNumClasses];
 
 // Maximum length we allow a per-thread free-list to have before we
 // move objects from it into the corresponding central free-list.  We
@@ -170,6 +170,22 @@ static size_t class_to_size[kNumClasses];
 // Mapping from size class to number of pages to allocate at a time
 static size_t class_to_pages[kNumClasses];
 
+
+
+// TransferCache is used to cache transfers of num_objects_to_move[size_class]
+// back and forth between thread caches and the central cache for a given size
+// class.
+struct TCEntry {
+  void *head;  // Head of chain of objects.
+  void *tail;  // Tail of chain of objects.
+};
+// A central cache freelist can have anywhere from 0 to kNumTransferEntries
+// slots to put link list chains into.  To keep memory usage bounded the total
+// number of TCEntries across size classes is fixed.  Currently each size
+// class is initially given one TCEntry which also means that the maximum any
+// one class can have is kNumClasses.
+static const int kNumTransferEntries = kNumClasses;
+
 // Return floor(log2(n)) for n > 0.
 #if (defined __i386__ || defined __x86_64__) && defined __GNUC__
 static inline int LgFloor(size_t n) {
@@ -201,6 +217,70 @@ static inline int LgFloor(size_t n) {
 }
 #endif
 
+
+// Some very basic linked list functions for dealing with using void * as
+// storage.
+
+static inline void *SLL_Next(void *t) {
+  return *(reinterpret_cast<void**>(t));
+}
+
+static inline void SLL_SetNext(void *t, void *n) {
+  *(reinterpret_cast<void**>(t)) = n;
+}
+
+static inline void SLL_Push(void **list, void *element) {
+  SLL_SetNext(element, *list);
+  *list = element;
+}
+
+static inline void *SLL_Pop(void **list) {
+  void *result = *list;
+  *list = SLL_Next(*list);
+  return result;
+}
+
+
+// Remove N elements from a linked list to which head points.  head will be
+// modified to point to the new head.  start and end will point to the first
+// and last nodes of the range.  Note that end will point to NULL after this
+// function is called.
+static inline void SLL_PopRange(void **head, int N, void **start, void **end) {
+  if (N == 0) {
+    *start = NULL;
+    *end = NULL;
+    return;
+  }
+
+  void *tmp = *head;
+  for (int i = 1; i < N; ++i) {
+    tmp = SLL_Next(tmp);
+  }
+
+  *start = *head;
+  *end = tmp;
+  *head = SLL_Next(tmp);
+  // Unlink range from list.
+  SLL_SetNext(tmp, NULL);
+}
+
+static inline void SLL_PushRange(void **head, void *start, void *end) {
+  if (!start) return;
+  SLL_SetNext(end, *head);
+  *head = start;
+}
+
+static inline size_t SLL_Size(void *head) {
+  int count = 0;
+  while (head) {
+    count++;
+    head = SLL_Next(head);
+  }
+  return count;
+}
+
+// Setup helper functions.
+
 static inline int SizeClass(size_t size) {
   if (size == 0) size = 1;
   const int lg = LgFloor(size);
@@ -213,6 +293,19 @@ static inline size_t ByteSizeForClass(size_t cl) {
   return class_to_size[cl];
 }
 
+
+static int NumMoveSize(size_t size) {
+  if (size == 0) return 0;
+  // Use approx 64k transfers between thread and central caches.
+  int num = static_cast<int>(64.0 * 1024.0 / size);
+  if (num < 2) num = 2;
+  // Clamp well below kMaxFreeListLength to avoid ping pong between central
+  // and thread caches.
+  if (num > static_cast<int>(0.8 * kMaxFreeListLength))
+    num = static_cast<int>(0.8 * kMaxFreeListLength);
+  return num;
+}
+
 // Initialize the mapping arrays
 static void InitSizeClasses() {
   // Special initialization for small sizes
@@ -290,6 +383,11 @@ static void InitSizeClasses() {
       abort();
     }
   }
+
+  // Initialize the num_objects_to_move array.
+  for (size_t cl = 1; cl  < kNumClasses; ++cl) {
+    num_objects_to_move[cl] = NumMoveSize(ByteSizeForClass(cl));
+  }
 }
 
 // -------------------------------------------------------------------------
@@ -833,7 +931,7 @@ void TCMalloc_PageHeap::Dump(TCMalloc_Printer* out) {
 
 static void RecordGrowth(size_t growth) {
   StackTrace* t = stacktrace_allocator.New();
-  t->depth = GetStackTrace(t->stack, kMaxStackDepth-1, 4);
+  t->depth = GetStackTrace(t->stack, kMaxStackDepth-1, 3);
   t->size = growth;
   t->stack[kMaxStackDepth-1] = reinterpret_cast<void*>(growth_stacks);
   growth_stacks = t;
@@ -939,18 +1037,26 @@ class TCMalloc_ThreadCache_FreeList {
   void clear_lowwatermark() { lowater_ = length_; }
 
   void Push(void* ptr) {
-    *(reinterpret_cast<void**>(ptr)) = list_;
-    list_ = ptr;
+    SLL_Push(&list_, ptr);
     length_++;
   }
 
   void* Pop() {
     ASSERT(list_ != NULL);
-    void* result = list_;
-    list_ = *(reinterpret_cast<void**>(result));
     length_--;
     if (length_ < lowater_) lowater_ = length_;
-    return result;
+    return SLL_Pop(&list_);
+  }
+
+  void PushRange(int N, void *start, void *end) {
+    SLL_PushRange(&list_, start, end);
+    length_ += N;
+  }
+
+  void PopRange(int N, void **start, void **end) {
+    SLL_PopRange(&list_, N, start, end);
+    ASSERT(length_ >= N);
+    length_ -= N;
   }
 };
 
@@ -1017,34 +1123,98 @@ class TCMalloc_Central_FreeList {
  public:
   void Init(size_t cl);
 
-  // REQUIRES: lock_ is held
-  // Insert object.
-  // May temporarily release lock_.
-  void Insert(void* object);
+  // These methods all do internal locking.
 
+  // Insert the specified range into the central freelist.  N is the number of
+  // elements in the range.
+  void InsertRange(void *start, void *end, int N);
+
+  // Returns the actual number of fetched elements into N.
+  void RemoveRange(void **start, void **end, int *N);
+
+  // Returns the number of free objects in cache.
+  int length() {
+    SpinLockHolder h(&lock_);
+    return counter_;
+  }
+
+  // Returns the number of free objects in the transfer cache.
+  int tc_length() {
+    SpinLockHolder h(&lock_);
+    return used_slots_ * num_objects_to_move[size_class_];
+  }
+
+ private:
   // REQUIRES: lock_ is held
   // Remove object from cache and return.
   // Return NULL if no free entries in cache.
-  void* Remove();
+  void* FetchFromSpans();
 
   // REQUIRES: lock_ is held
-  // Populate cache by fetching from the page heap.
+  // Remove object from cache and return.  Fetches
+  // from pageheap if cache is empty.  Only returns
+  // NULL on allocation failure.
+  void* FetchFromSpansSafe();
+
+  // REQUIRES: lock_ is held
+  // Release a linked list of objects to spans.
   // May temporarily release lock_.
-  void Populate();
+  void ReleaseListToSpans(void *start);
 
   // REQUIRES: lock_ is held
-  // Number of free objects in cache
-  int length() const { return counter_; }
+  // Release an object to spans.
+  // May temporarily release lock_.
+  void ReleaseToSpans(void* object);
+
+  // REQUIRES: lock_ is held
+  // Populate cache by fetching from the page heap.
+  // May temporarily release lock_.
+  void Populate();
 
-  // Lock -- exposed because caller grabs it before touching this object
+  // REQUIRES: lock is held.
+  // Tries to make room for a TCEntry.  If the cache is full it will try to
+  // expand it at the cost of some other cache size.  Return false if there is
+  // no space.
+  bool MakeCacheSpace();
+
+  // REQUIRES: lock_ for locked_size_class is held.
+  // Picks a "random" size class to steal TCEntry slot from.  In reality it
+  // just iterates over the sizeclasses but does so without taking a lock.
+  // Returns true on success.
+  // May temporarily lock a "random" size class.
+  static bool EvictRandomSizeClass(int locked_size_class, bool force);
+
+  // REQUIRES: lock_ is *not* held.
+  // Tries to shrink the Cache.  If force is true it will relase objects to
+  // spans if it allows it to shrink the cache.  Return false if it failed to
+  // shrink the cache.  Decrements cache_size_ on succeess.
+  // May temporarily take lock_.  If it takes lock_, the locked_size_class
+  // lock is released to the thread from holding two size class locks
+  // concurrently which could lead to a deadlock.
+  bool ShrinkCache(int locked_size_class, bool force);
+
+  // This lock protects all the data members.  cached_entries and cache_size_
+  // may be looked at without holding the lock.
   SpinLock lock_;
 
- private:
-  // We keep linked lists of empty and non-emoty spans.
+  // We keep linked lists of empty and non-empty spans.
   size_t   size_class_;     // My size class
   Span     empty_;          // Dummy header for list of empty spans
   Span     nonempty_;       // Dummy header for list of non-empty spans
   size_t   counter_;        // Number of free objects in cache entry
+
+  // Here we reserve space for TCEntry cache slots.  Since one size class can
+  // end up getting all the TCEntries quota in the system we just preallocate
+  // sufficient number of entries here.
+  TCEntry tc_slots_[kNumTransferEntries];
+
+  // Number of currently used cached entries in tc_slots_.  This variable is
+  // updated under a lock but can be read without one.
+  int32_t used_slots_;
+  // The current number of slots for this size class.  This is an
+  // adaptive value that is increased if there is lots of traffic
+  // on a given size class.
+  int32_t cache_size_;
 };
 
 // Pad each CentralCache object to multiple of 64 bytes
@@ -1104,9 +1274,21 @@ void TCMalloc_Central_FreeList::Init(size_t cl) {
   DLL_Init(&empty_);
   DLL_Init(&nonempty_);
   counter_ = 0;
+
+  cache_size_ = 1;
+  used_slots_ = 0;
+  ASSERT(cache_size_ <= kNumTransferEntries);
+}
+
+void TCMalloc_Central_FreeList::ReleaseListToSpans(void* start) {
+  while (start) {
+    void *next = SLL_Next(start);
+    ReleaseToSpans(start);
+    start = next;
+  }
 }
 
-void TCMalloc_Central_FreeList::Insert(void* object) {
+void TCMalloc_Central_FreeList::ReleaseToSpans(void* object) {
   const PageID p = reinterpret_cast<uintptr_t>(object) >> kPageShift;
   Span* span = pageheap->GetDescriptor(p);
   ASSERT(span != NULL);
@@ -1151,7 +1333,139 @@ void TCMalloc_Central_FreeList::Insert(void* object) {
   }
 }
 
-void* TCMalloc_Central_FreeList::Remove() {
+bool TCMalloc_Central_FreeList::EvictRandomSizeClass(
+    int locked_size_class, bool force) {
+  static int race_counter = 0;
+  int t = race_counter++;  // Updated without a lock, but who cares.
+  if (t >= kNumClasses) {
+    while (t >= kNumClasses) {
+      t -= kNumClasses;
+    }
+    race_counter = t;
+  }
+  ASSERT(t >= 0);
+  ASSERT(t < kNumClasses);
+  if (t == locked_size_class) return false;
+  return central_cache[t].ShrinkCache(locked_size_class, force);
+}
+
+bool TCMalloc_Central_FreeList::MakeCacheSpace() {
+  // Is there room in the cache?
+  if (used_slots_ < cache_size_) return true;
+  // Check if we can expand this cache?
+  if (cache_size_ == kNumTransferEntries) return false;
+  // Ok, we'll try to grab an entry from some other size class.
+  if (EvictRandomSizeClass(size_class_, false) ||
+      EvictRandomSizeClass(size_class_, true)) {
+    // Succeeded in evicting, we're going to make our cache larger.
+    cache_size_++;
+    return true;
+  }
+  return false;
+}
+
+
+namespace {
+class LockInverter {
+ private:
+  TCMalloc_SpinLock *held_, *temp_;
+ public:
+  inline explicit LockInverter(TCMalloc_SpinLock* held, TCMalloc_SpinLock *temp)
+    : held_(held), temp_(temp) { held_->Unlock(); temp_->Lock(); }
+  inline ~LockInverter() { temp_->Unlock(); held_->Lock();  }
+};
+}
+
+bool TCMalloc_Central_FreeList::ShrinkCache(int locked_size_class, bool force) {
+  // Start with a quick check without taking a lock.
+  if (cache_size_ == 0) return false;
+  // We don't evict from a full cache unless we are 'forcing'.
+  if (force == false && used_slots_ == cache_size_) return false;
+
+  // Grab lock, but first release the other lock held by this thread.  We use
+  // the lock inverter to ensure that we never hold two size class locks
+  // concurrently.  That can create a deadlock because there is no well
+  // defined nesting order.
+  LockInverter li(&central_cache[locked_size_class].lock_, &lock_);
+  ASSERT(used_slots_ <= cache_size_);
+  ASSERT(0 <= cache_size_);
+  if (cache_size_ == 0) return false;
+  if (used_slots_ == cache_size_) {
+    if (force == false) return false;
+    // ReleaseListToSpans releases the lock, so we have to make all the
+    // updates to the central list before calling it.
+    cache_size_--;
+    used_slots_--;
+    ReleaseListToSpans(tc_slots_[used_slots_].head);
+    return true;
+  }
+  cache_size_--;
+  return true;
+}
+
+void TCMalloc_Central_FreeList::InsertRange(void *start, void *end, int N) {
+  SpinLockHolder h(&lock_);
+  if (N == num_objects_to_move[size_class_] &&
+    MakeCacheSpace()) {
+    int slot = used_slots_++;
+    ASSERT(slot >=0);
+    ASSERT(slot < kNumTransferEntries);
+    TCEntry *entry = &tc_slots_[slot];
+    entry->head = start;
+    entry->tail = end;
+    return;
+  }
+  ReleaseListToSpans(start);
+}
+
+void TCMalloc_Central_FreeList::RemoveRange(void **start, void **end, int *N) {
+  int num = *N;
+  ASSERT(num > 0);
+
+  SpinLockHolder h(&lock_);
+  if (num == num_objects_to_move[size_class_] && used_slots_ > 0) {
+    int slot = --used_slots_;
+    ASSERT(slot >= 0);
+    TCEntry *entry = &tc_slots_[slot];
+    *start = entry->head;
+    *end = entry->tail;
+    return;
+  }
+
+  // TODO: Prefetch multiple TCEntries?
+  void *tail = FetchFromSpansSafe();
+  if (!tail) {
+    // We are completely out of memory.
+    *start = *end = NULL;
+    *N = 0;
+    return;
+  }
+
+  SLL_SetNext(tail, NULL);
+  void *head = tail;
+  int count = 1;
+  while (count < num) {
+    void *t = FetchFromSpans();
+    if (!t) break;
+    SLL_Push(&head, t);
+    count++;
+  }
+  *start = head;
+  *end = tail;
+  *N = count;
+}
+
+
+void* TCMalloc_Central_FreeList::FetchFromSpansSafe() {
+  void *t = FetchFromSpans();
+  if (!t) {
+    Populate();
+    t = FetchFromSpans();
+  }
+  return t;
+}
+
+void* TCMalloc_Central_FreeList::FetchFromSpans() {
   if (DLL_IsEmpty(&nonempty_)) return NULL;
   Span* span = nonempty_.next;
 
@@ -1244,11 +1558,8 @@ void TCMalloc_ThreadCache::Init(pthread_t tid) {
 void TCMalloc_ThreadCache::Cleanup() {
   // Put unused memory back into central cache
   for (int cl = 0; cl < kNumClasses; ++cl) {
-    FreeList* src = &list_[cl];
-    TCMalloc_Central_FreeList* dst = &central_cache[cl];
-    SpinLockHolder h(&dst->lock_);
-    while (!src->empty()) {
-      dst->Insert(src->Pop());
+    if (list_[cl].length() > 0) {
+      ReleaseToCentralCache(cl, list_[cl].length());
     }
   }
 }
@@ -1271,43 +1582,39 @@ inline void TCMalloc_ThreadCache::Deallocate(void* ptr, size_t cl) {
   list->Push(ptr);
   // If enough data is free, put back into central cache
   if (list->length() > kMaxFreeListLength) {
-    ReleaseToCentralCache(cl, kNumObjectsToMove);
+    ReleaseToCentralCache(cl, num_objects_to_move[cl]);
   }
   if (size_ >= per_thread_cache_size) Scavenge();
 }
 
 // Remove some objects of class "cl" from central cache and add to thread heap
 void TCMalloc_ThreadCache::FetchFromCentralCache(size_t cl) {
-  TCMalloc_Central_FreeList* src = &central_cache[cl];
-  FreeList* dst = &list_[cl];
-  SpinLockHolder h(&src->lock_);
-  for (int i = 0; i < kNumObjectsToMove; i++) {
-    void* object = src->Remove();
-   if (object == NULL) {
-      if (i == 0) {
-        src->Populate();        // Temporarily releases src->lock_
-        object = src->Remove();
-     }
-      if (object == NULL) {
-        break;
-      }
-    }
-    dst->Push(object);
-    size_ += ByteSizeForClass(cl);
-  }
+  int fetch_count = num_objects_to_move[cl];
+  void *start, *end;
+  central_cache[cl].RemoveRange(&start, &end, &fetch_count);
+  list_[cl].PushRange(fetch_count, start, end);
+  size_ += ByteSizeForClass(cl) * fetch_count;
 }
 
 // Remove some objects of class "cl" from thread heap and add to central cache
 void TCMalloc_ThreadCache::ReleaseToCentralCache(size_t cl, int N) {
+  ASSERT(N > 0);
   FreeList* src = &list_[cl];
-  TCMalloc_Central_FreeList* dst = &central_cache[cl];
-  SpinLockHolder h(&dst->lock_);
   if (N > src->length()) N = src->length();
   size_ -= N*ByteSizeForClass(cl);
-  while (N-- > 0) {
-    void* ptr = src->Pop();
-    dst->Insert(ptr);
+
+  // We return prepackaged chains of the correct size to the central cache.
+  // TODO: Use the same format internally in the thread caches?
+  int batch_size = num_objects_to_move[cl];
+  while (N > batch_size) {
+    void *tail, *head;
+    src->PopRange(batch_size, &head, &tail);
+    central_cache[cl].InsertRange(head, tail, batch_size);
+    N -= batch_size;
   }
+  void *tail, *head;
+  src->PopRange(N, &head, &tail);
+  central_cache[cl].InsertRange(head, tail, N);
 }
 
 // Release idle memory to the central cache
@@ -1395,7 +1702,7 @@ void TCMalloc_ThreadCache::InitTSD() {
   ASSERT(!tsd_inited);
   perftools_pthread_key_create(&heap_key, DeleteCache);
   tsd_inited = true;
-    
+
   // We may have used a fake pthread_t for the main thread.  Fix it.
   pthread_t zero;
   memset(&zero, 0, sizeof(zero));
@@ -1499,6 +1806,7 @@ struct TCMallocStats {
   uint64_t system_bytes;        // Bytes alloced from system
   uint64_t thread_bytes;        // Bytes in thread caches
   uint64_t central_bytes;       // Bytes in central cache
+  uint64_t transfer_bytes;      // Bytes in central transfer cache
   uint64_t pageheap_bytes;      // Bytes in page heap
   uint64_t metadata_bytes;      // Bytes alloced for metadata
 };
@@ -1506,11 +1814,14 @@ struct TCMallocStats {
 // Get stats into "r".  Also get per-size-class counts if class_count != NULL
 static void ExtractStats(TCMallocStats* r, uint64_t* class_count) {
   r->central_bytes = 0;
+  r->transfer_bytes = 0;
   for (int cl = 0; cl < kNumClasses; ++cl) {
-    SpinLockHolder h(&central_cache[cl].lock_);
     const int length = central_cache[cl].length();
+    const int tc_length = central_cache[cl].tc_length();
     r->central_bytes += static_cast<uint64_t>(ByteSizeForClass(cl)) * length;
-    if (class_count) class_count[cl] = length;
+    r->transfer_bytes +=
+      static_cast<uint64_t>(ByteSizeForClass(cl)) * tc_length;
+    if (class_count) class_count[cl] = length + tc_length;
   }
 
   // Add stats from per-thread heaps
@@ -1564,6 +1875,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
   const uint64_t bytes_in_use = stats.system_bytes
                                 - stats.pageheap_bytes
                                 - stats.central_bytes
+                                - stats.transfer_bytes
                                 - stats.thread_bytes;
 
   out->printf("------------------------------------------------\n"
@@ -1571,6 +1883,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
               "MALLOC: %12" LLU " Bytes in use by application\n"
               "MALLOC: %12" LLU " Bytes free in page heap\n"
               "MALLOC: %12" LLU " Bytes free in central cache\n"
+              "MALLOC: %12" LLU " Bytes free in transfer cache\n"
               "MALLOC: %12" LLU " Bytes free in thread caches\n"
               "MALLOC: %12" LLU " Spans in use\n"
               "MALLOC: %12" LLU " Thread heaps in use\n"
@@ -1580,6 +1893,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
               bytes_in_use,
               stats.pageheap_bytes,
               stats.central_bytes,
+              stats.transfer_bytes,
               stats.thread_bytes,
               uint64_t(span_allocator.inuse()),
               uint64_t(threadheap_allocator.inuse()),
@@ -1787,7 +2101,7 @@ static Span* DoSampledAllocation(size_t size) {
   }
 
   // Fill stack trace and record properly
-  stack->depth = GetStackTrace(stack->stack, kMaxStackDepth, 2);
+  stack->depth = GetStackTrace(stack->stack, kMaxStackDepth, 1);
   stack->size = size;
   span->sample = 1;
   span->objects = stack;
@@ -1797,28 +2111,34 @@ static Span* DoSampledAllocation(size_t size) {
 }
 
 static inline void* do_malloc(size_t size) {
+  void* ret = NULL;
 
-  if (TCMallocDebug::level >= TCMallocDebug::kVerbose) 
+  if (TCMallocDebug::level >= TCMallocDebug::kVerbose) {
     MESSAGE("In tcmalloc do_malloc(%" PRIuS")\n", size);
+  }
   // The following call forces module initialization
   TCMalloc_ThreadCache* heap = TCMalloc_ThreadCache::GetCache();
   if (heap->SampleAllocation(size)) {
     Span* span = DoSampledAllocation(size);
-    if (span == NULL) return NULL;
-    return reinterpret_cast<void*>(span->start << kPageShift);
+    if (span != NULL) {
+      ret = reinterpret_cast<void*>(span->start << kPageShift);
+    }
   } else if (size > kMaxSize) {
     // Use page-level allocator
     SpinLockHolder h(&pageheap_lock);
     Span* span = pageheap->New(pages(size));
-    if (span == NULL) return NULL;
-    return reinterpret_cast<void*>(span->start << kPageShift);
+    if (span != NULL) {
+      ret = reinterpret_cast<void*>(span->start << kPageShift);
+    }
   } else {
-    return heap->Allocate(size);
+    ret = heap->Allocate(size);
   }
+  if (ret == NULL) errno = ENOMEM;
+  return ret;
 }
 
 static inline void do_free(void* ptr) {
-  if (TCMallocDebug::level >= TCMallocDebug::kVerbose) 
+  if (TCMallocDebug::level >= TCMallocDebug::kVerbose)
     MESSAGE("In tcmalloc do_free(%p)\n", ptr);
   if (ptr == NULL) return;
   ASSERT(pageheap != NULL);  // Should not call free() before malloc()
@@ -1835,8 +2155,8 @@ static inline void do_free(void* ptr) {
       heap->Deallocate(ptr, cl);
     } else {
       // Delete directly into central cache
-      SpinLockHolder h(&central_cache[cl].lock_);
-      central_cache[cl].Insert(ptr);
+      SLL_SetNext(ptr, NULL);
+      central_cache[cl].InsertRange(ptr, ptr, 1);
     }
   } else {
     SpinLockHolder h(&pageheap_lock);
@@ -2045,39 +2365,88 @@ extern "C" void* realloc(void* old_ptr, size_t new_size) {
 }
 
 #ifndef COMPILER_INTEL
-#define OPNEW_THROW
-#define OPDELETE_THROW
+#define OP_THROWNOTHING
+#define OP_THROWBADALLOC
 #else
-#define OPNEW_THROW throw(std::bad_alloc)
-#define OPDELETE_THROW throw()
+#define OP_THROWNOTHING throw()
+#define OP_THROWBADALLOC throw(std::bad_alloc)
 #endif
 
-void* operator new(size_t size) OPNEW_THROW {
-  void* p = do_malloc(size);
-  if (p == NULL) {
-    MESSAGE("Unable to allocate %" PRIuS " bytes: new failed\n", size);
-    abort();
+static SpinLock set_new_handler_lock = SPINLOCK_INITIALIZER;
+
+static inline void* cpp_alloc(size_t size, bool nothrow) {
+  for (;;) {
+    void* p = do_malloc(size);
+#ifdef PREANSINEW
+    MallocHook::InvokeNewHook(p, size);
+    return p;
+#else
+    if (p == NULL) {  // allocation failed
+      // Get the current new handler.  NB: this function is not
+      // thread-safe.  We make a feeble stab at making it so here, but
+      // this lock only protects against tcmalloc interfering with
+      // itself, not with other libraries calling set_new_handler.
+      std::new_handler nh;
+      {
+        SpinLockHolder h(&set_new_handler_lock);
+        nh = std::set_new_handler(0);
+        (void) std::set_new_handler(nh);
+      }
+      // If no new_handler is established, the allocation failed.
+      if (!nh) {
+        if (nothrow) return 0;
+        throw std::bad_alloc();
+      }
+      // Otherwise, try the new_handler.  If it returns, retry the
+      // allocation.  If it throws std::bad_alloc, fail the allocation.
+      // if it throws something else, don't interfere.
+      try {
+        (*nh)();
+      } catch (const std::bad_alloc&) {
+        if (!nothrow) throw;
+        MallocHook::InvokeNewHook(p, size);
+        return p;
+      }
+    } else {  // allocation success
+      MallocHook::InvokeNewHook(p, size);
+      return p;
+    }
+#endif
   }
-  MallocHook::InvokeNewHook(p, size);
-  return p;
 }
 
-void operator delete(void* p) OPDELETE_THROW {
+void* operator new(size_t size) OP_THROWBADALLOC {
+  return cpp_alloc(size, false);
+}
+
+void* operator new(size_t size, const std::nothrow_t&) OP_THROWNOTHING {
+  return cpp_alloc(size, true);
+}
+
+void operator delete(void* p) OP_THROWNOTHING {
   MallocHook::InvokeDeleteHook(p);
   do_free(p);
 }
 
-void* operator new[](size_t size) OPNEW_THROW {
-  void* p = do_malloc(size);
-  if (p == NULL) {
-    MESSAGE("Unable to allocate %" PRIuS " bytes: new failed\n", size);
-    abort();
-  }
-  MallocHook::InvokeNewHook(p, size);
-  return p;
+void operator delete(void* p, const std::nothrow_t&) OP_THROWNOTHING {
+  MallocHook::InvokeDeleteHook(p);
+  do_free(p);
+}
+
+void* operator new[](size_t size) OP_THROWBADALLOC {
+  return cpp_alloc(size, false);
+}
+
+void* operator new[](size_t size, const std::nothrow_t&) OP_THROWNOTHING {
+  return cpp_alloc(size, true);
+}
+
+void operator delete[](void* p) OP_THROWNOTHING {
+  MallocHook::InvokeDeleteHook(p);
+  do_free(p);
 }
 
-void operator delete[](void* p) OPDELETE_THROW {
+void operator delete[](void* p, const std::nothrow_t&) OP_THROWNOTHING {
   MallocHook::InvokeDeleteHook(p);
   do_free(p);
 }
@@ -2143,11 +2512,14 @@ extern "C" struct mallinfo mallinfo(void) {
   // Unfortunately, the struct contains "int" field, so some of the
   // size values will be truncated.
   info.arena     = static_cast<int>(stats.system_bytes);
-  info.fsmblks   = static_cast<int>(stats.thread_bytes + stats.central_bytes);
+  info.fsmblks   = static_cast<int>(stats.thread_bytes
+                                    + stats.central_bytes
+                                    + stats.transfer_bytes);
   info.fordblks  = static_cast<int>(stats.pageheap_bytes);
   info.uordblks  = static_cast<int>(stats.system_bytes
                                     - stats.thread_bytes
                                     - stats.central_bytes
+                                    - stats.transfer_bytes
                                     - stats.pageheap_bytes);
 
   return info;
diff --git a/src/tests/stacktrace_unittest.cc b/src/tests/stacktrace_unittest.cc
index 34a7718..b73cf4b 100644
--- a/src/tests/stacktrace_unittest.cc
+++ b/src/tests/stacktrace_unittest.cc
@@ -42,6 +42,7 @@
 // backtrace, and maybe print the backtrace to stdout. 
 
 //-----------------------------------------------------------------------//
+void CheckStackTraceLeaf();
 void CheckStackTrace4(int i);
 void CheckStackTrace3(int i);
 void CheckStackTrace2(int i);
@@ -51,8 +52,9 @@ void CheckStackTrace(int i);
 
 // The sequence of functions whose return addresses we expect to see in the
 // backtrace.
-const int BACKTRACE_STEPS = 5;
+const int BACKTRACE_STEPS = 6;
 void * expected_stack[BACKTRACE_STEPS] = { 
+  (void *) &CheckStackTraceLeaf,
   (void *) &CheckStackTrace4,
   (void *) &CheckStackTrace3,
   (void *) &CheckStackTrace2,
@@ -77,10 +79,6 @@ void * expected_stack[BACKTRACE_STEPS] = {
 // ...
 // stack[5] is ret addr within CheckStackTrace
 
-// The google version does not include the caller in the 
-// backtrace.  Some other version might.  (glibc backtrace()?)
-const int self_in_backtrace = 0;
-
 //-----------------------------------------------------------------------//
 
 void CheckRetAddrIsInFunction( void * ret_addr, void * function_start_addr)
@@ -96,14 +94,20 @@ void CheckStackTraceLeaf(void) {
   const int STACK_LEN = 10;
   void *stack[STACK_LEN];
   int size;
+    void *top, *bottom;
 
   size = GetStackTrace(stack, STACK_LEN, 0);
   printf("Obtained %d stack frames.\n", size);
   CHECK_LE(size, STACK_LEN);
-  
+
+  // GetStackExtent() may not be implemented
+  if (GetStackExtent(NULL, &top, &bottom)) {
+    printf("%p %p\n", top, bottom);
+  }
+
   for (int i = 0; i < BACKTRACE_STEPS; i++)
   {
-    CheckRetAddrIsInFunction(stack[i + self_in_backtrace], expected_stack[i]);
+    CheckRetAddrIsInFunction(stack[i], expected_stack[i]);
   }
 
 
@@ -111,6 +115,7 @@ void CheckStackTraceLeaf(void) {
   {
     char **strings = backtrace_symbols(stack, size);
     
+    printf("Obtained %d stack frames.\n", size);
     for (int i = 0; i < size; i++)
       printf("%s\n", strings[i]);
     printf("CheckStackTrace() addr: %p\n", &CheckStackTrace);
diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc
index 22a6c29..c151c2a 100644
--- a/src/tests/tcmalloc_unittest.cc
+++ b/src/tests/tcmalloc_unittest.cc
@@ -1,10 +1,10 @@
 // Copyright (c) 2005, Google Inc.
 // All rights reserved.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
@@ -14,7 +14,7 @@
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -28,33 +28,598 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 // ---
-// Author: Paul Menage
+// Unittest for the TCMalloc implementation.
 //
-// TODO(menage) Turn this into a real unittest ...
+// * The test consists of a set of threads.
+// * Each thread maintains a set of allocated objects, with
+//   a bound on the total amount of data in the set.
+// * Each allocated object's contents are generated by
+//   hashing the object pointer, and a generation count
+//   in the object.  This allows us to easily check for
+//   data corruption.
+// * At any given step, the thread can do any of the following:
+//     a. Allocate an object
+//     b. Increment an object's generation count and update
+//        its contents.
+//     c. Pass the object to another thread
+//     d. Free an object
+//   Also, at the end of every step, object(s) are freed to maintain
+//   the memory upper-bound.
 
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
-#include <stdint.h>
-#include "google/malloc_extension.h"
+#include <stdint.h>      // for intptr_t
+#include <unistd.h>      // for getpid()
+#include <pthread.h>
+#include <vector>
+#include <string>
+#include <new>
+#include "base/logging.h"
+
+#define LOGSTREAM   stdout
+
+using std::vector;
+using std::string;
+
+static const int FLAGS_numtests = 50000;
+static const int FLAGS_log_every_n_tests = 10000;
+
+// Testing parameters
+static const int FLAGS_lgmaxsize = 16;   // lg() of the max size object to alloc
+static const int FLAGS_numthreads = 10;  // Number of threads
+static const int FLAGS_threadmb = 4;     // Max memory size allocated by thread
+
+// Weights of different operations
+static const int FLAGS_mallocweight = 50;   // Weight for picking malloc
+static const int FLAGS_freeweight = 50;     // Weight for picking free
+static const int FLAGS_updateweight = 10;   // Weight for picking update
+static const int FLAGS_passweight = 1;      // Weight for passing object
+
+static const int kSizeBits = 8 * sizeof(size_t);
+static const size_t kMaxSize = ~static_cast<size_t>(0);
+static const size_t kMaxSignedSize = ((size_t(1) << (kSizeBits-1)) - 1);
+
+static const size_t kNotTooBig = 100000;
+static const size_t kTooBig = kMaxSignedSize;
+
+static int news_handled = 0;
+
+// Global array of threads
+class TesterThread;
+static TesterThread** threads;
+
+// To help with generating random numbers
+class TestHarness {
+ private:
+  // Information kept per type
+  struct Type {
+    string      name;
+    int         type;
+    int         weight;
+  };
+
+ public:
+  TestHarness(int seed)
+      : types_(new vector<Type>), total_weight_(0), num_tests_(0) {
+    srandom(seed);
+  }
+  ~TestHarness() {
+    delete types_;
+  }
+
+  // Add operation type with specified weight.  When starting a new
+  // iteration, an operation type is picked with probability
+  // proportional to its weight.
+  //
+  // "type" must be non-negative.
+  // "weight" must be non-negative.
+  void AddType(int type, int weight, const char* name);
+
+  // Call this to get the type of operation for the next iteration.
+  // It returns a random operation type from the set of registered
+  // operations.  Returns -1 if tests should finish.
+  int PickType();
+
+  // If n == 0, returns the next pseudo-random number in the range [0 .. 0]
+  // If n != 0, returns the next pseudo-random number in the range [0 .. n)
+  int Uniform(int n) {
+    if (n == 0) {
+      return random() * 0;
+    } else {
+      return random() % n;
+    }
+  }
+  // Pick "base" uniformly from range [0,max_log] and then return
+  // "base" random bits.  The effect is to pick a number in the range
+  // [0,2^max_log-1] with bias towards smaller numbers.
+  int Skewed(int max_log) {
+    const int base = random() % (max_log+1);
+    return random() % (1 << base);
+  }
+
+ private:
+  vector<Type>*         types_;         // Registered types
+  int                   total_weight_;  // Total weight of all types
+  int                   num_tests_;     // Num tests run so far
+};
+
+void TestHarness::AddType(int type, int weight, const char* name) {
+  Type t;
+  t.name = name;
+  t.type = type;
+  t.weight = weight;
+  types_->push_back(t);
+  total_weight_ += weight;
+}
+
+int TestHarness::PickType() {
+  if (num_tests_ >= FLAGS_numtests) return -1;
+
+  assert(total_weight_ > 0);
+  // This is a little skewed if total_weight_ doesn't divide 2^31, but it's close
+  int v = Uniform(total_weight_);
+  int i;
+  for (i = 0; i < types_->size(); i++) {
+    v -= (*types_)[i].weight;
+    if (v < 0) {
+      break;
+    }
+  }
+
+  assert(i < types_->size());
+  if ((num_tests_ % FLAGS_log_every_n_tests) == 0) {
+    fprintf(LOGSTREAM, "Test %d out of %d: %s\n",
+            num_tests_, FLAGS_numtests, (*types_)[i].name.c_str());
+  }
+  num_tests_++;
+  return (*types_)[i].type;
+}
+
+
+// Info kept per thread
+class TesterThread {
+ private:
+  // Info kept per allocated object
+  struct Object {
+    char*       ptr;                    // Allocated pointer
+    int         size;                   // Allocated size
+    int         generation;             // Generation counter of object contents
+  };
+
+  pthread_mutex_t       lock_;          // For passing in another thread's obj
+  int                   id_;            // My thread id
+  TestHarness           rnd_;           // For generating random numbers
+  vector<Object>        heap_;          // This thread's heap
+  vector<Object>        passed_;        // Pending objects passed from others
+  size_t                heap_size_;     // Current heap size
+  int                   locks_ok_;      // Number of OK TryLock() ops
+  int                   locks_failed_;  // Number of failed TryLock() ops
+
+  // Type of operations
+  enum Type { MALLOC, FREE, UPDATE, PASS };
+
+ public:
+  TesterThread(int id)
+    : id_(id),
+      rnd_(id+1),
+      heap_size_(0),
+      locks_ok_(0),
+      locks_failed_(0) {
+    CHECK_EQ(pthread_mutex_init(&lock_, NULL), 0);
+  }
+
+  virtual ~TesterThread() {
+    fprintf(LOGSTREAM, "Thread %2d: locks %6d ok; %6d trylocks failed\n",
+            id_, locks_ok_, locks_failed_);
+    if (locks_ok_ + locks_failed_ >= 1000) {
+      CHECK_LE(locks_failed_, locks_ok_ / 2);
+    }
+  }
+
+  virtual void Run() {
+    rnd_.AddType(MALLOC, FLAGS_mallocweight,  "malloc");
+    rnd_.AddType(FREE,   FLAGS_freeweight,    "free");
+    rnd_.AddType(UPDATE, FLAGS_updateweight,  "update");
+    rnd_.AddType(PASS,   FLAGS_passweight,    "pass");
+
+    while (true) {
+      AcquirePassedObjects();
+
+      switch (rnd_.PickType()) {
+        case MALLOC:  AllocateObject(); break;
+        case FREE:    FreeObject();     break;
+        case UPDATE:  UpdateObject();   break;
+        case PASS:    PassObject();     break;
+        case -1:      goto done;
+        default:      assert("" == "Unknown type");
+      }
+
+      ShrinkHeap();
+    }
+
+ done:
+    DeleteHeap();
+  }
+
+  // Allocate a new object
+  void AllocateObject() {
+    Object object;
+    object.size = rnd_.Skewed(FLAGS_lgmaxsize);
+    object.ptr = reinterpret_cast<char*>(malloc(object.size));
+    object.generation = 0;
+    FillContents(&object);
+    heap_.push_back(object);
+    heap_size_ += object.size;
+  }
+
+  // Mutate a random object
+  void UpdateObject() {
+    if (heap_.empty()) return;
+    const int index = rnd_.Uniform(heap_.size());
+    CheckContents(heap_[index]);
+    heap_[index].generation++;
+    FillContents(&heap_[index]);
+  }
+
+  // Free a random object
+  void FreeObject() {
+    if (heap_.empty()) return;
+    const int index = rnd_.Uniform(heap_.size());
+    Object object = heap_[index];
+    CheckContents(object);
+    free(object.ptr);
+    heap_size_ -= object.size;
+    heap_[index] = heap_[heap_.size()-1];
+    heap_.pop_back();
+  }
+
+  // Delete all objects in the heap
+  void DeleteHeap() {
+    while (!heap_.empty()) {
+      FreeObject();
+    }
+  }
+
+  // Free objects until our heap is small enough
+  void ShrinkHeap() {
+    while (heap_size_ > FLAGS_threadmb << 20) {
+      assert(!heap_.empty());
+      FreeObject();
+    }
+  }
+
+  // Pass a random object to another thread
+  void PassObject() {
+    // Pick object to pass
+    if (heap_.empty()) return;
+    const int index = rnd_.Uniform(heap_.size());
+    Object object = heap_[index];
+    CheckContents(object);
+
+    // Pick thread to pass
+    const int tid = rnd_.Uniform(FLAGS_numthreads);
+    TesterThread* thread = threads[tid];
+
+    if (pthread_mutex_trylock(&thread->lock_) == 0) {
+      // Pass the object
+      locks_ok_++;
+      thread->passed_.push_back(object);
+      CHECK_EQ(pthread_mutex_unlock(&thread->lock_), 0);
+      heap_size_ -= object.size;
+      heap_[index] = heap_[heap_.size()-1];
+      heap_.pop_back();
+    } else {
+      locks_failed_++;
+    }
+  }
+
+  // Grab any objects passed to this thread by another thread
+  void AcquirePassedObjects() {
+    // We do not create unnecessary contention by always using
+    // TryLock().  Plus we unlock immediately after swapping passed
+    // objects into a local vector.
+    vector<Object> copy;
+    { // Locking scope
+      if (pthread_mutex_trylock(&lock_) != 0) {
+        locks_failed_++;
+        return;
+      }
+      locks_ok_++;
+      swap(copy, passed_);
+      CHECK_EQ(pthread_mutex_unlock(&lock_), 0);
+    }
+
+    for (int i = 0; i < copy.size(); ++i) {
+      const Object& object = copy[i];
+      CheckContents(object);
+      heap_.push_back(object);
+      heap_size_ += object.size;
+    }
+  }
+
+  // Fill object contents according to ptr/generation
+  void FillContents(Object* object) {
+    // It doesn't need to be a great random number generator, but it does
+    // need to be a re-entrant one, since this is run in a thread
+    unsigned short xsubi[3];   // arguments for nrand48()
+    xsubi[0] = (reinterpret_cast<intptr_t>(object->ptr) & 0x7fff0000) >> 16;
+    xsubi[1] = (reinterpret_cast<intptr_t>(object->ptr) & 0x0000ffff);
+    xsubi[2] = 0x1234;
+
+    for (int i = 0; i < object->generation; ++i) {
+      nrand48(xsubi);
+    }
+    const char c = static_cast<char>(nrand48(xsubi));
+    memset(object->ptr, c, object->size);
+  }
+
+  // Check object contents
+  void CheckContents(const Object& object) {
+    // It doesn't need to be a great random number generator, but it does
+    // need to be a re-entrant one, since this is run in a thread
+    unsigned short xsubi[3];   // arguments for nrand48()
+    xsubi[0] = (reinterpret_cast<intptr_t>(object.ptr) & 0x7fff0000) >> 16;
+    xsubi[1] = (reinterpret_cast<intptr_t>(object.ptr) & 0x0000ffff);
+    xsubi[2] = 0x1234;
+
+    for (int i = 0; i < object.generation; ++i) {
+      nrand48(xsubi);
+    }
+
+    // For large objects, we just check a prefix/suffix
+    const char expected = static_cast<char>(nrand48(xsubi));
+    const int limit1 = object.size < 32 ? object.size : 32;
+    const int start2 = limit1 > object.size - 32 ? limit1 : object.size - 32;
+    for (int i = 0; i < limit1; ++i) {
+      CHECK_EQ(object.ptr[i], expected);
+    }
+    for (int i = start2; i < object.size; ++i) {
+      CHECK_EQ(object.ptr[i], expected);
+    }
+  }
+};
+
+static void* RunThread(void *vthread) {
+  TesterThread* t = reinterpret_cast<TesterThread*>(vthread);
+  t->Run();
+  return NULL;
+}
+
+static void TryHugeAllocation(size_t s) {
+  void* p = malloc(s);
+  CHECK(p == NULL);   // huge allocation s should fail!
+}
+
+static void TestHugeAllocations() {
+  // Check that asking for stuff tiny bit smaller than largest possible
+  // size returns NULL.
+  for (size_t i = 0; i < 10000; i++) {
+    TryHugeAllocation(kMaxSize - i);
+  }
+
+  // Check that asking for stuff near signed/unsigned boundary returns NULL
+  for (size_t i = 0; i < 100; i++) {
+    TryHugeAllocation(kMaxSignedSize - i);
+    TryHugeAllocation(kMaxSignedSize + i);
+  }
+}
+
+static void TestCalloc(size_t n, size_t s, bool ok) {
+  char* p = reinterpret_cast<char*>(calloc(n, s));
+  fprintf(LOGSTREAM, "calloc(%x, %x): %p\n", n, s, p);
+  if (!ok) {
+    CHECK(p == NULL);  // calloc(n, s) should succeed
+  } else {
+    CHECK(p != NULL);  // calloc(n, s) should not succeed
+    for (int i = 0; i < n*s; i++) {
+      CHECK(p[i] == '\0');
+    }
+    free(p);
+  }
+}
+
+static void TestNewHandler() throw (std::bad_alloc) {
+  ++news_handled;
+  throw std::bad_alloc();
+}
+
+static void TestOneNew(void* (*func)(size_t)) {
+  // success test
+  try {
+    void* ptr = (*func)(kNotTooBig);
+    if (0 == ptr) {
+      fprintf(LOGSTREAM, "allocation should not have failed.\n");
+      abort();
+    }
+  } catch (...) {
+    fprintf(LOGSTREAM, "allocation threw unexpected exception.\n");
+    abort();
+  }
+
+  // failure test
+  // we should always receive a bad_alloc exception
+  try {
+    (*func)(kTooBig);
+    fprintf(LOGSTREAM, "allocation should have failed.\n");
+    abort();
+  } catch (const std::bad_alloc&) {
+    // correct
+  } catch (...) {
+    fprintf(LOGSTREAM, "allocation threw unexpected exception.\n");
+    abort();
+  }
+}
+
+static void TestNew(void* (*func)(size_t)) {
+  news_handled = 0;
+
+  // test without new_handler:
+  std::new_handler saved_handler = std::set_new_handler(0);
+  TestOneNew(func);
+
+  // test with new_handler:
+  std::set_new_handler(TestNewHandler);
+  TestOneNew(func);
+  if (news_handled != 1) {
+    fprintf(LOGSTREAM, "new_handler was not called.\n");
+    abort();
+  }
+  std::set_new_handler(saved_handler);
+}
+
+static void TestOneNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) {
+  // success test
+  try {
+    void* ptr = (*func)(kNotTooBig, std::nothrow);
+    if (0 == ptr) {
+      fprintf(LOGSTREAM, "allocation should not have failed.\n");
+      abort();
+    }
+  } catch (...) {
+    fprintf(LOGSTREAM, "allocation threw unexpected exception.\n");
+    abort();
+  }
+
+  // failure test
+  // we should always receive a bad_alloc exception
+  try {
+    if ((*func)(kTooBig, std::nothrow) != 0) {
+      fprintf(LOGSTREAM, "allocation should have failed.\n");
+      abort();
+    }
+  } catch (...) {
+    fprintf(LOGSTREAM, "nothrow allocation threw unexpected exception.\n");
+    abort();
+  }
+}
+
+static void TestNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) {
+  news_handled = 0;
+
+  // test without new_handler:
+  std::new_handler saved_handler = std::set_new_handler(0);
+  TestOneNothrowNew(func);
+
+  // test with new_handler:
+  std::set_new_handler(TestNewHandler);
+  TestOneNothrowNew(func);
+  if (news_handled != 1) {
+    fprintf(LOGSTREAM, "nothrow new_handler was not called.\n");
+    abort();
+  }
+  std::set_new_handler(saved_handler);
+}
+
+
+int main(int argc, char** argv) {
+  // TODO(csilvers): port MemoryUsage() over so the test can use that
+#if 0
+  // Allocate and deallocate blocks of increasing sizes to check if the alloc
+  // metadata fragments the memory. (Do not put other allocations/deallocations
+  // before this test, it may break).
+  {
+    size_t memory_usage = MemoryUsage(getpid());
+    fprintf(LOGSTREAM, "==== Testing fragmentation\n");
+    for ( int i = 200; i < 240; ++i ) {
+      int size = i << 20;
+      void *test1 = malloc(size);
+      for ( int j = 0; j < size; j += (1 << 12) ) {
+        static_cast<char*>(test1)[j] = 1;
+      }
+      free(test1);
+    }
+    // There may still be a bit of fragmentation at the beginning, until we
+    // reach kPageMapBigAllocationThreshold bytes so we check for
+    // 200 + 240 + margin.
+    CHECK_LT(MemoryUsage(getpid()), memory_usage + (450 << 20) );
+  }
+#endif
+
+  // Check that empty allocation works
+  fprintf(LOGSTREAM, "==== Testing empty allocation\n");
+  {
+    void* p1 = malloc(0);
+    CHECK(p1 != NULL);
+    void* p2 = malloc(0);
+    CHECK(p2 != NULL);
+    CHECK(p1 != p2);
+    free(p1);
+    free(p2);
+  }
+
+  // Check that 512MB can be allocated
+  fprintf(LOGSTREAM, "==== Testing large allocation\n");
+  {
+    void* p = malloc(512<<20);
+    CHECK(p != NULL);   // could not allocate 512MB
+    free(p);
+  }
+
+  // Check that large allocations fail with NULL instead of crashing
+  fprintf(LOGSTREAM, "==== Testing out of memory\n");
+  for (int s = 0; ; s += (10<<20)) {
+    void* large_object = malloc(s);
+    if (large_object == NULL) break;
+    free(large_object);
+  }
+
+  // Check that huge allocations fail with NULL instead of crashing
+  fprintf(LOGSTREAM, "==== Testing huge allocations\n");
+  TestHugeAllocations();
+
+  // Check calloc() with various arguments
+  fprintf(LOGSTREAM, "==== Testing calloc\n");
+  TestCalloc(0, 0, true);
+  TestCalloc(0, 1, true);
+  TestCalloc(1, 1, true);
+  TestCalloc(1<<10, 0, true);
+  TestCalloc(1<<20, 0, true);
+  TestCalloc(0, 1<<10, true);
+  TestCalloc(0, 1<<20, true);
+  TestCalloc(1<<20, 2, true);
+  TestCalloc(2, 1<<20, true);
+  TestCalloc(1000, 1000, true);
+
+  TestCalloc(kMaxSize, 2, false);
+  TestCalloc(2, kMaxSize, false);
+  TestCalloc(kMaxSize, kMaxSize, false);
+
+  TestCalloc(kMaxSignedSize, 3, false);
+  TestCalloc(3, kMaxSignedSize, false);
+  TestCalloc(kMaxSignedSize, kMaxSignedSize, false);
+
+  fprintf(LOGSTREAM, "Testing operator new(nothrow).\n");
+  TestNothrowNew(&::operator new);
+  fprintf(LOGSTREAM, "Testing operator new[](nothrow).\n");
+  TestNothrowNew(&::operator new[]);
+  fprintf(LOGSTREAM, "Testing operator new.\n");
+  TestNew(&::operator new);
+  fprintf(LOGSTREAM, "Testing operator new[].\n");
+  TestNew(&::operator new[]);
+  fprintf(LOGSTREAM, "Done testing C++ operators.\n");
 
-#define BUFSIZE (100 << 10)
+  // Create threads
+  fprintf(LOGSTREAM, "==== Testing threaded allocation/deallocation\n");
+  threads = new TesterThread*[FLAGS_numthreads];
+  pthread_t* thread_ids = new pthread_t[FLAGS_numthreads];
+  for (int i = 0; i < FLAGS_numthreads; ++i) {
+    threads[i] = new TesterThread(i);
+  }
 
-int main(int argc, char **argv) {
-  char *buf1 = (char *)malloc(BUFSIZE);
-  memset(buf1, 0, BUFSIZE);
-  printf("Allocated buf1 via malloc() at %p\n", buf1);
+  // Start
+  for (int i = 0; i < FLAGS_numthreads; ++i) {
+    CHECK_EQ(pthread_create(&thread_ids[i], NULL, RunThread, threads[i]), 0);
+  }
 
-  char *buf2 = new char[BUFSIZE];
-  memset(buf2, 0, BUFSIZE);
-  printf("Allocated buf2 via new at %p\n", buf2);
-  
-  free(buf1);
-  delete[] buf2;
+  // Wait
+  for (int i = 0; i < FLAGS_numthreads; ++i) {
+    void* junk;
+    CHECK_EQ(pthread_join(thread_ids[i], &junk), 0);
+  }
 
-  char buffer[10 << 10];
-  MallocExtension::instance()->GetStats(buffer, sizeof(buffer));
-  printf("Malloc stats:\n%s\n", buffer);
+  for (int i = 0; i < FLAGS_numthreads; ++i) delete threads[i];    // Cleanup
 
+  fprintf(LOGSTREAM, "PASS\n");
   return 0;
 }
author	csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>	2007-03-22 04:46:29 +0000
committer	csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>	2007-03-22 04:46:29 +0000
commit	60a3a2ce77ed2713b2eedd20952d9cfc56ff7ccf (patch)
tree	234b959ff2d934386ecfc08a7477f5c94290b709
parent	298274f8d4f474d2b16a35c8babc58817088c59e (diff)
download	gperftools-60a3a2ce77ed2713b2eedd20952d9cfc56ff7ccf.tar.gz