From cb7393cbe2d737009001fd9d30dad568bac7a3d8 Mon Sep 17 00:00:00 2001
From: csilvers
Date: Mon, 21 Jun 2010 15:59:56 +0000
Subject: * Default to not sampling in tcmalloc (csilvers)
 * Add -DTCMALLOC_LARGE_PAGES: better perf for some workloads (rus)
 * Extend pprof --tools to allow per-tool configs (csilvers)
 * Have STL_Allocator pass on # bytes to free (richardfang)
 * Add a header guard to config.h (csilvers)
 * DOC: Clean up documentation around tcmalloc.slack_bytes (fikes)
 * DOC: Document ProfilerFlush, ProfilerStartWithOptions (csilvers)
 * PORTING: Work around a gcc 4.5.0 optimization bug (csilvers)
 * PORTING: Use -fno-builtin-malloc and friends when compiling tcmalloc
 * PORTING: Define _WIN32_WINNT high enough for mingw (csilvers)
 * PORTING: Work around libtool bug getting deps wrong in some cases
 * Update README.windows to emphasize $IncludeDir more (csilvers)
 * Rename README.windows to README_windows.txt (csilvers)

git-svn-id: http://gperftools.googlecode.com/svn/trunk@95 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
---
 Makefile.am                    |  64 +++++++++++++++----
 Makefile.in                    | 103 ++++++++++++++++++++++---------
 README                         |  69 ++++++++++++---------
 configure                      |   9 +++
 configure.ac                   |  14 +++++
 doc/cpuprofile.html            |   5 ++
 doc/tcmalloc.html              |  34 +++++++++--
 src/base/dynamic_annotations.c |  17 ++++++
 src/base/dynamic_annotations.h |  13 ++++
 src/base/stl_allocator.h       |   4 +-
 src/central_freelist.cc        |  24 +++++++-
 src/common.h                   |  43 +++++++++++--
 src/config.h.in                |   7 +++
 src/google/malloc_extension.h  |  25 ++++----
 src/heap-checker.cc            |   3 +
 src/internal_logging.h         |   4 +-
 src/linked_list.h              |   2 +
 src/memory_region_map.h        |   2 +-
 src/page_heap.cc               |  97 ++++++++++++-----------------
 src/page_heap.h                |  71 +++++----------------
 src/pprof                      |  35 +++++++----
 src/sampler.cc                 |   7 +--
 src/span.h                     |   4 --
 src/tcmalloc.cc                | 136 ++++++++++++++++++++++---------
 src/tests/frag_unittest.cc     |   7 ++-
 src/tests/page_heap_test.cc    |   2 +-
 src/tests/testutil.cc          |   2 +-
 src/thread_cache.cc            |   3 +-
 src/thread_cache.h             |  16 -----
 src/windows/config.h           |   5 +-
 src/windows/mingw.h            |  13 ++++
 31 files changed, 532 insertions(+), 308 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 73635db..8395013 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,9 +17,17 @@ endif !WITH_STACK_TRACE
 
 # This is mostly based on configure options
 AM_CXXFLAGS =
 
-# These are good warnings to turn on by default,
+# These are good warnings to turn on by default. We also tell gcc
+# that malloc, free, realloc, mmap, etc. are not builtins (these flags
+# are supported since gcc 3.1.1). gcc doesn't think most of them are
+# builtins now in any case, but it's best to be explicit in case that
+# changes one day. gcc ignores functions it doesn't understand.
 if GCC
-AM_CXXFLAGS += -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare
+AM_CXXFLAGS += -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare \
+               -fno-builtin-malloc -fno-builtin-free -fno-builtin-realloc \
+               -fno-builtin-calloc -fno-builtin-cfree \
+               -fno-builtin-memalign -fno-builtin-posix_memalign \
+               -fno-builtin-valloc -fno-builtin-pvalloc
 endif GCC
 
 # The -no-undefined flag allows libtool to generate shared libraries for
@@ -96,7 +104,7 @@ docdir = $(prefix)/share/doc/$(PACKAGE)-$(VERSION)
 
 # Add your documentation files (in doc/) in addition to these
 # top-level boilerplate files. Also add a TODO file if you have one.
# We'll add to this later, on a library-by-library basis -dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README README.windows \ +dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README README_windows.txt \ TODO # The libraries (.so's) you want to install @@ -400,7 +408,7 @@ libtcmalloc_minimal_la_SOURCES = $(TCMALLOC_CC) $(TCMALLOC_MINIMAL_INCLUDES) libtcmalloc_minimal_la_CXXFLAGS = -DNO_TCMALLOC_SAMPLES \ $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_minimal_internal.la +libtcmalloc_minimal_la_LIBADD = libtcmalloc_minimal_internal.la $(PTHREAD_LIBS) # For windows, we're playing around with trying to do some stacktrace # support even with libtcmalloc_minimal. For everyone else, though, @@ -442,6 +450,13 @@ tcmalloc_minimal_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ $(TCMALLOC_UNITTEST_INCLUDES) tcmalloc_minimal_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_minimal_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. tcmalloc_minimal_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) \ liblogging.la $(PTHREAD_LIBS) @@ -750,13 +765,13 @@ libtcmalloc_internal_la_SOURCES = $(libtcmalloc_minimal_internal_la_SOURCES) \ libtcmalloc_internal_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG \ $(AM_CXXFLAGS) $(NO_EXCEPTIONS) libtcmalloc_internal_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_internal_la_LIBADD = $(PTHREAD_LIBS) libstacktrace.la +libtcmalloc_internal_la_LIBADD = libstacktrace.la $(PTHREAD_LIBS) lib_LTLIBRARIES += libtcmalloc.la libtcmalloc_la_SOURCES = $(TCMALLOC_CC) $(TCMALLOC_INCLUDES) libtcmalloc_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_internal.la +libtcmalloc_la_LIBADD = libtcmalloc_internal.la $(PTHREAD_LIBS) if WITH_HEAP_CHECKER # heap-checker-bcad is last, in hopes its global ctor will run first. @@ -789,6 +804,13 @@ tcmalloc_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ $(TCMALLOC_UNITTEST_INCLUDES) tcmalloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. 
tcmalloc_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) # This makes sure it's safe to link in both tcmalloc and @@ -803,6 +825,13 @@ tcmalloc_both_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ tcmalloc_both_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_both_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) if WITH_CPU_PROFILER +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ libprofiler.la liblogging.la $(PTHREAD_LIBS) else @@ -822,6 +851,10 @@ raw_printer_test_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) raw_printer_test_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) raw_printer_test_LDADD = $(LIBTCMALLOC) $(PTHREAD_LIBS) +# sampler_test and sampling_test both require sampling to be turned +# on, which it's not by default. Use the "standard" value of 2^19. +TESTS_ENVIRONMENT += TCMALLOC_SAMPLE_PARAMETER=524288 + TESTS += sampler_test WINDOWS_PROJECTS += vsprojects/sampler_test/sampler_test.vcproj sampler_test_SOURCES = src/tests/sampler_test.cc \ @@ -909,8 +942,14 @@ heap_checker_unittest_SOURCES = src/tests/heap-checker_unittest.cc \ $(HEAP_CHECKER_UNITTEST_INCLUDES) heap_checker_unittest_CXXFLAGS = -g $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) heap_checker_unittest_LDFLAGS = -g $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) -# tcmalloc has to be specified last! -heap_checker_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la $(LIBTCMALLOC) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. +heap_checker_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) endif WITH_HEAP_CHECKER @@ -1003,9 +1042,12 @@ noinst_PROGRAMS += heap-checker_debug_unittest heap_checker_debug_unittest_SOURCES = $(heap_checker_unittest_SOURCES) heap_checker_debug_unittest_CXXFLAGS = $(heap_checker_unittest_CXXFLAGS) heap_checker_debug_unittest_LDFLAGS = $(heap_checker_unittest_LDFLAGS) -# tcmalloc has to be specified last! -heap_checker_debug_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la \ - libtcmalloc_debug.la +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. 
+heap_checker_debug_unittest_LDADD = libtcmalloc_debug.la liblogging.la \ + $(PTHREAD_LIBS) endif WITH_HEAP_CHECKER endif WITH_DEBUGALLOC diff --git a/Makefile.in b/Makefile.in index a717bed..0e51024 100644 --- a/Makefile.in +++ b/Makefile.in @@ -46,8 +46,17 @@ build_triplet = @build@ host_triplet = @host@ @WITH_STACK_TRACE_FALSE@am__append_1 = -DNO_TCMALLOC_SAMPLES -# These are good warnings to turn on by default, -@GCC_TRUE@am__append_2 = -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare +# These are good warnings to turn on by default. We also tell gcc +# that malloc, free, realloc, mmap, etc. are not builtins (these flags +# are supported since gcc 3.1.1). gcc doesn't think most of them are +# builtins now in any case, but it's best to be explicit in case that +# changes one day. gcc ignores functions it doesn't understand. +@GCC_TRUE@am__append_2 = -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare \ +@GCC_TRUE@ -fno-builtin-malloc -fno-builtin-free -fno-builtin-realloc \ +@GCC_TRUE@ -fno-builtin-calloc -fno-builtin-cfree \ +@GCC_TRUE@ -fno-builtin-memalign -fno-builtin-posix_memalign \ +@GCC_TRUE@ -fno-builtin-valloc -fno-builtin-pvalloc + # These are x86-specific, having to do with frame-pointers. In # particular, some x86_64 systems do not insert frame pointers by @@ -152,11 +161,15 @@ bin_PROGRAMS = @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ raw_printer_test \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ sampler_test \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ sampling_test.sh$(EXEEXT) -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_35 = vsprojects/sampler_test/sampler_test.vcproj + +# sampler_test and sampling_test both require sampling to be turned +# on, which it's not by default. Use the "standard" value of 2^19. # These unittests often need to run binaries. They're in the current dir -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_36 = BINDIR=. \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_35 = TCMALLOC_SAMPLE_PARAMETER=524288 \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ BINDIR=. 
\ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ TMPDIR=/tmp/perftools +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_36 = vsprojects/sampler_test/sampler_test.vcproj @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_37 = $(sampling_test_sh_SOURCES) # This is the sub-program used by sampling_test.sh @@ -345,8 +358,8 @@ libsysinfo_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ am_libsysinfo_la_OBJECTS = sysinfo.lo $(am__objects_1) libsysinfo_la_OBJECTS = $(am_libsysinfo_la_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_DEPENDENCIES = \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__libtcmalloc_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \ src/internal_logging.h src/system-alloc.h \ src/packed-cache-inl.h src/base/spinlock.h \ @@ -394,8 +407,8 @@ libtcmalloc_la_OBJECTS = $(am_libtcmalloc_la_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am_libtcmalloc_la_rpath = -rpath \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(libdir) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__DEPENDENCIES_3 = \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) @WITH_CPU_PROFILER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_and_profiler_la_DEPENDENCIES = $(am__DEPENDENCIES_3) am__libtcmalloc_and_profiler_la_SOURCES_DIST = src/tcmalloc.cc \ src/common.h src/internal_logging.h src/system-alloc.h \ @@ -486,8 +499,8 @@ libtcmalloc_debug_la_OBJECTS = $(am_libtcmalloc_debug_la_OBJECTS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am_libtcmalloc_debug_la_rpath = -rpath \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(libdir) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_DEPENDENCIES = \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libstacktrace.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libstacktrace.la \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__libtcmalloc_internal_la_SOURCES_DIST = src/common.cc \ src/internal_logging.cc src/system-alloc.cc \ src/memfs_malloc.cc src/central_freelist.cc src/page_heap.cc \ @@ -550,8 +563,8 @@ am__objects_21 = libtcmalloc_internal_la-common.lo \ libtcmalloc_internal_la_OBJECTS = \ $(am_libtcmalloc_internal_la_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am_libtcmalloc_internal_la_rpath = -libtcmalloc_minimal_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ - libtcmalloc_minimal_internal.la +libtcmalloc_minimal_la_DEPENDENCIES = libtcmalloc_minimal_internal.la \ + $(am__DEPENDENCIES_1) am__libtcmalloc_minimal_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \ src/internal_logging.h src/system-alloc.h \ src/packed-cache-inl.h src/base/spinlock.h \ @@ -574,8 +587,8 @@ am__libtcmalloc_minimal_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \ am_libtcmalloc_minimal_la_OBJECTS = $(am__objects_22) \ $(am__objects_20) libtcmalloc_minimal_la_OBJECTS = $(am_libtcmalloc_minimal_la_OBJECTS) -am__DEPENDENCIES_4 = $(am__DEPENDENCIES_1) \ - libtcmalloc_minimal_internal.la +am__DEPENDENCIES_4 = libtcmalloc_minimal_internal.la \ + $(am__DEPENDENCIES_1) @WITH_DEBUGALLOC_TRUE@libtcmalloc_minimal_debug_la_DEPENDENCIES = \ @WITH_DEBUGALLOC_TRUE@ $(am__DEPENDENCIES_4) am__libtcmalloc_minimal_debug_la_SOURCES_DIST = \ @@ 
-782,9 +795,9 @@ am__heap_checker_debug_unittest_SOURCES_DIST = \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@am_heap_checker_debug_unittest_OBJECTS = $(am__objects_27) heap_checker_debug_unittest_OBJECTS = \ $(am_heap_checker_debug_unittest_OBJECTS) -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_DEPENDENCIES = $(am__DEPENDENCIES_1) \ +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_DEPENDENCIES = libtcmalloc_debug.la \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ liblogging.la \ -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ libtcmalloc_debug.la +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__heap_checker_debug_unittest_sh_SOURCES_DIST = \ src/tests/heap-checker_unittest.sh am_heap_checker_debug_unittest_sh_OBJECTS = @@ -803,8 +816,8 @@ heap_checker_unittest_OBJECTS = $(am_heap_checker_unittest_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__DEPENDENCIES_6 = \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc.la @WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_DEPENDENCIES = \ -@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_1) liblogging.la \ -@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_6) +@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_6) liblogging.la \ +@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__heap_checker_unittest_sh_SOURCES_DIST = \ src/tests/heap-checker_unittest.sh am_heap_checker_unittest_sh_OBJECTS = @@ -1329,7 +1342,7 @@ man1dir = $(mandir)/man1 NROFF = nroff MANS = $(dist_man_MANS) am__dist_doc_DATA_DIST = AUTHORS COPYING ChangeLog INSTALL NEWS README \ - README.windows TODO doc/index.html doc/designstyle.css \ + README_windows.txt TODO doc/index.html doc/designstyle.css \ doc/pprof_remote_servers.html doc/tcmalloc.html \ doc/overview.gif doc/pageheap.gif doc/spanmap.gif \ doc/threadheap.gif doc/t-test1.times.txt \ @@ -1597,7 +1610,7 @@ noinst_HEADERS = src/google/tcmalloc.h.in # one day we figure it out. Regardless, installing the dot files isn't the # end of the world. dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README \ - README.windows TODO doc/index.html doc/designstyle.css \ + README_windows.txt TODO doc/index.html doc/designstyle.css \ $(am__append_12) doc/tcmalloc.html doc/overview.gif \ doc/pageheap.gif doc/spanmap.gif doc/threadheap.gif \ doc/t-test1.times.txt \ @@ -1658,7 +1671,7 @@ WINDOWS_PROJECTS = google-perftools.sln \ vsprojects/realloc_unittest/realloc_unittest.vcproj \ vsprojects/stack_trace_table_test/stack_trace_table_test.vcproj \ vsprojects/thread_dealloc_unittest/thread_dealloc_unittest.vcproj \ - $(am__append_35) + $(am__append_36) # unittests you want to run when people type 'make check'. # Note: tests cannot take any arguments! @@ -1690,7 +1703,7 @@ TESTS = low_level_alloc_unittest atomicops_unittest $(am__append_11) \ # TESTS_ENVIRONMENT sets environment variables for when you run unittest. # We always get "srcdir" set for free. # We'll add to this later, on a library-by-library basis. 
-TESTS_ENVIRONMENT = $(am__append_13) $(am__append_36) +TESTS_ENVIRONMENT = $(am__append_13) $(am__append_35) # All script tests should be added here noinst_SCRIPTS = $(am__append_16) $(am__append_25) $(am__append_37) \ $(am__append_40) $(am__append_43) $(am__append_58) @@ -1916,7 +1929,7 @@ libtcmalloc_minimal_la_CXXFLAGS = -DNO_TCMALLOC_SAMPLES \ $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_minimal_internal.la +libtcmalloc_minimal_la_LIBADD = libtcmalloc_minimal_internal.la $(PTHREAD_LIBS) @MINGW_FALSE@LIBTCMALLOC_MINIMAL = libtcmalloc_minimal.la # For windows, we're playing around with trying to do some stacktrace @@ -1930,6 +1943,13 @@ tcmalloc_minimal_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ tcmalloc_minimal_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_minimal_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. tcmalloc_minimal_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) \ liblogging.la $(PTHREAD_LIBS) @@ -2098,7 +2118,7 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(NO_EXCEPTIONS) \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__append_31) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_LDFLAGS = $(PTHREAD_CFLAGS) -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_LIBADD = $(PTHREAD_LIBS) libstacktrace.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_LIBADD = libstacktrace.la $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_SOURCES = \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(TCMALLOC_CC) \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(TCMALLOC_INCLUDES) \ @@ -2108,7 +2128,7 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(AM_CXXFLAGS) \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__append_32) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_LDFLAGS = $(PTHREAD_CFLAGS) -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_internal.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_LIBADD = libtcmalloc_internal.la $(PTHREAD_LIBS) @WITH_HEAP_CHECKER_FALSE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@HEAP_CHECKER_SOURCES = # heap-checker-bcad is last, in hopes its global ctor will run first. @@ -2131,6 +2151,13 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. 
+# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_both_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ src/tests/testutil.h src/tests/testutil.cc \ @@ -2141,6 +2168,13 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_CPU_PROFILER_FALSE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ @WITH_CPU_PROFILER_FALSE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ liblogging.la $(PTHREAD_LIBS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. @WITH_CPU_PROFILER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ @WITH_CPU_PROFILER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libprofiler.la liblogging.la $(PTHREAD_LIBS) @@ -2193,8 +2227,14 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_CXXFLAGS = -g $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) @WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_LDFLAGS = -g $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) -# tcmalloc has to be specified last! -@WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la $(LIBTCMALLOC) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. +@WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_debug_la_SOURCES = src/debugallocation.cc $(HEAP_CHECKER_SOURCES) \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(TCMALLOC_INCLUDES) @@ -2227,9 +2267,12 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_SOURCES = $(heap_checker_unittest_SOURCES) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_CXXFLAGS = $(heap_checker_unittest_CXXFLAGS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_LDFLAGS = $(heap_checker_unittest_LDFLAGS) -# tcmalloc has to be specified last! -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la \ -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ libtcmalloc_debug.la +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. 
This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_LDADD = libtcmalloc_debug.la liblogging.la \ +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ $(PTHREAD_LIBS) ### ------- CPU profiler diff --git a/README b/README index c7ef8c5..40ac8dc 100644 --- a/README +++ b/README @@ -4,34 +4,6 @@ There are known issues with some perftools functionality on x86_64 systems. See 64-BIT ISSUES, below. -CPU PROFILER ------------- -See doc/cpu-profiler.html for information about how to use the CPU -profiler and analyze its output. - -As a quick-start, do the following after installing this package: - -1) Link your executable with -lprofiler -2) Run your executable with the CPUPROFILE environment var set: - $ CPUPROFILE=/tmp/prof.out [binary args] -3) Run pprof to analyze the CPU usage - $ pprof /tmp/prof.out # -pg-like text output - $ pprof --gv /tmp/prof.out # really cool graphical output - -There are other environment variables, besides CPUPROFILE, you can set -to adjust the cpu-profiler behavior; cf "ENVIRONMENT VARIABLES" below. - -The CPU profiler is available on all unix-based systems we've tested; -see INSTALL for more details. It is not currently available on Windows. - -NOTE: CPU profiling doesn't work after fork (unless you immediately - do an exec()-like call afterwards). Furthermore, if you do - fork, and the child calls exit(), it may corrupt the profile - data. You can use _exit() to work around this. We hope to have - a fix for both problems in the next release of perftools - (hopefully perftools 1.2). - - TCMALLOC -------- Just link in -ltcmalloc or -ltcmalloc_minimal to get the advantages of @@ -42,6 +14,19 @@ tcmalloc functionality is available on all systems we've tested; see INSTALL for more details. See README.windows for instructions on using tcmalloc on Windows. +NOTE: When compiling with programs with gcc, that you plan to link +with libtcmalloc, it's safest to pass in the flags + + -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free + +when compiling. gcc makes some optimizations assuming it is using its +own, built-in malloc; that assumption obviously isn't true with +tcmalloc. In practice, we haven't seen any problems with this, but +the expected risk is highest for users who register their own malloc +hooks with tcmalloc (using google/malloc_hook.h). The risk is lowest +for folks who use tcmalloc_minimal (or, of course, who pass in the +above flags :-) ). + HEAP PROFILER ------------- @@ -96,6 +81,34 @@ The heap checker is only available on Linux at this time; see INSTALL for more details. +CPU PROFILER +------------ +See doc/cpu-profiler.html for information about how to use the CPU +profiler and analyze its output. + +As a quick-start, do the following after installing this package: + +1) Link your executable with -lprofiler +2) Run your executable with the CPUPROFILE environment var set: + $ CPUPROFILE=/tmp/prof.out [binary args] +3) Run pprof to analyze the CPU usage + $ pprof /tmp/prof.out # -pg-like text output + $ pprof --gv /tmp/prof.out # really cool graphical output + +There are other environment variables, besides CPUPROFILE, you can set +to adjust the cpu-profiler behavior; cf "ENVIRONMENT VARIABLES" below. + +The CPU profiler is available on all unix-based systems we've tested; +see INSTALL for more details. It is not currently available on Windows. 
+ +NOTE: CPU profiling doesn't work after fork (unless you immediately + do an exec()-like call afterwards). Furthermore, if you do + fork, and the child calls exit(), it may corrupt the profile + data. You can use _exit() to work around this. We hope to have + a fix for both problems in the next release of perftools + (hopefully perftools 1.2). + + EVERYTHING IN ONE ----------------- If you want the CPU profiler, heap profiler, and heap leak-checker to diff --git a/configure b/configure index 04e143b..9a3048c 100755 --- a/configure +++ b/configure @@ -21533,6 +21533,15 @@ _ACEOF $as_echo "#define PERFTOOLS_DLL_DECL /**/" >>confdefs.h +# In theory, config.h files shouldn't need a header guard, but we do, +# because we (maybe) #include windows/mingw.h from within config.h, +# and it #includes other .h files. These all have header guards, so +# the end result is if config.h is #included twice, its #undefs get +# evaluated twice, but all the ones in mingw.h/etc only get evaluated +# once, potentially causing trouble. c.f. +# http://code.google.com/p/google-perftools/issues/detail?id=246 + + # MinGW uses autoconf, but also needs the windows shim routines # (since it doesn't have its own support for, say, pthreads). # This requires us to #include a special header file, and also to diff --git a/configure.ac b/configure.ac index e93cdc4..adbb2e5 100644 --- a/configure.ac +++ b/configure.ac @@ -301,6 +301,18 @@ AC_DEFINE(PERFTOOLS_DLL_DECL,, internally, to compile the DLL, and every DLL source file #includes "config.h" before anything else.]) +# In theory, config.h files shouldn't need a header guard, but we do, +# because we (maybe) #include windows/mingw.h from within config.h, +# and it #includes other .h files. These all have header guards, so +# the end result is if config.h is #included twice, its #undefs get +# evaluated twice, but all the ones in mingw.h/etc only get evaluated +# once, potentially causing trouble. c.f. +# http://code.google.com/p/google-perftools/issues/detail?id=246 +AH_TOP([ +#ifndef GOOGLE_PERFTOOLS_CONFIG_H_ +#define GOOGLE_PERFTOOLS_CONFIG_H_ +]) + # MinGW uses autoconf, but also needs the windows shim routines # (since it doesn't have its own support for, say, pthreads). # This requires us to #include a special header file, and also to @@ -309,6 +321,8 @@ AH_BOTTOM([ #ifdef __MINGW32__ #include "windows/mingw.h" #endif + +#endif /* #ifndef GOOGLE_PERFTOOLS_CONFIG_H_ */ ]) AM_CONDITIONAL(MINGW, expr $host : '.*-mingw' >/dev/null 2>&1) diff --git a/doc/cpuprofile.html b/doc/cpuprofile.html index 3d2b4cc..f029e78 100644 --- a/doc/cpuprofile.html +++ b/doc/cpuprofile.html @@ -71,6 +71,11 @@ CPUPROFILE with the child's process id).

 For security reasons, CPU profiling will not write to a file -- and
 is thus not usable -- for setuid programs.
 
+See the include-file google/profiler.h for
+advanced-use functions, including ProfilerFlush() and
+ProfilerStartWithOptions().
+
+
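As a minimal sketch of how these new entry points are meant to be driven
(ProfilerStartWithOptions(), ProfilerFlush(), ProfilerStop(), and struct
ProfilerOptions with its filter_in_thread/filter_in_thread_arg fields are
the google/profiler.h declarations; the callback name, its include-everything
behavior, and the output path below are illustrative assumptions):

  #include <stddef.h>           // for NULL
  #include <google/profiler.h>  // ProfilerOptions, ProfilerStartWithOptions, ...

  // Filter callback: a nonzero return means "include this thread's samples".
  static int ProfileAllThreads(void* /*arg*/) { return 1; }

  int main() {
    struct ProfilerOptions options;
    options.filter_in_thread = &ProfileAllThreads;
    options.filter_in_thread_arg = NULL;
    if (ProfilerStartWithOptions("/tmp/myprog.prof", &options)) {
      // ... run the code to be profiled ...
      ProfilerFlush();  // force buffered samples out to the profile file
      ProfilerStop();
    }
    return 0;
  }

The resulting /tmp/myprog.prof can then be fed to pprof exactly as in the
CPUPROFILE quick-start shown in the README above.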

 Modifying Runtime Behavior

You can more finely control the behavior of the CPU profiler via diff --git a/doc/tcmalloc.html b/doc/tcmalloc.html index 4f60f92..9d7ab7e 100644 --- a/doc/tcmalloc.html +++ b/doc/tcmalloc.html @@ -462,11 +462,15 @@ environment variables.

TCMALLOC_SAMPLE_PARAMETER - default: 524288 + default: 0 The approximate gap between sampling actions. That is, we take one sample approximately once every tcmalloc_sample_parmeter bytes of allocation. + This sampled heap information is available via + MallocExtension::GetHeapSample() or + MallocExtension::ReadStackTraces(). A reasonable + value is 524288. @@ -674,12 +678,34 @@ you can access them with a call like + + tcmalloc.pageheap_free_bytes + + Number of bytes in free, mapped pages in page heap. These bytes + can be used to fulfill allocation requests. They always count + towards virtual memory usage, and unless the underlying memory is + swapped out by the OS, they also count towards physical memory + usage. + + + + + tcmalloc.pageheap_unmapped_bytes + + Number of bytes in free, unmapped pages in page heap. These are + bytes that have been released back to the OS, possibly by one of + the MallocExtension "Release" calls. They can be used to fulfill + allocation requests, but typically incur a page fault. They + always count towards virtual memory usage, and depending on the + OS, typically do not count towards physical memory usage. + + + tcmalloc.slack_bytes - A measure of memory fragmentation (how much memory is reserved by - TCMalloc but unlikely to ever be able to serve an allocation - request). + Sum of pageheap_free_bytes and pageheap_unmapped_bytes. Provided + for backwards compatibility only. Do not use. diff --git a/src/base/dynamic_annotations.c b/src/base/dynamic_annotations.c index cdefaa7..bddd693 100644 --- a/src/base/dynamic_annotations.c +++ b/src/base/dynamic_annotations.c @@ -141,8 +141,25 @@ int RunningOnValgrind(void) { static volatile int running_on_valgrind = -1; /* C doesn't have thread-safe initialization of statics, and we don't want to depend on pthread_once here, so hack it. */ + ANNOTATE_BENIGN_RACE(&running_on_valgrind, "safe hack"); int local_running_on_valgrind = running_on_valgrind; if (local_running_on_valgrind == -1) running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind(); return local_running_on_valgrind; } + +/* See the comments in dynamic_annotations.h */ +double ValgrindSlowdown() { + if (RunningOnValgrind() == 0) { + return 1.0; + } + /* Same initialization hack as in RunningOnValgrind(). */ + static volatile double slowdown = 0.0; + ANNOTATE_BENIGN_RACE(&slowdown, "safe hack"); + int local_slowdown = slowdown; + if (local_slowdown == 0.0) { + char *env = getenv("VALGRIND_SLOWDOWN"); + slowdown = local_slowdown = env ? 
atof(env) : 50.0; + } + return local_slowdown; +} diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h index dae1a14..ceb9809 100644 --- a/src/base/dynamic_annotations.h +++ b/src/base/dynamic_annotations.h @@ -457,6 +457,19 @@ void AnnotateFlushState(const char *file, int line); */ int RunningOnValgrind(void); +/* ValgrindSlowdown returns: + * 1.0, if (RunningOnValgrind() == 0) + * 50.0, if (RunningOnValgrind() != 0 && getenv("VALGRIND_SLOWDOWN") == NULL) + * atof(getenv("VALGRIND_SLOWDOWN")) otherwise + This function can be used to scale timeout values: + EXAMPLE: + for (;;) { + DoExpensiveBackgroundTask(); + SleepForSeconds(5 * ValgrindSlowdown()); + } + */ +double ValgrindSlowdown(); + #ifdef __cplusplus } #endif diff --git a/src/base/stl_allocator.h b/src/base/stl_allocator.h index b0ddc68..7b0b8ca 100644 --- a/src/base/stl_allocator.h +++ b/src/base/stl_allocator.h @@ -45,7 +45,7 @@ // Generic allocator class for STL objects // that uses a given type-less allocator Alloc, which must provide: // static void* Alloc::Allocate(size_t size); -// static void Alloc::Free(void* ptr); +// static void Alloc::Free(void* ptr, size_t size); // // STL_Allocator provides the same thread-safety // guarantees as MyAlloc. @@ -82,7 +82,7 @@ class STL_Allocator { RAW_DCHECK((n * sizeof(T)) / sizeof(T) == n, "n is too big to allocate"); return static_cast(Alloc::Allocate(n * sizeof(T))); } - void deallocate(pointer p, size_type /*n*/) { Alloc::Free(p); } + void deallocate(pointer p, size_type n) { Alloc::Free(p, n * sizeof(T)); } size_type max_size() const { return size_t(-1) / sizeof(T); } diff --git a/src/central_freelist.cc b/src/central_freelist.cc index 5b7dfbb..da498e6 100644 --- a/src/central_freelist.cc +++ b/src/central_freelist.cc @@ -57,9 +57,22 @@ void CentralFreeList::ReleaseListToSpans(void* start) { } } -void CentralFreeList::ReleaseToSpans(void* object) { +// MapObjectToSpan should logically be part of ReleaseToSpans. But +// this triggers an optimization bug in gcc 4.5.0. Moving to a +// separate function, and making sure that function isn't inlined, +// seems to fix the problem. It also should be fixed for gcc 4.5.1. +static +#if __GNUC__ == 4 && __GNUC_MINOR__ == 5 && __GNUC_PATCHLEVEL__ == 0 +__attribute__ ((noinline)) +#endif +Span* MapObjectToSpan(void* object) { const PageID p = reinterpret_cast(object) >> kPageShift; Span* span = Static::pageheap()->GetDescriptor(p); + return span; +} + +void CentralFreeList::ReleaseToSpans(void* object) { + Span* span = MapObjectToSpan(object); ASSERT(span != NULL); ASSERT(span->refcount > 0); @@ -266,7 +279,8 @@ void CentralFreeList::Populate() { Span* span; { SpinLockHolder h(Static::pageheap_lock()); - span = Static::pageheap()->New(npages, size_class_, kPageSize); + span = Static::pageheap()->New(npages); + if (span) Static::pageheap()->RegisterSizeClass(span, size_class_); } if (span == NULL) { MESSAGE("tcmalloc: allocation failed", npages << kPageShift); @@ -274,6 +288,12 @@ void CentralFreeList::Populate() { return; } ASSERT(span->length == npages); + // Cache sizeclass info eagerly. Locking is not necessary. + // (Instead of being eager, we could just replace any stale info + // about this span, but that seems to be no better in practice.) + for (int i = 0; i < npages; i++) { + Static::pageheap()->CacheSizeClass(span->start + i, size_class_); + } // Split the block into pieces and add to the free-list // TODO: coloring of objects to avoid cache conflicts? 
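The stl_allocator.h hunk above tightens the contract on STL_Allocator's
type-less Alloc parameter: Free() now receives the original allocation size
along with the pointer (which is why heap-checker.cc and memory_region_map.h
grow matching two-argument Free() overloads further down in this patch). A
minimal sketch of a conforming backend follows; MallocBackedAlloc is a
hypothetical name for illustration, not code from this change:

  #include <stdlib.h>              // malloc, free, size_t
  #include <vector>
  #include "base/stl_allocator.h"  // STL_Allocator (path within this tree)

  // Type-less backend satisfying the updated contract.
  struct MallocBackedAlloc {
    static void* Allocate(size_t size) { return malloc(size); }
    // The byte count is now passed through on deallocation; a backend that
    // tracks or validates per-allocation sizes can consume it. A plain
    // malloc-based backend simply ignores it.
    static void Free(void* ptr, size_t /*size*/) { free(ptr); }
  };

  // Example container type whose storage is routed through the backend:
  typedef std::vector<int, STL_Allocator<int, MallocBackedAlloc> > IntVector;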
diff --git a/src/common.h b/src/common.h index b0278eb..5226998 100644 --- a/src/common.h +++ b/src/common.h @@ -54,16 +54,45 @@ typedef uintptr_t Length; // Configuration //------------------------------------------------------------------- -// Not all possible combinations of the following parameters make -// sense. In particular, if kMaxSize increases, you may have to -// increase kNumClasses as well. +// Using large pages speeds up the execution at a cost of larger memory use. +// Deallocation may speed up by a factor as the page map gets 8x smaller, so +// lookups in the page map result in fewer L2 cache misses, which translates to +// speedup for application/platform combinations with high L2 cache pressure. +// As the number of size classes increases with large pages, we increase +// the thread cache allowance to avoid passing more free ranges to and from +// central lists. Also, larger pages are less likely to get freed. +// These two factors cause a bounded increase in memory use. + +#if defined(TCMALLOC_LARGE_PAGES) +static const size_t kPageShift = 15; +static const size_t kNumClasses = 95; +static const size_t kMaxThreadCacheSize = 4 << 20; +#else static const size_t kPageShift = 12; +static const size_t kNumClasses = 61; +static const size_t kMaxThreadCacheSize = 2 << 20; +#endif + static const size_t kPageSize = 1 << kPageShift; static const size_t kMaxSize = 8u * kPageSize; static const size_t kAlignment = 8; -static const size_t kNumClasses = 61; static const size_t kLargeSizeClass = 0; +// Default bound on the total amount of thread caches. +static const size_t kDefaultOverallThreadCacheSize = 8u * kMaxThreadCacheSize; + +// Lower bound on the per-thread cache sizes +static const size_t kMinThreadCacheSize = kMaxSize * 2; + +// The number of bytes one ThreadCache will steal from another when +// the first ThreadCache is forced to Scavenge(), delaying the +// next call to Scavenge for this thread. +static const size_t kStealAmount = 1 << 16; + +// The number of times that a deallocation can cause a freelist to +// go over its max_length() before shrinking max_length(). +static const int kMaxOverages = 3; + // Maximum length we allow a per-thread free-list to have before we // move objects from it into the corresponding central free-list. We // want this big to avoid locking the central free-list too often. It @@ -115,8 +144,10 @@ class SizeMap { // ... // 32768 (32768 + 127 + (120<<7)) / 128 376 static const int kMaxSmallSize = 1024; - unsigned char class_array_[377]; - + static const size_t kClassArraySize = + (((1 << kPageShift) * 8u + 127 + (120 << 7)) >> 7) + 1; + unsigned char class_array_[kClassArraySize]; + // Compute index of the class_array[] entry for a given size static inline int ClassIndex(int s) { ASSERT(0 <= s); diff --git a/src/config.h.in b/src/config.h.in index 49bbf0d..a1d5c68 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -1,5 +1,10 @@ /* src/config.h.in. Generated from configure.ac by autoheader. 
*/ + +#ifndef GOOGLE_PERFTOOLS_CONFIG_H_ +#define GOOGLE_PERFTOOLS_CONFIG_H_ + + /* Define to 1 if compiler supports __builtin_stack_pointer */ #undef HAVE_BUILTIN_STACK_POINTER @@ -240,3 +245,5 @@ #include "windows/mingw.h" #endif +#endif /* #ifndef GOOGLE_PERFTOOLS_CONFIG_H_ */ + diff --git a/src/google/malloc_extension.h b/src/google/malloc_extension.h index fc272c9..9c05897 100644 --- a/src/google/malloc_extension.h +++ b/src/google/malloc_extension.h @@ -145,21 +145,22 @@ class PERFTOOLS_DLL_DECL MallocExtension { // Number of bytes used across all thread caches. // This property is not writable. // - // "tcmalloc.slack_bytes" - // Number of bytes allocated from system, but not currently in - // use by malloced objects. I.e., bytes available for - // allocation without needing more bytes from system. It is - // the sum of pageheap_free_bytes and pageheap_unmapped_bytes. - // This property is not writable. - // // "tcmalloc.pageheap_free_bytes" - // Number of bytes in free, mapped pages in pageheap - // This property is not writable. + // Number of bytes in free, mapped pages in page heap. These + // bytes can be used to fulfill allocation requests. They + // always count towards virtual memory usage, and unless the + // underlying memory is swapped out by the OS, they also count + // towards physical memory usage. This property is not writable. // // "tcmalloc.pageheap_unmapped_bytes" - // Number of bytes in free, unmapped pages in pageheap - // This property is not writable. - // + // Number of bytes in free, unmapped pages in page heap. + // These are bytes that have been released back to the OS, + // possibly by one of the MallocExtension "Release" calls. + // They can be used to fulfill allocation requests, but + // typically incur a page fault. They always count towards + // virtual memory usage, and depending on the OS, typically + // do not count towards physical memory usage. This property + // is not writable. // ------------------------------------------------------------------- // Get the named "property"'s value. 
Returns true if the property diff --git a/src/heap-checker.cc b/src/heap-checker.cc index 2779c97..2b0b854 100644 --- a/src/heap-checker.cc +++ b/src/heap-checker.cc @@ -304,6 +304,9 @@ class HeapLeakChecker::Allocator { if (p) alloc_count_ -= 1; LowLevelAlloc::Free(p); } + static void Free(void* p, size_t /* n */) { + Free(p); + } // destruct, free, and make *p to be NULL template static void DeleteAndNull(T** p) { (*p)->~T(); diff --git a/src/internal_logging.h b/src/internal_logging.h index 731b2d9..0cb9ba2 100644 --- a/src/internal_logging.h +++ b/src/internal_logging.h @@ -119,9 +119,7 @@ do { \ #ifndef NDEBUG #define ASSERT(cond) CHECK_CONDITION(cond) #else -#define ASSERT(cond) \ - do { \ - } while (0 && (cond)) +#define ASSERT(cond) ((void) 0) #endif // Print into buffer diff --git a/src/linked_list.h b/src/linked_list.h index 638174b..4b0af1b 100644 --- a/src/linked_list.h +++ b/src/linked_list.h @@ -36,6 +36,8 @@ #ifndef TCMALLOC_LINKED_LIST_H_ #define TCMALLOC_LINKED_LIST_H_ +#include + namespace tcmalloc { inline void *SLL_Next(void *t) { diff --git a/src/memory_region_map.h b/src/memory_region_map.h index f88c7b9..776abb3 100644 --- a/src/memory_region_map.h +++ b/src/memory_region_map.h @@ -231,7 +231,7 @@ class MemoryRegionMap { static void *Allocate(size_t n) { return LowLevelAlloc::AllocWithArena(n, arena_); } - static void Free(const void *p) { + static void Free(const void *p, size_t /* n */) { LowLevelAlloc::Free(const_cast(p)); } }; diff --git a/src/page_heap.cc b/src/page_heap.cc index 7bfeea4..1e63cb9 100644 --- a/src/page_heap.cc +++ b/src/page_heap.cc @@ -61,64 +61,49 @@ PageHeap::PageHeap() } } -// Returns the minimum number of pages necessary to ensure that an -// allocation of size n can be aligned to the given alignment. -static Length AlignedAllocationSize(Length n, size_t alignment) { - ASSERT(alignment >= kPageSize); - return n + tcmalloc::pages(alignment - kPageSize); -} - -Span* PageHeap::New(Length n, size_t sc, size_t align) { +Span* PageHeap::New(Length n) { ASSERT(Check()); ASSERT(n > 0); - if (align < kPageSize) { - align = kPageSize; - } - - Length aligned_size = AlignedAllocationSize(n, align); - // Find first size >= n that has a non-empty list - for (Length s = aligned_size; s < kMaxPages; s++) { + for (Length s = n; s < kMaxPages; s++) { Span* ll = &free_[s].normal; // If we're lucky, ll is non-empty, meaning it has a suitable span. if (!DLL_IsEmpty(ll)) { ASSERT(ll->next->location == Span::ON_NORMAL_FREELIST); - return Carve(ll->next, n, sc, align); + return Carve(ll->next, n); } // Alternatively, maybe there's a usable returned span. ll = &free_[s].returned; if (!DLL_IsEmpty(ll)) { ASSERT(ll->next->location == Span::ON_RETURNED_FREELIST); - return Carve(ll->next, n, sc, align); + return Carve(ll->next, n); } // Still no luck, so keep looking in larger classes. } - Span* result = AllocLarge(n, sc, align); + Span* result = AllocLarge(n); if (result != NULL) return result; // Grow the heap and try again - if (!GrowHeap(aligned_size)) { + if (!GrowHeap(n)) { ASSERT(Check()); return NULL; } - return AllocLarge(n, sc, align); + return AllocLarge(n); } -Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) { - // Find the best span (closest to n in size). +Span* PageHeap::AllocLarge(Length n) { + // find the best span (closest to n in size). // The following loops implements address-ordered best-fit. 
Span *best = NULL; - Length aligned_size = AlignedAllocationSize(n, align); - // Search through normal list for (Span* span = large_.normal.next; span != &large_.normal; span = span->next) { - if (span->length >= aligned_size) { + if (span->length >= n) { if ((best == NULL) || (span->length < best->length) || ((span->length == best->length) && (span->start < best->start))) { @@ -132,7 +117,7 @@ Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) { for (Span* span = large_.returned.next; span != &large_.returned; span = span->next) { - if (span->length >= aligned_size) { + if (span->length >= n) { if ((best == NULL) || (span->length < best->length) || ((span->length == best->length) && (span->start < best->start))) { @@ -142,18 +127,19 @@ Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) { } } - return best == NULL ? NULL : Carve(best, n, sc, align); + return best == NULL ? NULL : Carve(best, n); } Span* PageHeap::Split(Span* span, Length n) { ASSERT(0 < n); ASSERT(n < span->length); - ASSERT((span->location != Span::IN_USE) || span->sizeclass == 0); + ASSERT(span->location == Span::IN_USE); + ASSERT(span->sizeclass == 0); Event(span, 'T', n); const int extra = span->length - n; Span* leftover = NewSpan(span->start + n, extra); - leftover->location = span->location; + ASSERT(leftover->location == Span::IN_USE); Event(leftover, 'U', extra); RecordSpan(leftover); pagemap_.set(span->start + n - 1, span); // Update map from pageid to span @@ -162,44 +148,25 @@ Span* PageHeap::Split(Span* span, Length n) { return leftover; } -Span* PageHeap::Carve(Span* span, Length n, size_t sc, size_t align) { +Span* PageHeap::Carve(Span* span, Length n) { ASSERT(n > 0); ASSERT(span->location != Span::IN_USE); - ASSERT(align >= kPageSize); - - Length align_pages = align >> kPageShift; + const int old_location = span->location; RemoveFromFreeList(span); - - if (span->start & (align_pages - 1)) { - Length skip_for_alignment = align_pages - (span->start & (align_pages - 1)); - Span* aligned = Split(span, skip_for_alignment); - PrependToFreeList(span); // Skip coalescing - no candidates possible - span = aligned; - } + span->location = Span::IN_USE; + Event(span, 'A', n); const int extra = span->length - n; ASSERT(extra >= 0); if (extra > 0) { - Span* leftover = Split(span, n); - PrependToFreeList(leftover); + Span* leftover = NewSpan(span->start + n, extra); + leftover->location = old_location; + Event(leftover, 'S', extra); + RecordSpan(leftover); + PrependToFreeList(leftover); // Skip coalescing - no candidates possible + span->length = n; + pagemap_.set(span->start + n - 1, span); } - - span->location = Span::IN_USE; - span->sizeclass = sc; - Event(span, 'A', n); - - // Cache sizeclass info eagerly. Locking is not necessary. - // (Instead of being eager, we could just replace any stale info - // about this span, but that seems to be no better in practice.) 
- CacheSizeClass(span->start, sc); - - if (sc != kLargeSizeClass) { - for (Length i = 1; i < n; i++) { - pagemap_.set(span->start + i, span); - CacheSizeClass(span->start + i, sc); - } - } - ASSERT(Check()); return span; } @@ -351,6 +318,18 @@ Length PageHeap::ReleaseAtLeastNPages(Length num_pages) { return released_pages; } +void PageHeap::RegisterSizeClass(Span* span, size_t sc) { + // Associate span object with all interior pages as well + ASSERT(span->location == Span::IN_USE); + ASSERT(GetDescriptor(span->start) == span); + ASSERT(GetDescriptor(span->start+span->length-1) == span); + Event(span, 'C', sc); + span->sizeclass = sc; + for (Length i = 1; i < span->length-1; i++) { + pagemap_.set(span->start+i, span); + } +} + static double MB(uint64_t bytes) { return bytes / 1048576.0; } diff --git a/src/page_heap.h b/src/page_heap.h index de36266..74030d2 100644 --- a/src/page_heap.h +++ b/src/page_heap.h @@ -93,49 +93,21 @@ class PERFTOOLS_DLL_DECL PageHeap { public: PageHeap(); - // Allocate a run of "n" pages. Returns NULL if out of memory. - // Caller should not pass "n == 0" -- instead, n should have been - // rounded up already. The span will be used for allocating objects - // with the specifled sizeclass sc (sc must be zero for large - // objects). The first page of the span will be aligned to the value - // specified by align, which must be a power of two. - Span* New(Length n, size_t sc, size_t align); + // Allocate a run of "n" pages. Returns zero if out of memory. + // Caller should not pass "n == 0" -- instead, n should have + // been rounded up already. + Span* New(Length n); // Delete the span "[p, p+n-1]". // REQUIRES: span was returned by earlier call to New() and // has not yet been deleted. void Delete(Span* span); - // Gets either the size class of addr, if it is a small object, or it's span. - // Return: - // if addr is invalid: - // leave *out_sc and *out_span unchanged and return false; - // if addr is valid and has a small size class: - // *out_sc = the size class - // *out_span = - // return true - // if addr is valid and has a large size class: - // *out_sc = kLargeSizeClass - // *out_span = the span pointer - // return true - bool GetSizeClassOrSpan(void* addr, size_t* out_sc, Span** out_span) { - const PageID p = reinterpret_cast(addr) >> kPageShift; - size_t cl = GetSizeClassIfCached(p); - Span* span = NULL; - - if (cl != kLargeSizeClass) { - ASSERT(cl == GetDescriptor(p)->sizeclass); - } else { - span = GetDescriptor(p); - if (!span) { - return false; - } - cl = span->sizeclass; - } - *out_span = span; - *out_sc = cl; - return true; - } + // Mark an allocated span as being used for small objects of the + // specified size-class. + // REQUIRES: span was returned by an earlier call to New() + // and has not yet been deleted. + void RegisterSizeClass(Span* span, size_t sc); // Split an allocated span into two spans: one of length "n" pages // followed by another span of length "span->length - n" pages. @@ -143,29 +115,14 @@ class PERFTOOLS_DLL_DECL PageHeap { // Returns a pointer to the second span. // // REQUIRES: "0 < n < span->length" - // REQUIRES: a) the span is free or b) sizeclass == 0 + // REQUIRES: span->location == IN_USE + // REQUIRES: span->sizeclass == 0 Span* Split(Span* span, Length n); // Return the descriptor for the specified page. Returns NULL if // this PageID was not allocated previously. 
inline Span* GetDescriptor(PageID p) const { - Span* ret = reinterpret_cast(pagemap_.get(p)); -#ifndef NDEBUG - if (ret != NULL && ret->location == Span::IN_USE) { - size_t cl = GetSizeClassIfCached(p); - // Three cases: - // - The object is not cached - // - The object is cached correctly - // - It is a large object and we're not looking at the first - // page. This happens in coalescing. - ASSERT(cl == kLargeSizeClass || cl == ret->sizeclass || - (ret->start != p && ret->sizeclass == kLargeSizeClass)); - // If the object is sampled, it must have be kLargeSizeClass - ASSERT(ret->sizeclass == kLargeSizeClass || !ret->sample); - } -#endif - - return ret; + return reinterpret_cast(pagemap_.get(p)); } // Dump state to stderr @@ -266,7 +223,7 @@ class PERFTOOLS_DLL_DECL PageHeap { // length exactly "n" and mark it as non-free so it can be returned // to the client. After all that, decrease free_pages_ by n and // return span. - Span* Carve(Span* span, Length n, size_t sc, size_t align); + Span* Carve(Span* span, Length n); void RecordSpan(Span* span) { pagemap_.set(span->start, span); @@ -277,7 +234,7 @@ class PERFTOOLS_DLL_DECL PageHeap { // Allocate a large span of length == n. If successful, returns a // span of exactly the specified length. Else, returns NULL. - Span* AllocLarge(Length n, size_t sc, size_t align); + Span* AllocLarge(Length n); // Coalesce span with neighboring spans if possible, prepend to // appropriate free list, and adjust stats. diff --git a/src/pprof b/src/pprof index 8aff380..8d4ddcf 100755 --- a/src/pprof +++ b/src/pprof @@ -215,7 +215,7 @@ Call-graph Options: (i.e. direct leak generators) more visible Miscellaneous: - --tools= Prefix for object tool pathnames + --tools=[,...] \$PATH for object tool pathnames --test Run unit tests --help This message --version Version information @@ -4331,18 +4331,27 @@ sub ConfigureTool { my $tool = shift; my $path; - if ($main::opt_tools ne "") { - # Use a prefix specified by the --tools option... - $path = $main::opt_tools . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by --tools $main::opt_tools\n"); - } - } elsif (exists $ENV{"PPROF_TOOLS"} && - $ENV{"PPROF_TOOLS"} ne "") { - #... or specified with the PPROF_TOOLS environment variable... - $path = $ENV{"PPROF_TOOLS"} . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by PPROF_TOOLS=$ENV{PPROF_TOOLS}\n"); + # --tools (or $PPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools) { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$PPROF_TOOLS) '$tools'\n"); } } else { # ... 
otherwise use the version that exists in the same directory as diff --git a/src/sampler.cc b/src/sampler.cc index cbc6ab4..a13544a 100755 --- a/src/sampler.cc +++ b/src/sampler.cc @@ -42,16 +42,15 @@ using std::min; // The approximate gap in bytes between sampling actions. // I.e., we take one sample approximately once every // tcmalloc_sample_parameter bytes of allocation -// i.e. about once every 512KB. +// i.e. about once every 512KB if value is 1<<19. #ifdef NO_TCMALLOC_SAMPLES DEFINE_int64(tcmalloc_sample_parameter, 0, "Unused: code is compiled with NO_TCMALLOC_SAMPLES"); #else DEFINE_int64(tcmalloc_sample_parameter, - EnvToInt64("TCMALLOC_SAMPLE_PARAMETER", 1<<19), + EnvToInt64("TCMALLOC_SAMPLE_PARAMETER", 0), "The approximate gap in bytes between sampling actions. " - "This must be between 1 and 1<<58."); -// Note: there are other places in this file where the number 19 occurs. + "This must be between 1 and 2^58."); #endif namespace tcmalloc { diff --git a/src/span.h b/src/span.h index b3483ca..ab9a796 100644 --- a/src/span.h +++ b/src/span.h @@ -60,10 +60,6 @@ struct Span { int value[64]; #endif - void* start_ptr() { - return reinterpret_cast(start << kPageShift); - } - // What freelist the span is on: IN_USE if on none, or normal or returned enum { IN_USE, ON_NORMAL_FREELIST, ON_RETURNED_FREELIST }; }; diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc index 011fc91..13d2c23 100644 --- a/src/tcmalloc.cc +++ b/src/tcmalloc.cc @@ -469,6 +469,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) { "MALLOC: %12" PRIu64 " Spans in use\n" "MALLOC: %12" PRIu64 " Thread heaps in use\n" "MALLOC: %12" PRIu64 " (%7.1f MB) Metadata allocated\n" + "MALLOC: %12" PRIu64 " Tcmalloc page size\n" "------------------------------------------------\n", stats.pageheap.system_bytes, stats.pageheap.system_bytes / MB, bytes_in_use, bytes_in_use / MB, @@ -479,7 +480,8 @@ static void DumpStats(TCMalloc_Printer* out, int level) { stats.thread_bytes, stats.thread_bytes / MB, uint64_t(Static::span_allocator()->inuse()), uint64_t(ThreadCache::HeapsInUse()), - stats.metadata_bytes, stats.metadata_bytes / MB); + stats.metadata_bytes, stats.metadata_bytes / MB, + uint64_t(kPageSize)); } static void PrintStats(int level) { @@ -637,9 +639,8 @@ class TCMallocImplementation : public MallocExtension { } if (strcmp(name, "tcmalloc.slack_bytes") == 0) { - // We assume that bytes in the page heap are not fragmented too - // badly, and are therefore available for allocation without - // growing the pageheap system byte count. + // Kept for backwards compatibility. Now defined externally as: + // pageheap_free_bytes + pageheap_unmapped_bytes. 
diff --git a/src/span.h b/src/span.h
index b3483ca..ab9a796 100644
--- a/src/span.h
+++ b/src/span.h
@@ -60,10 +60,6 @@ struct Span {
   int value[64];
 #endif
 
-  void* start_ptr() {
-    return reinterpret_cast<void*>(start << kPageShift);
-  }
-
   // What freelist the span is on: IN_USE if on none, or normal or returned
   enum { IN_USE, ON_NORMAL_FREELIST, ON_RETURNED_FREELIST };
 };
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 011fc91..13d2c23 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -469,6 +469,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
             "MALLOC: %12" PRIu64 " Spans in use\n"
             "MALLOC: %12" PRIu64 " Thread heaps in use\n"
             "MALLOC: %12" PRIu64 " (%7.1f MB) Metadata allocated\n"
+            "MALLOC: %12" PRIu64 " Tcmalloc page size\n"
             "------------------------------------------------\n",
             stats.pageheap.system_bytes, stats.pageheap.system_bytes / MB,
             bytes_in_use, bytes_in_use / MB,
@@ -479,7 +480,8 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
             stats.thread_bytes, stats.thread_bytes / MB,
             uint64_t(Static::span_allocator()->inuse()),
             uint64_t(ThreadCache::HeapsInUse()),
-            stats.metadata_bytes, stats.metadata_bytes / MB);
+            stats.metadata_bytes, stats.metadata_bytes / MB,
+            uint64_t(kPageSize));
 }
 
 static void PrintStats(int level) {
@@ -637,9 +639,8 @@ class TCMallocImplementation : public MallocExtension {
     }
 
     if (strcmp(name, "tcmalloc.slack_bytes") == 0) {
-      // We assume that bytes in the page heap are not fragmented too
-      // badly, and are therefore available for allocation without
-      // growing the pageheap system byte count.
+      // Kept for backwards compatibility.  Now defined externally as:
+      //   pageheap_free_bytes + pageheap_unmapped_bytes.
       SpinLockHolder l(Static::pageheap_lock());
       PageHeap::Stats stats = Static::pageheap()->stats();
       *value = stats.free_bytes + stats.unmapped_bytes;
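
For readers checking the new definition, a minimal sketch of how the
relationship can be observed through the public MallocExtension interface;
the two pageheap_* property names are assumed to be the ones this release
documents, and exact equality only holds while no other thread is mutating
the heap:

    // Sketch only: verify that tcmalloc.slack_bytes equals
    // pageheap_free_bytes + pageheap_unmapped_bytes.
    #include <assert.h>
    #include <stddef.h>
    #include <google/malloc_extension.h>

    void CheckSlackBytes() {
      size_t slack = 0, free_bytes = 0, unmapped = 0;
      MallocExtension* ext = MallocExtension::instance();
      if (ext->GetNumericProperty("tcmalloc.slack_bytes", &slack) &&
          ext->GetNumericProperty("tcmalloc.pageheap_free_bytes", &free_bytes) &&
          ext->GetNumericProperty("tcmalloc.pageheap_unmapped_bytes", &unmapped)) {
        assert(slack == free_bytes + unmapped);  // the new external definition
      }
    }
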
@@ -798,25 +799,22 @@ static TCMallocGuard module_enter_exit_hook;
 // Helpers for the exported routines below
 //-------------------------------------------------------------------
 
-static inline void* CheckedMallocResult(void *result) {
-  Span* fetched_span;
-  size_t cl;
-
-  if (result != NULL) {
-    ASSERT(Static::pageheap()->GetSizeClassOrSpan(result, &cl, &fetched_span));
-  }
+static inline bool CheckCachedSizeClass(void *ptr) {
+  PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cached_value = Static::pageheap()->GetSizeClassIfCached(p);
+  return cached_value == 0 ||
+      cached_value == Static::pageheap()->GetDescriptor(p)->sizeclass;
+}
 
+static inline void* CheckedMallocResult(void *result) {
+  ASSERT(result == NULL || CheckCachedSizeClass(result));
   return result;
 }
 
 static inline void* SpanToMallocResult(Span *span) {
-  Span* fetched_span = NULL;
-  size_t cl = 0;
-  ASSERT(Static::pageheap()->GetSizeClassOrSpan(span->start_ptr(),
-                                                &cl, &fetched_span));
-  ASSERT(cl == kLargeSizeClass);
-  ASSERT(span == fetched_span);
-  return span->start_ptr();
+  Static::pageheap()->CacheSizeClass(span->start, 0);
+  return
+      CheckedMallocResult(reinterpret_cast<void*>(span->start << kPageShift));
 }
 
 static void* DoSampledAllocation(size_t size) {
@@ -827,8 +825,7 @@
   SpinLockHolder h(Static::pageheap_lock());
 
   // Allocate span
-  Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size),
-                                       kLargeSizeClass, kPageSize);
+  Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size));
   if (span == NULL) {
     return NULL;
   }
@@ -919,7 +916,7 @@ inline void* do_malloc_pages(ThreadCache* heap, size_t size) {
     report_large = should_report_large(num_pages);
   } else {
     SpinLockHolder h(Static::pageheap_lock());
-    Span* span = Static::pageheap()->New(num_pages, kLargeSizeClass, kPageSize);
+    Span* span = Static::pageheap()->New(num_pages);
     result = (span == NULL ? NULL : SpanToMallocResult(span));
     report_large = should_report_large(num_pages);
   }
@@ -975,22 +972,28 @@ static inline ThreadCache* GetCacheIfPresent() {
 inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
   if (ptr == NULL) return;
   ASSERT(Static::pageheap() != NULL);  // Should not call free() before malloc()
-  Span* span;
-  size_t cl;
-
-  if (!Static::pageheap()->GetSizeClassOrSpan(ptr, &cl, &span)) {
-    // result can be false because the pointer passed in is invalid
-    // (not something returned by malloc or friends), or because the
-    // pointer was allocated with some other allocator besides
-    // tcmalloc.  The latter can happen if tcmalloc is linked in via
-    // a dynamic library, but is not listed last on the link line.
-    // In that case, libraries after it on the link line will
-    // allocate with libc malloc, but free with tcmalloc's free.
-    (*invalid_free_fn)(ptr);  // Decide how to handle the bad free request
-    return;
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  Span* span = NULL;
+  size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+
+  if (cl == 0) {
+    span = Static::pageheap()->GetDescriptor(p);
+    if (!span) {
+      // span can be NULL because the pointer passed in is invalid
+      // (not something returned by malloc or friends), or because the
+      // pointer was allocated with some other allocator besides
+      // tcmalloc.  The latter can happen if tcmalloc is linked in via
+      // a dynamic library, but is not listed last on the link line.
+      // In that case, libraries after it on the link line will
+      // allocate with libc malloc, but free with tcmalloc's free.
+      (*invalid_free_fn)(ptr);  // Decide how to handle the bad free request
+      return;
+    }
+    cl = span->sizeclass;
+    Static::pageheap()->CacheSizeClass(p, cl);
   }
-
-  if (cl != kLargeSizeClass) {
+  if (cl != 0) {
+    ASSERT(!Static::pageheap()->GetDescriptor(p)->sample);
     ThreadCache* heap = GetCacheIfPresent();
     if (heap != NULL) {
       heap->Deallocate(ptr, cl);
@@ -1001,7 +1004,8 @@ inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
     }
   } else {
     SpinLockHolder h(Static::pageheap_lock());
-    ASSERT(span != NULL && ptr == span->start_ptr());
+    ASSERT(reinterpret_cast<uintptr_t>(ptr) % kPageSize == 0);
+    ASSERT(span != NULL && span->start == p);
     if (span->sample) {
       tcmalloc::DLL_Remove(span);
       Static::stacktrace_allocator()->Delete(
@@ -1021,17 +1025,20 @@ inline size_t GetSizeWithCallback(void* ptr,
                                   size_t (*invalid_getsize_fn)(void*)) {
   if (ptr == NULL)
     return 0;
-
-  Span* span;
-  size_t cl;
-  if (!Static::pageheap()->GetSizeClassOrSpan(ptr, &cl, &span)) {
-    return (*invalid_getsize_fn)(ptr);
-  }
-
-  if (cl != kLargeSizeClass) {
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+  if (cl != 0) {
     return Static::sizemap()->ByteSizeForClass(cl);
   } else {
-    return span->length << kPageShift;
+    Span *span = Static::pageheap()->GetDescriptor(p);
+    if (span == NULL) {  // means we do not own this memory
+      return (*invalid_getsize_fn)(ptr);
+    } else if (span->sizeclass != 0) {
+      Static::pageheap()->CacheSizeClass(p, span->sizeclass);
+      return Static::sizemap()->ByteSizeForClass(span->sizeclass);
+    } else {
+      return span->length << kPageShift;
+    }
   }
 }
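
do_free_with_callback and GetSizeWithCallback above now share one pattern:
consult the per-page size-class cache, fall back to the span descriptor, and
repopulate the cache on the way out.  A self-contained sketch of that pattern
follows; the std::map containers stand in for tcmalloc's radix-tree pagemap
and its cache, and all names are illustrative:

    #include <stdint.h>
    #include <stddef.h>
    #include <map>

    typedef uintptr_t PageID;
    struct Span { size_t sizeclass; size_t length; };  // stand-in, not the real Span

    static const size_t kPageShift = 13;               // 8K pages (default build)
    static std::map<PageID, size_t> g_size_class_cache;
    static std::map<PageID, Span*> g_pagemap;

    // Returns the allocated size for ptr, or 0 if we do not own this memory.
    size_t LookupSize(void* ptr, size_t (*class_to_size)(size_t)) {
      const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
      std::map<PageID, size_t>::iterator it = g_size_class_cache.find(p);
      if (it != g_size_class_cache.end() && it->second != 0) {
        return class_to_size(it->second);              // fast path: cache hit
      }
      std::map<PageID, Span*>::iterator sp = g_pagemap.find(p);
      if (sp == g_pagemap.end()) return 0;             // invalid or foreign pointer
      Span* span = sp->second;
      if (span->sizeclass != 0) {                      // small object: cache it
        g_size_class_cache[p] = span->sizeclass;
        return class_to_size(span->sizeclass);
      }
      return span->length << kPageShift;               // large object: span bytes
    }
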
@@ -1126,10 +1133,39 @@ void* do_memalign(size_t align, size_t size) {
 
   // We will allocate directly from the page heap
   SpinLockHolder h(Static::pageheap_lock());
 
-  // Any page-level allocation will be fine
-  Span* span = Static::pageheap()->New(tcmalloc::pages(size),
-                                       kLargeSizeClass, align);
-  return span == NULL ? NULL : SpanToMallocResult(span);
+  if (align <= kPageSize) {
+    // Any page-level allocation will be fine
+    // TODO: We could put the rest of this page in the appropriate
+    // TODO: cache but it does not seem worth it.
+    Span* span = Static::pageheap()->New(tcmalloc::pages(size));
+    return span == NULL ? NULL : SpanToMallocResult(span);
+  }
+
+  // Allocate extra pages and carve off an aligned portion
+  const Length alloc = tcmalloc::pages(size + align);
+  Span* span = Static::pageheap()->New(alloc);
+  if (span == NULL) return NULL;
+
+  // Skip starting portion so that we end up aligned
+  Length skip = 0;
+  while ((((span->start+skip) << kPageShift) & (align - 1)) != 0) {
+    skip++;
+  }
+  ASSERT(skip < alloc);
+  if (skip > 0) {
+    Span* rest = Static::pageheap()->Split(span, skip);
+    Static::pageheap()->Delete(span);
+    span = rest;
+  }
+
+  // Skip trailing portion that we do not need to return
+  const Length needed = tcmalloc::pages(size);
+  ASSERT(span->length >= needed);
+  if (span->length > needed) {
+    Span* trailer = Static::pageheap()->Split(span, needed);
+    Static::pageheap()->Delete(trailer);
+  }
+  return SpanToMallocResult(span);
 }
 
 // Helpers for use by exported routines below:
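
The carve logic in do_memalign is worth seeing in isolation: over-allocate by
align bytes' worth of pages, skip leading pages until the start address is
aligned, then trim the tail.  A small sketch under the default build's 8K
pages (kPageShift = 13); the helper names are hypothetical:

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    typedef uintptr_t Length;
    static const size_t kPageShift = 13;
    static const size_t kPageSize = 1 << kPageShift;

    // Round a byte count up to whole pages (mirrors tcmalloc::pages).
    static Length pages(size_t bytes) {
      return (bytes + kPageSize - 1) >> kPageShift;
    }

    // How many leading pages of a span starting at page 'start' must be
    // skipped so the result is align-aligned?  (align is a power of two
    // and a multiple of kPageSize, as in the patched do_memalign.)
    static Length AlignmentSkip(uintptr_t start, size_t align) {
      Length skip = 0;
      while ((((start + skip) << kPageShift) & (align - 1)) != 0) {
        skip++;
      }
      return skip;
    }

    int main() {
      assert(pages(1) == 1 && pages(kPageSize + 1) == 2);
      // A span starting at page 3 (address 0x6000) needs one extra page
      // to reach 16K alignment; page 4 (0x8000) is already aligned.
      assert(AlignmentSkip(3, 2 * kPageSize) == 1);
      assert(AlignmentSkip(4, 2 * kPageSize) == 0);
      return 0;
    }
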
diff --git a/src/tests/frag_unittest.cc b/src/tests/frag_unittest.cc
index 08494b4..160c41c 100644
--- a/src/tests/frag_unittest.cc
+++ b/src/tests/frag_unittest.cc
@@ -44,13 +44,16 @@
 #endif
 #include <vector>
 #include "base/logging.h"
+#include "common.h"
 #include <google/malloc_extension.h>
 
 using std::vector;
 
 int main(int argc, char** argv) {
-  static const int kAllocSize = 36<<10;    // Bigger than tcmalloc page size
-  static const int kTotalAlloc = 400 << 20;  // Allocate 400MB in total
+  // Make kAllocSize larger than tcmalloc page size.
+  static const int kAllocSize = 9 << kPageShift;
+  // Allocate 400MB in total.
+  static const int kTotalAlloc = 400 << 20;
   static const int kAllocIterations = kTotalAlloc / kAllocSize;
 
   // Allocate lots of objects
diff --git a/src/tests/page_heap_test.cc b/src/tests/page_heap_test.cc
index fd444da..9120b78 100644
--- a/src/tests/page_heap_test.cc
+++ b/src/tests/page_heap_test.cc
@@ -26,7 +26,7 @@ static void TestPageHeap_Stats() {
   CheckStats(ph, 0, 0, 0);
 
   // Allocate a span 's1'
-  tcmalloc::Span* s1 = ph->New(256, kLargeSizeClass, kPageSize);
+  tcmalloc::Span* s1 = ph->New(256);
   CheckStats(ph, 256, 0, 0);
 
   // Split span 's1' into 's1', 's2'.  Delete 's2'
diff --git a/src/tests/testutil.cc b/src/tests/testutil.cc
index f2b8592..745de99 100644
--- a/src/tests/testutil.cc
+++ b/src/tests/testutil.cc
@@ -80,7 +80,7 @@ struct FunctionAndId {
   int id;
 };
 
-#if defined(NO_THREADS) || !(defined(HAVE_PTHREADS) || defined(_WIN32))
+#if defined(NO_THREADS) || !(defined(HAVE_PTHREAD) || defined(_WIN32))
 
 extern "C" void RunThread(void (*fn)()) {
   (*fn)();
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
index 64f4deb..8d31117 100644
--- a/src/thread_cache.cc
+++ b/src/thread_cache.cc
@@ -42,7 +42,8 @@ using std::min;
 using std::max;
 
 DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
-             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES", 16<<20),
+             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
+                        kDefaultOverallThreadCacheSize),
              "Bound on the total amount of bytes allocated to "
              "thread caches.  This bound is not strict, so it is possible "
              "for the cache to go over this bound in certain circumstances. ");
diff --git a/src/thread_cache.h b/src/thread_cache.h
index 1165447..352c683 100644
--- a/src/thread_cache.h
+++ b/src/thread_cache.h
@@ -63,9 +63,6 @@ inline bool KernelSupportsTLS() {
 
 class ThreadCache {
  public:
-  // Default bound on the total amount of thread caches.
-  static const size_t kDefaultOverallThreadCacheSize = 16 << 20;
-
   // All ThreadCache objects are kept in a linked list (for stats collection)
   ThreadCache* next_;
   ThreadCache* prev_;
@@ -213,19 +210,6 @@ class ThreadCache {
     }
   };
 
-  // The number of bytes one ThreadCache will steal from another when
-  // the first ThreadCache is forced to Scavenge(), delaying the
-  // next call to Scavenge for this thread.
-  static const size_t kStealAmount = 1 << 16;
-
-  // Lower and upper bounds on the per-thread cache sizes
-  static const size_t kMinThreadCacheSize = kMaxSize * 2;  //kStealAmount;
-  static const size_t kMaxThreadCacheSize = 2 << 20;
-
-  // The number of times that a deallocation can cause a freelist to
-  // go over its max_length() before shrinking max_length().
-  static const int kMaxOverages = 3;
-
   // Gets and returns an object from the central cache, and, if possible,
   // also adds some objects of that size class to this thread cache.
   void* FetchFromCentralCache(size_t cl, size_t byte_size);
diff --git a/src/windows/config.h b/src/windows/config.h
index b5d9bb6..6d6f771 100644
--- a/src/windows/config.h
+++ b/src/windows/config.h
@@ -154,7 +154,7 @@
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
-/* Define to 1 if you have the <sys/ucontext.h> header file. */
+/* <sys/ucontext.h> is broken on redhat 7 */
 #undef HAVE_SYS_UCONTEXT_H
 
 /* Define to 1 if you have the <sys/wait.h> header file. */
@@ -172,6 +172,9 @@
 /* Define to 1 if you have the <unwind.h> header file. */
 #undef HAVE_UNWIND_H
 
+/* Define to 1 if you have the <valgrind.h> header file. */
+#undef HAVE_VALGRIND_H
+
 /* define if your compiler has __attribute__ */
 #undef HAVE___ATTRIBUTE__
 
diff --git a/src/windows/mingw.h b/src/windows/mingw.h
index e69b5da..747b285 100644
--- a/src/windows/mingw.h
+++ b/src/windows/mingw.h
@@ -45,10 +45,23 @@
 # define PERFTOOLS_NO_ALIGNED_MALLOC 1
 #endif
 
+// This must be defined before the windows.h is included.  We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
+#ifndef _WIN32_WINNT
+# define _WIN32_WINNT 0x0501
+#endif
+
 #include "windows/port.h"
 
 #define HAVE_SNPRINTF 1
 
+// Some mingw distributions have a pthreads wrapper, but it doesn't
+// work as well as native windows spinlocks (at least for us).  So
+// pretend the pthreads wrapper doesn't exist, even when it does.
+#undef HAVE_PTHREAD
+
 #endif  /* __MINGW32__ */
 
 #endif  /* GOOGLE_PERFTOOLS_WINDOWS_MINGW_H_ */
-- 
cgit v1.2.1