From cb7393cbe2d737009001fd9d30dad568bac7a3d8 Mon Sep 17 00:00:00 2001
From: csilvers
Date: Mon, 21 Jun 2010 15:59:56 +0000
Subject: * Default to not sampling in tcmalloc (csilvers)
 * Add -DTCMALLOC_LARGE_PAGES: better perf for some workloads (rus)
 * Extend pprof --tools to allow per-tool configs (csilvers)
 * Have STL_Allocator pass on # bytes to free (richardfang)
 * Add a header guard to config.h (csilvers)
 * DOC: Clean up documentation around tcmalloc.slack_bytes (fikes)
 * DOC: Document ProfilerFlush, ProfilerStartWithOptions (csilvers)
 * PORTING: Work around a gcc 4.5.0 optimization bug (csilvers)
 * PORTING: Use -fno-builtin-malloc and friends when compiling tcmalloc
 * PORTING: Define _WIN32_WINNT high enough for mingw (csilvers)
 * PORTING: Work around libtool bug getting deps wrong in some cases
 * Update README.windows to emphasize $IncludeDir more (csilvers)
 * Rename README.windows to README_windows.txt (csilvers)

git-svn-id: http://gperftools.googlecode.com/svn/trunk@95 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
---
 Makefile.am                    |  64 +++++++++++++++----
 Makefile.in                    | 103 ++++++++++++++++++++++---------
 README                         |  69 ++++++++++++---------
 configure                      |   9 +++
 configure.ac                   |  14 +++++
 doc/cpuprofile.html            |   5 ++
 doc/tcmalloc.html              |  34 +++++++++--
 src/base/dynamic_annotations.c |  17 ++++++
 src/base/dynamic_annotations.h |  13 ++++
 src/base/stl_allocator.h       |   4 +-
 src/central_freelist.cc        |  24 +++++++-
 src/common.h                   |  43 +++++++++++--
 src/config.h.in                |   7 +++
 src/google/malloc_extension.h  |  25 ++++----
 src/heap-checker.cc            |   3 +
 src/internal_logging.h         |   4 +-
 src/linked_list.h              |   2 +
 src/memory_region_map.h        |   2 +-
 src/page_heap.cc               |  97 ++++++++++++-----------------
 src/page_heap.h                |  71 +++++----------------
 src/pprof                      |  35 +++++++----
 src/sampler.cc                 |   7 +--
 src/span.h                     |   4 --
 src/tcmalloc.cc                | 136 ++++++++++++++++++++++---------
 src/tests/frag_unittest.cc     |   7 ++-
 src/tests/page_heap_test.cc    |   2 +-
 src/tests/testutil.cc          |   2 +-
 src/thread_cache.cc            |   3 +-
 src/thread_cache.h             |  16 -----
 src/windows/config.h           |   5 +-
 src/windows/mingw.h            |  13 ++++
 31 files changed, 532 insertions(+), 308 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 73635db..8395013 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,9 +17,17 @@ endif !WITH_STACK_TRACE
 
 # This is mostly based on configure options
 AM_CXXFLAGS =
 
-# These are good warnings to turn on by default,
+# These are good warnings to turn on by default. We also tell gcc
+# that malloc, free, realloc, mmap, etc. are not builtins (these flags
+# are supported since gcc 3.1.1). gcc doesn't think most of them are
+# builtins now in any case, but it's best to be explicit in case that
+# changes one day. gcc ignores functions it doesn't understand.
 if GCC
-AM_CXXFLAGS += -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare
+AM_CXXFLAGS += -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare \
+               -fno-builtin-malloc -fno-builtin-free -fno-builtin-realloc \
+               -fno-builtin-calloc -fno-builtin-cfree \
+               -fno-builtin-memalign -fno-builtin-posix_memalign \
+               -fno-builtin-valloc -fno-builtin-pvalloc
 endif GCC
 
 # The -no-undefined flag allows libtool to generate shared libraries for
@@ -96,7 +104,7 @@ docdir = $(prefix)/share/doc/$(PACKAGE)-$(VERSION)
 
 # Add your documentation files (in doc/) in addition to these
 # top-level boilerplate files. Also add a TODO file if you have one.
# We'll add to this later, on a library-by-library basis -dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README README.windows \ +dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README README_windows.txt \ TODO # The libraries (.so's) you want to install @@ -400,7 +408,7 @@ libtcmalloc_minimal_la_SOURCES = $(TCMALLOC_CC) $(TCMALLOC_MINIMAL_INCLUDES) libtcmalloc_minimal_la_CXXFLAGS = -DNO_TCMALLOC_SAMPLES \ $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_minimal_internal.la +libtcmalloc_minimal_la_LIBADD = libtcmalloc_minimal_internal.la $(PTHREAD_LIBS) # For windows, we're playing around with trying to do some stacktrace # support even with libtcmalloc_minimal. For everyone else, though, @@ -442,6 +450,13 @@ tcmalloc_minimal_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ $(TCMALLOC_UNITTEST_INCLUDES) tcmalloc_minimal_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_minimal_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. tcmalloc_minimal_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) \ liblogging.la $(PTHREAD_LIBS) @@ -750,13 +765,13 @@ libtcmalloc_internal_la_SOURCES = $(libtcmalloc_minimal_internal_la_SOURCES) \ libtcmalloc_internal_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG \ $(AM_CXXFLAGS) $(NO_EXCEPTIONS) libtcmalloc_internal_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_internal_la_LIBADD = $(PTHREAD_LIBS) libstacktrace.la +libtcmalloc_internal_la_LIBADD = libstacktrace.la $(PTHREAD_LIBS) lib_LTLIBRARIES += libtcmalloc.la libtcmalloc_la_SOURCES = $(TCMALLOC_CC) $(TCMALLOC_INCLUDES) libtcmalloc_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_internal.la +libtcmalloc_la_LIBADD = libtcmalloc_internal.la $(PTHREAD_LIBS) if WITH_HEAP_CHECKER # heap-checker-bcad is last, in hopes its global ctor will run first. @@ -789,6 +804,13 @@ tcmalloc_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ $(TCMALLOC_UNITTEST_INCLUDES) tcmalloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. 
tcmalloc_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) # This makes sure it's safe to link in both tcmalloc and @@ -803,6 +825,13 @@ tcmalloc_both_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ tcmalloc_both_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_both_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) if WITH_CPU_PROFILER +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ libprofiler.la liblogging.la $(PTHREAD_LIBS) else @@ -822,6 +851,10 @@ raw_printer_test_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) raw_printer_test_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) raw_printer_test_LDADD = $(LIBTCMALLOC) $(PTHREAD_LIBS) +# sampler_test and sampling_test both require sampling to be turned +# on, which it's not by default. Use the "standard" value of 2^19. +TESTS_ENVIRONMENT += TCMALLOC_SAMPLE_PARAMETER=524288 + TESTS += sampler_test WINDOWS_PROJECTS += vsprojects/sampler_test/sampler_test.vcproj sampler_test_SOURCES = src/tests/sampler_test.cc \ @@ -909,8 +942,14 @@ heap_checker_unittest_SOURCES = src/tests/heap-checker_unittest.cc \ $(HEAP_CHECKER_UNITTEST_INCLUDES) heap_checker_unittest_CXXFLAGS = -g $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) heap_checker_unittest_LDFLAGS = -g $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) -# tcmalloc has to be specified last! -heap_checker_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la $(LIBTCMALLOC) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. +heap_checker_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) endif WITH_HEAP_CHECKER @@ -1003,9 +1042,12 @@ noinst_PROGRAMS += heap-checker_debug_unittest heap_checker_debug_unittest_SOURCES = $(heap_checker_unittest_SOURCES) heap_checker_debug_unittest_CXXFLAGS = $(heap_checker_unittest_CXXFLAGS) heap_checker_debug_unittest_LDFLAGS = $(heap_checker_unittest_LDFLAGS) -# tcmalloc has to be specified last! -heap_checker_debug_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la \ - libtcmalloc_debug.la +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. 
+heap_checker_debug_unittest_LDADD = libtcmalloc_debug.la liblogging.la \ + $(PTHREAD_LIBS) endif WITH_HEAP_CHECKER endif WITH_DEBUGALLOC diff --git a/Makefile.in b/Makefile.in index a717bed..0e51024 100644 --- a/Makefile.in +++ b/Makefile.in @@ -46,8 +46,17 @@ build_triplet = @build@ host_triplet = @host@ @WITH_STACK_TRACE_FALSE@am__append_1 = -DNO_TCMALLOC_SAMPLES -# These are good warnings to turn on by default, -@GCC_TRUE@am__append_2 = -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare +# These are good warnings to turn on by default. We also tell gcc +# that malloc, free, realloc, mmap, etc. are not builtins (these flags +# are supported since gcc 3.1.1). gcc doesn't think most of them are +# builtins now in any case, but it's best to be explicit in case that +# changes one day. gcc ignores functions it doesn't understand. +@GCC_TRUE@am__append_2 = -Wall -Wwrite-strings -Woverloaded-virtual -Wno-sign-compare \ +@GCC_TRUE@ -fno-builtin-malloc -fno-builtin-free -fno-builtin-realloc \ +@GCC_TRUE@ -fno-builtin-calloc -fno-builtin-cfree \ +@GCC_TRUE@ -fno-builtin-memalign -fno-builtin-posix_memalign \ +@GCC_TRUE@ -fno-builtin-valloc -fno-builtin-pvalloc + # These are x86-specific, having to do with frame-pointers. In # particular, some x86_64 systems do not insert frame pointers by @@ -152,11 +161,15 @@ bin_PROGRAMS = @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ raw_printer_test \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ sampler_test \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ sampling_test.sh$(EXEEXT) -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_35 = vsprojects/sampler_test/sampler_test.vcproj + +# sampler_test and sampling_test both require sampling to be turned +# on, which it's not by default. Use the "standard" value of 2^19. # These unittests often need to run binaries. They're in the current dir -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_36 = BINDIR=. \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_35 = TCMALLOC_SAMPLE_PARAMETER=524288 \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ BINDIR=. 
\ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ TMPDIR=/tmp/perftools +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_36 = vsprojects/sampler_test/sampler_test.vcproj @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__append_37 = $(sampling_test_sh_SOURCES) # This is the sub-program used by sampling_test.sh @@ -345,8 +358,8 @@ libsysinfo_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ am_libsysinfo_la_OBJECTS = sysinfo.lo $(am__objects_1) libsysinfo_la_OBJECTS = $(am_libsysinfo_la_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_DEPENDENCIES = \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__libtcmalloc_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \ src/internal_logging.h src/system-alloc.h \ src/packed-cache-inl.h src/base/spinlock.h \ @@ -394,8 +407,8 @@ libtcmalloc_la_OBJECTS = $(am_libtcmalloc_la_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am_libtcmalloc_la_rpath = -rpath \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(libdir) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__DEPENDENCIES_3 = \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc_internal.la \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) @WITH_CPU_PROFILER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_and_profiler_la_DEPENDENCIES = $(am__DEPENDENCIES_3) am__libtcmalloc_and_profiler_la_SOURCES_DIST = src/tcmalloc.cc \ src/common.h src/internal_logging.h src/system-alloc.h \ @@ -486,8 +499,8 @@ libtcmalloc_debug_la_OBJECTS = $(am_libtcmalloc_debug_la_OBJECTS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am_libtcmalloc_debug_la_rpath = -rpath \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(libdir) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_DEPENDENCIES = \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) \ -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libstacktrace.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libstacktrace.la \ +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__libtcmalloc_internal_la_SOURCES_DIST = src/common.cc \ src/internal_logging.cc src/system-alloc.cc \ src/memfs_malloc.cc src/central_freelist.cc src/page_heap.cc \ @@ -550,8 +563,8 @@ am__objects_21 = libtcmalloc_internal_la-common.lo \ libtcmalloc_internal_la_OBJECTS = \ $(am_libtcmalloc_internal_la_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am_libtcmalloc_internal_la_rpath = -libtcmalloc_minimal_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ - libtcmalloc_minimal_internal.la +libtcmalloc_minimal_la_DEPENDENCIES = libtcmalloc_minimal_internal.la \ + $(am__DEPENDENCIES_1) am__libtcmalloc_minimal_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \ src/internal_logging.h src/system-alloc.h \ src/packed-cache-inl.h src/base/spinlock.h \ @@ -574,8 +587,8 @@ am__libtcmalloc_minimal_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \ am_libtcmalloc_minimal_la_OBJECTS = $(am__objects_22) \ $(am__objects_20) libtcmalloc_minimal_la_OBJECTS = $(am_libtcmalloc_minimal_la_OBJECTS) -am__DEPENDENCIES_4 = $(am__DEPENDENCIES_1) \ - libtcmalloc_minimal_internal.la +am__DEPENDENCIES_4 = libtcmalloc_minimal_internal.la \ + $(am__DEPENDENCIES_1) @WITH_DEBUGALLOC_TRUE@libtcmalloc_minimal_debug_la_DEPENDENCIES = \ @WITH_DEBUGALLOC_TRUE@ $(am__DEPENDENCIES_4) am__libtcmalloc_minimal_debug_la_SOURCES_DIST = \ @@ 
-782,9 +795,9 @@ am__heap_checker_debug_unittest_SOURCES_DIST = \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@am_heap_checker_debug_unittest_OBJECTS = $(am__objects_27) heap_checker_debug_unittest_OBJECTS = \ $(am_heap_checker_debug_unittest_OBJECTS) -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_DEPENDENCIES = $(am__DEPENDENCIES_1) \ +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_DEPENDENCIES = libtcmalloc_debug.la \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ liblogging.la \ -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ libtcmalloc_debug.la +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__heap_checker_debug_unittest_sh_SOURCES_DIST = \ src/tests/heap-checker_unittest.sh am_heap_checker_debug_unittest_sh_OBJECTS = @@ -803,8 +816,8 @@ heap_checker_unittest_OBJECTS = $(am_heap_checker_unittest_OBJECTS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__DEPENDENCIES_6 = \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libtcmalloc.la @WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_DEPENDENCIES = \ -@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_1) liblogging.la \ -@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_6) +@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_6) liblogging.la \ +@WITH_HEAP_CHECKER_TRUE@ $(am__DEPENDENCIES_1) am__heap_checker_unittest_sh_SOURCES_DIST = \ src/tests/heap-checker_unittest.sh am_heap_checker_unittest_sh_OBJECTS = @@ -1329,7 +1342,7 @@ man1dir = $(mandir)/man1 NROFF = nroff MANS = $(dist_man_MANS) am__dist_doc_DATA_DIST = AUTHORS COPYING ChangeLog INSTALL NEWS README \ - README.windows TODO doc/index.html doc/designstyle.css \ + README_windows.txt TODO doc/index.html doc/designstyle.css \ doc/pprof_remote_servers.html doc/tcmalloc.html \ doc/overview.gif doc/pageheap.gif doc/spanmap.gif \ doc/threadheap.gif doc/t-test1.times.txt \ @@ -1597,7 +1610,7 @@ noinst_HEADERS = src/google/tcmalloc.h.in # one day we figure it out. Regardless, installing the dot files isn't the # end of the world. dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README \ - README.windows TODO doc/index.html doc/designstyle.css \ + README_windows.txt TODO doc/index.html doc/designstyle.css \ $(am__append_12) doc/tcmalloc.html doc/overview.gif \ doc/pageheap.gif doc/spanmap.gif doc/threadheap.gif \ doc/t-test1.times.txt \ @@ -1658,7 +1671,7 @@ WINDOWS_PROJECTS = google-perftools.sln \ vsprojects/realloc_unittest/realloc_unittest.vcproj \ vsprojects/stack_trace_table_test/stack_trace_table_test.vcproj \ vsprojects/thread_dealloc_unittest/thread_dealloc_unittest.vcproj \ - $(am__append_35) + $(am__append_36) # unittests you want to run when people type 'make check'. # Note: tests cannot take any arguments! @@ -1690,7 +1703,7 @@ TESTS = low_level_alloc_unittest atomicops_unittest $(am__append_11) \ # TESTS_ENVIRONMENT sets environment variables for when you run unittest. # We always get "srcdir" set for free. # We'll add to this later, on a library-by-library basis. 
-TESTS_ENVIRONMENT = $(am__append_13) $(am__append_36) +TESTS_ENVIRONMENT = $(am__append_13) $(am__append_35) # All script tests should be added here noinst_SCRIPTS = $(am__append_16) $(am__append_25) $(am__append_37) \ $(am__append_40) $(am__append_43) $(am__append_58) @@ -1916,7 +1929,7 @@ libtcmalloc_minimal_la_CXXFLAGS = -DNO_TCMALLOC_SAMPLES \ $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) -libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_minimal_internal.la +libtcmalloc_minimal_la_LIBADD = libtcmalloc_minimal_internal.la $(PTHREAD_LIBS) @MINGW_FALSE@LIBTCMALLOC_MINIMAL = libtcmalloc_minimal.la # For windows, we're playing around with trying to do some stacktrace @@ -1930,6 +1943,13 @@ tcmalloc_minimal_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ tcmalloc_minimal_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_minimal_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. tcmalloc_minimal_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) \ liblogging.la $(PTHREAD_LIBS) @@ -2098,7 +2118,7 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(NO_EXCEPTIONS) \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__append_31) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_LDFLAGS = $(PTHREAD_CFLAGS) -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_LIBADD = $(PTHREAD_LIBS) libstacktrace.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_internal_la_LIBADD = libstacktrace.la $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_SOURCES = \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(TCMALLOC_CC) \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(TCMALLOC_INCLUDES) \ @@ -2108,7 +2128,7 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(AM_CXXFLAGS) \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(am__append_32) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_LDFLAGS = $(PTHREAD_CFLAGS) -@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_LIBADD = $(PTHREAD_LIBS) libtcmalloc_internal.la +@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_la_LIBADD = libtcmalloc_internal.la $(PTHREAD_LIBS) @WITH_HEAP_CHECKER_FALSE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@HEAP_CHECKER_SOURCES = # heap-checker-bcad is last, in hopes its global ctor will run first. @@ -2131,6 +2151,13 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. 
+# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_both_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ src/tests/testutil.h src/tests/testutil.cc \ @@ -2141,6 +2168,13 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_CPU_PROFILER_FALSE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ @WITH_CPU_PROFILER_FALSE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ liblogging.la $(PTHREAD_LIBS) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. @WITH_CPU_PROFILER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ @WITH_CPU_PROFILER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ libprofiler.la liblogging.la $(PTHREAD_LIBS) @@ -2193,8 +2227,14 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_CXXFLAGS = -g $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) @WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_LDFLAGS = -g $(PTHREAD_CFLAGS) $(TCMALLOC_FLAGS) -# tcmalloc has to be specified last! -@WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la $(LIBTCMALLOC) +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +# We also put pthreads after tcmalloc, because some pthread +# implementations define their own malloc, and we need to go on the +# first linkline to make sure our malloc 'wins'. +@WITH_HEAP_CHECKER_TRUE@heap_checker_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@libtcmalloc_debug_la_SOURCES = src/debugallocation.cc $(HEAP_CHECKER_SOURCES) \ @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@ $(TCMALLOC_INCLUDES) @@ -2227,9 +2267,12 @@ thread_dealloc_unittest_LDADD = $(LIBTCMALLOC_MINIMAL) $(PTHREAD_LIBS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_SOURCES = $(heap_checker_unittest_SOURCES) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_CXXFLAGS = $(heap_checker_unittest_CXXFLAGS) @WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_LDFLAGS = $(heap_checker_unittest_LDFLAGS) -# tcmalloc has to be specified last! -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_LDADD = $(PTHREAD_LIBS) liblogging.la \ -@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ libtcmalloc_debug.la +# We want libtcmalloc last on the link line, but due to a bug in +# libtool involving convenience libs, they need to come last on the +# link line in order to get dependency ordering right. 
This is ok: +# convenience libraries are .a's, so tcmalloc is still the last .so. +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@heap_checker_debug_unittest_LDADD = libtcmalloc_debug.la liblogging.la \ +@WITH_DEBUGALLOC_TRUE@@WITH_HEAP_CHECKER_TRUE@ $(PTHREAD_LIBS) ### ------- CPU profiler diff --git a/README b/README index c7ef8c5..40ac8dc 100644 --- a/README +++ b/README @@ -4,34 +4,6 @@ There are known issues with some perftools functionality on x86_64 systems. See 64-BIT ISSUES, below. -CPU PROFILER ------------- -See doc/cpu-profiler.html for information about how to use the CPU -profiler and analyze its output. - -As a quick-start, do the following after installing this package: - -1) Link your executable with -lprofiler -2) Run your executable with the CPUPROFILE environment var set: - $ CPUPROFILE=/tmp/prof.out [binary args] -3) Run pprof to analyze the CPU usage - $ pprof /tmp/prof.out # -pg-like text output - $ pprof --gv /tmp/prof.out # really cool graphical output - -There are other environment variables, besides CPUPROFILE, you can set -to adjust the cpu-profiler behavior; cf "ENVIRONMENT VARIABLES" below. - -The CPU profiler is available on all unix-based systems we've tested; -see INSTALL for more details. It is not currently available on Windows. - -NOTE: CPU profiling doesn't work after fork (unless you immediately - do an exec()-like call afterwards). Furthermore, if you do - fork, and the child calls exit(), it may corrupt the profile - data. You can use _exit() to work around this. We hope to have - a fix for both problems in the next release of perftools - (hopefully perftools 1.2). - - TCMALLOC -------- Just link in -ltcmalloc or -ltcmalloc_minimal to get the advantages of @@ -42,6 +14,19 @@ tcmalloc functionality is available on all systems we've tested; see INSTALL for more details. See README.windows for instructions on using tcmalloc on Windows. +NOTE: When compiling with programs with gcc, that you plan to link +with libtcmalloc, it's safest to pass in the flags + + -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free + +when compiling. gcc makes some optimizations assuming it is using its +own, built-in malloc; that assumption obviously isn't true with +tcmalloc. In practice, we haven't seen any problems with this, but +the expected risk is highest for users who register their own malloc +hooks with tcmalloc (using google/malloc_hook.h). The risk is lowest +for folks who use tcmalloc_minimal (or, of course, who pass in the +above flags :-) ). + HEAP PROFILER ------------- @@ -96,6 +81,34 @@ The heap checker is only available on Linux at this time; see INSTALL for more details. +CPU PROFILER +------------ +See doc/cpu-profiler.html for information about how to use the CPU +profiler and analyze its output. + +As a quick-start, do the following after installing this package: + +1) Link your executable with -lprofiler +2) Run your executable with the CPUPROFILE environment var set: + $ CPUPROFILE=/tmp/prof.out [binary args] +3) Run pprof to analyze the CPU usage + $ pprof /tmp/prof.out # -pg-like text output + $ pprof --gv /tmp/prof.out # really cool graphical output + +There are other environment variables, besides CPUPROFILE, you can set +to adjust the cpu-profiler behavior; cf "ENVIRONMENT VARIABLES" below. + +The CPU profiler is available on all unix-based systems we've tested; +see INSTALL for more details. It is not currently available on Windows. 
+ +NOTE: CPU profiling doesn't work after fork (unless you immediately + do an exec()-like call afterwards). Furthermore, if you do + fork, and the child calls exit(), it may corrupt the profile + data. You can use _exit() to work around this. We hope to have + a fix for both problems in the next release of perftools + (hopefully perftools 1.2). + + EVERYTHING IN ONE ----------------- If you want the CPU profiler, heap profiler, and heap leak-checker to diff --git a/configure b/configure index 04e143b..9a3048c 100755 --- a/configure +++ b/configure @@ -21533,6 +21533,15 @@ _ACEOF $as_echo "#define PERFTOOLS_DLL_DECL /**/" >>confdefs.h +# In theory, config.h files shouldn't need a header guard, but we do, +# because we (maybe) #include windows/mingw.h from within config.h, +# and it #includes other .h files. These all have header guards, so +# the end result is if config.h is #included twice, its #undefs get +# evaluated twice, but all the ones in mingw.h/etc only get evaluated +# once, potentially causing trouble. c.f. +# http://code.google.com/p/google-perftools/issues/detail?id=246 + + # MinGW uses autoconf, but also needs the windows shim routines # (since it doesn't have its own support for, say, pthreads). # This requires us to #include a special header file, and also to diff --git a/configure.ac b/configure.ac index e93cdc4..adbb2e5 100644 --- a/configure.ac +++ b/configure.ac @@ -301,6 +301,18 @@ AC_DEFINE(PERFTOOLS_DLL_DECL,, internally, to compile the DLL, and every DLL source file #includes "config.h" before anything else.]) +# In theory, config.h files shouldn't need a header guard, but we do, +# because we (maybe) #include windows/mingw.h from within config.h, +# and it #includes other .h files. These all have header guards, so +# the end result is if config.h is #included twice, its #undefs get +# evaluated twice, but all the ones in mingw.h/etc only get evaluated +# once, potentially causing trouble. c.f. +# http://code.google.com/p/google-perftools/issues/detail?id=246 +AH_TOP([ +#ifndef GOOGLE_PERFTOOLS_CONFIG_H_ +#define GOOGLE_PERFTOOLS_CONFIG_H_ +]) + # MinGW uses autoconf, but also needs the windows shim routines # (since it doesn't have its own support for, say, pthreads). # This requires us to #include a special header file, and also to @@ -309,6 +321,8 @@ AH_BOTTOM([ #ifdef __MINGW32__ #include "windows/mingw.h" #endif + +#endif /* #ifndef GOOGLE_PERFTOOLS_CONFIG_H_ */ ]) AM_CONDITIONAL(MINGW, expr $host : '.*-mingw' >/dev/null 2>&1) diff --git a/doc/cpuprofile.html b/doc/cpuprofile.html index 3d2b4cc..f029e78 100644 --- a/doc/cpuprofile.html +++ b/doc/cpuprofile.html @@ -71,6 +71,11 @@ CPUPROFILE with the child's process id).

 For security reasons, CPU profiling will not write to a file -- and
 is thus not usable -- for setuid programs.
 
+See the include-file google/profiler.h for
+advanced-use functions, including ProfilerFlush() and
+ProfilerStartWithOptions().
+
+
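As a minimal sketch of how these new entry points are meant to be driven
(ProfilerStartWithOptions(), ProfilerFlush(), ProfilerStop(), and struct
ProfilerOptions with its filter_in_thread/filter_in_thread_arg fields are
the google/profiler.h declarations; the callback name, its include-everything
behavior, and the output path below are illustrative assumptions):

  #include <stddef.h>           // for NULL
  #include <google/profiler.h>  // ProfilerOptions, ProfilerStartWithOptions, ...

  // Filter callback: a nonzero return means "include this thread's samples".
  static int ProfileAllThreads(void* /*arg*/) { return 1; }

  int main() {
    struct ProfilerOptions options;
    options.filter_in_thread = &ProfileAllThreads;
    options.filter_in_thread_arg = NULL;
    if (ProfilerStartWithOptions("/tmp/myprog.prof", &options)) {
      // ... run the code to be profiled ...
      ProfilerFlush();  // force buffered samples out to the profile file
      ProfilerStop();
    }
    return 0;
  }

The resulting /tmp/myprog.prof can then be fed to pprof exactly as in the
CPUPROFILE quick-start shown in the README above.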

 Modifying Runtime Behavior

You can more finely control the behavior of the CPU profiler via diff --git a/doc/tcmalloc.html b/doc/tcmalloc.html index 4f60f92..9d7ab7e 100644 --- a/doc/tcmalloc.html +++ b/doc/tcmalloc.html @@ -462,11 +462,15 @@ environment variables.

TCMALLOC_SAMPLE_PARAMETER - default: 524288 + default: 0 The approximate gap between sampling actions. That is, we take one sample approximately once every tcmalloc_sample_parmeter bytes of allocation. + This sampled heap information is available via + MallocExtension::GetHeapSample() or + MallocExtension::ReadStackTraces(). A reasonable + value is 524288. @@ -674,12 +678,34 @@ you can access them with a call like + + tcmalloc.pageheap_free_bytes + + Number of bytes in free, mapped pages in page heap. These bytes + can be used to fulfill allocation requests. They always count + towards virtual memory usage, and unless the underlying memory is + swapped out by the OS, they also count towards physical memory + usage. + + + + + tcmalloc.pageheap_unmapped_bytes + + Number of bytes in free, unmapped pages in page heap. These are + bytes that have been released back to the OS, possibly by one of + the MallocExtension "Release" calls. They can be used to fulfill + allocation requests, but typically incur a page fault. They + always count towards virtual memory usage, and depending on the + OS, typically do not count towards physical memory usage. + + + tcmalloc.slack_bytes - A measure of memory fragmentation (how much memory is reserved by - TCMalloc but unlikely to ever be able to serve an allocation - request). + Sum of pageheap_free_bytes and pageheap_unmapped_bytes. Provided + for backwards compatibility only. Do not use. diff --git a/src/base/dynamic_annotations.c b/src/base/dynamic_annotations.c index cdefaa7..bddd693 100644 --- a/src/base/dynamic_annotations.c +++ b/src/base/dynamic_annotations.c @@ -141,8 +141,25 @@ int RunningOnValgrind(void) { static volatile int running_on_valgrind = -1; /* C doesn't have thread-safe initialization of statics, and we don't want to depend on pthread_once here, so hack it. */ + ANNOTATE_BENIGN_RACE(&running_on_valgrind, "safe hack"); int local_running_on_valgrind = running_on_valgrind; if (local_running_on_valgrind == -1) running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind(); return local_running_on_valgrind; } + +/* See the comments in dynamic_annotations.h */ +double ValgrindSlowdown() { + if (RunningOnValgrind() == 0) { + return 1.0; + } + /* Same initialization hack as in RunningOnValgrind(). */ + static volatile double slowdown = 0.0; + ANNOTATE_BENIGN_RACE(&slowdown, "safe hack"); + int local_slowdown = slowdown; + if (local_slowdown == 0.0) { + char *env = getenv("VALGRIND_SLOWDOWN"); + slowdown = local_slowdown = env ? 
atof(env) : 50.0; + } + return local_slowdown; +} diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h index dae1a14..ceb9809 100644 --- a/src/base/dynamic_annotations.h +++ b/src/base/dynamic_annotations.h @@ -457,6 +457,19 @@ void AnnotateFlushState(const char *file, int line); */ int RunningOnValgrind(void); +/* ValgrindSlowdown returns: + * 1.0, if (RunningOnValgrind() == 0) + * 50.0, if (RunningOnValgrind() != 0 && getenv("VALGRIND_SLOWDOWN") == NULL) + * atof(getenv("VALGRIND_SLOWDOWN")) otherwise + This function can be used to scale timeout values: + EXAMPLE: + for (;;) { + DoExpensiveBackgroundTask(); + SleepForSeconds(5 * ValgrindSlowdown()); + } + */ +double ValgrindSlowdown(); + #ifdef __cplusplus } #endif diff --git a/src/base/stl_allocator.h b/src/base/stl_allocator.h index b0ddc68..7b0b8ca 100644 --- a/src/base/stl_allocator.h +++ b/src/base/stl_allocator.h @@ -45,7 +45,7 @@ // Generic allocator class for STL objects // that uses a given type-less allocator Alloc, which must provide: // static void* Alloc::Allocate(size_t size); -// static void Alloc::Free(void* ptr); +// static void Alloc::Free(void* ptr, size_t size); // // STL_Allocator provides the same thread-safety // guarantees as MyAlloc. @@ -82,7 +82,7 @@ class STL_Allocator { RAW_DCHECK((n * sizeof(T)) / sizeof(T) == n, "n is too big to allocate"); return static_cast(Alloc::Allocate(n * sizeof(T))); } - void deallocate(pointer p, size_type /*n*/) { Alloc::Free(p); } + void deallocate(pointer p, size_type n) { Alloc::Free(p, n * sizeof(T)); } size_type max_size() const { return size_t(-1) / sizeof(T); } diff --git a/src/central_freelist.cc b/src/central_freelist.cc index 5b7dfbb..da498e6 100644 --- a/src/central_freelist.cc +++ b/src/central_freelist.cc @@ -57,9 +57,22 @@ void CentralFreeList::ReleaseListToSpans(void* start) { } } -void CentralFreeList::ReleaseToSpans(void* object) { +// MapObjectToSpan should logically be part of ReleaseToSpans. But +// this triggers an optimization bug in gcc 4.5.0. Moving to a +// separate function, and making sure that function isn't inlined, +// seems to fix the problem. It also should be fixed for gcc 4.5.1. +static +#if __GNUC__ == 4 && __GNUC_MINOR__ == 5 && __GNUC_PATCHLEVEL__ == 0 +__attribute__ ((noinline)) +#endif +Span* MapObjectToSpan(void* object) { const PageID p = reinterpret_cast(object) >> kPageShift; Span* span = Static::pageheap()->GetDescriptor(p); + return span; +} + +void CentralFreeList::ReleaseToSpans(void* object) { + Span* span = MapObjectToSpan(object); ASSERT(span != NULL); ASSERT(span->refcount > 0); @@ -266,7 +279,8 @@ void CentralFreeList::Populate() { Span* span; { SpinLockHolder h(Static::pageheap_lock()); - span = Static::pageheap()->New(npages, size_class_, kPageSize); + span = Static::pageheap()->New(npages); + if (span) Static::pageheap()->RegisterSizeClass(span, size_class_); } if (span == NULL) { MESSAGE("tcmalloc: allocation failed", npages << kPageShift); @@ -274,6 +288,12 @@ void CentralFreeList::Populate() { return; } ASSERT(span->length == npages); + // Cache sizeclass info eagerly. Locking is not necessary. + // (Instead of being eager, we could just replace any stale info + // about this span, but that seems to be no better in practice.) + for (int i = 0; i < npages; i++) { + Static::pageheap()->CacheSizeClass(span->start + i, size_class_); + } // Split the block into pieces and add to the free-list // TODO: coloring of objects to avoid cache conflicts? 
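The stl_allocator.h hunk above tightens the contract on STL_Allocator's
type-less Alloc parameter: Free() now receives the original allocation size
along with the pointer (which is why heap-checker.cc and memory_region_map.h
grow matching two-argument Free() overloads further down in this patch). A
minimal sketch of a conforming backend follows; MallocBackedAlloc is a
hypothetical name for illustration, not code from this change:

  #include <stdlib.h>              // malloc, free, size_t
  #include <vector>
  #include "base/stl_allocator.h"  // STL_Allocator (path within this tree)

  // Type-less backend satisfying the updated contract.
  struct MallocBackedAlloc {
    static void* Allocate(size_t size) { return malloc(size); }
    // The byte count is now passed through on deallocation; a backend that
    // tracks or validates per-allocation sizes can consume it. A plain
    // malloc-based backend simply ignores it.
    static void Free(void* ptr, size_t /*size*/) { free(ptr); }
  };

  // Example container type whose storage is routed through the backend:
  typedef std::vector<int, STL_Allocator<int, MallocBackedAlloc> > IntVector;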
diff --git a/src/common.h b/src/common.h index b0278eb..5226998 100644 --- a/src/common.h +++ b/src/common.h @@ -54,16 +54,45 @@ typedef uintptr_t Length; // Configuration //------------------------------------------------------------------- -// Not all possible combinations of the following parameters make -// sense. In particular, if kMaxSize increases, you may have to -// increase kNumClasses as well. +// Using large pages speeds up the execution at a cost of larger memory use. +// Deallocation may speed up by a factor as the page map gets 8x smaller, so +// lookups in the page map result in fewer L2 cache misses, which translates to +// speedup for application/platform combinations with high L2 cache pressure. +// As the number of size classes increases with large pages, we increase +// the thread cache allowance to avoid passing more free ranges to and from +// central lists. Also, larger pages are less likely to get freed. +// These two factors cause a bounded increase in memory use. + +#if defined(TCMALLOC_LARGE_PAGES) +static const size_t kPageShift = 15; +static const size_t kNumClasses = 95; +static const size_t kMaxThreadCacheSize = 4 << 20; +#else static const size_t kPageShift = 12; +static const size_t kNumClasses = 61; +static const size_t kMaxThreadCacheSize = 2 << 20; +#endif + static const size_t kPageSize = 1 << kPageShift; static const size_t kMaxSize = 8u * kPageSize; static const size_t kAlignment = 8; -static const size_t kNumClasses = 61; static const size_t kLargeSizeClass = 0; +// Default bound on the total amount of thread caches. +static const size_t kDefaultOverallThreadCacheSize = 8u * kMaxThreadCacheSize; + +// Lower bound on the per-thread cache sizes +static const size_t kMinThreadCacheSize = kMaxSize * 2; + +// The number of bytes one ThreadCache will steal from another when +// the first ThreadCache is forced to Scavenge(), delaying the +// next call to Scavenge for this thread. +static const size_t kStealAmount = 1 << 16; + +// The number of times that a deallocation can cause a freelist to +// go over its max_length() before shrinking max_length(). +static const int kMaxOverages = 3; + // Maximum length we allow a per-thread free-list to have before we // move objects from it into the corresponding central free-list. We // want this big to avoid locking the central free-list too often. It @@ -115,8 +144,10 @@ class SizeMap { // ... // 32768 (32768 + 127 + (120<<7)) / 128 376 static const int kMaxSmallSize = 1024; - unsigned char class_array_[377]; - + static const size_t kClassArraySize = + (((1 << kPageShift) * 8u + 127 + (120 << 7)) >> 7) + 1; + unsigned char class_array_[kClassArraySize]; + // Compute index of the class_array[] entry for a given size static inline int ClassIndex(int s) { ASSERT(0 <= s); diff --git a/src/config.h.in b/src/config.h.in index 49bbf0d..a1d5c68 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -1,5 +1,10 @@ /* src/config.h.in. Generated from configure.ac by autoheader. 
*/ + +#ifndef GOOGLE_PERFTOOLS_CONFIG_H_ +#define GOOGLE_PERFTOOLS_CONFIG_H_ + + /* Define to 1 if compiler supports __builtin_stack_pointer */ #undef HAVE_BUILTIN_STACK_POINTER @@ -240,3 +245,5 @@ #include "windows/mingw.h" #endif +#endif /* #ifndef GOOGLE_PERFTOOLS_CONFIG_H_ */ + diff --git a/src/google/malloc_extension.h b/src/google/malloc_extension.h index fc272c9..9c05897 100644 --- a/src/google/malloc_extension.h +++ b/src/google/malloc_extension.h @@ -145,21 +145,22 @@ class PERFTOOLS_DLL_DECL MallocExtension { // Number of bytes used across all thread caches. // This property is not writable. // - // "tcmalloc.slack_bytes" - // Number of bytes allocated from system, but not currently in - // use by malloced objects. I.e., bytes available for - // allocation without needing more bytes from system. It is - // the sum of pageheap_free_bytes and pageheap_unmapped_bytes. - // This property is not writable. - // // "tcmalloc.pageheap_free_bytes" - // Number of bytes in free, mapped pages in pageheap - // This property is not writable. + // Number of bytes in free, mapped pages in page heap. These + // bytes can be used to fulfill allocation requests. They + // always count towards virtual memory usage, and unless the + // underlying memory is swapped out by the OS, they also count + // towards physical memory usage. This property is not writable. // // "tcmalloc.pageheap_unmapped_bytes" - // Number of bytes in free, unmapped pages in pageheap - // This property is not writable. - // + // Number of bytes in free, unmapped pages in page heap. + // These are bytes that have been released back to the OS, + // possibly by one of the MallocExtension "Release" calls. + // They can be used to fulfill allocation requests, but + // typically incur a page fault. They always count towards + // virtual memory usage, and depending on the OS, typically + // do not count towards physical memory usage. This property + // is not writable. // ------------------------------------------------------------------- // Get the named "property"'s value. 
Returns true if the property diff --git a/src/heap-checker.cc b/src/heap-checker.cc index 2779c97..2b0b854 100644 --- a/src/heap-checker.cc +++ b/src/heap-checker.cc @@ -304,6 +304,9 @@ class HeapLeakChecker::Allocator { if (p) alloc_count_ -= 1; LowLevelAlloc::Free(p); } + static void Free(void* p, size_t /* n */) { + Free(p); + } // destruct, free, and make *p to be NULL template static void DeleteAndNull(T** p) { (*p)->~T(); diff --git a/src/internal_logging.h b/src/internal_logging.h index 731b2d9..0cb9ba2 100644 --- a/src/internal_logging.h +++ b/src/internal_logging.h @@ -119,9 +119,7 @@ do { \ #ifndef NDEBUG #define ASSERT(cond) CHECK_CONDITION(cond) #else -#define ASSERT(cond) \ - do { \ - } while (0 && (cond)) +#define ASSERT(cond) ((void) 0) #endif // Print into buffer diff --git a/src/linked_list.h b/src/linked_list.h index 638174b..4b0af1b 100644 --- a/src/linked_list.h +++ b/src/linked_list.h @@ -36,6 +36,8 @@ #ifndef TCMALLOC_LINKED_LIST_H_ #define TCMALLOC_LINKED_LIST_H_ +#include + namespace tcmalloc { inline void *SLL_Next(void *t) { diff --git a/src/memory_region_map.h b/src/memory_region_map.h index f88c7b9..776abb3 100644 --- a/src/memory_region_map.h +++ b/src/memory_region_map.h @@ -231,7 +231,7 @@ class MemoryRegionMap { static void *Allocate(size_t n) { return LowLevelAlloc::AllocWithArena(n, arena_); } - static void Free(const void *p) { + static void Free(const void *p, size_t /* n */) { LowLevelAlloc::Free(const_cast(p)); } }; diff --git a/src/page_heap.cc b/src/page_heap.cc index 7bfeea4..1e63cb9 100644 --- a/src/page_heap.cc +++ b/src/page_heap.cc @@ -61,64 +61,49 @@ PageHeap::PageHeap() } } -// Returns the minimum number of pages necessary to ensure that an -// allocation of size n can be aligned to the given alignment. -static Length AlignedAllocationSize(Length n, size_t alignment) { - ASSERT(alignment >= kPageSize); - return n + tcmalloc::pages(alignment - kPageSize); -} - -Span* PageHeap::New(Length n, size_t sc, size_t align) { +Span* PageHeap::New(Length n) { ASSERT(Check()); ASSERT(n > 0); - if (align < kPageSize) { - align = kPageSize; - } - - Length aligned_size = AlignedAllocationSize(n, align); - // Find first size >= n that has a non-empty list - for (Length s = aligned_size; s < kMaxPages; s++) { + for (Length s = n; s < kMaxPages; s++) { Span* ll = &free_[s].normal; // If we're lucky, ll is non-empty, meaning it has a suitable span. if (!DLL_IsEmpty(ll)) { ASSERT(ll->next->location == Span::ON_NORMAL_FREELIST); - return Carve(ll->next, n, sc, align); + return Carve(ll->next, n); } // Alternatively, maybe there's a usable returned span. ll = &free_[s].returned; if (!DLL_IsEmpty(ll)) { ASSERT(ll->next->location == Span::ON_RETURNED_FREELIST); - return Carve(ll->next, n, sc, align); + return Carve(ll->next, n); } // Still no luck, so keep looking in larger classes. } - Span* result = AllocLarge(n, sc, align); + Span* result = AllocLarge(n); if (result != NULL) return result; // Grow the heap and try again - if (!GrowHeap(aligned_size)) { + if (!GrowHeap(n)) { ASSERT(Check()); return NULL; } - return AllocLarge(n, sc, align); + return AllocLarge(n); } -Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) { - // Find the best span (closest to n in size). +Span* PageHeap::AllocLarge(Length n) { + // find the best span (closest to n in size). // The following loops implements address-ordered best-fit. 
Span *best = NULL; - Length aligned_size = AlignedAllocationSize(n, align); - // Search through normal list for (Span* span = large_.normal.next; span != &large_.normal; span = span->next) { - if (span->length >= aligned_size) { + if (span->length >= n) { if ((best == NULL) || (span->length < best->length) || ((span->length == best->length) && (span->start < best->start))) { @@ -132,7 +117,7 @@ Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) { for (Span* span = large_.returned.next; span != &large_.returned; span = span->next) { - if (span->length >= aligned_size) { + if (span->length >= n) { if ((best == NULL) || (span->length < best->length) || ((span->length == best->length) && (span->start < best->start))) { @@ -142,18 +127,19 @@ Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) { } } - return best == NULL ? NULL : Carve(best, n, sc, align); + return best == NULL ? NULL : Carve(best, n); } Span* PageHeap::Split(Span* span, Length n) { ASSERT(0 < n); ASSERT(n < span->length); - ASSERT((span->location != Span::IN_USE) || span->sizeclass == 0); + ASSERT(span->location == Span::IN_USE); + ASSERT(span->sizeclass == 0); Event(span, 'T', n); const int extra = span->length - n; Span* leftover = NewSpan(span->start + n, extra); - leftover->location = span->location; + ASSERT(leftover->location == Span::IN_USE); Event(leftover, 'U', extra); RecordSpan(leftover); pagemap_.set(span->start + n - 1, span); // Update map from pageid to span @@ -162,44 +148,25 @@ Span* PageHeap::Split(Span* span, Length n) { return leftover; } -Span* PageHeap::Carve(Span* span, Length n, size_t sc, size_t align) { +Span* PageHeap::Carve(Span* span, Length n) { ASSERT(n > 0); ASSERT(span->location != Span::IN_USE); - ASSERT(align >= kPageSize); - - Length align_pages = align >> kPageShift; + const int old_location = span->location; RemoveFromFreeList(span); - - if (span->start & (align_pages - 1)) { - Length skip_for_alignment = align_pages - (span->start & (align_pages - 1)); - Span* aligned = Split(span, skip_for_alignment); - PrependToFreeList(span); // Skip coalescing - no candidates possible - span = aligned; - } + span->location = Span::IN_USE; + Event(span, 'A', n); const int extra = span->length - n; ASSERT(extra >= 0); if (extra > 0) { - Span* leftover = Split(span, n); - PrependToFreeList(leftover); + Span* leftover = NewSpan(span->start + n, extra); + leftover->location = old_location; + Event(leftover, 'S', extra); + RecordSpan(leftover); + PrependToFreeList(leftover); // Skip coalescing - no candidates possible + span->length = n; + pagemap_.set(span->start + n - 1, span); } - - span->location = Span::IN_USE; - span->sizeclass = sc; - Event(span, 'A', n); - - // Cache sizeclass info eagerly. Locking is not necessary. - // (Instead of being eager, we could just replace any stale info - // about this span, but that seems to be no better in practice.) 
- CacheSizeClass(span->start, sc); - - if (sc != kLargeSizeClass) { - for (Length i = 1; i < n; i++) { - pagemap_.set(span->start + i, span); - CacheSizeClass(span->start + i, sc); - } - } - ASSERT(Check()); return span; } @@ -351,6 +318,18 @@ Length PageHeap::ReleaseAtLeastNPages(Length num_pages) { return released_pages; } +void PageHeap::RegisterSizeClass(Span* span, size_t sc) { + // Associate span object with all interior pages as well + ASSERT(span->location == Span::IN_USE); + ASSERT(GetDescriptor(span->start) == span); + ASSERT(GetDescriptor(span->start+span->length-1) == span); + Event(span, 'C', sc); + span->sizeclass = sc; + for (Length i = 1; i < span->length-1; i++) { + pagemap_.set(span->start+i, span); + } +} + static double MB(uint64_t bytes) { return bytes / 1048576.0; } diff --git a/src/page_heap.h b/src/page_heap.h index de36266..74030d2 100644 --- a/src/page_heap.h +++ b/src/page_heap.h @@ -93,49 +93,21 @@ class PERFTOOLS_DLL_DECL PageHeap { public: PageHeap(); - // Allocate a run of "n" pages. Returns NULL if out of memory. - // Caller should not pass "n == 0" -- instead, n should have been - // rounded up already. The span will be used for allocating objects - // with the specifled sizeclass sc (sc must be zero for large - // objects). The first page of the span will be aligned to the value - // specified by align, which must be a power of two. - Span* New(Length n, size_t sc, size_t align); + // Allocate a run of "n" pages. Returns zero if out of memory. + // Caller should not pass "n == 0" -- instead, n should have + // been rounded up already. + Span* New(Length n); // Delete the span "[p, p+n-1]". // REQUIRES: span was returned by earlier call to New() and // has not yet been deleted. void Delete(Span* span); - // Gets either the size class of addr, if it is a small object, or it's span. - // Return: - // if addr is invalid: - // leave *out_sc and *out_span unchanged and return false; - // if addr is valid and has a small size class: - // *out_sc = the size class - // *out_span = - // return true - // if addr is valid and has a large size class: - // *out_sc = kLargeSizeClass - // *out_span = the span pointer - // return true - bool GetSizeClassOrSpan(void* addr, size_t* out_sc, Span** out_span) { - const PageID p = reinterpret_cast(addr) >> kPageShift; - size_t cl = GetSizeClassIfCached(p); - Span* span = NULL; - - if (cl != kLargeSizeClass) { - ASSERT(cl == GetDescriptor(p)->sizeclass); - } else { - span = GetDescriptor(p); - if (!span) { - return false; - } - cl = span->sizeclass; - } - *out_span = span; - *out_sc = cl; - return true; - } + // Mark an allocated span as being used for small objects of the + // specified size-class. + // REQUIRES: span was returned by an earlier call to New() + // and has not yet been deleted. + void RegisterSizeClass(Span* span, size_t sc); // Split an allocated span into two spans: one of length "n" pages // followed by another span of length "span->length - n" pages. @@ -143,29 +115,14 @@ class PERFTOOLS_DLL_DECL PageHeap { // Returns a pointer to the second span. // // REQUIRES: "0 < n < span->length" - // REQUIRES: a) the span is free or b) sizeclass == 0 + // REQUIRES: span->location == IN_USE + // REQUIRES: span->sizeclass == 0 Span* Split(Span* span, Length n); // Return the descriptor for the specified page. Returns NULL if // this PageID was not allocated previously. 
inline Span* GetDescriptor(PageID p) const { - Span* ret = reinterpret_cast(pagemap_.get(p)); -#ifndef NDEBUG - if (ret != NULL && ret->location == Span::IN_USE) { - size_t cl = GetSizeClassIfCached(p); - // Three cases: - // - The object is not cached - // - The object is cached correctly - // - It is a large object and we're not looking at the first - // page. This happens in coalescing. - ASSERT(cl == kLargeSizeClass || cl == ret->sizeclass || - (ret->start != p && ret->sizeclass == kLargeSizeClass)); - // If the object is sampled, it must have be kLargeSizeClass - ASSERT(ret->sizeclass == kLargeSizeClass || !ret->sample); - } -#endif - - return ret; + return reinterpret_cast(pagemap_.get(p)); } // Dump state to stderr @@ -266,7 +223,7 @@ class PERFTOOLS_DLL_DECL PageHeap { // length exactly "n" and mark it as non-free so it can be returned // to the client. After all that, decrease free_pages_ by n and // return span. - Span* Carve(Span* span, Length n, size_t sc, size_t align); + Span* Carve(Span* span, Length n); void RecordSpan(Span* span) { pagemap_.set(span->start, span); @@ -277,7 +234,7 @@ class PERFTOOLS_DLL_DECL PageHeap { // Allocate a large span of length == n. If successful, returns a // span of exactly the specified length. Else, returns NULL. - Span* AllocLarge(Length n, size_t sc, size_t align); + Span* AllocLarge(Length n); // Coalesce span with neighboring spans if possible, prepend to // appropriate free list, and adjust stats. diff --git a/src/pprof b/src/pprof index 8aff380..8d4ddcf 100755 --- a/src/pprof +++ b/src/pprof @@ -215,7 +215,7 @@ Call-graph Options: (i.e. direct leak generators) more visible Miscellaneous: - --tools= Prefix for object tool pathnames + --tools=[,...] \$PATH for object tool pathnames --test Run unit tests --help This message --version Version information @@ -4331,18 +4331,27 @@ sub ConfigureTool { my $tool = shift; my $path; - if ($main::opt_tools ne "") { - # Use a prefix specified by the --tools option... - $path = $main::opt_tools . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by --tools $main::opt_tools\n"); - } - } elsif (exists $ENV{"PPROF_TOOLS"} && - $ENV{"PPROF_TOOLS"} ne "") { - #... or specified with the PPROF_TOOLS environment variable... - $path = $ENV{"PPROF_TOOLS"} . $tool; - if (!-x $path) { - error("No '$tool' found with prefix specified by PPROF_TOOLS=$ENV{PPROF_TOOLS}\n"); + # --tools (or $PPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools) { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$PPROF_TOOLS) '$tools'\n"); } } else { # ... 
otherwise use the version that exists in the same directory as diff --git a/src/sampler.cc b/src/sampler.cc index cbc6ab4..a13544a 100755 --- a/src/sampler.cc +++ b/src/sampler.cc @@ -42,16 +42,15 @@ using std::min; // The approximate gap in bytes between sampling actions. // I.e., we take one sample approximately once every // tcmalloc_sample_parameter bytes of allocation -// i.e. about once every 512KB. +// i.e. about once every 512KB if value is 1<<19. #ifdef NO_TCMALLOC_SAMPLES DEFINE_int64(tcmalloc_sample_parameter, 0, "Unused: code is compiled with NO_TCMALLOC_SAMPLES"); #else DEFINE_int64(tcmalloc_sample_parameter, - EnvToInt64("TCMALLOC_SAMPLE_PARAMETER", 1<<19), + EnvToInt64("TCMALLOC_SAMPLE_PARAMETER", 0), "The approximate gap in bytes between sampling actions. " - "This must be between 1 and 1<<58."); -// Note: there are other places in this file where the number 19 occurs. + "This must be between 1 and 2^58."); #endif namespace tcmalloc { diff --git a/src/span.h b/src/span.h index b3483ca..ab9a796 100644 --- a/src/span.h +++ b/src/span.h @@ -60,10 +60,6 @@ struct Span { int value[64]; #endif - void* start_ptr() { - return reinterpret_cast(start << kPageShift); - } - // What freelist the span is on: IN_USE if on none, or normal or returned enum { IN_USE, ON_NORMAL_FREELIST, ON_RETURNED_FREELIST }; }; diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc index 011fc91..13d2c23 100644 --- a/src/tcmalloc.cc +++ b/src/tcmalloc.cc @@ -469,6 +469,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) { "MALLOC: %12" PRIu64 " Spans in use\n" "MALLOC: %12" PRIu64 " Thread heaps in use\n" "MALLOC: %12" PRIu64 " (%7.1f MB) Metadata allocated\n" + "MALLOC: %12" PRIu64 " Tcmalloc page size\n" "------------------------------------------------\n", stats.pageheap.system_bytes, stats.pageheap.system_bytes / MB, bytes_in_use, bytes_in_use / MB, @@ -479,7 +480,8 @@ static void DumpStats(TCMalloc_Printer* out, int level) { stats.thread_bytes, stats.thread_bytes / MB, uint64_t(Static::span_allocator()->inuse()), uint64_t(ThreadCache::HeapsInUse()), - stats.metadata_bytes, stats.metadata_bytes / MB); + stats.metadata_bytes, stats.metadata_bytes / MB, + uint64_t(kPageSize)); } static void PrintStats(int level) { @@ -637,9 +639,8 @@ class TCMallocImplementation : public MallocExtension { } if (strcmp(name, "tcmalloc.slack_bytes") == 0) { - // We assume that bytes in the page heap are not fragmented too - // badly, and are therefore available for allocation without - // growing the pageheap system byte count. + // Kept for backwards compatibility. Now defined externally as: + // pageheap_free_bytes + pageheap_unmapped_bytes. 
diff --git a/src/span.h b/src/span.h
index b3483ca..ab9a796 100644
--- a/src/span.h
+++ b/src/span.h
@@ -60,10 +60,6 @@ struct Span {
   int value[64];
 #endif
 
-  void* start_ptr() {
-    return reinterpret_cast<void*>(start << kPageShift);
-  }
-
   // What freelist the span is on: IN_USE if on none, or normal or returned
   enum { IN_USE, ON_NORMAL_FREELIST, ON_RETURNED_FREELIST };
 };
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 011fc91..13d2c23 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -469,6 +469,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
             "MALLOC: %12" PRIu64 " Spans in use\n"
             "MALLOC: %12" PRIu64 " Thread heaps in use\n"
             "MALLOC: %12" PRIu64 " (%7.1f MB) Metadata allocated\n"
+            "MALLOC: %12" PRIu64 " Tcmalloc page size\n"
             "------------------------------------------------\n",
             stats.pageheap.system_bytes, stats.pageheap.system_bytes / MB,
             bytes_in_use, bytes_in_use / MB,
@@ -479,7 +480,8 @@ static void DumpStats(TCMalloc_Printer* out, int level) {
             stats.thread_bytes, stats.thread_bytes / MB,
             uint64_t(Static::span_allocator()->inuse()),
             uint64_t(ThreadCache::HeapsInUse()),
-            stats.metadata_bytes, stats.metadata_bytes / MB);
+            stats.metadata_bytes, stats.metadata_bytes / MB,
+            uint64_t(kPageSize));
 }
 
 static void PrintStats(int level) {
@@ -637,9 +639,8 @@ class TCMallocImplementation : public MallocExtension {
     }
 
     if (strcmp(name, "tcmalloc.slack_bytes") == 0) {
-      // We assume that bytes in the page heap are not fragmented too
-      // badly, and are therefore available for allocation without
-      // growing the pageheap system byte count.
+      // Kept for backwards compatibility.  Now defined externally as:
+      //   pageheap_free_bytes + pageheap_unmapped_bytes.
       SpinLockHolder l(Static::pageheap_lock());
       PageHeap::Stats stats = Static::pageheap()->stats();
       *value = stats.free_bytes + stats.unmapped_bytes;
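
For readers checking the new definition, a minimal sketch of how the
relationship can be observed through the public MallocExtension interface;
the two pageheap_* property names are assumed to be the ones this release
documents, and exact equality only holds while no other thread is mutating
the heap:

    // Sketch only: verify that tcmalloc.slack_bytes equals
    // pageheap_free_bytes + pageheap_unmapped_bytes.
    #include <assert.h>
    #include <stddef.h>
    #include <google/malloc_extension.h>

    void CheckSlackBytes() {
      size_t slack = 0, free_bytes = 0, unmapped = 0;
      MallocExtension* ext = MallocExtension::instance();
      if (ext->GetNumericProperty("tcmalloc.slack_bytes", &slack) &&
          ext->GetNumericProperty("tcmalloc.pageheap_free_bytes", &free_bytes) &&
          ext->GetNumericProperty("tcmalloc.pageheap_unmapped_bytes", &unmapped)) {
        assert(slack == free_bytes + unmapped);  // the new external definition
      }
    }
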
@@ -798,25 +799,22 @@ static TCMallocGuard module_enter_exit_hook;
 // Helpers for the exported routines below
 //-------------------------------------------------------------------
 
-static inline void* CheckedMallocResult(void *result) {
-  Span* fetched_span;
-  size_t cl;
-
-  if (result != NULL) {
-    ASSERT(Static::pageheap()->GetSizeClassOrSpan(result, &cl, &fetched_span));
-  }
+static inline bool CheckCachedSizeClass(void *ptr) {
+  PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cached_value = Static::pageheap()->GetSizeClassIfCached(p);
+  return cached_value == 0 ||
+      cached_value == Static::pageheap()->GetDescriptor(p)->sizeclass;
+}
 
+static inline void* CheckedMallocResult(void *result) {
+  ASSERT(result == NULL || CheckCachedSizeClass(result));
   return result;
 }
 
 static inline void* SpanToMallocResult(Span *span) {
-  Span* fetched_span = NULL;
-  size_t cl = 0;
-  ASSERT(Static::pageheap()->GetSizeClassOrSpan(span->start_ptr(),
-                                                &cl, &fetched_span));
-  ASSERT(cl == kLargeSizeClass);
-  ASSERT(span == fetched_span);
-  return span->start_ptr();
+  Static::pageheap()->CacheSizeClass(span->start, 0);
+  return
+      CheckedMallocResult(reinterpret_cast<void*>(span->start << kPageShift));
 }
 
 static void* DoSampledAllocation(size_t size) {
@@ -827,8 +825,7 @@
   SpinLockHolder h(Static::pageheap_lock());
 
   // Allocate span
-  Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size),
-                                       kLargeSizeClass, kPageSize);
+  Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size));
   if (span == NULL) {
     return NULL;
   }
@@ -919,7 +916,7 @@ inline void* do_malloc_pages(ThreadCache* heap, size_t size) {
     report_large = should_report_large(num_pages);
   } else {
     SpinLockHolder h(Static::pageheap_lock());
-    Span* span = Static::pageheap()->New(num_pages, kLargeSizeClass, kPageSize);
+    Span* span = Static::pageheap()->New(num_pages);
     result = (span == NULL ? NULL : SpanToMallocResult(span));
     report_large = should_report_large(num_pages);
   }
@@ -975,22 +972,28 @@ static inline ThreadCache* GetCacheIfPresent() {
 inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
   if (ptr == NULL) return;
   ASSERT(Static::pageheap() != NULL);  // Should not call free() before malloc()
-  Span* span;
-  size_t cl;
-
-  if (!Static::pageheap()->GetSizeClassOrSpan(ptr, &cl, &span)) {
-    // result can be false because the pointer passed in is invalid
-    // (not something returned by malloc or friends), or because the
-    // pointer was allocated with some other allocator besides
-    // tcmalloc.  The latter can happen if tcmalloc is linked in via
-    // a dynamic library, but is not listed last on the link line.
-    // In that case, libraries after it on the link line will
-    // allocate with libc malloc, but free with tcmalloc's free.
-    (*invalid_free_fn)(ptr);  // Decide how to handle the bad free request
-    return;
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  Span* span = NULL;
+  size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+
+  if (cl == 0) {
+    span = Static::pageheap()->GetDescriptor(p);
+    if (!span) {
+      // span can be NULL because the pointer passed in is invalid
+      // (not something returned by malloc or friends), or because the
+      // pointer was allocated with some other allocator besides
+      // tcmalloc.  The latter can happen if tcmalloc is linked in via
+      // a dynamic library, but is not listed last on the link line.
+      // In that case, libraries after it on the link line will
+      // allocate with libc malloc, but free with tcmalloc's free.
+      (*invalid_free_fn)(ptr);  // Decide how to handle the bad free request
+      return;
+    }
+    cl = span->sizeclass;
+    Static::pageheap()->CacheSizeClass(p, cl);
   }
-
-  if (cl != kLargeSizeClass) {
+  if (cl != 0) {
+    ASSERT(!Static::pageheap()->GetDescriptor(p)->sample);
     ThreadCache* heap = GetCacheIfPresent();
     if (heap != NULL) {
       heap->Deallocate(ptr, cl);
@@ -1001,7 +1004,8 @@ inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
     }
   } else {
     SpinLockHolder h(Static::pageheap_lock());
-    ASSERT(span != NULL && ptr == span->start_ptr());
+    ASSERT(reinterpret_cast<uintptr_t>(ptr) % kPageSize == 0);
+    ASSERT(span != NULL && span->start == p);
     if (span->sample) {
       tcmalloc::DLL_Remove(span);
       Static::stacktrace_allocator()->Delete(
@@ -1021,17 +1025,20 @@ inline size_t GetSizeWithCallback(void* ptr,
                                   size_t (*invalid_getsize_fn)(void*)) {
   if (ptr == NULL)
     return 0;
-
-  Span* span;
-  size_t cl;
-  if (!Static::pageheap()->GetSizeClassOrSpan(ptr, &cl, &span)) {
-    return (*invalid_getsize_fn)(ptr);
-  }
-
-  if (cl != kLargeSizeClass) {
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+  if (cl != 0) {
     return Static::sizemap()->ByteSizeForClass(cl);
   } else {
-    return span->length << kPageShift;
+    Span *span = Static::pageheap()->GetDescriptor(p);
+    if (span == NULL) {  // means we do not own this memory
+      return (*invalid_getsize_fn)(ptr);
+    } else if (span->sizeclass != 0) {
+      Static::pageheap()->CacheSizeClass(p, span->sizeclass);
+      return Static::sizemap()->ByteSizeForClass(span->sizeclass);
+    } else {
+      return span->length << kPageShift;
+    }
   }
 }
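
do_free_with_callback and GetSizeWithCallback above now share one pattern:
consult the per-page size-class cache, fall back to the span descriptor, and
repopulate the cache on the way out.  A self-contained sketch of that pattern
follows; the std::map containers stand in for tcmalloc's radix-tree pagemap
and its cache, and all names are illustrative:

    #include <stdint.h>
    #include <stddef.h>
    #include <map>

    typedef uintptr_t PageID;
    struct Span { size_t sizeclass; size_t length; };  // stand-in, not the real Span

    static const size_t kPageShift = 13;               // 8K pages (default build)
    static std::map<PageID, size_t> g_size_class_cache;
    static std::map<PageID, Span*> g_pagemap;

    // Returns the allocated size for ptr, or 0 if we do not own this memory.
    size_t LookupSize(void* ptr, size_t (*class_to_size)(size_t)) {
      const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
      std::map<PageID, size_t>::iterator it = g_size_class_cache.find(p);
      if (it != g_size_class_cache.end() && it->second != 0) {
        return class_to_size(it->second);              // fast path: cache hit
      }
      std::map<PageID, Span*>::iterator sp = g_pagemap.find(p);
      if (sp == g_pagemap.end()) return 0;             // invalid or foreign pointer
      Span* span = sp->second;
      if (span->sizeclass != 0) {                      // small object: cache it
        g_size_class_cache[p] = span->sizeclass;
        return class_to_size(span->sizeclass);
      }
      return span->length << kPageShift;               // large object: span bytes
    }
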
@@ -1126,10 +1133,39 @@ void* do_memalign(size_t align, size_t size) {
 
   // We will allocate directly from the page heap
   SpinLockHolder h(Static::pageheap_lock());
 
-  // Any page-level allocation will be fine
-  Span* span = Static::pageheap()->New(tcmalloc::pages(size),
-                                       kLargeSizeClass, align);
-  return span == NULL ? NULL : SpanToMallocResult(span);
+  if (align <= kPageSize) {
+    // Any page-level allocation will be fine
+    // TODO: We could put the rest of this page in the appropriate
+    // TODO: cache but it does not seem worth it.
+    Span* span = Static::pageheap()->New(tcmalloc::pages(size));
+    return span == NULL ? NULL : SpanToMallocResult(span);
+  }
+
+  // Allocate extra pages and carve off an aligned portion
+  const Length alloc = tcmalloc::pages(size + align);
+  Span* span = Static::pageheap()->New(alloc);
+  if (span == NULL) return NULL;
+
+  // Skip starting portion so that we end up aligned
+  Length skip = 0;
+  while ((((span->start+skip) << kPageShift) & (align - 1)) != 0) {
+    skip++;
+  }
+  ASSERT(skip < alloc);
+  if (skip > 0) {
+    Span* rest = Static::pageheap()->Split(span, skip);
+    Static::pageheap()->Delete(span);
+    span = rest;
+  }
+
+  // Skip trailing portion that we do not need to return
+  const Length needed = tcmalloc::pages(size);
+  ASSERT(span->length >= needed);
+  if (span->length > needed) {
+    Span* trailer = Static::pageheap()->Split(span, needed);
+    Static::pageheap()->Delete(trailer);
+  }
+  return SpanToMallocResult(span);
 }
 
 // Helpers for use by exported routines below:
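
The carve logic in do_memalign is worth seeing in isolation: over-allocate by
align bytes' worth of pages, skip leading pages until the start address is
aligned, then trim the tail.  A small sketch under the default build's 8K
pages (kPageShift = 13); the helper names are hypothetical:

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    typedef uintptr_t Length;
    static const size_t kPageShift = 13;
    static const size_t kPageSize = 1 << kPageShift;

    // Round a byte count up to whole pages (mirrors tcmalloc::pages).
    static Length pages(size_t bytes) {
      return (bytes + kPageSize - 1) >> kPageShift;
    }

    // How many leading pages of a span starting at page 'start' must be
    // skipped so the result is align-aligned?  (align is a power of two
    // and a multiple of kPageSize, as in the patched do_memalign.)
    static Length AlignmentSkip(uintptr_t start, size_t align) {
      Length skip = 0;
      while ((((start + skip) << kPageShift) & (align - 1)) != 0) {
        skip++;
      }
      return skip;
    }

    int main() {
      assert(pages(1) == 1 && pages(kPageSize + 1) == 2);
      // A span starting at page 3 (address 0x6000) needs one extra page
      // to reach 16K alignment; page 4 (0x8000) is already aligned.
      assert(AlignmentSkip(3, 2 * kPageSize) == 1);
      assert(AlignmentSkip(4, 2 * kPageSize) == 0);
      return 0;
    }
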
diff --git a/src/tests/frag_unittest.cc b/src/tests/frag_unittest.cc
index 08494b4..160c41c 100644
--- a/src/tests/frag_unittest.cc
+++ b/src/tests/frag_unittest.cc
@@ -44,13 +44,16 @@
 #endif
 #include <vector>
 #include "base/logging.h"
+#include "common.h"
 #include <google/malloc_extension.h>
 
 using std::vector;
 
 int main(int argc, char** argv) {
-  static const int kAllocSize = 36<<10;    // Bigger than tcmalloc page size
-  static const int kTotalAlloc = 400 << 20;  // Allocate 400MB in total
+  // Make kAllocSize larger than tcmalloc page size.
+  static const int kAllocSize = 9 << kPageShift;
+  // Allocate 400MB in total.
+  static const int kTotalAlloc = 400 << 20;
   static const int kAllocIterations = kTotalAlloc / kAllocSize;
 
   // Allocate lots of objects
diff --git a/src/tests/page_heap_test.cc b/src/tests/page_heap_test.cc
index fd444da..9120b78 100644
--- a/src/tests/page_heap_test.cc
+++ b/src/tests/page_heap_test.cc
@@ -26,7 +26,7 @@ static void TestPageHeap_Stats() {
   CheckStats(ph, 0, 0, 0);
 
   // Allocate a span 's1'
-  tcmalloc::Span* s1 = ph->New(256, kLargeSizeClass, kPageSize);
+  tcmalloc::Span* s1 = ph->New(256);
   CheckStats(ph, 256, 0, 0);
 
   // Split span 's1' into 's1', 's2'.  Delete 's2'
diff --git a/src/tests/testutil.cc b/src/tests/testutil.cc
index f2b8592..745de99 100644
--- a/src/tests/testutil.cc
+++ b/src/tests/testutil.cc
@@ -80,7 +80,7 @@ struct FunctionAndId {
   int id;
 };
 
-#if defined(NO_THREADS) || !(defined(HAVE_PTHREADS) || defined(_WIN32))
+#if defined(NO_THREADS) || !(defined(HAVE_PTHREAD) || defined(_WIN32))
 
 extern "C" void RunThread(void (*fn)()) {
   (*fn)();
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
index 64f4deb..8d31117 100644
--- a/src/thread_cache.cc
+++ b/src/thread_cache.cc
@@ -42,7 +42,8 @@ using std::min;
 using std::max;
 
 DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
-             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES", 16<<20),
+             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
+                        kDefaultOverallThreadCacheSize),
              "Bound on the total amount of bytes allocated to "
              "thread caches.  This bound is not strict, so it is possible "
              "for the cache to go over this bound in certain circumstances. ");
diff --git a/src/thread_cache.h b/src/thread_cache.h
index 1165447..352c683 100644
--- a/src/thread_cache.h
+++ b/src/thread_cache.h
@@ -63,9 +63,6 @@ inline bool KernelSupportsTLS() {
 
 class ThreadCache {
  public:
-  // Default bound on the total amount of thread caches.
-  static const size_t kDefaultOverallThreadCacheSize = 16 << 20;
-
   // All ThreadCache objects are kept in a linked list (for stats collection)
   ThreadCache* next_;
   ThreadCache* prev_;
@@ -213,19 +210,6 @@ class ThreadCache {
     }
   };
 
-  // The number of bytes one ThreadCache will steal from another when
-  // the first ThreadCache is forced to Scavenge(), delaying the
-  // next call to Scavenge for this thread.
-  static const size_t kStealAmount = 1 << 16;
-
-  // Lower and upper bounds on the per-thread cache sizes
-  static const size_t kMinThreadCacheSize = kMaxSize * 2;  //kStealAmount;
-  static const size_t kMaxThreadCacheSize = 2 << 20;
-
-  // The number of times that a deallocation can cause a freelist to
-  // go over its max_length() before shrinking max_length().
-  static const int kMaxOverages = 3;
-
   // Gets and returns an object from the central cache, and, if possible,
   // also adds some objects of that size class to this thread cache.
   void* FetchFromCentralCache(size_t cl, size_t byte_size);
diff --git a/src/windows/config.h b/src/windows/config.h
index b5d9bb6..6d6f771 100644
--- a/src/windows/config.h
+++ b/src/windows/config.h
@@ -154,7 +154,7 @@
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
-/* Define to 1 if you have the <sys/ucontext.h> header file. */
+/* <sys/ucontext.h> is broken on redhat 7 */
 #undef HAVE_SYS_UCONTEXT_H
 
 /* Define to 1 if you have the <sys/wait.h> header file. */
@@ -172,6 +172,9 @@
 /* Define to 1 if you have the <unwind.h> header file. */
 #undef HAVE_UNWIND_H
 
+/* Define to 1 if you have the <valgrind.h> header file. */
+#undef HAVE_VALGRIND_H
+
 /* define if your compiler has __attribute__ */
 #undef HAVE___ATTRIBUTE__
 
diff --git a/src/windows/mingw.h b/src/windows/mingw.h
index e69b5da..747b285 100644
--- a/src/windows/mingw.h
+++ b/src/windows/mingw.h
@@ -45,10 +45,23 @@
 # define PERFTOOLS_NO_ALIGNED_MALLOC 1
 #endif
 
+// This must be defined before the windows.h is included.  We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
+#ifndef _WIN32_WINNT
+# define _WIN32_WINNT 0x0501
+#endif
+
 #include "windows/port.h"
 
 #define HAVE_SNPRINTF 1
 
+// Some mingw distributions have a pthreads wrapper, but it doesn't
+// work as well as native windows spinlocks (at least for us).  So
+// pretend the pthreads wrapper doesn't exist, even when it does.
+#undef HAVE_PTHREAD
+
 #endif  /* __MINGW32__ */
 
 #endif  /* GOOGLE_PERFTOOLS_WINDOWS_MINGW_H_ */
-- 
cgit v1.2.1