diff options
author | csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50> | 2007-03-22 04:46:29 +0000 |
---|---|---|
committer | csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50> | 2007-03-22 04:46:29 +0000 |
commit | 60a3a2ce77ed2713b2eedd20952d9cfc56ff7ccf (patch) | |
tree | 234b959ff2d934386ecfc08a7477f5c94290b709 | |
parent | 298274f8d4f474d2b16a35c8babc58817088c59e (diff) | |
download | gperftools-60a3a2ce77ed2713b2eedd20952d9cfc56ff7ccf.tar.gz |
Fri Jan 27 14:04:27 2006 Google Inc. <opensource@google.com>
* google-perftools: version 0.6 release
* More sophisticated stacktrace usage, possibly using libunwind (aruns)
* Update pprof to handle 64-bit profiles (dehnert)
* Fix GetStackTrace to correctly return top stackframe (sanjay)
* Add ANSI compliance for new and new[], including new_handler (jkearney)
* More accuracy by reading ELF files directly rather than objdump (mec)
* Add readline support for pprof (addi)
* Add #includes for PPC (csilvers)
* New PC-detection routine for ibook powerpc (asbestoshead)
* Vastly improved tcmalloc unittest (csilvers)
* Move documentation from /usr/doc to /usr/share/doc
git-svn-id: http://gperftools.googlecode.com/svn/trunk@19 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
-rw-r--r-- | ChangeLog | 14 | ||||
-rw-r--r-- | Makefile.am | 13 | ||||
-rwxr-xr-x | configure | 113 | ||||
-rw-r--r-- | configure.ac | 5 | ||||
-rw-r--r-- | packages/rpm/rpm.spec | 1 | ||||
-rw-r--r-- | src/base/thread_lister.c | 1 | ||||
-rw-r--r-- | src/config.h.in | 3 | ||||
-rw-r--r-- | src/heap-checker.cc | 303 | ||||
-rw-r--r-- | src/heap-profiler.cc | 12 | ||||
-rw-r--r-- | src/malloc_extension.cc | 1 | ||||
-rwxr-xr-x | src/pprof | 1232 | ||||
-rw-r--r-- | src/profiler.cc | 39 | ||||
-rw-r--r-- | src/stacktrace.cc | 148 | ||||
-rw-r--r-- | src/stacktrace_generic-inl.h | 61 | ||||
-rw-r--r-- | src/stacktrace_libunwind-inl.h | 69 | ||||
-rw-r--r-- | src/stacktrace_x86-inl.h | 119 | ||||
-rw-r--r-- | src/stacktrace_x86_64-inl.h | 134 | ||||
-rw-r--r-- | src/tcmalloc.cc | 546 | ||||
-rw-r--r-- | src/tests/stacktrace_unittest.cc | 19 | ||||
-rw-r--r-- | src/tests/tcmalloc_unittest.cc | 607 |
20 files changed, 2843 insertions, 597 deletions
@@ -61,3 +61,17 @@ Mon Nov 14 17:28:59 2005 Google Inc. <opensource@google.com> * Add va_start/va_end calls around vsnprintf() (csilvers) * Write our own __syscall_return(), since it's not defined consistently on all 64-bit linux distros (markus) + +Fri Jan 27 14:04:27 2006 Google Inc. <opensource@google.com> + + * google-perftools: version 0.6 release + * More sophisticated stacktrace usage, possibly using libunwind (aruns) + * Update pprof to handle 64-bit profiles (dehnert) + * Fix GetStackTrace to correctly return top stackframe (sanjay) + * Add ANSI compliance for new and new[], including new_handler (jkearney) + * More accuracy by reading ELF files directly rather than objdump (mec) + * Add readline support for pprof (addi) + * Add #includes for PPC (csilvers) + * New PC-detection routine for ibook powerpc (asbestoshead) + * Vastly improved tcmalloc unittest (csilvers) + * Move documentation from /usr/doc to /usr/share/doc diff --git a/Makefile.am b/Makefile.am index cab257c..8d39bbb 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,7 +21,7 @@ perftoolsincludedir = $(googleincludedir)/perftools # We'll add to this later, on a library-by-library basis perftoolsinclude_HEADERS = -docdir = $(prefix)/doc/$(PACKAGE)-$(VERSION) +docdir = $(prefix)/share/doc/$(PACKAGE)-$(VERSION) # This is for HTML and other documentation you want to install. # Add your documentation files (in doc/) in addition to these # top-level boilerplate files. Also add a TODO file if you have one. @@ -53,7 +53,11 @@ dist_doc_DATA += doc/index.html ### The header files we use. We divide into categories based on directory S_STACKTRACE_INCLUDES = SG_STACKTRACE_INCLUDES = src/google/stacktrace.h -SGP_STACKTRACE_INCLUDES = src/config.h +SGP_STACKTRACE_INCLUDES = src/config.h \ + src/stacktrace_generic-inl.h \ + src/stacktrace_libunwind-inl.h \ + src/stacktrace_x86_64-inl.h \ + src/stacktrace_x86-inl.h STACKTRACE_INCLUDES = $(S_STACKTRACE_INCLUDES) $(SG_STACKTRACE_INCLUDES) $(SGP_STACKTRACE_INCLUDES) googleinclude_HEADERS += $(SG_STACKTRACE_INCLUDES) perftoolsinclude_HEADERS += $(SGP_STACKTRACE_INCLUDES) @@ -324,7 +328,7 @@ CPU_PROFILER_SYMBOLS = '(ProfilerStart|ProfilerStop|ProfilerEnable|ProfilerDisab libprofiler_la_LDFLAGS = -export-symbols-regex $(CPU_PROFILER_SYMBOLS) ### Unittests -check_SCRIPTS += profiler_unittest_sh +check_SCRIPTS += profiler_unittest_sh pprof_unittest noinst_SCRIPTS += src/tests/profiler_unittest.sh # These are sub-programs used by profiler_unittest.sh @@ -352,6 +356,9 @@ profiler4_unittest_CXXFLAGS = -g $(PTHREAD_CFLAGS) -DWITH_THREADS profiler4_unittest_LDFLAGS = $(PTHREAD_CFLAGS) profiler4_unittest_LDADD = -lprofiler $(PTHREAD_LIBS) +pprof_unittest: $(top_srcdir)/src/pprof + $< -test + ### Documentation dist_man_MANS = doc/pprof.1 dist_doc_DATA += doc/cpu_profiler.html \ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.57 for google-perftools 0.5. +# Generated by GNU Autoconf 2.57 for google-perftools 0.6. # # Report bugs to <opensource@google.com>. # @@ -422,8 +422,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. PACKAGE_NAME='google-perftools' PACKAGE_TARNAME='google-perftools' -PACKAGE_VERSION='0.5' -PACKAGE_STRING='google-perftools 0.5' +PACKAGE_VERSION='0.6' +PACKAGE_STRING='google-perftools 0.6' PACKAGE_BUGREPORT='opensource@google.com' ac_unique_file="README" @@ -953,7 +953,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures google-perftools 0.5 to adapt to many kinds of systems. +\`configure' configures google-perftools 0.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1019,7 +1019,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of google-perftools 0.5:";; + short | recursive ) echo "Configuration of google-perftools 0.6:";; esac cat <<\_ACEOF @@ -1125,7 +1125,7 @@ fi test -n "$ac_init_help" && exit 0 if $ac_init_version; then cat <<\_ACEOF -google-perftools configure 0.5 +google-perftools configure 0.6 generated by GNU Autoconf 2.57 Copyright 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, 2002 @@ -1140,7 +1140,7 @@ cat >&5 <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by google-perftools $as_me 0.5, which was +It was created by google-perftools $as_me 0.6, which was generated by GNU Autoconf 2.57. Invocation command line was $ $0 $@ @@ -1733,7 +1733,7 @@ fi # Define the identity of the package. PACKAGE=google-perftools - VERSION=0.5 + VERSION=0.6 cat >>confdefs.h <<_ACEOF @@ -20799,6 +20799,99 @@ _ACEOF fi +echo "$as_me:$LINENO: checking for struct sigcontext.regs->nip" >&5 +echo $ECHO_N "checking for struct sigcontext.regs->nip... $ECHO_C" >&6 +if test "${ac_cv_member_struct_sigcontext_regs__nip+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + cat >conftest.$ac_ext <<_ACEOF +#line $LINENO "configure" +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include <signal.h> + +int +main () +{ +static struct sigcontext ac_aggr; +if (ac_aggr.regs->nip) +return 0; + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 + (eval $ac_compile) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && + { ac_try='test -s conftest.$ac_objext' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + ac_cv_member_struct_sigcontext_regs__nip=yes +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +cat >conftest.$ac_ext <<_ACEOF +#line $LINENO "configure" +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include <signal.h> + +int +main () +{ +static struct sigcontext ac_aggr; +if (sizeof ac_aggr.regs->nip) +return 0; + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 + (eval $ac_compile) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && + { ac_try='test -s conftest.$ac_objext' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + ac_cv_member_struct_sigcontext_regs__nip=yes +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +ac_cv_member_struct_sigcontext_regs__nip=no +fi +rm -f conftest.$ac_objext conftest.$ac_ext +fi +rm -f conftest.$ac_objext conftest.$ac_ext +fi +echo "$as_me:$LINENO: result: $ac_cv_member_struct_sigcontext_regs__nip" >&5 +echo "${ECHO_T}$ac_cv_member_struct_sigcontext_regs__nip" >&6 +if test $ac_cv_member_struct_sigcontext_regs__nip = yes; then + +cat >>confdefs.h <<_ACEOF +#define HAVE_STRUCT_SIGCONTEXT_REGS__NIP 1 +_ACEOF + + +fi # Defines PRIuS @@ -22300,7 +22393,7 @@ _ASBOX } >&5 cat >&5 <<_CSEOF -This file was extended by google-perftools $as_me 0.5, which was +This file was extended by google-perftools $as_me 0.6, which was generated by GNU Autoconf 2.57. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -22363,7 +22456,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ -google-perftools config.status 0.5 +google-perftools config.status 0.6 configured by $0, generated by GNU Autoconf 2.57, with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\" diff --git a/configure.ac b/configure.ac index c908272..aa1826b 100644 --- a/configure.ac +++ b/configure.ac @@ -5,7 +5,7 @@ # make sure we're interpreted by some minimal autoconf AC_PREREQ(2.57) -AC_INIT(google-perftools, 0.5, opensource@google.com) +AC_INIT(google-perftools, 0.6, opensource@google.com) # The argument here is just something that should be in the current directory # (for sanity checking) AC_CONFIG_SRCDIR(README) @@ -43,7 +43,8 @@ AC_CHECK_MEMBERS([struct sigcontext.sc_eip, struct sigcontext.eip, struct sigcontext.rip, struct sigcontext.sc_ip, - struct siginfo.si_faddr],,, + struct siginfo.si_faddr, + struct sigcontext.regs->nip],,, [#include <signal.h>]) # Defines PRIuS diff --git a/packages/rpm/rpm.spec b/packages/rpm/rpm.spec index 24c0432..60e903a 100644 --- a/packages/rpm/rpm.spec +++ b/packages/rpm/rpm.spec @@ -15,7 +15,6 @@ Packager: Google <opensource@google.com> Source: http://goog-perftools.sourceforge.net/%{NAME}-%{PACKAGE_VERSION}.tar.gz Distribution: Redhat 7 and above. Buildroot: %{_tmppath}/%{name}-root -Docdir: %prefix/doc Prefix: %prefix %description diff --git a/src/base/thread_lister.c b/src/base/thread_lister.c index 7eca594..6def758 100644 --- a/src/base/thread_lister.c +++ b/src/base/thread_lister.c @@ -31,6 +31,7 @@ * Author: Markus Gutschke */ +#include <stdio.h> // needed for NULL on some powerpc platforms (?!) #include "base/thread_lister.h" #include "base/linuxthreads.h" /* Include other thread listers here that define THREADS macro diff --git a/src/config.h.in b/src/config.h.in index 365c4f8..2abb5bd 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -78,6 +78,9 @@ /* Define to 1 if `eip' is member of `struct sigcontext'. */ #undef HAVE_STRUCT_SIGCONTEXT_EIP +/* Define to 1 if `regs->nip' is member of `struct sigcontext'. */ +#undef HAVE_STRUCT_SIGCONTEXT_REGS__NIP + /* Define to 1 if `rip' is member of `struct sigcontext'. */ #undef HAVE_STRUCT_SIGCONTEXT_RIP diff --git a/src/heap-checker.cc b/src/heap-checker.cc index f081f97..b34eea2 100644 --- a/src/heap-checker.cc +++ b/src/heap-checker.cc @@ -45,13 +45,14 @@ #include <google/perftools/hash_set.h> #include <algorithm> +#include <fcntl.h> +#include <string.h> #include <errno.h> #include <unistd.h> -#include <string.h> -#include <sys/stat.h> +#include <sys/mman.h> #include <sys/poll.h> +#include <sys/stat.h> #include <sys/types.h> -#include <fcntl.h> #include <assert.h> #ifdef HAVE_LINUX_PTRACE_H @@ -61,6 +62,8 @@ #include <syscall.h> #endif +#include <elf.h> + #include <google/stacktrace.h> #include <google/heap-profiler.h> #include <google/heap-checker.h> @@ -393,51 +396,219 @@ static int GetStatusOutput(const char* command, string* output) { return pclose(f); } -// A ProcMapsTask to record global data to ignore later -// that belongs to 'library' mapped at 'start_address' with 'file_offset'. -static void RecordGlobalDataLocked(const char* library, - uint64 start_address, - uint64 file_offset) { - HeapProfiler::MESSAGE(2, "HeapChecker: Looking into %s\n", library); - string command("/usr/bin/objdump -hw "); - command.append(library); - string output; - if (GetStatusOutput(command.c_str(), &output) != 0) { - HeapProfiler::MESSAGE(-1, "HeapChecker: " - "Failed executing %s\n", command.c_str()); - abort(); - } - const char* output_start = output.c_str(); +// RAII class for a file descriptor. + +class FileDescriptor { + public: + FileDescriptor(int fd) : fd_(fd) { ; } + ~FileDescriptor() { if (fd_ >= 0) close(fd_); } + int Close() { int fd = fd_; fd_ = -1; return close(fd); } + operator int() { return fd_; } + private: + int fd_; +}; + +// This function takes the fields from a /proc/self/maps line: +// +// start_address start address of a memory region. +// end_address end address of a memory region +// permissions rwx + private/shared bit +// file_offset file offset within the mapped file +// (major:minor) major and minor device number of the mapped file +// inode inode number of the mapped file +// filename filename of the mapped file +// +// First, if the region is not writeable, then it cannot have any heap +// pointers in it. +// +// It would be simple to just mark every writeable memory region as live. +// However, that would pick up unused bottom pieces of thread stacks +// and other rot. So we need more complexity: +// +// Second, if the region is anonymous, we ignore it. That skips the +// main stack and the thread stacks which are picked up elsewhere. +// Unfortunately that also skips the BSS portions of shared library +// segments, and we need to pick those up. +// +// So, for each writable memory region that is mapped to a file, +// we recover the original segment information from that file +// (segment headers, not section headers). We pick out the segment +// that contains the given "file_offset", figure out where that +// segment was loaded into memory, and register all of the memory +// addresses for that segment. +// +// Picking out the right segment is a bit of a mess because the +// same file offset can appear in multiple segments! For example: +// +// LOAD 0x000000 0x08048000 0x08048000 0x231a8 0x231a8 R E 0x1000 +// LOAD 0x0231a8 0x0806c1a8 0x0806c1a8 0x01360 0x014e8 RW 0x1000 +// +// File offset 0x23000 appears in both segments. The second segment +// stars at 0x23000 because of rounding. Fortunately, we skip the +// first segment because it is not writeable. Most of the shared +// objects we see have one read-executable segment and one read-write +// segment so we really skate. +// +// If a shared library has no initialized data, only BSS, then the +// size of the read-write LOAD segment will be zero, the dynamic loader +// will create an anonymous memory region for the BSS but no named +// segment [I think -- gotta check ld-linux.so -- mec]. We will +// overlook that segment and never get here. This is a bug. + +static bool RecordGlobalDataLocked(uint64 start_address, + uint64 end_address, + const char* permissions, + uint64 file_offset, + int64 inode, + const char* filename) { + // Ignore non-writeable regions. + if (strchr(permissions, 'w') == NULL) + return true; + + // Ignore anonymous regions. + // This drops BSS regions which causes us much work later. + if (inode == 0) + return true; + + // Grab some ELF types. +#ifdef _LP64 + typedef Elf64_Ehdr ElfFileHeader; + typedef Elf64_Phdr ElfProgramHeader; +#else + typedef Elf32_Ehdr ElfFileHeader; + typedef Elf32_Phdr ElfProgramHeader; +#endif - if (FLAGS_heap_profile_log >= 5) { - HeapProfiler::MESSAGE(5, "HeapChecker: Looking at objdump\n"); - write(STDERR_FILENO, output.data(), output.size()); + // Cannot mmap the ELF file because of the live ProcMapsIterator. + // Have to read little pieces. Fortunately there are just two. + HeapProfiler::MESSAGE(2, "HeapChecker: Looking into %s\n", filename); + FileDescriptor fd_elf(open(filename, O_RDONLY)); + if (fd_elf < 0) + return false; + + // Read and validate the file header. + ElfFileHeader efh; + if (read(fd_elf, &efh, sizeof(efh)) != sizeof(efh)) + return false; + if (memcmp(&efh.e_ident[0], ELFMAG, SELFMAG) != 0) + return false; + if (efh.e_version != EV_CURRENT) + return false; + if (efh.e_type != ET_EXEC && efh.e_type != ET_DYN) + return false; + + // Read the segment headers. + if (efh.e_phentsize != sizeof(ElfProgramHeader)) + return false; + if (lseek(fd_elf, efh.e_phoff, SEEK_SET) != efh.e_phoff) + return false; + const size_t phsize = efh.e_phnum * efh.e_phentsize; + ElfProgramHeader* eph = new ElfProgramHeader[efh.e_phnum]; + if (read(fd_elf, eph, phsize) != phsize) { + delete[] eph; + return false; } - while (1) { - char sec_name[11]; - uint64 sec_size, sec_vmaddr, sec_lmaddr, sec_offset; - if (sscanf(output_start, "%*d .%10s "LLX" "LLX" "LLX" "LLX" ", - sec_name, &sec_size, &sec_vmaddr, - &sec_lmaddr, &sec_offset) == 5) { - if (strcmp(sec_name, "data") == 0 || - strcmp(sec_name, "bss") == 0) { - uint64 real_start = start_address + sec_offset - file_offset; - HeapProfiler::MESSAGE(4, "HeapChecker: " - "Got section %s: %p of "LLX" bytes\n", - sec_name, - reinterpret_cast<void*>(real_start), - sec_size); - (*library_live_objects)[library]. - push_back(AllocObject(reinterpret_cast<void*>(real_start), - sec_size, IN_GLOBAL_DATA)); + // Gonna need this page size for page boundary considerations. + // Better be a power of 2. + const int int_page_size = getpagesize(); + if (int_page_size <= 0 || int_page_size & (int_page_size-1)) + abort(); + const uint64 page_size = int_page_size; + + // Walk the segment headers. + // Find the segment header that contains the given file offset. + bool found_load_segment = false; + for (int iph = 0; iph < efh.e_phnum; ++iph) { + HeapProfiler::MESSAGE(3, "HeapChecker: %s %d: p_type: %d p_flags: %x", + filename, iph, eph[iph].p_type, eph[iph].p_flags); + if (eph[iph].p_type == PT_LOAD && eph[iph].p_flags & PF_W) { + // Sanity check the segment header. + if (eph[iph].p_vaddr != eph[iph].p_paddr) { + delete[] eph; + return false; + } + if ((eph[iph].p_vaddr & (page_size-1)) != + (eph[iph].p_offset & (page_size-1))) { + delete[] eph; + return false; + } + + // The segment is not aligned in the ELF file, but will be + // aligned in memory. So round it to page boundaries. + // Note: we lose if p_end is on the last page. + const uint64 p_start = eph[iph].p_offset &~ (page_size-1); + const uint64 p_end = ((eph[iph].p_offset + eph[iph].p_memsz) + + (page_size-1)) &~ (page_size-1); + if (p_end < p_start) { + delete[] eph; + return false; + } + if (file_offset >= p_start && file_offset < p_end) { + // Found it. + if (found_load_segment) { + delete[] eph; + return false; + } + found_load_segment = true; + + // [p_start, p_end) is the file segment, where p_end is extended + // for BSS (which does not actually come from the file). + // + // [start_address, end_address) is the memory region from + // /proc/self/maps. + // + // The point of correspondence is: + // file_offset in filespace <-> start_address in memoryspace + // + // This point of correspondence is reliable because the kernel + // virtual memory system actually uses this information for + // demand-paging the file. + // + // A single file segment can get loaded into several contiguous + // memory regions with different permissions. I have seen as + // many as four regions: no-permission alignment region + + // read-only-after-relocation region + read-write data region + + // read-write anonymous bss region. So file_offset from the + // memory region may not equal to p_start from the file segement; + // file_offset can be anywhere in [p_start, p_end). + + // Check that [start_address, end_address) is wholly contained + // in [p_start, p_end). + if (end_address < start_address || + end_address - start_address > p_end - file_offset) { + delete[] eph; + return false; + } + + // Calculate corresponding address and length for [p_start, p_end). + if (file_offset - p_start > start_address) { + delete[] eph; + return false; + } + void* addr = reinterpret_cast<void*>(start_address - + (file_offset - p_start)); + const uintptr_t length = p_end - p_start; + + // That is what we need. + (*library_live_objects)[filename]. + push_back(AllocObject(addr, length, IN_GLOBAL_DATA)); } } - // skip to the next line - const char* next = strpbrk(output_start, "\n\r"); - if (next == NULL) break; - output_start = next + 1; } + delete[] eph; + + if (!found_load_segment) { + HeapProfiler::MESSAGE(-1, + "HeapChecker: no LOAD segment found in %s\n", + filename); + return false; + } + + if (fd_elf.Close() < 0) + return false; + + return true; } // See if 'library' from /proc/self/maps has base name 'library_base' @@ -522,26 +693,32 @@ HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) { HeapProfiler::MESSAGE(4, "HeapChecker: " "Looking at /proc/self/maps line:\n %s\n", proc_map_line); - if (proc_maps_task == DISABLE_LIBRARY_ALLOCS && - strncmp(permissions, "r-xp", 4) == 0 && inode != 0) { - if (start_address >= end_address) abort(); - DisableLibraryAllocs(filename, - reinterpret_cast<void*>(start_address), - reinterpret_cast<void*>(end_address)); - } - if (proc_maps_task == RECORD_GLOBAL_DATA_LOCKED && - // grhat based on Red Hat Linux 9 - (strncmp(permissions, "rw-p", 4) == 0 || - // Fedora Core 3 - strncmp(permissions, "rwxp", 4) == 0) && - inode != 0) { - if (start_address >= end_address) abort(); - RecordGlobalDataLocked(proc_map_line + size, start_address, file_offset); - } + + if (start_address >= end_address) + abort(); + // Determine if any shared libraries are present. - if (strstr(filename, "lib") && strstr(filename, ".so")) { + if (inode != 0 && strstr(filename, "lib") && strstr(filename, ".so")) { saw_shared_lib = true; } + + if (proc_maps_task == DISABLE_LIBRARY_ALLOCS) { + if (inode != 0 && strncmp(permissions, "r-xp", 4) == 0) { + DisableLibraryAllocs(filename, + reinterpret_cast<void*>(start_address), + reinterpret_cast<void*>(end_address)); + } + } + + if (proc_maps_task == RECORD_GLOBAL_DATA_LOCKED) { + if (!RecordGlobalDataLocked(start_address, end_address, permissions, + file_offset, inode, filename)) { + HeapProfiler::MESSAGE( + -1, "HeapChecker: failed RECORD_GLOBAL_DATA_LOCKED on %s\n", + filename); + abort(); + } + } } fclose(fp); @@ -825,7 +1002,7 @@ void HeapLeakChecker::DisableChecksUp(int stack_frames) { if (!heap_checker_on) return; if (stack_frames < 1) abort(); void* stack[1]; - if (GetStackTrace(stack, 1, stack_frames) != 1) abort(); + if (GetStackTrace(stack, 1, stack_frames+1) != 1) abort(); DisableChecksAt(stack[0]); } @@ -840,7 +1017,7 @@ bool HeapLeakChecker::HaveDisabledChecksUp(int stack_frames) { if (!heap_checker_on) return false; if (stack_frames < 1) abort(); void* stack[1]; - if (GetStackTrace(stack, 1, stack_frames) != 1) abort(); + if (GetStackTrace(stack, 1, stack_frames+1) != 1) abort(); return HaveDisabledChecksAt(stack[0]); } @@ -865,14 +1042,14 @@ void HeapLeakChecker::DisableChecksIn(const char* pattern) { void* HeapLeakChecker::GetDisableChecksStart() { if (!heap_checker_on) return NULL; void* start_address; - if (GetStackTrace(&start_address, 1, 0) != 1) abort(); + if (GetStackTrace(&start_address, 1, 1) != 1) abort(); return start_address; } void HeapLeakChecker::DisableChecksToHereFrom(void* start_address) { if (!heap_checker_on) return; void* end_address; - if (GetStackTrace(&end_address, 1, 0) != 1) abort(); + if (GetStackTrace(&end_address, 1, 1) != 1) abort(); if (start_address > end_address) swap(start_address, end_address); DisableChecksFromTo(start_address, end_address, 10000); // practically no stack depth limit: diff --git a/src/heap-profiler.cc b/src/heap-profiler.cc index 45eb908..7cb7d56 100644 --- a/src/heap-profiler.cc +++ b/src/heap-profiler.cc @@ -869,10 +869,14 @@ void HeapProfiler::EarlyStartLocked() { // Our first allocation after registering our hook is treated specially by // RecordAlloc(); It looks at the stack and counts how many frames up we // are. First we record the current stack pointer. - void* here[1]; - GetStackTrace(here, 1, 0); - // This actually records the frame above this one. We take this into account - // in RecordAlloc. + // Note: The stacktrace implementations differ about how many args they + // fill when skip is non-zero. Safest just to reserve maxdepth space. + void* here[2]; + GetStackTrace(here, 2, 1); + // Skip the first frame. It points to the current offset within this + // function, which will have changed by the time we get to the malloc() + // call which triggers. Instead, we store our parent function's offset, + // which is shared by both us and by the malloc() call below. recordalloc_reference_stack_position_ = here[0]; done_first_alloc_ = false; // Initialization has not occured yet void* first_alloc = malloc(kFirstAllocationNumBytes); diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc index 1a42e6e..0260a34 100644 --- a/src/malloc_extension.cc +++ b/src/malloc_extension.cc @@ -249,6 +249,7 @@ void MallocExtension::GetHeapSample(string* result) { // TODO(menage) Get this working in google-perftools //DumpAddressMap(DebugStringWriter, result); + delete[] entries; } void MallocExtension::GetHeapGrowthStacks(std::string* result) { @@ -1,6 +1,6 @@ #! /usr/bin/perl -w -# Copyright (c) 2005, Google Inc. +# Copyright (c) 1998-2006, Google Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -68,21 +68,31 @@ use strict; use Getopt::Long; -# These are the external binaries we use. We hard-code in the path -# because some people have colorizing versions of these binaries which -# can cause trouble. -my $OBJDUMP = "/usr/bin/objdump"; -my $NM = "/usr/bin/nm"; -my $ADDR2LINE = "/usr/bin/addr2line"; +# These are the object tools we use, which come from various sources. +# We want to invoke them directly, rather than via users' aliases and/or +# search paths, because some people have colorizing versions of them that +# can cause trouble, or because the default versions on a system may not +# deal with 64-bit objects if 64-bit profiles are being analyzed. +# (However, we will default to the user's defaults if we can't find better.) +# See ConfigureObjTools near the bottom of this script. +my %obj_tool_map = ( + "objdump" => "objdump", + "nm" => "nm", + "addr2line" => "addr2line", +); my $DOT = "dot"; # leave non-absolute, since it may be in /usr/local my $GV = "gv"; +# There is a pervasive dependency on the length (in hex characters, i.e., +# nibbles) of an address, distinguishing between 32-bit and 64-bit profiles: +my $address_length = 8; # Hope for 32-bit, reset if 64-bit detected. + ##### Argument parsing ##### sub usage_string { return <<'EOF'; -Usage: pprof [options] <program> <profile> - Prints specified cpu- or heap-profile +Usage: pprof [options] <program> <profile> ... + Prints specified cpu- or heap-profile Options: --cum Sort by cumulative data @@ -111,7 +121,7 @@ Heap-Profile Options: --alloc_space Display allocated (mega)bytes --alloc_objects Display allocated objects --show_bytes Display space in bytes - --drop_negative Ignore negaive differences + --drop_negative Ignore negative differences Call-graph Options: --nodecount=<n> Show at most so many nodes [default=80] @@ -121,6 +131,12 @@ Call-graph Options: --ignore=<regexp> Ignore nodes matching <regexp> --scale=<n> Set GV scaling [default=0] +Miscellaneous: + --tools=<prefix> Prefix for object tool pathnames + --test Run unit tests + --help This message + --version Version information + Examples: pprof /bin/ls ls.prof @@ -140,9 +156,9 @@ EOF sub version_string { return <<'EOF' -pprof (part of google-perftools 0.1) +pprof (part of google-perftools 0.6) -Copyright 1998-2005 Google Inc. +Copyright 1998-2006 Google Inc. This is BSD licensed software; see the source for copying conditions and license information. @@ -151,14 +167,14 @@ PARTICULAR PURPOSE. EOF } -sub fatal { +sub usage { my $msg = shift; print STDERR "$msg\n\n"; print STDERR usage_string(); print STDERR "\nFATAL ERROR: $msg\n"; # just as a reminder exit(1); } - + $main::opt_help = 0; $main::opt_version = 0; @@ -194,6 +210,10 @@ $main::opt_show_bytes = 0; $main::opt_drop_negative = 0; $main::opt_interactive = 0; +$main::opt_tools = ""; +$main::opt_debug = 0; +$main::opt_test = 0; + # Are we printing a heap profile? $main::heap_profile = 0; @@ -229,7 +249,10 @@ GetOptions("help!" => \$main::opt_help, "alloc_objects!" => \$main::opt_alloc_objects, "show_bytes!" => \$main::opt_show_bytes, "drop_negative!" => \$main::opt_drop_negative, - ) || fatal("Invalid option(s)"); + "tools=s" => \$main::opt_tools, + "test!" => \$main::opt_test, + "debug!" => \$main::opt_debug, + ) || usage("Invalid option(s)"); # Deal with the standard --help and --version if ($main::opt_help) { @@ -255,7 +278,7 @@ if ($main::opt_inuse_space + $main::opt_inuse_objects + $main::opt_alloc_space + $main::opt_alloc_objects > 1) { - fatal("Specify at most on of --inuse/--alloc options"); + usage("Specify at most on of --inuse/--alloc options"); } # Check output granularities @@ -266,7 +289,7 @@ my $grains = $main::opt_files + 0; if ($grains > 1) { - fatal("Only specify one output granularity option"); + usage("Only specify one output granularity option"); } if ($grains == 0) { $main::opt_functions = 1; @@ -282,14 +305,30 @@ my $modes = $main::opt_gif + 0; if ($modes > 1) { - fatal("Only specify one output mode"); + usage("Only specify one output mode"); } if ($modes == 0) { $main::opt_text = 1; } -my $prog = shift || fatal("Did not specify program"); -my $pfile_arg = shift || fatal("Did not specify profile file"); +if ($main::opt_test) { + RunUnitTests(); + # Should not return + exit(1); +} + +# Binary name and profile arguments list +$main::prog = ""; +@main::pfile_args = (); + +$main::prog = shift || usage("Did not specify program"); +scalar(@ARGV) || usage("Did not specify profile file"); + +# Parse profile file/location arguments +foreach my $farg (@ARGV) { + unshift(@main::pfile_args, $farg); +} +ConfigureObjTools($main::prog); ##### Main section ##### @@ -298,16 +337,21 @@ $main::tmpfile_sym = "/tmp/pprof$$.sym"; $main::tmpfile_ps = "/tmp/pprof$$"; $main::next_tmpfile = 0; $main::collected_profile = undef; +@main::profile_files = (); +#$main::op_time = time(); $SIG{'INT'} = \&sighandler; -# Read profile data -my $pfile = FetchDynamicProfile($prog, $pfile_arg); -my $data = ReadProfile($prog, $pfile); +# Fetch all profile data +FetchDynamicProfiles(); + +# Read one profile, pick the last item on the list +my $data = ReadProfile($main::prog, pop(@main::profile_files)); my $profile = $data->{profile}; my $libs = $data->{libs}; # Info about main program and shared libraries # List of function names to skip $main::skip = (); +$main::skip_regexp = 'NOMATCH'; if ($main::heap_profile) { foreach my $name ('calloc', 'cfree', @@ -317,22 +361,33 @@ if ($main::heap_profile) { 'pvalloc', 'valloc', 'realloc', + 'do_malloc', + 'DoSampledAllocation', '__builtin_delete', '__builtin_new', '__builtin_vec_delete', '__builtin_vec_new') { $main::skip{$name} = 1; } + $main::skip_regexp = "TCMalloc"; } if ($main::lock_profile) { - foreach my $name ('Mutex::Unlock') { - $main::skip{$name} = 1; + foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') { + $main::skip{$vname} = 1; + } +} + +# Add additional profiles, if available. +if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $p = ReadProfile($main::prog, $pname)->{profile}; + $profile = AddProfile($profile, $p); } } # Subtract base from profile, if specified if ($main::opt_base ne '') { - my $base = ReadProfile($prog, $main::opt_base)->{profile}; + my $base = ReadProfile($main::prog, $main::opt_base)->{profile}; $profile = SubtractProfile($profile, $base); } @@ -369,13 +424,17 @@ if (!$main::opt_interactive) { } elsif ($main::opt_text) { PrintText($symbols, $flat, $cumulative, $total, -1); } else { - if (PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total)) { + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { - # Some versions of gv use -scale, and some use --scale. *sigh* - # We use --help to determine if gv expects one dash or two. - system("$GV --help >/dev/null 2>&1 " . - "&& $GV --scale=$main::opt_scale $main::tmpfile_ps " . - "|| $GV -scale $main::opt_scale $main::tmpfile_ps") + if (!system("$GV --version >/dev/null 2>&1")) { + # Options using double dash are supported by this gv version. + system("$GV --scale=$main::opt_scale " . + PsTempName($main::next_tmpfile)); + } else { + # Old gv version - only supports options that use single dash. + system("$GV -scale $main::opt_scale " . + PsTempName($main::next_tmpfile)); + } } } else { exit(1); @@ -390,108 +449,139 @@ exit(0); ##### Interactive helper routines ##### + sub InteractiveMode { $| = 1; # Make output unbuffered for interactive mode my $orig_profile = $profile; - while (1) { - print "(pprof) "; - $_ = <STDIN>; - if (!defined($_)) { - print "\n"; - last; - } - if (m/^ *quit/) { - last; + + # Use ReadLine if it's installed. + if ( defined(eval {require Term::ReadLine}) ) { + my $term = new Term::ReadLine 'pprof'; + while ( defined ($_ = $term->readline('(pprof) '))) { + $term->addhistory($_) if /\S/; + if (!InteractiveCommand($orig_profile, $_)) { + last; # exit when we get an interactive command to quit + } } - if (m/^ *help/) { - InteractiveHelpMessage(); - next; + } else { # don't have readline + while (1) { + print "(pprof) "; + $_ = <STDIN>; + if (!InteractiveCommand($orig_profile, $_)) { + last; # exit when we get an interactive command to quit + } } - # Clear all the options - $main::opt_lines = 0; - $main::opt_text = 0; - $main::opt_disasm = 0; - $main::opt_list = 0; - $main::opt_gv = 0; - $main::opt_cum = 0; + } +} + +# Takes two args: orig profile, and command to run. +# Returns 1 if we should keep going, or 0 if we were asked to quit +sub InteractiveCommand { + my($orig_profile, $command) = @_; + $_ = $command; # just to make future m//'s easier + if (!defined($_)) { + print "\n"; + return 0; + } + if (m/^ *quit/) { + return 0; + } + if (m/^ *help/) { + InteractiveHelpMessage(); + return 1; + } + # Clear all the options + $main::opt_lines = 0; + $main::opt_text = 0; + $main::opt_disasm = 0; + $main::opt_list = 0; + $main::opt_gv = 0; + $main::opt_cum = 0; - if (m/^ *(text|top)(\d*) *(.*)/) { - $main::opt_text = 1; + if (m/^ *(text|top)(\d*) *(.*)/) { + $main::opt_text = 1; - my $line_limit = ($2 ne "") ? int($2) : 10; + my $line_limit = ($2 ne "") ? int($2) : 10; - my $routine; - my $ignore; - ($routine, $ignore) = ParseInteractiveArgs($3); + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($3); - my $profile = ProcessProfile($orig_profile, "", $ignore); - my $reduced = ReduceProfile($symbols, $profile); + my $profile = ProcessProfile($orig_profile, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); - PrintText($symbols, $flat, $cumulative, $total, $line_limit); - next; - } - if (m/^ *list *(.+)/) { - $main::opt_list = 1; + PrintText($symbols, $flat, $cumulative, $total, $line_limit); + return 1; + } + if (m/^ *list *(.+)/) { + $main::opt_list = 1; - my $routine; - my $ignore; - ($routine, $ignore) = ParseInteractiveArgs($1); + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); - my $profile = ProcessProfile($orig_profile, "", $ignore); - my $reduced = ReduceProfile($symbols, $profile); + my $profile = ProcessProfile($orig_profile, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); - PrintListing($libs, $flat, $cumulative, $routine); - next; - } - if (m/^ *disasm *(.+)/) { - $main::opt_disasm = 1; + PrintListing($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^ *disasm *(.+)/) { + $main::opt_disasm = 1; - my $routine; - my $ignore; - ($routine, $ignore) = ParseInteractiveArgs($1); + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); - # Process current profile to account for various settings - my $profile = ProcessProfile($orig_profile, "", $ignore); - my $reduced = ReduceProfile($symbols, $profile); + # Process current profile to account for various settings + my $profile = ProcessProfile($orig_profile, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); - PrintDisassembly($libs, $flat, $cumulative, $routine); - next; - } - if (m/^ *gv *(.*)/) { - $main::opt_gv = 1; + PrintDisassembly($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^ *gv *(.*)/) { + $main::opt_gv = 1; - my $focus; - my $ignore; - ($focus, $ignore) = ParseInteractiveArgs($1); + my $focus; + my $ignore; + ($focus, $ignore) = ParseInteractiveArgs($1); - # Process current profile to account for various settings - my $profile = ProcessProfile($orig_profile, $focus, $ignore); - my $reduced = ReduceProfile($symbols, $profile); + # Process current profile to account for various settings + my $profile = ProcessProfile($orig_profile, $focus, $ignore); + my $reduced = ReduceProfile($symbols, $profile); - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); - if (PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total)) { - system("gv -scale $main::opt_scale -noresize " . - PsTempName($main::next_tmpfile) . " &"); - $main::next_tmpfile++; + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if (!system("$GV --version >/dev/null 2>&1")) { + # Options using double dash are supported by this gv version. + system("$GV --scale=$main::opt_scale --noresize " . + PsTempName($main::next_tmpfile) . " &"); + } else { + # Old gv version - only supports options that use single dash. + system("$GV -scale $main::opt_scale -noresize " . + PsTempName($main::next_tmpfile) . " &"); } - next; + $main::next_tmpfile++; } + return 1; } + return 1; } @@ -503,19 +593,23 @@ sub ProcessProfile { # Process current profile to account for various settings my $profile = $orig_profile; my $total_count = TotalProfile($profile); - print "Total: ", $total_count, " samples\n"; + printf("Total: %s %s\n", Unparse($total_count), Units()); if ($focus ne '') { $profile = FocusProfile($symbols, $profile, $focus); my $focus_count = TotalProfile($profile); - printf "After focusing on '%s': %d samples of %d (%0.1f%%)\n", - $focus, $focus_count, $total_count, ($focus_count*100.0) / $total_count; + printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", + $focus, + Unparse($focus_count), Units(), + Unparse($total_count), ($focus_count*100.0) / $total_count); } if ($ignore ne '') { $profile = IgnoreProfile($symbols, $profile, $ignore); my $ignore_count = TotalProfile($profile); - printf "After ignoring '%s': %d samples of %d (%0.1f%%)\n", - $ignore, $ignore_count, $total_count, - ($ignore_count*100.0) / $total_count; + printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", + $ignore, + Unparse($ignore_count), Units(), + Unparse($total_count), + ($ignore_count*100.0) / $total_count); } return $profile; @@ -531,12 +625,12 @@ Commands: Show graphical hierarchical display of current profile. Without any arguments, shows all samples in the profile. With the optional "focus" argument, restricts the samples shown to just those where - the "focus" regular expression matches a routine name on the stack + the "focus" regular expression matches a routine name on the stack trace. list [routine_regexp] [-ignore1] [-ignore2] Show source listing of routines whose names match "routine_regexp" - + top [--cum] [-ignore1] [-ignore2] top20 [--cum] [-ignore1] [-ignore2] top37 [--cum] [-ignore1] [-ignore2] @@ -585,7 +679,7 @@ sub PsTempName { my $fnum = shift; return "$main::tmpfile_ps" . "." . "$fnum" . ".ps"; } - + # Print text output sub PrintText { my $symbols = shift; @@ -636,24 +730,27 @@ sub PrintDisassembly { foreach my $lib (@{$libs}) { my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); - my $offset = $lib->[1] - $lib->[3]; - foreach my $routine (keys(%{$symbol_table})) { + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { my $start_addr = $symbol_table->{$routine}->[0]; my $end_addr = $symbol_table->{$routine}->[1]; # See if there are any samples in this routine my $total_flat = 0; my $total_cum = 0; - for (my $addr = $start_addr; $addr < $end_addr; $addr++) { - $total_flat += GetEntry($flat, sprintf("0x%x", $addr+$offset)); - $total_cum += GetEntry($cumulative, sprintf("0x%x", $addr+$offset)); + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + $total_cum += GetEntry($cumulative, $addr); + $total_flat += GetEntry($flat, $addr); + $addr = AddressInc($addr); } # Skip disassembly if there are no samples in routine next if ($total_cum == 0); print "ROUTINE ====================== $routine\n"; - printf "%6s %6s Total samples (flat / cumulative)\n", - Unparse($total_flat), Unparse($total_cum); + printf "%6s %6s Total %s (flat / cumulative)\n", + Unparse($total_flat), Unparse($total_cum), Units(); my @instructions = Disassemble($lib->[0], $offset, $start_addr, $end_addr); @@ -666,7 +763,8 @@ sub PrintDisassembly { } my $f = GetEntry($flat, $e->[0]); my $c = GetEntry($cumulative, $e->[0]); - my $address = $e->[0]; $address =~ s/^0x//; + my $address = $e->[0]; + $address =~ s/^0x//; printf("%6s %6s %-20s %8s: %6s\n", UnparseAlt($f), UnparseAlt($c), @@ -689,10 +787,11 @@ sub Disassemble { my $start_addr = shift; my $end_addr = shift; - my $cmd = sprintf("$OBJDUMP -d -l --no-show-raw-insn " . - "--start-address=%d --stop-address=%d $prog", - $start_addr, $end_addr); - open(OBJDUMP, "$cmd |") || error("$OBJDUMP: $!\n"); + my $objdump = $obj_tool_map{"objdump"}; + my $cmd = sprintf("$objdump -d -l --no-show-raw-insn " . + "--start-address=0x$start_addr " . + "--stop-address=0x$end_addr $prog"); + open(OBJDUMP, "$cmd |") || error("$objdump: $!\n"); my @result = (); my $filename = ""; my $linenumber = -1; @@ -704,8 +803,9 @@ sub Disassemble { $filename = $1; $linenumber = $2; } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { - # Disassembly line - my $k = sprintf("0x%x", hex($1) + $offset); + # Disassembly line -- zero-extend address to full length + my $addr = HexExtend($1); + my $k = AddressAdd($addr, $offset); push(@result, [$k, $filename, $linenumber, $2]); } } @@ -727,18 +827,21 @@ sub PrintListing { foreach my $lib (@{$libs}) { my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); - my $offset = $lib->[1] - $lib->[3]; + my $offset = AddressSub($lib->[1], $lib->[3]); foreach my $routine (sort ByName keys(%{$symbol_table})) { # Print if there are any samples in this routine my $start_addr = $symbol_table->{$routine}->[0]; my $end_addr = $symbol_table->{$routine}->[1]; - for (my $addr = $start_addr; $addr < $end_addr; $addr++) { - if (defined($cumulative->{sprintf("0x%x", $addr+$offset)})) { + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { PrintSource($lib->[0], $offset, $routine, $flat, $cumulative, $start_addr, $end_addr); last; } + $addr = AddressInc($addr); } } } @@ -794,7 +897,7 @@ sub PrintSource { } } - # Hack 3: Extend last line forward until its indentation is less than + # Hack 3: Extend last line forward until its indentation is less than # the indentation we saw on $firstline my $oldlastline = $lastline; { @@ -808,17 +911,17 @@ sub PrintSource { $l++; my $indent = Indentation($_); if ($l >= $firstline) { - if ($first_indentation < 0 && $indent >= 0) { - $first_indentation = $indent; - last if ($first_indentation == 0); - } + if ($first_indentation < 0 && $indent >= 0) { + $first_indentation = $indent; + last if ($first_indentation == 0); + } } if ($l >= $lastline && $indent >= 0) { - if ($indent >= $first_indentation) { - $lastline = $l+1; - } else { - last; - } + if ($indent >= $first_indentation) { + $lastline = $l+1; + } else { + last; + } } } close(FILE); @@ -871,8 +974,8 @@ sub PrintSource { my $l = 0; while (<FILE>) { $l++; - if ($l >= $firstline - 5 && - (($l <= $oldlastline + 5) || ($l <= $lastline))) { + if ($l >= $firstline - 5 && + (($l <= $oldlastline + 5) || ($l <= $lastline))) { chop; my $text = $_; printf("%6s %6s %4d: %s\n", @@ -1000,20 +1103,21 @@ sub PrintDot { # Get edges and counts per edge my %edge = (); + my $n; foreach my $k (keys(%{$raw})) { # TODO: omit low %age edges - my $n = $raw->{$k}; + $n = $raw->{$k}; my @addrs = split(/\n/, $k); for (my $i = 1; $i <= $#addrs; $i++) { my $src = OutputKey($symbols, $addrs[$i]); my $dst = OutputKey($symbols, $addrs[$i-1]); #next if ($src eq $dst); # Avoid self-edges? if (exists($node{$src}) && exists($node{$dst})) { - my $e = "$src\001$dst"; - if (!exists($edge{$e})) { - $edge{$e} = 0; + my $edge_label = "$src\001$dst"; + if (!exists($edge{$edge_label})) { + $edge{$edge_label} = 0; } - $edge{$e} += $n; + $edge{$edge_label} += $n; } } } @@ -1021,7 +1125,7 @@ sub PrintDot { # Print edges foreach my $e (keys(%edge)) { my @x = split(/\001/, $e); - my $n = $edge{$e}; + $n = $edge{$e}; if (abs($n) > $edgelimit) { # Compute line width based on edge count @@ -1054,8 +1158,10 @@ sub OutputKey { my $a = shift; # Skip large addresses since they sometimes show up as fake entries on RH9 - if (hex($a) > 0x7fffffff) { - return ''; + if (length($a) > 8) { + if ($a gt "7fffffffffffffff") { return ''; } + } else { + if (hex($a) > 0x7fffffff) { return ''; } } # Extract symbolic info for address @@ -1069,7 +1175,7 @@ sub OutputKey { } # We drop a few well-known names - if ($main::skip{$func}) { + if ($main::skip{$func} || ($func =~ m/$main::skip_regexp/)) { return ''; } @@ -1114,6 +1220,8 @@ sub Unparse { return sprintf("%.1f", $num / 1048576.0); } } + } elsif ($main::lock_profile) { + return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds } else { return sprintf("%d", $num); } @@ -1141,6 +1249,8 @@ sub Units { return "MB"; } } + } elsif ($main::lock_profile) { + return "seconds"; } else { return "samples"; } @@ -1264,6 +1374,25 @@ sub TotalProfile { return $result; } +# Add A to B +sub AddProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + my $v = $A->{$k}; + AddEntry($R, $k, $v); + } + # add all keys in B + foreach my $k (keys(%{$B})) { + my $v = $B->{$k}; + AddEntry($R, $k, $v); + } + return $R; +} + # Subtract B from A sub SubtractProfile { my $A = shift; @@ -1278,7 +1407,7 @@ sub SubtractProfile { AddEntry($R, $k, $v); } if (!$main::opt_drop_negative) { - # take care of when substracted profile has more things + # Take care of when subtracted profile has more entries foreach my $k (keys(%{$B})) { if (!exists($A->{$k})) { AddEntry($R, $k, 0 - $B->{$k}); @@ -1310,16 +1439,90 @@ sub AddEntry { $profile->{$k} += $n; } +# Add a stack of entries to specified profile, and add them to the $pcs +# list. +sub AddEntries { + my $profile = shift; + my $pcs = shift; + my $stack = shift; + my $count = shift; + my @k = (); + + foreach my $e (split(/\s+/, $stack)) { + my $pc = HexExtend($e); + $pcs->{$pc} = 1; + push @k, $pc; + } + AddEntry($profile, (join "\n", @k), $count); +} + ##### Code to profile a server dynamically ##### sub FetchDynamicProfile { my $binary_name = shift; my $profile_name = shift; + my $fetch_name_only = shift; + my $encourage_patience = shift; # TODO: Add support for fetching profiles dynamically from a server return $profile_name; } +# Collect profiles in parallel +sub FetchDynamicProfiles { + my $items = scalar(@main::pfile_args); + my $levels = log($items) / log(2); + + if ($items == 1) { + $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1); + } else { + # math rounding issues + if ((2 ** $levels) < $items) { + $levels++; + } + my $count = scalar(@main::pfile_args); + for (my $i = 0; $i < $count; $i++) { + $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0); + } + print STDERR "Fetching $count profiles, Be patient...\n"; + FetchDynamicProfilesRecurse($levels, 0, 0); + $main::collected_profile = join(" \\\n ", @main::profile_files); + } +} + +# Recursively fork a process to get enough processes +# collecting profiles +sub FetchDynamicProfilesRecurse { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if (my $pid = fork()) { + $position = 0 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + wait; + } else { + $position = 1 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + exit(0); + } +} + +# Collect a single profile +sub TryCollectProfile { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if ($level >= ($maxlevel - 1)) { + if ($position < scalar(@main::pfile_args)) { + FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0); + } + } else { + FetchDynamicProfilesRecurse($maxlevel, $level+1, $position); + } +} + ##### Parsing code ##### # Parse profile generated by common/profiler.cc and return a reference @@ -1328,7 +1531,7 @@ sub FetchDynamicProfile { # $result->{period} Sampling period (in microseconds) # $result->{profile} Profile object # $result->{map} Memory map info from profile -# $result->{pcs} List of all PC values seen +# $result->{pcs} Hash of all PC values seen, key is hex address sub ReadProfile { my $prog = shift; my $fname = shift; @@ -1371,6 +1574,12 @@ sub ReadCPUProfile { my $nbytes = read(PROFILE, $str, (stat PROFILE)[7]); # read entire file close(PROFILE); + my $version; + my $period; + my $i; + my $profile = {}; + my $pcs = {}; + # Parse string into array of slots. # L! is needed for 64-bit # platforms, but not supported on 5.005 # (despite the manpage claims) @@ -1384,37 +1593,95 @@ sub ReadCPUProfile { my @slots = unpack($format, $str); - # Read header - if ($#slots < 1 || $slots[0] != 0 || $slots[1] < 3) { + # Read header. The current header version is a 5-element structure + # containing: + # 0: header count (always 0) + # 1: header "words" (after this one: 3) + # 2: format version (0) + # 3: sampling period (usec) + # 4: unused padding (always 0) + # The header words are 32-bit or 64-bit depending on the ABI of the program + # that generated the profile. In the 64-bit case, since our x86-architecture + # machines are little-endian, the actual value of each of these elements is + # in the first 32-bit word, and the second is always zero. The @slots array + # above was read as a sequence of 32-bit words in both cases, so we need to + # explicitly check for both cases. A typical slot sequence for each is: + # 32-bit: 0 3 0 100 0 + # 64-bit: 0 0 3 0 0 0 100 0 0 0 + # + if ($#slots < 4 || $slots[0] != 0 ) { error("$fname: not a profile file, or old format profile file\n"); } - my $version = $slots[2]; - my $period = $slots[3]; - my $i = 2 + $slots[1]; + if ($slots[1] >= 3) { + # Normal 32-bit header: + $version = $slots[2]; + $period = $slots[3]; + $i = 2 + $slots[1]; + $address_length = 8; + + # Parse profile + while ($i <= $#slots) { + my $n = $slots[$i++]; + my $d = $slots[$i++]; + if ($slots[$i] == 0) { + # End of profile data marker + $i += $d; + last; + } - # Parse profile - my $profile = {}; - my $pcs = {}; - while ($i <= $#slots) { - my $n = $slots[$i++]; - my $d = $slots[$i++]; - if ($slots[$i] == 0) { - # End of profile data marker + # Make key out of the stack entries + my @k = (); + for (my $j = 0; $j < $d; $j++) { + my $pc = sprintf("%08x", $slots[$i+$j]); + $pcs->{$pc} = 1; + push @k, $pc; + } + + AddEntry($profile, (join "\n", @k), $n); $i += $d; - last; } - # Make key out of the stack entries - my $k = ""; - for (my $j = 0; $j < $d; $j++) { - my $pc = $slots[$i+$j]; - $pcs->{$pc} = 1; - $k .= sprintf("\n0x%x", $pc); - } - $k =~ s/^\n//; + # Normal 64-bit header: All entries are doubled in size. The first + # word (little-endian) should contain the real value, the second should + # be zero. + } elsif ($#slots < 9 || $slots[1] != 0 || $slots[2] < 3 || $slots[3] != 0 + || $slots[5] != 0 || $slots[7] != 0) { + error("$fname: not a profile file, or old format profile file\n"); + } else { + $version = $slots[4]; + $period = $slots[6]; + $i = 4 + 2 * $slots[2]; + $address_length = 16; + + # Parse profile + while ($i <= $#slots) { + my $n = $slots[$i++]; + my $nhi = $slots[$i++]; + # Huge counts may coerce to floating point, keeping scale, not precision + if ($nhi != 0) { $n += $nhi*(2**32); } + my $d = $slots[$i++]; + if ($slots[$i++] != 0) { + my $addr = sprintf("%o", 4 * $i); + print STDERR "At index $i ($addr):\n"; + error("$fname: stack trace depth >= 2**32\n"); + } + if ($slots[$i] == 0 && $slots[$i+1] == 0) { + # End of profile data marker + $i += 2 * $d; + last; + } - AddEntry($profile, $k, $n); - $i += $d; + # Make key out of the stack entries + my @k = (); + for (my $j = $d; $j--; ) { + my $pclo = $slots[$i++]; + my $pchi = $slots[$i++]; + my $pc = sprintf("%08x%08x", $pchi, $pclo); + $pcs->{$pc} = 1; + push @k, $pc; + } + AddEntry($profile, (join "\n", @k), $n); + } } # Parse map @@ -1458,6 +1725,7 @@ sub ReadHeapProfile { my $profile = {}; my $pcs = {}; my $map = ""; + while (<PROFILE>) { if (/^MAPPED_LIBRARIES:/) { # Read the /proc/self/maps data @@ -1493,15 +1761,7 @@ sub ReadHeapProfile { my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); my @counts = ($n1, $s1, $n2, $s2); - my $n = $counts[$index]; - my $k = ""; - foreach my $e (split(/\s+/, $stack)) { - my $pc = hex($e); - $pcs->{$pc} = 1; - $k .= sprintf("\n0x%x", $pc); - } - $k =~ s/^\n//; - AddEntry($profile, $k, $n); + AddEntries($profile, $pcs, $stack, $counts[$index]); } } @@ -1516,28 +1776,33 @@ sub ReadHeapProfile { sub ReadSynchProfile { my ($prog, $fname, $header) = @_; - my ($line, $map, $pc, @k, $count, $stack); - $map = ''; + my $map = ''; my $profile = {}; my $pcs = {}; my $sampling_period = 1; + my $cyclespernanosec = 2.8; # Default assumption for old binaries + my $seen_clockrate = 0; + my $line; while ( $line = <PROFILE> ) { if ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ || $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) { - # Sample entry - ($count, $stack) = ($1, $2); - $count = 1 if $count !~ /^\d+$/; - - @k = (); - foreach $pc (split /\s+/, $stack) { - $pcs->{hex($pc)} = 1; - push @k, $pc; + my ($count, $stack) = ($1, $2); + if ($count !~ /^\d+$/) { + next; } - AddEntry($profile, (join "\n", @k), $count); + + # Convert cycles to nanoseconds + $count /= $cyclespernanosec; + AddEntries($profile, $pcs, $stack, $count); + + } elsif ( $line =~ m|cycles/second = (\d+)|) { + $cyclespernanosec = $1 / 1e9; + $seen_clockrate = 1; } elsif ( $line =~ /sampling period = (\d+)/ ) { $sampling_period = $1; + } else { # Memory map entry $map .= $line; @@ -1545,15 +1810,30 @@ sub ReadSynchProfile { } close PROFILE; + if (!$seen_clockrate) { + printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", + $cyclespernanosec); + } + my $r = {}; - $r->{'version'} = 0; - $r->{'period'} = $sampling_period; - $r->{'profile'} = $profile; - $r->{'libs'} = ParseLibraries($prog, $map, $pcs); - $r->{'pcs'} = $pcs; + $r->{version} = 0; + $r->{period} = $sampling_period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; return $r; } +# Given a hex value in the form "0x1abcd" return "0001abcd" or +# "000000000001abcd", depending on the current address length. +# There's probably a more idiomatic (or faster) way to do this... +sub HexExtend { + my $addr = shift; + + $addr =~ s/^0x//; + return substr("000000000000000".$addr, -$address_length); +} + ##### Symbol extraction ##### # Split /proc/pid/maps dump into a list of libraries @@ -1564,7 +1844,14 @@ sub ParseLibraries { my $result = []; my $h = "[a-f0-9]+"; + my $zero_offset = HexExtend("0"); + + my $buildvar = ""; foreach my $l (split("\n", $map)) { + if ($l =~ m/^\s*build=(.*)$/) { + $buildvar = $1; + } + my $start; my $finish; my $offset; @@ -1572,32 +1859,40 @@ sub ParseLibraries { if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.so(\.\d+)*\w*)/) { # Full line from /proc/self/maps. Example: # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so - $start = hex($1); - $finish = hex($2); - $offset = hex($3); + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); $lib = $4; } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { # Cooked line from DumpAddressMap. Example: # 40000000-40015000: /lib/ld-2.3.2.so - $start = hex($1); - $finish = hex($2); - $offset = 0; + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; $lib = $3; } else { next; } + # Expand "$build" variable if available + $lib =~ s/\$build\b/$buildvar/g; + # Get objdump output from the library file to figure out how to # map between mapped addresses and addresses in the library. - open(OBJDUMP, "$OBJDUMP -h $lib |") || error("$OBJDUMP $lib: $!\n"); + my $objdump = $obj_tool_map{"objdump"}; + open(OBJDUMP, "$objdump -h $lib |") + || error("$objdump $lib: $!\n"); while (<OBJDUMP>) { # Idx Name Size VMA LMA File off Algn # 10 .text 00104b2c 420156f0 420156f0 000156f0 2**4 + # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file + # offset may still be 8. But AddressSub below will still handle that. my @x = split; if (($#x >= 6) && ($x[1] eq '.text')) { - my $vma = hex($x[3]); - my $file_offset = hex($x[5]); - $offset += $vma - $file_offset; + my $vma = $x[3]; + my $file_offset = $x[5]; + my $vma_offset = AddressSub($vma, $file_offset); + $offset = AddressAdd($offset, $vma_offset); last; } } @@ -1607,15 +1902,173 @@ sub ParseLibraries { } # Append special entry for the main program - my $max_pc = 0; + my $max_pc = "0"; foreach my $pc (keys(%{$pcs})) { - if ($pc > $max_pc) { $max_pc = $pc; } + if ($pc gt $max_pc) { $max_pc = $pc; } } - push(@{$result}, [$prog, 0, $max_pc, 0]); + push(@{$result}, [$prog, $zero_offset, $max_pc, $zero_offset]); return $result; } +# Add two hex addresses of length $address_length. +# Run pprof --test for unit test if this is changed. +sub AddressAdd { + my $addr1 = shift; + my $addr2 = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + + if ($main::opt_debug and $main::opt_test) { + print STDERR "AddressAdd $addr1 + $addr2 = "; + } + + my $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + my $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2); + my $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + my $r = sprintf("%07x", $sum); + + $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2) + $c; + $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + $r = sprintf("%07x", $sum) . $r; + + $sum = hex($addr1) + hex($addr2) + $c; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; } + + return $r; + } +} + + +# Subtract two hex addresses of length $address_length. +# Run pprof --test for unit test if this is changed. +sub AddressSub { + my $addr1 = shift; + my $addr2 = shift; + my $diff; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $diff); + + } else { + # Do the addition in 7-nibble chunks to trivialize borrow handling. + # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; } + + my $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + my $a2 = hex(substr($addr2,-7)); + $addr2 = substr($addr2,0,-7); + my $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + my $r = sprintf("%07x", $diff); + + $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + $a2 = hex(substr($addr2,-7)) + $b; + $addr2 = substr($addr2,0,-7); + $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + $r = sprintf("%07x", $diff) . $r; + + $a1 = hex($addr1); + $a2 = hex($addr2) + $b; + if ($a2 > $a1) { $a1 += 0x100; } + $diff = $a1 - $a2; + $r = sprintf("%02x", $diff) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + + return $r; + } +} + +# Increment a hex addresses of length $address_length. +# Run pprof --test for unit test if this is changed. +sub AddressInc { + my $addr = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr)+1) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + # We are always doing this to step through the addresses in a function, + # and will almost never overflow the first chunk, so we check for this + # case and exit early. + + # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; } + + my $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + my $r = sprintf("%07x", $sum); + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "0000000"; + } + + $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + $r = sprintf("%07x", $sum) . $r; + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "00000000000000"; + } + + $sum = hex($addr) + 1; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + return $r; + } +} + # Extract symbols for all PC values found in profile sub ExtractSymbols { my $libs = shift; @@ -1635,13 +2088,13 @@ sub ExtractSymbols { # Get list of pcs that belong in this library. my $contained = []; foreach my $pc (keys(%{$pcset})) { - if (!$seen{$pc} && ($pc >= $start) && ($pc <= $finish)) { + if (!$seen{$pc} && ($pc ge $start) && ($pc le $finish)) { $seen{$pc} = 1; push(@{$contained}, $pc); } } # Map to symbols - MapToSymbols($libname, $start - $offset, $contained, $symbols); + MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); } return $symbols; @@ -1675,13 +2128,15 @@ sub GetLineNumbers { # Make file with all PC values open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n"); for (my $i = 0; $i <= $#{$pclist}; $i++) { - printf ADDRESSES ("0x%x\n", $pclist->[$i] - $offset); + # addr2line always reads hex addresses, and does not need '0x' prefix. + printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset)); } close(ADDRESSES); # Pass to addr2line - open(SYMBOLS, "$ADDR2LINE -f -C -e $image <$main::tmpfile_sym |") - || error("$ADDR2LINE: $!\n"); + my $addr2line = $obj_tool_map{"addr2line"}; + open(SYMBOLS, "$addr2line -f -C -e $image <$main::tmpfile_sym |") + || error("$addr2line: $!\n"); my $count = 0; while (<SYMBOLS>) { chop; @@ -1694,7 +2149,7 @@ sub GetLineNumbers { $filelinenum =~ s|^.*/([^/]+:\d+)$|$1|; # Remove directory name } - my $pcstr = sprintf("0x%x", $pclist->[$count]); + my $pcstr = $pclist->[$count]; if (defined($symbols->{$pcstr})) { # Override just the line-number portion. The function name portion # is less buggy when computed using nm instead of addr2line. @@ -1708,7 +2163,7 @@ sub GetLineNumbers { close(SYMBOLS); } -# Alternate implementation +# Use nm to map the list of referenced PCs to symbols sub MapSymbolsWithNM { my $image = shift; my $offset = shift; @@ -1717,14 +2172,15 @@ sub MapSymbolsWithNM { # Get nm output sorted by increasing address my $symbol_table = GetProcedureBoundaries($image, "."); - my @names = sort { $symbol_table->{$a}->[0] <=> $symbol_table->{$b}->[0] } + # Start addresses are already the right length (8 or 16 hex digits). + my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] } keys(%{$symbol_table}); if ($#names < 0) { - # No symbols: just use address + # No symbols: just use addresses foreach my $pc (@{$pclist}) { - my $pcstr = sprintf("0x%x", $pc); - $symbols->{$pcstr} = [$pcstr, "?", $pcstr]; + my $pcstr = "0x" . $pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; } return; } @@ -1733,22 +2189,21 @@ sub MapSymbolsWithNM { my $index = 0; my $fullname = $names[0]; my $name = ShortFunctionName($fullname); - foreach my $pc (sort { $a <=> $b } @{$pclist}) { + foreach my $pc (sort { $a cmp $b } @{$pclist}) { # Adjust for mapped offset - my $mpc = $pc - $offset; - while (($index < $#names) && ($mpc >= $symbol_table->{$fullname}->[1])){ + my $mpc = AddressSub($pc, $offset); + while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){ $index++; $fullname = $names[$index]; $name = ShortFunctionName($fullname); } - my $pcstr = sprintf("0x%x", $pc); - $symbols->{$pcstr} = [$name, "?", $fullname]; + $symbols->{$pc} = [$name, "?", $fullname]; } } sub ShortFunctionName { my $function = shift; - while ($function =~ s/\([^()]*\)//g) { } # Remove argument types + while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types while ($function =~ s/<[^<>]*>//g) { } # Remove template arguments $function =~ s/^.*\s+(\w+::)/$1/; # Remove leading type return $function; @@ -1756,6 +2211,70 @@ sub ShortFunctionName { ##### Miscellaneous ##### +# Find the right versions of the above object tools to use. The argument +# is the program file being analyzed, and should be an ELF 32-bit or ELF +# 64-bit executable file. If it is, we select the default tools location +# based on that; otherwise, we'll emit a warning and take the user's +# defaults. +sub ConfigureObjTools { + my $prog_file = shift; + + # Figure out the right default pathname prefix based on the program file + # type: + my $default_prefix = "/usr/bin/"; + my $file_type = `/usr/bin/file $prog_file`; + if ($file_type =~ /ELF 32-bit/) { + $default_prefix = "/usr/bin/"; + } elsif ($file_type =~ /ELF 64-bit/) { + # Change $address_length to 16 if the program file is ELF 64-bit. + # We can't detect this from many (most?) heap or lock contention + # profiles, since the actual addresses referenced are generally in low + # memory even for 64-bit programs. + $address_length = 16; + } else { + print STDERR "WARNING: program $prog_file is apparently not an ELF file\n"; + # Don't change the default prefix. + } + + # Go fill in %obj_tool_map with the pathnames to use: + foreach my $tool (keys %obj_tool_map) { + $obj_tool_map{$tool} = ConfigureTool($tool, $default_prefix); + } +} + +sub ConfigureTool { + my $tool = shift; + my $prefix = shift; + my $path; + + if ($main::opt_debug) { + print STDERR "Configuring '$tool' with default prefix '$prefix'\n"; + } + + # Try a specific prefix specified by the user: + if ($main::opt_tools ne "") { + $path = $main::opt_tools . $tool; + if ($main::opt_debug) { print STDERR " (a) Trying '$path'\n"; } + if (-x $path) { return $path; } + } + + # Try the default prefix passed to us: + $path = $prefix . $tool; + if ($main::opt_debug) { print STDERR " (b) Trying '$path'\n"; } + if (-x $path) { return $path; } + + # Try the normal system default (/usr/bin/): + if ($prefix ne "/usr/bin") { + $path = "/usr/bin/$tool"; + if ($main::opt_debug) { print STDERR " (c) Trying '$path'\n"; } + if (-x $path) { return $path; } + } + + # If all else fails, hope the bare toolname works: + if ($main::opt_debug) { print STDERR " Returning '$tool'\n"; } + return $tool; +} + sub cleanup { unlink($main::tmpfile_sym); for (my $i = 0; $i < $main::next_tmpfile; $i++) { @@ -1763,12 +2282,15 @@ sub cleanup { } # We leave any collected profiles in $HOME/pprof in case the user wants # to look at them later. We print a message informing them of this. - if (defined($main::collected_profile)) { - print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; + if ((scalar(@main::profile_files) > 0) && + defined($main::collected_profile)) { + if (scalar(@main::profile_files) == 1) { + print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; + } print STDERR "If you want to investigate this profile further, you can do:\n"; print STDERR "\n"; print STDERR " pprof \\\n"; - print STDERR " $prog \\\n"; + print STDERR " $main::prog \\\n"; print STDERR " $main::collected_profile\n"; print STDERR "\n"; } @@ -1786,16 +2308,6 @@ sub error { exit(1); } -# Return a list of all routines that match $regexp. -# For each routine, the following list is returned: -# $result->[i]->[0] Routine name -# $result->[i]->[1] Start address -# $result->[i]->[2] Finish address -# $result->[i]->[3] Image file name (program or shared library) -# $result->[i]->[4] Offset for image in address space -sub GetMatchingRoutines { -} - # Gets the procedure boundaries for all routines in "$image" whose names # match "$regexp" and returns them in a hashtable mapping from procedure @@ -1805,15 +2317,16 @@ sub GetProcedureBoundaries { my $regexp = shift; my $symbol_table = {}; - open(NM, "$NM -C -n $image |") || error("$NM: $!\n"); - my $last_start = "0x0"; + my $nm = $obj_tool_map{"nm"}; + open(NM, "$nm -C -n $image |") || error("$nm: $!\n"); + my $last_start = "0"; my $routine = ""; while (<NM>) { if (m/^([0-9a-f]+) . (..*)/) { my $start_val = $1; my $this_routine = $2; if (defined($routine) && $routine =~ m/$regexp/) { - $symbol_table->{$routine} = [hex($last_start), hex($start_val)]; + $symbol_table->{$routine} = [$last_start, $start_val]; } $last_start = $start_val; $routine = $this_routine; @@ -1823,3 +2336,218 @@ sub GetProcedureBoundaries { return $symbol_table; } + + +# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. +# To make them more readable, we add underscores at interesting places. +# This routine removes the underscores, producing the canonical representation +# used by pprof to represent addresses, particularly in the tested routines. +sub CanonicalHex { + my $arg = shift; + return join '', (split '_',$arg); +} + + +# Unit test for AddressAdd: +sub AddressAddUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd ($row->[0], $row->[1]); + if ($sum ne $row->[2]) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + my $expected = join '', (split '_',$row->[2]); + if ($sum ne CanonicalHex($row->[2])) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressSub: +sub AddressSubUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub ($row->[0], $row->[1]); + if ($sum ne $row->[3]) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + if ($sum ne CanonicalHex($row->[3])) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressInc: +sub AddressIncUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc ($row->[0]); + if ($sum ne $row->[4]) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc (CanonicalHex($row->[0])); + if ($sum ne CanonicalHex($row->[4])) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Driver for unit tests. +# Currently just the address add/subtract/increment routines for 64-bit. +sub RunUnitTests { + my $error_count = 0; + + # This is a list of tuples [a, b, a+b, a-b, a+1] + my $unit_test_data_8 = [ + [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], + [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], + [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], + [qw(00000001 ffffffff 00000000 00000002 00000002)], + [qw(00000001 fffffff0 fffffff1 00000011 00000002)], + ]; + my $unit_test_data_16 = [ + # The implementation handles data in 7-nibble chunks, so those are the + # interesting boundaries. + [qw(aaaaaaaa 50505050 + 00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)], + [qw(50505050 aaaaaaaa + 00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)], + [qw(ffffffff aaaaaaaa + 00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)], + [qw(00000001 ffffffff + 00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)], + [qw(00000001 fffffff0 + 00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)], + + [qw(00_a00000a_aaaaaaa 50505050 + 00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)], + [qw(0f_fff0005_0505050 aaaaaaaa + 0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)], + [qw(00_000000f_fffffff 01_800000a_aaaaaaa + 01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)], + [qw(00_0000000_0000001 ff_fffffff_fffffff + 00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)], + [qw(00_0000000_0000001 ff_fffffff_ffffff0 + ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)], + ]; + + $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16); + if ($error_count > 0) { + print STDERR $error_count, " errors: FAILED\n"; + } else { + print STDERR "PASS\n"; + } + exit ($error_count); +} diff --git a/src/profiler.cc b/src/profiler.cc index 7197d42..3633f6f 100644 --- a/src/profiler.cc +++ b/src/profiler.cc @@ -1,10 +1,10 @@ // Copyright (c) 2005, Google Inc. // All rights reserved. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above @@ -14,7 +14,7 @@ // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -96,7 +96,7 @@ DEFINE_string(cpu_profile, "", typedef struct siginfo SigStructure; inline void* GetPC(const SigStructure& sig_structure ) { return (void*)sig_structure.si_faddr; // maybe not correct -} +} #elif defined HAVE_STRUCT_SIGCONTEXT_SC_EIP typedef struct sigcontext SigStructure; @@ -126,6 +126,13 @@ inline void* GetPC(const SigStructure& sig_structure ) { typedef struct ucontext SigStructure; inline void* GetPC(const SigStructure& sig_structure ) { return (void*)sig_structure.uc_mcontext.gregs[REG_RIP]; +} + +#elif defined HAVE_STRUCT_SIGCONTEXT_REGS__NIP +typedef struct sigcontext SigStructure; +inline void* GetPC(const SigStructure& sig_structure ) { + return (void*)sig_structure.regs->nip; +} #else #error I dont know what your PC is @@ -141,7 +148,7 @@ class ProfileData { // Is profiling turned on at all inline bool enabled() { return out_ >= 0; } - + // What is the frequency of interrupts (ticks per second) inline int frequency() { return frequency_; } @@ -156,7 +163,7 @@ class ProfileData { void Stop(); void GetCurrentState(ProfilerState* state); - + private: static const int kMaxStackDepth = 64; // Max stack depth profiled static const int kMaxFrequency = 4000; // Largest allowed frequency @@ -189,7 +196,7 @@ class ProfileData { pthread_mutex_t table_lock_; // Cannot use "Mutex" in signal handlers #endif Bucket* hash_; // hash table - + Slot* evict_; // evicted entries int num_evicted_; // how many evicted entries? int out_; // fd for output file @@ -297,8 +304,8 @@ ProfileData::ProfileData() : if (!Start(fname)) { fprintf(stderr, "Can't turn on cpu profiling: "); - perror(fname); - exit(1); + perror(fname); + exit(1); } } @@ -322,8 +329,8 @@ bool ProfileData::Start(const char* fname) { fname_ = strdup(fname); LOCK(&table_lock_); - - // Reset counters + + // Reset counters num_evicted_ = 0; count_ = 0; evictions_ = 0; @@ -484,8 +491,8 @@ void ProfileData::FlushTable() { void ProfileData::Add(unsigned long pc) { void* stack[kMaxStackDepth]; stack[0] = (void*)pc; - int depth = GetStackTrace(stack+1, kMaxStackDepth-1, - 3/*Removes sighandlers*/); + int depth = GetStackTrace(stack+1, kMaxStackDepth-1, + 4/*Removes Add,prof_handler,sighandlers*/); depth++; // To account for pc value // Make hash-value @@ -519,7 +526,7 @@ void ProfileData::Add(unsigned long pc) { } } } - + if (!done) { // Evict entry with smallest count Entry* e = &bucket->entry[0]; @@ -532,7 +539,7 @@ void ProfileData::Add(unsigned long pc) { evictions_++; Evict(*e); } - + // Use the newly evicted entry e->depth = depth; e->count = 1; @@ -594,7 +601,7 @@ void ProfilerFlush() { pdata.FlushTable(); } -bool ProfilingIsEnabledForAllThreads() { +bool ProfilingIsEnabledForAllThreads() { return pdata.enabled(); } diff --git a/src/stacktrace.cc b/src/stacktrace.cc index 708d6ce..859d52a 100644 --- a/src/stacktrace.cc +++ b/src/stacktrace.cc @@ -40,145 +40,31 @@ // Linux/x86 implementation (requires the binary to be compiled with // frame pointers) -#if (defined(__i386__) || defined(__x86_64__)) && \ - defined(__linux) && !defined(NO_FRAME_POINTER) +#if defined(__i386__) && defined(__linux) && !defined(NO_FRAME_POINTER) #define IMPLEMENTED_STACK_TRACE - -#include <stdint.h> // for uintptr_t - -// Given a pointer to a stack frame, locate and return the calling -// stackframe, or return NULL if no stackframe can be found. Perform -// sanity checks to reduce the chance that a bad pointer is returned. -static void **NextStackFrame(void **old_sp) { - void **new_sp = (void **) *old_sp; - - // Check that the transition from frame pointer old_sp to frame - // pointer new_sp isn't clearly bogus - if (new_sp <= old_sp) return NULL; - if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL; -#ifdef __i386__ - // On 64-bit machines, the stack pointer can be very close to - // 0xffffffff, so we explicitly check for a pointer into the - // last two pages in the address space - if ((uintptr_t)new_sp >= 0xffffe000) return NULL; -#endif - if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL; - return new_sp; -} - -// Note: the code for GetStackExtent below is pretty similar to this one; -// change both if chaning one. -int GetStackTrace(void** result, int max_depth, int skip_count) { - void **sp; -#ifdef __i386__ - // Stack frame format: - // sp[0] pointer to previous frame - // sp[1] caller address - // sp[2] first argument - // ... - sp = (void **)&result - 2; -#endif - -#ifdef __x86_64__ - // Arguments are passed in registers on x86-64, so we can't just - // offset from &result - sp = (void **) __builtin_frame_address(0); -#endif - - int n = 0; - skip_count++; // Do not include the "GetStackTrace" frame - while (sp && n < max_depth) { - if (*(sp+1) == (void *)0) { - // In 64-bit code, we often see a frame that - // points to itself and has a return address of 0. - break; - } - if (skip_count > 0) { - skip_count--; - } else { - result[n++] = *(sp+1); - } - void** new_sp = NextStackFrame(sp); - if (!new_sp) break; - sp = new_sp; - } - return n; -} - -// Note: the code is pretty similar to GetStackTrace above; -// change both if chaning one. -bool GetStackExtent(void* sp, void** stack_top, void** stack_bottom) { - void** cur_sp; - - if (sp != NULL) { - cur_sp = (void**)sp; - *stack_top = sp; - } else { -#ifdef __i386__ - // Stack frame format: - // sp[0] pointer to previous frame - // sp[1] caller address - // sp[2] first argument - // ... - cur_sp = (void**)&sp - 2; +#include "stacktrace_x86-inl.h" #endif -#ifdef __x86_64__ - // Arguments are passed in registers on x86-64, so we can't just - // offset from &result - cur_sp = (void**)__builtin_frame_address(0); +#if !defined(IMPLEMENTED_STACK_TRACE) && defined(USE_LIBUNWIND) && HAVE_LIBUNWIND_H +#define IMPLEMENTED_STACK_TRACE +// This is turned off by default. Possible reasons for turning on in the +// future: +// 1. Compiler independence +// 2. Architecture independence +// 3. A more liberal MIT license, which allows use with multiple compilers +#include "stacktrace_libunwind-inl.h" #endif - *stack_top = NULL; - } - - while (cur_sp) { - void** new_sp = NextStackFrame(cur_sp); - if (!new_sp) { - *stack_bottom = (void*)cur_sp; - return true; - } - cur_sp = new_sp; - if (*stack_top == NULL) *stack_top = (void*)cur_sp; - // get out of the stack frame for this call - } - return false; -} +#if !defined(IMPLEMENTED_STACK_TRACE) && defined(__x86_64__) && HAVE_UNWIND_H +#define IMPLEMENTED_STACK_TRACE +#include "stacktrace_x86_64-inl.h" #endif -// Portable implementation - just use glibc -// -// Note: The glibc implementation may cause a call to malloc. -// This can cause a deadlock in HeapProfiler. -#if !defined(IMPLEMENTED_STACK_TRACE) && defined(HAVE_EXECINFO_H) -#include <stdlib.h> -#include <execinfo.h> - -int GetStackTrace(void** result, int max_depth, int skip_count) { - static const int kStackLength = 64; - void * stack[kStackLength]; - int size; - - size = backtrace(stack, kStackLength); - skip_count++; // we want to skip the current frame as well - int result_count = size - skip_count; - if ( result_count < 0 ) - result_count = 0; - else if ( result_count > max_depth ) - result_count = max_depth; - - for (int i = 0; i < result_count; i++) - result[i] = stack[i + skip_count]; - - return result_count; -} - -bool GetStackExtent(void* sp, void** stack_bottom, void** stack_top) { - return false; // can't climb up -} - +#if !defined(IMPLEMENTED_STACK_TRACE) && !defined(__x86_64__) && HAVE_EXECINFO_H +#define IMPLEMENTED_STACK_TRACE +#include "stacktrace_generic-inl.h" #endif -#if !defined(IMPLEMENTED_STACK_TRACE) && !defined(HAVE_EXECINFO_H) +#ifndef IMPLEMENTED_STACK_TRACE #error Cannot calculate stack trace: will need to write for your environment #endif diff --git a/src/stacktrace_generic-inl.h b/src/stacktrace_generic-inl.h new file mode 100644 index 0000000..28a539a --- /dev/null +++ b/src/stacktrace_generic-inl.h @@ -0,0 +1,61 @@ +// Copyright (c) 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --- +// Author: Sanjay Ghemawat +// +// Portable implementation - just use glibc +// +// Note: The glibc implementation may cause a call to malloc. +// This can cause a deadlock in HeapProfiler. +#include <execinfo.h> +#include <stdlib.h> +#include "google/stacktrace.h" + +int GetStackTrace(void** result, int max_depth, int skip_count) { + static const int kStackLength = 64; + void * stack[kStackLength]; + int size; + + size = backtrace(stack, kStackLength); + skip_count++; // we want to skip the current frame as well + int result_count = size - skip_count; + if (result_count < 0) + result_count = 0; + if (result_count > max_depth) + result_count = max_depth; + for (int i = 0; i < result_count; i++) + result[i] = stack[i + skip_count]; + + return result_count; +} + +bool GetStackExtent(void* sp, void** stack_top, void** stack_bottom) { + return false; // can't climb up +} diff --git a/src/stacktrace_libunwind-inl.h b/src/stacktrace_libunwind-inl.h new file mode 100644 index 0000000..42c28d3 --- /dev/null +++ b/src/stacktrace_libunwind-inl.h @@ -0,0 +1,69 @@ +// Copyright (c) 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --- +// Author: Arun Sharma +// +// Produce stack trace using libunwind + +extern "C" { +#include <assert.h> +#include <libunwind.h> +} +#include "google/stacktrace.h" + +int GetStackTrace(void** result, int max_depth, int skip_count) { + void *ip; + int ret, n = 0; + unw_cursor_t cursor; + unw_context_t uc; + + unw_getcontext(&uc); + ret = unw_init_local(&cursor, &uc); + assert(ret >= 0); + skip_count++; // Do not include the "GetStackTrace" frame + + do { + ret = unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip); + assert(ret == 0); + if (skip_count > 0) { + skip_count--; + } else { + result[n++] = ip; + } + ret = unw_step(&cursor); + assert(ret >= 0); + } while ((n < max_depth) && (ret > 0)); + + return n; +} + +bool GetStackExtent(void* sp, void** stack_top, void** stack_bottom) { + return false; // Not implemented yet +} diff --git a/src/stacktrace_x86-inl.h b/src/stacktrace_x86-inl.h new file mode 100644 index 0000000..cdab19b --- /dev/null +++ b/src/stacktrace_x86-inl.h @@ -0,0 +1,119 @@ +// Copyright (c) 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --- +// Author: Sanjay Ghemawat +// +// Produce stack trace + +#include <stdint.h> // for uintptr_t +#include <stdlib.h> // for NULL +#include "google/stacktrace.h" + +// Given a pointer to a stack frame, locate and return the calling +// stackframe, or return NULL if no stackframe can be found. Perform +// sanity checks to reduce the chance that a bad pointer is returned. +static void **NextStackFrame(void **old_sp) { + void **new_sp = (void **) *old_sp; + + // Check that the transition from frame pointer old_sp to frame + // pointer new_sp isn't clearly bogus + if (new_sp <= old_sp) return NULL; + if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL; +#ifdef __i386__ + // On 64-bit machines, the stack pointer can be very close to + // 0xffffffff, so we explicitly check for a pointer into the + // last two pages in the address space + if ((uintptr_t)new_sp >= 0xffffe000) return NULL; +#endif + if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL; + return new_sp; +} + +// Note: the code for GetStackExtent below is pretty similar to this one; +// change both if chaning one. +int GetStackTrace(void** result, int max_depth, int skip_count) { + void **sp; + // Stack frame format: + // sp[0] pointer to previous frame + // sp[1] caller address + // sp[2] first argument + // ... + sp = (void **)&result - 2; + + int n = 0; + while (sp && n < max_depth) { + if (*(sp+1) == (void *)0) { + // In 64-bit code, we often see a frame that + // points to itself and has a return address of 0. + break; + } + if (skip_count > 0) { + skip_count--; + } else { + result[n++] = *(sp+1); + } + void** new_sp = NextStackFrame(sp); + if (!new_sp) break; + sp = new_sp; + } + return n; +} + +// Note: the code is pretty similar to GetStackTrace above; +// change both if changing one. +bool GetStackExtent(void* sp, void** stack_top, void** stack_bottom) { + void** cur_sp; + + if (sp != NULL) { + cur_sp = (void**)sp; + *stack_top = sp; + } else { + // Stack frame format: + // sp[0] pointer to previous frame + // sp[1] caller address + // sp[2] first argument + // ... + cur_sp = (void**)&sp - 2; + + *stack_top = NULL; + } + + while (cur_sp) { + void** new_sp = NextStackFrame(cur_sp); + if (!new_sp) { + *stack_bottom = (void*)cur_sp; + return true; + } + cur_sp = new_sp; + if (*stack_top == NULL) *stack_top = (void*)cur_sp; + // get out of the stack frame for this call + } + return false; +} diff --git a/src/stacktrace_x86_64-inl.h b/src/stacktrace_x86_64-inl.h new file mode 100644 index 0000000..3cdfe13 --- /dev/null +++ b/src/stacktrace_x86_64-inl.h @@ -0,0 +1,134 @@ +// Copyright (c) 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// --- +// Author: Arun Sharma +// +// Produce stack trace using libgcc + +extern "C" { +#include <stdlib.h> // for NULL +#include <unwind.h> // ABI defined unwinder +} +#include "google/stacktrace.h" + +typedef struct { + void **result; + int max_depth; + int skip_count; + int count; + void *top; + void *bottom; +} trace_arg_t; + + +// Workaround for the malloc() in _Unwind_Backtrace() issue. +static _Unwind_Reason_Code nop_backtrace(struct _Unwind_Context *uc, void *opq) { + return _URC_NO_REASON; +} + + +// This code is not considered ready to run until +// static initializers run so that we are guaranteed +// that any malloc-related initialization is done. +static bool ready_to_run = false; +class StackTraceInit { + public: + StackTraceInit() { + // Extra call to force initialization + _Unwind_Backtrace(nop_backtrace, NULL); + ready_to_run = true; + } +}; + +static StackTraceInit module_initializer; // Force initialization + +static _Unwind_Reason_Code GetOneFrame(struct _Unwind_Context *uc, void *opq) { + trace_arg_t *targ = (trace_arg_t *) opq; + + if (targ->skip_count > 0) { + targ->skip_count--; + } else { + targ->result[targ->count++] = (void *) _Unwind_GetIP(uc); + } + + if (targ->count == targ->max_depth) + return _URC_END_OF_STACK; + + return _URC_NO_REASON; +} + +int GetStackTrace(void** result, int max_depth, int skip_count) { + if (!ready_to_run) + return 0; + + trace_arg_t targ; + + skip_count += 1; // Do not include the "GetStackTrace" frame + + targ.result = result; + targ.max_depth = max_depth; + targ.skip_count = skip_count; + targ.count = 0; + + _Unwind_Backtrace(GetOneFrame, &targ); + + return targ.count; +} + +static _Unwind_Reason_Code SearchExtents(struct _Unwind_Context *uc, void *opq) { + trace_arg_t *targ = (trace_arg_t *) opq; + + targ->bottom = (void *) _Unwind_GetCFA(uc); + + if (targ->top == NULL) + targ->top = targ->bottom; + + return _URC_NO_REASON; +} + +bool GetStackExtent(void* sp, void** stack_top, void** stack_bottom) { + if (!ready_to_run) + return false; + + trace_arg_t targ; + + targ.top = NULL; + targ.bottom = NULL; + + _Unwind_Backtrace(SearchExtents, &targ); + + if ((targ.top != NULL) && (targ.bottom != NULL)) { + *stack_bottom = targ.bottom; + *stack_top = targ.top; + return true; + } + + return false; +} diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc index 2eb9ef4..5dc062e 100644 --- a/src/tcmalloc.cc +++ b/src/tcmalloc.cc @@ -1,10 +1,10 @@ // Copyright (c) 2005, Google Inc. // All rights reserved. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above @@ -14,7 +14,7 @@ // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -127,7 +127,7 @@ static const int kMinSystemAlloc = 1 << (20 - kPageShift); // amortize the lock overhead for accessing the central list. Making // it too big may temporarily cause unnecessary memory wastage in the // per-thread free list until the scavenger cleans up the list. -static const int kNumObjectsToMove = 32; +static int num_objects_to_move[kNumClasses]; // Maximum length we allow a per-thread free-list to have before we // move objects from it into the corresponding central free-list. We @@ -170,6 +170,22 @@ static size_t class_to_size[kNumClasses]; // Mapping from size class to number of pages to allocate at a time static size_t class_to_pages[kNumClasses]; + + +// TransferCache is used to cache transfers of num_objects_to_move[size_class] +// back and forth between thread caches and the central cache for a given size +// class. +struct TCEntry { + void *head; // Head of chain of objects. + void *tail; // Tail of chain of objects. +}; +// A central cache freelist can have anywhere from 0 to kNumTransferEntries +// slots to put link list chains into. To keep memory usage bounded the total +// number of TCEntries across size classes is fixed. Currently each size +// class is initially given one TCEntry which also means that the maximum any +// one class can have is kNumClasses. +static const int kNumTransferEntries = kNumClasses; + // Return floor(log2(n)) for n > 0. #if (defined __i386__ || defined __x86_64__) && defined __GNUC__ static inline int LgFloor(size_t n) { @@ -201,6 +217,70 @@ static inline int LgFloor(size_t n) { } #endif + +// Some very basic linked list functions for dealing with using void * as +// storage. + +static inline void *SLL_Next(void *t) { + return *(reinterpret_cast<void**>(t)); +} + +static inline void SLL_SetNext(void *t, void *n) { + *(reinterpret_cast<void**>(t)) = n; +} + +static inline void SLL_Push(void **list, void *element) { + SLL_SetNext(element, *list); + *list = element; +} + +static inline void *SLL_Pop(void **list) { + void *result = *list; + *list = SLL_Next(*list); + return result; +} + + +// Remove N elements from a linked list to which head points. head will be +// modified to point to the new head. start and end will point to the first +// and last nodes of the range. Note that end will point to NULL after this +// function is called. +static inline void SLL_PopRange(void **head, int N, void **start, void **end) { + if (N == 0) { + *start = NULL; + *end = NULL; + return; + } + + void *tmp = *head; + for (int i = 1; i < N; ++i) { + tmp = SLL_Next(tmp); + } + + *start = *head; + *end = tmp; + *head = SLL_Next(tmp); + // Unlink range from list. + SLL_SetNext(tmp, NULL); +} + +static inline void SLL_PushRange(void **head, void *start, void *end) { + if (!start) return; + SLL_SetNext(end, *head); + *head = start; +} + +static inline size_t SLL_Size(void *head) { + int count = 0; + while (head) { + count++; + head = SLL_Next(head); + } + return count; +} + +// Setup helper functions. + static inline int SizeClass(size_t size) { if (size == 0) size = 1; const int lg = LgFloor(size); @@ -213,6 +293,19 @@ static inline size_t ByteSizeForClass(size_t cl) { return class_to_size[cl]; } + +static int NumMoveSize(size_t size) { + if (size == 0) return 0; + // Use approx 64k transfers between thread and central caches. + int num = static_cast<int>(64.0 * 1024.0 / size); + if (num < 2) num = 2; + // Clamp well below kMaxFreeListLength to avoid ping pong between central + // and thread caches. + if (num > static_cast<int>(0.8 * kMaxFreeListLength)) + num = static_cast<int>(0.8 * kMaxFreeListLength); + return num; +} + // Initialize the mapping arrays static void InitSizeClasses() { // Special initialization for small sizes @@ -290,6 +383,11 @@ static void InitSizeClasses() { abort(); } } + + // Initialize the num_objects_to_move array. + for (size_t cl = 1; cl < kNumClasses; ++cl) { + num_objects_to_move[cl] = NumMoveSize(ByteSizeForClass(cl)); + } } // ------------------------------------------------------------------------- @@ -833,7 +931,7 @@ void TCMalloc_PageHeap::Dump(TCMalloc_Printer* out) { static void RecordGrowth(size_t growth) { StackTrace* t = stacktrace_allocator.New(); - t->depth = GetStackTrace(t->stack, kMaxStackDepth-1, 4); + t->depth = GetStackTrace(t->stack, kMaxStackDepth-1, 3); t->size = growth; t->stack[kMaxStackDepth-1] = reinterpret_cast<void*>(growth_stacks); growth_stacks = t; @@ -939,18 +1037,26 @@ class TCMalloc_ThreadCache_FreeList { void clear_lowwatermark() { lowater_ = length_; } void Push(void* ptr) { - *(reinterpret_cast<void**>(ptr)) = list_; - list_ = ptr; + SLL_Push(&list_, ptr); length_++; } void* Pop() { ASSERT(list_ != NULL); - void* result = list_; - list_ = *(reinterpret_cast<void**>(result)); length_--; if (length_ < lowater_) lowater_ = length_; - return result; + return SLL_Pop(&list_); + } + + void PushRange(int N, void *start, void *end) { + SLL_PushRange(&list_, start, end); + length_ += N; + } + + void PopRange(int N, void **start, void **end) { + SLL_PopRange(&list_, N, start, end); + ASSERT(length_ >= N); + length_ -= N; } }; @@ -1017,34 +1123,98 @@ class TCMalloc_Central_FreeList { public: void Init(size_t cl); - // REQUIRES: lock_ is held - // Insert object. - // May temporarily release lock_. - void Insert(void* object); + // These methods all do internal locking. + // Insert the specified range into the central freelist. N is the number of + // elements in the range. + void InsertRange(void *start, void *end, int N); + + // Returns the actual number of fetched elements into N. + void RemoveRange(void **start, void **end, int *N); + + // Returns the number of free objects in cache. + int length() { + SpinLockHolder h(&lock_); + return counter_; + } + + // Returns the number of free objects in the transfer cache. + int tc_length() { + SpinLockHolder h(&lock_); + return used_slots_ * num_objects_to_move[size_class_]; + } + + private: // REQUIRES: lock_ is held // Remove object from cache and return. // Return NULL if no free entries in cache. - void* Remove(); + void* FetchFromSpans(); // REQUIRES: lock_ is held - // Populate cache by fetching from the page heap. + // Remove object from cache and return. Fetches + // from pageheap if cache is empty. Only returns + // NULL on allocation failure. + void* FetchFromSpansSafe(); + + // REQUIRES: lock_ is held + // Release a linked list of objects to spans. // May temporarily release lock_. - void Populate(); + void ReleaseListToSpans(void *start); // REQUIRES: lock_ is held - // Number of free objects in cache - int length() const { return counter_; } + // Release an object to spans. + // May temporarily release lock_. + void ReleaseToSpans(void* object); + + // REQUIRES: lock_ is held + // Populate cache by fetching from the page heap. + // May temporarily release lock_. + void Populate(); - // Lock -- exposed because caller grabs it before touching this object + // REQUIRES: lock is held. + // Tries to make room for a TCEntry. If the cache is full it will try to + // expand it at the cost of some other cache size. Return false if there is + // no space. + bool MakeCacheSpace(); + + // REQUIRES: lock_ for locked_size_class is held. + // Picks a "random" size class to steal TCEntry slot from. In reality it + // just iterates over the sizeclasses but does so without taking a lock. + // Returns true on success. + // May temporarily lock a "random" size class. + static bool EvictRandomSizeClass(int locked_size_class, bool force); + + // REQUIRES: lock_ is *not* held. + // Tries to shrink the Cache. If force is true it will relase objects to + // spans if it allows it to shrink the cache. Return false if it failed to + // shrink the cache. Decrements cache_size_ on succeess. + // May temporarily take lock_. If it takes lock_, the locked_size_class + // lock is released to the thread from holding two size class locks + // concurrently which could lead to a deadlock. + bool ShrinkCache(int locked_size_class, bool force); + + // This lock protects all the data members. cached_entries and cache_size_ + // may be looked at without holding the lock. SpinLock lock_; - private: - // We keep linked lists of empty and non-emoty spans. + // We keep linked lists of empty and non-empty spans. size_t size_class_; // My size class Span empty_; // Dummy header for list of empty spans Span nonempty_; // Dummy header for list of non-empty spans size_t counter_; // Number of free objects in cache entry + + // Here we reserve space for TCEntry cache slots. Since one size class can + // end up getting all the TCEntries quota in the system we just preallocate + // sufficient number of entries here. + TCEntry tc_slots_[kNumTransferEntries]; + + // Number of currently used cached entries in tc_slots_. This variable is + // updated under a lock but can be read without one. + int32_t used_slots_; + // The current number of slots for this size class. This is an + // adaptive value that is increased if there is lots of traffic + // on a given size class. + int32_t cache_size_; }; // Pad each CentralCache object to multiple of 64 bytes @@ -1104,9 +1274,21 @@ void TCMalloc_Central_FreeList::Init(size_t cl) { DLL_Init(&empty_); DLL_Init(&nonempty_); counter_ = 0; + + cache_size_ = 1; + used_slots_ = 0; + ASSERT(cache_size_ <= kNumTransferEntries); +} + +void TCMalloc_Central_FreeList::ReleaseListToSpans(void* start) { + while (start) { + void *next = SLL_Next(start); + ReleaseToSpans(start); + start = next; + } } -void TCMalloc_Central_FreeList::Insert(void* object) { +void TCMalloc_Central_FreeList::ReleaseToSpans(void* object) { const PageID p = reinterpret_cast<uintptr_t>(object) >> kPageShift; Span* span = pageheap->GetDescriptor(p); ASSERT(span != NULL); @@ -1151,7 +1333,139 @@ void TCMalloc_Central_FreeList::Insert(void* object) { } } -void* TCMalloc_Central_FreeList::Remove() { +bool TCMalloc_Central_FreeList::EvictRandomSizeClass( + int locked_size_class, bool force) { + static int race_counter = 0; + int t = race_counter++; // Updated without a lock, but who cares. + if (t >= kNumClasses) { + while (t >= kNumClasses) { + t -= kNumClasses; + } + race_counter = t; + } + ASSERT(t >= 0); + ASSERT(t < kNumClasses); + if (t == locked_size_class) return false; + return central_cache[t].ShrinkCache(locked_size_class, force); +} + +bool TCMalloc_Central_FreeList::MakeCacheSpace() { + // Is there room in the cache? + if (used_slots_ < cache_size_) return true; + // Check if we can expand this cache? + if (cache_size_ == kNumTransferEntries) return false; + // Ok, we'll try to grab an entry from some other size class. + if (EvictRandomSizeClass(size_class_, false) || + EvictRandomSizeClass(size_class_, true)) { + // Succeeded in evicting, we're going to make our cache larger. + cache_size_++; + return true; + } + return false; +} + + +namespace { +class LockInverter { + private: + TCMalloc_SpinLock *held_, *temp_; + public: + inline explicit LockInverter(TCMalloc_SpinLock* held, TCMalloc_SpinLock *temp) + : held_(held), temp_(temp) { held_->Unlock(); temp_->Lock(); } + inline ~LockInverter() { temp_->Unlock(); held_->Lock(); } +}; +} + +bool TCMalloc_Central_FreeList::ShrinkCache(int locked_size_class, bool force) { + // Start with a quick check without taking a lock. + if (cache_size_ == 0) return false; + // We don't evict from a full cache unless we are 'forcing'. + if (force == false && used_slots_ == cache_size_) return false; + + // Grab lock, but first release the other lock held by this thread. We use + // the lock inverter to ensure that we never hold two size class locks + // concurrently. That can create a deadlock because there is no well + // defined nesting order. + LockInverter li(¢ral_cache[locked_size_class].lock_, &lock_); + ASSERT(used_slots_ <= cache_size_); + ASSERT(0 <= cache_size_); + if (cache_size_ == 0) return false; + if (used_slots_ == cache_size_) { + if (force == false) return false; + // ReleaseListToSpans releases the lock, so we have to make all the + // updates to the central list before calling it. + cache_size_--; + used_slots_--; + ReleaseListToSpans(tc_slots_[used_slots_].head); + return true; + } + cache_size_--; + return true; +} + +void TCMalloc_Central_FreeList::InsertRange(void *start, void *end, int N) { + SpinLockHolder h(&lock_); + if (N == num_objects_to_move[size_class_] && + MakeCacheSpace()) { + int slot = used_slots_++; + ASSERT(slot >=0); + ASSERT(slot < kNumTransferEntries); + TCEntry *entry = &tc_slots_[slot]; + entry->head = start; + entry->tail = end; + return; + } + ReleaseListToSpans(start); +} + +void TCMalloc_Central_FreeList::RemoveRange(void **start, void **end, int *N) { + int num = *N; + ASSERT(num > 0); + + SpinLockHolder h(&lock_); + if (num == num_objects_to_move[size_class_] && used_slots_ > 0) { + int slot = --used_slots_; + ASSERT(slot >= 0); + TCEntry *entry = &tc_slots_[slot]; + *start = entry->head; + *end = entry->tail; + return; + } + + // TODO: Prefetch multiple TCEntries? + void *tail = FetchFromSpansSafe(); + if (!tail) { + // We are completely out of memory. + *start = *end = NULL; + *N = 0; + return; + } + + SLL_SetNext(tail, NULL); + void *head = tail; + int count = 1; + while (count < num) { + void *t = FetchFromSpans(); + if (!t) break; + SLL_Push(&head, t); + count++; + } + *start = head; + *end = tail; + *N = count; +} + + +void* TCMalloc_Central_FreeList::FetchFromSpansSafe() { + void *t = FetchFromSpans(); + if (!t) { + Populate(); + t = FetchFromSpans(); + } + return t; +} + +void* TCMalloc_Central_FreeList::FetchFromSpans() { if (DLL_IsEmpty(&nonempty_)) return NULL; Span* span = nonempty_.next; @@ -1244,11 +1558,8 @@ void TCMalloc_ThreadCache::Init(pthread_t tid) { void TCMalloc_ThreadCache::Cleanup() { // Put unused memory back into central cache for (int cl = 0; cl < kNumClasses; ++cl) { - FreeList* src = &list_[cl]; - TCMalloc_Central_FreeList* dst = ¢ral_cache[cl]; - SpinLockHolder h(&dst->lock_); - while (!src->empty()) { - dst->Insert(src->Pop()); + if (list_[cl].length() > 0) { + ReleaseToCentralCache(cl, list_[cl].length()); } } } @@ -1271,43 +1582,39 @@ inline void TCMalloc_ThreadCache::Deallocate(void* ptr, size_t cl) { list->Push(ptr); // If enough data is free, put back into central cache if (list->length() > kMaxFreeListLength) { - ReleaseToCentralCache(cl, kNumObjectsToMove); + ReleaseToCentralCache(cl, num_objects_to_move[cl]); } if (size_ >= per_thread_cache_size) Scavenge(); } // Remove some objects of class "cl" from central cache and add to thread heap void TCMalloc_ThreadCache::FetchFromCentralCache(size_t cl) { - TCMalloc_Central_FreeList* src = ¢ral_cache[cl]; - FreeList* dst = &list_[cl]; - SpinLockHolder h(&src->lock_); - for (int i = 0; i < kNumObjectsToMove; i++) { - void* object = src->Remove(); - if (object == NULL) { - if (i == 0) { - src->Populate(); // Temporarily releases src->lock_ - object = src->Remove(); - } - if (object == NULL) { - break; - } - } - dst->Push(object); - size_ += ByteSizeForClass(cl); - } + int fetch_count = num_objects_to_move[cl]; + void *start, *end; + central_cache[cl].RemoveRange(&start, &end, &fetch_count); + list_[cl].PushRange(fetch_count, start, end); + size_ += ByteSizeForClass(cl) * fetch_count; } // Remove some objects of class "cl" from thread heap and add to central cache void TCMalloc_ThreadCache::ReleaseToCentralCache(size_t cl, int N) { + ASSERT(N > 0); FreeList* src = &list_[cl]; - TCMalloc_Central_FreeList* dst = ¢ral_cache[cl]; - SpinLockHolder h(&dst->lock_); if (N > src->length()) N = src->length(); size_ -= N*ByteSizeForClass(cl); - while (N-- > 0) { - void* ptr = src->Pop(); - dst->Insert(ptr); + + // We return prepackaged chains of the correct size to the central cache. + // TODO: Use the same format internally in the thread caches? + int batch_size = num_objects_to_move[cl]; + while (N > batch_size) { + void *tail, *head; + src->PopRange(batch_size, &head, &tail); + central_cache[cl].InsertRange(head, tail, batch_size); + N -= batch_size; } + void *tail, *head; + src->PopRange(N, &head, &tail); + central_cache[cl].InsertRange(head, tail, N); } // Release idle memory to the central cache @@ -1395,7 +1702,7 @@ void TCMalloc_ThreadCache::InitTSD() { ASSERT(!tsd_inited); perftools_pthread_key_create(&heap_key, DeleteCache); tsd_inited = true; - + // We may have used a fake pthread_t for the main thread. Fix it. pthread_t zero; memset(&zero, 0, sizeof(zero)); @@ -1499,6 +1806,7 @@ struct TCMallocStats { uint64_t system_bytes; // Bytes alloced from system uint64_t thread_bytes; // Bytes in thread caches uint64_t central_bytes; // Bytes in central cache + uint64_t transfer_bytes; // Bytes in central transfer cache uint64_t pageheap_bytes; // Bytes in page heap uint64_t metadata_bytes; // Bytes alloced for metadata }; @@ -1506,11 +1814,14 @@ struct TCMallocStats { // Get stats into "r". Also get per-size-class counts if class_count != NULL static void ExtractStats(TCMallocStats* r, uint64_t* class_count) { r->central_bytes = 0; + r->transfer_bytes = 0; for (int cl = 0; cl < kNumClasses; ++cl) { - SpinLockHolder h(¢ral_cache[cl].lock_); const int length = central_cache[cl].length(); + const int tc_length = central_cache[cl].tc_length(); r->central_bytes += static_cast<uint64_t>(ByteSizeForClass(cl)) * length; - if (class_count) class_count[cl] = length; + r->transfer_bytes += + static_cast<uint64_t>(ByteSizeForClass(cl)) * tc_length; + if (class_count) class_count[cl] = length + tc_length; } // Add stats from per-thread heaps @@ -1564,6 +1875,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) { const uint64_t bytes_in_use = stats.system_bytes - stats.pageheap_bytes - stats.central_bytes + - stats.transfer_bytes - stats.thread_bytes; out->printf("------------------------------------------------\n" @@ -1571,6 +1883,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) { "MALLOC: %12" LLU " Bytes in use by application\n" "MALLOC: %12" LLU " Bytes free in page heap\n" "MALLOC: %12" LLU " Bytes free in central cache\n" + "MALLOC: %12" LLU " Bytes free in transfer cache\n" "MALLOC: %12" LLU " Bytes free in thread caches\n" "MALLOC: %12" LLU " Spans in use\n" "MALLOC: %12" LLU " Thread heaps in use\n" @@ -1580,6 +1893,7 @@ static void DumpStats(TCMalloc_Printer* out, int level) { bytes_in_use, stats.pageheap_bytes, stats.central_bytes, + stats.transfer_bytes, stats.thread_bytes, uint64_t(span_allocator.inuse()), uint64_t(threadheap_allocator.inuse()), @@ -1787,7 +2101,7 @@ static Span* DoSampledAllocation(size_t size) { } // Fill stack trace and record properly - stack->depth = GetStackTrace(stack->stack, kMaxStackDepth, 2); + stack->depth = GetStackTrace(stack->stack, kMaxStackDepth, 1); stack->size = size; span->sample = 1; span->objects = stack; @@ -1797,28 +2111,34 @@ static Span* DoSampledAllocation(size_t size) { } static inline void* do_malloc(size_t size) { + void* ret = NULL; - if (TCMallocDebug::level >= TCMallocDebug::kVerbose) + if (TCMallocDebug::level >= TCMallocDebug::kVerbose) { MESSAGE("In tcmalloc do_malloc(%" PRIuS")\n", size); + } // The following call forces module initialization TCMalloc_ThreadCache* heap = TCMalloc_ThreadCache::GetCache(); if (heap->SampleAllocation(size)) { Span* span = DoSampledAllocation(size); - if (span == NULL) return NULL; - return reinterpret_cast<void*>(span->start << kPageShift); + if (span != NULL) { + ret = reinterpret_cast<void*>(span->start << kPageShift); + } } else if (size > kMaxSize) { // Use page-level allocator SpinLockHolder h(&pageheap_lock); Span* span = pageheap->New(pages(size)); - if (span == NULL) return NULL; - return reinterpret_cast<void*>(span->start << kPageShift); + if (span != NULL) { + ret = reinterpret_cast<void*>(span->start << kPageShift); + } } else { - return heap->Allocate(size); + ret = heap->Allocate(size); } + if (ret == NULL) errno = ENOMEM; + return ret; } static inline void do_free(void* ptr) { - if (TCMallocDebug::level >= TCMallocDebug::kVerbose) + if (TCMallocDebug::level >= TCMallocDebug::kVerbose) MESSAGE("In tcmalloc do_free(%p)\n", ptr); if (ptr == NULL) return; ASSERT(pageheap != NULL); // Should not call free() before malloc() @@ -1835,8 +2155,8 @@ static inline void do_free(void* ptr) { heap->Deallocate(ptr, cl); } else { // Delete directly into central cache - SpinLockHolder h(¢ral_cache[cl].lock_); - central_cache[cl].Insert(ptr); + SLL_SetNext(ptr, NULL); + central_cache[cl].InsertRange(ptr, ptr, 1); } } else { SpinLockHolder h(&pageheap_lock); @@ -2045,39 +2365,88 @@ extern "C" void* realloc(void* old_ptr, size_t new_size) { } #ifndef COMPILER_INTEL -#define OPNEW_THROW -#define OPDELETE_THROW +#define OP_THROWNOTHING +#define OP_THROWBADALLOC #else -#define OPNEW_THROW throw(std::bad_alloc) -#define OPDELETE_THROW throw() +#define OP_THROWNOTHING throw() +#define OP_THROWBADALLOC throw(std::bad_alloc) #endif -void* operator new(size_t size) OPNEW_THROW { - void* p = do_malloc(size); - if (p == NULL) { - MESSAGE("Unable to allocate %" PRIuS " bytes: new failed\n", size); - abort(); +static SpinLock set_new_handler_lock = SPINLOCK_INITIALIZER; + +static inline void* cpp_alloc(size_t size, bool nothrow) { + for (;;) { + void* p = do_malloc(size); +#ifdef PREANSINEW + MallocHook::InvokeNewHook(p, size); + return p; +#else + if (p == NULL) { // allocation failed + // Get the current new handler. NB: this function is not + // thread-safe. We make a feeble stab at making it so here, but + // this lock only protects against tcmalloc interfering with + // itself, not with other libraries calling set_new_handler. + std::new_handler nh; + { + SpinLockHolder h(&set_new_handler_lock); + nh = std::set_new_handler(0); + (void) std::set_new_handler(nh); + } + // If no new_handler is established, the allocation failed. + if (!nh) { + if (nothrow) return 0; + throw std::bad_alloc(); + } + // Otherwise, try the new_handler. If it returns, retry the + // allocation. If it throws std::bad_alloc, fail the allocation. + // if it throws something else, don't interfere. + try { + (*nh)(); + } catch (const std::bad_alloc&) { + if (!nothrow) throw; + MallocHook::InvokeNewHook(p, size); + return p; + } + } else { // allocation success + MallocHook::InvokeNewHook(p, size); + return p; + } +#endif } - MallocHook::InvokeNewHook(p, size); - return p; } -void operator delete(void* p) OPDELETE_THROW { +void* operator new(size_t size) OP_THROWBADALLOC { + return cpp_alloc(size, false); +} + +void* operator new(size_t size, const std::nothrow_t&) OP_THROWNOTHING { + return cpp_alloc(size, true); +} + +void operator delete(void* p) OP_THROWNOTHING { MallocHook::InvokeDeleteHook(p); do_free(p); } -void* operator new[](size_t size) OPNEW_THROW { - void* p = do_malloc(size); - if (p == NULL) { - MESSAGE("Unable to allocate %" PRIuS " bytes: new failed\n", size); - abort(); - } - MallocHook::InvokeNewHook(p, size); - return p; +void operator delete(void* p, const std::nothrow_t&) OP_THROWNOTHING { + MallocHook::InvokeDeleteHook(p); + do_free(p); +} + +void* operator new[](size_t size) OP_THROWBADALLOC { + return cpp_alloc(size, false); +} + +void* operator new[](size_t size, const std::nothrow_t&) OP_THROWNOTHING { + return cpp_alloc(size, true); +} + +void operator delete[](void* p) OP_THROWNOTHING { + MallocHook::InvokeDeleteHook(p); + do_free(p); } -void operator delete[](void* p) OPDELETE_THROW { +void operator delete[](void* p, const std::nothrow_t&) OP_THROWNOTHING { MallocHook::InvokeDeleteHook(p); do_free(p); } @@ -2143,11 +2512,14 @@ extern "C" struct mallinfo mallinfo(void) { // Unfortunately, the struct contains "int" field, so some of the // size values will be truncated. info.arena = static_cast<int>(stats.system_bytes); - info.fsmblks = static_cast<int>(stats.thread_bytes + stats.central_bytes); + info.fsmblks = static_cast<int>(stats.thread_bytes + + stats.central_bytes + + stats.transfer_bytes); info.fordblks = static_cast<int>(stats.pageheap_bytes); info.uordblks = static_cast<int>(stats.system_bytes - stats.thread_bytes - stats.central_bytes + - stats.transfer_bytes - stats.pageheap_bytes); return info; diff --git a/src/tests/stacktrace_unittest.cc b/src/tests/stacktrace_unittest.cc index 34a7718..b73cf4b 100644 --- a/src/tests/stacktrace_unittest.cc +++ b/src/tests/stacktrace_unittest.cc @@ -42,6 +42,7 @@ // backtrace, and maybe print the backtrace to stdout. //-----------------------------------------------------------------------// +void CheckStackTraceLeaf(); void CheckStackTrace4(int i); void CheckStackTrace3(int i); void CheckStackTrace2(int i); @@ -51,8 +52,9 @@ void CheckStackTrace(int i); // The sequence of functions whose return addresses we expect to see in the // backtrace. -const int BACKTRACE_STEPS = 5; +const int BACKTRACE_STEPS = 6; void * expected_stack[BACKTRACE_STEPS] = { + (void *) &CheckStackTraceLeaf, (void *) &CheckStackTrace4, (void *) &CheckStackTrace3, (void *) &CheckStackTrace2, @@ -77,10 +79,6 @@ void * expected_stack[BACKTRACE_STEPS] = { // ... // stack[5] is ret addr within CheckStackTrace -// The google version does not include the caller in the -// backtrace. Some other version might. (glibc backtrace()?) -const int self_in_backtrace = 0; - //-----------------------------------------------------------------------// void CheckRetAddrIsInFunction( void * ret_addr, void * function_start_addr) @@ -96,14 +94,20 @@ void CheckStackTraceLeaf(void) { const int STACK_LEN = 10; void *stack[STACK_LEN]; int size; + void *top, *bottom; size = GetStackTrace(stack, STACK_LEN, 0); printf("Obtained %d stack frames.\n", size); CHECK_LE(size, STACK_LEN); - + + // GetStackExtent() may not be implemented + if (GetStackExtent(NULL, &top, &bottom)) { + printf("%p %p\n", top, bottom); + } + for (int i = 0; i < BACKTRACE_STEPS; i++) { - CheckRetAddrIsInFunction(stack[i + self_in_backtrace], expected_stack[i]); + CheckRetAddrIsInFunction(stack[i], expected_stack[i]); } @@ -111,6 +115,7 @@ void CheckStackTraceLeaf(void) { { char **strings = backtrace_symbols(stack, size); + printf("Obtained %d stack frames.\n", size); for (int i = 0; i < size; i++) printf("%s\n", strings[i]); printf("CheckStackTrace() addr: %p\n", &CheckStackTrace); diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc index 22a6c29..c151c2a 100644 --- a/src/tests/tcmalloc_unittest.cc +++ b/src/tests/tcmalloc_unittest.cc @@ -1,10 +1,10 @@ // Copyright (c) 2005, Google Inc. // All rights reserved. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above @@ -14,7 +14,7 @@ // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -28,33 +28,598 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // --- -// Author: Paul Menage +// Unittest for the TCMalloc implementation. // -// TODO(menage) Turn this into a real unittest ... +// * The test consists of a set of threads. +// * Each thread maintains a set of allocated objects, with +// a bound on the total amount of data in the set. +// * Each allocated object's contents are generated by +// hashing the object pointer, and a generation count +// in the object. This allows us to easily check for +// data corruption. +// * At any given step, the thread can do any of the following: +// a. Allocate an object +// b. Increment an object's generation count and update +// its contents. +// c. Pass the object to another thread +// d. Free an object +// Also, at the end of every step, object(s) are freed to maintain +// the memory upper-bound. #include <stdlib.h> #include <string.h> #include <stdio.h> -#include <stdint.h> -#include "google/malloc_extension.h" +#include <stdint.h> // for intptr_t +#include <unistd.h> // for getpid() +#include <pthread.h> +#include <vector> +#include <string> +#include <new> +#include "base/logging.h" + +#define LOGSTREAM stdout + +using std::vector; +using std::string; + +static const int FLAGS_numtests = 50000; +static const int FLAGS_log_every_n_tests = 10000; + +// Testing parameters +static const int FLAGS_lgmaxsize = 16; // lg() of the max size object to alloc +static const int FLAGS_numthreads = 10; // Number of threads +static const int FLAGS_threadmb = 4; // Max memory size allocated by thread + +// Weights of different operations +static const int FLAGS_mallocweight = 50; // Weight for picking malloc +static const int FLAGS_freeweight = 50; // Weight for picking free +static const int FLAGS_updateweight = 10; // Weight for picking update +static const int FLAGS_passweight = 1; // Weight for passing object + +static const int kSizeBits = 8 * sizeof(size_t); +static const size_t kMaxSize = ~static_cast<size_t>(0); +static const size_t kMaxSignedSize = ((size_t(1) << (kSizeBits-1)) - 1); + +static const size_t kNotTooBig = 100000; +static const size_t kTooBig = kMaxSignedSize; + +static int news_handled = 0; + +// Global array of threads +class TesterThread; +static TesterThread** threads; + +// To help with generating random numbers +class TestHarness { + private: + // Information kept per type + struct Type { + string name; + int type; + int weight; + }; + + public: + TestHarness(int seed) + : types_(new vector<Type>), total_weight_(0), num_tests_(0) { + srandom(seed); + } + ~TestHarness() { + delete types_; + } + + // Add operation type with specified weight. When starting a new + // iteration, an operation type is picked with probability + // proportional to its weight. + // + // "type" must be non-negative. + // "weight" must be non-negative. + void AddType(int type, int weight, const char* name); + + // Call this to get the type of operation for the next iteration. + // It returns a random operation type from the set of registered + // operations. Returns -1 if tests should finish. + int PickType(); + + // If n == 0, returns the next pseudo-random number in the range [0 .. 0] + // If n != 0, returns the next pseudo-random number in the range [0 .. n) + int Uniform(int n) { + if (n == 0) { + return random() * 0; + } else { + return random() % n; + } + } + // Pick "base" uniformly from range [0,max_log] and then return + // "base" random bits. The effect is to pick a number in the range + // [0,2^max_log-1] with bias towards smaller numbers. + int Skewed(int max_log) { + const int base = random() % (max_log+1); + return random() % (1 << base); + } + + private: + vector<Type>* types_; // Registered types + int total_weight_; // Total weight of all types + int num_tests_; // Num tests run so far +}; + +void TestHarness::AddType(int type, int weight, const char* name) { + Type t; + t.name = name; + t.type = type; + t.weight = weight; + types_->push_back(t); + total_weight_ += weight; +} + +int TestHarness::PickType() { + if (num_tests_ >= FLAGS_numtests) return -1; + + assert(total_weight_ > 0); + // This is a little skewed if total_weight_ doesn't divide 2^31, but it's close + int v = Uniform(total_weight_); + int i; + for (i = 0; i < types_->size(); i++) { + v -= (*types_)[i].weight; + if (v < 0) { + break; + } + } + + assert(i < types_->size()); + if ((num_tests_ % FLAGS_log_every_n_tests) == 0) { + fprintf(LOGSTREAM, "Test %d out of %d: %s\n", + num_tests_, FLAGS_numtests, (*types_)[i].name.c_str()); + } + num_tests_++; + return (*types_)[i].type; +} + + +// Info kept per thread +class TesterThread { + private: + // Info kept per allocated object + struct Object { + char* ptr; // Allocated pointer + int size; // Allocated size + int generation; // Generation counter of object contents + }; + + pthread_mutex_t lock_; // For passing in another thread's obj + int id_; // My thread id + TestHarness rnd_; // For generating random numbers + vector<Object> heap_; // This thread's heap + vector<Object> passed_; // Pending objects passed from others + size_t heap_size_; // Current heap size + int locks_ok_; // Number of OK TryLock() ops + int locks_failed_; // Number of failed TryLock() ops + + // Type of operations + enum Type { MALLOC, FREE, UPDATE, PASS }; + + public: + TesterThread(int id) + : id_(id), + rnd_(id+1), + heap_size_(0), + locks_ok_(0), + locks_failed_(0) { + CHECK_EQ(pthread_mutex_init(&lock_, NULL), 0); + } + + virtual ~TesterThread() { + fprintf(LOGSTREAM, "Thread %2d: locks %6d ok; %6d trylocks failed\n", + id_, locks_ok_, locks_failed_); + if (locks_ok_ + locks_failed_ >= 1000) { + CHECK_LE(locks_failed_, locks_ok_ / 2); + } + } + + virtual void Run() { + rnd_.AddType(MALLOC, FLAGS_mallocweight, "malloc"); + rnd_.AddType(FREE, FLAGS_freeweight, "free"); + rnd_.AddType(UPDATE, FLAGS_updateweight, "update"); + rnd_.AddType(PASS, FLAGS_passweight, "pass"); + + while (true) { + AcquirePassedObjects(); + + switch (rnd_.PickType()) { + case MALLOC: AllocateObject(); break; + case FREE: FreeObject(); break; + case UPDATE: UpdateObject(); break; + case PASS: PassObject(); break; + case -1: goto done; + default: assert("" == "Unknown type"); + } + + ShrinkHeap(); + } + + done: + DeleteHeap(); + } + + // Allocate a new object + void AllocateObject() { + Object object; + object.size = rnd_.Skewed(FLAGS_lgmaxsize); + object.ptr = reinterpret_cast<char*>(malloc(object.size)); + object.generation = 0; + FillContents(&object); + heap_.push_back(object); + heap_size_ += object.size; + } + + // Mutate a random object + void UpdateObject() { + if (heap_.empty()) return; + const int index = rnd_.Uniform(heap_.size()); + CheckContents(heap_[index]); + heap_[index].generation++; + FillContents(&heap_[index]); + } + + // Free a random object + void FreeObject() { + if (heap_.empty()) return; + const int index = rnd_.Uniform(heap_.size()); + Object object = heap_[index]; + CheckContents(object); + free(object.ptr); + heap_size_ -= object.size; + heap_[index] = heap_[heap_.size()-1]; + heap_.pop_back(); + } + + // Delete all objects in the heap + void DeleteHeap() { + while (!heap_.empty()) { + FreeObject(); + } + } + + // Free objects until our heap is small enough + void ShrinkHeap() { + while (heap_size_ > FLAGS_threadmb << 20) { + assert(!heap_.empty()); + FreeObject(); + } + } + + // Pass a random object to another thread + void PassObject() { + // Pick object to pass + if (heap_.empty()) return; + const int index = rnd_.Uniform(heap_.size()); + Object object = heap_[index]; + CheckContents(object); + + // Pick thread to pass + const int tid = rnd_.Uniform(FLAGS_numthreads); + TesterThread* thread = threads[tid]; + + if (pthread_mutex_trylock(&thread->lock_) == 0) { + // Pass the object + locks_ok_++; + thread->passed_.push_back(object); + CHECK_EQ(pthread_mutex_unlock(&thread->lock_), 0); + heap_size_ -= object.size; + heap_[index] = heap_[heap_.size()-1]; + heap_.pop_back(); + } else { + locks_failed_++; + } + } + + // Grab any objects passed to this thread by another thread + void AcquirePassedObjects() { + // We do not create unnecessary contention by always using + // TryLock(). Plus we unlock immediately after swapping passed + // objects into a local vector. + vector<Object> copy; + { // Locking scope + if (pthread_mutex_trylock(&lock_) != 0) { + locks_failed_++; + return; + } + locks_ok_++; + swap(copy, passed_); + CHECK_EQ(pthread_mutex_unlock(&lock_), 0); + } + + for (int i = 0; i < copy.size(); ++i) { + const Object& object = copy[i]; + CheckContents(object); + heap_.push_back(object); + heap_size_ += object.size; + } + } + + // Fill object contents according to ptr/generation + void FillContents(Object* object) { + // It doesn't need to be a great random number generator, but it does + // need to be a re-entrant one, since this is run in a thread + unsigned short xsubi[3]; // arguments for nrand48() + xsubi[0] = (reinterpret_cast<intptr_t>(object->ptr) & 0x7fff0000) >> 16; + xsubi[1] = (reinterpret_cast<intptr_t>(object->ptr) & 0x0000ffff); + xsubi[2] = 0x1234; + + for (int i = 0; i < object->generation; ++i) { + nrand48(xsubi); + } + const char c = static_cast<char>(nrand48(xsubi)); + memset(object->ptr, c, object->size); + } + + // Check object contents + void CheckContents(const Object& object) { + // It doesn't need to be a great random number generator, but it does + // need to be a re-entrant one, since this is run in a thread + unsigned short xsubi[3]; // arguments for nrand48() + xsubi[0] = (reinterpret_cast<intptr_t>(object.ptr) & 0x7fff0000) >> 16; + xsubi[1] = (reinterpret_cast<intptr_t>(object.ptr) & 0x0000ffff); + xsubi[2] = 0x1234; + + for (int i = 0; i < object.generation; ++i) { + nrand48(xsubi); + } + + // For large objects, we just check a prefix/suffix + const char expected = static_cast<char>(nrand48(xsubi)); + const int limit1 = object.size < 32 ? object.size : 32; + const int start2 = limit1 > object.size - 32 ? limit1 : object.size - 32; + for (int i = 0; i < limit1; ++i) { + CHECK_EQ(object.ptr[i], expected); + } + for (int i = start2; i < object.size; ++i) { + CHECK_EQ(object.ptr[i], expected); + } + } +}; + +static void* RunThread(void *vthread) { + TesterThread* t = reinterpret_cast<TesterThread*>(vthread); + t->Run(); + return NULL; +} + +static void TryHugeAllocation(size_t s) { + void* p = malloc(s); + CHECK(p == NULL); // huge allocation s should fail! +} + +static void TestHugeAllocations() { + // Check that asking for stuff tiny bit smaller than largest possible + // size returns NULL. + for (size_t i = 0; i < 10000; i++) { + TryHugeAllocation(kMaxSize - i); + } + + // Check that asking for stuff near signed/unsigned boundary returns NULL + for (size_t i = 0; i < 100; i++) { + TryHugeAllocation(kMaxSignedSize - i); + TryHugeAllocation(kMaxSignedSize + i); + } +} + +static void TestCalloc(size_t n, size_t s, bool ok) { + char* p = reinterpret_cast<char*>(calloc(n, s)); + fprintf(LOGSTREAM, "calloc(%x, %x): %p\n", n, s, p); + if (!ok) { + CHECK(p == NULL); // calloc(n, s) should succeed + } else { + CHECK(p != NULL); // calloc(n, s) should not succeed + for (int i = 0; i < n*s; i++) { + CHECK(p[i] == '\0'); + } + free(p); + } +} + +static void TestNewHandler() throw (std::bad_alloc) { + ++news_handled; + throw std::bad_alloc(); +} + +static void TestOneNew(void* (*func)(size_t)) { + // success test + try { + void* ptr = (*func)(kNotTooBig); + if (0 == ptr) { + fprintf(LOGSTREAM, "allocation should not have failed.\n"); + abort(); + } + } catch (...) { + fprintf(LOGSTREAM, "allocation threw unexpected exception.\n"); + abort(); + } + + // failure test + // we should always receive a bad_alloc exception + try { + (*func)(kTooBig); + fprintf(LOGSTREAM, "allocation should have failed.\n"); + abort(); + } catch (const std::bad_alloc&) { + // correct + } catch (...) { + fprintf(LOGSTREAM, "allocation threw unexpected exception.\n"); + abort(); + } +} + +static void TestNew(void* (*func)(size_t)) { + news_handled = 0; + + // test without new_handler: + std::new_handler saved_handler = std::set_new_handler(0); + TestOneNew(func); + + // test with new_handler: + std::set_new_handler(TestNewHandler); + TestOneNew(func); + if (news_handled != 1) { + fprintf(LOGSTREAM, "new_handler was not called.\n"); + abort(); + } + std::set_new_handler(saved_handler); +} + +static void TestOneNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) { + // success test + try { + void* ptr = (*func)(kNotTooBig, std::nothrow); + if (0 == ptr) { + fprintf(LOGSTREAM, "allocation should not have failed.\n"); + abort(); + } + } catch (...) { + fprintf(LOGSTREAM, "allocation threw unexpected exception.\n"); + abort(); + } + + // failure test + // we should always receive a bad_alloc exception + try { + if ((*func)(kTooBig, std::nothrow) != 0) { + fprintf(LOGSTREAM, "allocation should have failed.\n"); + abort(); + } + } catch (...) { + fprintf(LOGSTREAM, "nothrow allocation threw unexpected exception.\n"); + abort(); + } +} + +static void TestNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) { + news_handled = 0; + + // test without new_handler: + std::new_handler saved_handler = std::set_new_handler(0); + TestOneNothrowNew(func); + + // test with new_handler: + std::set_new_handler(TestNewHandler); + TestOneNothrowNew(func); + if (news_handled != 1) { + fprintf(LOGSTREAM, "nothrow new_handler was not called.\n"); + abort(); + } + std::set_new_handler(saved_handler); +} + + +int main(int argc, char** argv) { + // TODO(csilvers): port MemoryUsage() over so the test can use that +#if 0 + // Allocate and deallocate blocks of increasing sizes to check if the alloc + // metadata fragments the memory. (Do not put other allocations/deallocations + // before this test, it may break). + { + size_t memory_usage = MemoryUsage(getpid()); + fprintf(LOGSTREAM, "==== Testing fragmentation\n"); + for ( int i = 200; i < 240; ++i ) { + int size = i << 20; + void *test1 = malloc(size); + for ( int j = 0; j < size; j += (1 << 12) ) { + static_cast<char*>(test1)[j] = 1; + } + free(test1); + } + // There may still be a bit of fragmentation at the beginning, until we + // reach kPageMapBigAllocationThreshold bytes so we check for + // 200 + 240 + margin. + CHECK_LT(MemoryUsage(getpid()), memory_usage + (450 << 20) ); + } +#endif + + // Check that empty allocation works + fprintf(LOGSTREAM, "==== Testing empty allocation\n"); + { + void* p1 = malloc(0); + CHECK(p1 != NULL); + void* p2 = malloc(0); + CHECK(p2 != NULL); + CHECK(p1 != p2); + free(p1); + free(p2); + } + + // Check that 512MB can be allocated + fprintf(LOGSTREAM, "==== Testing large allocation\n"); + { + void* p = malloc(512<<20); + CHECK(p != NULL); // could not allocate 512MB + free(p); + } + + // Check that large allocations fail with NULL instead of crashing + fprintf(LOGSTREAM, "==== Testing out of memory\n"); + for (int s = 0; ; s += (10<<20)) { + void* large_object = malloc(s); + if (large_object == NULL) break; + free(large_object); + } + + // Check that huge allocations fail with NULL instead of crashing + fprintf(LOGSTREAM, "==== Testing huge allocations\n"); + TestHugeAllocations(); + + // Check calloc() with various arguments + fprintf(LOGSTREAM, "==== Testing calloc\n"); + TestCalloc(0, 0, true); + TestCalloc(0, 1, true); + TestCalloc(1, 1, true); + TestCalloc(1<<10, 0, true); + TestCalloc(1<<20, 0, true); + TestCalloc(0, 1<<10, true); + TestCalloc(0, 1<<20, true); + TestCalloc(1<<20, 2, true); + TestCalloc(2, 1<<20, true); + TestCalloc(1000, 1000, true); + + TestCalloc(kMaxSize, 2, false); + TestCalloc(2, kMaxSize, false); + TestCalloc(kMaxSize, kMaxSize, false); + + TestCalloc(kMaxSignedSize, 3, false); + TestCalloc(3, kMaxSignedSize, false); + TestCalloc(kMaxSignedSize, kMaxSignedSize, false); + + fprintf(LOGSTREAM, "Testing operator new(nothrow).\n"); + TestNothrowNew(&::operator new); + fprintf(LOGSTREAM, "Testing operator new[](nothrow).\n"); + TestNothrowNew(&::operator new[]); + fprintf(LOGSTREAM, "Testing operator new.\n"); + TestNew(&::operator new); + fprintf(LOGSTREAM, "Testing operator new[].\n"); + TestNew(&::operator new[]); + fprintf(LOGSTREAM, "Done testing C++ operators.\n"); -#define BUFSIZE (100 << 10) + // Create threads + fprintf(LOGSTREAM, "==== Testing threaded allocation/deallocation\n"); + threads = new TesterThread*[FLAGS_numthreads]; + pthread_t* thread_ids = new pthread_t[FLAGS_numthreads]; + for (int i = 0; i < FLAGS_numthreads; ++i) { + threads[i] = new TesterThread(i); + } -int main(int argc, char **argv) { - char *buf1 = (char *)malloc(BUFSIZE); - memset(buf1, 0, BUFSIZE); - printf("Allocated buf1 via malloc() at %p\n", buf1); + // Start + for (int i = 0; i < FLAGS_numthreads; ++i) { + CHECK_EQ(pthread_create(&thread_ids[i], NULL, RunThread, threads[i]), 0); + } - char *buf2 = new char[BUFSIZE]; - memset(buf2, 0, BUFSIZE); - printf("Allocated buf2 via new at %p\n", buf2); - - free(buf1); - delete[] buf2; + // Wait + for (int i = 0; i < FLAGS_numthreads; ++i) { + void* junk; + CHECK_EQ(pthread_join(thread_ids[i], &junk), 0); + } - char buffer[10 << 10]; - MallocExtension::instance()->GetStats(buffer, sizeof(buffer)); - printf("Malloc stats:\n%s\n", buffer); + for (int i = 0; i < FLAGS_numthreads; ++i) delete threads[i]; // Cleanup + fprintf(LOGSTREAM, "PASS\n"); return 0; } |