* Add new (std::nothrow) to debugallocation (corrado)

* Add a flag to ignore unaligned-ptr leaks (archanakannan) * PORTING: Add get-pc capabilities for a new OS (csilvers) * Don't register malloc extension under valgrind (csilvers) * Fix throw specs for our global operator new (chandlerc) * PORTING: link to instructions on windows static overrides (mbelshe) * Fix prototype differences in debugalloc (chandlerc, csilvers, wan) * Change pprof to handle big-endian input files (csilvers) * Properly align allocation sizes on Windows (antonm) * Improve IsRunningOnValgrind, using valgrind.h (csilvers, kcc) * Improve the accuracy of system_alloc actual_size (csilvers) * Add interactive callgrind support to pprof (weidenri...) * Fix off-by-one problems when symbolizing in pprof (dpeng) * Be more permissive in allowed library names, in pprof (csilvers) * PORTING: Fix pc_from_ucontext to handle cygwin and redhat7 (csilvers) * Fix stacktrace to avoid inlining (ppluzhnikov) git-svn-id: http://gperftools.googlecode.com/svn/trunk@91 6b5cf1ce-ec42-a296-1ba9-69fdba395a50
author: csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50> 2010-03-23 20:39:55 +0000
committer: csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50> 2010-03-23 20:39:55 +0000
commit: 92beff88437b31f4a618640b88487e0f8dfb7017 (patch)
tree: d15e670fdc74a690d012c25e16a2d6efa4ab7d26
parent: 23dd124970bc11636feaa240394063ba5889ca54 (diff)
download: gperftools-92beff88437b31f4a618640b88487e0f8dfb7017.tar.gz
38 files changed, 5372 insertions, 1431 deletions
diff --git a/Makefile.am b/Makefile.am
index 3de910e..73635db 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -143,10 +143,11 @@ dist_doc_DATA += doc/index.html doc/designstyle.css
 LOGGING_INCLUDES = src/base/logging.h \
                    src/base/commandlineflags.h \
                    src/base/basictypes.h \
-                   src/base/dynamic_annotations.h
+                   src/base/dynamic_annotations.h \
+                   src/third_party/valgrind.h
 noinst_LTLIBRARIES += liblogging.la
 liblogging_la_SOURCES = src/base/logging.cc \
-                        src/base/dynamic_annotations.cc \
+                        src/base/dynamic_annotations.c \
                         $(LOGGING_INCLUDES)
 
 SYSINFO_INCLUDES = src/base/sysinfo.h \
@@ -279,7 +280,6 @@ googleinclude_HEADERS += $(SG_STACKTRACE_INCLUDES)
 ### Making the library
 noinst_LTLIBRARIES += libstacktrace.la
 libstacktrace_la_SOURCES = src/stacktrace.cc \
-                           src/stacktrace_with_context.cc \
                            src/base/vdso_support.cc \
                            $(STACKTRACE_INCLUDES)
 libstacktrace_la_LIBADD = $(UNWIND_LIBS) $(LIBSPINLOCK)
diff --git a/Makefile.in b/Makefile.in
index 0db79a2..a717bed 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -294,7 +294,8 @@ am__libprofiler_la_SOURCES_DIST = src/profiler.cc \
 	src/base/atomicops-internals-x86.h \
 	src/base/spinlock_win32-inl.h src/base/spinlock_linux-inl.h \
 	src/base/spinlock_posix-inl.h src/base/dynamic_annotations.h \
-	src/google/profiler.h src/google/stacktrace.h
+	src/third_party/valgrind.h src/google/profiler.h \
+	src/google/stacktrace.h
 @WITH_CPU_PROFILER_TRUE@am__objects_2 = $(am__objects_1) \
 @WITH_CPU_PROFILER_TRUE@	$(am__objects_1)
 @WITH_CPU_PROFILER_TRUE@am__objects_3 = $(am__objects_2) \
@@ -328,16 +329,14 @@ libspinlock_la_OBJECTS = $(am_libspinlock_la_OBJECTS)
 @WITH_STACK_TRACE_TRUE@	$(am__DEPENDENCIES_1) \
 @WITH_STACK_TRACE_TRUE@	$(am__DEPENDENCIES_2)
 am__libstacktrace_la_SOURCES_DIST = src/stacktrace.cc \
-	src/stacktrace_with_context.cc src/base/vdso_support.cc \
-	src/stacktrace_config.h src/stacktrace_generic-inl.h \
-	src/stacktrace_libunwind-inl.h src/stacktrace_powerpc-inl.h \
-	src/stacktrace_x86_64-inl.h src/stacktrace_x86-inl.h \
-	src/stacktrace_win32-inl.h src/base/vdso_support.h \
-	src/google/stacktrace.h
+	src/base/vdso_support.cc src/stacktrace_config.h \
+	src/stacktrace_generic-inl.h src/stacktrace_libunwind-inl.h \
+	src/stacktrace_powerpc-inl.h src/stacktrace_x86_64-inl.h \
+	src/stacktrace_x86-inl.h src/stacktrace_win32-inl.h \
+	src/base/vdso_support.h src/google/stacktrace.h
 @WITH_STACK_TRACE_TRUE@am__objects_4 = $(am__objects_1) \
 @WITH_STACK_TRACE_TRUE@	$(am__objects_1)
 @WITH_STACK_TRACE_TRUE@am_libstacktrace_la_OBJECTS = stacktrace.lo \
-@WITH_STACK_TRACE_TRUE@	stacktrace_with_context.lo \
 @WITH_STACK_TRACE_TRUE@	vdso_support.lo $(am__objects_4)
 libstacktrace_la_OBJECTS = $(am_libstacktrace_la_OBJECTS)
 @WITH_STACK_TRACE_TRUE@am_libstacktrace_la_rpath =
@@ -364,17 +363,17 @@ am__libtcmalloc_la_SOURCES_DIST = src/tcmalloc.cc src/common.h \
 	src/thread_cache.h src/stack_trace_table.h \
 	src/base/thread_annotations.h src/malloc_hook-inl.h \
 	src/maybe_threads.h src/base/logging.h \
-	src/base/dynamic_annotations.h src/addressmap-inl.h \
-	src/raw_printer.h src/base/elfcore.h src/base/googleinit.h \
-	src/base/linux_syscall_support.h src/base/linuxthreads.h \
-	src/base/stl_allocator.h src/base/sysinfo.h \
-	src/base/thread_lister.h src/heap-profile-table.h \
-	src/google/malloc_hook.h src/google/malloc_hook_c.h \
-	src/google/malloc_extension.h src/google/malloc_extension_c.h \
-	src/google/stacktrace.h src/google/heap-profiler.h \
-	src/google/heap-checker.h src/base/thread_lister.c \
-	src/base/linuxthreads.cc src/heap-checker.cc \
-	src/heap-checker-bcad.cc
+	src/base/dynamic_annotations.h src/third_party/valgrind.h \
+	src/addressmap-inl.h src/raw_printer.h src/base/elfcore.h \
+	src/base/googleinit.h src/base/linux_syscall_support.h \
+	src/base/linuxthreads.h src/base/stl_allocator.h \
+	src/base/sysinfo.h src/base/thread_lister.h \
+	src/heap-profile-table.h src/google/malloc_hook.h \
+	src/google/malloc_hook_c.h src/google/malloc_extension.h \
+	src/google/malloc_extension_c.h src/google/stacktrace.h \
+	src/google/heap-profiler.h src/google/heap-checker.h \
+	src/base/thread_lister.c src/base/linuxthreads.cc \
+	src/heap-checker.cc src/heap-checker-bcad.cc
 @MINGW_FALSE@am__objects_5 = libtcmalloc_la-tcmalloc.lo
 am__objects_6 = $(am__objects_1)
 @WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__objects_7 = $(am__objects_6) \
@@ -414,17 +413,17 @@ am__libtcmalloc_and_profiler_la_SOURCES_DIST = src/tcmalloc.cc \
 	src/thread_cache.h src/stack_trace_table.h \
 	src/base/thread_annotations.h src/malloc_hook-inl.h \
 	src/maybe_threads.h src/base/logging.h \
-	src/base/dynamic_annotations.h src/addressmap-inl.h \
-	src/raw_printer.h src/base/elfcore.h src/base/googleinit.h \
-	src/base/linux_syscall_support.h src/base/linuxthreads.h \
-	src/base/stl_allocator.h src/base/sysinfo.h \
-	src/base/thread_lister.h src/heap-profile-table.h \
-	src/google/malloc_hook.h src/google/malloc_hook_c.h \
-	src/google/malloc_extension.h src/google/malloc_extension_c.h \
-	src/google/stacktrace.h src/google/heap-profiler.h \
-	src/google/heap-checker.h src/base/thread_lister.c \
-	src/base/linuxthreads.cc src/heap-checker.cc \
-	src/heap-checker-bcad.cc src/profiler.cc \
+	src/base/dynamic_annotations.h src/third_party/valgrind.h \
+	src/addressmap-inl.h src/raw_printer.h src/base/elfcore.h \
+	src/base/googleinit.h src/base/linux_syscall_support.h \
+	src/base/linuxthreads.h src/base/stl_allocator.h \
+	src/base/sysinfo.h src/base/thread_lister.h \
+	src/heap-profile-table.h src/google/malloc_hook.h \
+	src/google/malloc_hook_c.h src/google/malloc_extension.h \
+	src/google/malloc_extension_c.h src/google/stacktrace.h \
+	src/google/heap-profiler.h src/google/heap-checker.h \
+	src/base/thread_lister.c src/base/linuxthreads.cc \
+	src/heap-checker.cc src/heap-checker-bcad.cc src/profiler.cc \
 	src/profile-handler.cc src/profiledata.cc src/profiledata.h \
 	src/profile-handler.h src/getpc.h src/base/simple_mutex.h \
 	src/google/profiler.h
@@ -467,15 +466,15 @@ am__libtcmalloc_debug_la_SOURCES_DIST = src/debugallocation.cc \
 	src/thread_cache.h src/stack_trace_table.h \
 	src/base/thread_annotations.h src/malloc_hook-inl.h \
 	src/maybe_threads.h src/base/logging.h \
-	src/base/dynamic_annotations.h src/addressmap-inl.h \
-	src/raw_printer.h src/base/elfcore.h src/base/googleinit.h \
-	src/base/linux_syscall_support.h src/base/linuxthreads.h \
-	src/base/stl_allocator.h src/base/sysinfo.h \
-	src/base/thread_lister.h src/heap-profile-table.h \
-	src/google/malloc_hook.h src/google/malloc_hook_c.h \
-	src/google/malloc_extension.h src/google/malloc_extension_c.h \
-	src/google/stacktrace.h src/google/heap-profiler.h \
-	src/google/heap-checker.h
+	src/base/dynamic_annotations.h src/third_party/valgrind.h \
+	src/addressmap-inl.h src/raw_printer.h src/base/elfcore.h \
+	src/base/googleinit.h src/base/linux_syscall_support.h \
+	src/base/linuxthreads.h src/base/stl_allocator.h \
+	src/base/sysinfo.h src/base/thread_lister.h \
+	src/heap-profile-table.h src/google/malloc_hook.h \
+	src/google/malloc_hook_c.h src/google/malloc_extension.h \
+	src/google/malloc_extension_c.h src/google/stacktrace.h \
+	src/google/heap-profiler.h src/google/heap-checker.h
 @WITH_HEAP_CHECKER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@am__objects_17 = thread_lister.lo \
 @WITH_HEAP_CHECKER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@	libtcmalloc_debug_la-linuxthreads.lo \
 @WITH_HEAP_CHECKER_TRUE@@WITH_HEAP_PROFILER_OR_CHECKER_TRUE@	libtcmalloc_debug_la-heap-checker.lo \
@@ -513,14 +512,15 @@ am__libtcmalloc_internal_la_SOURCES_DIST = src/common.cc \
 	src/google/malloc_hook_c.h src/google/malloc_extension.h \
 	src/google/malloc_extension_c.h src/google/stacktrace.h \
 	src/base/logging.h src/base/dynamic_annotations.h \
-	src/addressmap-inl.h src/raw_printer.h src/base/elfcore.h \
-	src/base/googleinit.h src/base/linux_syscall_support.h \
-	src/base/linuxthreads.h src/base/stl_allocator.h \
-	src/base/sysinfo.h src/base/thread_lister.h \
-	src/heap-profile-table.h src/google/heap-profiler.h \
-	src/google/heap-checker.h src/base/low_level_alloc.cc \
-	src/heap-profile-table.cc src/heap-profiler.cc \
-	src/raw_printer.cc src/memory_region_map.cc
+	src/third_party/valgrind.h src/addressmap-inl.h \
+	src/raw_printer.h src/base/elfcore.h src/base/googleinit.h \
+	src/base/linux_syscall_support.h src/base/linuxthreads.h \
+	src/base/stl_allocator.h src/base/sysinfo.h \
+	src/base/thread_lister.h src/heap-profile-table.h \
+	src/google/heap-profiler.h src/google/heap-checker.h \
+	src/base/low_level_alloc.cc src/heap-profile-table.cc \
+	src/heap-profiler.cc src/raw_printer.cc \
+	src/memory_region_map.cc
 @MINGW_FALSE@am__objects_18 = libtcmalloc_internal_la-system-alloc.lo
 @MINGW_FALSE@am__objects_19 =  \
 @MINGW_FALSE@	libtcmalloc_internal_la-maybe_threads.lo
@@ -728,7 +728,8 @@ am__addressmap_unittest_SOURCES_DIST =  \
 	src/tests/addressmap_unittest.cc src/addressmap-inl.h \
 	src/base/commandlineflags.h src/base/logging.h \
 	src/base/basictypes.h src/base/dynamic_annotations.h \
-	src/windows/port.h src/windows/port.cc
+	src/third_party/valgrind.h src/windows/port.h \
+	src/windows/port.cc
 @MINGW_TRUE@am__objects_25 = addressmap_unittest-port.$(OBJEXT)
 am_addressmap_unittest_OBJECTS =  \
 	addressmap_unittest-addressmap_unittest.$(OBJEXT) \
@@ -774,7 +775,7 @@ am__heap_checker_debug_unittest_SOURCES_DIST =  \
 	src/memory_region_map.h src/base/commandlineflags.h \
 	src/base/googleinit.h src/google/heap-checker.h \
 	src/base/logging.h src/base/basictypes.h \
-	src/base/dynamic_annotations.h
+	src/base/dynamic_annotations.h src/third_party/valgrind.h
 @WITH_HEAP_CHECKER_TRUE@am__objects_26 = $(am__objects_1)
 @WITH_HEAP_CHECKER_TRUE@am__objects_27 = heap_checker_debug_unittest-heap-checker_unittest.$(OBJEXT) \
 @WITH_HEAP_CHECKER_TRUE@	$(am__objects_26)
@@ -795,7 +796,7 @@ am__heap_checker_unittest_SOURCES_DIST =  \
 	src/memory_region_map.h src/base/commandlineflags.h \
 	src/base/googleinit.h src/google/heap-checker.h \
 	src/base/logging.h src/base/basictypes.h \
-	src/base/dynamic_annotations.h
+	src/base/dynamic_annotations.h src/third_party/valgrind.h
 @WITH_HEAP_CHECKER_TRUE@am_heap_checker_unittest_OBJECTS = heap_checker_unittest-heap-checker_unittest.$(OBJEXT) \
 @WITH_HEAP_CHECKER_TRUE@	$(am__objects_26)
 heap_checker_unittest_OBJECTS = $(am_heap_checker_unittest_OBJECTS)
@@ -853,7 +854,8 @@ am__low_level_alloc_unittest_SOURCES_DIST =  \
 	src/base/atomicops-internals-x86.h \
 	src/base/spinlock_win32-inl.h src/base/spinlock_linux-inl.h \
 	src/base/spinlock_posix-inl.h src/base/logging.h \
-	src/base/commandlineflags.h src/base/dynamic_annotations.h
+	src/base/commandlineflags.h src/base/dynamic_annotations.h \
+	src/third_party/valgrind.h
 am__objects_29 = $(am__objects_1) $(am__objects_1)
 am_low_level_alloc_unittest_OBJECTS =  \
 	low_level_alloc_unittest-low_level_alloc.$(OBJEXT) \
@@ -1077,7 +1079,7 @@ am__stacktrace_unittest_SOURCES_DIST =  \
 	src/stacktrace_x86-inl.h src/stacktrace_win32-inl.h \
 	src/base/vdso_support.h src/google/stacktrace.h \
 	src/base/logging.h src/base/basictypes.h \
-	src/base/dynamic_annotations.h
+	src/base/dynamic_annotations.h src/third_party/valgrind.h
 @WITH_STACK_TRACE_TRUE@am__objects_39 = $(am__objects_4) \
 @WITH_STACK_TRACE_TRUE@	$(am__objects_1)
 @WITH_STACK_TRACE_TRUE@am_stacktrace_unittest_OBJECTS =  \
@@ -1702,10 +1704,11 @@ EXTRA_INSTALL =
 LOGGING_INCLUDES = src/base/logging.h \
                    src/base/commandlineflags.h \
                    src/base/basictypes.h \
-                   src/base/dynamic_annotations.h
+                   src/base/dynamic_annotations.h \
+                   src/third_party/valgrind.h
 
 liblogging_la_SOURCES = src/base/logging.cc \
-                        src/base/dynamic_annotations.cc \
+                        src/base/dynamic_annotations.c \
                         $(LOGGING_INCLUDES)
 
 SYSINFO_INCLUDES = src/base/sysinfo.h \
@@ -1823,7 +1826,6 @@ atomicops_unittest_LDADD = $(LIBSPINLOCK)
 @WITH_STACK_TRACE_TRUE@SG_STACKTRACE_INCLUDES = src/google/stacktrace.h
 @WITH_STACK_TRACE_TRUE@STACKTRACE_INCLUDES = $(S_STACKTRACE_INCLUDES) $(SG_STACKTRACE_INCLUDES)
 @WITH_STACK_TRACE_TRUE@libstacktrace_la_SOURCES = src/stacktrace.cc \
-@WITH_STACK_TRACE_TRUE@                           src/stacktrace_with_context.cc \
 @WITH_STACK_TRACE_TRUE@                           src/base/vdso_support.cc \
 @WITH_STACK_TRACE_TRUE@                           $(STACKTRACE_INCLUDES)
 
@@ -2787,7 +2789,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stack_trace_table_test-stack_trace_table_test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stacktrace.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stacktrace_unittest.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stacktrace_with_context.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sysinfo.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/system_alloc_unittest-system-alloc_unittest.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tcmalloc_and_profiler_unittest-tcmalloc_unittest.Po@am__quote@
@@ -2830,6 +2831,13 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
 
+dynamic_annotations.lo: src/base/dynamic_annotations.c
+@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT dynamic_annotations.lo -MD -MP -MF "$(DEPDIR)/dynamic_annotations.Tpo" -c -o dynamic_annotations.lo `test -f 'src/base/dynamic_annotations.c' || echo '$(srcdir)/'`src/base/dynamic_annotations.c; \
+@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/dynamic_annotations.Tpo" "$(DEPDIR)/dynamic_annotations.Plo"; else rm -f "$(DEPDIR)/dynamic_annotations.Tpo"; exit 1; fi
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='src/base/dynamic_annotations.c' object='dynamic_annotations.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o dynamic_annotations.lo `test -f 'src/base/dynamic_annotations.c' || echo '$(srcdir)/'`src/base/dynamic_annotations.c
+
 thread_lister.lo: src/base/thread_lister.c
 @am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT thread_lister.lo -MD -MP -MF "$(DEPDIR)/thread_lister.Tpo" -c -o thread_lister.lo `test -f 'src/base/thread_lister.c' || echo '$(srcdir)/'`src/base/thread_lister.c; \
 @am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/thread_lister.Tpo" "$(DEPDIR)/thread_lister.Plo"; else rm -f "$(DEPDIR)/thread_lister.Tpo"; exit 1; fi
@@ -2879,13 +2887,6 @@ logging.lo: src/base/logging.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o logging.lo `test -f 'src/base/logging.cc' || echo '$(srcdir)/'`src/base/logging.cc
 
-dynamic_annotations.lo: src/base/dynamic_annotations.cc
-@am__fastdepCXX_TRUE@	if $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT dynamic_annotations.lo -MD -MP -MF "$(DEPDIR)/dynamic_annotations.Tpo" -c -o dynamic_annotations.lo `test -f 'src/base/dynamic_annotations.cc' || echo '$(srcdir)/'`src/base/dynamic_annotations.cc; \
-@am__fastdepCXX_TRUE@	then mv -f "$(DEPDIR)/dynamic_annotations.Tpo" "$(DEPDIR)/dynamic_annotations.Plo"; else rm -f "$(DEPDIR)/dynamic_annotations.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='src/base/dynamic_annotations.cc' object='dynamic_annotations.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o dynamic_annotations.lo `test -f 'src/base/dynamic_annotations.cc' || echo '$(srcdir)/'`src/base/dynamic_annotations.cc
-
 profiler.lo: src/profiler.cc
 @am__fastdepCXX_TRUE@	if $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT profiler.lo -MD -MP -MF "$(DEPDIR)/profiler.Tpo" -c -o profiler.lo `test -f 'src/profiler.cc' || echo '$(srcdir)/'`src/profiler.cc; \
 @am__fastdepCXX_TRUE@	then mv -f "$(DEPDIR)/profiler.Tpo" "$(DEPDIR)/profiler.Plo"; else rm -f "$(DEPDIR)/profiler.Tpo"; exit 1; fi
@@ -2928,13 +2929,6 @@ stacktrace.lo: src/stacktrace.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o stacktrace.lo `test -f 'src/stacktrace.cc' || echo '$(srcdir)/'`src/stacktrace.cc
 
-stacktrace_with_context.lo: src/stacktrace_with_context.cc
-@am__fastdepCXX_TRUE@	if $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT stacktrace_with_context.lo -MD -MP -MF "$(DEPDIR)/stacktrace_with_context.Tpo" -c -o stacktrace_with_context.lo `test -f 'src/stacktrace_with_context.cc' || echo '$(srcdir)/'`src/stacktrace_with_context.cc; \
-@am__fastdepCXX_TRUE@	then mv -f "$(DEPDIR)/stacktrace_with_context.Tpo" "$(DEPDIR)/stacktrace_with_context.Plo"; else rm -f "$(DEPDIR)/stacktrace_with_context.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='src/stacktrace_with_context.cc' object='stacktrace_with_context.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o stacktrace_with_context.lo `test -f 'src/stacktrace_with_context.cc' || echo '$(srcdir)/'`src/stacktrace_with_context.cc
-
 vdso_support.lo: src/base/vdso_support.cc
 @am__fastdepCXX_TRUE@	if $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT vdso_support.lo -MD -MP -MF "$(DEPDIR)/vdso_support.Tpo" -c -o vdso_support.lo `test -f 'src/base/vdso_support.cc' || echo '$(srcdir)/'`src/base/vdso_support.cc; \
 @am__fastdepCXX_TRUE@	then mv -f "$(DEPDIR)/vdso_support.Tpo" "$(DEPDIR)/vdso_support.Plo"; else rm -f "$(DEPDIR)/vdso_support.Tpo"; exit 1; fi
diff --git a/README.windows b/README.windows
index 750aa51..9844087 100644
--- a/README.windows
+++ b/README.windows
@@ -1,3 +1,5 @@
+--- COMPILING
+
 This project has begun being ported to Windows.  A working solution
 file exists in this directory:
     google-perftools.sln
@@ -45,6 +47,17 @@ line of every perftools .cc file.  You do not need to depend on the
 tcmalloc symbol in this case (that is, you don't need to do either
 step 1 or step 2 from above).
 
+An alternative to all the above is to statically link your application
+with libc, and then replace its malloc with tcmalloc.  This allows you
+to just build and link your program normally; the tcmalloc support
+comes in a post-processing step.  This is more reliable than the above
+technique (which depends on run-time patching, which is inherently
+fragile), though more work to set up.  For details, see
+https://groups.google.com/group/google-perftools/browse_thread/thread/41cd3710af85e57b
+
+
+--- THE HEAP-PROFILER
+
 The heap-profiler has had a preliminary port to Windows.  It has not
 been well tested, and probably does not work at all when Frame Pointer
 Optimization (FPO) is enabled -- that is, in release mode.  The other
@@ -52,6 +65,8 @@ features of perftools, such as the cpu-profiler and leak-checker, have
 not yet been ported to Windows at all.
 
 
+--- ISSUES
+
 NOTE FOR WIN2K USERS: According to reports
 (http://code.google.com/p/google-perftools/issues/detail?id=127)
 the stack-tracing necessary for the heap-profiler does not work on
@@ -60,7 +75,6 @@ is to add "/D NO_TCMALLOC_SAMPLES=" to your build, to turn off the
 stack-tracing.  You will not be able to use the heap-profiler if you
 do this.
 
-
 NOTE ON _MSIZE and _RECALLOC: The tcmalloc version of _msize returns
 the size of the region tcmalloc allocated for you -- which is at least
 as many bytes you asked for, but may be more.  (btw, these *are* bytes
@@ -82,4 +96,4 @@ them on the google-perftools Google Code site:
 
 -- craig
 
-Last modified: 18 November 2008
+Last modified: 3 February 2010
diff --git a/configure b/configure
index 35b9731..04e143b 100755
--- a/configure
+++ b/configure
@@ -19797,42 +19797,6 @@ fi
 
 done
        # for being nice in our spinlock code
-for ac_header in ucontext.h
-do :
-  ac_fn_c_check_header_mongrel "$LINENO" "ucontext.h" "ac_cv_header_ucontext_h" "$ac_includes_default"
-if test "x$ac_cv_header_ucontext_h" = x""yes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_UCONTEXT_H 1
-_ACEOF
-
-fi
-
-done
-    # for profiler.cc (cpu profiler)
-for ac_header in sys/ucontext.h
-do :
-  ac_fn_c_check_header_mongrel "$LINENO" "sys/ucontext.h" "ac_cv_header_sys_ucontext_h" "$ac_includes_default"
-if test "x$ac_cv_header_sys_ucontext_h" = x""yes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_SYS_UCONTEXT_H 1
-_ACEOF
-
-fi
-
-done
-         # ucontext on OS X 10.6 (at least)
-for ac_header in cygwin/signal.h
-do :
-  ac_fn_c_check_header_mongrel "$LINENO" "cygwin/signal.h" "ac_cv_header_cygwin_signal_h" "$ac_includes_default"
-if test "x$ac_cv_header_cygwin_signal_h" = x""yes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_CYGWIN_SIGNAL_H 1
-_ACEOF
-
-fi
-
-done
-        # ucontext on cywgin
 for ac_header in conflict-signal.h
 do :
   ac_fn_c_check_header_mongrel "$LINENO" "conflict-signal.h" "ac_cv_header_conflict_signal_h" "$ac_includes_default"
@@ -19965,6 +19929,21 @@ fi
 
 done
          # for memalign_unittest.cc
+for ac_header in valgrind.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "valgrind.h" "ac_cv_header_valgrind_h" "$ac_includes_default"
+if test "x$ac_cv_header_valgrind_h" = x""yes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_VALGRIND_H 1
+_ACEOF
+
+fi
+
+done
+    # we have a local copy if this isn't found
+# We also need <ucontext.h>/<sys/ucontext.h>, but we get those from
+# AC_PC_FROM_UCONTEXT, below.
+
 # We override a lot of memory allocation routines, not all of which are
 # standard.  For those the system doesn't declare, we'll declare ourselves.
 ac_fn_c_check_decl "$LINENO" "cfree" "ac_cv_have_decl_cfree" "#define _XOPEN_SOURCE 600
@@ -20283,7 +20262,15 @@ fi
 
 done
 
-   for ac_header in sys/ucontext.h
+   # Redhat 7 has <sys/ucontext.h>, but it barfs if we #include it directly
+   # (this was fixed in later redhats).  <ucontext.h> works fine, so use that.
+   if grep "Red Hat Linux release 7" /etc/redhat-release >/dev/null 2>&1; then
+
+$as_echo "#define HAVE_SYS_UCONTEXT_H 0" >>confdefs.h
+
+     ac_cv_header_sys_ucontext_h=no
+   else
+     for ac_header in sys/ucontext.h
 do :
   ac_fn_c_check_header_mongrel "$LINENO" "sys/ucontext.h" "ac_cv_header_sys_ucontext_h" "$ac_includes_default"
 if test "x$ac_cv_header_sys_ucontext_h" = x""yes; then :
@@ -20295,6 +20282,19 @@ fi
 
 done
        # ucontext on OS X 10.6 (at least)
+   fi
+   for ac_header in cygwin/signal.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "cygwin/signal.h" "ac_cv_header_cygwin_signal_h" "$ac_includes_default"
+if test "x$ac_cv_header_cygwin_signal_h" = x""yes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_CYGWIN_SIGNAL_H 1
+_ACEOF
+
+fi
+
+done
+        # ucontext on cygwin
    { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to access the program counter from a struct ucontext" >&5
 $as_echo_n "checking how to access the program counter from a struct ucontext... " >&6; }
    pc_fields="           uc_mcontext.gregs[REG_PC]"  # Solaris x86 (32 + 64 bit)
@@ -20304,6 +20304,7 @@ $as_echo_n "checking how to access the program counter from a struct ucontext...
    pc_fields="$pc_fields uc_mcontext.uc_regs->gregs[PT_NIP]" # Linux (ppc)
    pc_fields="$pc_fields uc_mcontext.gregs[R15]"     # Linux (arm old [untested])
    pc_fields="$pc_fields uc_mcontext.arm_pc"           # Linux (arm new [untested])
+   pc_fields="$pc_fields uc_mcontext.gp_regs[PT_NIP]"  # Suse SLES 11 (ppc64)
    pc_fields="$pc_fields uc_mcontext.mc_eip"           # FreeBSD (i386)
    pc_fields="$pc_fields uc_mcontext.mc_rip"           # FreeBSD (x86_64 [untested])
    pc_fields="$pc_fields uc_mcontext.__gregs[_REG_EIP]"  # NetBSD (i386)
@@ -20317,7 +20318,32 @@ $as_echo_n "checking how to access the program counter from a struct ucontext...
    pc_field_found=false
    for pc_field in $pc_fields; do
      if ! $pc_field_found; then
-       if test "x$ac_cv_header_sys_ucontext_h" = xyes; then
+       # Prefer sys/ucontext.h to ucontext.h, for OS X's sake.
+       if test "x$ac_cv_header_cygwin_signal_h" = xyes; then
+         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#define _GNU_SOURCE 1
+                         #include <cygwin/signal.h>
+int
+main ()
+{
+ucontext_t u; return u.$pc_field == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+cat >>confdefs.h <<_ACEOF
+#define PC_FROM_UCONTEXT $pc_field
+_ACEOF
+
+                        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pc_field" >&5
+$as_echo "$pc_field" >&6; }
+                        pc_field_found=true
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+       elif test "x$ac_cv_header_sys_ucontext_h" = xyes; then
          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #define _GNU_SOURCE 1
@@ -20341,7 +20367,7 @@ $as_echo "$pc_field" >&6; }
                         pc_field_found=true
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-       else
+       elif test "x$ac_cv_header_ucontext_h" = xyes; then
          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #define _GNU_SOURCE 1
@@ -20365,6 +20391,29 @@ $as_echo "$pc_field" >&6; }
                         pc_field_found=true
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+       else     # hope some standard header gives it to us
+         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+ucontext_t u; return u.$pc_field == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+cat >>confdefs.h <<_ACEOF
+#define PC_FROM_UCONTEXT $pc_field
+_ACEOF
+
+                        { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pc_field" >&5
+$as_echo "$pc_field" >&6; }
+                        pc_field_found=true
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
        fi
      fi
    done
@@ -21500,6 +21549,15 @@ else
 fi
 
 
+# Redhat 7 (and below?) has sys/ucontext.h, but if you try to #include
+# it directly, the compiler gets upset.  So we pretend we don't have
+# it.
+if cat /etc/redhat-release 2>/dev/null | grep "Red Hat Linux release 7" >/dev/null 2>&1; then
+
+$as_echo "#define HAVE_SYS_UCONTEXT_H 0" >>confdefs.h
+
+fi
+
 # Export the --enable flags we set above.  We do this at the end so
 # other configure rules can enable or disable targets based on what
 # they find.
diff --git a/configure.ac b/configure.ac
index 8252ff5..e93cdc4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -119,9 +119,6 @@ AC_CHECK_HEADERS(execinfo.h)    # for stacktrace? and heapchecker_unittest
 AC_CHECK_HEADERS(libunwind.h)   # for stacktrace
 AC_CHECK_HEADERS(unwind.h)      # for stacktrace
 AC_CHECK_HEADERS(sched.h)       # for being nice in our spinlock code
-AC_CHECK_HEADERS(ucontext.h)    # for profiler.cc (cpu profiler)
-AC_CHECK_HEADERS(sys/ucontext.h)         # ucontext on OS X 10.6 (at least)
-AC_CHECK_HEADERS(cygwin/signal.h)        # ucontext on cywgin
 AC_CHECK_HEADERS(conflict-signal.h)      # defined on some windows platforms?
 AC_CHECK_HEADERS(sys/prctl.h)   # for thread_lister (needed by leak-checker)
 AC_CHECK_HEADERS(linux/ptrace.h)# also needed by leak-checker
@@ -133,6 +130,10 @@ AC_CHECK_HEADERS(fcntl.h)       # for tcmalloc_unittest
 AC_CHECK_HEADERS(grp.h)         # for heapchecker_unittest
 AC_CHECK_HEADERS(pwd.h)         # for heapchecker_unittest
 AC_CHECK_HEADERS(sys/resource.h)         # for memalign_unittest.cc
+AC_CHECK_HEADERS(valgrind.h)    # we have a local copy if this isn't found
+# We also need <ucontext.h>/<sys/ucontext.h>, but we get those from
+# AC_PC_FROM_UCONTEXT, below.
+
 # We override a lot of memory allocation routines, not all of which are
 # standard.  For those the system doesn't declare, we'll declare ourselves.
 AC_CHECK_DECLS([cfree,
@@ -311,6 +312,13 @@ AH_BOTTOM([
 ])
 AM_CONDITIONAL(MINGW, expr $host : '.*-mingw' >/dev/null 2>&1)
 
+# Redhat 7 (and below?) has sys/ucontext.h, but if you try to #include
+# it directly, the compiler gets upset.  So we pretend we don't have
+# it.
+if cat /etc/redhat-release 2>/dev/null | grep "Red Hat Linux release 7" >/dev/null 2>&1; then
+AC_DEFINE(HAVE_SYS_UCONTEXT_H, 0, [<sys/ucontext.h> is broken on redhat 7])
+fi
+
 # Export the --enable flags we set above.  We do this at the end so
 # other configure rules can enable or disable targets based on what
 # they find.
diff --git a/doc/heap_checker.html b/doc/heap_checker.html
index 87e497a..caf46ef 100644
--- a/doc/heap_checker.html
+++ b/doc/heap_checker.html
@@ -351,6 +351,15 @@ checking.</p>
 </tr>
 
 <tr valign=top>
+  <td><code>HEAP_CHECK_POINTER_SOURCE_ALIGNMENT</code></td>
+  <td>Default: sizeof(void*)</td>
+  <td>
+    Alignment at which all pointers in memory are supposed to be located.
+    Use 1 if any alignment is ok.
+  </td>
+</tr>
+
+<tr valign=top>
   <td><code>PPROF_PATH</code></td>
   <td>Default: pprof</td>
 <td>
diff --git a/m4/pc_from_ucontext.m4 b/m4/pc_from_ucontext.m4
index daffddb..19ec347 100644
--- a/m4/pc_from_ucontext.m4
+++ b/m4/pc_from_ucontext.m4
@@ -11,7 +11,15 @@
 
 AC_DEFUN([AC_PC_FROM_UCONTEXT],
   [AC_CHECK_HEADERS(ucontext.h)
-   AC_CHECK_HEADERS(sys/ucontext.h)       # ucontext on OS X 10.6 (at least)
+   # Redhat 7 has <sys/ucontext.h>, but it barfs if we #include it directly
+   # (this was fixed in later redhats).  <ucontext.h> works fine, so use that.
+   if grep "Red Hat Linux release 7" /etc/redhat-release >/dev/null 2>&1; then
+     AC_DEFINE(HAVE_SYS_UCONTEXT_H, 0, [<sys/ucontext.h> is broken on redhat 7])
+     ac_cv_header_sys_ucontext_h=no
+   else
+     AC_CHECK_HEADERS(sys/ucontext.h)       # ucontext on OS X 10.6 (at least)
+   fi
+   AC_CHECK_HEADERS(cygwin/signal.h)        # ucontext on cygwin
    AC_MSG_CHECKING([how to access the program counter from a struct ucontext])
    pc_fields="           uc_mcontext.gregs[[REG_PC]]"  # Solaris x86 (32 + 64 bit)
    pc_fields="$pc_fields uc_mcontext.gregs[[REG_EIP]]" # Linux (i386)
@@ -20,6 +28,7 @@ AC_DEFUN([AC_PC_FROM_UCONTEXT],
    pc_fields="$pc_fields uc_mcontext.uc_regs->gregs[[PT_NIP]]" # Linux (ppc)
    pc_fields="$pc_fields uc_mcontext.gregs[[R15]]"     # Linux (arm old [untested])
    pc_fields="$pc_fields uc_mcontext.arm_pc"           # Linux (arm new [untested])
+   pc_fields="$pc_fields uc_mcontext.gp_regs[[PT_NIP]]"  # Suse SLES 11 (ppc64)
    pc_fields="$pc_fields uc_mcontext.mc_eip"           # FreeBSD (i386)
    pc_fields="$pc_fields uc_mcontext.mc_rip"           # FreeBSD (x86_64 [untested])
    pc_fields="$pc_fields uc_mcontext.__gregs[[_REG_EIP]]"  # NetBSD (i386)
@@ -33,7 +42,16 @@ AC_DEFUN([AC_PC_FROM_UCONTEXT],
    pc_field_found=false
    for pc_field in $pc_fields; do
      if ! $pc_field_found; then
-       if test "x$ac_cv_header_sys_ucontext_h" = xyes; then
+       # Prefer cygwin/signal.h, then sys/ucontext.h (for OS X's sake), then ucontext.h.
+       if test "x$ac_cv_header_cygwin_signal_h" = xyes; then
+         AC_TRY_COMPILE([#define _GNU_SOURCE 1
+                         #include <cygwin/signal.h>],
+                        [ucontext_t u; return u.$pc_field == 0;],
+                        AC_DEFINE_UNQUOTED(PC_FROM_UCONTEXT, $pc_field,
+                                           How to access the PC from a struct ucontext)
+                        AC_MSG_RESULT([$pc_field])
+                        pc_field_found=true)
+       elif test "x$ac_cv_header_sys_ucontext_h" = xyes; then
          AC_TRY_COMPILE([#define _GNU_SOURCE 1
                          #include <sys/ucontext.h>],
                         [ucontext_t u; return u.$pc_field == 0;],
@@ -41,7 +59,7 @@ AC_DEFUN([AC_PC_FROM_UCONTEXT],
                                            How to access the PC from a struct ucontext)
                         AC_MSG_RESULT([$pc_field])
                         pc_field_found=true)
-       else
+       elif test "x$ac_cv_header_ucontext_h" = xyes; then
          AC_TRY_COMPILE([#define _GNU_SOURCE 1
                          #include <ucontext.h>],
                         [ucontext_t u; return u.$pc_field == 0;],
@@ -49,6 +67,13 @@ AC_DEFUN([AC_PC_FROM_UCONTEXT],
                                            How to access the PC from a struct ucontext)
                         AC_MSG_RESULT([$pc_field])
                         pc_field_found=true)
+       else     # hope some standard header gives it to us
+         AC_TRY_COMPILE([],
+                        [ucontext_t u; return u.$pc_field == 0;],
+                        AC_DEFINE_UNQUOTED(PC_FROM_UCONTEXT, $pc_field,
+                                           How to access the PC from a struct ucontext)
+                        AC_MSG_RESULT([$pc_field])
+                        pc_field_found=true)
        fi
      fi
    done
diff --git a/src/base/dynamic_annotations.c b/src/base/dynamic_annotations.c
new file mode 100644
index 0000000..65c4158
--- /dev/null
+++ b/src/base/dynamic_annotations.c
@@ -0,0 +1,152 @@
+/* Copyright (c) 2008-2009, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ */
+
+#ifdef __cplusplus
+# error "This file should be built as pure C to avoid name mangling"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "base/dynamic_annotations.h"
+
+#ifdef __GNUC__
+/* valgrind.h uses gcc extensions so it won't build with other compilers */
+# ifdef HAVE_VALGRIND_H    /* prefer the user's copy if they have it */
+#  include <valgrind.h>
+# else                     /* otherwise just use the copy that we have */
+#  include "third_party/valgrind.h"
+# endif
+#endif
+
+/* Each function is empty and called (via a macro) only in debug mode.
+   The arguments are captured by dynamic tools at runtime. */
+
+#if DYNAMIC_ANNOTATIONS_ENABLED == 1
+
+void AnnotateRWLockCreate(const char *file, int line,
+                          const volatile void *lock){}
+void AnnotateRWLockDestroy(const char *file, int line,
+                           const volatile void *lock){}
+void AnnotateRWLockAcquired(const char *file, int line,
+                            const volatile void *lock, long is_w){}
+void AnnotateRWLockReleased(const char *file, int line,
+                            const volatile void *lock, long is_w){}
+void AnnotateBarrierInit(const char *file, int line,
+                         const volatile void *barrier, long count,
+                         long reinitialization_allowed) {}
+void AnnotateBarrierWaitBefore(const char *file, int line,
+                               const volatile void *barrier) {}
+void AnnotateBarrierWaitAfter(const char *file, int line,
+                              const volatile void *barrier) {}
+void AnnotateBarrierDestroy(const char *file, int line,
+                            const volatile void *barrier) {}
+
+void AnnotateCondVarWait(const char *file, int line,
+                         const volatile void *cv,
+                         const volatile void *lock){}
+void AnnotateCondVarSignal(const char *file, int line,
+                           const volatile void *cv){}
+void AnnotateCondVarSignalAll(const char *file, int line,
+                              const volatile void *cv){}
+void AnnotatePublishMemoryRange(const char *file, int line,
+                                const volatile void *address,
+                                long size){}
+void AnnotateUnpublishMemoryRange(const char *file, int line,
+                                  const volatile void *address,
+                                  long size){}
+void AnnotatePCQCreate(const char *file, int line,
+                       const volatile void *pcq){}
+void AnnotatePCQDestroy(const char *file, int line,
+                        const volatile void *pcq){}
+void AnnotatePCQPut(const char *file, int line,
+                    const volatile void *pcq){}
+void AnnotatePCQGet(const char *file, int line,
+                    const volatile void *pcq){}
+void AnnotateNewMemory(const char *file, int line,
+                       const volatile void *mem,
+                       long size){}
+void AnnotateExpectRace(const char *file, int line,
+                        const volatile void *mem,
+                        const char *description){}
+void AnnotateBenignRace(const char *file, int line,
+                        const volatile void *mem,
+                        const char *description){}
+void AnnotateBenignRaceSized(const char *file, int line,
+                             const volatile void *mem,
+                             long size,
+                             const char *description) {
+  long i;  /* byte offset into [mem, mem + size) */
+  for (i = 0; i < size; i++) {  /* one AnnotateBenignRace call per byte */
+    AnnotateBenignRace(file, line, (char*)(mem) + i, description);
+  }
+}
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+                                  const volatile void *mu){}
+void AnnotateTraceMemory(const char *file, int line,
+                         const volatile void *arg){}
+void AnnotateThreadName(const char *file, int line,
+                        const char *name){}
+void AnnotateIgnoreReadsBegin(const char *file, int line){}
+void AnnotateIgnoreReadsEnd(const char *file, int line){}
+void AnnotateIgnoreWritesBegin(const char *file, int line){}
+void AnnotateIgnoreWritesEnd(const char *file, int line){}
+void AnnotateNoOp(const char *file, int line,
+                  const volatile void *arg){}
+void AnnotateFlushState(const char *file, int line){}
+
+#endif  /* DYNAMIC_ANNOTATIONS_ENABLED == 1 */
+
+static int GetRunningOnValgrind(void) {
+#ifdef RUNNING_ON_VALGRIND
+  if (RUNNING_ON_VALGRIND) return 1;  /* checked only if the macro exists */
+#endif
+  /* TODO(csilvers): use GetenvBeforeMain() instead?  Will need to */
+  /*                 change it to be extern "C". */
+  char *running_on_valgrind_str = getenv("RUNNING_ON_VALGRIND");
+  if (running_on_valgrind_str) {  /* env var is set: only "0" means no */
+    return strcmp(running_on_valgrind_str, "0") != 0;
+  }
+  return 0;  /* macro (if any) was false and the env var is unset */
+}
+
+/* See the comments in dynamic_annotations.h */
+int RunningOnValgrind(void) {
+  static volatile int running_on_valgrind = -1;  /* -1: not yet computed */
+  /* C doesn't have thread-safe initialization of statics, and we
+     don't want to depend on pthread_once here, so hack it. */
+  int local_running_on_valgrind = running_on_valgrind;  /* single read */
+  if (local_running_on_valgrind == -1)  /* cache is still unset */
+    running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind();
+  return local_running_on_valgrind;
+}
diff --git a/src/base/dynamic_annotations.cc b/src/base/dynamic_annotations.cc
deleted file mode 100644
index c8bbcd7..0000000
--- a/src/base/dynamic_annotations.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2008, Google Inc.
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * 
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *     * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---
- * Author: Kostya Serebryany
- */
-
-#include <config.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "base/dynamic_annotations.h"
-#include "base/sysinfo.h"
-
-// Each function is empty and called (via a macro) only in debug mode.
-// The arguments are captured by dynamic tools at runtime.
-
-extern "C" void AnnotateRWLockCreate(const char *file, int line,
-                                     const volatile void *lock){}
-extern "C" void AnnotateRWLockDestroy(const char *file, int line,
-                                      const volatile void *lock){}
-extern "C" void AnnotateRWLockAcquired(const char *file, int line,
-                                       const volatile void *lock, long is_w){}
-extern "C" void AnnotateRWLockReleased(const char *file, int line,
-                                       const volatile void *lock, long is_w){}
-extern "C" void AnnotateCondVarWait(const char *file, int line,
-                                    const volatile void *cv,
-                                    const volatile void *lock){}
-extern "C" void AnnotateCondVarSignal(const char *file, int line,
-                                      const volatile void *cv){}
-extern "C" void AnnotateCondVarSignalAll(const char *file, int line,
-                                         const volatile void *cv){}
-extern "C" void AnnotatePublishMemoryRange(const char *file, int line,
-                                           const volatile void *address,
-                                           long size){}
-extern "C" void AnnotateUnpublishMemoryRange(const char *file, int line,
-                                           const volatile void *address,
-                                           long size){}
-extern "C" void AnnotatePCQCreate(const char *file, int line,
-                                  const volatile void *pcq){}
-extern "C" void AnnotatePCQDestroy(const char *file, int line,
-                                   const volatile void *pcq){}
-extern "C" void AnnotatePCQPut(const char *file, int line,
-                               const volatile void *pcq){}
-extern "C" void AnnotatePCQGet(const char *file, int line,
-                               const volatile void *pcq){}
-extern "C" void AnnotateNewMemory(const char *file, int line,
-                                  const volatile void *mem,
-                                  long size){}
-extern "C" void AnnotateExpectRace(const char *file, int line,
-                                   const volatile void *mem,
-                                   const char *description){}
-extern "C" void AnnotateBenignRace(const char *file, int line,
-                                   const volatile void *mem,
-                                   const char *description){}
-extern "C" void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
-                                            const volatile void *mu){}
-extern "C" void AnnotateTraceMemory(const char *file, int line,
-                                    const volatile void *arg){}
-extern "C" void AnnotateThreadName(const char *file, int line,
-                                   const char *name){}
-extern "C" void AnnotateIgnoreReadsBegin(const char *file, int line){}
-extern "C" void AnnotateIgnoreReadsEnd(const char *file, int line){}
-extern "C" void AnnotateIgnoreWritesBegin(const char *file, int line){}
-extern "C" void AnnotateIgnoreWritesEnd(const char *file, int line){}
-extern "C" void AnnotateNoOp(const char *file, int line,
-                             const volatile void *arg){}
-
-static int GetRunningOnValgrind() {
-  const char *running_on_valgrind_str = GetenvBeforeMain("RUNNING_ON_VALGRIND");
-  if (running_on_valgrind_str) {
-    return strcmp(running_on_valgrind_str, "0") != 0;
-  }
-  return 0;
-}
-
-// When running under valgrind, this function will be intercepted
-// and a non-zero value will be returned.
-// Some valgrind-based tools (e.g. callgrind) do not intercept functions,
-// so we also read environment variable.
-extern "C" int RunningOnValgrind() {
-  static int running_on_valgrind = GetRunningOnValgrind();
-  return running_on_valgrind;
-}
diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h
index a2a268f..3980b24 100644
--- a/src/base/dynamic_annotations.h
+++ b/src/base/dynamic_annotations.h
@@ -1,10 +1,10 @@
 /* Copyright (c) 2008, Google Inc.
  * All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
- * 
+ *
  *     * Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above
@@ -14,7 +14,7 @@
  *     * Neither the name of Google Inc. nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -31,445 +31,463 @@
  * Author: Kostya Serebryany
  */
 
-// This file defines dynamic annotations for use with dynamic analysis
-// tool such as valgrind, PIN, etc.
-//
-// Dynamic annotation is a source code annotation that affects
-// the generated code (that is, the annotation is not a comment).
-// Each such annotation is attached to a particular
-// instruction and/or to a particular object (address) in the program.
-//
-// The annotations that should be used by users are macros in all upper-case
-// (e.g., ANNOTATE_NEW_MEMORY).
-//
-// Actual implementation of these macros may differ depending on the
-// dynamic analysis tool being used.
-//
-// This file supports the following dynamic analysis tools:
-// - None (NDEBUG is defined).
-//    Macros are defined empty.
-// - Helgrind (NDEBUG is not defined).
-//    Macros are defined as calls to non-inlinable empty functions
-//    that are intercepted by helgrind.
-//
+/* This file defines dynamic annotations for use with dynamic analysis
+   tools such as valgrind, PIN, etc.
+
+   Dynamic annotation is a source code annotation that affects
+   the generated code (that is, the annotation is not a comment).
+   Each such annotation is attached to a particular
+   instruction and/or to a particular object (address) in the program.
+
+   The annotations that should be used by users are macros in all upper-case
+   (e.g., ANNOTATE_NEW_MEMORY).
+
+   Actual implementation of these macros may differ depending on the
+   dynamic analysis tool being used.
+
+   See http://code.google.com/p/data-race-test/  for more information.
+
+   This file supports the following dynamic analysis tools:
+   - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero).
+      Macros are defined empty.
+   - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1).
+      Macros are defined as calls to non-inlinable empty functions
+      that are intercepted by Valgrind. */
+
 #ifndef BASE_DYNAMIC_ANNOTATIONS_H_
 #define BASE_DYNAMIC_ANNOTATIONS_H_
 
-#include "base/thread_annotations.h"
-
-// All the annotation macros are in effect only in debug mode.
-#ifndef NDEBUG
-
-  // -------------------------------------------------------------
-  // Annotations useful when implementing condition variables such as CondVar,
-  // using conditional critical sections (Await/LockWhen) and when constructing
-  // user-defined synchronization mechanisms.
-  //
-  // The annotations ANNOTATE_HAPPENS_BEFORE() and ANNOTATE_HAPPENS_AFTER() can
-  // be used to define happens-before arcs in user-defined synchronization
-  // mechanisms:  the race detector will infer an arc from the former to the
-  // latter when they share the same argument pointer.
-  //
-  // Example 1 (reference counting):
-  //
-  // void Unref() {
-  //   ANNOTATE_HAPPENS_BEFORE(&refcount_);
-  //   if (AtomicDecrementByOne(&refcount_) == 0) {
-  //     ANNOTATE_HAPPENS_AFTER(&refcount_);
-  //     delete this;
-  //   }
-  // }
-  //
-  // Example 2 (message queue):
-  //
-  // void MyQueue::Put(Type *e) {
-  //   MutexLock lock(&mu_);
-  //   ANNOTATE_HAPPENS_BEFORE(e);
-  //   PutElementIntoMyQueue(e);
-  // }
-  //
-  // Type *MyQueue::Get() {
-  //   MutexLock lock(&mu_);
-  //   Type *e = GetElementFromMyQueue();
-  //   ANNOTATE_HAPPENS_AFTER(e);
-  //   return e;
-  // }
-  //
-  // Note: when possible, please use the existing reference counting and message
-  // queue implementations instead of inventing new ones.
-
-  // Report that wait on the condition variable at address "cv" has succeeded
-  // and the lock at address "lock" is held.
+#ifndef DYNAMIC_ANNOTATIONS_ENABLED
+# define DYNAMIC_ANNOTATIONS_ENABLED 0
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing condition variables such as CondVar,
+     using conditional critical sections (Await/LockWhen) and when constructing
+     user-defined synchronization mechanisms.
+
+     The annotations ANNOTATE_HAPPENS_BEFORE() and ANNOTATE_HAPPENS_AFTER() can
+     be used to define happens-before arcs in user-defined synchronization
+     mechanisms:  the race detector will infer an arc from the former to the
+     latter when they share the same argument pointer.
+
+     Example 1 (reference counting):
+
+     void Unref() {
+       ANNOTATE_HAPPENS_BEFORE(&refcount_);
+       if (AtomicDecrementByOne(&refcount_) == 0) {
+         ANNOTATE_HAPPENS_AFTER(&refcount_);
+         delete this;
+       }
+     }
+
+     Example 2 (message queue):
+
+     void MyQueue::Put(Type *e) {
+       MutexLock lock(&mu_);
+       ANNOTATE_HAPPENS_BEFORE(e);
+       PutElementIntoMyQueue(e);
+     }
+
+     Type *MyQueue::Get() {
+       MutexLock lock(&mu_);
+       Type *e = GetElementFromMyQueue();
+       ANNOTATE_HAPPENS_AFTER(e);
+       return e;
+     }
+
+     Note: when possible, please use the existing reference counting and message
+     queue implementations instead of inventing new ones. */
+
+  /* Report that wait on the condition variable at address "cv" has succeeded
+     and the lock at address "lock" is held. */
   #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \
     AnnotateCondVarWait(__FILE__, __LINE__, cv, lock)
 
-  // Report that wait on the condition variable at "cv" has succeeded.  Variant
-  // w/o lock.
+  /* Report that wait on the condition variable at "cv" has succeeded.  Variant
+     w/o lock. */
   #define ANNOTATE_CONDVAR_WAIT(cv) \
     AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL)
 
-  // Report that we are about to signal on the condition variable at address
-  // "cv".
+  /* Report that we are about to signal on the condition variable at address
+     "cv". */
   #define ANNOTATE_CONDVAR_SIGNAL(cv) \
     AnnotateCondVarSignal(__FILE__, __LINE__, cv)
 
-  // Report that we are about to signal_all on the condition variable at "cv".
+  /* Report that we are about to signal_all on the condition variable at "cv". */
   #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \
     AnnotateCondVarSignalAll(__FILE__, __LINE__, cv)
 
-  // Annotations for user-defined synchronization mechanisms.
+  /* Annotations for user-defined synchronization mechanisms. */
   #define ANNOTATE_HAPPENS_BEFORE(obj) ANNOTATE_CONDVAR_SIGNAL(obj)
   #define ANNOTATE_HAPPENS_AFTER(obj)  ANNOTATE_CONDVAR_WAIT(obj)
 
-  // Report that the bytes in the range [pointer, pointer+size) are about
-  // to be published safely. The race checker will create a happens-before
-  // arc from the call ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
-  // subsequent accesses to this memory.
+  /* Report that the bytes in the range [pointer, pointer+size) are about
+     to be published safely. The race checker will create a happens-before
+     arc from the call ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
+     subsequent accesses to this memory.
+     Note: this annotation may not work properly if the race detector uses
+     sampling, i.e. does not observe all memory accesses.
+     */
   #define ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \
     AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size)
 
-  // Report that the bytes in the range [pointer, pointer+size) are not shared
-  // between threads any more and can be safely used by the current thread w/o
-  // synchronization. The race checker will create a happens-before arc from
-  // all previous accesses to this memory to this call.
-  //
-  // This annotation could be applied to complex objects, such as STL
-  // containers, with one condition: the accesses to the object itself
-  // and its internal data should not be separated with any synchronization.
-  //
-  // Example that works:
-  //
-  // map<int, int> the_map;
-  // void Thread1() {
-  //   MutexLock lock(&mu);
-  //   // Ok: accesses to the_map and its internal data is not separated by
-  //   // synchronization.
-  //   the_map[1]++;
-  // }
-  // void Thread2() {
-  //   {
-  //     MutexLock lock(&mu);
-  //     ...
-  //     // because of some reason we know that the_map will not be used by
-  //     // other threads any more
-  //     ANNOTATE_UNPUBLISH_MEMORY_RANGE(&the_map, sizeof(the_map));
-  //   }
-  //   the_map->DoSomething();
-  // }
-  //
-  // Example that does not work (due to the way happens-before arcs are
-  // represented in some race detectors):
-  //
-  // void Thread1() {
-  //   MutexLock lock(&mu);
-  //   int *guts_of_the_map = &(*the_map)[1];
-  //   // we have some synchronization between access to 'c' and its guts.
-  //   // This will make ANNOTATE_UNPUBLISH_MEMORY_RANGE in Thread2  useless.
-  //   some_other_lock_or_other_synchronization_utility.Lock();
-  //   (*guts_of_the_map)++;
-  //    ...
-  // }
-  //
-  // void Thread1() { // same as above...
+  /* DEPRECATED. Don't use it. */
   #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size) \
     AnnotateUnpublishMemoryRange(__FILE__, __LINE__, pointer, size)
 
-  // This annotation should be used to annotate thread-safe swapping of
-  // containers. Required only when using hybrid (i.e. not pure happens-before)
-  // detectors.
-  //
-  // This annotation has the same limitation as ANNOTATE_UNPUBLISH_MEMORY_RANGE
-  // (see above).
-  //
-  // Example:
-  // map<int, int> the_map;
-  // void Thread1() {
-  //   MutexLock lock(&mu);
-  //   the_map[1]++;
-  // }
-  // void Thread2() {
-  //   map<int,int> tmp;
-  //   {
-  //     MutexLock lock(&mu);
-  //     the_map.swap(tmp);
-  //     ANNOTATE_SWAP_MEMORY_RANGE(&the_map, sizeof(the_map));
-  //   }
-  //   tmp->DoSomething();
-  // }
+  /* DEPRECATED. Don't use it. */
   #define ANNOTATE_SWAP_MEMORY_RANGE(pointer, size)   \
     do {                                              \
       ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size); \
       ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size);   \
     } while (0)
 
-  // Instruct the tool to create a happens-before arc between mu->Unlock() and
-  // mu->Lock(). This annotation may slow down the race detector and hide real
-  // races. Normally it is used only when it would be difficult to annotate each
-  // of the mutex's critical sections individually using the annotations above.
-  // This annotation makes sense only for hybrid race detectors. For pure
-  // happens-before detectors this is a no-op. For more details see
-  // http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid .
+  /* Instruct the tool to create a happens-before arc between mu->Unlock() and
+     mu->Lock(). This annotation may slow down the race detector and hide real
+     races. Normally it is used only when it would be difficult to annotate each
+     of the mutex's critical sections individually using the annotations above.
+     This annotation makes sense only for hybrid race detectors. For pure
+     happens-before detectors this is a no-op. For more details see
+     http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . */
   #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \
     AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
 
-  // Deprecated. Use ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX.
+  /* Deprecated. Use ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX. */
   #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) \
     AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
 
-  // -------------------------------------------------------------
-  // Annotations useful when defining memory allocators, or when memory that
-  // was protected in one way starts to be protected in another.
+  /* -------------------------------------------------------------
+     Annotations useful when defining memory allocators, or when memory that
+     was protected in one way starts to be protected in another. */
 
-  // Report that a new memory at "address" of size "size" has been allocated.
-  // This might be used when the memory has been retrieved from a free list and
-  // is about to be reused, or when a the locking discipline for a variable
-  // changes.
+  /* Report that a new memory at "address" of size "size" has been allocated.
+     This might be used when the memory has been retrieved from a free list and
+     is about to be reused, or when the locking discipline for a variable
+     changes. */
   #define ANNOTATE_NEW_MEMORY(address, size) \
     AnnotateNewMemory(__FILE__, __LINE__, address, size)
 
-  // -------------------------------------------------------------
-  // Annotations useful when defining FIFO queues that transfer data between
-  // threads.
+  /* -------------------------------------------------------------
+     Annotations useful when defining FIFO queues that transfer data between
+     threads. */
 
-  // Report that the producer-consumer queue (such as ProducerConsumerQueue) at
-  // address "pcq" has been created.  The ANNOTATE_PCQ_* annotations
-  // should be used only for FIFO queues.  For non-FIFO queues use
-  // ANNOTATE_HAPPENS_BEFORE (for put) and ANNOTATE_HAPPENS_AFTER (for get).
+  /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at
+     address "pcq" has been created.  The ANNOTATE_PCQ_* annotations
+     should be used only for FIFO queues.  For non-FIFO queues use
+     ANNOTATE_HAPPENS_BEFORE (for put) and ANNOTATE_HAPPENS_AFTER (for get). */
   #define ANNOTATE_PCQ_CREATE(pcq) \
     AnnotatePCQCreate(__FILE__, __LINE__, pcq)
 
-  // Report that the queue at address "pcq" is about to be destroyed.
+  /* Report that the queue at address "pcq" is about to be destroyed. */
   #define ANNOTATE_PCQ_DESTROY(pcq) \
     AnnotatePCQDestroy(__FILE__, __LINE__, pcq)
 
-  // Report that we are about to put an element into a FIFO queue at address
-  // "pcq".
+  /* Report that we are about to put an element into a FIFO queue at address
+     "pcq". */
   #define ANNOTATE_PCQ_PUT(pcq) \
     AnnotatePCQPut(__FILE__, __LINE__, pcq)
 
-  // Report that we've just got an element from a FIFO queue at address "pcq".
+  /* Report that we've just got an element from a FIFO queue at address "pcq". */
   #define ANNOTATE_PCQ_GET(pcq) \
     AnnotatePCQGet(__FILE__, __LINE__, pcq)
 
-  // -------------------------------------------------------------
-  // Annotations that suppress errors.  It is usually better to express the
-  // program's synchronization using the other annotations, but these can
-  // be used when all else fails.
-
-  // Report that we may have a benign race on at "address".
-  // Insert at the point where "address" has been allocated, preferably close
-  // to the point where the race happens.
-  // See also ANNOTATE_BENIGN_RACE_STATIC.
-  #define ANNOTATE_BENIGN_RACE(address, description) \
-    AnnotateBenignRace(__FILE__, __LINE__, address, description)
-
-  // Request the analysis tool to ignore all reads in the current thread
-  // until ANNOTATE_IGNORE_READS_END is called.
-  // Useful to ignore intentional racey reads, while still checking
-  // other reads and all writes.
-  // See also ANNOTATE_UNPROTECTED_READ.
+  /* -------------------------------------------------------------
+     Annotations that suppress errors.  It is usually better to express the
+     program's synchronization using the other annotations, but these can
+     be used when all else fails. */
+
+  /* Report that we may have a benign race at "pointer", with size
+     "sizeof(*(pointer))". "pointer" must be a non-void* pointer.  Insert at the
+     point where "pointer" has been allocated, preferably close to the point
+     where the race happens.  See also ANNOTATE_BENIGN_RACE_STATIC. */
+  #define ANNOTATE_BENIGN_RACE(pointer, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \
+                            sizeof(*(pointer)), description)
+
+  /* Same as ANNOTATE_BENIGN_RACE(address, description), but applies to
+     the memory range [address, address+size). */
+  #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description)
+
+  /* Request the analysis tool to ignore all reads in the current thread
+     until ANNOTATE_IGNORE_READS_END is called.
+     Useful to ignore intentional racey reads, while still checking
+     other reads and all writes.
+     See also ANNOTATE_UNPROTECTED_READ. */
   #define ANNOTATE_IGNORE_READS_BEGIN() \
     AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
 
-  // Stop ignoring reads.
+  /* Stop ignoring reads. */
   #define ANNOTATE_IGNORE_READS_END() \
     AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
 
-  // Similar to ANNOTATE_IGNORE_READS_BEGIN, but ignore writes.
+  /* Similar to ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */
   #define ANNOTATE_IGNORE_WRITES_BEGIN() \
     AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
 
-  // Stop ignoring writes.
+  /* Stop ignoring writes. */
   #define ANNOTATE_IGNORE_WRITES_END() \
     AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
 
-  // Start ignoring all memory accesses (reads and writes).
+  /* Start ignoring all memory accesses (reads and writes). */
   #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \
     do {\
       ANNOTATE_IGNORE_READS_BEGIN();\
       ANNOTATE_IGNORE_WRITES_BEGIN();\
     }while(0)\
 
-  // Stop ignoring all memory accesses.
+  /* Stop ignoring all memory accesses. */
   #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \
     do {\
       ANNOTATE_IGNORE_WRITES_END();\
       ANNOTATE_IGNORE_READS_END();\
     }while(0)\
 
-  // -------------------------------------------------------------
-  // Annotations useful for debugging.
+  /* -------------------------------------------------------------
+     Annotations useful for debugging. */
 
-  // Request to trace every access to "address".
+  /* Request to trace every access to "address". */
   #define ANNOTATE_TRACE_MEMORY(address) \
     AnnotateTraceMemory(__FILE__, __LINE__, address)
 
-  // Report the current thread name to a race detector.
+  /* Report the current thread name to a race detector. */
   #define ANNOTATE_THREAD_NAME(name) \
     AnnotateThreadName(__FILE__, __LINE__, name)
 
-  // -------------------------------------------------------------
-  // Annotations useful when implementing locks.  They are not
-  // normally needed by modules that merely use locks.
-  // The "lock" argument is a pointer to the lock object.
+  /* -------------------------------------------------------------
+     Annotations useful when implementing locks.  They are not
+     normally needed by modules that merely use locks.
+     The "lock" argument is a pointer to the lock object. */
 
-  // Report that a lock has been created at address "lock".
+  /* Report that a lock has been created at address "lock". */
   #define ANNOTATE_RWLOCK_CREATE(lock) \
     AnnotateRWLockCreate(__FILE__, __LINE__, lock)
 
-  // Report that the lock at address "lock" is about to be destroyed.
+  /* Report that the lock at address "lock" is about to be destroyed. */
   #define ANNOTATE_RWLOCK_DESTROY(lock) \
     AnnotateRWLockDestroy(__FILE__, __LINE__, lock)
 
-  // Report that the lock at address "lock" has been acquired.
-  // is_w=1 for writer lock, is_w=0 for reader lock.
+  /* Report that the lock at address "lock" has been acquired.
+     is_w=1 for writer lock, is_w=0 for reader lock. */
   #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \
     AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w)
 
-  // Report that the lock at address "lock" is about to be released.
+  /* Report that the lock at address "lock" is about to be released. */
   #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) \
     AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w)
 
-  // -------------------------------------------------------------
-  // Annotations useful for testing race detectors.
+  /* -------------------------------------------------------------
+     Annotations useful when implementing barriers.  They are not
+     normally needed by modules that merely use barriers.
+     The "barrier" argument is a pointer to the barrier object. */
+
+  /* Report that the "barrier" has been initialized with initial "count".
+   If 'reinitialization_allowed' is true, initialization is allowed to happen
+   multiple times without calling barrier_destroy(). */
+  #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \
+    AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \
+                        reinitialization_allowed)
+
+  /* Report that we are about to enter barrier_wait("barrier"). */
+  #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \
+    AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier)
+
+  /* Report that we just exited barrier_wait("barrier"). */
+  #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) \
+    AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier)
+
+  /* Report that the "barrier" has been destroyed. */
+  #define ANNOTATE_BARRIER_DESTROY(barrier) \
+    AnnotateBarrierDestroy(__FILE__, __LINE__, barrier)
+
+  /* -------------------------------------------------------------
+     Annotations useful for testing race detectors. */
 
-  // Report that we expect a race on the variable at "address".
-  // Use only in unit tests for a race detector.
+  /* Report that we expect a race on the variable at "address".
+     Use only in unit tests for a race detector. */
   #define ANNOTATE_EXPECT_RACE(address, description) \
     AnnotateExpectRace(__FILE__, __LINE__, address, description)
 
-  // A no-op. Insert where you like to test the interceptors.
+  /* A no-op. Insert where you like to test the interceptors. */
   #define ANNOTATE_NO_OP(arg) \
     AnnotateNoOp(__FILE__, __LINE__, arg)
 
-#else  // NDEBUG is defined
-
-  #define ANNOTATE_RWLOCK_CREATE(lock) // empty
-  #define ANNOTATE_RWLOCK_DESTROY(lock) // empty
-  #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) // empty
-  #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) // empty
-  #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) // empty
-  #define ANNOTATE_CONDVAR_WAIT(cv) // empty
-  #define ANNOTATE_CONDVAR_SIGNAL(cv) // empty
-  #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) // empty
-  #define ANNOTATE_HAPPENS_BEFORE(obj) // empty
-  #define ANNOTATE_HAPPENS_AFTER(obj) // empty
-  #define ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) // empty
-  #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size)  // empty
-  #define ANNOTATE_SWAP_MEMORY_RANGE(address, size)  // empty
-  #define ANNOTATE_PCQ_CREATE(pcq) // empty
-  #define ANNOTATE_PCQ_DESTROY(pcq) // empty
-  #define ANNOTATE_PCQ_PUT(pcq) // empty
-  #define ANNOTATE_PCQ_GET(pcq) // empty
-  #define ANNOTATE_NEW_MEMORY(address, size) // empty
-  #define ANNOTATE_EXPECT_RACE(address, description) // empty
-  #define ANNOTATE_BENIGN_RACE(address, description) // empty
-  #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) // empty
-  #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) // empty
-  #define ANNOTATE_TRACE_MEMORY(arg) // empty
-  #define ANNOTATE_THREAD_NAME(name) // empty
-  #define ANNOTATE_IGNORE_READS_BEGIN() // empty
-  #define ANNOTATE_IGNORE_READS_END() // empty
-  #define ANNOTATE_IGNORE_WRITES_BEGIN() // empty
-  #define ANNOTATE_IGNORE_WRITES_END() // empty
-  #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() // empty
-  #define ANNOTATE_IGNORE_READS_AND_WRITES_END() // empty
-  #define ANNOTATE_NO_OP(arg) // empty
-
-#endif  // NDEBUG
-
-// Use the macros above rather than using these functions directly.
-extern "C" void AnnotateRWLockCreate(const char *file, int line,
-                                     const volatile void *lock);
-extern "C" void AnnotateRWLockDestroy(const char *file, int line,
-                                      const volatile void *lock);
-extern "C" void AnnotateRWLockAcquired(const char *file, int line,
-                                       const volatile void *lock, long is_w);
-extern "C" void AnnotateRWLockReleased(const char *file, int line,
-                                       const volatile void *lock, long is_w);
-extern "C" void AnnotateCondVarWait(const char *file, int line,
-                                    const volatile void *cv,
-                                    const volatile void *lock);
-extern "C" void AnnotateCondVarSignal(const char *file, int line,
-                                      const volatile void *cv);
-extern "C" void AnnotateCondVarSignalAll(const char *file, int line,
-                                         const volatile void *cv);
-extern "C" void AnnotatePublishMemoryRange(const char *file, int line,
-                                           const volatile void *address,
-                                           long size);
-extern "C" void AnnotateUnpublishMemoryRange(const char *file, int line,
-                                           const volatile void *address,
-                                           long size);
-extern "C" void AnnotatePCQCreate(const char *file, int line,
-                                  const volatile void *pcq);
-extern "C" void AnnotatePCQDestroy(const char *file, int line,
-                                   const volatile void *pcq);
-extern "C" void AnnotatePCQPut(const char *file, int line,
-                               const volatile void *pcq);
-extern "C" void AnnotatePCQGet(const char *file, int line,
-                               const volatile void *pcq);
-extern "C" void AnnotateNewMemory(const char *file, int line,
+  /* Force the race detector to flush its state. The actual effect depends on
+   * the implementation of the detector. */
+  #define ANNOTATE_FLUSH_STATE() \
+    AnnotateFlushState(__FILE__, __LINE__)
+
+
+#else  /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+  #define ANNOTATE_RWLOCK_CREATE(lock) /* empty */
+  #define ANNOTATE_RWLOCK_DESTROY(lock) /* empty */
+  #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */
+  #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */
+  #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* */
+  #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */
+  #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */
+  #define ANNOTATE_BARRIER_DESTROY(barrier) /* empty */
+  #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */
+  #define ANNOTATE_CONDVAR_WAIT(cv) /* empty */
+  #define ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */
+  #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */
+  #define ANNOTATE_HAPPENS_BEFORE(obj) /* empty */
+  #define ANNOTATE_HAPPENS_AFTER(obj) /* empty */
+  #define ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */
+  #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size)  /* empty */
+  #define ANNOTATE_SWAP_MEMORY_RANGE(address, size)  /* empty */
+  #define ANNOTATE_PCQ_CREATE(pcq) /* empty */
+  #define ANNOTATE_PCQ_DESTROY(pcq) /* empty */
+  #define ANNOTATE_PCQ_PUT(pcq) /* empty */
+  #define ANNOTATE_PCQ_GET(pcq) /* empty */
+  #define ANNOTATE_NEW_MEMORY(address, size) /* empty */
+  #define ANNOTATE_EXPECT_RACE(address, description) /* empty */
+  #define ANNOTATE_BENIGN_RACE(address, description) /* empty */
+  #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */
+  #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */
+  #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty */
+  #define ANNOTATE_TRACE_MEMORY(arg) /* empty */
+  #define ANNOTATE_THREAD_NAME(name) /* empty */
+  #define ANNOTATE_IGNORE_READS_BEGIN() /* empty */
+  #define ANNOTATE_IGNORE_READS_END() /* empty */
+  #define ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */
+  #define ANNOTATE_IGNORE_WRITES_END() /* empty */
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */
+  #define ANNOTATE_NO_OP(arg) /* empty */
+  #define ANNOTATE_FLUSH_STATE() /* empty */
+
+#endif  /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+/* Use the macros above rather than using these functions directly. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+void AnnotateRWLockCreate(const char *file, int line,
+                          const volatile void *lock);
+void AnnotateRWLockDestroy(const char *file, int line,
+                           const volatile void *lock);
+void AnnotateRWLockAcquired(const char *file, int line,
+                            const volatile void *lock, long is_w);
+void AnnotateRWLockReleased(const char *file, int line,
+                            const volatile void *lock, long is_w);
+void AnnotateBarrierInit(const char *file, int line,
+                         const volatile void *barrier, long count,
+                         long reinitialization_allowed);
+void AnnotateBarrierWaitBefore(const char *file, int line,
+                               const volatile void *barrier);
+void AnnotateBarrierWaitAfter(const char *file, int line,
+                              const volatile void *barrier);
+void AnnotateBarrierDestroy(const char *file, int line,
+                            const volatile void *barrier);
+void AnnotateCondVarWait(const char *file, int line,
+                         const volatile void *cv,
+                         const volatile void *lock);
+void AnnotateCondVarSignal(const char *file, int line,
+                           const volatile void *cv);
+void AnnotateCondVarSignalAll(const char *file, int line,
+                              const volatile void *cv);
+void AnnotatePublishMemoryRange(const char *file, int line,
+                                const volatile void *address,
+                                long size);
+void AnnotateUnpublishMemoryRange(const char *file, int line,
                                   const volatile void *address,
                                   long size);
-extern "C" void AnnotateExpectRace(const char *file, int line,
-                                   const volatile void *address,
-                                   const char *description);
-extern "C" void AnnotateBenignRace(const char *file, int line,
-                                   const volatile void *address,
-                                   const char *description);
-extern "C" void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
-                                            const volatile void *mu);
-extern "C" void AnnotateTraceMemory(const char *file, int line,
-                                    const volatile void *arg);
-extern "C" void AnnotateThreadName(const char *file, int line,
-                                   const char *name);
-extern "C" void AnnotateIgnoreReadsBegin(const char *file, int line);
-extern "C" void AnnotateIgnoreReadsEnd(const char *file, int line);
-extern "C" void AnnotateIgnoreWritesBegin(const char *file, int line);
-extern "C" void AnnotateIgnoreWritesEnd(const char *file, int line);
-extern "C" void AnnotateNoOp(const char *file, int line,
-                             const volatile void *arg);
-
-#ifndef NDEBUG
-
-  // ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads.
-  //
-  // Instead of doing
-  //    ANNOTATE_IGNORE_READS_BEGIN();
-  //    ... = x;
-  //    ANNOTATE_IGNORE_READS_END();
-  // one can use
-  //    ... = ANNOTATE_UNPROTECTED_READ(x);
+void AnnotatePCQCreate(const char *file, int line,
+                       const volatile void *pcq);
+void AnnotatePCQDestroy(const char *file, int line,
+                        const volatile void *pcq);
+void AnnotatePCQPut(const char *file, int line,
+                    const volatile void *pcq);
+void AnnotatePCQGet(const char *file, int line,
+                    const volatile void *pcq);
+void AnnotateNewMemory(const char *file, int line,
+                       const volatile void *address,
+                       long size);
+void AnnotateExpectRace(const char *file, int line,
+                        const volatile void *address,
+                        const char *description);
+void AnnotateBenignRace(const char *file, int line,
+                        const volatile void *address,
+                        const char *description);
+void AnnotateBenignRaceSized(const char *file, int line,
+                        const volatile void *address,
+                        long size,
+                        const char *description);
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+                                  const volatile void *mu);
+void AnnotateTraceMemory(const char *file, int line,
+                         const volatile void *arg);
+void AnnotateThreadName(const char *file, int line,
+                        const char *name);
+void AnnotateIgnoreReadsBegin(const char *file, int line);
+void AnnotateIgnoreReadsEnd(const char *file, int line);
+void AnnotateIgnoreWritesBegin(const char *file, int line);
+void AnnotateIgnoreWritesEnd(const char *file, int line);
+void AnnotateNoOp(const char *file, int line,
+                  const volatile void *arg);
+void AnnotateFlushState(const char *file, int line);
+
+/* Return non-zero value if running under valgrind.
+
+  If "valgrind.h" is included into dynamic_annotations.c,
+  the regular valgrind mechanism will be used.
+  See http://valgrind.org/docs/manual/manual-core-adv.html about
+  RUNNING_ON_VALGRIND and other valgrind "client requests".
+  The file "valgrind.h" may be obtained by doing
+     svn co svn://svn.valgrind.org/valgrind/trunk/include
+
+  If for some reason you can't use "valgrind.h" or want to fake valgrind,
+  there are two ways to make this function return non-zero:
+    - Use environment variable: export RUNNING_ON_VALGRIND=1
+    - Make your tool intercept the function RunningOnValgrind() and
+      change its return value.
+ */
+int RunningOnValgrind(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus)
+
+  /* ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads.
+
+     Instead of doing
+        ANNOTATE_IGNORE_READS_BEGIN();
+        ... = x;
+        ANNOTATE_IGNORE_READS_END();
+     one can use
+        ... = ANNOTATE_UNPROTECTED_READ(x); */
   template <class T>
-  inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x)
-       NO_THREAD_SAFETY_ANALYSIS {
+  inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) {
     ANNOTATE_IGNORE_READS_BEGIN();
     T res = x;
     ANNOTATE_IGNORE_READS_END();
     return res;
   }
-
-  // Apply ANNOTATE_BENIGN_RACE to a static variable.
+  /* Apply ANNOTATE_BENIGN_RACE_SIZED to a static variable. */
   #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description)        \
     namespace {                                                       \
       class static_var ## _annotator {                                \
        public:                                                        \
         static_var ## _annotator() {                                  \
-          ANNOTATE_BENIGN_RACE(&static_var,                           \
+          ANNOTATE_BENIGN_RACE_SIZED(&static_var,                     \
+                                      sizeof(static_var),             \
             # static_var ": " description);                           \
         }                                                             \
       };                                                              \
       static static_var ## _annotator the ## static_var ## _annotator;\
     }
-#else // !NDEBUG
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
 
   #define ANNOTATE_UNPROTECTED_READ(x) (x)
-  #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description)  // empty
-
-#endif // !NDEBUG
-
-// Return non-zero value if running under valgrind.
-extern "C" int RunningOnValgrind();
+  #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description)  /* empty */
 
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED */
 
-#endif  // BASE_DYNAMIC_ANNOTATIONS_H_
+#endif  /* BASE_DYNAMIC_ANNOTATIONS_H_ */
diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc
index 2bbce54..7ca3953 100644
--- a/src/base/low_level_alloc.cc
+++ b/src/base/low_level_alloc.cc
@@ -210,8 +210,9 @@ static const intptr_t kMagicUnallocated = ~kMagicAllocated;
 namespace {
   class ArenaLock {
    public:
-    explicit ArenaLock(LowLevelAlloc::Arena *arena) :
-        left_(false), mask_valid_(false), arena_(arena) {
+    explicit ArenaLock(LowLevelAlloc::Arena *arena)
+        EXCLUSIVE_LOCK_FUNCTION(arena->mu)
+        : left_(false), mask_valid_(false), arena_(arena) {
       if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
       // We've decided not to support async-signal-safe arena use until
       // there a demonstrated need.  Here's how one could do it though
@@ -228,7 +229,7 @@ namespace {
       this->arena_->mu.Lock();
     }
     ~ArenaLock() { RAW_CHECK(this->left_, "haven't left Arena region"); }
-    void Leave() {
+    void Leave() UNLOCK_FUNCTION(arena_->mu) {
       this->arena_->mu.Unlock();
 #if 0
       if (this->mask_valid_) {
diff --git a/src/config.h.in b/src/config.h.in
index 1ad2642..49bbf0d 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -132,7 +132,7 @@
 /* Define to 1 if you have the <sys/types.h> header file. */
 #undef HAVE_SYS_TYPES_H
 
-/* Define to 1 if you have the <sys/ucontext.h> header file. */
+/* <sys/ucontext.h> is broken on redhat 7 */
 #undef HAVE_SYS_UCONTEXT_H
 
 /* Define to 1 if you have the <sys/wait.h> header file. */
@@ -150,6 +150,9 @@
 /* Define to 1 if you have the <unwind.h> header file. */
 #undef HAVE_UNWIND_H
 
+/* Define to 1 if you have the <valgrind.h> header file. */
+#undef HAVE_VALGRIND_H
+
 /* define if your compiler has __attribute__ */
 #undef HAVE___ATTRIBUTE__
 
diff --git a/src/debugallocation.cc b/src/debugallocation.cc
index 1a9ddcb..949fbe9 100644
--- a/src/debugallocation.cc
+++ b/src/debugallocation.cc
@@ -1010,7 +1010,7 @@ static void *MemalignOverride(size_t align, size_t size,
                               const void *caller) __THROW
   ATTRIBUTE_SECTION(google_malloc);
 
-void* operator new(size_t size)
+void* operator new(size_t size) throw (std::bad_alloc)
   ATTRIBUTE_SECTION(google_malloc);
 void* operator new(size_t size, const std::nothrow_t&) __THROW
   ATTRIBUTE_SECTION(google_malloc);
@@ -1018,7 +1018,7 @@ void operator delete(void* p) __THROW
   ATTRIBUTE_SECTION(google_malloc);
 void operator delete(void* p, const std::nothrow_t&) __THROW
   ATTRIBUTE_SECTION(google_malloc);
-void* operator new[](size_t size)
+void* operator new[](size_t size) throw (std::bad_alloc)
   ATTRIBUTE_SECTION(google_malloc);
 void* operator new[](size_t size, const std::nothrow_t&) __THROW
   ATTRIBUTE_SECTION(google_malloc);
@@ -1176,12 +1176,12 @@ extern "C" void* pvalloc(size_t size) __THROW {
   return p;
 }
 
-extern "C" int mallopt(int cmd, int value) {
+extern "C" int mallopt(int cmd, int value) __THROW {
   return BASE_MALLOPT(cmd, value);
 }
 
 #ifdef HAVE_STRUCT_MALLINFO
-extern "C" struct mallinfo mallinfo(void) {
+extern "C" struct mallinfo mallinfo(void) __THROW {
   return BASE_MALLINFO();
 }
 #endif
@@ -1239,7 +1239,7 @@ inline void* cpp_debug_alloc(size_t size, int new_type, bool nothrow) {
   }
 }
 
-void* operator new(size_t size) {
+void* operator new(size_t size) throw (std::bad_alloc) {
   void* ptr = cpp_debug_alloc(size, MallocBlock::kNewType, false);
   MallocHook::InvokeNewHook(ptr, size);
   if (ptr == NULL) {
@@ -1259,7 +1259,8 @@ void operator delete(void* ptr) __THROW {
   DebugDeallocate(ptr, MallocBlock::kNewType);
 }
 
-// Compilers use this, though I can't see how it differs from normal delete.
+// Some STL implementations explicitly invoke this.
+// It is completely equivalent to a normal delete (delete never throws).
 void operator delete(void* ptr, const std::nothrow_t&) __THROW {
   MallocHook::InvokeDeleteHook(ptr);
   DebugDeallocate(ptr, MallocBlock::kNewType);
@@ -1269,7 +1270,7 @@ void operator delete(void* ptr, const std::nothrow_t&) __THROW {
 
 // Alloc/free stuff for debug operator new[] & friends
 
-void* operator new[](size_t size) {
+void* operator new[](size_t size) throw (std::bad_alloc) {
   void* ptr = cpp_debug_alloc(size, MallocBlock::kArrayNewType, false);
   MallocHook::InvokeNewHook(ptr, size);
   if (ptr == NULL) {
@@ -1289,7 +1290,8 @@ void operator delete[](void* ptr) __THROW {
   DebugDeallocate(ptr, MallocBlock::kArrayNewType);
 }
 
-// Compilers use this, though I can't see how it differs from normal delete.
+// Some STL implementations explicitly invoke this.
+// It is completely equivalent to a normal delete (delete never throws).
 void operator delete[](void* ptr, const std::nothrow_t&) __THROW {
   MallocHook::InvokeDeleteHook(ptr);
   DebugDeallocate(ptr, MallocBlock::kArrayNewType);
@@ -1359,17 +1361,22 @@ class DebugMallocImplementation : public ParentImplementation {
 static DebugMallocImplementation debug_malloc_implementation;
 
 REGISTER_MODULE_INITIALIZER(debugallocation, {
-  MallocExtension::Register(&debug_malloc_implementation);
-
-  // When the program exits, check all blocks still in the free queue for
-  // corruption.
-  atexit(DanglingWriteChecker);
+  // Either we or valgrind will control memory management.  We
+  // register our extension if we're the winner.
+  if (RunningOnValgrind()) {
+    // Let Valgrind use its own malloc (so don't register our extension).
+  } else {
+    MallocExtension::Register(&debug_malloc_implementation);
+    // When the program exits, check all blocks still in the free
+    // queue for corruption.
+    atexit(DanglingWriteChecker);
+  }
 });
 
 #ifdef TCMALLOC_FOR_DEBUGALLOCATION
 
 // Redefine malloc_stats to use tcmalloc's implementation:
-extern "C" void malloc_stats(void) {
+extern "C" void malloc_stats(void) __THROW {
   do_malloc_stats();
 }
 
diff --git a/src/google/stacktrace.h b/src/google/stacktrace.h
index 8188ce3..fd186d6 100644
--- a/src/google/stacktrace.h
+++ b/src/google/stacktrace.h
@@ -49,23 +49,23 @@
 // Skips the most recent "skip_count" stack frames (also skips the
 // frame generated for the "GetStackFrames" routine itself), and then
 // records the pc values for up to the next "max_depth" frames in
-// "pcs", and the corresponding stack frame sizes in "sizes".  Returns
-// the number of values recorded in "pcs"/"sizes".
+// "result", and the corresponding stack frame sizes in "sizes".
+// Returns the number of values recorded in "result"/"sizes".
 //
 // Example:
 //      main() { foo(); }
 //      foo() { bar(); }
 //      bar() {
-//        void* pcs[10];
+//        void* result[10];
 //        int sizes[10];
-//        int depth = GetStackFrames(pcs, sizes, 10, 1);
+//        int depth = GetStackFrames(result, sizes, 10, 1);
 //      }
 //
 // The GetStackFrames call will skip the frame for "bar".  It will
 // return 2 and will produce pc values that map to the following
 // procedures:
-//      pcs[0]       foo
-//      pcs[1]       main
+//      result[0]       foo
+//      result[1]       main
 // (Actually, there may be a few more entries after "main" to account for
 // startup procedures.)
 // And corresponding stack frame sizes will also be recorded:
@@ -76,15 +76,15 @@
 // be identified.
 //
 // This routine may return fewer stack frame entries than are
-// available. Also note that "pcs" and "sizes" must both be non-NULL.
-extern PERFTOOLS_DLL_DECL int GetStackFrames(void** pcs, int* sizes, int max_depth,
+// available. Also note that "result" and "sizes" must both be non-NULL.
+extern PERFTOOLS_DLL_DECL int GetStackFrames(void** result, int* sizes, int max_depth,
                           int skip_count);
 
 // Same as above, but to be used from a signal handler. The "uc" parameter
 // should be the pointer to ucontext_t which was passed as the 3rd parameter
 // to sa_sigaction signal handler. It may help the unwinder to get a
 // better stack trace under certain conditions. The "uc" may safely be NULL.
-extern PERFTOOLS_DLL_DECL int GetStackFramesWithContext(void** pcs, int* sizes, int max_depth,
+extern PERFTOOLS_DLL_DECL int GetStackFramesWithContext(void** result, int* sizes, int max_depth,
                                      int skip_count, const void *uc);
 
 // This is similar to the GetStackFrames routine, except that it returns
diff --git a/src/google/tcmalloc.h.in b/src/google/tcmalloc.h.in
index e5c873d..fbb70ab 100644
--- a/src/google/tcmalloc.h.in
+++ b/src/google/tcmalloc.h.in
@@ -60,7 +60,8 @@
 #endif
 
 #ifdef __cplusplus
-#include <new>  // for nothrow_t
+#include <new>          // for std::nothrow_t
+
 extern "C" {
 #endif
   // Returns a human-readable version string.  If major, minor,
@@ -91,16 +92,15 @@ extern "C" {
 #ifdef __cplusplus
   PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
   PERFTOOLS_DLL_DECL void* tc_new(size_t size);
-  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
-  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
-  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
-
   PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
                                           const std::nothrow_t&) __THROW;
-  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
-                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
   PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
                                             const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
   PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
                                                  const std::nothrow_t&) __THROW;
 }
diff --git a/src/heap-checker.cc b/src/heap-checker.cc
index 82a7adb..b539ed8 100644
--- a/src/heap-checker.cc
+++ b/src/heap-checker.cc
@@ -159,6 +159,23 @@ DEFINE_bool(heap_check_test_pointer_alignment,
             "Set to true to check if the found leak can be due to "
             "use of unaligned pointers");
 
+// Alignment at which all pointers in memory are supposed to be located;
+// use 1 if any alignment is ok.
+// heap_check_test_pointer_alignment flag guides if we try the value of 1.
+// The larger it can be, the lesser is the chance of missing real leaks.
+//
+// sizeof(void*) is correct.  However gold (the new linker) has a bug where it
+// sometimes places global pointers on 4-byte boundaries, even when pointers
+// are 8 bytes long.  While we are fixing the linker, degrade to 4-byte
+// alignment on all targets.  http://b/1226481
+//
+static const size_t kPointerSourceAlignment = sizeof(void*);
+DEFINE_int32(heap_check_pointer_source_alignment,
+	     EnvToInt("HEAP_CHECK_POINTER_SOURCE_ALIGNMENT",
+                      kPointerSourceAlignment),
+             "Alignment at which all pointers in memory are supposed to be "
+             "located.  Use 1 if any alignment is ok.");
+
 // A reasonable default to handle pointers inside of typical class objects:
 // Too low and we won't be able to traverse pointers to normally-used
 // nested objects and base parts of multiple-inherited objects.
@@ -245,13 +262,6 @@ static bool constructor_heap_profiling = false;
 static const int heap_checker_info_level = 0;
 
 //----------------------------------------------------------------------
-
-// Alignment at which all pointers in memory are supposed to be located;
-// use 1 if any alignment is ok.
-// heap_check_test_pointer_alignment flag guides if we try the value of 1.
-// The larger it can be, the lesser is the chance of missing real leaks.
-static const size_t kPointerSourceAlignment = sizeof(void*);
-
 // Cancel our InitialMallocHook_* if present.
 static void CancelInitialMallocHooks();  // defined below
 
@@ -484,7 +494,7 @@ HeapLeakChecker::Disabler::Disabler() {
   // in a thread-safe manner.
   int counter = get_thread_disable_counter();
   set_thread_disable_counter(counter + 1);
-  RAW_VLOG(1, "Increasing thread disable counter to %d", counter + 1);
+  RAW_VLOG(10, "Increasing thread disable counter to %d", counter + 1);
 }
 
 HeapLeakChecker::Disabler::~Disabler() {
@@ -492,7 +502,7 @@ HeapLeakChecker::Disabler::~Disabler() {
   RAW_DCHECK(counter > 0, "");
   if (counter > 0) {
     set_thread_disable_counter(counter - 1);
-    RAW_VLOG(1, "Decreasing thread disable counter to %d", counter);
+    RAW_VLOG(10, "Decreasing thread disable counter to %d", counter);
   } else {
     RAW_VLOG(0, "Thread disable counter underflow : %d", counter);
   }
@@ -525,7 +535,7 @@ static void NewHook(const void* ptr, size_t size) {
   if (ptr != NULL) {
     const int counter = get_thread_disable_counter();
     const bool ignore = (counter > 0);
-    RAW_VLOG(7, "Recording Alloc: %p of %"PRIuS "; %d", ptr, size,
+    RAW_VLOG(16, "Recording Alloc: %p of %"PRIuS "; %d", ptr, size,
              int(counter));
     { SpinLockHolder l(&heap_checker_lock);
       if (size > max_heap_object_size) max_heap_object_size = size;
@@ -540,17 +550,17 @@ static void NewHook(const void* ptr, size_t size) {
         }
       }
     }
-    RAW_VLOG(8, "Alloc Recorded: %p of %"PRIuS"", ptr, size);
+    RAW_VLOG(17, "Alloc Recorded: %p of %"PRIuS"", ptr, size);
   }
 }
 
 static void DeleteHook(const void* ptr) {
   if (ptr != NULL) {
-    RAW_VLOG(7, "Recording Free %p", ptr);
+    RAW_VLOG(16, "Recording Free %p", ptr);
     { SpinLockHolder l(&heap_checker_lock);
       if (heap_checker_on) heap_profile->RecordFree(ptr);
     }
-    RAW_VLOG(8, "Free Recorded: %p", ptr);
+    RAW_VLOG(17, "Free Recorded: %p", ptr);
   }
 }
 
@@ -584,7 +594,7 @@ static StackDirection stack_direction = UNKNOWN_DIRECTION;
 static void RegisterStackLocked(const void* top_ptr) {
   RAW_DCHECK(heap_checker_lock.IsHeld(), "");
   RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
-  RAW_VLOG(1, "Thread stack at %p", top_ptr);
+  RAW_VLOG(10, "Thread stack at %p", top_ptr);
   uintptr_t top = AsInt(top_ptr);
   stack_tops->insert(top);  // add for later use
 
@@ -598,12 +608,12 @@ static void RegisterStackLocked(const void* top_ptr) {
   if (MemoryRegionMap::FindAndMarkStackRegion(top, &region)) {
     // Make the proper portion of the stack live:
     if (stack_direction == GROWS_TOWARDS_LOW_ADDRESSES) {
-      RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+      RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
                   top_ptr, region.end_addr - top);
       live_objects->push_back(AllocObject(top_ptr, region.end_addr - top,
                                           THREAD_DATA));
     } else {  // GROWS_TOWARDS_HIGH_ADDRESSES
-      RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+      RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
                   AsPtr(region.start_addr),
                   top - region.start_addr);
       live_objects->push_back(AllocObject(AsPtr(region.start_addr),
@@ -619,7 +629,7 @@ static void RegisterStackLocked(const void* top_ptr) {
         uintptr_t start = AsInt(span->ptr);
         uintptr_t end = start + span->size;
         if (start <= top  &&  top < end) {
-          RAW_VLOG(2, "Stack at %p is inside /proc/self/maps chunk %p..%p",
+          RAW_VLOG(11, "Stack at %p is inside /proc/self/maps chunk %p..%p",
                       top_ptr, AsPtr(start), AsPtr(end));
           // Shrink start..end region by chopping away the memory regions in
           // MemoryRegionMap that land in it to undo merging of regions
@@ -640,17 +650,17 @@ static void RegisterStackLocked(const void* top_ptr) {
             }
           }
           if (stack_start != start  ||  stack_end != end) {
-            RAW_VLOG(2, "Stack at %p is actually inside memory chunk %p..%p",
+            RAW_VLOG(11, "Stack at %p is actually inside memory chunk %p..%p",
                         top_ptr, AsPtr(stack_start), AsPtr(stack_end));
           }
           // Make the proper portion of the stack live:
           if (stack_direction == GROWS_TOWARDS_LOW_ADDRESSES) {
-            RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+            RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
                         top_ptr, stack_end - top);
             live_objects->push_back(
               AllocObject(top_ptr, stack_end - top, THREAD_DATA));
           } else {  // GROWS_TOWARDS_HIGH_ADDRESSES
-            RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+            RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
                         AsPtr(stack_start), top - stack_start);
             live_objects->push_back(
               AllocObject(AsPtr(stack_start), top - stack_start, THREAD_DATA));
@@ -723,14 +733,14 @@ static void MakeDisabledLiveCallbackLocked(
         // and the rest of the region where the stack lives can well
         // contain outdated stack variables which are not live anymore,
         // hence should not be treated as such.
-        RAW_VLOG(2, "Not %s-disabling %"PRIuS" bytes at %p"
+        RAW_VLOG(11, "Not %s-disabling %"PRIuS" bytes at %p"
                     ": have stack inside: %p",
                     (stack_disable ? "stack" : "range"),
                     info.object_size, ptr, AsPtr(*iter));
         return;
       }
     }
-    RAW_VLOG(2, "%s-disabling %"PRIuS" bytes at %p",
+    RAW_VLOG(11, "%s-disabling %"PRIuS" bytes at %p",
                 (stack_disable ? "Stack" : "Range"), info.object_size, ptr);
     live_objects->push_back(AllocObject(ptr, info.object_size,
                                         MUST_BE_ON_HEAP));
@@ -755,7 +765,7 @@ static void RecordGlobalDataLocked(uintptr_t start_address,
   // Ignore non-writeable regions.
   if (strchr(permissions, 'w') == NULL) return;
   if (filename == NULL  ||  *filename == '\0')  filename = "UNNAMED";
-  RAW_VLOG(2, "Looking into %s: 0x%" PRIxPTR "..0x%" PRIxPTR,
+  RAW_VLOG(11, "Looking into %s: 0x%" PRIxPTR "..0x%" PRIxPTR,
               filename, start_address, end_address);
   (*library_live_objects)[filename].
     push_back(AllocObject(AsPtr(start_address),
@@ -814,12 +824,12 @@ void HeapLeakChecker::DisableLibraryAllocsLocked(const char* library,
     // does not call user code.
   }
   if (depth) {
-    RAW_VLOG(1, "Disabling allocations from %s at depth %d:", library, depth);
+    RAW_VLOG(10, "Disabling allocations from %s at depth %d:", library, depth);
     DisableChecksFromToLocked(AsPtr(start_address), AsPtr(end_address), depth);
     if (IsLibraryNamed(library, "/libpthread")  ||
         IsLibraryNamed(library, "/libdl")  ||
         IsLibraryNamed(library, "/ld")) {
-      RAW_VLOG(1, "Global memory regions made by %s will be live data",
+      RAW_VLOG(10, "Global memory regions made by %s will be live data",
                   library);
       if (global_region_caller_ranges == NULL) {
         global_region_caller_ranges =
@@ -936,7 +946,7 @@ static enum {
                                                         va_list /*ap*/) {
   RAW_DCHECK(heap_checker_lock.IsHeld(), "");
   thread_listing_status = CALLBACK_STARTED;
-  RAW_VLOG(2, "Found %d threads (from pid %d)", num_threads, getpid());
+  RAW_VLOG(11, "Found %d threads (from pid %d)", num_threads, getpid());
 
   if (FLAGS_heap_check_ignore_global_live) {
     UseProcMapsLocked(RECORD_GLOBAL_DATA);
@@ -951,7 +961,7 @@ static enum {
     // the leak checking thread itself is handled
     // specially via self_thread_stack, not here:
     if (thread_pids[i] == self_thread_pid) continue;
-    RAW_VLOG(2, "Handling thread with pid %d", thread_pids[i]);
+    RAW_VLOG(11, "Handling thread with pid %d", thread_pids[i]);
 #if defined(HAVE_LINUX_PTRACE_H) && defined(HAVE_SYS_SYSCALL_H) && defined(DUMPER)
     i386_regs thread_regs;
 #define sys_ptrace(r, p, a, d)  syscall(SYS_ptrace, (r), (p), (a), (d))
@@ -967,7 +977,7 @@ static enum {
       // register pointers still being in the registers and not on the stack):
       for (void** p = reinterpret_cast<void**>(&thread_regs);
            p < reinterpret_cast<void**>(&thread_regs + 1); ++p) {
-        RAW_VLOG(3, "Thread register %p", *p);
+        RAW_VLOG(12, "Thread register %p", *p);
         thread_registers.push_back(*p);
       }
     } else {
@@ -982,7 +992,7 @@ static enum {
   if (thread_registers.size()) {
     // Make thread registers be live heap data sources.
     // we rely here on the fact that vector is in one memory chunk:
-    RAW_VLOG(2, "Live registers at %p of %"PRIuS" bytes",
+    RAW_VLOG(11, "Live registers at %p of %"PRIuS" bytes",
                 &thread_registers[0], thread_registers.size() * sizeof(void*));
     live_objects->push_back(AllocObject(&thread_registers[0],
                                         thread_registers.size() * sizeof(void*),
@@ -1005,7 +1015,7 @@ static const void* self_thread_stack_top;
 void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
   RAW_DCHECK(heap_checker_lock.IsHeld(), "");
   RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
-  RAW_VLOG(2, "Handling self thread with pid %d", self_thread_pid);
+  RAW_VLOG(11, "Handling self thread with pid %d", self_thread_pid);
   // Register our own stack:
 
   // Important that all stack ranges (including the one here)
@@ -1019,7 +1029,7 @@ void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
     for (IgnoredObjectsMap::const_iterator object = ignored_objects->begin();
          object != ignored_objects->end(); ++object) {
       const void* ptr = AsPtr(object->first);
-      RAW_VLOG(2, "Ignored live object at %p of %"PRIuS" bytes",
+      RAW_VLOG(11, "Ignored live object at %p of %"PRIuS" bytes",
                   ptr, object->second);
       live_objects->
         push_back(AllocObject(ptr, object->second, MUST_BE_ON_HEAP));
@@ -1132,10 +1142,10 @@ void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
         }
       }
       // Now get and use live_objects from the final version of l->second:
-      if (VLOG_IS_ON(2)) {
+      if (VLOG_IS_ON(11)) {
         for (LiveObjectsStack::const_iterator i = l->second.begin();
              i != l->second.end(); ++i) {
-          RAW_VLOG(2, "Library live region at %p of %"PRIuPTR" bytes",
+          RAW_VLOG(11, "Library live region at %p of %"PRIuPTR" bytes",
                       i->ptr, i->size);
         }
       }
@@ -1240,7 +1250,7 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
       RAW_LOG(ERROR, "Thread stacks not found for %d threads. "
                      "Will likely report false leak positives.", r);
     } else {
-      RAW_VLOG(2, "Thread stacks appear to be found for all threads");
+      RAW_VLOG(11, "Thread stacks appear to be found for all threads");
     }
   } else {
     RAW_LOG(WARNING, "Not looking for thread stacks; "
@@ -1256,7 +1266,7 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
     IgnoreNonThreadLiveObjectsLocked();
   }
   if (live_objects_total) {
-    RAW_VLOG(1, "Ignoring %"PRId64" reachable objects of %"PRId64" bytes",
+    RAW_VLOG(10, "Ignoring %"PRId64" reachable objects of %"PRId64" bytes",
                 live_objects_total, live_bytes_total);
   }
   // Free these: we made them here and heap_profile never saw them
@@ -1266,7 +1276,8 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
 }
 
 // Alignment at which we should consider pointer positions
-// in IgnoreLiveObjectsLocked. Use 1 if any alignment is ok.
+// in IgnoreLiveObjectsLocked. Will normally use the value of
+// FLAGS_heap_check_pointer_source_alignment.
 static size_t pointer_source_alignment = kPointerSourceAlignment;
 // Global lock for HeapLeakChecker::DoNoLeaks
 // to protect pointer_source_alignment.
@@ -1314,7 +1325,7 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
       live_object_count += 1;
       live_byte_count += size;
     }
-    RAW_VLOG(4, "Looking for heap pointers in %p of %"PRIuS" bytes",
+    RAW_VLOG(13, "Looking for heap pointers in %p of %"PRIuS" bytes",
                 object, size);
     const char* const whole_object = object;
     size_t const whole_size = size;
@@ -1351,7 +1362,7 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
       if (can_be_on_heap) {
         const void* ptr = reinterpret_cast<const void*>(addr);
         // Too expensive (inner loop): manually uncomment when debugging:
-        // RAW_VLOG(8, "Trying pointer to %p at %p", ptr, object);
+        // RAW_VLOG(17, "Trying pointer to %p at %p", ptr, object);
         size_t object_size;
         if (HaveOnHeapLocked(&ptr, &object_size)  &&
             heap_profile->MarkAsLive(ptr)) {
@@ -1360,10 +1371,10 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
           // a heap object which is in fact leaked.
           // I.e. in very rare and probably not repeatable/lasting cases
           // we might miss some real heap memory leaks.
-          RAW_VLOG(5, "Found pointer to %p of %"PRIuS" bytes at %p "
+          RAW_VLOG(14, "Found pointer to %p of %"PRIuS" bytes at %p "
                       "inside %p of size %"PRIuS"",
                       ptr, object_size, object, whole_object, whole_size);
-          if (VLOG_IS_ON(6)) {
+          if (VLOG_IS_ON(15)) {
             // log call stacks to help debug how come something is not a leak
             HeapProfileTable::AllocInfo alloc;
             bool r = heap_profile->FindAllocDetails(ptr, &alloc);
@@ -1386,7 +1397,7 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
   live_objects_total += live_object_count;
   live_bytes_total += live_byte_count;
   if (live_object_count) {
-    RAW_VLOG(1, "Removed %"PRId64" live heap objects of %"PRId64" bytes: %s%s",
+    RAW_VLOG(10, "Removed %"PRId64" live heap objects of %"PRId64" bytes: %s%s",
                 live_object_count, live_byte_count, name, name2);
   }
 }
@@ -1408,7 +1419,7 @@ void HeapLeakChecker::IgnoreObject(const void* ptr) {
   if (!HaveOnHeapLocked(&ptr, &object_size)) {
     RAW_LOG(ERROR, "No live heap object at %p to ignore", ptr);
   } else {
-    RAW_VLOG(1, "Going to ignore live object at %p of %"PRIuS" bytes",
+    RAW_VLOG(10, "Going to ignore live object at %p of %"PRIuS" bytes",
                 ptr, object_size);
     if (ignored_objects == NULL)  {
       ignored_objects = new(Allocator::Allocate(sizeof(IgnoredObjectsMap)))
@@ -1434,7 +1445,7 @@ void HeapLeakChecker::UnIgnoreObject(const void* ptr) {
       if (object != ignored_objects->end()  &&  object_size == object->second) {
         ignored_objects->erase(object);
         found = true;
-        RAW_VLOG(1, "Now not going to ignore live object "
+        RAW_VLOG(10, "Now not going to ignore live object "
                     "at %p of %"PRIuS" bytes", ptr, object_size);
       }
     }
@@ -1483,7 +1494,7 @@ void HeapLeakChecker::Create(const char *name, bool make_start_snapshot) {
       const HeapProfileTable::Stats& t = heap_profile->total();
       const size_t start_inuse_bytes = t.alloc_size - t.free_size;
       const size_t start_inuse_allocs = t.allocs - t.frees;
-      RAW_VLOG(1, "Start check \"%s\" profile: %"PRIuS" bytes "
+      RAW_VLOG(10, "Start check \"%s\" profile: %"PRIuS" bytes "
                "in %"PRIuS" objects",
                name_, start_inuse_bytes, start_inuse_allocs);
     } else {
@@ -1649,6 +1660,8 @@ bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
     // Make the heap profile, other threads are locked out.
     HeapProfileTable::Snapshot* base =
         reinterpret_cast<HeapProfileTable::Snapshot*>(start_snapshot_);
+    RAW_DCHECK(FLAGS_heap_check_pointer_source_alignment > 0, "");
+    pointer_source_alignment = FLAGS_heap_check_pointer_source_alignment;
     IgnoreAllLiveObjectsLocked(&a_local_var);
     leaks = heap_profile->NonLiveSnapshot(base);
 
@@ -1668,23 +1681,28 @@ bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
                 initial_allocs, Allocator::alloc_count());
       }
     } else if (FLAGS_heap_check_test_pointer_alignment) {
-      // Try with reduced pointer aligment
-      pointer_source_alignment = 1;
-      IgnoreAllLiveObjectsLocked(&a_local_var);
-      HeapProfileTable::Snapshot* leaks_wo_align =
-          heap_profile->NonLiveSnapshot(base);
-      pointer_source_alignment = kPointerSourceAlignment;
-      if (leaks_wo_align->Empty()) {
-        RAW_LOG(WARNING, "Found no leaks without pointer alignment: "
-                "something might be placing pointers at "
-                "unaligned addresses! This needs to be fixed.");
+      if (pointer_source_alignment == 1) {
+        RAW_LOG(WARNING, "--heap_check_test_pointer_alignment has no effect: "
+                "--heap_check_pointer_source_alignment was already set to 1");
       } else {
-        RAW_LOG(INFO, "Found leaks without pointer alignment as well: "
-                "unaligned pointers must not be the cause of leaks.");
-        RAW_LOG(INFO, "--heap_check_test_pointer_alignment did not help "
-                "to diagnose the leaks.");
+        // Try with reduced pointer alignment
+        pointer_source_alignment = 1;
+        IgnoreAllLiveObjectsLocked(&a_local_var);
+        HeapProfileTable::Snapshot* leaks_wo_align =
+            heap_profile->NonLiveSnapshot(base);
+        pointer_source_alignment = FLAGS_heap_check_pointer_source_alignment;
+        if (leaks_wo_align->Empty()) {
+          RAW_LOG(WARNING, "Found no leaks without pointer alignment: "
+                  "something might be placing pointers at "
+                  "unaligned addresses! This needs to be fixed.");
+        } else {
+          RAW_LOG(INFO, "Found leaks without pointer alignment as well: "
+                  "unaligned pointers must not be the cause of leaks.");
+          RAW_LOG(INFO, "--heap_check_test_pointer_alignment did not help "
+                  "to diagnose the leaks.");
+        }
+        heap_profile->ReleaseSnapshot(leaks_wo_align);
       }
-      heap_profile->ReleaseSnapshot(leaks_wo_align);
     }
 
     if (leaks != NULL) {
@@ -1874,6 +1892,7 @@ static bool internal_init_start_has_run = false;
   }
 
   // Set all flags
+  RAW_DCHECK(FLAGS_heap_check_pointer_source_alignment > 0, "");
   if (FLAGS_heap_check == "minimal") {
     // The least we can check.
     FLAGS_heap_check_before_constructors = false;  // from after main
@@ -2043,7 +2062,7 @@ bool HeapLeakChecker::NoGlobalLeaks() {
   // we never delete or change main_heap_checker once it's set:
   HeapLeakChecker* main_hc = GlobalChecker();
   if (main_hc) {
-    RAW_VLOG(1, "Checking for whole-program memory leaks");
+    RAW_VLOG(10, "Checking for whole-program memory leaks");
     // The program is over, so it's safe to symbolize addresses (which
     // requires a fork) because no serious work is expected to be done
     // after this.  Symbolizing is really useful -- knowing what
@@ -2165,7 +2184,7 @@ void HeapLeakChecker::BeforeConstructorsLocked() {
   RAW_CHECK(heap_profile == NULL, "");
   heap_profile = new(Allocator::Allocate(sizeof(HeapProfileTable)))
                    HeapProfileTable(&Allocator::Allocate, &Allocator::Free);
-  RAW_VLOG(1, "Starting tracking the heap");
+  RAW_VLOG(10, "Starting tracking the heap");
   heap_checker_on = true;
 }
 
@@ -2329,7 +2348,7 @@ void HeapLeakChecker::DisableChecksFromToLocked(const void* start_address,
   value.start_address = AsInt(start_address);
   value.max_depth = max_depth;
   if (disabled_ranges->insert(make_pair(AsInt(end_address), value)).second) {
-    RAW_VLOG(1, "Disabling leak checking in stack traces "
+    RAW_VLOG(10, "Disabling leak checking in stack traces "
                 "under frame addresses between %p..%p",
                 start_address, end_address);
   } else {  // check that this is just a verbatim repetition
@@ -2352,7 +2371,7 @@ inline bool HeapLeakChecker::HaveOnHeapLocked(const void** ptr,
   const uintptr_t addr = AsInt(*ptr);
   if (heap_profile->FindInsideAlloc(
         *ptr, max_heap_object_size, ptr, object_size)) {
-    RAW_VLOG(7, "Got pointer into %p at +%"PRIuPTR" offset",
+    RAW_VLOG(16, "Got pointer into %p at +%"PRIuPTR" offset",
              *ptr, addr - AsInt(*ptr));
     return true;
   }
diff --git a/src/heap-profile-table.cc b/src/heap-profile-table.cc
index 66e4f20..ecaf75f 100644
--- a/src/heap-profile-table.cc
+++ b/src/heap-profile-table.cc
@@ -99,7 +99,7 @@ const char HeapProfileTable::kFileExt[] = ".heap";
 //----------------------------------------------------------------------
 
 static const int kHashTableSize = 179999;   // Size for table_.
-/*static*/ const int HeapProfileTable::kMaxStackDepth = 32;
+/*static*/ const int HeapProfileTable::kMaxStackDepth;
 
 //----------------------------------------------------------------------
 
diff --git a/src/heap-profile-table.h b/src/heap-profile-table.h
index 5403257..c9bee15 100644
--- a/src/heap-profile-table.h
+++ b/src/heap-profile-table.h
@@ -52,8 +52,8 @@ class HeapProfileTable {
   // Extension to be used for heap pforile files.
   static const char kFileExt[];
 
-  // Longest stack trace we record.  Defined in the .cc file.
-  static const int kMaxStackDepth;
+  // Longest stack trace we record.
+  static const int kMaxStackDepth = 32;
 
   // data types ----------------------------
 
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
index 4ce262f..c2f8b54 100644
--- a/src/malloc_extension.cc
+++ b/src/malloc_extension.cc
@@ -187,7 +187,10 @@ MallocExtension* MallocExtension::instance() {
 void MallocExtension::Register(MallocExtension* implementation) {
   perftools_pthread_once(&module_init, InitModule);
   // When running under valgrind, our custom malloc is replaced with
-  // valgrind's one and malloc extensions will not work.
+  // valgrind's one and malloc extensions will not work.  (Note:
+  // callers should be responsible for checking that they are the
+  // malloc that is really being run, before calling Register.  This
+  // is just here as an extra sanity check.)
   if (!RunningOnValgrind()) {
     current_instance = implementation;
   }
diff --git a/src/malloc_hook.cc b/src/malloc_hook.cc
index 2a7f542..4315b86 100644
--- a/src/malloc_hook.cc
+++ b/src/malloc_hook.cc
@@ -326,8 +326,8 @@ extern "C" int MallocHook_GetCallerStackTrace(void** result, int max_depth,
     return 0;
   for (int i = 0; i < depth; ++i) {  // stack[0] is our immediate caller
     if (InHookCaller(stack[i])) {
-      RAW_VLOG(4, "Found hooked allocator at %d: %p <- %p",
-                  i, stack[i], stack[i+1]);
+      RAW_VLOG(10, "Found hooked allocator at %d: %p <- %p",
+                   i, stack[i], stack[i+1]);
       i += 1;  // skip hook caller frame
       depth -= i;  // correct depth
       if (depth > max_depth) depth = max_depth;
diff --git a/src/memory_region_map.cc b/src/memory_region_map.cc
index 05fdc06..f6bed45 100644
--- a/src/memory_region_map.cc
+++ b/src/memory_region_map.cc
@@ -181,7 +181,7 @@ static MemoryRegionMap::RegionSetRep regions_rep;
 static bool recursive_insert = false;
 
 void MemoryRegionMap::Init(int max_stack_depth) {
-  RAW_VLOG(2, "MemoryRegionMap Init");
+  RAW_VLOG(10, "MemoryRegionMap Init");
   RAW_CHECK(max_stack_depth >= 0, "");
   // Make sure we don't overflow the memory in region stacks:
   RAW_CHECK(max_stack_depth <= kMaxStackDepth,
@@ -192,7 +192,7 @@ void MemoryRegionMap::Init(int max_stack_depth) {
   if (client_count_ > 1) {
     // not first client: already did initialization-proper
     Unlock();
-    RAW_VLOG(2, "MemoryRegionMap Init increment done");
+    RAW_VLOG(10, "MemoryRegionMap Init increment done");
     return;
   }
   // Set our hooks and make sure no other hooks existed:
@@ -217,17 +217,17 @@ void MemoryRegionMap::Init(int max_stack_depth) {
     // recursive_insert = false; as InsertRegionLocked will also construct
     // regions_ on demand for us.
   Unlock();
-  RAW_VLOG(2, "MemoryRegionMap Init done");
+  RAW_VLOG(10, "MemoryRegionMap Init done");
 }
 
 bool MemoryRegionMap::Shutdown() {
-  RAW_VLOG(2, "MemoryRegionMap Shutdown");
+  RAW_VLOG(10, "MemoryRegionMap Shutdown");
   Lock();
   RAW_CHECK(client_count_ > 0, "");
   client_count_ -= 1;
   if (client_count_ != 0) {  // not last client; need not really shutdown
     Unlock();
-    RAW_VLOG(2, "MemoryRegionMap Shutdown decrement done");
+    RAW_VLOG(10, "MemoryRegionMap Shutdown decrement done");
     return true;
   }
   CheckMallocHooks();  // we assume no other hooks
@@ -244,7 +244,7 @@ bool MemoryRegionMap::Shutdown() {
     RAW_LOG(WARNING, "Can't delete LowLevelAlloc arena: it's being used");
   }
   Unlock();
-  RAW_VLOG(2, "MemoryRegionMap Shutdown done");
+  RAW_VLOG(10, "MemoryRegionMap Shutdown done");
   return deleted_arena;
 }
 
@@ -336,7 +336,7 @@ bool MemoryRegionMap::FindAndMarkStackRegion(uintptr_t stack_top,
   Lock();
   const Region* region = DoFindRegionLocked(stack_top);
   if (region != NULL) {
-    RAW_VLOG(2, "Stack at %p is inside region %p..%p",
+    RAW_VLOG(10, "Stack at %p is inside region %p..%p",
                 reinterpret_cast<void*>(stack_top),
                 reinterpret_cast<void*>(region->start_addr),
                 reinterpret_cast<void*>(region->end_addr));
@@ -361,7 +361,7 @@ MemoryRegionMap::RegionIterator MemoryRegionMap::EndRegionLocked() {
 }
 
 inline void MemoryRegionMap::DoInsertRegionLocked(const Region& region) {
-  RAW_VLOG(4, "Inserting region %p..%p from %p",
+  RAW_VLOG(12, "Inserting region %p..%p from %p",
               reinterpret_cast<void*>(region.start_addr),
               reinterpret_cast<void*>(region.end_addr),
               reinterpret_cast<void*>(region.caller()));
@@ -385,10 +385,10 @@ inline void MemoryRegionMap::DoInsertRegionLocked(const Region& region) {
   // This inserts and allocates permanent storage for region
   // and its call stack data: it's safe to do it now:
   regions_->insert(region);
-  RAW_VLOG(4, "Inserted region %p..%p :",
+  RAW_VLOG(12, "Inserted region %p..%p :",
               reinterpret_cast<void*>(region.start_addr),
               reinterpret_cast<void*>(region.end_addr));
-  if (VLOG_IS_ON(4))  LogAllLocked();
+  if (VLOG_IS_ON(12))  LogAllLocked();
 }
 
 // These variables are local to MemoryRegionMap::InsertRegionLocked()
@@ -425,7 +425,7 @@ inline void MemoryRegionMap::InsertRegionLocked(const Region& region) {
   // and taken into account when the recursion unwinds.
   // Do the insert:
   if (recursive_insert) {  // recursion: save in saved_regions
-    RAW_VLOG(4, "Saving recursive insert of region %p..%p from %p",
+    RAW_VLOG(12, "Saving recursive insert of region %p..%p from %p",
                 reinterpret_cast<void*>(region.start_addr),
                 reinterpret_cast<void*>(region.end_addr),
                 reinterpret_cast<void*>(region.caller()));
@@ -436,7 +436,7 @@ inline void MemoryRegionMap::InsertRegionLocked(const Region& region) {
     saved_regions[saved_regions_count++] = region;
   } else {  // not a recusrive call
     if (regions_ == NULL) {  // init regions_
-      RAW_VLOG(4, "Initializing region set");
+      RAW_VLOG(12, "Initializing region set");
       regions_ = regions_rep.region_set();
       recursive_insert = true;
       new(regions_) RegionSet();
@@ -470,7 +470,7 @@ void MemoryRegionMap::RecordRegionAddition(const void* start, size_t size) {
                                       max_stack_depth_, kStripFrames + 1)
     : 0;
   region.set_call_stack_depth(depth);  // record stack info fully
-  RAW_VLOG(2, "New global region %p..%p from %p",
+  RAW_VLOG(10, "New global region %p..%p from %p",
               reinterpret_cast<void*>(region.start_addr),
               reinterpret_cast<void*>(region.end_addr),
               reinterpret_cast<void*>(region.caller()));
@@ -499,7 +499,7 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
         // An exact match, so it's safe to remove.
         --saved_regions_count;
         --put_pos;
-        RAW_VLOG(2, ("Insta-Removing saved region %p..%p; "
+        RAW_VLOG(10, ("Insta-Removing saved region %p..%p; "
                      "now have %d saved regions"),
                  reinterpret_cast<void*>(start_addr),
                  reinterpret_cast<void*>(end_addr),
@@ -523,7 +523,7 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
   uintptr_t start_addr = reinterpret_cast<uintptr_t>(start);
   uintptr_t end_addr = start_addr + size;
   // subtract start_addr, end_addr from all the regions
-  RAW_VLOG(2, "Removing global region %p..%p; have %"PRIuS" regions",
+  RAW_VLOG(10, "Removing global region %p..%p; have %"PRIuS" regions",
               reinterpret_cast<void*>(start_addr),
               reinterpret_cast<void*>(end_addr),
               regions_->size());
@@ -533,12 +533,12 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
   for (RegionSet::iterator region = regions_->lower_bound(sample);
        region != regions_->end()  &&  region->start_addr < end_addr;
        /*noop*/) {
-    RAW_VLOG(5, "Looking at region %p..%p",
+    RAW_VLOG(13, "Looking at region %p..%p",
                 reinterpret_cast<void*>(region->start_addr),
                 reinterpret_cast<void*>(region->end_addr));
     if (start_addr <= region->start_addr  &&
         region->end_addr <= end_addr) {  // full deletion
-      RAW_VLOG(4, "Deleting region %p..%p",
+      RAW_VLOG(12, "Deleting region %p..%p",
                   reinterpret_cast<void*>(region->start_addr),
                   reinterpret_cast<void*>(region->end_addr));
       RegionSet::iterator d = region;
@@ -547,7 +547,7 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
       continue;
     } else if (region->start_addr < start_addr  &&
                end_addr < region->end_addr) {  // cutting-out split
-      RAW_VLOG(4, "Splitting region %p..%p in two",
+      RAW_VLOG(12, "Splitting region %p..%p in two",
                   reinterpret_cast<void*>(region->start_addr),
                   reinterpret_cast<void*>(region->end_addr));
       // Make another region for the start portion:
@@ -560,13 +560,13 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
       const_cast<Region&>(*region).set_start_addr(end_addr);
     } else if (end_addr > region->start_addr  &&
                start_addr <= region->start_addr) {  // cut from start
-      RAW_VLOG(4, "Start-chopping region %p..%p",
+      RAW_VLOG(12, "Start-chopping region %p..%p",
                   reinterpret_cast<void*>(region->start_addr),
                   reinterpret_cast<void*>(region->end_addr));
       const_cast<Region&>(*region).set_start_addr(end_addr);
     } else if (start_addr > region->start_addr  &&
                start_addr < region->end_addr) {  // cut from end
-      RAW_VLOG(4, "End-chopping region %p..%p",
+      RAW_VLOG(12, "End-chopping region %p..%p",
                   reinterpret_cast<void*>(region->start_addr),
                   reinterpret_cast<void*>(region->end_addr));
       // Can't just modify region->end_addr (it's the sorting key):
@@ -582,11 +582,11 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
     }
     ++region;
   }
-  RAW_VLOG(4, "Removed region %p..%p; have %"PRIuS" regions",
+  RAW_VLOG(12, "Removed region %p..%p; have %"PRIuS" regions",
               reinterpret_cast<void*>(start_addr),
               reinterpret_cast<void*>(end_addr),
               regions_->size());
-  if (VLOG_IS_ON(4))  LogAllLocked();
+  if (VLOG_IS_ON(12))  LogAllLocked();
   Unlock();
 }
 
@@ -596,7 +596,7 @@ void MemoryRegionMap::MmapHook(const void* result,
                                int fd, off_t offset) {
   // TODO(maxim): replace all 0x%"PRIxS" by %p when RAW_VLOG uses a safe
   // snprintf reimplementation that does not malloc to pretty-print NULL
-  RAW_VLOG(2, "MMap = 0x%"PRIxPTR" of %"PRIuS" at %llu "
+  RAW_VLOG(10, "MMap = 0x%"PRIxPTR" of %"PRIuS" at %llu "
               "prot %d flags %d fd %d offs %lld",
               reinterpret_cast<uintptr_t>(result), size,
               reinterpret_cast<uint64>(start), prot, flags, fd,
@@ -607,7 +607,7 @@ void MemoryRegionMap::MmapHook(const void* result,
 }
 
 void MemoryRegionMap::MunmapHook(const void* ptr, size_t size) {
-  RAW_VLOG(2, "MUnmap of %p %"PRIuS"", ptr, size);
+  RAW_VLOG(10, "MUnmap of %p %"PRIuS"", ptr, size);
   if (size != 0) {
     RecordRegionRemoval(ptr, size);
   }
@@ -617,7 +617,7 @@ void MemoryRegionMap::MremapHook(const void* result,
                                  const void* old_addr, size_t old_size,
                                  size_t new_size, int flags,
                                  const void* new_addr) {
-  RAW_VLOG(2, "MRemap = 0x%"PRIxPTR" of 0x%"PRIxPTR" %"PRIuS" "
+  RAW_VLOG(10, "MRemap = 0x%"PRIxPTR" of 0x%"PRIxPTR" %"PRIuS" "
               "to %"PRIuS" flags %d new_addr=0x%"PRIxPTR,
               (uintptr_t)result, (uintptr_t)old_addr,
                old_size, new_size, flags,
@@ -631,7 +631,7 @@ void MemoryRegionMap::MremapHook(const void* result,
 extern "C" void* __sbrk(ptrdiff_t increment);  // defined in libc
 
 void MemoryRegionMap::SbrkHook(const void* result, ptrdiff_t increment) {
-  RAW_VLOG(2, "Sbrk = 0x%"PRIxPTR" of %"PRIdS"", (uintptr_t)result, increment);
+  RAW_VLOG(10, "Sbrk = 0x%"PRIxPTR" of %"PRIdS"", (uintptr_t)result, increment);
   if (result != reinterpret_cast<void*>(-1)) {
     if (increment > 0) {
       void* new_end = sbrk(0);
diff --git a/src/pprof b/src/pprof
index fec0c9e..f8b20a4 100755
--- a/src/pprof
+++ b/src/pprof
@@ -89,6 +89,7 @@ my %obj_tool_map = (
 );
 my $DOT = "dot";          # leave non-absolute, since it may be in /usr/local
 my $GV = "gv";
+my $KCACHEGRIND = "kcachegrind";
 my $PS2PDF = "ps2pdf";
 # These are used for dynamic profiles
 my $WGET = "wget";
@@ -332,11 +333,11 @@ sub Init() {
 
   # Type of profile we are dealing with
   # Supported types:
-  #	cpu
-  #	heap
-  #	growth
-  #	contention
-  $main::profile_type = '';	# Empty type means "unknown"
+  #     cpu
+  #     heap
+  #     growth
+  #     contention
+  $main::profile_type = '';     # Empty type means "unknown"
 
   GetOptions("help!"          => \$main::opt_help,
              "version!"       => \$main::opt_version,
@@ -380,8 +381,8 @@ sub Init() {
              "tools=s"        => \$main::opt_tools,
              "test!"          => \$main::opt_test,
              "debug!"         => \$main::opt_debug,
-	     # Undocumented flags used only by unittests:
-	     "test_stride=i"  => \$main::opt_test_stride,
+             # Undocumented flags used only by unittests:
+             "test_stride=i"  => \$main::opt_test_stride,
       ) || usage("Invalid option(s)");
 
   # Deal with the standard --help and --version
@@ -634,7 +635,7 @@ sub Main() {
     } else {
       if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
         if ($main::opt_gv) {
-	  RunGV(PsTempName($main::next_tmpfile), "");
+          RunGV(PsTempName($main::next_tmpfile), "");
         }
       } else {
         exit(1);
@@ -667,7 +668,7 @@ sub ReadlineMightFail {
 
 sub RunGV {
   my $fname = shift;
-  my $bg = shift;	# "" or " &" if we should run in background
+  my $bg = shift;       # "" or " &" if we should run in background
   if (!system("$GV --version >/dev/null 2>&1")) {
     # Options using double dash are supported by this gv version.
     # Also, turn on noantialias to better handle bug in gv for
@@ -682,6 +683,13 @@ sub RunGV {
   }
 }
 
+sub RunKcachegrind {
+  # Report the command on STDERR, then run kcachegrind on the given file.
+  my ($fname, $bg) = @_;  # $bg: "" or " &" if we should run in background
+  print STDERR "Starting '$KCACHEGRIND " . $fname . $bg . "'\n";
+  system("$KCACHEGRIND " . $fname . $bg);
+}
+
 
 ##### Interactive helper routines #####
 
@@ -689,10 +697,11 @@ sub InteractiveMode {
   $| = 1;  # Make output unbuffered for interactive mode
   my ($orig_profile, $symbols, $libs, $total) = @_;
 
-  print "Welcome to pprof!  For help, type 'help'.\n";
+  print STDERR "Welcome to pprof!  For help, type 'help'.\n";
 
-  # Use ReadLine if it's installed.
-  if ( !ReadlineMightFail() &&
+  # Use ReadLine if it's installed and input comes from a console.
+  if ( -t STDIN &&
+       !ReadlineMightFail() &&
        defined(eval {require Term::ReadLine}) ) {
     my $term = new Term::ReadLine 'pprof';
     while ( defined ($_ = $term->readline('(pprof) '))) {
@@ -703,7 +712,7 @@ sub InteractiveMode {
     }
   } else {       # don't have readline
     while (1) {
-      print "(pprof) ";
+      print STDERR "(pprof) ";
       $_ = <STDIN>;
       last if ! defined $_ ;
       s/\r//g;         # turn windows-looking lines into unix-looking lines
@@ -727,7 +736,7 @@ sub InteractiveCommand {
   my($orig_profile, $symbols, $libs, $total, $command) = @_;
   $_ = $command;                # just to make future m//'s easier
   if (!defined($_)) {
-    print "\n";
+    print STDERR "\n";
     return 0;
   }
   if (m/^ *quit/) {
@@ -764,6 +773,23 @@ sub InteractiveCommand {
     PrintText($symbols, $flat, $cumulative, $total, $line_limit);
     return 1;
   }
+  if (m/^ *callgrind *([^ \n]*)/) {
+    $main::opt_callgrind = 1;
+
+    # Get derived profiles
+    my $calls = ExtractCalls($symbols, $orig_profile);
+    my $filename = $1;
+    if ( $1 eq '' ) {
+      $filename = CallgrindTempName($main::next_tmpfile);
+    }
+    PrintCallgrind($calls, $filename);
+    if ( $1 eq '' ) {
+      RunKcachegrind($filename, " & ");
+      $main::next_tmpfile++;
+    }
+
+    return 1;
+  }
   if (m/^ *list *(.+)/) {
     $main::opt_list = 1;
 
@@ -856,7 +882,7 @@ sub ProcessProfile {
 }
 
 sub InteractiveHelpMessage {
-  print <<ENDOFHELP;
+  print STDERR <<ENDOFHELP;
 Interactive pprof mode
 
 Commands:
@@ -882,6 +908,10 @@ Commands:
       Show disassembly of routines whose names match "routine_regexp",
       annotated with sample counts.
 
+  callgrind
+  callgrind [filename]
+      Generates callgrind file. If no filename is given, kcachegrind is called.
+
   help - This listing
   quit or ^D - End pprof
 
@@ -913,7 +943,7 @@ sub ParseInteractiveArgs {
     }
   }
   if ($ignore ne "") {
-    print "Ignoring samples in call stacks that match '$ignore'\n";
+    print STDERR "Ignoring samples in call stacks that match '$ignore'\n";
   }
   return ($focus, $ignore);
 }
@@ -925,6 +955,11 @@ sub PsTempName {
   return "$main::tmpfile_ps" . "." . "$fnum" . ".ps";
 }
 
+sub CallgrindTempName {
+  my ($fnum) = @_;  # number embedded in the temp-file name
+  return $main::tmpfile_ps . "." . $fnum . ".callgrind";
+}
+
 # Print profile data in packed binary format (64-bit) to standard out
 sub PrintProfileData {
   my $profile = shift;
@@ -1045,7 +1080,15 @@ sub PrintText {
 # Print the call graph in a way that's suiteable for callgrind.
 sub PrintCallgrind {
   my $calls = shift;
-  printf("events: Hits\n\n");
+  my $filename;
+  if ($main::opt_interactive) {
+    $filename = shift;
+    print STDERR "Writing callgrind file to '$filename'.\n"
+  } else {
+    $filename = "&STDOUT";
+  }
+  open(CG, ">".$filename );
+  printf CG ("events: Hits\n\n");
   foreach my $call ( map { $_->[0] }
                      sort { $a->[1] cmp $b ->[1] ||
                             $a->[2] <=> $b->[2] }
@@ -1057,13 +1100,15 @@ sub PrintCallgrind {
     my ( $caller_file, $caller_line, $caller_function,
          $callee_file, $callee_line, $callee_function ) =
        ( $1, $2, $3, $5, $6, $7 );
-    printf("fl=$caller_file\nfn=$caller_function\n");
+
+      
+    printf CG ("fl=$caller_file\nfn=$caller_function\n");
     if (defined $6) {
-      printf("cfl=$callee_file\n");
-      printf("cfn=$callee_function\n");
-      printf("calls=$count $callee_line\n");
+      printf CG ("cfl=$callee_file\n");
+      printf CG ("cfn=$callee_function\n");
+      printf CG ("calls=$count $callee_line\n");
     }
-    printf("$caller_line $count\n\n");
+    printf CG ("$caller_line $count\n\n");
   }
 }
 
@@ -1385,7 +1430,7 @@ sub SourceLine {
       return undef;
     }
     my $lines = [];
-    push(@{$lines}, "");	# So we can use 1-based line numbers as indices
+    push(@{$lines}, "");        # So we can use 1-based line numbers as indices
     while (<FILE>) {
       push(@{$lines}, $_);
     }
@@ -1477,8 +1522,8 @@ sub PrintDisassembledFunction {
     # Find run of instructions for this range of source lines
     my $first_inst = $i;
     while (($i <= $#instructions) &&
-	   ($instructions[$i]->[2] >= $first_line) &&
-	   ($instructions[$i]->[2] <= $last_line)) {
+           ($instructions[$i]->[2] >= $first_line) &&
+           ($instructions[$i]->[2] <= $last_line)) {
       $e = $instructions[$i];
       $flat_sum{$e->[2]} += $flat_count[$i];
       $cum_sum{$e->[2]} += $cum_count[$i];
@@ -1490,16 +1535,16 @@ sub PrintDisassembledFunction {
     for (my $l = $first_line; $l <= $last_line; $l++) {
       my $line = SourceLine($current_file, $l);
       if (!defined($line)) {
-	$line = "?\n";
+        $line = "?\n";
         next;
       } else {
         $line =~ s/^\s+//;
       }
       printf("%6s %6s %5d: %s",
-	     UnparseAlt($flat_sum{$l}),
-	     UnparseAlt($cum_sum{$l}),
-	     $l,
-	     $line);
+             UnparseAlt($flat_sum{$l}),
+             UnparseAlt($cum_sum{$l}),
+             $l,
+             $line);
     }
 
     # Print disassembly
@@ -1516,9 +1561,9 @@ sub PrintDisassembledFunction {
       while ($d =~ s/(\w+)<[^<>]*>/$1/g)  { }       # Remove template arguments
 
       printf("%6s %6s    %8s: %6s\n",
-	     UnparseAlt($flat_count[$x]),
-	     UnparseAlt($cum_count[$x]),
-	     $address,
+             UnparseAlt($flat_count[$x]),
+             UnparseAlt($cum_count[$x]),
+             $address,
              $d);
     }
   }
@@ -1542,7 +1587,7 @@ sub PrintDot {
   # Find nodes to include
   my @list = (sort { abs(GetEntry($cumulative, $b)) <=>
                      abs(GetEntry($cumulative, $a))
-		     || $a cmp $b }
+                     || $a cmp $b }
               keys(%{$cumulative}));
   my $last = $nodecount - 1;
   if ($last > $#list) {
@@ -1806,7 +1851,7 @@ sub Unparse {
       }
     }
   } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
-    return sprintf("%.3f", $num / 1e9);	# Convert nanoseconds to seconds
+    return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
   } else {
     return sprintf("%d", $num);
   }
@@ -1947,42 +1992,42 @@ sub RemoveUninterestingFrames {
                       'malloc',
                       'free',
                       'memalign',
-		      'posix_memalign',
+                      'posix_memalign',
                       'pvalloc',
                       'valloc',
                       'realloc',
-		      'tc_calloc',
+                      'tc_calloc',
                       'tc_cfree',
                       'tc_malloc',
                       'tc_free',
                       'tc_memalign',
-		      'tc_posix_memalign',
+                      'tc_posix_memalign',
                       'tc_pvalloc',
                       'tc_valloc',
                       'tc_realloc',
-		      'tc_new',
-		      'tc_delete',
-		      'tc_newarray',
-		      'tc_deletearray',
-		      'tc_new_nothrow',
-		      'tc_newarray_nothrow',
-		      'do_malloc',
+                      'tc_new',
+                      'tc_delete',
+                      'tc_newarray',
+                      'tc_deletearray',
+                      'tc_new_nothrow',
+                      'tc_newarray_nothrow',
+                      'do_malloc',
                       '::do_malloc',   # new name -- got moved to an unnamed ns
                       '::do_malloc_or_cpp_alloc',
                       'DoSampledAllocation',
-		      'simple_alloc::allocate',
-		      '__malloc_alloc_template::allocate',
+                      'simple_alloc::allocate',
+                      '__malloc_alloc_template::allocate',
                       '__builtin_delete',
                       '__builtin_new',
                       '__builtin_vec_delete',
                       '__builtin_vec_new',
                       'operator new',
                       'operator new[]',
-		      # These mark the beginning/end of our custom sections
-		      '__start_google_malloc',
-		      '__stop_google_malloc',
-		      '__start_malloc_hook',
-		      '__stop_malloc_hook') {
+                      # These mark the beginning/end of our custom sections
+                      '__start_google_malloc',
+                      '__stop_google_malloc',
+                      '__start_malloc_hook',
+                      '__stop_malloc_hook') {
       $skip{$name} = 1;
       $skip{"_" . $name} = 1;   # Mach (OS X) adds a _ prefix to everything
     }
@@ -1999,11 +2044,11 @@ sub RemoveUninterestingFrames {
     # TODO(dpeng): this should not be necessary; it's taken
     # care of by the general 2nd-pc mechanism below.
     foreach my $name ('ProfileData::Add',           # historical
-		      'ProfileData::prof_handler',  # historical
-		      'CpuProfiler::prof_handler',
+                      'ProfileData::prof_handler',  # historical
+                      'CpuProfiler::prof_handler',
                       '__FRAME_END__',
-		      '__pthread_sighandler',
-		      '__restore') {
+                      '__pthread_sighandler',
+                      '__restore') {
       $skip{$name} = 1;
     }
   } else {
@@ -2042,10 +2087,10 @@ sub RemoveUninterestingFrames {
     my @path = ();
     foreach my $a (@addrs) {
       if (exists($symbols->{$a})) {
-	my $func = $symbols->{$a}->[0];
-	if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
-	  next;
-	}
+        my $func = $symbols->{$a}->[0];
+        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
+          next;
+        }
       }
       push(@path, $a);
     }
@@ -2070,8 +2115,8 @@ sub ReduceProfile {
       # To avoid double-counting due to recursion, skip a stack-trace
       # entry if it has already been seen
       if (!$seen{$e}) {
-	$seen{$e} = 1;
-	push(@path, $e);
+        $seen{$e} = 1;
+        push(@path, $e);
       }
     }
     my $reduced_path = join("\n", @path);
@@ -2404,7 +2449,6 @@ sub FetchSymbols {
   my $pcset = shift;
   my $symbol_map = shift;
 
-
   my %seen = ();
   my @pcs = grep { !$seen{$_}++ } keys(%$pcset);  # uniq
 
@@ -2414,7 +2458,7 @@ sub FetchSymbols {
     open(POSTFILE, ">$main::tmpfile_sym");
     print POSTFILE $post_data;
     close(POSTFILE);
- 
+
     my $url = SymbolPageURL();
     # Here we use curl for sending data via POST since old
     # wget doesn't have --post-file option.
@@ -2517,7 +2561,7 @@ sub FetchDynamicProfile {
     my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof");
     if (!(-d $profile_dir)) {
       mkdir($profile_dir)
-	  || die("Unable to create profile directory $profile_dir: $!\n");
+          || die("Unable to create profile directory $profile_dir: $!\n");
     }
     my $tmp_profile = "$profile_dir/.tmp.$profile_file";
     my $real_profile = "$profile_dir/$profile_file";
@@ -2603,22 +2647,69 @@ sub TryCollectProfile {
 
 # Provide a small streaming-read module to handle very large
 # cpu-profile files.  Stream in chunks along a sliding window.
+# Provides an interface to get one 'slot', correctly handling
+# endian-ness differences.  A slot is one 32-bit or 64-bit word
+# (depending on the input profile).  We tell endianness and bit-size
+# for the profile by looking at the first 8 bytes: in cpu profiles,
+# the second slot is always 3 (we'll accept anything that's not 0).
 BEGIN {
   package CpuProfileStream;
 
   sub new {
-    my ($class, $file) = @_;
-    my $self = { file      => $file,
-                 base      => 0,
-                 stride    => 512 * 1024,     # must be a multiple of |long|
-                 slots     => []
+    my ($class, $file, $fname) = @_;
+    my $self = { file        => $file,
+                 base        => 0,
+                 stride      => 512 * 1024,   # must be a multiple of bitsize/8
+                 slots       => [],
+                 unpack_code => "",           # N for big-endian, V for little
     };
     bless $self, $class;
     # Let unittests adjust the stride
     if ($main::opt_test_stride > 0) {
       $self->{stride} = $main::opt_test_stride;
     }
-    $self->overflow();
+    # Read the first two slots to figure out bitsize and endianness.
+    my $slots = $self->{slots};
+    my $str;
+    read($self->{file}, $str, 8);
+    # Set the global $address_length based on what we see here.
+    # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars).
+    $address_length = ($str eq (chr(0)x8)) ? 16 : 8;
+    if ($address_length == 8) {
+      if (substr($str, 6, 2) eq chr(0)x2) {
+        $self->{unpack_code} = 'V';  # Little-endian.
+      } elsif (substr($str, 4, 2) eq chr(0)x2) {
+        $self->{unpack_code} = 'N';  # Big-endian
+      } else {
+        ::error("$fname: header size >= 2**16\n");
+      }
+      @$slots = unpack($self->{unpack_code} . "*", $str);
+    } else {
+      # If we're a 64-bit profile, make sure we're a 64-bit-capable
+      # perl.  Otherwise, each slot will be represented as a float
+      # instead of an int64, losing precision and making all the
+      # 64-bit addresses wrong.  We *could* try to handle this with
+      # software emulation of 64-bit ints, but that's added complexity
+      # for no clear benefit (yet).  We use 'Q' to test for 64-bit-ness;
+      # perl docs say it's only available on 64-bit perl systems.
+      my $has_q = 0;
+      eval { $has_q = pack("Q", "1") ? 1 : 1; };
+      if (!$has_q) {
+        ::error("$fname: need a 64-bit perl to process this 64-bit profile.\n");
+      }
+      read($self->{file}, $str, 8);
+      if (substr($str, 4, 4) eq chr(0)x4) {
+        # We'd love to use 'Q', but it's a) not universal, b) not endian-proof.
+        $self->{unpack_code} = 'V';  # Little-endian.
+      } elsif (substr($str, 0, 4) eq chr(0)x4) {
+        $self->{unpack_code} = 'N';  # Big-endian
+      } else {
+        ::error("$fname: header size >= 2**32\n");
+      }
+      my @pair = unpack($self->{unpack_code} . "*", $str);
+      # Since we know one of the pair is 0, it's fine to just add them.
+      @$slots = (0, $pair[0] + $pair[1]);
+    }
     return $self;
   }
 
@@ -2629,7 +2720,25 @@ BEGIN {
     $self->{base} += $#$slots + 1;   # skip over data we're replacing
     my $str;
     read($self->{file}, $str, $self->{stride});
-    @$slots = unpack("L*", $str);
+    if ($address_length == 8) {      # the 32-bit case
+      # This is the easy case: unpack provides 32-bit unpacking primitives.
+      @$slots = unpack($self->{unpack_code} . "*", $str);
+    } else {
+      # We need to unpack 32 bits at a time and combine.
+      my @b32_values = unpack($self->{unpack_code} . "*", $str);
+      my @b64_values = ();
+      for (my $i = 0; $i < $#b32_values; $i += 2) {
+        # TODO(csilvers): if this is a 32-bit perl, the math below
+        #    could end up in a too-large int, which perl will promote
+        #    to a double, losing necessary precision.  Deal with that.
+        if ($self->{unpack_code} eq 'V') {    # little-endian
+          push(@b64_values, $b32_values[$i] + $b32_values[$i+1] * (2**32));
+        } else {
+          push(@b64_values, $b32_values[$i] * (2**32) + $b32_values[$i+1]);
+        }
+      }
+      @$slots = @b64_values;
+    }
   }
 
   # Access the i-th long in the file (logically), or -1 at EOF.
@@ -2638,16 +2747,16 @@ BEGIN {
     my $slots = $self->{slots};
     while ($#$slots >= 0) {
       if ($idx < $self->{base}) {
-	# The only time we expect a reference to $slots[$i - something]
-	# after referencing $slots[$i] is reading the very first header.
-	# Since $stride > |header|, that shouldn't cause any lookback
-	# errors.  And everything after the header is sequential.
-	print STDERR "Unexpected look-back reading CPU profile";
-	return -1;   # shrug, don't know what better to return
+        # The only time we expect a reference to $slots[$i - something]
+        # after referencing $slots[$i] is reading the very first header.
+        # Since $stride > |header|, that shouldn't cause any lookback
+        # errors.  And everything after the header is sequential.
+        print STDERR "Unexpected look-back reading CPU profile";
+        return -1;   # shrug, don't know what better to return
       } elsif ($idx > $self->{base} + $#$slots) {
-	$self->overflow();
+        $self->overflow();
       } else {
-	return $slots->[$idx - $self->{base}];
+        return $slots->[$idx - $self->{base}];
       }
     }
     # If we get here, $slots is [], which means we've reached EOF
@@ -2752,6 +2861,33 @@ sub ReadProfile {
   return $result;
 }
 
+# Subtract one from caller pc so we map back to call instr.
+# However, don't do this if we're reading a symbolized profile
+# file, in which case the subtract-one was done when the file
+# was written.
+#
+# $stack is a whitespace-separated list of addresses.  The first
+# address is returned untouched; each later one is passed through
+# AddressSub(..., "0x1").  The pieces are re-joined with the first
+# whitespace character found in $stack.
+#
+# We apply the same logic to all readers, though ReadCPUProfile uses an
+# independent implementation.
+sub FixCallerAddresses {
+  my $stack = shift;
+  return $stack if $main::use_symbolized_profile;
+
+  # Check the match result: after a failed match, $1 would still hold a
+  # capture left over from an earlier successful match.
+  my $delimiter = ($stack =~ /(\s)/) ? $1 : ' ';
+  my @addrs = split(' ', $stack);
+  my @fixedaddrs;
+  for (my $i = 0; $i <= $#addrs; $i++) {
+    $fixedaddrs[$i] = ($i == 0) ? $addrs[$i] : AddressSub($addrs[$i], "0x1");
+  }
+  return join($delimiter, @fixedaddrs);
+}
+
 # CPU profile reader
 sub ReadCPUProfile {
   my $prog = shift;
@@ -2763,10 +2899,7 @@ sub ReadCPUProfile {
   my $pcs = {};
 
   # Parse string into array of slots.
-  # L! cannot be used because with a native 64-bit build, it will cause
-  # 1) a valid 64-bit profile to use the 32-bit codepath, and
-  # 2) a valid 32-bit profile to be unrecognized.
-  my $slots = CpuProfileStream->new(*PROFILE);
+  my $slots = CpuProfileStream->new(*PROFILE, $fname);
 
   # Read header.  The current header version is a 5-element structure
   # containing:
@@ -2775,108 +2908,50 @@ sub ReadCPUProfile {
   #   2: format version (0)
   #   3: sampling period (usec)
   #   4: unused padding (always 0)
-  # The header words are 32-bit or 64-bit depending on the ABI of the program
-  # that generated the profile.  In the 64-bit case, since our x86-architecture
-  # machines are little-endian, the actual value of each of these elements is
-  # in the first 32-bit word, and the second is always zero.  The @slots array
-  # above was read as a sequence of 32-bit words in both cases, so we need to
-  # explicitly check for both cases.  A typical slot sequence for each is:
-  #   32-bit:  0 3 0 100 0
-  #   64-bit:  0 0  3 0  0 0  100 0  0 0
-  #
   if ($slots->get(0) != 0 ) {
     error("$fname: not a profile file, or old format profile file\n");
   }
-  if ($slots->get(1) >= 3) {
-    # Normal 32-bit header:
-    $version = $slots->get(2);
-    $period = $slots->get(3);
-    $i = 2 + $slots->get(1);
-    $address_length = 8;
-
-    # Parse profile
-    while ($slots->get($i) != -1) {
-      my $n = $slots->get($i++);
-      my $d = $slots->get($i++);
-      if ($slots->get($i) == 0) {
-        # End of profile data marker
-        $i += $d;
-        last;
-      }
-
-      # Make key out of the stack entries
-      my @k = ();
-      for (my $j = 0; $j < $d; $j++) {
-        my $pc = sprintf("%08x", $slots->get($i+$j));
-        $pcs->{$pc} = 1;
-        push @k, $pc;
-      }
-
-      AddEntry($profile, (join "\n", @k), $n);
+  $i = 2 + $slots->get(1);
+  $version = $slots->get(2);
+  $period = $slots->get(3);
+  # Do some sanity checking on these header values.
+  if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) {
+    error("$fname: not a profile file, or corrupted profile file\n");
+  }
+
+  # Parse profile
+  while ($slots->get($i) != -1) {
+    my $n = $slots->get($i++);
+    my $d = $slots->get($i++);
+    if ($d > (2**16)) {  # TODO(csilvers): what's a reasonable max-stack-depth?
+      my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8));
+      print STDERR "At index $i (address $addr):\n";
+      error("$fname: stack trace depth >= 2**32\n");
+    }
+    if ($slots->get($i) == 0) {
+      # End of profile data marker
       $i += $d;
+      last;
     }
 
-  # Normal 64-bit header:  All entries are doubled in size.  The first
-  # word (little-endian) should contain the real value, the second should
-  # be zero.
-  } elsif ($slots->get(1) != 0 ||
-	   $slots->get(2) < 3 ||
-	   $slots->get(3) != 0 ||
-	   $slots->get(5) != 0 ||
-	   $slots->get(7) != 0) {
-    error("$fname: not a profile file, or old format profile file\n");
-  } else {
-    $version = $slots->get(4);
-    $period = $slots->get(6);
-    $i = 4 + 2 * $slots->get(2);
-    $address_length = 16;
-
-    # Parse profile
-    while ($slots->get($i) != -1) {
-      my $n = $slots->get($i++);
-      my $nhi = $slots->get($i++);
-      # Huge counts may coerce to floating point, keeping scale, not precision
-      if ($nhi != 0) { $n += $nhi*(2**32); }
-      my $d = $slots->get($i++);
-      if ($slots->get($i++) != 0) {
-        my $addr = sprintf("%o", 4 * $i);
-        print STDERR "At index $i ($addr):\n";
-        error("$fname: stack trace depth >= 2**32\n");
-      }
-      if ($slots->get($i) == 0 && $slots->get($i+1) == 0) {
-        # End of profile data marker
-        $i += 2 * $d;
-        last;
-      }
-
-      # Make key out of the stack entries
-      my @k = ();
-      for (my $j = 0; $j < $d; $j++) {
-        my $pclo = $slots->get($i++);
-        my $pchi = $slots->get($i++);
-        if ($pclo == -1 || $pchi == -1) {
-          error("$fname: Unexpected EOF when reading stack of depth $d\n");
-        }
-
-	# Subtract one from caller pc so we map back to call instr.
-        # However, don't do this if we're reading a symbolized profile
-        # file, in which case the subtract-one was done when the file
-        # was written.
-        if ($j > 0 && !$main::use_symbolized_profile) {
-          if ($pclo == 0) {
-            $pchi--;
-            $pclo = 0xffffffff;
-          } else {
-            $pclo--;
-          }
-        }
-
-        my $pc = sprintf("%08x%08x", $pchi, $pclo);
-        $pcs->{$pc} = 1;
-        push @k, $pc;
+    # Make key out of the stack entries
+    my @k = ();
+    for (my $j = 0; $j < $d; $j++) {
+      my $pc = $slots->get($i+$j);
+      # Subtract one from caller pc so we map back to call instr.
+      # However, don't do this if we're reading a symbolized profile
+      # file, in which case the subtract-one was done when the file
+      # was written.
+      if ($j > 0 && !$main::use_symbolized_profile) {
+        $pc--;
       }
-      AddEntry($profile, (join "\n", @k), $n);
+      $pc = sprintf("%0*x", $address_length, $pc);
+      $pcs->{$pc} = 1;
+      push @k, $pc;
     }
+
+    AddEntry($profile, (join "\n", @k), $n);
+    $i += $d;
   }
 
   # Parse map
@@ -2947,18 +3022,18 @@ sub ReadHeapProfile {
       # found for profiles generated locally, and the others for
       # remote profiles.
       if (($type eq "heapprofile") || ($type !~ /heap/) ) {
-	# No need to adjust for the sampling rate with heap-profiler-derived data
-	$sampling_algorithm = 0;
+        # No need to adjust for the sampling rate with heap-profiler-derived data
+        $sampling_algorithm = 0;
       } elsif ($type =~ /_v2/) {
-	$sampling_algorithm = 2;     # version 2 sampling
+        $sampling_algorithm = 2;     # version 2 sampling
         if (defined($sample_period) && ($sample_period ne '')) {
-	  $sample_adjustment = int($sample_period);
-	}
+          $sample_adjustment = int($sample_period);
+        }
       } else {
-	$sampling_algorithm = 1;     # version 1 sampling
+        $sampling_algorithm = 1;     # version 1 sampling
         if (defined($sample_period) && ($sample_period ne '')) {
-	  $sample_adjustment = int($sample_period)/2;
-	}
+          $sample_adjustment = int($sample_period)/2;
+        }
       }
     } else {
       # We detect whether or not this is a remote-heap profile by checking
@@ -2970,7 +3045,7 @@ sub ReadHeapProfile {
       my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
       if (($n1 == $n2) && ($s1 == $s2)) {
         # This is likely to be a remote-heap based sample profile
-	$sampling_algorithm = 1;
+        $sampling_algorithm = 1;
       }
     }
   }
@@ -2984,7 +3059,7 @@ sub ReadHeapProfile {
       print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
     } else {
       printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
-		     $sample_adjustment);
+                     $sample_adjustment);
     }
     if ($sampling_algorithm > 1) {
       # We don't bother printing anything for the original version (version 1)
@@ -3001,7 +3076,7 @@ sub ReadHeapProfile {
     if (/^MAPPED_LIBRARIES:/) {
       # Read the /proc/self/maps data
       while (<PROFILE>) {
-	s/\r//g;         # turn windows-looking lines into unix-looking lines
+        s/\r//g;         # turn windows-looking lines into unix-looking lines
         $map .= $_;
       }
       last;
@@ -3011,7 +3086,7 @@ sub ReadHeapProfile {
       # Read /proc/self/maps data as formatted by DumpAddressMap()
       my $buildvar = "";
       while (<PROFILE>) {
-	s/\r//g;         # turn windows-looking lines into unix-looking lines
+        s/\r//g;         # turn windows-looking lines into unix-looking lines
         # Parse "build=<dir>" specification if supplied
         if (m/^\s*build=(.*)\n/) {
           $buildvar = $1;
@@ -3066,7 +3141,7 @@ sub ReadHeapProfile {
       }
 
       my @counts = ($n1, $s1, $n2, $s2);
-      AddEntries($profile, $pcs, $stack, $counts[$index]);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
     }
   }
 
@@ -3086,7 +3161,7 @@ sub ReadSynchProfile {
   my $profile = {};
   my $pcs = {};
   my $sampling_period = 1;
-  my $cyclespernanosec = 2.8;	# Default assumption for old binaries
+  my $cyclespernanosec = 2.8;   # Default assumption for old binaries
   my $seen_clockrate = 0;
   my $line;
 
@@ -3112,7 +3187,7 @@ sub ReadSynchProfile {
       $count *= $sampling_period;
 
       my @values = ($cycles, $count, $cycles / $count);
-      AddEntries($profile, $pcs, $stack, $values[$index]);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]);
 
     } elsif ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
               $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
@@ -3127,7 +3202,7 @@ sub ReadSynchProfile {
       # Adjust for sampling done by application
       $cycles *= $sampling_period;
 
-      AddEntries($profile, $pcs, $stack, $cycles);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles);
 
     } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) {
       my ($variable, $value) = ($1,$2);
@@ -3308,8 +3383,8 @@ sub ParseTextSectionHeaderFromOtool {
     } elsif ($line =~ /segname (\w+)/) {
       $segname = $1;
     } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") &&
-	       $sectname eq "__text" &&
-	       $segname eq "__TEXT")) {
+               $sectname eq "__text" &&
+               $segname eq "__TEXT")) {
       next;
     } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) {
       $vma = $1;
@@ -3369,7 +3444,7 @@ sub ParseLibraries {
     my $finish;
     my $offset;
     my $lib;
-    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*)?)$/i) {
+    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
       # Full line from /proc/self/maps.  Example:
       #   40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so
       $start = HexExtend($1);
@@ -3675,7 +3750,7 @@ sub MapToSymbols {
   if ($debug) { print("---- $image ---\n"); }
   for (my $i = 0; $i <= $#{$pclist}; $i++) {
     # addr2line always reads hex addresses, and does not need '0x' prefix.
-    if ($debug) { printf("%s\n", $pclist->[$i]); }
+    if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); }
     printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
     if (defined($sep_address)) {
       printf ADDRESSES ("%s\n", $sep_address);
@@ -3727,7 +3802,7 @@ sub MapToSymbols {
       $symbols->{$pcstr} = $sym;
     }
     unshift(@{$sym}, $function, $filelinenum, $fullfunction);
-    if ($debug) { printf("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
+    if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
     if (!defined($sep_address)) {
       # Inlining is off, se this entry ends immediately
       $count++;
@@ -3783,7 +3858,7 @@ sub MapSymbolsWithNM {
   }
   return 1;
 }
- 
+
 sub ShortFunctionName {
   my $function = shift;
   while ($function =~ s/\([^()]*\)(\s*const)?//g) { }   # Argument types
@@ -3942,12 +4017,12 @@ sub GetProcedureBoundariesViaNm {
       # we'll just go ahead and process the first entry (which never
       # got touched in the queue), and ignore the others.
       if ($start_val eq $last_start && $type =~ /t/i) {
-	# We are the 'T' symbol at this address, replace previous symbol.
-	$routine = $this_routine;
-	next;
+        # We are the 'T' symbol at this address, replace previous symbol.
+        $routine = $this_routine;
+        next;
       } elsif ($start_val eq $last_start) {
-	# We're not the 'T' symbol at this address, so ignore us.
-	next;
+        # We're not the 'T' symbol at this address, so ignore us.
+        next;
       }
 
       if ($this_routine eq $sep_symbol) {
@@ -3962,7 +4037,7 @@ sub GetProcedureBoundariesViaNm {
 
       if (defined($routine) && $routine =~ m/$regexp/) {
         $symbol_table->{$routine} = [HexExtend($last_start),
-				     HexExtend($start_val)];
+                                     HexExtend($start_val)];
       }
       $last_start = $start_val;
       $routine = $this_routine;
@@ -3981,7 +4056,7 @@ sub GetProcedureBoundariesViaNm {
   # TODO(csilvers): do better here.
   if (defined($routine) && $routine =~ m/$regexp/) {
     $symbol_table->{$routine} = [HexExtend($last_start),
-				 HexExtend($last_start)];
+                                 HexExtend($last_start)];
   }
 
   return $symbol_table;
@@ -4029,9 +4104,9 @@ sub GetProcedureBoundaries {
   # -D to at least get *exported* symbols.  If we can't use --demangle,
   # we use c++filt instead, if it exists on this system.
   my @nm_commands = ("$nm -n $flatten_flag $demangle_flag" .
-		     " $image 2>/dev/null $cppfilt_flag",
-		     "$nm -D -n $flatten_flag $demangle_flag" .
-		     " $image 2>/dev/null $cppfilt_flag");
+                     " $image 2>/dev/null $cppfilt_flag",
+                     "$nm -D -n $flatten_flag $demangle_flag" .
+                     " $image 2>/dev/null $cppfilt_flag");
   # If the executable is an MS Windows PDB-format executable, we'll
   # have set up obj_tool_map("nm_pdb").  In this case, we actually
   # want to use both unix nm and windows-specific nm_pdb, since
@@ -4263,4 +4338,3 @@ sub RunUnitTests {
   }
   exit ($error_count);
 }
-
diff --git a/src/stacktrace.cc b/src/stacktrace.cc
index d158eea..68cb865 100644
--- a/src/stacktrace.cc
+++ b/src/stacktrace.cc
@@ -57,7 +57,45 @@
 #include "stacktrace_config.h"
 
 #if defined(STACKTRACE_INL_HEADER)
-# include STACKTRACE_INL_HEADER
+
+#define IS_STACK_FRAMES 0
+#define IS_WITH_CONTEXT 0
+#define GET_STACK_TRACE_OR_FRAMES \
+   GetStackTrace(void **result, int max_depth, int skip_count)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 1
+#define IS_WITH_CONTEXT 0
+#define GET_STACK_TRACE_OR_FRAMES \
+  GetStackFrames(void **result, int *sizes, int max_depth, int skip_count)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 0
+#define IS_WITH_CONTEXT 1
+#define GET_STACK_TRACE_OR_FRAMES \
+  GetStackTraceWithContext(void **result, int max_depth, \
+                           int skip_count, const void *ucp)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 1
+#define IS_WITH_CONTEXT 1
+#define GET_STACK_TRACE_OR_FRAMES \
+  GetStackFramesWithContext(void **result, int *sizes, int max_depth, \
+                            int skip_count, const void *ucp)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
 #elif 0
 // This is for the benefit of code analysis tools that may have
 // trouble with the computed #include above.
diff --git a/src/stacktrace_generic-inl.h b/src/stacktrace_generic-inl.h
index 490cd9d..0e72ee7 100644
--- a/src/stacktrace_generic-inl.h
+++ b/src/stacktrace_generic-inl.h
@@ -34,57 +34,32 @@
 //
 // Note:  The glibc implementation may cause a call to malloc.
 // This can cause a deadlock in HeapProfiler.
+
+#ifndef BASE_STACKTRACE_GENERIC_INL_H_
+#define BASE_STACKTRACE_GENERIC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
 #include <execinfo.h>
 #include <string.h>
 #include "google/stacktrace.h"
+#endif  // BASE_STACKTRACE_GENERIC_INL_H_
 
-// If you change this function, also change GetStackFrames below.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
-  static const int kStackLength = 64;
-  void * stack[kStackLength];
-  int size;
-
-  size = backtrace(stack, kStackLength);
-  skip_count++;  // we want to skip the current frame as well
-  int result_count = size - skip_count;
-  if (result_count < 0)
-    result_count = 0;
-  if (result_count > max_depth)
-    result_count = max_depth;
-  for (int i = 0; i < result_count; i++)
-    result[i] = stack[i + skip_count];
-
-  return result_count;
-}
+// Note: this part of the file is included several times.
+// Do not put globals below.
 
-// If you change this function, also change GetStackTrace above:
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
-//
-// (1) The performance of the refactored-code suffers substantially - the
-//     refactored needs to be able to record the stack trace when called
-//     from GetStackTrace, and both the stack trace and stack frame sizes,
-//     when called from GetStackFrames - this introduces enough new
-//     conditionals that GetStackTrace performance can degrade by as much
-//     as 50%.
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
 //
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-//     GetStackFrames depends on the compiler, and we can't guarantee the
-//     behavior either-way, even with "__attribute__ ((always_inline))"
-//     or "__attribute__ ((noinline))". But we need this guarantee or the
-//     frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and  preprocessor trouble - for example,  if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
   static const int kStackLength = 64;
   void * stack[kStackLength];
   int size;
@@ -97,10 +72,12 @@ int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
   if (result_count > max_depth)
     result_count = max_depth;
   for (int i = 0; i < result_count; i++)
-    pcs[i] = stack[i + skip_count];
+    result[i] = stack[i + skip_count];
 
+#if IS_STACK_FRAMES
   // No implementation for finding out the stack frame sizes yet.
   memset(sizes, 0, sizeof(*sizes) * result_count);
+#endif
 
   return result_count;
 }
diff --git a/src/stacktrace_libunwind-inl.h b/src/stacktrace_libunwind-inl.h
index d9d829a..a1d5249 100644
--- a/src/stacktrace_libunwind-inl.h
+++ b/src/stacktrace_libunwind-inl.h
@@ -32,6 +32,11 @@
 //
 // Produce stack trace using libunwind
 
+#ifndef BASE_STACKTRACE_LIBUNWIND_INL_H_
+#define BASE_STACKTRACE_LIBUNWIND_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
 // We only need local unwinder.
 #define UNW_LOCAL_ONLY
 
@@ -52,73 +57,30 @@ extern "C" {
 // cases, we return 0 to indicate the situation.
 static __thread int recursive;
 
-// If you change this function, also change GetStackFrames below.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
-  void *ip;
-  int n = 0;
-  unw_cursor_t cursor;
-  unw_context_t uc;
+#endif  // BASE_STACKTRACE_LIBUNWIND_INL_H_
 
-  if (recursive) {
-    return 0;
-  }
-  ++recursive;
-
-  unw_getcontext(&uc);
-  int ret = unw_init_local(&cursor, &uc);
-  assert(ret >= 0);
-  skip_count++;         // Do not include the "GetStackTrace" frame
-
-  while (n < max_depth) {
-    if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
-      break;
-    }
-    if (skip_count > 0) {
-      skip_count--;
-    } else {
-      result[n++] = ip;
-    }
-    if (unw_step(&cursor) <= 0) {
-      break;
-    }
-  }
-  --recursive;
-  return n;
-}
+// Note: this part of the file is included several times.
+// Do not put globals below.
 
-// If you change this function, also change GetStackTrace above:
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
 //
-// (1) The performance of the refactored-code suffers substantially - the
-//     refactored needs to be able to record the stack trace when called
-//     from GetStackTrace, and both the stack trace and stack frame sizes,
-//     when called from GetStackFrames - this introduces enough new
-//     conditionals that GetStackTrace performance can degrade by as much
-//     as 50%.
-//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-//     GetStackFrames depends on the compiler, and we can't guarantee the
-//     behavior either-way, even with "__attribute__ ((always_inline))"
-//     or "__attribute__ ((noinline))". But we need this guarantee or the
-//     frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and  preprocessor trouble - for example,  if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
   void *ip;
   int n = 0;
   unw_cursor_t cursor;
   unw_context_t uc;
+#if IS_STACK_FRAMES
   unw_word_t sp = 0, next_sp = 0;
+#endif
 
   if (recursive) {
     return 0;
@@ -126,31 +88,41 @@ int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
   ++recursive;
 
   unw_getcontext(&uc);
-  RAW_CHECK(unw_init_local(&cursor, &uc) >= 0, "unw_init_local failed");
-  skip_count++;         // Do not include the "GetStackFrames" frame
+  int ret = unw_init_local(&cursor, &uc);
+  assert(ret >= 0);
+  skip_count++;         // Do not include current frame
 
   while (skip_count--) {
-    if (unw_step(&cursor) <= 0 ||
-        unw_get_reg(&cursor, UNW_REG_SP, &next_sp) < 0) {
+    if (unw_step(&cursor) <= 0) {
       goto out;
     }
+#if IS_STACK_FRAMES
+    if (unw_get_reg(&cursor, UNW_REG_SP, &next_sp)) {
+      goto out;
+    }
+#endif
   }
+
   while (n < max_depth) {
-    sp = next_sp;
-    if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0)
+    if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
       break;
-    if (unw_step(&cursor) <= 0 ||
-        unw_get_reg(&cursor, UNW_REG_SP, &next_sp)) {
-      // We couldn't step any further (possibly because we reached _start).
-      // Provide the last good PC we've got, and get out.
-      sizes[n] = 0;
-      pcs[n++] = ip;
+    }
+#if IS_STACK_FRAMES
+    sizes[n] = 0;
+#endif
+    result[n++] = ip;
+    if (unw_step(&cursor) <= 0) {
+      break;
+    }
+#if IS_STACK_FRAMES
+    sp = next_sp;
+    if (unw_get_reg(&cursor, UNW_REG_SP, &next_sp) < 0) {
       break;
     }
-    sizes[n] = next_sp - sp;
-    pcs[n++] = ip;
+    sizes[n - 1] = next_sp - sp;
+#endif
   }
- out:
+out:
   --recursive;
   return n;
 }
diff --git a/src/stacktrace_powerpc-inl.h b/src/stacktrace_powerpc-inl.h
index 5631e49..9a07eea 100644
--- a/src/stacktrace_powerpc-inl.h
+++ b/src/stacktrace_powerpc-inl.h
@@ -36,6 +36,11 @@
 //    http://www.linux-foundation.org/spec/ELF/ppc64/PPC-elf64abi-1.9.html#STACK
 // Linux has similar code: http://patchwork.ozlabs.org/linuxppc/patch?id=8882
 
+#ifndef BASE_STACKTRACE_POWERPC_INL_H_
+#define BASE_STACKTRACE_POWERPC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
 #include <stdint.h>   // for uintptr_t
 #include <stdlib.h>   // for NULL
 #include <google/stacktrace.h>
@@ -71,9 +76,23 @@ static void **NextStackFrame(void **old_sp) {
 // This ensures that GetStackTrace stes up the Link Register properly.
 void StacktracePowerPCDummyFunction() __attribute__((noinline));
 void StacktracePowerPCDummyFunction() { __asm__ volatile(""); }
+#endif  // BASE_STACKTRACE_POWERPC_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
 
-// If you change this function, also change GetStackFrames below.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
   void **sp;
   // Apple OS X uses an old version of gnu as -- both Darwin 7.9.0 (Panther)
   // and Darwin 8.8.1 (Tiger) use as 1.38.  This means we have to use a
@@ -95,11 +114,29 @@ int GetStackTrace(void** result, int max_depth, int skip_count) {
   // This routine forces the compiler (at least gcc) to push it anyway.
   StacktracePowerPCDummyFunction();
 
+#if IS_STACK_FRAMES
+  // Note we do *not* increment skip_count here for the SYSV ABI.  If
+  // we did, the list of stack frames wouldn't properly match up with
+  // the list of return addresses.  Note this means the top pc entry
+  // is probably bogus for linux/ppc (and other SYSV-ABI systems).
+#else
   // The LR save area is used by the callee, so the top entry is bogus.
   skip_count++;
+#endif
 
   int n = 0;
   while (sp && n < max_depth) {
+#if IS_STACK_FRAMES
+    // The GetStackFrames routine is called when we are in some
+    // informational context (the failure signal handler for example).
+    // Use the non-strict unwinding rules to produce a stack trace
+    // that is as complete as possible (even if it contains a few bogus
+    // entries in some rare cases).
+    void **next_sp = NextStackFrame<false>(sp);
+#else
+    void **next_sp = NextStackFrame<true>(sp);
+#endif
+
     if (skip_count > 0) {
       skip_count--;
     } else {
@@ -120,85 +157,15 @@ int GetStackTrace(void** result, int max_depth, int skip_count) {
 #else
 #error Need to specify the PPC ABI for your archiecture.
 #endif
-    }
-    // Use strict unwinding rules.
-    sp = NextStackFrame<true>(sp);
-  }
-  return n;
-}
-
-// If you change this function, also change GetStackTrace above:
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
-//
-// (1) The performance of the refactored-code suffers substantially - the
-//     refactored needs to be able to record the stack trace when called
-//     from GetStackTrace, and both the stack trace and stack frame sizes,
-//     when called from GetStackFrames - this introduces enough new
-//     conditionals that GetStackTrace performance can degrade by as much
-//     as 50%.
-//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-//     GetStackFrames depends on the compiler, and we can't guarantee the
-//     behavior either-way, even with "__attribute__ ((always_inline))"
-//     or "__attribute__ ((noinline))". But we need this guarantee or the
-//     frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and  preprocessor trouble - for example,  if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int *sizes, int max_depth, int skip_count) {
-  void **sp;
-#ifdef __APPLE__
-  __asm__ volatile ("mr %0,r1" : "=r" (sp));
-#else
-  __asm__ volatile ("mr %0,1" : "=r" (sp));
-#endif
 
-  StacktracePowerPCDummyFunction();
-  // Note we do *not* increment skip_count here for the SYSV ABI.  If
-  // we did, the list of stack frames wouldn't properly match up with
-  // the list of return addresses.  Note this means the top pc entry
-  // is probably bogus for linux/ppc (and other SYSV-ABI systems).
-
-  int n = 0;
-  while (sp && n < max_depth) {
-    // The GetStackFrames routine is called when we are in some
-    // informational context (the failure signal handler for example).
-    // Use the non-strict unwinding rules to produce a stack trace
-    // that is as complete as possible (even if it contains a few bogus
-    // entries in some rare cases).
-    void **next_sp = NextStackFrame<false>(sp);
-    if (skip_count > 0) {
-      skip_count--;
-    } else {
-#if defined(_CALL_AIX) || defined(_CALL_DARWIN)
-      pcs[n++] = *(sp+2);
-#elif defined(_CALL_SYSV)
-      pcs[n++] = *(sp+1);
-#elif defined(__APPLE__) || (defined(__linux) && defined(__PPC64__))
-      // This check is in case the compiler doesn't define _CALL_AIX/etc.
-      pcs[n++] = *(sp+2);
-#elif defined(__linux)
-      // This check is in case the compiler doesn't define _CALL_SYSV.
-      pcs[n++] = *(sp+1);
-#else
-#error Need to specify the PPC ABI for your archiecture.
-#endif
+#if IS_STACK_FRAMES
       if (next_sp > sp) {
-        sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
+        sizes[n - 1] = (uintptr_t)next_sp - (uintptr_t)sp;
       } else {
         // A frame-size of 0 is used to indicate unknown frame size.
-        sizes[n] = 0;
+        sizes[n - 1] = 0;
       }
-      n++;
+#endif
     }
     sp = next_sp;
   }
diff --git a/src/stacktrace_with_context.cc b/src/stacktrace_with_context.cc
deleted file mode 100644
index ed7bfe3..0000000
--- a/src/stacktrace_with_context.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2009, Google Inc.
-// All rights reserved.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// 
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-// Author: Paul Pluzhnikov
-//
-// This code logically belongs in stacktrace.cc, but
-// it is moved into (this) separate file in order to
-// prevent inlining of routines defined here.
-//
-// Inlining causes skip_count to be incorrect, and there
-// is no portable way to prevent it.
-//
-// Eventually LTO (link-time optimization) and/or LLVM
-// may inline this code anyway. Let's hope they respect
-// ATTRIBUTE_NOINLINE.
-
-#include <config.h>
-#include <google/stacktrace.h>
-#include "stacktrace_config.h"
-#include "base/basictypes.h"
-
-#if !defined(STACKTRACE_SKIP_CONTEXT_ROUTINES)
-ATTRIBUTE_NOINLINE PERFTOOLS_DLL_DECL
-int GetStackFramesWithContext(void** pcs, int* sizes, int max_depth,
-                              int skip_count, const void * /* uc */) {
-  return GetStackFrames(pcs, sizes, max_depth, skip_count + 1);
-}
-
-ATTRIBUTE_NOINLINE PERFTOOLS_DLL_DECL
-int GetStackTraceWithContext(void** result, int max_depth,
-                             int skip_count, const void * /* uc */) {
-  return GetStackTrace(result, max_depth, skip_count + 1);
-}
-#endif
diff --git a/src/stacktrace_x86-inl.h b/src/stacktrace_x86-inl.h
index 05701e7..6753fdb 100644
--- a/src/stacktrace_x86-inl.h
+++ b/src/stacktrace_x86-inl.h
@@ -31,17 +31,13 @@
 // Author: Sanjay Ghemawat
 //
 // Produce stack trace
-//
-// NOTE: there is code duplication between
-// GetStackTrace, GetStackTraceWithContext, GetStackFrames and
-// GetStackFramesWithContext. If you update one, update them all.
-//
-// There is no easy way to avoid this, because inlining
-// interferes with skip_count, and there is no portable
-// way to turn inlining off, or force it always on.
 
-#include "config.h"
+#ifndef BASE_STACKTRACE_X86_INL_H_
+#define BASE_STACKTRACE_X86_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
 
+#include "config.h"
 #include <stdlib.h>   // for NULL
 #include <assert.h>
 #if defined(HAVE_SYS_UCONTEXT_H)
@@ -190,8 +186,8 @@ static void **NextStackFrame(void **old_sp, const void *uc) {
       const ucontext_t *ucv = static_cast<const ucontext_t *>(uc);
       // This kernel does not use frame pointer in its VDSO code,
       // and so %ebp is not suitable for unwinding.
-      const void **const reg_ebp =
-          reinterpret_cast<const void **>(ucv->uc_mcontext.gregs[REG_EBP]);
+      void **const reg_ebp =
+          reinterpret_cast<void **>(ucv->uc_mcontext.gregs[REG_EBP]);
       const unsigned char *const reg_eip =
           reinterpret_cast<unsigned char *>(ucv->uc_mcontext.gregs[REG_EIP]);
       if (new_sp == reg_ebp &&
@@ -269,209 +265,24 @@ static void **NextStackFrame(void **old_sp, const void *uc) {
   return new_sp;
 }
 
-// If you change this function, see NOTE at the top of file.
-// Same as above, but with signal ucontext_t pointer.
-int GetStackTraceWithContext(void** result,
-                             int max_depth,
-                             int skip_count,
-                             const void *uc) {
-  void **sp;
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
-  // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
-  // It's always correct on llvm, and the techniques below aren't (in
-  // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
-  // so we also prefer __builtin_frame_address when running under llvm.
-  sp = reinterpret_cast<void**>(__builtin_frame_address(0));
-#elif defined(__i386__)
-  // Stack frame format:
-  //    sp[0]   pointer to previous frame
-  //    sp[1]   caller address
-  //    sp[2]   first argument
-  //    ...
-  // NOTE: This will break under llvm, since result is a copy and not in sp[2]
-  sp = (void **)&result - 2;
-#elif defined(__x86_64__)
-  unsigned long rbp;
-  // Move the value of the register %rbp into the local variable rbp.
-  // We need 'volatile' to prevent this instruction from getting moved
-  // around during optimization to before function prologue is done.
-  // An alternative way to achieve this
-  // would be (before this __asm__ instruction) to call Noop() defined as
-  //   static void Noop() __attribute__ ((noinline));  // prevent inlining
-  //   static void Noop() { asm(""); }  // prevent optimizing-away
-  __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
-  // Arguments are passed in registers on x86-64, so we can't just
-  // offset from &result
-  sp = (void **) rbp;
-#else
-# error Using stacktrace_x86-inl.h on a non x86 architecture!
-#endif
-
-  int n = 0;
-  while (sp && n < max_depth) {
-    if (*(sp+1) == reinterpret_cast<void *>(0)) {
-      // In 64-bit code, we often see a frame that
-      // points to itself and has a return address of 0.
-      break;
-    }
-    if (skip_count > 0) {
-      skip_count--;
-    } else {
-      result[n++] = *(sp+1);
-    }
-    // Use strict unwinding rules.
-    sp = NextStackFrame<true, true>(sp, uc);
-  }
-  return n;
-}
-
-int GetStackTrace(void** result, int max_depth, int skip_count) {
-  void **sp;
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
-  // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
-  // It's always correct on llvm, and the techniques below aren't (in
-  // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
-  // so we also prefer __builtin_frame_address when running under llvm.
-  sp = reinterpret_cast<void**>(__builtin_frame_address(0));
-#elif defined(__i386__)
-  // Stack frame format:
-  //    sp[0]   pointer to previous frame
-  //    sp[1]   caller address
-  //    sp[2]   first argument
-  //    ...
-  // NOTE: This will break under llvm, since result is a copy and not in sp[2]
-  sp = (void **)&result - 2;
-#elif defined(__x86_64__)
-  unsigned long rbp;
-  // Move the value of the register %rbp into the local variable rbp.
-  // We need 'volatile' to prevent this instruction from getting moved
-  // around during optimization to before function prologue is done.
-  // An alternative way to achieve this
-  // would be (before this __asm__ instruction) to call Noop() defined as
-  //   static void Noop() __attribute__ ((noinline));  // prevent inlining
-  //   static void Noop() { asm(""); }  // prevent optimizing-away
-  __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
-  // Arguments are passed in registers on x86-64, so we can't just
-  // offset from &result
-  sp = (void **) rbp;
-#else
-# error Using stacktrace_x86-inl.h on a non x86 architecture!
-#endif
+#endif  // BASE_STACKTRACE_X86_INL_H_
 
-  int n = 0;
-  while (sp && n < max_depth) {
-    if (*(sp+1) == reinterpret_cast<void *>(0)) {
-      // In 64-bit code, we often see a frame that
-      // points to itself and has a return address of 0.
-      break;
-    }
-    if (skip_count > 0) {
-      skip_count--;
-    } else {
-      result[n++] = *(sp+1);
-    }
-    // Use strict unwinding rules.
-    sp = NextStackFrame<true, false>(sp, NULL);
-  }
-  return n;
-}
+// Note: this part of the file is included several times.
+// Do not put globals below.
 
-// If you change this function, see NOTE at the top of file.
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
 //
-// (1) The performance of the refactored-code suffers substantially - the
-//     refactored needs to be able to record the stack trace when called
-//     from GetStackTrace, and both the stack trace and stack frame sizes,
-//     when called from GetStackFrames - this introduces enough new
-//     conditionals that GetStackTrace performance can degrade by as much
-//     as 50%.
-//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-//     GetStackFrames depends on the compiler, and we can't guarantee the
-//     behavior either-way, even with "__attribute__ ((always_inline))"
-//     or "__attribute__ ((noinline))". But we need this guarantee or the
-//     frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and  preprocessor trouble - for example,  if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
-  void **sp;
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
-  // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
-  // It's always correct on llvm, and the techniques below aren't (in
-  // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
-  // so we also prefer __builtin_frame_address when running under llvm.
-  sp = reinterpret_cast<void**>(__builtin_frame_address(0));
-#elif defined(__i386__)
-  // Stack frame format:
-  //    sp[0]   pointer to previous frame
-  //    sp[1]   caller address
-  //    sp[2]   first argument
-  //    ...
-  sp = (void **)&pcs - 2;
-#elif defined(__x86_64__)
-  unsigned long rbp;
-  // Move the value of the register %rbp into the local variable rbp.
-  // We need 'volatile' to prevent this instruction from getting moved
-  // around during optimization to before function prologue is done.
-  // An alternative way to achieve this
-  // would be (before this __asm__ instruction) to call Noop() defined as
-  //   static void Noop() __attribute__ ((noinline));  // prevent inlining
-  //   static void Noop() { asm(""); }  // prevent optimizing-away
-  __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
-  // Arguments are passed in registers on x86-64, so we can't just
-  // offset from &result
-  sp = (void **) rbp;
-#else
-# error Using stacktrace_x86-inl.h on a non x86 architecture!
-#endif
-
-  int n = 0;
-  while (sp && n < max_depth) {
-    if (*(sp+1) == reinterpret_cast<void *>(0)) {
-      // In 64-bit code, we often see a frame that
-      // points to itself and has a return address of 0.
-      break;
-    }
-    // The GetStackFrames routine is called when we are in some
-    // informational context (the failure signal handler for example).
-    // Use the non-strict unwinding rules to produce a stack trace
-    // that is as complete as possible (even if it contains a few bogus
-    // entries in some rare cases).
-    void **next_sp = NextStackFrame<false, false>(sp, NULL);
-    if (skip_count > 0) {
-      skip_count--;
-    } else {
-      pcs[n] = *(sp+1);
-      if (next_sp > sp) {
-        sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
-      } else {
-        // A frame-size of 0 is used to indicate unknown frame size.
-        sizes[n] = 0;
-      }
-      n++;
-    }
-    sp = next_sp;
-  }
-  return n;
-}
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
 
-// If you change this function, see NOTE at the top of file.
-// Same as above, but with signal ucontext_t pointer.
-int GetStackFramesWithContext(void** pcs,
-                              int* sizes,
-                              int max_depth,
-                              int skip_count,
-                              const void *uc) {
+int GET_STACK_TRACE_OR_FRAMES {
   void **sp;
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
   // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
@@ -511,22 +322,22 @@ int GetStackFramesWithContext(void** pcs,
       // points to itself and has a return address of 0.
       break;
     }
-    // The GetStackFrames routine is called when we are in some
-    // informational context (the failure signal handler for example).
-    // Use the non-strict unwinding rules to produce a stack trace
-    // that is as complete as possible (even if it contains a few bogus
-    // entries in some rare cases).
-    void **next_sp = NextStackFrame<false, true>(sp, uc);
+#if !IS_WITH_CONTEXT
+    const void *const ucp = NULL;
+#endif
+    void **next_sp = NextStackFrame<!IS_STACK_FRAMES, IS_WITH_CONTEXT>(sp, ucp);
     if (skip_count > 0) {
       skip_count--;
     } else {
-      pcs[n] = *(sp+1);
+      result[n] = *(sp+1);
+#if IS_STACK_FRAMES
       if (next_sp > sp) {
         sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
       } else {
         // A frame-size of 0 is used to indicate unknown frame size.
         sizes[n] = 0;
       }
+#endif
       n++;
     }
     sp = next_sp;
diff --git a/src/system-alloc.cc b/src/system-alloc.cc
index 3341f17..6d2e1c6 100644
--- a/src/system-alloc.cc
+++ b/src/system-alloc.cc
@@ -168,16 +168,16 @@ void* SbrkSysAllocator::Alloc(size_t size, size_t *actual_size,
   // a strict check here
   if (static_cast<ptrdiff_t>(size + alignment) < 0) return NULL;
 
-  // could theoretically return the "extra" bytes here, but this
-  // is simple and correct.
-  if (actual_size) {
-    *actual_size = size;
-  }
-
   // This doesn't overflow because TCMalloc_SystemAlloc has already
   // tested for overflow at the alignment boundary.
   size = ((size + alignment - 1) / alignment) * alignment;
 
+  // "actual_size" indicates that the bytes from the returned pointer
+  // p up to and including (p + actual_size - 1) have been allocated.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
   // Check that we we're not asking for so much more memory that we'd
   // wrap around the end of the virtual address space.  (This seems
   // like something sbrk() should check for us, and indeed opensolaris
@@ -243,12 +243,6 @@ void* MmapSysAllocator::Alloc(size_t size, size_t *actual_size,
     return NULL;
   }
 
-  // could theoretically return the "extra" bytes here, but this
-  // is simple and correct.
-  if (actual_size) {
-    *actual_size = size;
-  }
-
   // Enforce page alignment
   if (pagesize == 0) pagesize = getpagesize();
   if (alignment < pagesize) alignment = pagesize;
@@ -258,6 +252,12 @@ void* MmapSysAllocator::Alloc(size_t size, size_t *actual_size,
   }
   size = aligned_size;
 
+  // "actual_size" indicates that the bytes from the returned pointer
+  // p up to and including (p + actual_size - 1) have been allocated.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
   // Ask for extra memory if alignment > pagesize
   size_t extra = 0;
   if (alignment > pagesize) {
@@ -333,12 +333,6 @@ void* DevMemSysAllocator::Alloc(size_t size, size_t *actual_size,
     initialized = true;
   }
 
-  // could theoretically return the "extra" bytes here, but this
-  // is simple and correct.
-  if (actual_size) {
-    *actual_size = size;
-  }
-
   // Enforce page alignment
   if (pagesize == 0) pagesize = getpagesize();
   if (alignment < pagesize) alignment = pagesize;
@@ -348,6 +342,12 @@ void* DevMemSysAllocator::Alloc(size_t size, size_t *actual_size,
   }
   size = aligned_size;
 
+  // "actual_size" indicates that the bytes from the returned pointer
+  // p up to and including (p + actual_size - 1) have been allocated.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
   // Ask for extra memory if alignment > pagesize
   size_t extra = 0;
   if (alignment > pagesize) {
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 625301e..48cf328 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -231,8 +231,9 @@ extern "C" {
       ATTRIBUTE_SECTION(google_malloc);
   void* tc_newarray_nothrow(size_t size, const std::nothrow_t&) __THROW
       ATTRIBUTE_SECTION(google_malloc);
-  // Surprisingly, compilers use a nothrow-delete internally.  See, eg:
-  //   http://www.dinkumware.com/manuals/?manual=compleat&page=new.html
+  // Surprisingly, standard C++ library implementations use a
+  // nothrow-delete internally.  See, eg:
+  // http://www.dinkumware.com/manuals/?manual=compleat&page=new.html
   void tc_delete_nothrow(void* ptr, const std::nothrow_t&) __THROW
       ATTRIBUTE_SECTION(google_malloc);
   void tc_deletearray_nothrow(void* ptr, const std::nothrow_t&) __THROW
@@ -253,9 +254,9 @@ extern "C" {
   // NOTE: we make many of these symbols weak, but do so in the makefile
   //       (via objcopy -W) and not here.  That ends up being more portable.
 # define ALIAS(x) __attribute__ ((alias (x)))
-void* operator new(size_t size)                  ALIAS("tc_new");
+void* operator new(size_t size) throw (std::bad_alloc) ALIAS("tc_new");
 void operator delete(void* p) __THROW            ALIAS("tc_delete");
-void* operator new[](size_t size)                ALIAS("tc_newarray");
+void* operator new[](size_t size) throw (std::bad_alloc) ALIAS("tc_newarray");
 void operator delete[](void* p) __THROW          ALIAS("tc_deletearray");
 void* operator new(size_t size, const std::nothrow_t&) __THROW
                                                  ALIAS("tc_new_nothrow");
@@ -264,7 +265,7 @@ void* operator new[](size_t size, const std::nothrow_t&) __THROW
 void operator delete(void* size, const std::nothrow_t&) __THROW
                                                  ALIAS("tc_delete_nothrow");
 void operator delete[](void* size, const std::nothrow_t&) __THROW
-                                                 ALIAS("tc_deletearray_nothrow");
+                                                ALIAS("tc_deletearray_nothrow");
 extern "C" {
   void* malloc(size_t size) __THROW              ALIAS("tc_malloc");
   void  free(void* ptr) __THROW                  ALIAS("tc_free");
@@ -765,7 +766,17 @@ TCMallocGuard::TCMallocGuard() {
     tc_free(tc_malloc(1));
     ThreadCache::InitTSD();
     tc_free(tc_malloc(1));
-    MallocExtension::Register(new TCMallocImplementation);
+    // Either we, or debugallocation.cc, or valgrind will control memory
+    // management.  We register our extension if we're the winner.
+#ifdef TCMALLOC_FOR_DEBUGALLOCATION
+    // Let debugallocation register its extension.
+#else
+    if (RunningOnValgrind()) {
+      // Let Valgrind use its own malloc (so don't register our extension).
+    } else {
+      MallocExtension::Register(new TCMallocImplementation);
+    }
+#endif
   }
 }
 
@@ -1353,8 +1364,7 @@ extern "C" PERFTOOLS_DLL_DECL void* tc_new(size_t size) {
   return p;
 }
 
-extern "C" PERFTOOLS_DLL_DECL void* tc_new_nothrow(
-    size_t size, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size, const std::nothrow_t&) __THROW {
   void* p = cpp_alloc(size, true);
   MallocHook::InvokeNewHook(p, size);
   return p;
@@ -1365,10 +1375,10 @@ extern "C" PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW {
   do_free(p);
 }
 
-// Compilers define and use this (via ::operator delete(ptr, nothrow)).
+// Standard C++ library implementations define and use this
+// (via ::operator delete(ptr, nothrow)).
 // But it's really the same as normal delete, so we just do the same thing.
-extern "C" PERFTOOLS_DLL_DECL void tc_delete_nothrow(
-    void* p, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p, const std::nothrow_t&) __THROW {
   MallocHook::InvokeDeleteHook(p);
   do_free(p);
 }
@@ -1384,8 +1394,8 @@ extern "C" PERFTOOLS_DLL_DECL void* tc_newarray(size_t size) {
   return p;
 }
 
-extern "C" PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(
-    size_t size, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size, const std::nothrow_t&)
+    __THROW {
   void* p = cpp_alloc(size, true);
   MallocHook::InvokeNewHook(p, size);
   return p;
@@ -1396,8 +1406,7 @@ extern "C" PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW {
   do_free(p);
 }
 
-extern "C" PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(
-    void* p, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p, const std::nothrow_t&) __THROW {
   MallocHook::InvokeDeleteHook(p);
   do_free(p);
 }
diff --git a/src/tests/debugallocation_test.cc b/src/tests/debugallocation_test.cc
index ca00e36..c482187 100644
--- a/src/tests/debugallocation_test.cc
+++ b/src/tests/debugallocation_test.cc
@@ -75,7 +75,14 @@ static int test_counter = 0;    // incremented every time the macro is called
 // This flag won't be compiled in in opt mode.
 DECLARE_int32(max_free_queue_size);
 
+// Test match as well as mismatch rules:
 TEST(DebugAllocationTest, DeallocMismatch) {
+  // malloc can be matched only by free
+  // new can be matched only by delete and delete(nothrow)
+  // new[] can be matched only by delete[] and delete[](nothrow)
+  // new(nothrow) can be matched only by delete and delete(nothrow)
+  // new(nothrow)[] can be matched only by delete[] and delete[](nothrow)
+
   // Allocate with malloc.
   {
     int* x = static_cast<int*>(malloc(sizeof(*x)));
@@ -88,17 +95,41 @@ TEST(DebugAllocationTest, DeallocMismatch) {
   // Allocate with new.
   {
     int* x = new int;
+    int* y = new int;
     IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
     IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
     delete x;
+    ::operator delete(y, std::nothrow);
   }
 
   // Allocate with new[].
   {
     int* x = new int[1];
+    int* y = new int[1];
+    IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+    IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
+    delete [] x;
+    ::operator delete[](y, std::nothrow);
+  }
+
+  // Allocate with new(nothrow).
+  {
+    int* x = new(std::nothrow) int;
+    int* y = new(std::nothrow) int;
+    IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+    IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
+    delete x;
+    ::operator delete(y, std::nothrow);
+  }
+
+  // Allocate with new(nothrow)[].
+  {
+    int* x = new(std::nothrow) int[1];
+    int* y = new(std::nothrow) int[1];
     IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
     IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
     delete [] x;
+    ::operator delete[](y, std::nothrow);
   }
 }
 
diff --git a/src/tests/heap-checker-death_unittest.sh b/src/tests/heap-checker-death_unittest.sh
index 9f0c08c..4a83fc2 100755
--- a/src/tests/heap-checker-death_unittest.sh
+++ b/src/tests/heap-checker-death_unittest.sh
@@ -139,13 +139,13 @@ EARLY_MSG="Starting tracking the heap$"
 
 Test 60 0 "$EARLY_MSG" "" \
   HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
-  PERFTOOLS_VERBOSE=1 || exit 5
+  PERFTOOLS_VERBOSE=10 || exit 5
 Test 60 0 "MemoryRegionMap Init$" "" \
   HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
-  PERFTOOLS_VERBOSE=2 || exit 6
+  PERFTOOLS_VERBOSE=11 || exit 6
 Test 60 0 "" "$EARLY_MSG" \
   HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
-  PERFTOOLS_VERBOSE=-2 || exit 7
+  PERFTOOLS_VERBOSE=-11 || exit 7
 
 # These invocations should fail with very high probability,
 # rather than return 0 or hang (1 == exit(1), 134 == abort(), 139 = SIGSEGV):
@@ -162,10 +162,10 @@ Test 60 1 "MakeALeak" "" \
 
 # Test that very early log messages are present and controllable:
 Test 60 1 "Starting tracking the heap$" "" \
-  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=1 \
+  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=10 \
   || exit 11
 Test 60 1 "" "Starting tracking the heap" \
-  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=-1 \
+  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=-10 \
   || exit 12
 
 cd /    # so we're not in TMPDIR when we delete it
diff --git a/src/tests/profiler_unittest.cc b/src/tests/profiler_unittest.cc
index 1908b03..19371b7 100644
--- a/src/tests/profiler_unittest.cc
+++ b/src/tests/profiler_unittest.cc
@@ -56,12 +56,11 @@ static void test_other_thread() {
 
   int i, m;
   char b[128];
+  MutexLock ml(&mutex);
   for (m = 0; m < 1000000; ++m) {          // run millions of times
     for (i = 0; i < g_iters; ++i ) {
-      MutexLock ml(&mutex);
       result ^= i;
     }
-    MutexLock ml(&mutex);
     snprintf(b, sizeof(b), "%d", result);  // get some libc action
   }
 #endif
@@ -70,12 +69,11 @@ static void test_other_thread() {
 static void test_main_thread() {
   int i, m;
   char b[128];
+  MutexLock ml(&mutex);
   for (m = 0; m < 1000000; ++m) {          // run millions of times
     for (i = 0; i < g_iters; ++i ) {
-      MutexLock ml(&mutex);
       result ^= i;
     }
-    MutexLock ml(&mutex);
     snprintf(b, sizeof(b), "%d", result);  // get some libc action
   }
 }
diff --git a/src/tests/profiler_unittest.sh b/src/tests/profiler_unittest.sh
index 5766f2e..4668fa7 100755
--- a/src/tests/profiler_unittest.sh
+++ b/src/tests/profiler_unittest.sh
@@ -206,28 +206,27 @@ CPUPROFILE="$TMPDIR/p5" "$PROFILER2" 50 || RegisterFailure
 CPUPROFILE="$TMPDIR/p6" "$PROFILER2" 100 || RegisterFailure
 VerifySimilar p5 "$PROFILER2_REALNAME" p6 "$PROFILER2_REALNAME" 2
 
-# When we compile with threads, things take a lot longer even when we only use 1
-CPUPROFILE="$TMPDIR/p5b" "$PROFILER3" 10 || RegisterFailure
-CPUPROFILE="$TMPDIR/p5c" "$PROFILER3" 20 || RegisterFailure
+CPUPROFILE="$TMPDIR/p5b" "$PROFILER3" 30 || RegisterFailure
+CPUPROFILE="$TMPDIR/p5c" "$PROFILER3" 60 || RegisterFailure
 VerifySimilar p5b "$PROFILER3_REALNAME" p5c "$PROFILER3_REALNAME" 2
 
 # Now try what happens when we use threads
-"$PROFILER3" 5 2 "$TMPDIR/p7" || RegisterFailure
-"$PROFILER3" 10 2 "$TMPDIR/p8" || RegisterFailure
+"$PROFILER3" 30 2 "$TMPDIR/p7" || RegisterFailure
+"$PROFILER3" 60 2 "$TMPDIR/p8" || RegisterFailure
 VerifySimilar p7 "$PROFILER3_REALNAME" p8 "$PROFILER3_REALNAME" 2
 
-"$PROFILER4" 5 2 "$TMPDIR/p9" || RegisterFailure
-"$PROFILER4" 10 2 "$TMPDIR/p10" || RegisterFailure
+"$PROFILER4" 30 2 "$TMPDIR/p9" || RegisterFailure
+"$PROFILER4" 60 2 "$TMPDIR/p10" || RegisterFailure
 VerifySimilar p9 "$PROFILER4_REALNAME" p10 "$PROFILER4_REALNAME" 2
 
 # More threads!
-"$PROFILER4" 2 3 "$TMPDIR/p9" || RegisterFailure
-"$PROFILER4" 4 3 "$TMPDIR/p10" || RegisterFailure
+"$PROFILER4" 25 3 "$TMPDIR/p9" || RegisterFailure
+"$PROFILER4" 50 3 "$TMPDIR/p10" || RegisterFailure
 VerifySimilar p9 "$PROFILER4_REALNAME" p10 "$PROFILER4_REALNAME" 2
 
 # Compare how much time the main thread takes compared to the other threads
 # Recall the main thread runs twice as long as the other threads, by design.
-"$PROFILER4" 2 4 "$TMPDIR/p11" || RegisterFailure
+"$PROFILER4" 20 4 "$TMPDIR/p11" || RegisterFailure
 VerifyAcrossThreads p11 "$PROFILER4_REALNAME" 2
 
 # Test symbol save and restore
@@ -236,14 +235,14 @@ VerifyAcrossThreads p11 "$PROFILER4_REALNAME" 2
     >"$TMPDIR/p13" 2>/dev/null || RegisterFailure
 VerifyIdentical p12 "$PROFILER1_REALNAME" p13 "" || RegisterFailure
 
-"$PROFILER3" 5 2 "$TMPDIR/p14" || RegisterFailure
+"$PROFILER3" 30 2 "$TMPDIR/p14" || RegisterFailure
 "$PPROF" $PPROF_FLAGS "$PROFILER3_REALNAME" "$TMPDIR/p14" --raw \
     >"$TMPDIR/p15" 2>/dev/null || RegisterFailure
 VerifyIdentical p14 "$PROFILER3_REALNAME" p15 "" || RegisterFailure
 
 # Test using ITIMER_REAL instead of ITIMER_PROF.
-env CPUPROFILE_REALTIME=1 "$PROFILER3" 5 2 "$TMPDIR/p16" || RegisterFailure
-env CPUPROFILE_REALTIME=1 "$PROFILER3" 10 2 "$TMPDIR/p17" || RegisterFailure
+env CPUPROFILE_REALTIME=1 "$PROFILER3" 30 2 "$TMPDIR/p16" || RegisterFailure
+env CPUPROFILE_REALTIME=1 "$PROFILER3" 60 2 "$TMPDIR/p17" || RegisterFailure
 VerifySimilar p16 "$PROFILER3_REALNAME" p17 "$PROFILER3_REALNAME" 2
 
 
diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc
index 25bfd6a..6b2ec26 100644
--- a/src/tests/tcmalloc_unittest.cc
+++ b/src/tests/tcmalloc_unittest.cc
@@ -977,7 +977,7 @@ static int RunAllTests(int argc, char** argv) {
   }
 
   // This code stresses some of the memory allocation via STL.
-  // In particular, it calls operator delete(void*, nothrow_t).
+  // It may call operator delete(void*, nothrow_t).
   fprintf(LOGSTREAM, "Testing STL use\n");
   {
     std::vector<int> v;
diff --git a/src/third_party/valgrind.h b/src/third_party/valgrind.h
new file mode 100644
index 0000000..577c59a
--- /dev/null
+++ b/src/third_party/valgrind.h
@@ -0,0 +1,3924 @@
+/* -*- c -*-
+   ----------------------------------------------------------------
+
+   Notice that the following BSD-style license applies to this one
+   file (valgrind.h) only.  The rest of Valgrind is licensed under the
+   terms of the GNU General Public License, version 2, unless
+   otherwise indicated.  See the COPYING file in the source
+   distribution for details.
+
+   ----------------------------------------------------------------
+
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2000-2008 Julian Seward.  All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. The origin of this software must not be misrepresented; you must 
+      not claim that you wrote the original software.  If you use this 
+      software in a product, an acknowledgment in the product 
+      documentation would be appreciated but is not required.
+
+   3. Altered source versions must be plainly marked as such, and must
+      not be misrepresented as being the original software.
+
+   4. The name of the author may not be used to endorse or promote 
+      products derived from this software without specific prior written 
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ----------------------------------------------------------------
+
+   Notice that the above BSD-style license applies to this one file
+   (valgrind.h) only.  The entire rest of Valgrind is licensed under
+   the terms of the GNU General Public License, version 2.  See the
+   COPYING file in the source distribution for details.
+
+   ---------------------------------------------------------------- 
+*/
+
+
+/* This file is for inclusion into client (your!) code.
+
+   You can use these macros to manipulate and query Valgrind's 
+   execution inside your own programs.
+
+   The resulting executables will still run without Valgrind, just a
+   little bit more slowly than they otherwise would, but otherwise
+   unchanged.  When not running on valgrind, each client request
+   consumes very few (eg. 7) instructions, so the resulting performance
+   loss is negligible unless you plan to execute client requests
+   millions of times per second.  Nevertheless, if that is still a
+   problem, you can compile with the NVALGRIND symbol defined (gcc
+   -DNVALGRIND) so that client requests are not even compiled in.  */
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi.  So
+   we can't use C++ style "//" comments nor the "asm" keyword (instead
+   use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is.  Note
+   that in this file we're using the compiler's CPP symbols for
+   identifying architectures, which are different to the ones we use
+   within the rest of Valgrind.  Note, __powerpc__ is active for both
+   32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+   latter (on Linux, that is). */
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#if !defined(_AIX) && defined(__i386__)
+#  define PLAT_x86_linux 1
+#elif !defined(_AIX) && defined(__x86_64__)
+#  define PLAT_amd64_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && !defined(__powerpc64__)
+#  define PLAT_ppc32_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && defined(__powerpc64__)
+#  define PLAT_ppc64_linux 1
+#elif defined(_AIX) && defined(__64BIT__)
+#  define PLAT_ppc64_aix5 1
+#elif defined(_AIX) && !defined(__64BIT__)
+#  define PLAT_ppc32_aix5 1
+#endif
+
+
+/* If we're not compiling for our target platform, don't generate
+   any inline asms.  */
+#if !defined(PLAT_x86_linux) && !defined(PLAT_amd64_linux) \
+    && !defined(PLAT_ppc32_linux) && !defined(PLAT_ppc64_linux) \
+    && !defined(PLAT_ppc32_aix5) && !defined(PLAT_ppc64_aix5)
+#  if !defined(NVALGRIND)
+#    define NVALGRIND 1
+#  endif
+#endif
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS.  There is nothing */
+/* in here of use to end-users -- skip to the next section.           */
+/* ------------------------------------------------------------------ */
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+   from the compiled code (analogous to NDEBUG's effects on
+   assert()).  In this configuration a client request merely assigns
+   _zzq_default to _zzq_rlval; the request code and the five
+   arguments are not evaluated at all. */
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+   {                                                              \
+      (_zzq_rlval) = (_zzq_default);                              \
+   }
+
+#else  /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+   spots and handles magically.  Don't look too closely at them as
+   they will rot your brain.
+
+   The assembly code sequences for all architectures are in this one
+   file.  This is because this file must be stand-alone, and we don't
+   want to have multiple files.
+
+   For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+   value gets put in the return slot, so that everything works when
+   this is executed not under Valgrind.  Args are passed in a memory
+   block, and so there's no intrinsic limit to the number that could
+   be passed, but it's currently five.
+   
+   The macro args are: 
+      _zzq_rlval    result lvalue
+      _zzq_default  default value (result returned when running on real CPU)
+      _zzq_request  request code
+      _zzq_arg1..5  request params
+
+   The other two macros are used to support function wrapping, and are
+   a lot simpler.  VALGRIND_GET_NR_CONTEXT returns the value of the
+   guest's NRADDR pseudo-register and whatever other information is
+   needed to safely call the original from the wrapper: on
+   ppc64-linux, the R2 value at the divert point is also needed.  This
+   information is abstracted into a user-visible type, OrigFn.
+
+   VALGRIND_CALL_NOREDIR_* behaves the same as the following on the
+   guest, but guarantees that the branch instruction will not be
+   redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64:
+   branch-and-link-to-r11.  VALGRIND_CALL_NOREDIR_* is just text, not a
+   complete inline asm, since it needs to be combined with more magic
+   inline asm stuff to be useful.
+*/
+
+/* ------------------------- x86-linux ------------------------- */
+
+#if defined(PLAT_x86_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code?  Filled in from guest_NRADDR by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* asm text: start of the magic sequence the JITter spots */ \
+                     "roll $3,  %%edi ; roll $13, %%edi\n\t"      \
+                     "roll $29, %%edi ; roll $19, %%edi\n\t" /* 3+13+29+19 = 64 bits of rotation: %edi ends up unchanged */
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  { volatile unsigned int _zzq_args[6]; /* [0] = request code, [1..5] = params */ \
+    volatile unsigned int _zzq_result;                            \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EDX = client_request ( %EAX ) */         \
+                     "xchgl %%ebx,%%ebx"                          \
+                     : "=d" (_zzq_result) /* out: result comes back in %edx */ \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default) /* in: %eax = &args; %edx preloaded with the default */ \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval); /* _zzq_rlval must be an OrigFn lvalue */ \
+    volatile unsigned int __addr;                                 \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EAX = guest_NRADDR */                    \
+                     "xchgl %%ecx,%%ecx"                          \
+                     : "=a" (__addr) /* out: value is read from %eax */ \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_EAX                                 \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%EAX -- asm text only, to be pasted into a larger __asm__ */ \
+                     "xchgl %%edx,%%edx\n\t"
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code?  Filled in from guest_NRADDR by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* asm text: start of the magic sequence the JITter spots */ \
+                     "rolq $3,  %%rdi ; rolq $13, %%rdi\n\t"      \
+                     "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" /* 3+13+61+51 = 128 bits of rotation: %rdi ends up unchanged */
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  { volatile unsigned long long int _zzq_args[6]; /* [0] = request code, [1..5] = params */ \
+    volatile unsigned long long int _zzq_result;                  \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RDX = client_request ( %RAX ) */         \
+                     "xchgq %%rbx,%%rbx"                          \
+                     : "=d" (_zzq_result) /* out: result comes back in %rdx */ \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default) /* in: %rax = &args; %rdx preloaded with the default */ \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval); /* _zzq_rlval must be an OrigFn lvalue */ \
+    volatile unsigned long long int __addr;                       \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RAX = guest_NRADDR */                    \
+                     "xchgq %%rcx,%%rcx"                          \
+                     : "=a" (__addr) /* out: value is read from %rax */ \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_RAX                                 \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%RAX -- asm text only, to be pasted into a larger __asm__ */ \
+                     "xchgq %%rdx,%%rdx\n\t"
+#endif /* PLAT_amd64_linux */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code?  Filled in from guest_NRADDR by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* asm text: four rlwinm on r0 starting the magic sequence the JITter spots */ \
+                     "rlwinm 0,0,3,0,0  ; rlwinm 0,0,13,0,0\n\t"  \
+                     "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned int  _zzq_args[6]; /* [0] = request code, [1..5] = params */ \
+             unsigned int  _zzq_result;                           \
+             unsigned int* _zzq_ptr;                              \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default: r3 starts out holding _zzq_default*/ \
+                     "mr 4,%2\n\t" /*ptr: r4 = address of _zzq_args*/ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result: copy r3 into the output operand*/ \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4"); /* r3/r4 are written explicitly by the mr instructions above */ \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval); /* _zzq_rlval must be an OrigFn lvalue */ \
+    unsigned int __addr;                                          \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 -- asm text only, not a complete inline asm */ \
+                     "or 3,3,3\n\t"
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code?  Filled in from guest_NRADDR by VALGRIND_GET_NR_CONTEXT */
+      unsigned long long int r2;  /* what tocptr do we need?  Filled in from guest_NRADDR_GPR2 */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* asm text: start of the magic sequence the JITter spots */ \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t" /* 3+13+61+51 = 128 bits of rotation: r0 ends up unchanged */
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned long long int  _zzq_args[6]; /* [0] = request code, [1..5] = params */ \
+    register unsigned long long int  _zzq_result __asm__("r3"); /* pinned to r3 */ \
+    register unsigned long long int* _zzq_ptr __asm__("r4"); /* pinned to r4 */ \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1"                                   \
+                     : "=r" (_zzq_result)                         \
+                     : "0" (_zzq_default), "r" (_zzq_ptr) /* default shares operand 0's register, so it is the initial result */ \
+                     : "cc", "memory");                           \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval); /* _zzq_rlval must be an OrigFn lvalue */ \
+    register unsigned long long int __addr __asm__("r3"); /* pinned to r3; reused for both requests below */ \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2"                                   \
+                     : "=r" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4"                                   \
+                     : "=r" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 -- asm text only, not a complete inline asm */ \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------ ppc32-aix5 ------------------------- */
+
+#if defined(PLAT_ppc32_aix5)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code?  Filled in from guest_NRADDR by VALGRIND_GET_NR_CONTEXT */
+      unsigned int r2;  /* what tocptr do we need?  Filled in from guest_NRADDR_GPR2 */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* asm text: four rlwinm on r0 starting the magic sequence the JITter spots */ \
+                     "rlwinm 0,0,3,0,0  ; rlwinm 0,0,13,0,0\n\t"  \
+                     "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned int  _zzq_args[7]; /* [0] = request code, [1..5] = params, [6] = default */ \
+    register unsigned int  _zzq_result;                           \
+    register unsigned int* _zzq_ptr;                              \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    _zzq_args[6] = (unsigned int)(_zzq_default);                  \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 4,%1\n\t" /* r4 = address of _zzq_args */ \
+                     "lwz 3, 24(4)\n\t" /* r3 = _zzq_args[6], the default (offset 24 = 6 * 4 bytes) */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_ptr)                             \
+                     : "r3", "r4", "cc", "memory");               \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval); /* _zzq_rlval must be an OrigFn lvalue */ \
+    register unsigned int __addr; /* reused for both requests below */ \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 -- asm text only, not a complete inline asm */ \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc32_aix5 */
+
+/* ------------------------ ppc64-aix5 ------------------------- */
+
+#if defined(PLAT_ppc64_aix5)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code?  Filled in from guest_NRADDR by VALGRIND_GET_NR_CONTEXT */
+      unsigned long long int r2;  /* what tocptr do we need?  Filled in from guest_NRADDR_GPR2 */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* asm text: start of the magic sequence the JITter spots */ \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t" /* 3+13+61+51 = 128 bits of rotation: r0 ends up unchanged */
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned long long int  _zzq_args[7]; /* [0] = request code, [1..5] = params, [6] = default */ \
+    register unsigned long long int  _zzq_result;                 \
+    register unsigned long long int* _zzq_ptr;                    \
+    _zzq_args[0] = (unsigned int long long)(_zzq_request);        \
+    _zzq_args[1] = (unsigned int long long)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned int long long)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned int long long)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned int long long)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned int long long)(_zzq_arg5);           \
+    _zzq_args[6] = (unsigned int long long)(_zzq_default);        \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 4,%1\n\t" /* r4 = address of _zzq_args */ \
+                     "ld 3, 48(4)\n\t" /* r3 = _zzq_args[6], the default (offset 48 = 6 * 8 bytes) */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_ptr)                             \
+                     : "r3", "r4", "cc", "memory");               \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval); /* _zzq_rlval must be an OrigFn lvalue */ \
+    register unsigned long long int __addr; /* reused for both requests below */ \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3" /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 -- asm text only, not a complete inline asm */ \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING.  This is all very        */
+/* ugly.  It's the least-worst tradeoff I can think of.               */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a appalling-hack) macros for doing
+   guaranteed-no-redirection calls, so as to get from function
+   wrappers to the functions they are wrapping.  The whole point is to
+   construct standard call sequences, but to do the call itself with a
+   special no-redirect call pseudo-instruction that the JIT
+   understands and handles specially.  This section is long and
+   repetitious, and I can't see a way to make it shorter.
+
+   The naming scheme is as follows:
+
+      CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+   'W' stands for "word" and 'v' for "void".  Hence there are
+   different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+   and for each, the possibility of returning a word-typed result, or
+   no result.
+*/
+
+/* Use these to write the name of your wrapper.  NOTE: duplicates
+   VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h.  Each one token-pastes
+   its arguments into a single identifier: _vgwZU_<soname>_<fnname>
+   for the ZU variant and _vgwZZ_<soname>_<fnname> for the ZZ
+   variant. */
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname)                    \
+   _vgwZU_##soname##_##fnname
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname)                    \
+   _vgwZZ_##soname##_##fnname
+
+/* Use this macro from within a wrapper function to collect the
+   context (address and possibly other info) of the original function.
+   Once you have that you can then use it in one of the CALL_FN_
+   macros.  The type of the argument _lval is OrigFn.  This is simply
+   an alias for the platform's VALGRIND_GET_NR_CONTEXT. */
+#define VALGRIND_GET_ORIG_FN(_lval)  VALGRIND_GET_NR_CONTEXT(_lval)
+
+/* Derivatives of the main macros below, for calling functions
+   returning void.  Each one forwards to the CALL_FN_W_* macro of the
+   same arity and discards the word-sized result by storing it in a
+   local dummy variable.  fnptr is passed through unchanged as the
+   'orig' argument of the CALL_FN_W_* macro. */
+
+#define CALL_FN_v_v(fnptr)                                        \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_v(_junk,fnptr); } while (0)
+
+#define CALL_FN_v_W(fnptr, arg1)                                  \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_W(_junk,fnptr,arg1); } while (0)
+
+#define CALL_FN_v_WW(fnptr, arg1,arg2)                            \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0)
+
+#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3)                      \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0)
+
+/* ------------------------- x86-linux ------------------------- */
+
+#if defined(PLAT_x86_linux)
+
+/* These regs are trashed by the hidden call.  No need to mention eax
+   as gcc can already see that (it is the asm's output operand), and
+   listing it as well causes gcc to bomb.  This list is appended to
+   the clobbers of every x86 CALL_FN_ asm. */
+#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx"
+
+/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned
+   long) == 4: the asm addresses the _argvec slots in 4-byte steps.
+   In each macro _argvec[0] holds the target address taken from
+   orig.nraddr and any following slots hold the arguments; the word
+   result comes back in %eax and is cast to the type of lval. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2]; /* [0] = target, [1] = arg1 */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "pushl 4(%%eax)\n\t"       /* push arg1 */               \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $4, %%esp\n"         /* drop the pushed argument */ \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3]; /* [0] = target, [1..2] = args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "pushl 8(%%eax)\n\t"       /* push arg2 (last argument first) */ \
+         "pushl 4(%%eax)\n\t"       /* push arg1 */               \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $8, %%esp\n"         /* drop the 2 pushed arguments */ \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4]; /* [0] = target, [1..3] = args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "pushl 12(%%eax)\n\t"      /* push arg3 (last argument first) */ \
+         "pushl 8(%%eax)\n\t"       /* push arg2 */               \
+         "pushl 4(%%eax)\n\t"       /* push arg1 */               \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $12, %%esp\n"        /* drop the 3 pushed arguments */ \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5]; /* [0] = target, [1..4] = args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "pushl 16(%%eax)\n\t"      /* push arg4 (last argument first) */ \
+         "pushl 12(%%eax)\n\t"      /* push arg3 */               \
+         "pushl 8(%%eax)\n\t"       /* push arg2 */               \
+         "pushl 4(%%eax)\n\t"       /* push arg1 */               \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"        /* drop the 4 pushed arguments */ \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6]; /* [0] = target, [1..5] = args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "pushl 20(%%eax)\n\t"      /* push arg5 (last argument first) */ \
+         "pushl 16(%%eax)\n\t"      /* push arg4 */               \
+         "pushl 12(%%eax)\n\t"      /* push arg3 */               \
+         "pushl 8(%%eax)\n\t"       /* push arg2 */               \
+         "pushl 4(%%eax)\n\t"       /* push arg1 */               \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $20, %%esp\n"        /* drop the 5 pushed arguments */ \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* x86: call orig with 6 word args; result -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "pushl 24(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $24, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* x86: call orig with 7 word args; result -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "pushl 28(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $28, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* x86: call orig with 8 word args; result -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "pushl 32(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* x86: call orig with 9 word args; result -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "pushl 36(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $36, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* x86: call orig with 10 word args; result -> lval */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "pushl 40(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $40, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11)                          \
+   do { /* x86: call orig with 11 word args; result -> lval */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "pushl 44(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $44, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11,arg12)                    \
+   do { /* x86: call orig with 12 word args; result -> lval */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "pushl 48(%%eax)\n\t"  /* push last arg first */         \
+         "pushl 44(%%eax)\n\t"                                    \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"  /* pop the pushed args */           \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* These regs are trashed by the hidden call; rax is the "=a" output instead. */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi",       \
+                            "rdi", "r8", "r9", "r10", "r11"
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+   long) == 8. */
+
+/* NB 9 Sept 07.  There is a nasty kludge here in all these CALL_FN_
+   macros.  In order not to trash the stack redzone, we need to drop
+   %rsp by 128 before the hidden call, and restore afterwards.  The
+   nastiness is that it is only by luck that the stack still appears
+   to be unwindable during the hidden call - since then the behaviour
+   of any routine using this macro does not match what the CFI data
+   says.  Sigh.
+
+   Why is this important?  Imagine that a wrapper has a stack
+   allocated local, and passes to the hidden call, a pointer to it.
+   Because gcc does not know about the hidden call, it may allocate
+   that local in the redzone.  Unfortunately the hidden call may then
+   trash it before it comes to use it.  So we must step clear of the
+   redzone, for the duration of the hidden call, to make it safe.
+
+   Probably the same problem afflicts the other redzone-style ABIs too
+   (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is
+   self describing (none of this CFI nonsense) so at least messing
+   with the stack pointer doesn't give a danger of non-unwindable
+   stack. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* amd64: call orig with no args; result -> lval */       \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* amd64: call orig with 1 word arg; result -> lval */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* amd64: call orig with 2 word args; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2 */                  \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* amd64: call orig with 3 word args; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3 */                  \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2 */                  \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* amd64: call orig with 4 word args; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4 */                  \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3 */                  \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2 */                  \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* amd64: call orig with 5 word args; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 */                   \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4 */                  \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3 */                  \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2 */                  \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* amd64: call orig with 6 word args; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6 */                   \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 */                   \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4 */                  \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3 */                  \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2 */                  \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX  /* must run before addq */    \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* amd64: call orig with 7 word args; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"  /* step clear of redzone */       \
+         "pushq 56(%%rax)\n\t"  /* arg7 goes on stack */          \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6 */                   \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 */                   \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4 */                  \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3 */                  \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2 */                  \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 */                   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $8, %%rsp\n"  /* pop arg7 */                       \
+         "addq $128,%%rsp\n\t"  /* undo the redzone step */       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_CALL_NOREDIR_RAX) with eight word-sized arguments and
+   store the value left in %rax into 'lval'.  arg1..arg6 travel in
+   rdi, rsi, rdx, rcx, r8, r9; arg8 and arg7 are pushed (in that
+   order) on the stack.
+
+   The amd64 ABI wants %rsp 16-byte aligned at the call, so %rsp is
+   saved in r14 (callee-saved, hence the extra clobber), rounded down
+   to a 16-byte boundary and lowered by 128 bytes (presumably to step
+   over the red zone); the two 8-byte pushes keep it aligned.  r14
+   restores %rsp afterwards. */
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "movq %%rsp,%%r14\n\t"     /* save %rsp */               \
+         "andq $0xfffffffffffffff0,%%rsp\n\t"                     \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "movq %%r14,%%rsp\n\t"     /* restore %rsp */            \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_CALL_NOREDIR_RAX) with nine word-sized arguments and
+   store the value left in %rax into 'lval'.  arg1..arg6 travel in
+   rdi, rsi, rdx, rcx, r8, r9; arg9, arg8 and arg7 are pushed (in
+   that order) on the stack.
+
+   The amd64 ABI wants %rsp 16-byte aligned at the call, so %rsp is
+   saved in r14 (callee-saved, hence the extra clobber), rounded down
+   to a 16-byte boundary, and lowered by 128 bytes (presumably to step
+   over the red zone) plus 8 bytes of padding, so that the three
+   8-byte pushes leave %rsp aligned.  r14 restores %rsp afterwards. */
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "movq %%rsp,%%r14\n\t"     /* save %rsp */               \
+         "andq $0xfffffffffffffff0,%%rsp\n\t"                     \
+         "subq $136,%%rsp\n\t"      /* 128 + 8 bytes padding */   \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "movq %%r14,%%rsp\n\t"     /* restore %rsp */            \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_CALL_NOREDIR_RAX) with ten word-sized arguments and
+   store the value left in %rax into 'lval'.  arg1..arg6 travel in
+   rdi, rsi, rdx, rcx, r8, r9; arg10 down to arg7 are pushed (in that
+   order) on the stack.
+
+   The amd64 ABI wants %rsp 16-byte aligned at the call, so %rsp is
+   saved in r14 (callee-saved, hence the extra clobber), rounded down
+   to a 16-byte boundary and lowered by 128 bytes (presumably to step
+   over the red zone); the four 8-byte pushes keep it aligned.  r14
+   restores %rsp afterwards. */
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "movq %%rsp,%%r14\n\t"     /* save %rsp */               \
+         "andq $0xfffffffffffffff0,%%rsp\n\t"                     \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 80(%%rax)\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "movq %%r14,%%rsp\n\t"     /* restore %rsp */            \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_CALL_NOREDIR_RAX) with eleven word-sized arguments
+   and store the value left in %rax into 'lval'.  arg1..arg6 travel
+   in rdi, rsi, rdx, rcx, r8, r9; arg11 down to arg7 are pushed (in
+   that order) on the stack.
+
+   The amd64 ABI wants %rsp 16-byte aligned at the call, so %rsp is
+   saved in r14 (callee-saved, hence the extra clobber), rounded down
+   to a 16-byte boundary, and lowered by 128 bytes (presumably to step
+   over the red zone) plus 8 bytes of padding, so that the five
+   8-byte pushes leave %rsp aligned.  r14 restores %rsp afterwards. */
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "movq %%rsp,%%r14\n\t"     /* save %rsp */               \
+         "andq $0xfffffffffffffff0,%%rsp\n\t"                     \
+         "subq $136,%%rsp\n\t"      /* 128 + 8 bytes padding */   \
+         "pushq 88(%%rax)\n\t"                                    \
+         "pushq 80(%%rax)\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "movq %%r14,%%rsp\n\t"     /* restore %rsp */            \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_CALL_NOREDIR_RAX) with twelve word-sized arguments
+   and store the value left in %rax into 'lval'.  arg1..arg6 travel
+   in rdi, rsi, rdx, rcx, r8, r9; arg12 down to arg7 are pushed (in
+   that order) on the stack.
+
+   The amd64 ABI wants %rsp 16-byte aligned at the call, so %rsp is
+   saved in r14 (callee-saved, hence the extra clobber), rounded down
+   to a 16-byte boundary and lowered by 128 bytes (presumably to step
+   over the red zone); the six 8-byte pushes keep it aligned.  r14
+   restores %rsp afterwards. */
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "movq %%rsp,%%r14\n\t"     /* save %rsp */               \
+         "andq $0xfffffffffffffff0,%%rsp\n\t"                     \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 96(%%rax)\n\t"                                    \
+         "pushq 88(%%rax)\n\t"                                    \
+         "pushq 80(%%rax)\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "movq %%r14,%%rsp\n\t"     /* restore %rsp */            \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_amd64_linux */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+/* This is useful for finding out about the on-stack stuff:
+
+   extern int f9  ( int,int,int,int,int,int,int,int,int );
+   extern int f10 ( int,int,int,int,int,int,int,int,int,int );
+   extern int f11 ( int,int,int,int,int,int,int,int,int,int,int );
+   extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int );
+
+   int g9 ( void ) {
+      return f9(11,22,33,44,55,66,77,88,99);
+   }
+   int g10 ( void ) {
+      return f10(11,22,33,44,55,66,77,88,99,110);
+   }
+   int g11 ( void ) {
+      return f11(11,22,33,44,55,66,77,88,99,110,121);
+   }
+   int g12 ( void ) {
+      return f12(11,22,33,44,55,66,77,88,99,110,121,132);
+   }
+*/
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call.  The list is spliced
+   into the "trash" (clobber) section of the asm statement in each
+   ppc32-linux CALL_FN_ macro below, after "cc" and "memory".  It
+   names lr, ctr, xer, the condition fields cr0-cr7, r0 and r2-r13
+   (r1 is not listed). */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc32-linux, 
+   sizeof(unsigned long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {  /* call orig's nraddr with no args; r3 -> lval */        \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3->_res */                       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with one word-sized
+   argument in r3, and store the value left in r3 into 'lval'.  The
+   argument is parenthesised before the cast so that an expression
+   such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with two word-sized
+   arguments in r3 and r4, and store the value left in r3 into
+   'lval'.  Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with three word-sized
+   arguments in r3..r5, and store the value left in r3 into 'lval'.
+   Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with four word-sized
+   arguments in r3..r6, and store the value left in r3 into 'lval'.
+   Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with five word-sized
+   arguments in r3..r7, and store the value left in r3 into 'lval'.
+   Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with six word-sized
+   arguments in r3..r8, and store the value left in r3 into 'lval'.
+   Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with seven word-sized
+   arguments in r3..r9, and store the value left in r3 into 'lval'.
+   Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with eight word-sized
+   arguments in r3..r10, and store the value left in r3 into 'lval'.
+   Each argument is parenthesised before the cast so that an
+   expression such as 'p + 1' is converted as a whole. */
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+/* Call the function described by 'orig' (its _orig.nraddr address,
+   via VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11) with nine word-sized
+   arguments and store the value left in r3 into 'lval'.  arg1..arg8
+   go in r3..r10; for arg9, r1 is lowered by 16 bytes, the value is
+   stored at 8(r1) (using r3 as scratch before arg1 is loaded), and
+   r1 is raised again after the call.  Each argument is parenthesised
+   before the cast so that an expression such as 'p + 1' is converted
+   as a whole. */
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* &_argvec[0]->r11 */               \
+         "addi 1,1,-16\n\t"  /* make room on the stack */         \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"   /* release the stack room */         \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* call orig(arg1..arg10); word result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-16\n\t"  /* grow stack by 16 */               \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"   /* arg10 -> 12(r1) */                \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"    /* arg9 -> 8(r1) */                  \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"   /* undo stack growth */              \
+         "mr %0,3"           /* r3 (result) -> _res */            \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* call orig(arg1..arg11); word result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      _argvec[11] = (unsigned long)arg11;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-32\n\t"  /* grow stack by 32 */               \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"   /* arg11 -> 16(r1) */                \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"   /* arg10 -> 12(r1) */                \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"    /* arg9 -> 8(r1) */                  \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"   /* undo stack growth */              \
+         "mr %0,3"           /* r3 (result) -> _res */            \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* call orig(arg1..arg12); word result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      _argvec[11] = (unsigned long)arg11;                         \
+      _argvec[12] = (unsigned long)arg12;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-32\n\t"  /* grow stack by 32 */               \
+         /* arg12 */                                              \
+         "lwz 3,48(11)\n\t"                                       \
+         "stw 3,20(1)\n\t"   /* arg12 -> 20(r1) */                \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"   /* arg11 -> 16(r1) */                \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"   /* arg10 -> 12(r1) */                \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"    /* arg9 -> 8(r1) */                  \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"   /* undo stack growth */              \
+         "mr %0,3"           /* r3 (result) -> _res */            \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* Regs trashed by the hidden call; the CALL_FN_ asm blocks list them as clobbers. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned
+   long) == 8. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* call orig(); word result -> lval */                    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* call orig(arg1); word result -> lval */                \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* call orig(arg1,arg2); word result -> lval */           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* call orig(arg1..arg3); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* call orig(arg1..arg4); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* call orig(arg1..arg5); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* call orig(arg1..arg6); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* call orig(arg1..arg7); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* call orig(arg1..arg8); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* call orig(arg1..arg9); word result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* arg9 -> r3 (temp) */             \
+         "std 3,112(1)\n\t"   /* r3 -> 112(r1) */                 \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* ppc64-linux: call with 10 word args, result in lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "std 3,120(1)\n\t"   /* into expanded frame */           \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* into expanded frame */           \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* ppc64-linux: call with 11 word args, result in lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"   /* r3 = _argvec[2+11] */            \
+         "std 3,128(1)\n\t"   /* into expanded frame */           \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "std 3,120(1)\n\t"   /* into expanded frame */           \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* into expanded frame */           \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* ppc64-linux: call with 12 word args, result in lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg12 */                                              \
+         "ld  3,96(11)\n\t"   /* r3 = _argvec[2+12] */            \
+         "std 3,136(1)\n\t"   /* into expanded frame */           \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"   /* r3 = _argvec[2+11] */            \
+         "std 3,128(1)\n\t"   /* into expanded frame */           \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "std 3,120(1)\n\t"   /* into expanded frame */           \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* into expanded frame */           \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------ ppc32-aix5 ------------------------- */
+
+#if defined(PLAT_ppc32_aix5)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* Clobber list for the asm below: regs trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* Expand the stack frame by _n_fr bytes and copy the word at the old
+   0(r1) to the new 0(r1) so that unwinding still works.  Trashes r3. */
+
+#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr)                      \
+         "addi 1,1,-" #_n_fr "\n\t"                               \
+         "lwz  3," #_n_fr "(1)\n\t"                               \
+         "stw  3,0(1)\n\t"
+
+#define VG_CONTRACT_FRAME_BY(_n_fr)   /* r1 += _n_fr bytes */     \
+         "addi 1,1," #_n_fr "\n\t"
+
+/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* ppc32-aix5: call with no args, result in lval */       \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* ppc32-aix5: call with 1 word arg, result in lval */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* ppc32-aix5: call with 2 word args, result in lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* ppc32-aix5: call with 3 word args, result in lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* ppc32-aix5: call with 4 word args, result in lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* ppc32-aix5: call with 5 word args, result in lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t" /* arg2->r4 */                       \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* ppc32-aix5: call with 6 word args, result in lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* ppc32-aix5: call with 7 word args, result in lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512) /* r1 -= 512 */       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)          /* r1 += 512 */       \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* calls orig(arg1..arg8); result goes to lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* calls orig(arg1..arg9); result goes to lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(64)  /* stack-arg area */  \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "stw 3,56(1)\n\t"    /* store at 56(r1) */               \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(64)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* calls orig(arg1..arg10); result goes to lval */        \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(64)  /* stack-arg area */  \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "stw 3,60(1)\n\t"    /* store at 60(r1) */               \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "stw 3,56(1)\n\t"    /* store at 56(r1) */               \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(64)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* calls orig(arg1..arg11); result goes to lval */        \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(72)  /* stack-arg area */  \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"   /* r3 = _argvec[2+11] */            \
+         "stw 3,64(1)\n\t"    /* store at 64(r1) */               \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "stw 3,60(1)\n\t"    /* store at 60(r1) */               \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "stw 3,56(1)\n\t"    /* store at 56(r1) */               \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(72)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* calls orig(arg1..arg12); result goes to lval */        \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(72)  /* stack-arg area */  \
+         /* arg12 */                                              \
+         "lwz 3,48(11)\n\t"   /* r3 = _argvec[2+12] */            \
+         "stw 3,68(1)\n\t"    /* store at 68(r1) */               \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"   /* r3 = _argvec[2+11] */            \
+         "stw 3,64(1)\n\t"    /* store at 64(r1) */               \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "stw 3,60(1)\n\t"    /* store at 60(r1) */               \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "stw 3,56(1)\n\t"    /* store at 56(r1) */               \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(72)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc32_aix5 */
+
+/* ------------------------ ppc64-aix5 ------------------------- */
+
+#if defined(PLAT_ppc64_aix5)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* Clobber list for the CALL_FN_ asm: regs trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* Expand the stack frame by _n_fr bytes, copying the doubleword at the
+   old 0(r1) to the new 0(r1) so that unwinding still works.  Trashes r3. */
+
+#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr)                      \
+         "addi 1,1,-" #_n_fr "\n\t"  /* r1 -= _n_fr */            \
+         "ld   3," #_n_fr "(1)\n\t"  /* r3 = old 0(r1) */         \
+         "std  3,0(1)\n\t"  /* new 0(r1) = r3 */
+
+#define VG_CONTRACT_FRAME_BY(_n_fr)                               \
+         "addi 1,1," #_n_fr "\n\t"  /* r1 += _n_fr: undo a frame expansion */
+
+/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned
+   long) == 8. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* calls orig(); result goes to lval */                   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* calls orig(arg1); result goes to lval */               \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* calls orig(arg1,arg2); result goes to lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* calls orig(arg1..arg3); result goes to lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* calls orig(arg1..arg4); result goes to lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* calls orig(arg1..arg5); result goes to lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* calls orig(arg1..arg6); result goes to lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* [0]=saved caller r2, [1]=callee r2, [2]=target addr */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1);                       \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t" /* r11 = &_argvec[2] */                   \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t" /* reload &_argvec[2] */                  \
+         "mr %0,3\n\t"  /* r3 (result)->_res */                   \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1);                       \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t" /* r11 = &_argvec[2] */                   \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t" /* reload &_argvec[2] */                  \
+         "mr %0,3\n\t"  /* r3 (result)->_res */                   \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1);                       \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t" /* r11 = &_argvec[2] */                   \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(128)                       \
+         /* arg9 -> 112(r1), via r3 */                            \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t" /* reload &_argvec[2] */                  \
+         "mr %0,3\n\t"  /* r3 (result)->_res */                   \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(128)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1);                       \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      _argvec[2+10] = (unsigned long)(arg10);                     \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t" /* r11 = &_argvec[2] */                   \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(128)                       \
+         /* arg10 -> 120(r1), via r3 */                           \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 -> 112(r1), via r3 */                            \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t" /* reload &_argvec[2] */                  \
+         "mr %0,3\n\t"  /* r3 (result)->_res */                   \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(128)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1);                       \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      _argvec[2+10] = (unsigned long)(arg10);                     \
+      _argvec[2+11] = (unsigned long)(arg11);                     \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t" /* r11 = &_argvec[2] */                   \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(144)                       \
+         /* arg11 -> 128(r1), via r3 */                           \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 -> 120(r1), via r3 */                           \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 -> 112(r1), via r3 */                            \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t" /* reload &_argvec[2] */                  \
+         "mr %0,3\n\t"  /* r3 (result)->_res */                   \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(144)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1);                       \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      _argvec[2+10] = (unsigned long)(arg10);                     \
+      _argvec[2+11] = (unsigned long)(arg11);                     \
+      _argvec[2+12] = (unsigned long)(arg12);                     \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t" /* r11 = &_argvec[2] */                   \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(144)                       \
+         /* arg12 -> 136(r1), via r3 */                           \
+         "ld  3,96(11)\n\t"                                       \
+         "std 3,136(1)\n\t"                                       \
+         /* arg11 -> 128(r1), via r3 */                           \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 -> 120(r1), via r3 */                           \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 -> 112(r1), via r3 */                            \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t" /* reload &_argvec[2] */                  \
+         "mr %0,3\n\t"  /* r3 (result)->_res */                   \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(144)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64_aix5 */
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS.               */
+/*                                                                    */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes.  There are many more of these, but most are not
+   exposed to end-user view.  These are the public ones, all of the
+   form 0x1000 + small_number.
+
+   Core ones are in the range 0x00000000--0x0000ffff.  The non-public
+   ones start at 0x2000.
+*/
+
+/* Tool request codes: byte a lands in bits 31..24, byte b in bits 23..16
+   (done in unsigned).  Public for tools; don't embed in other programs. */
+#define VG_USERREQ_TOOL_BASE(a,b) \
+   ((((unsigned int)(a))&0xffu) << 24 | (((unsigned int)(b))&0xffu) << 16)
+#define VG_IS_TOOL_USERREQ(a, b, v) \
+   (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
+   This enum is part of the ABI that Valgrind exports to programs
+   which use client requests.  DO NOT CHANGE THE ORDER OF THESE
+   ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef
+   enum { VG_USERREQ__RUNNING_ON_VALGRIND  = 0x1001,
+          VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+          /* These allow any function to be called from the simulated
+             CPU but run on the real CPU.  Nb: the first arg passed to
+             the function is always the ThreadId of the running
+             thread!  So CLIENT_CALL0 actually requires a 1-arg
+             function, CLIENT_CALL1 a 2-arg function, etc. */
+          VG_USERREQ__CLIENT_CALL0 = 0x1101,
+          VG_USERREQ__CLIENT_CALL1 = 0x1102,
+          VG_USERREQ__CLIENT_CALL2 = 0x1103,
+          VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+          /* Can be useful in regression testing suites -- e.g. you
+             can send Valgrind's output to /dev/null and still count
+             errors. */
+          VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+          /* These are useful and can be interpreted by any tool that
+             tracks malloc() et al, by using vg_replace_malloc.c. */
+          VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+          VG_USERREQ__FREELIKE_BLOCK   = 0x1302,
+          /* Memory pool support (codes 0x1303 .. 0x130a). */
+          VG_USERREQ__CREATE_MEMPOOL   = 0x1303,
+          VG_USERREQ__DESTROY_MEMPOOL  = 0x1304,
+          VG_USERREQ__MEMPOOL_ALLOC    = 0x1305,
+          VG_USERREQ__MEMPOOL_FREE     = 0x1306,
+          VG_USERREQ__MEMPOOL_TRIM     = 0x1307,
+          VG_USERREQ__MOVE_MEMPOOL     = 0x1308,
+          VG_USERREQ__MEMPOOL_CHANGE   = 0x1309,
+          VG_USERREQ__MEMPOOL_EXISTS   = 0x130a,
+
+          /* Allow printf-style output to the Valgrind log. */
+          VG_USERREQ__PRINTF           = 0x1401,
+          VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+
+          /* Stack support: register, deregister, change. */
+          VG_USERREQ__STACK_REGISTER   = 0x1501,
+          VG_USERREQ__STACK_DEREGISTER = 0x1502,
+          VG_USERREQ__STACK_CHANGE     = 0x1503
+   } Vg_ClientRequest;
+
+#ifndef __GNUC__
+#  define __extension__ /* expands to nothing on non-GNU compilers */
+#endif
+
+/* Evaluates to the depth of Valgrind nesting around this code: 0 when
+   running natively, 1 under Valgrind, 2 under a Valgrind that is
+   itself running under another Valgrind, and so on.  Written as a
+   statement expression around the RUNNING_ON_VALGRIND client request. */
+#define RUNNING_ON_VALGRIND  __extension__                        \
+   ({unsigned int _qzz_depth;                                     \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qzz_depth, 0 /* if not */,                                \
+       VG_USERREQ__RUNNING_ON_VALGRIND, 0, 0, 0, 0, 0);           \
+    _qzz_depth;                                                   \
+   })
+
+
+/* Throw away any translations of code lying in the range
+   [_qzz_addr .. _qzz_addr + _qzz_len - 1], so that valgrind will
+   retranslate the invalidated area -- handy when debugging a JITter
+   or some such.  Expands to a braced block and yields no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len)         \
+   {unsigned int _qzz_ignored; /* request result, unused */       \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qzz_ignored, 0, VG_USERREQ__DISCARD_TRANSLATIONS,         \
+       _qzz_addr, _qzz_len, 0, 0, 0);                             \
+   }
+
+
+/* These requests are for getting Valgrind itself to print something,
+   possibly with a backtrace.  This is a really ugly hack. */
+
+#if defined(NVALGRIND)
+
+#  define VALGRIND_PRINTF(...)
+#  define VALGRIND_PRINTF_BACKTRACE(...)
+
+#else /* !NVALGRIND */
+
+/* Modern GCC will optimize the static routine out if unused,
+   and the unused attribute will silence warnings about it.  */
+static int VALGRIND_PRINTF(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs; /* NOTE(review): the (unsigned long)vargs cast below assumes va_list is pointer- or array-like -- confirm per ABI */
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF,
+                              (unsigned long)format, (unsigned long)vargs, 
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res; /* 0 is the default supplied to the request */
+}
+
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs; /* NOTE(review): same va_list-to-integer cast assumption as VALGRIND_PRINTF -- confirm per ABI */
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF_BACKTRACE,
+                              (unsigned long)format, (unsigned long)vargs, 
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res; /* 0 is the default supplied to the request */
+}
+
+#endif /* NVALGRIND */
+
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+   
+   Note that the current ThreadId is inserted as the first argument.
+   So this call:
+
+     VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+   requires f to have this signature:
+
+     Word f(Word tid, Word arg1, Word arg2)
+
+   where "Word" is a word-sized type.
+
+   Note that these client requests are not entirely reliable.  For example,
+   if you call a function with them that subsequently calls printf(),
+   there's a high chance Valgrind will crash.  Generally, your prospects of
+   these working are made higher if the called function does not refer to
+   any global variables, and does not refer to any libc or other functions
+   (printf et al).  Any kind of entanglement with libc or dynamic linking is
+   likely to have a bad outcome, for tricky reasons which we've grappled
+   with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn)                          \
+   __extension__                                                  \
+   ({unsigned long _qyy_ret; /* request result */                 \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qyy_ret, 0 /* default return */,                          \
+       VG_USERREQ__CLIENT_CALL0,                                  \
+       _qyy_fn, 0, 0, 0, 0);                                      \
+    _qyy_ret;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1)               \
+   __extension__                                                  \
+   ({unsigned long _qyy_ret; /* request result */                 \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qyy_ret, 0 /* default return */,                          \
+       VG_USERREQ__CLIENT_CALL1,                                  \
+       _qyy_fn, _qyy_arg1, 0, 0, 0);                              \
+    _qyy_ret;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2)    \
+   __extension__                                                  \
+   ({unsigned long _qyy_ret; /* request result */                 \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qyy_ret, 0 /* default return */,                          \
+       VG_USERREQ__CLIENT_CALL2,                                  \
+       _qyy_fn, _qyy_arg1, _qyy_arg2, 0, 0);                      \
+    _qyy_ret;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+   __extension__                                                  \
+   ({unsigned long _qyy_ret; /* request result */                 \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qyy_ret, 0 /* default return */,                          \
+       VG_USERREQ__CLIENT_CALL3,                                  \
+       _qyy_fn, _qyy_arg1,                                        \
+       _qyy_arg2, _qyy_arg3, 0);                                  \
+    _qyy_ret;                                                     \
+   })
+
+
+/* Evaluates to the number of errors recorded so far by a tool.  Nb:
+   only errors the tool records with VG_(maybe_record_error)() or
+   VG_(unique_error)() are included in the count. */
+#define VALGRIND_COUNT_ERRORS                                     \
+   __extension__                                                  \
+   ({unsigned int _qyy_count;                                     \
+    VALGRIND_DO_CLIENT_REQUEST(                                   \
+       _qyy_count, 0 /* default return */,                        \
+       VG_USERREQ__COUNT_ERRORS, 0, 0, 0, 0, 0);                  \
+    _qyy_count;                                                   \
+   })
+
+/* Mark a block of memory as having been allocated by a malloc()-like
+   function.  `addr' is the start of the usable block (i.e. after any
+   redzone); `rzB' is the redzone size if the allocator can apply redzones,
+   or '0' if not.  Adding redzones makes it more likely Valgrind will spot
+   block overruns.  `is_zeroed' indicates if the memory is zeroed, as it is
+   for calloc().  Put the request immediately after the point where a block
+   is allocated.
+   
+   If you're using Memcheck: If you're allocating memory via superblocks,
+   and then handing out small chunks of each superblock, if you don't have
+   redzones on your small blocks, it's worth marking the superblock with
+   VALGRIND_MAKE_MEM_NOACCESS when it's created, so that block overruns are
+   detected.  But if you can put redzones on, it's probably better to not do
+   this, so that messages for small overruns are described in terms of the
+   small block rather than the superblock (but if you have a big overrun
+   that skips over a redzone, you could miss an error this way).  See
+   memcheck/tests/custom_alloc.c for an example.
+
+   WARNING: if your allocator uses malloc() or 'new' to allocate
+   superblocks, rather than mmap() or brk(), this will not work properly --
+   you'll likely get assertion failures during leak detection.  This is
+   because Valgrind doesn't like seeing overlapping heap blocks.  Sorry.
+
+   Nb: block must be freed via a free()-like function specified
+   with VALGRIND_FREELIKE_BLOCK or mismatch errors will occur. */
+#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)    \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MALLOCLIKE_BLOCK,      \
+                               addr, sizeB, rzB, is_zeroed, 0);   \
+   }
+
+/* Mark a block of memory as having been freed by a free()-like function.
+   `rzB' is redzone size;  it must match that given to
+   VALGRIND_MALLOCLIKE_BLOCK.  Memory not freed will be detected by the leak
+   checker.  Put it immediately after the point where the block is freed. */
+#define VALGRIND_FREELIKE_BLOCK(addr, rzB)                        \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__FREELIKE_BLOCK,        \
+                               addr, rzB, 0, 0, 0);               \
+   }
+
+/* Create a memory pool. */
+#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed)             \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__CREATE_MEMPOOL,        \
+                               pool, rzB, is_zeroed, 0, 0);       \
+   }
+
+/* Destroy a memory pool. */
+#define VALGRIND_DESTROY_MEMPOOL(pool)                            \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__DESTROY_MEMPOOL,       \
+                               pool, 0, 0, 0, 0);                 \
+   }
+
+/* Associate a piece of memory with a memory pool. */
+#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size)                  \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_ALLOC,         \
+                               pool, addr, size, 0, 0);           \
+   }
+
+/* Disassociate a piece of memory from a memory pool. */
+#define VALGRIND_MEMPOOL_FREE(pool, addr)                         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_FREE,          \
+                               pool, addr, 0, 0, 0);              \
+   }
+
+/* Disassociate any pieces outside a particular range. */
+#define VALGRIND_MEMPOOL_TRIM(pool, addr, size)                   \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_TRIM,          \
+                               pool, addr, size, 0, 0);           \
+   }
+
+/* Tell the tool that the pool anchored at poolA has moved to poolB. */
+#define VALGRIND_MOVE_MEMPOOL(poolA, poolB)                       \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MOVE_MEMPOOL,          \
+                               poolA, poolB, 0, 0, 0);            \
+   }
+
+/* Resize and/or move a piece associated with a memory pool. */
+#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size)         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_CHANGE,        \
+                               pool, addrA, addrB, size, 0);      \
+   }
+
+/* Return 1 if a mempool exists, else 0. */
+#define VALGRIND_MEMPOOL_EXISTS(pool)                             \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_EXISTS,        \
+                               pool, 0, 0, 0, 0);                 \
+    _qzz_res;                                                     \
+   })
+
+/* Mark a piece of memory as being a stack. Returns a stack id. */
+#define VALGRIND_STACK_REGISTER(start, end)                       \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__STACK_REGISTER,        \
+                               start, end, 0, 0, 0);              \
+    _qzz_res;                                                     \
+   })
+
+/* Unmark the piece of memory associated with a stack id as being a
+   stack. */
+#define VALGRIND_STACK_DEREGISTER(id)                             \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__STACK_DEREGISTER,      \
+                               id, 0, 0, 0, 0);                   \
+   }
+
+/* Change the start and end address of the stack id. */
+#define VALGRIND_STACK_CHANGE(id, start, end)                     \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__STACK_CHANGE,          \
+                               id, start, end, 0, 0);             \
+   }
+
+
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#endif   /* __VALGRIND_H */
diff --git a/src/windows/google/tcmalloc.h b/src/windows/google/tcmalloc.h
index 4b97b15..663b7f9 100644
--- a/src/windows/google/tcmalloc.h
+++ b/src/windows/google/tcmalloc.h
@@ -61,7 +61,8 @@
 #endif
 
 #ifdef __cplusplus
-#include <new>  // for nothrow_t
+#include <new>          // for std::nothrow_t
+
 extern "C" {
 #endif
   // Returns a human-readable version string.  If major, minor,
@@ -92,16 +93,15 @@ extern "C" {
 #ifdef __cplusplus
   PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
   PERFTOOLS_DLL_DECL void* tc_new(size_t size);
-  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
-  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
-  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
-
   PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
                                           const std::nothrow_t&) __THROW;
-  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
-                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
   PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
                                             const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
   PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
                                                  const std::nothrow_t&) __THROW;
 }
diff --git a/src/windows/port.cc b/src/windows/port.cc
index 76a9e38..bf3b106 100644
--- a/src/windows/port.cc
+++ b/src/windows/port.cc
@@ -71,7 +71,8 @@ int getpagesize() {
   if (pagesize == 0) {
     SYSTEM_INFO system_info;
     GetSystemInfo(&system_info);
-    pagesize = system_info.dwPageSize;
+    pagesize = std::max(system_info.dwPageSize,
+                        system_info.dwAllocationGranularity);
   }
   return pagesize;
 }
@@ -186,16 +187,16 @@ pthread_key_t PthreadKeyCreate(void (*destr_fn)(void*)) {
 // munmap's in the middle of the page, which is forbidden in windows.
 extern void* TCMalloc_SystemAlloc(size_t size, size_t *actual_size,
                                   size_t alignment) {
-  // Safest is to make actual_size same as input-size.
-  if (actual_size) {
-    *actual_size = size;
-  }
-
   // Align on the pagesize boundary
   const int pagesize = getpagesize();
   if (alignment < pagesize) alignment = pagesize;
   size = ((size + alignment - 1) / alignment) * alignment;
 
+  // Report the alignment-rounded size, not the caller's original size.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
   // Ask for extra memory if alignment > pagesize
   size_t extra = 0;
   if (alignment > pagesize) {
author	csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>	2010-03-23 20:39:55 +0000
committer	csilvers <csilvers@6b5cf1ce-ec42-a296-1ba9-69fdba395a50>	2010-03-23 20:39:55 +0000
commit	92beff88437b31f4a618640b88487e0f8dfb7017 (patch)
tree	d15e670fdc74a690d012c25e16a2d6efa4ab7d26
parent	23dd124970bc11636feaa240394063ba5889ca54 (diff)
download	gperftools-92beff88437b31f4a618640b88487e0f8dfb7017.tar.gz