Diffstat (limited to 'third_party/tcmalloc/chromium')
 third_party/tcmalloc/chromium/src/base/basictypes.h | 8
 third_party/tcmalloc/chromium/src/base/dynamic_annotations.c | 148
 third_party/tcmalloc/chromium/src/base/dynamic_annotations.cc | 110
 third_party/tcmalloc/chromium/src/base/dynamic_annotations.h | 664
 third_party/tcmalloc/chromium/src/base/low_level_alloc.cc | 7
 third_party/tcmalloc/chromium/src/base/vdso_support.cc | 2
 third_party/tcmalloc/chromium/src/central_freelist.cc | 9
 third_party/tcmalloc/chromium/src/common.h | 1
 third_party/tcmalloc/chromium/src/config.h.in | 5
 third_party/tcmalloc/chromium/src/config_linux.h | 5
 third_party/tcmalloc/chromium/src/config_win.h | 8
 third_party/tcmalloc/chromium/src/debugallocation.cc | 35
 third_party/tcmalloc/chromium/src/google/heap-profiler.h | 5
 third_party/tcmalloc/chromium/src/google/profiler.h | 6
 third_party/tcmalloc/chromium/src/google/stacktrace.h | 18
 third_party/tcmalloc/chromium/src/google/tcmalloc.h.in | 14
 third_party/tcmalloc/chromium/src/heap-checker.cc | 155
 third_party/tcmalloc/chromium/src/heap-profile-table.cc | 2
 third_party/tcmalloc/chromium/src/heap-profile-table.h | 4
 third_party/tcmalloc/chromium/src/heap-profiler.cc | 4
 third_party/tcmalloc/chromium/src/internal_logging.h | 4
 third_party/tcmalloc/chromium/src/malloc_extension.cc | 5
 third_party/tcmalloc/chromium/src/malloc_hook.cc | 4
 third_party/tcmalloc/chromium/src/memory_region_map.cc | 52
 third_party/tcmalloc/chromium/src/page_heap.cc | 114
 third_party/tcmalloc/chromium/src/page_heap.h | 71
 third_party/tcmalloc/chromium/src/page_heap_allocator.h | 15
 third_party/tcmalloc/chromium/src/pprof (executable) | 1146
 third_party/tcmalloc/chromium/src/span.h | 4
 third_party/tcmalloc/chromium/src/stacktrace.cc | 40
 third_party/tcmalloc/chromium/src/stacktrace_config.h | 1
 third_party/tcmalloc/chromium/src/stacktrace_generic-inl.h | 69
 third_party/tcmalloc/chromium/src/stacktrace_libunwind-inl.h | 120
 third_party/tcmalloc/chromium/src/stacktrace_powerpc-inl.h | 115
 third_party/tcmalloc/chromium/src/stacktrace_win32-inl.h | 7
 third_party/tcmalloc/chromium/src/stacktrace_x86-inl.h | 245
 third_party/tcmalloc/chromium/src/symbolize.cc | 42
 third_party/tcmalloc/chromium/src/system-alloc.cc | 43
 third_party/tcmalloc/chromium/src/system-alloc.h | 6
 third_party/tcmalloc/chromium/src/tcmalloc.cc | 250
 third_party/tcmalloc/chromium/src/tests/debugallocation_test.cc | 31
 third_party/tcmalloc/chromium/src/tests/heap-checker-death_unittest.sh | 10
 third_party/tcmalloc/chromium/src/tests/page_heap_test.cc | 2
 third_party/tcmalloc/chromium/src/tests/profiler_unittest.cc | 6
 third_party/tcmalloc/chromium/src/tests/profiler_unittest.sh | 25
 third_party/tcmalloc/chromium/src/tests/tcmalloc_unittest.cc | 2
 third_party/tcmalloc/chromium/src/third_party/valgrind.h | 3924
 third_party/tcmalloc/chromium/src/thread_cache.h | 34
 third_party/tcmalloc/chromium/src/windows/addr2line-pdb.c | 13
 third_party/tcmalloc/chromium/src/windows/config.h | 8
 third_party/tcmalloc/chromium/src/windows/google/tcmalloc.h | 14
 third_party/tcmalloc/chromium/src/windows/nm-pdb.c | 9
 third_party/tcmalloc/chromium/src/windows/patch_functions.cc | 328
 53 files changed, 6224 insertions(+), 1745 deletions(-)
diff --git a/third_party/tcmalloc/chromium/src/base/basictypes.h b/third_party/tcmalloc/chromium/src/base/basictypes.h
index 9991413..ab9cdabc 100644
--- a/third_party/tcmalloc/chromium/src/base/basictypes.h
+++ b/third_party/tcmalloc/chromium/src/base/basictypes.h
@@ -308,6 +308,14 @@ class AssignAttributeStartEnd {
#endif // HAVE___ATTRIBUTE__ and __ELF__ or __MACH__
+#if defined(HAVE___ATTRIBUTE__) && (defined(__i386__) || defined(__x86_64__))
+# define CACHELINE_SIZE 64
+# define CACHELINE_ALIGNED __attribute__((aligned(CACHELINE_SIZE)))
+#else
+# define CACHELINE_ALIGNED
+#endif // defined(HAVE___ATTRIBUTE__) && (__i386__ || __x86_64__)
+
+
// The following enum should be used only as a constructor argument to indicate
// that the variable has static storage class, and that the constructor should
// do nothing to its state. It indicates to the reader that it is legal to
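
Note: the CACHELINE_ALIGNED attribute introduced above is what later files in this patch (page_heap.h, thread_cache.h) use to keep hot, frequently-written structures on their own 64-byte cache line. A minimal sketch of the intent, assuming the x86 branch where CACHELINE_SIZE is defined; PerCpuStats and kMaxCpus are hypothetical names, not part of the patch:

    #include "base/basictypes.h"

    // Alignment of 64 forces sizeof(PerCpuStats) up to a multiple of the
    // cache-line size, so entries of the array below never share a line
    // and CPUs updating different entries avoid false sharing.
    struct CACHELINE_ALIGNED PerCpuStats {
      long allocs;
      long frees;
    };

    static const int kMaxCpus = 64;  // hypothetical bound
    static PerCpuStats per_cpu_stats[kMaxCpus];
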
diff --git a/third_party/tcmalloc/chromium/src/base/dynamic_annotations.c b/third_party/tcmalloc/chromium/src/base/dynamic_annotations.c
new file mode 100644
index 0000000..cdefaa7
--- /dev/null
+++ b/third_party/tcmalloc/chromium/src/base/dynamic_annotations.c
@@ -0,0 +1,148 @@
+/* Copyright (c) 2008-2009, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ */
+
+#ifdef __cplusplus
+# error "This file should be built as pure C to avoid name mangling"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "base/dynamic_annotations.h"
+
+#ifdef __GNUC__
+/* valgrind.h uses gcc extensions so it won't build with other compilers */
+# ifdef HAVE_VALGRIND_H /* prefer the user's copy if they have it */
+# include <valgrind.h>
+# else /* otherwise just use the copy that we have */
+# include "third_party/valgrind.h"
+# endif
+#endif
+
+/* Each function is empty and called (via a macro) only in debug mode.
+ The arguments are captured by dynamic tools at runtime. */
+
+#if DYNAMIC_ANNOTATIONS_ENABLED == 1
+
+void AnnotateRWLockCreate(const char *file, int line,
+ const volatile void *lock){}
+void AnnotateRWLockDestroy(const char *file, int line,
+ const volatile void *lock){}
+void AnnotateRWLockAcquired(const char *file, int line,
+ const volatile void *lock, long is_w){}
+void AnnotateRWLockReleased(const char *file, int line,
+ const volatile void *lock, long is_w){}
+void AnnotateBarrierInit(const char *file, int line,
+ const volatile void *barrier, long count,
+ long reinitialization_allowed) {}
+void AnnotateBarrierWaitBefore(const char *file, int line,
+ const volatile void *barrier) {}
+void AnnotateBarrierWaitAfter(const char *file, int line,
+ const volatile void *barrier) {}
+void AnnotateBarrierDestroy(const char *file, int line,
+ const volatile void *barrier) {}
+
+void AnnotateCondVarWait(const char *file, int line,
+ const volatile void *cv,
+ const volatile void *lock){}
+void AnnotateCondVarSignal(const char *file, int line,
+ const volatile void *cv){}
+void AnnotateCondVarSignalAll(const char *file, int line,
+ const volatile void *cv){}
+void AnnotatePublishMemoryRange(const char *file, int line,
+ const volatile void *address,
+ long size){}
+void AnnotateUnpublishMemoryRange(const char *file, int line,
+ const volatile void *address,
+ long size){}
+void AnnotatePCQCreate(const char *file, int line,
+ const volatile void *pcq){}
+void AnnotatePCQDestroy(const char *file, int line,
+ const volatile void *pcq){}
+void AnnotatePCQPut(const char *file, int line,
+ const volatile void *pcq){}
+void AnnotatePCQGet(const char *file, int line,
+ const volatile void *pcq){}
+void AnnotateNewMemory(const char *file, int line,
+ const volatile void *mem,
+ long size){}
+void AnnotateExpectRace(const char *file, int line,
+ const volatile void *mem,
+ const char *description){}
+void AnnotateBenignRace(const char *file, int line,
+ const volatile void *mem,
+ const char *description){}
+void AnnotateBenignRaceSized(const char *file, int line,
+ const volatile void *mem,
+ long size,
+ const char *description) {}
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+ const volatile void *mu){}
+void AnnotateTraceMemory(const char *file, int line,
+ const volatile void *arg){}
+void AnnotateThreadName(const char *file, int line,
+ const char *name){}
+void AnnotateIgnoreReadsBegin(const char *file, int line){}
+void AnnotateIgnoreReadsEnd(const char *file, int line){}
+void AnnotateIgnoreWritesBegin(const char *file, int line){}
+void AnnotateIgnoreWritesEnd(const char *file, int line){}
+void AnnotateEnableRaceDetection(const char *file, int line, int enable){}
+void AnnotateNoOp(const char *file, int line,
+ const volatile void *arg){}
+void AnnotateFlushState(const char *file, int line){}
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED == 1 */
+
+static int GetRunningOnValgrind(void) {
+#ifdef RUNNING_ON_VALGRIND
+ if (RUNNING_ON_VALGRIND) return 1;
+#endif
+ /* TODO(csilvers): use GetenvBeforeMain() instead? Will need to
+    change it to be extern "C". */
+ char *running_on_valgrind_str = getenv("RUNNING_ON_VALGRIND");
+ if (running_on_valgrind_str) {
+ return strcmp(running_on_valgrind_str, "0") != 0;
+ }
+ return 0;
+}
+
+/* See the comments in dynamic_annotations.h */
+int RunningOnValgrind(void) {
+ static volatile int running_on_valgrind = -1;
+ /* C doesn't have thread-safe initialization of statics, and we
+ don't want to depend on pthread_once here, so hack it. */
+ int local_running_on_valgrind = running_on_valgrind;
+ if (local_running_on_valgrind == -1)
+ running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind();
+ return local_running_on_valgrind;
+}
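
Note: RunningOnValgrind() prefers the real Valgrind client request when valgrind.h is available and falls back to the RUNNING_ON_VALGRIND environment variable for tools (e.g. callgrind) that do not intercept the function. A small usage sketch mirroring the debugallocation.cc hunk further down; MaybeInstallDebugAllocator and InstallDebugAllocator are hypothetical names:

    #include "base/dynamic_annotations.h"

    /* The fallback can be forced from a shell, with no Valgrind at all:
     *   RUNNING_ON_VALGRIND=1 ./my_program */
    static void MaybeInstallDebugAllocator(void) {
      if (RunningOnValgrind()) {
        return;  /* Valgrind replaces malloc itself; stay out of its way */
      }
      /* InstallDebugAllocator(); -- hypothetical */
    }
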
diff --git a/third_party/tcmalloc/chromium/src/base/dynamic_annotations.cc b/third_party/tcmalloc/chromium/src/base/dynamic_annotations.cc
deleted file mode 100644
index c8bbcd7..0000000
--- a/third_party/tcmalloc/chromium/src/base/dynamic_annotations.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2008, Google Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---
- * Author: Kostya Serebryany
- */
-
-#include <config.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "base/dynamic_annotations.h"
-#include "base/sysinfo.h"
-
-// Each function is empty and called (via a macro) only in debug mode.
-// The arguments are captured by dynamic tools at runtime.
-
-extern "C" void AnnotateRWLockCreate(const char *file, int line,
- const volatile void *lock){}
-extern "C" void AnnotateRWLockDestroy(const char *file, int line,
- const volatile void *lock){}
-extern "C" void AnnotateRWLockAcquired(const char *file, int line,
- const volatile void *lock, long is_w){}
-extern "C" void AnnotateRWLockReleased(const char *file, int line,
- const volatile void *lock, long is_w){}
-extern "C" void AnnotateCondVarWait(const char *file, int line,
- const volatile void *cv,
- const volatile void *lock){}
-extern "C" void AnnotateCondVarSignal(const char *file, int line,
- const volatile void *cv){}
-extern "C" void AnnotateCondVarSignalAll(const char *file, int line,
- const volatile void *cv){}
-extern "C" void AnnotatePublishMemoryRange(const char *file, int line,
- const volatile void *address,
- long size){}
-extern "C" void AnnotateUnpublishMemoryRange(const char *file, int line,
- const volatile void *address,
- long size){}
-extern "C" void AnnotatePCQCreate(const char *file, int line,
- const volatile void *pcq){}
-extern "C" void AnnotatePCQDestroy(const char *file, int line,
- const volatile void *pcq){}
-extern "C" void AnnotatePCQPut(const char *file, int line,
- const volatile void *pcq){}
-extern "C" void AnnotatePCQGet(const char *file, int line,
- const volatile void *pcq){}
-extern "C" void AnnotateNewMemory(const char *file, int line,
- const volatile void *mem,
- long size){}
-extern "C" void AnnotateExpectRace(const char *file, int line,
- const volatile void *mem,
- const char *description){}
-extern "C" void AnnotateBenignRace(const char *file, int line,
- const volatile void *mem,
- const char *description){}
-extern "C" void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
- const volatile void *mu){}
-extern "C" void AnnotateTraceMemory(const char *file, int line,
- const volatile void *arg){}
-extern "C" void AnnotateThreadName(const char *file, int line,
- const char *name){}
-extern "C" void AnnotateIgnoreReadsBegin(const char *file, int line){}
-extern "C" void AnnotateIgnoreReadsEnd(const char *file, int line){}
-extern "C" void AnnotateIgnoreWritesBegin(const char *file, int line){}
-extern "C" void AnnotateIgnoreWritesEnd(const char *file, int line){}
-extern "C" void AnnotateNoOp(const char *file, int line,
- const volatile void *arg){}
-
-static int GetRunningOnValgrind() {
- const char *running_on_valgrind_str = GetenvBeforeMain("RUNNING_ON_VALGRIND");
- if (running_on_valgrind_str) {
- return strcmp(running_on_valgrind_str, "0") != 0;
- }
- return 0;
-}
-
-// When running under valgrind, this function will be intercepted
-// and a non-zero value will be returned.
-// Some valgrind-based tools (e.g. callgrind) do not intercept functions,
-// so we also read environment variable.
-extern "C" int RunningOnValgrind() {
- static int running_on_valgrind = GetRunningOnValgrind();
- return running_on_valgrind;
-}
diff --git a/third_party/tcmalloc/chromium/src/base/dynamic_annotations.h b/third_party/tcmalloc/chromium/src/base/dynamic_annotations.h
index a2a268f..dae1a14 100644
--- a/third_party/tcmalloc/chromium/src/base/dynamic_annotations.h
+++ b/third_party/tcmalloc/chromium/src/base/dynamic_annotations.h
@@ -1,10 +1,10 @@
/* Copyright (c) 2008, Google Inc.
* All rights reserved.
- *
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
- *
+ *
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
@@ -14,7 +14,7 @@
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
- *
+ *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -31,445 +31,471 @@
* Author: Kostya Serebryany
*/
-// This file defines dynamic annotations for use with dynamic analysis
-// tool such as valgrind, PIN, etc.
-//
-// Dynamic annotation is a source code annotation that affects
-// the generated code (that is, the annotation is not a comment).
-// Each such annotation is attached to a particular
-// instruction and/or to a particular object (address) in the program.
-//
-// The annotations that should be used by users are macros in all upper-case
-// (e.g., ANNOTATE_NEW_MEMORY).
-//
-// Actual implementation of these macros may differ depending on the
-// dynamic analysis tool being used.
-//
-// This file supports the following dynamic analysis tools:
-// - None (NDEBUG is defined).
-// Macros are defined empty.
-// - Helgrind (NDEBUG is not defined).
-// Macros are defined as calls to non-inlinable empty functions
-// that are intercepted by helgrind.
-//
+/* This file defines dynamic annotations for use with dynamic analysis
+ tools such as valgrind, PIN, etc.
+
+ Dynamic annotation is a source code annotation that affects
+ the generated code (that is, the annotation is not a comment).
+ Each such annotation is attached to a particular
+ instruction and/or to a particular object (address) in the program.
+
+ The annotations that should be used by users are macros in all upper-case
+ (e.g., ANNOTATE_NEW_MEMORY).
+
+ Actual implementation of these macros may differ depending on the
+ dynamic analysis tool being used.
+
+ See http://code.google.com/p/data-race-test/ for more information.
+
+ This file supports the following dynamic analysis tools:
+ - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero).
+ Macros are defined empty.
+ - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1).
+ Macros are defined as calls to non-inlinable empty functions
+ that are intercepted by Valgrind. */
+
#ifndef BASE_DYNAMIC_ANNOTATIONS_H_
#define BASE_DYNAMIC_ANNOTATIONS_H_
-#include "base/thread_annotations.h"
-
-// All the annotation macros are in effect only in debug mode.
-#ifndef NDEBUG
-
- // -------------------------------------------------------------
- // Annotations useful when implementing condition variables such as CondVar,
- // using conditional critical sections (Await/LockWhen) and when constructing
- // user-defined synchronization mechanisms.
- //
- // The annotations ANNOTATE_HAPPENS_BEFORE() and ANNOTATE_HAPPENS_AFTER() can
- // be used to define happens-before arcs in user-defined synchronization
- // mechanisms: the race detector will infer an arc from the former to the
- // latter when they share the same argument pointer.
- //
- // Example 1 (reference counting):
- //
- // void Unref() {
- // ANNOTATE_HAPPENS_BEFORE(&refcount_);
- // if (AtomicDecrementByOne(&refcount_) == 0) {
- // ANNOTATE_HAPPENS_AFTER(&refcount_);
- // delete this;
- // }
- // }
- //
- // Example 2 (message queue):
- //
- // void MyQueue::Put(Type *e) {
- // MutexLock lock(&mu_);
- // ANNOTATE_HAPPENS_BEFORE(e);
- // PutElementIntoMyQueue(e);
- // }
- //
- // Type *MyQueue::Get() {
- // MutexLock lock(&mu_);
- // Type *e = GetElementFromMyQueue();
- // ANNOTATE_HAPPENS_AFTER(e);
- // return e;
- // }
- //
- // Note: when possible, please use the existing reference counting and message
- // queue implementations instead of inventing new ones.
-
- // Report that wait on the condition variable at address "cv" has succeeded
- // and the lock at address "lock" is held.
+#ifndef DYNAMIC_ANNOTATIONS_ENABLED
+# define DYNAMIC_ANNOTATIONS_ENABLED 0
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0
+
+ /* -------------------------------------------------------------
+ Annotations useful when implementing condition variables such as CondVar,
+ using conditional critical sections (Await/LockWhen) and when constructing
+ user-defined synchronization mechanisms.
+
+ The annotations ANNOTATE_HAPPENS_BEFORE() and ANNOTATE_HAPPENS_AFTER() can
+ be used to define happens-before arcs in user-defined synchronization
+ mechanisms: the race detector will infer an arc from the former to the
+ latter when they share the same argument pointer.
+
+ Example 1 (reference counting):
+
+ void Unref() {
+ ANNOTATE_HAPPENS_BEFORE(&refcount_);
+ if (AtomicDecrementByOne(&refcount_) == 0) {
+ ANNOTATE_HAPPENS_AFTER(&refcount_);
+ delete this;
+ }
+ }
+
+ Example 2 (message queue):
+
+ void MyQueue::Put(Type *e) {
+ MutexLock lock(&mu_);
+ ANNOTATE_HAPPENS_BEFORE(e);
+ PutElementIntoMyQueue(e);
+ }
+
+ Type *MyQueue::Get() {
+ MutexLock lock(&mu_);
+ Type *e = GetElementFromMyQueue();
+ ANNOTATE_HAPPENS_AFTER(e);
+ return e;
+ }
+
+ Note: when possible, please use the existing reference counting and message
+ queue implementations instead of inventing new ones. */
+
+ /* Report that wait on the condition variable at address "cv" has succeeded
+ and the lock at address "lock" is held. */
#define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \
AnnotateCondVarWait(__FILE__, __LINE__, cv, lock)
- // Report that wait on the condition variable at "cv" has succeeded. Variant
- // w/o lock.
+ /* Report that wait on the condition variable at "cv" has succeeded. Variant
+ w/o lock. */
#define ANNOTATE_CONDVAR_WAIT(cv) \
AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL)
- // Report that we are about to signal on the condition variable at address
- // "cv".
+ /* Report that we are about to signal on the condition variable at address
+ "cv". */
#define ANNOTATE_CONDVAR_SIGNAL(cv) \
AnnotateCondVarSignal(__FILE__, __LINE__, cv)
- // Report that we are about to signal_all on the condition variable at "cv".
+ /* Report that we are about to signal_all on the condition variable at "cv". */
#define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \
AnnotateCondVarSignalAll(__FILE__, __LINE__, cv)
- // Annotations for user-defined synchronization mechanisms.
+ /* Annotations for user-defined synchronization mechanisms. */
#define ANNOTATE_HAPPENS_BEFORE(obj) ANNOTATE_CONDVAR_SIGNAL(obj)
#define ANNOTATE_HAPPENS_AFTER(obj) ANNOTATE_CONDVAR_WAIT(obj)
- // Report that the bytes in the range [pointer, pointer+size) are about
- // to be published safely. The race checker will create a happens-before
- // arc from the call ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
- // subsequent accesses to this memory.
+ /* Report that the bytes in the range [pointer, pointer+size) are about
+ to be published safely. The race checker will create a happens-before
+ arc from the call ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
+ subsequent accesses to this memory.
+ Note: this annotation may not work properly if the race detector uses
+ sampling, i.e. does not observe all memory accesses.
+ */
#define ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \
AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size)
- // Report that the bytes in the range [pointer, pointer+size) are not shared
- // between threads any more and can be safely used by the current thread w/o
- // synchronization. The race checker will create a happens-before arc from
- // all previous accesses to this memory to this call.
- //
- // This annotation could be applied to complex objects, such as STL
- // containers, with one condition: the accesses to the object itself
- // and its internal data should not be separated with any synchronization.
- //
- // Example that works:
- //
- // map<int, int> the_map;
- // void Thread1() {
- // MutexLock lock(&mu);
- // // Ok: accesses to the_map and its internal data is not separated by
- // // synchronization.
- // the_map[1]++;
- // }
- // void Thread2() {
- // {
- // MutexLock lock(&mu);
- // ...
- // // because of some reason we know that the_map will not be used by
- // // other threads any more
- // ANNOTATE_UNPUBLISH_MEMORY_RANGE(&the_map, sizeof(the_map));
- // }
- // the_map->DoSomething();
- // }
- //
- // Example that does not work (due to the way happens-before arcs are
- // represented in some race detectors):
- //
- // void Thread1() {
- // MutexLock lock(&mu);
- // int *guts_of_the_map = &(*the_map)[1];
- // // we have some synchronization between access to 'c' and its guts.
- // // This will make ANNOTATE_UNPUBLISH_MEMORY_RANGE in Thread2 useless.
- // some_other_lock_or_other_synchronization_utility.Lock();
- // (*guts_of_the_map)++;
- // ...
- // }
- //
- // void Thread1() { // same as above...
+ /* DEPRECATED. Don't use it. */
#define ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size) \
AnnotateUnpublishMemoryRange(__FILE__, __LINE__, pointer, size)
- // This annotation should be used to annotate thread-safe swapping of
- // containers. Required only when using hybrid (i.e. not pure happens-before)
- // detectors.
- //
- // This annotation has the same limitation as ANNOTATE_UNPUBLISH_MEMORY_RANGE
- // (see above).
- //
- // Example:
- // map<int, int> the_map;
- // void Thread1() {
- // MutexLock lock(&mu);
- // the_map[1]++;
- // }
- // void Thread2() {
- // map<int,int> tmp;
- // {
- // MutexLock lock(&mu);
- // the_map.swap(tmp);
- // ANNOTATE_SWAP_MEMORY_RANGE(&the_map, sizeof(the_map));
- // }
- // tmp->DoSomething();
- // }
+ /* DEPRECATED. Don't use it. */
#define ANNOTATE_SWAP_MEMORY_RANGE(pointer, size) \
do { \
ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size); \
ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size); \
} while (0)
- // Instruct the tool to create a happens-before arc between mu->Unlock() and
- // mu->Lock(). This annotation may slow down the race detector and hide real
- // races. Normally it is used only when it would be difficult to annotate each
- // of the mutex's critical sections individually using the annotations above.
- // This annotation makes sense only for hybrid race detectors. For pure
- // happens-before detectors this is a no-op. For more details see
- // http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid .
+ /* Instruct the tool to create a happens-before arc between mu->Unlock() and
+ mu->Lock(). This annotation may slow down the race detector and hide real
+ races. Normally it is used only when it would be difficult to annotate each
+ of the mutex's critical sections individually using the annotations above.
+ This annotation makes sense only for hybrid race detectors. For pure
+ happens-before detectors this is a no-op. For more details see
+ http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . */
#define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \
AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
- // Deprecated. Use ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX.
+ /* Deprecated. Use ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX. */
#define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) \
AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
- // -------------------------------------------------------------
- // Annotations useful when defining memory allocators, or when memory that
- // was protected in one way starts to be protected in another.
+ /* -------------------------------------------------------------
+ Annotations useful when defining memory allocators, or when memory that
+ was protected in one way starts to be protected in another. */
- // Report that a new memory at "address" of size "size" has been allocated.
- // This might be used when the memory has been retrieved from a free list and
- // is about to be reused, or when a the locking discipline for a variable
- // changes.
+ /* Report that a new memory at "address" of size "size" has been allocated.
+ This might be used when the memory has been retrieved from a free list and
+ is about to be reused, or when the locking discipline for a variable
+ changes. */
#define ANNOTATE_NEW_MEMORY(address, size) \
AnnotateNewMemory(__FILE__, __LINE__, address, size)
- // -------------------------------------------------------------
- // Annotations useful when defining FIFO queues that transfer data between
- // threads.
+ /* -------------------------------------------------------------
+ Annotations useful when defining FIFO queues that transfer data between
+ threads. */
- // Report that the producer-consumer queue (such as ProducerConsumerQueue) at
- // address "pcq" has been created. The ANNOTATE_PCQ_* annotations
- // should be used only for FIFO queues. For non-FIFO queues use
- // ANNOTATE_HAPPENS_BEFORE (for put) and ANNOTATE_HAPPENS_AFTER (for get).
+ /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at
+ address "pcq" has been created. The ANNOTATE_PCQ_* annotations
+ should be used only for FIFO queues. For non-FIFO queues use
+ ANNOTATE_HAPPENS_BEFORE (for put) and ANNOTATE_HAPPENS_AFTER (for get). */
#define ANNOTATE_PCQ_CREATE(pcq) \
AnnotatePCQCreate(__FILE__, __LINE__, pcq)
- // Report that the queue at address "pcq" is about to be destroyed.
+ /* Report that the queue at address "pcq" is about to be destroyed. */
#define ANNOTATE_PCQ_DESTROY(pcq) \
AnnotatePCQDestroy(__FILE__, __LINE__, pcq)
- // Report that we are about to put an element into a FIFO queue at address
- // "pcq".
+ /* Report that we are about to put an element into a FIFO queue at address
+ "pcq". */
#define ANNOTATE_PCQ_PUT(pcq) \
AnnotatePCQPut(__FILE__, __LINE__, pcq)
- // Report that we've just got an element from a FIFO queue at address "pcq".
+ /* Report that we've just got an element from a FIFO queue at address "pcq". */
#define ANNOTATE_PCQ_GET(pcq) \
AnnotatePCQGet(__FILE__, __LINE__, pcq)
- // -------------------------------------------------------------
- // Annotations that suppress errors. It is usually better to express the
- // program's synchronization using the other annotations, but these can
- // be used when all else fails.
-
- // Report that we may have a benign race on at "address".
- // Insert at the point where "address" has been allocated, preferably close
- // to the point where the race happens.
- // See also ANNOTATE_BENIGN_RACE_STATIC.
- #define ANNOTATE_BENIGN_RACE(address, description) \
- AnnotateBenignRace(__FILE__, __LINE__, address, description)
-
- // Request the analysis tool to ignore all reads in the current thread
- // until ANNOTATE_IGNORE_READS_END is called.
- // Useful to ignore intentional racey reads, while still checking
- // other reads and all writes.
- // See also ANNOTATE_UNPROTECTED_READ.
+ /* -------------------------------------------------------------
+ Annotations that suppress errors. It is usually better to express the
+ program's synchronization using the other annotations, but these can
+ be used when all else fails. */
+
+ /* Report that we may have a benign race at "pointer", with size
+ "sizeof(*(pointer))". "pointer" must be a non-void* pointer. Insert at the
+ point where "pointer" has been allocated, preferably close to the point
+ where the race happens. See also ANNOTATE_BENIGN_RACE_STATIC. */
+ #define ANNOTATE_BENIGN_RACE(pointer, description) \
+ AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \
+ sizeof(*(pointer)), description)
+
+ /* Same as ANNOTATE_BENIGN_RACE(address, description), but applies to
+ the memory range [address, address+size). */
+ #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \
+ AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description)
+
+ /* Request the analysis tool to ignore all reads in the current thread
+ until ANNOTATE_IGNORE_READS_END is called.
+ Useful to ignore intentional racey reads, while still checking
+ other reads and all writes.
+ See also ANNOTATE_UNPROTECTED_READ. */
#define ANNOTATE_IGNORE_READS_BEGIN() \
AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
- // Stop ignoring reads.
+ /* Stop ignoring reads. */
#define ANNOTATE_IGNORE_READS_END() \
AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
- // Similar to ANNOTATE_IGNORE_READS_BEGIN, but ignore writes.
+ /* Similar to ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */
#define ANNOTATE_IGNORE_WRITES_BEGIN() \
AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
- // Stop ignoring writes.
+ /* Stop ignoring writes. */
#define ANNOTATE_IGNORE_WRITES_END() \
AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
- // Start ignoring all memory accesses (reads and writes).
+ /* Start ignoring all memory accesses (reads and writes). */
#define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \
do {\
ANNOTATE_IGNORE_READS_BEGIN();\
ANNOTATE_IGNORE_WRITES_BEGIN();\
}while(0)\
- // Stop ignoring all memory accesses.
+ /* Stop ignoring all memory accesses. */
#define ANNOTATE_IGNORE_READS_AND_WRITES_END() \
do {\
ANNOTATE_IGNORE_WRITES_END();\
ANNOTATE_IGNORE_READS_END();\
}while(0)\
- // -------------------------------------------------------------
- // Annotations useful for debugging.
+ /* Enable (enable!=0) or disable (enable==0) race detection for all threads.
+ This annotation could be useful if you want to skip expensive race analysis
+ during some period of program execution, e.g. during initialization. */
+ #define ANNOTATE_ENABLE_RACE_DETECTION(enable) \
+ AnnotateEnableRaceDetection(__FILE__, __LINE__, enable)
- // Request to trace every access to "address".
+ /* -------------------------------------------------------------
+ Annotations useful for debugging. */
+
+ /* Request to trace every access to "address". */
#define ANNOTATE_TRACE_MEMORY(address) \
AnnotateTraceMemory(__FILE__, __LINE__, address)
- // Report the current thread name to a race detector.
+ /* Report the current thread name to a race detector. */
#define ANNOTATE_THREAD_NAME(name) \
AnnotateThreadName(__FILE__, __LINE__, name)
- // -------------------------------------------------------------
- // Annotations useful when implementing locks. They are not
- // normally needed by modules that merely use locks.
- // The "lock" argument is a pointer to the lock object.
+ /* -------------------------------------------------------------
+ Annotations useful when implementing locks. They are not
+ normally needed by modules that merely use locks.
+ The "lock" argument is a pointer to the lock object. */
- // Report that a lock has been created at address "lock".
+ /* Report that a lock has been created at address "lock". */
#define ANNOTATE_RWLOCK_CREATE(lock) \
AnnotateRWLockCreate(__FILE__, __LINE__, lock)
- // Report that the lock at address "lock" is about to be destroyed.
+ /* Report that the lock at address "lock" is about to be destroyed. */
#define ANNOTATE_RWLOCK_DESTROY(lock) \
AnnotateRWLockDestroy(__FILE__, __LINE__, lock)
- // Report that the lock at address "lock" has been acquired.
- // is_w=1 for writer lock, is_w=0 for reader lock.
+ /* Report that the lock at address "lock" has been acquired.
+ is_w=1 for writer lock, is_w=0 for reader lock. */
#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \
AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w)
- // Report that the lock at address "lock" is about to be released.
+ /* Report that the lock at address "lock" is about to be released. */
#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) \
AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w)
- // -------------------------------------------------------------
- // Annotations useful for testing race detectors.
+ /* -------------------------------------------------------------
+ Annotations useful when implementing barriers. They are not
+ normally needed by modules that merely use barriers.
+ The "barrier" argument is a pointer to the barrier object. */
+
+ /* Report that the "barrier" has been initialized with initial "count".
+ If 'reinitialization_allowed' is true, initialization is allowed to happen
+ multiple times w/o calling barrier_destroy() */
+ #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \
+ AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \
+ reinitialization_allowed)
+
+ /* Report that we are about to enter barrier_wait("barrier"). */
+ #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \
+ AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier)
+
+ /* Report that we just exited barrier_wait("barrier"). */
+ #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) \
+ AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier)
+
+ /* Report that the "barrier" has been destroyed. */
+ #define ANNOTATE_BARRIER_DESTROY(barrier) \
+ AnnotateBarrierDestroy(__FILE__, __LINE__, barrier)
+
+ /* -------------------------------------------------------------
+ Annotations useful for testing race detectors. */
- // Report that we expect a race on the variable at "address".
- // Use only in unit tests for a race detector.
+ /* Report that we expect a race on the variable at "address".
+ Use only in unit tests for a race detector. */
#define ANNOTATE_EXPECT_RACE(address, description) \
AnnotateExpectRace(__FILE__, __LINE__, address, description)
- // A no-op. Insert where you like to test the interceptors.
+ /* A no-op. Insert where you like to test the interceptors. */
#define ANNOTATE_NO_OP(arg) \
AnnotateNoOp(__FILE__, __LINE__, arg)
-#else // NDEBUG is defined
-
- #define ANNOTATE_RWLOCK_CREATE(lock) // empty
- #define ANNOTATE_RWLOCK_DESTROY(lock) // empty
- #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) // empty
- #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) // empty
- #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) // empty
- #define ANNOTATE_CONDVAR_WAIT(cv) // empty
- #define ANNOTATE_CONDVAR_SIGNAL(cv) // empty
- #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) // empty
- #define ANNOTATE_HAPPENS_BEFORE(obj) // empty
- #define ANNOTATE_HAPPENS_AFTER(obj) // empty
- #define ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) // empty
- #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size) // empty
- #define ANNOTATE_SWAP_MEMORY_RANGE(address, size) // empty
- #define ANNOTATE_PCQ_CREATE(pcq) // empty
- #define ANNOTATE_PCQ_DESTROY(pcq) // empty
- #define ANNOTATE_PCQ_PUT(pcq) // empty
- #define ANNOTATE_PCQ_GET(pcq) // empty
- #define ANNOTATE_NEW_MEMORY(address, size) // empty
- #define ANNOTATE_EXPECT_RACE(address, description) // empty
- #define ANNOTATE_BENIGN_RACE(address, description) // empty
- #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) // empty
- #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) // empty
- #define ANNOTATE_TRACE_MEMORY(arg) // empty
- #define ANNOTATE_THREAD_NAME(name) // empty
- #define ANNOTATE_IGNORE_READS_BEGIN() // empty
- #define ANNOTATE_IGNORE_READS_END() // empty
- #define ANNOTATE_IGNORE_WRITES_BEGIN() // empty
- #define ANNOTATE_IGNORE_WRITES_END() // empty
- #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() // empty
- #define ANNOTATE_IGNORE_READS_AND_WRITES_END() // empty
- #define ANNOTATE_NO_OP(arg) // empty
-
-#endif // NDEBUG
-
-// Use the macros above rather than using these functions directly.
-extern "C" void AnnotateRWLockCreate(const char *file, int line,
- const volatile void *lock);
-extern "C" void AnnotateRWLockDestroy(const char *file, int line,
- const volatile void *lock);
-extern "C" void AnnotateRWLockAcquired(const char *file, int line,
- const volatile void *lock, long is_w);
-extern "C" void AnnotateRWLockReleased(const char *file, int line,
- const volatile void *lock, long is_w);
-extern "C" void AnnotateCondVarWait(const char *file, int line,
- const volatile void *cv,
- const volatile void *lock);
-extern "C" void AnnotateCondVarSignal(const char *file, int line,
- const volatile void *cv);
-extern "C" void AnnotateCondVarSignalAll(const char *file, int line,
- const volatile void *cv);
-extern "C" void AnnotatePublishMemoryRange(const char *file, int line,
- const volatile void *address,
- long size);
-extern "C" void AnnotateUnpublishMemoryRange(const char *file, int line,
- const volatile void *address,
- long size);
-extern "C" void AnnotatePCQCreate(const char *file, int line,
- const volatile void *pcq);
-extern "C" void AnnotatePCQDestroy(const char *file, int line,
- const volatile void *pcq);
-extern "C" void AnnotatePCQPut(const char *file, int line,
- const volatile void *pcq);
-extern "C" void AnnotatePCQGet(const char *file, int line,
- const volatile void *pcq);
-extern "C" void AnnotateNewMemory(const char *file, int line,
+ /* Force the race detector to flush its state. The actual effect depends on
+ * the implementation of the detector. */
+ #define ANNOTATE_FLUSH_STATE() \
+ AnnotateFlushState(__FILE__, __LINE__)
+
+
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+ #define ANNOTATE_RWLOCK_CREATE(lock) /* empty */
+ #define ANNOTATE_RWLOCK_DESTROY(lock) /* empty */
+ #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */
+ #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */
+ #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* */
+ #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */
+ #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */
+ #define ANNOTATE_BARRIER_DESTROY(barrier) /* empty */
+ #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */
+ #define ANNOTATE_CONDVAR_WAIT(cv) /* empty */
+ #define ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */
+ #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */
+ #define ANNOTATE_HAPPENS_BEFORE(obj) /* empty */
+ #define ANNOTATE_HAPPENS_AFTER(obj) /* empty */
+ #define ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */
+ #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size) /* empty */
+ #define ANNOTATE_SWAP_MEMORY_RANGE(address, size) /* empty */
+ #define ANNOTATE_PCQ_CREATE(pcq) /* empty */
+ #define ANNOTATE_PCQ_DESTROY(pcq) /* empty */
+ #define ANNOTATE_PCQ_PUT(pcq) /* empty */
+ #define ANNOTATE_PCQ_GET(pcq) /* empty */
+ #define ANNOTATE_NEW_MEMORY(address, size) /* empty */
+ #define ANNOTATE_EXPECT_RACE(address, description) /* empty */
+ #define ANNOTATE_BENIGN_RACE(address, description) /* empty */
+ #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */
+ #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */
+ #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty */
+ #define ANNOTATE_TRACE_MEMORY(arg) /* empty */
+ #define ANNOTATE_THREAD_NAME(name) /* empty */
+ #define ANNOTATE_IGNORE_READS_BEGIN() /* empty */
+ #define ANNOTATE_IGNORE_READS_END() /* empty */
+ #define ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */
+ #define ANNOTATE_IGNORE_WRITES_END() /* empty */
+ #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */
+ #define ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */
+ #define ANNOTATE_ENABLE_RACE_DETECTION(enable) /* empty */
+ #define ANNOTATE_NO_OP(arg) /* empty */
+ #define ANNOTATE_FLUSH_STATE() /* empty */
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+/* Use the macros above rather than using these functions directly. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+void AnnotateRWLockCreate(const char *file, int line,
+ const volatile void *lock);
+void AnnotateRWLockDestroy(const char *file, int line,
+ const volatile void *lock);
+void AnnotateRWLockAcquired(const char *file, int line,
+ const volatile void *lock, long is_w);
+void AnnotateRWLockReleased(const char *file, int line,
+ const volatile void *lock, long is_w);
+void AnnotateBarrierInit(const char *file, int line,
+ const volatile void *barrier, long count,
+ long reinitialization_allowed);
+void AnnotateBarrierWaitBefore(const char *file, int line,
+ const volatile void *barrier);
+void AnnotateBarrierWaitAfter(const char *file, int line,
+ const volatile void *barrier);
+void AnnotateBarrierDestroy(const char *file, int line,
+ const volatile void *barrier);
+void AnnotateCondVarWait(const char *file, int line,
+ const volatile void *cv,
+ const volatile void *lock);
+void AnnotateCondVarSignal(const char *file, int line,
+ const volatile void *cv);
+void AnnotateCondVarSignalAll(const char *file, int line,
+ const volatile void *cv);
+void AnnotatePublishMemoryRange(const char *file, int line,
+ const volatile void *address,
+ long size);
+void AnnotateUnpublishMemoryRange(const char *file, int line,
const volatile void *address,
long size);
-extern "C" void AnnotateExpectRace(const char *file, int line,
- const volatile void *address,
- const char *description);
-extern "C" void AnnotateBenignRace(const char *file, int line,
- const volatile void *address,
- const char *description);
-extern "C" void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
- const volatile void *mu);
-extern "C" void AnnotateTraceMemory(const char *file, int line,
- const volatile void *arg);
-extern "C" void AnnotateThreadName(const char *file, int line,
- const char *name);
-extern "C" void AnnotateIgnoreReadsBegin(const char *file, int line);
-extern "C" void AnnotateIgnoreReadsEnd(const char *file, int line);
-extern "C" void AnnotateIgnoreWritesBegin(const char *file, int line);
-extern "C" void AnnotateIgnoreWritesEnd(const char *file, int line);
-extern "C" void AnnotateNoOp(const char *file, int line,
- const volatile void *arg);
-
-#ifndef NDEBUG
-
- // ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads.
- //
- // Instead of doing
- // ANNOTATE_IGNORE_READS_BEGIN();
- // ... = x;
- // ANNOTATE_IGNORE_READS_END();
- // one can use
- // ... = ANNOTATE_UNPROTECTED_READ(x);
+void AnnotatePCQCreate(const char *file, int line,
+ const volatile void *pcq);
+void AnnotatePCQDestroy(const char *file, int line,
+ const volatile void *pcq);
+void AnnotatePCQPut(const char *file, int line,
+ const volatile void *pcq);
+void AnnotatePCQGet(const char *file, int line,
+ const volatile void *pcq);
+void AnnotateNewMemory(const char *file, int line,
+ const volatile void *address,
+ long size);
+void AnnotateExpectRace(const char *file, int line,
+ const volatile void *address,
+ const char *description);
+void AnnotateBenignRace(const char *file, int line,
+ const volatile void *address,
+ const char *description);
+void AnnotateBenignRaceSized(const char *file, int line,
+ const volatile void *address,
+ long size,
+ const char *description);
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+ const volatile void *mu);
+void AnnotateTraceMemory(const char *file, int line,
+ const volatile void *arg);
+void AnnotateThreadName(const char *file, int line,
+ const char *name);
+void AnnotateIgnoreReadsBegin(const char *file, int line);
+void AnnotateIgnoreReadsEnd(const char *file, int line);
+void AnnotateIgnoreWritesBegin(const char *file, int line);
+void AnnotateIgnoreWritesEnd(const char *file, int line);
+void AnnotateEnableRaceDetection(const char *file, int line, int enable);
+void AnnotateNoOp(const char *file, int line,
+ const volatile void *arg);
+void AnnotateFlushState(const char *file, int line);
+
+/* Return non-zero value if running under valgrind.
+
+ If "valgrind.h" is included into dynamic_annotations.c,
+ the regular valgrind mechanism will be used.
+ See http://valgrind.org/docs/manual/manual-core-adv.html about
+ RUNNING_ON_VALGRIND and other valgrind "client requests".
+ The file "valgrind.h" may be obtained by doing
+ svn co svn://svn.valgrind.org/valgrind/trunk/include
+
+ If for some reason you can't use "valgrind.h" or want to fake valgrind,
+ there are two ways to make this function return non-zero:
+ - Use environment variable: export RUNNING_ON_VALGRIND=1
+ - Make your tool intercept the function RunningOnValgrind() and
+ change its return value.
+ */
+int RunningOnValgrind(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus)
+
+ /* ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads.
+
+ Instead of doing
+ ANNOTATE_IGNORE_READS_BEGIN();
+ ... = x;
+ ANNOTATE_IGNORE_READS_END();
+ one can use
+ ... = ANNOTATE_UNPROTECTED_READ(x); */
template <class T>
- inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x)
- NO_THREAD_SAFETY_ANALYSIS {
+ inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) {
ANNOTATE_IGNORE_READS_BEGIN();
T res = x;
ANNOTATE_IGNORE_READS_END();
return res;
}
-
- // Apply ANNOTATE_BENIGN_RACE to a static variable.
+ /* Apply ANNOTATE_BENIGN_RACE_SIZED to a static variable. */
#define ANNOTATE_BENIGN_RACE_STATIC(static_var, description) \
namespace { \
class static_var ## _annotator { \
public: \
static_var ## _annotator() { \
- ANNOTATE_BENIGN_RACE(&static_var, \
+ ANNOTATE_BENIGN_RACE_SIZED(&static_var, \
+ sizeof(static_var), \
# static_var ": " description); \
} \
}; \
static static_var ## _annotator the ## static_var ## _annotator;\
}
-#else // !NDEBUG
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
#define ANNOTATE_UNPROTECTED_READ(x) (x)
- #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description) // empty
-
-#endif // !NDEBUG
-
-// Return non-zero value if running under valgrind.
-extern "C" int RunningOnValgrind();
+ #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description) /* empty */
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED */
-#endif // BASE_DYNAMIC_ANNOTATIONS_H_
+#endif /* BASE_DYNAMIC_ANNOTATIONS_H_ */
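
Note: two practical changes in this header are that ANNOTATE_BENIGN_RACE is now size-aware (it expands to AnnotateBenignRaceSized with sizeof(*(pointer))), and that ANNOTATE_UNPROTECTED_READ remains the preferred way to do an intentionally racy read. A short sketch assuming DYNAMIC_ANNOTATIONS_ENABLED is 1; the `hits` counter is hypothetical:

    #include "base/dynamic_annotations.h"

    static int hits = 0;
    // Suppress race reports on 'hits' for its whole lifetime.
    ANNOTATE_BENIGN_RACE_STATIC(hits, "stats only; lost updates are fine");

    void RecordHit() {
      hits++;  // intentionally unsynchronized
    }

    int ReadHits() {
      // Cleaner than an IGNORE_READS_BEGIN/END pair around the read.
      return ANNOTATE_UNPROTECTED_READ(hits);
    }

When the annotations are disabled, ANNOTATE_UNPROTECTED_READ(x) is just (x) and the static annotator compiles away to nothing.
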
diff --git a/third_party/tcmalloc/chromium/src/base/low_level_alloc.cc b/third_party/tcmalloc/chromium/src/base/low_level_alloc.cc
index 2bbce54..7ca3953a 100644
--- a/third_party/tcmalloc/chromium/src/base/low_level_alloc.cc
+++ b/third_party/tcmalloc/chromium/src/base/low_level_alloc.cc
@@ -210,8 +210,9 @@ static const intptr_t kMagicUnallocated = ~kMagicAllocated;
namespace {
class ArenaLock {
public:
- explicit ArenaLock(LowLevelAlloc::Arena *arena) :
- left_(false), mask_valid_(false), arena_(arena) {
+ explicit ArenaLock(LowLevelAlloc::Arena *arena)
+ EXCLUSIVE_LOCK_FUNCTION(arena->mu)
+ : left_(false), mask_valid_(false), arena_(arena) {
if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
// We've decided not to support async-signal-safe arena use until
// there is a demonstrated need. Here's how one could do it though
@@ -228,7 +229,7 @@ namespace {
this->arena_->mu.Lock();
}
~ArenaLock() { RAW_CHECK(this->left_, "haven't left Arena region"); }
- void Leave() {
+ void Leave() UNLOCK_FUNCTION(arena_->mu) {
this->arena_->mu.Unlock();
#if 0
if (this->mask_valid_) {
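
Note: the EXCLUSIVE_LOCK_FUNCTION and UNLOCK_FUNCTION attributes (from base/thread_annotations.h) tell static thread-safety analysis that constructing an ArenaLock acquires arena->mu and that Leave() releases it. The general shape of the pattern, sketched with a hypothetical Mutex and Guard class:

    class Guard {
     public:
      explicit Guard(Mutex* mu) EXCLUSIVE_LOCK_FUNCTION(mu) : mu_(mu) {
        mu_->Lock();    // analysis: mu is held from here on
      }
      ~Guard() UNLOCK_FUNCTION() {
        mu_->Unlock();  // analysis: mu is released at scope exit
      }
     private:
      Mutex* mu_;
    };
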
diff --git a/third_party/tcmalloc/chromium/src/base/vdso_support.cc b/third_party/tcmalloc/chromium/src/base/vdso_support.cc
index ddaca37..fce7c2c 100644
--- a/third_party/tcmalloc/chromium/src/base/vdso_support.cc
+++ b/third_party/tcmalloc/chromium/src/base/vdso_support.cc
@@ -42,8 +42,8 @@
#include <fcntl.h>
#include "base/atomicops.h" // for MemoryBarrier
-#include "base/logging.h"
#include "base/linux_syscall_support.h"
+#include "base/logging.h"
#include "base/dynamic_annotations.h"
#include "base/basictypes.h" // for COMPILE_ASSERT
diff --git a/third_party/tcmalloc/chromium/src/central_freelist.cc b/third_party/tcmalloc/chromium/src/central_freelist.cc
index 674ff9b..5b7dfbb 100644
--- a/third_party/tcmalloc/chromium/src/central_freelist.cc
+++ b/third_party/tcmalloc/chromium/src/central_freelist.cc
@@ -266,8 +266,7 @@ void CentralFreeList::Populate() {
Span* span;
{
SpinLockHolder h(Static::pageheap_lock());
- span = Static::pageheap()->New(npages);
- if (span) Static::pageheap()->RegisterSizeClass(span, size_class_);
+ span = Static::pageheap()->New(npages, size_class_, kPageSize);
}
if (span == NULL) {
MESSAGE("tcmalloc: allocation failed", npages << kPageShift);
@@ -275,12 +274,6 @@ void CentralFreeList::Populate() {
return;
}
ASSERT(span->length == npages);
- // Cache sizeclass info eagerly. Locking is not necessary.
- // (Instead of being eager, we could just replace any stale info
- // about this span, but that seems to be no better in practice.)
- for (int i = 0; i < npages; i++) {
- Static::pageheap()->CacheSizeClass(span->start + i, size_class_);
- }
// Split the block into pieces and add to the free-list
// TODO: coloring of objects to avoid cache conflicts?
diff --git a/third_party/tcmalloc/chromium/src/common.h b/third_party/tcmalloc/chromium/src/common.h
index 53a0a0b..f9557c9 100644
--- a/third_party/tcmalloc/chromium/src/common.h
+++ b/third_party/tcmalloc/chromium/src/common.h
@@ -62,6 +62,7 @@ static const size_t kPageSize = 1 << kPageShift;
static const size_t kMaxSize = 8u * kPageSize;
static const size_t kAlignment = 8;
static const size_t kNumClasses = 61;
+static const size_t kLargeSizeClass = 0;
// Maximum length we allow a per-thread free-list to have before we
// move objects from it into the corresponding central free-list. We
diff --git a/third_party/tcmalloc/chromium/src/config.h.in b/third_party/tcmalloc/chromium/src/config.h.in
index 1ad2642..49bbf0d 100644
--- a/third_party/tcmalloc/chromium/src/config.h.in
+++ b/third_party/tcmalloc/chromium/src/config.h.in
@@ -132,7 +132,7 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
-/* Define to 1 if you have the <sys/ucontext.h> header file. */
+/* <sys/ucontext.h> is broken on redhat 7 */
#undef HAVE_SYS_UCONTEXT_H
/* Define to 1 if you have the <sys/wait.h> header file. */
@@ -150,6 +150,9 @@
/* Define to 1 if you have the <unwind.h> header file. */
#undef HAVE_UNWIND_H
+/* Define to 1 if you have the <valgrind.h> header file. */
+#undef HAVE_VALGRIND_H
+
/* define if your compiler has __attribute__ */
#undef HAVE___ATTRIBUTE__
diff --git a/third_party/tcmalloc/chromium/src/config_linux.h b/third_party/tcmalloc/chromium/src/config_linux.h
index 398f303..9786b3e 100644
--- a/third_party/tcmalloc/chromium/src/config_linux.h
+++ b/third_party/tcmalloc/chromium/src/config_linux.h
@@ -136,7 +136,7 @@
/* Define to 1 if compiler supports __thread */
#define HAVE_TLS 1
-/* Define to 1 if you have the <ucontext.h> header file. */
+/* <sys/ucontext.h> is broken on redhat 7 */
#define HAVE_UCONTEXT_H 1
/* Define to 1 if you have the <unistd.h> header file. */
@@ -145,6 +145,9 @@
/* Define to 1 if you have the <unwind.h> header file. */
#define HAVE_UNWIND_H 1
+/* Define to 1 if you have the <valgrind.h> header file. */
+#undef HAVE_VALGRIND_H
+
/* define if your compiler has __attribute__ */
#define HAVE___ATTRIBUTE__ 1
diff --git a/third_party/tcmalloc/chromium/src/config_win.h b/third_party/tcmalloc/chromium/src/config_win.h
index 30daf4f..236bd6b 100644
--- a/third_party/tcmalloc/chromium/src/config_win.h
+++ b/third_party/tcmalloc/chromium/src/config_win.h
@@ -255,10 +255,12 @@
// ---------------------------------------------------------------------
// Extra stuff not found in config.h.in
-// This must be defined before the windows.h is included. It's needed
-// for mutex.h, to give access to the TryLock method.
+// This must be defined before the windows.h is included. We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
#ifndef _WIN32_WINNT
-# define _WIN32_WINNT 0x0400
+# define _WIN32_WINNT 0x0501
#endif
// We want to make sure not to ever try to #include heap-checker.h
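
Note: raising _WIN32_WINNT to 0x0501 makes GetModuleHandleEx visible to patch_functions.cc, as the comment above says. One common use of that API is mapping a code address back to the module that owns it; the flags below are illustrative, not quoted from the patch:

    #include <windows.h>

    // Returns the module containing 'addr' without changing its refcount,
    // or NULL if the address belongs to no loaded module.
    static HMODULE ModuleOwningAddress(void* addr) {
      HMODULE module = NULL;
      GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
                        GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
                        (LPCTSTR)addr, &module);
      return module;
    }
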
diff --git a/third_party/tcmalloc/chromium/src/debugallocation.cc b/third_party/tcmalloc/chromium/src/debugallocation.cc
index 1a9ddcb..949fbe9 100644
--- a/third_party/tcmalloc/chromium/src/debugallocation.cc
+++ b/third_party/tcmalloc/chromium/src/debugallocation.cc
@@ -1010,7 +1010,7 @@ static void *MemalignOverride(size_t align, size_t size,
const void *caller) __THROW
ATTRIBUTE_SECTION(google_malloc);
-void* operator new(size_t size)
+void* operator new(size_t size) throw (std::bad_alloc)
ATTRIBUTE_SECTION(google_malloc);
void* operator new(size_t size, const std::nothrow_t&) __THROW
ATTRIBUTE_SECTION(google_malloc);
@@ -1018,7 +1018,7 @@ void operator delete(void* p) __THROW
ATTRIBUTE_SECTION(google_malloc);
void operator delete(void* p, const std::nothrow_t&) __THROW
ATTRIBUTE_SECTION(google_malloc);
-void* operator new[](size_t size)
+void* operator new[](size_t size) throw (std::bad_alloc)
ATTRIBUTE_SECTION(google_malloc);
void* operator new[](size_t size, const std::nothrow_t&) __THROW
ATTRIBUTE_SECTION(google_malloc);
@@ -1176,12 +1176,12 @@ extern "C" void* pvalloc(size_t size) __THROW {
return p;
}
-extern "C" int mallopt(int cmd, int value) {
+extern "C" int mallopt(int cmd, int value) __THROW {
return BASE_MALLOPT(cmd, value);
}
#ifdef HAVE_STRUCT_MALLINFO
-extern "C" struct mallinfo mallinfo(void) {
+extern "C" struct mallinfo mallinfo(void) __THROW {
return BASE_MALLINFO();
}
#endif
@@ -1239,7 +1239,7 @@ inline void* cpp_debug_alloc(size_t size, int new_type, bool nothrow) {
}
}
-void* operator new(size_t size) {
+void* operator new(size_t size) throw (std::bad_alloc) {
void* ptr = cpp_debug_alloc(size, MallocBlock::kNewType, false);
MallocHook::InvokeNewHook(ptr, size);
if (ptr == NULL) {
@@ -1259,7 +1259,8 @@ void operator delete(void* ptr) __THROW {
DebugDeallocate(ptr, MallocBlock::kNewType);
}
-// Compilers use this, though I can't see how it differs from normal delete.
+// Some STL implementations explicitly invoke this.
+// It is completely equivalent to a normal delete (delete never throws).
void operator delete(void* ptr, const std::nothrow_t&) __THROW {
MallocHook::InvokeDeleteHook(ptr);
DebugDeallocate(ptr, MallocBlock::kNewType);
@@ -1269,7 +1270,7 @@ void operator delete(void* ptr, const std::nothrow_t&) __THROW {
// Alloc/free stuff for debug operator new[] & friends
-void* operator new[](size_t size) {
+void* operator new[](size_t size) throw (std::bad_alloc) {
void* ptr = cpp_debug_alloc(size, MallocBlock::kArrayNewType, false);
MallocHook::InvokeNewHook(ptr, size);
if (ptr == NULL) {
@@ -1289,7 +1290,8 @@ void operator delete[](void* ptr) __THROW {
DebugDeallocate(ptr, MallocBlock::kArrayNewType);
}
-// Compilers use this, though I can't see how it differs from normal delete.
+// Some STL implementations explicitly invoke this.
+// It is completely equivalent to a normal delete (delete never throws).
void operator delete[](void* ptr, const std::nothrow_t&) __THROW {
MallocHook::InvokeDeleteHook(ptr);
DebugDeallocate(ptr, MallocBlock::kArrayNewType);
@@ -1359,17 +1361,22 @@ class DebugMallocImplementation : public ParentImplementation {
static DebugMallocImplementation debug_malloc_implementation;
REGISTER_MODULE_INITIALIZER(debugallocation, {
- MallocExtension::Register(&debug_malloc_implementation);
-
- // When the program exits, check all blocks still in the free queue for
- // corruption.
- atexit(DanglingWriteChecker);
+ // Either we or valgrind will control memory management. We
+ // register our extension if we're the winner.
+ if (RunningOnValgrind()) {
+ // Let Valgrind use its own malloc (so don't register our extension).
+ } else {
+ MallocExtension::Register(&debug_malloc_implementation);
+ // When the program exits, check all blocks still in the free
+ // queue for corruption.
+ atexit(DanglingWriteChecker);
+ }
});
#ifdef TCMALLOC_FOR_DEBUGALLOCATION
// Redefine malloc_stats to use tcmalloc's implementation:
-extern "C" void malloc_stats(void) {
+extern "C" void malloc_stats(void) __THROW {
do_malloc_stats();
}
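Read in isolation, the initializer above reduces to the following sketch (RunningOnValgrind() is the runtime check from dynamic_annotations; everything else appears verbatim in the hunk):

    // Under Valgrind, Valgrind's replacement allocator handles memory, so
    // registering tcmalloc's debug extension -- and the exit-time check of
    // the delayed-free queue -- would report against the wrong allocator.
    REGISTER_MODULE_INITIALIZER(debugallocation, {
      if (!RunningOnValgrind()) {
        MallocExtension::Register(&debug_malloc_implementation);
        atexit(DanglingWriteChecker);  // scan queued free blocks at exit
      }
    });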
diff --git a/third_party/tcmalloc/chromium/src/google/heap-profiler.h b/third_party/tcmalloc/chromium/src/google/heap-profiler.h
index 5efaf64..57cb97a 100644
--- a/third_party/tcmalloc/chromium/src/google/heap-profiler.h
+++ b/third_party/tcmalloc/chromium/src/google/heap-profiler.h
@@ -71,12 +71,13 @@ extern "C" {
*/
PERFTOOLS_DLL_DECL void HeapProfilerStart(const char* prefix);
-/* Returns true if we are currently profiling the heap. This is true
+/* Returns non-zero if we are currently profiling the heap. (Returns
+ * an int rather than a bool so it's usable from C.) This is true
* between calls to HeapProfilerStart() and HeapProfilerStop(), and
* also if the program has been run with HEAPPROFILER, or some other
* way to turn on whole-program profiling.
*/
-bool IsHeapProfilerRunning();
+int IsHeapProfilerRunning();
/* Stop heap profiling. Can be restarted again with HeapProfilerStart(),
* but the currently accumulated profiling information will be cleared.
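With the return type now int, the declaration is usable from plain C; a hypothetical caller (the surrounding function is illustrative, not part of the library):

    /* Hypothetical C caller: test the int against zero instead of
       relying on a C++ bool. */
    #include <google/heap-profiler.h>
    #include <stdio.h>

    static void maybe_dump(void) {
      if (IsHeapProfilerRunning()) {   /* non-zero while profiling */
        HeapProfilerDump("checkpoint");
      } else {
        fprintf(stderr, "heap profiler not running\n");
      }
    }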
diff --git a/third_party/tcmalloc/chromium/src/google/profiler.h b/third_party/tcmalloc/chromium/src/google/profiler.h
index 74b936f..a6883f4 100644
--- a/third_party/tcmalloc/chromium/src/google/profiler.h
+++ b/third_party/tcmalloc/chromium/src/google/profiler.h
@@ -108,13 +108,15 @@ struct ProfilerOptions {
void *filter_in_thread_arg;
};
-/* Start profiling and write profile info into fname.
+/* Start profiling and write profile info into fname, discarding any
+ * existing profiling data in that file.
*
* This is equivalent to calling ProfilerStartWithOptions(fname, NULL).
*/
PERFTOOLS_DLL_DECL int ProfilerStart(const char* fname);
-/* Start profiling and write profile into fname.
+/* Start profiling and write profile into fname, discarding any
+ * existing profiling data in that file.
*
* The profiler is configured using the options given by 'options'.
* Options which are not specified are given default values.
diff --git a/third_party/tcmalloc/chromium/src/google/stacktrace.h b/third_party/tcmalloc/chromium/src/google/stacktrace.h
index 8188ce3..fd186d6 100644
--- a/third_party/tcmalloc/chromium/src/google/stacktrace.h
+++ b/third_party/tcmalloc/chromium/src/google/stacktrace.h
@@ -49,23 +49,23 @@
// Skips the most recent "skip_count" stack frames (also skips the
// frame generated for the "GetStackFrames" routine itself), and then
// records the pc values for up to the next "max_depth" frames in
-// "pcs", and the corresponding stack frame sizes in "sizes". Returns
-// the number of values recorded in "pcs"/"sizes".
+// "result", and the corresponding stack frame sizes in "sizes".
+// Returns the number of values recorded in "result"/"sizes".
//
// Example:
// main() { foo(); }
// foo() { bar(); }
// bar() {
-// void* pcs[10];
+// void* result[10];
// int sizes[10];
-// int depth = GetStackFrames(pcs, sizes, 10, 1);
+// int depth = GetStackFrames(result, sizes, 10, 1);
// }
//
// The GetStackFrames call will skip the frame for "bar". It will
// return 2 and will produce pc values that map to the following
// procedures:
-// pcs[0] foo
-// pcs[1] main
+// result[0] foo
+// result[1] main
// (Actually, there may be a few more entries after "main" to account for
// startup procedures.)
// And corresponding stack frame sizes will also be recorded:
@@ -76,15 +76,15 @@
// be identified.
//
// This routine may return fewer stack frame entries than are
-// available. Also note that "pcs" and "sizes" must both be non-NULL.
-extern PERFTOOLS_DLL_DECL int GetStackFrames(void** pcs, int* sizes, int max_depth,
+// available. Also note that "result" and "sizes" must both be non-NULL.
+extern PERFTOOLS_DLL_DECL int GetStackFrames(void** result, int* sizes, int max_depth,
int skip_count);
// Same as above, but to be used from a signal handler. The "uc" parameter
// should be the pointer to ucontext_t which was passed as the 3rd parameter
// to sa_sigaction signal handler. It may help the unwinder to get a
// better stack trace under certain conditions. The "uc" may safely be NULL.
-extern PERFTOOLS_DLL_DECL int GetStackFramesWithContext(void** pcs, int* sizes, int max_depth,
+extern PERFTOOLS_DLL_DECL int GetStackFramesWithContext(void** result, int* sizes, int max_depth,
int skip_count, const void *uc);
// This is similar to the GetStackFrames routine, except that it returns
diff --git a/third_party/tcmalloc/chromium/src/google/tcmalloc.h.in b/third_party/tcmalloc/chromium/src/google/tcmalloc.h.in
index e5c873d..fbb70ab 100644
--- a/third_party/tcmalloc/chromium/src/google/tcmalloc.h.in
+++ b/third_party/tcmalloc/chromium/src/google/tcmalloc.h.in
@@ -60,7 +60,8 @@
#endif
#ifdef __cplusplus
-#include <new> // for nothrow_t
+#include <new> // for std::nothrow_t
+
extern "C" {
#endif
// Returns a human-readable version string. If major, minor,
@@ -91,16 +92,15 @@ extern "C" {
#ifdef __cplusplus
PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
PERFTOOLS_DLL_DECL void* tc_new(size_t size);
- PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
- PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
- PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
-
PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
const std::nothrow_t&) __THROW;
- PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
- const std::nothrow_t&) __THROW;
+ PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
const std::nothrow_t&) __THROW;
+ PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+ PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+ const std::nothrow_t&) __THROW;
+ PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
const std::nothrow_t&) __THROW;
}
diff --git a/third_party/tcmalloc/chromium/src/heap-checker.cc b/third_party/tcmalloc/chromium/src/heap-checker.cc
index 82a7adb..2779c97 100644
--- a/third_party/tcmalloc/chromium/src/heap-checker.cc
+++ b/third_party/tcmalloc/chromium/src/heap-checker.cc
@@ -159,6 +159,23 @@ DEFINE_bool(heap_check_test_pointer_alignment,
"Set to true to check if the found leak can be due to "
"use of unaligned pointers");
+// Alignment at which all pointers in memory are supposed to be located;
+// use 1 if any alignment is ok.
+// The heap_check_test_pointer_alignment flag guides whether we also try
+// the value of 1. The larger this alignment can be, the smaller the
+// chance of missing real leaks.
+//
+// sizeof(void*) is correct. However, gold (the new linker) has a bug where
+// it sometimes places global pointers on 4-byte boundaries, even when
+// pointers are 8 bytes long. While we are fixing the linker, degrade to
+// 4-byte alignment on all targets. http://b/1226481
+//
+static const size_t kPointerSourceAlignment = sizeof(void*);
+DEFINE_int32(heap_check_pointer_source_alignment,
+ EnvToInt("HEAP_CHECK_POINTER_SOURCE_ALIGNMENT",
+ kPointerSourceAlignment),
+ "Alignment at which all pointers in memory are supposed to be "
+ "located. Use 1 if any alignment is ok.");
+
// A reasonable default to handle pointers inside of typical class objects:
// Too low and we won't be able to traverse pointers to normally-used
// nested objects and base parts of multiple-inherited objects.
@@ -245,13 +262,6 @@ static bool constructor_heap_profiling = false;
static const int heap_checker_info_level = 0;
//----------------------------------------------------------------------
-
-// Alignment at which all pointers in memory are supposed to be located;
-// use 1 if any alignment is ok.
-// heap_check_test_pointer_alignment flag guides if we try the value of 1.
-// The larger it can be, the lesser is the chance of missing real leaks.
-static const size_t kPointerSourceAlignment = sizeof(void*);
-
// Cancel our InitialMallocHook_* if present.
static void CancelInitialMallocHooks(); // defined below
@@ -484,7 +494,7 @@ HeapLeakChecker::Disabler::Disabler() {
// in a thread-safe manner.
int counter = get_thread_disable_counter();
set_thread_disable_counter(counter + 1);
- RAW_VLOG(1, "Increasing thread disable counter to %d", counter + 1);
+ RAW_VLOG(10, "Increasing thread disable counter to %d", counter + 1);
}
HeapLeakChecker::Disabler::~Disabler() {
@@ -492,7 +502,7 @@ HeapLeakChecker::Disabler::~Disabler() {
RAW_DCHECK(counter > 0, "");
if (counter > 0) {
set_thread_disable_counter(counter - 1);
- RAW_VLOG(1, "Decreasing thread disable counter to %d", counter);
+ RAW_VLOG(10, "Decreasing thread disable counter to %d", counter);
} else {
RAW_VLOG(0, "Thread disable counter underflow : %d", counter);
}
@@ -525,7 +535,7 @@ static void NewHook(const void* ptr, size_t size) {
if (ptr != NULL) {
const int counter = get_thread_disable_counter();
const bool ignore = (counter > 0);
- RAW_VLOG(7, "Recording Alloc: %p of %"PRIuS "; %d", ptr, size,
+ RAW_VLOG(16, "Recording Alloc: %p of %"PRIuS "; %d", ptr, size,
int(counter));
{ SpinLockHolder l(&heap_checker_lock);
if (size > max_heap_object_size) max_heap_object_size = size;
@@ -540,17 +550,17 @@ static void NewHook(const void* ptr, size_t size) {
}
}
}
- RAW_VLOG(8, "Alloc Recorded: %p of %"PRIuS"", ptr, size);
+ RAW_VLOG(17, "Alloc Recorded: %p of %"PRIuS"", ptr, size);
}
}
static void DeleteHook(const void* ptr) {
if (ptr != NULL) {
- RAW_VLOG(7, "Recording Free %p", ptr);
+ RAW_VLOG(16, "Recording Free %p", ptr);
{ SpinLockHolder l(&heap_checker_lock);
if (heap_checker_on) heap_profile->RecordFree(ptr);
}
- RAW_VLOG(8, "Free Recorded: %p", ptr);
+ RAW_VLOG(17, "Free Recorded: %p", ptr);
}
}
@@ -584,7 +594,7 @@ static StackDirection stack_direction = UNKNOWN_DIRECTION;
static void RegisterStackLocked(const void* top_ptr) {
RAW_DCHECK(heap_checker_lock.IsHeld(), "");
RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
- RAW_VLOG(1, "Thread stack at %p", top_ptr);
+ RAW_VLOG(10, "Thread stack at %p", top_ptr);
uintptr_t top = AsInt(top_ptr);
stack_tops->insert(top); // add for later use
@@ -598,12 +608,12 @@ static void RegisterStackLocked(const void* top_ptr) {
if (MemoryRegionMap::FindAndMarkStackRegion(top, &region)) {
// Make the proper portion of the stack live:
if (stack_direction == GROWS_TOWARDS_LOW_ADDRESSES) {
- RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+ RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
top_ptr, region.end_addr - top);
live_objects->push_back(AllocObject(top_ptr, region.end_addr - top,
THREAD_DATA));
} else { // GROWS_TOWARDS_HIGH_ADDRESSES
- RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+ RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
AsPtr(region.start_addr),
top - region.start_addr);
live_objects->push_back(AllocObject(AsPtr(region.start_addr),
@@ -619,7 +629,7 @@ static void RegisterStackLocked(const void* top_ptr) {
uintptr_t start = AsInt(span->ptr);
uintptr_t end = start + span->size;
if (start <= top && top < end) {
- RAW_VLOG(2, "Stack at %p is inside /proc/self/maps chunk %p..%p",
+ RAW_VLOG(11, "Stack at %p is inside /proc/self/maps chunk %p..%p",
top_ptr, AsPtr(start), AsPtr(end));
// Shrink start..end region by chopping away the memory regions in
// MemoryRegionMap that land in it to undo merging of regions
@@ -640,17 +650,17 @@ static void RegisterStackLocked(const void* top_ptr) {
}
}
if (stack_start != start || stack_end != end) {
- RAW_VLOG(2, "Stack at %p is actually inside memory chunk %p..%p",
+ RAW_VLOG(11, "Stack at %p is actually inside memory chunk %p..%p",
top_ptr, AsPtr(stack_start), AsPtr(stack_end));
}
// Make the proper portion of the stack live:
if (stack_direction == GROWS_TOWARDS_LOW_ADDRESSES) {
- RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+ RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
top_ptr, stack_end - top);
live_objects->push_back(
AllocObject(top_ptr, stack_end - top, THREAD_DATA));
} else { // GROWS_TOWARDS_HIGH_ADDRESSES
- RAW_VLOG(2, "Live stack at %p of %"PRIuPTR" bytes",
+ RAW_VLOG(11, "Live stack at %p of %"PRIuPTR" bytes",
AsPtr(stack_start), top - stack_start);
live_objects->push_back(
AllocObject(AsPtr(stack_start), top - stack_start, THREAD_DATA));
@@ -723,14 +733,14 @@ static void MakeDisabledLiveCallbackLocked(
// and the rest of the region where the stack lives can well
// contain outdated stack variables which are not live anymore,
// hence should not be treated as such.
- RAW_VLOG(2, "Not %s-disabling %"PRIuS" bytes at %p"
+ RAW_VLOG(11, "Not %s-disabling %"PRIuS" bytes at %p"
": have stack inside: %p",
(stack_disable ? "stack" : "range"),
info.object_size, ptr, AsPtr(*iter));
return;
}
}
- RAW_VLOG(2, "%s-disabling %"PRIuS" bytes at %p",
+ RAW_VLOG(11, "%s-disabling %"PRIuS" bytes at %p",
(stack_disable ? "Stack" : "Range"), info.object_size, ptr);
live_objects->push_back(AllocObject(ptr, info.object_size,
MUST_BE_ON_HEAP));
@@ -755,7 +765,7 @@ static void RecordGlobalDataLocked(uintptr_t start_address,
// Ignore non-writeable regions.
if (strchr(permissions, 'w') == NULL) return;
if (filename == NULL || *filename == '\0') filename = "UNNAMED";
- RAW_VLOG(2, "Looking into %s: 0x%" PRIxPTR "..0x%" PRIxPTR,
+ RAW_VLOG(11, "Looking into %s: 0x%" PRIxPTR "..0x%" PRIxPTR,
filename, start_address, end_address);
(*library_live_objects)[filename].
push_back(AllocObject(AsPtr(start_address),
@@ -814,12 +824,12 @@ void HeapLeakChecker::DisableLibraryAllocsLocked(const char* library,
// does not call user code.
}
if (depth) {
- RAW_VLOG(1, "Disabling allocations from %s at depth %d:", library, depth);
+ RAW_VLOG(10, "Disabling allocations from %s at depth %d:", library, depth);
DisableChecksFromToLocked(AsPtr(start_address), AsPtr(end_address), depth);
if (IsLibraryNamed(library, "/libpthread") ||
IsLibraryNamed(library, "/libdl") ||
IsLibraryNamed(library, "/ld")) {
- RAW_VLOG(1, "Global memory regions made by %s will be live data",
+ RAW_VLOG(10, "Global memory regions made by %s will be live data",
library);
if (global_region_caller_ranges == NULL) {
global_region_caller_ranges =
@@ -936,7 +946,7 @@ static enum {
va_list /*ap*/) {
RAW_DCHECK(heap_checker_lock.IsHeld(), "");
thread_listing_status = CALLBACK_STARTED;
- RAW_VLOG(2, "Found %d threads (from pid %d)", num_threads, getpid());
+ RAW_VLOG(11, "Found %d threads (from pid %d)", num_threads, getpid());
if (FLAGS_heap_check_ignore_global_live) {
UseProcMapsLocked(RECORD_GLOBAL_DATA);
@@ -951,7 +961,7 @@ static enum {
// the leak checking thread itself is handled
// specially via self_thread_stack, not here:
if (thread_pids[i] == self_thread_pid) continue;
- RAW_VLOG(2, "Handling thread with pid %d", thread_pids[i]);
+ RAW_VLOG(11, "Handling thread with pid %d", thread_pids[i]);
#if defined(HAVE_LINUX_PTRACE_H) && defined(HAVE_SYS_SYSCALL_H) && defined(DUMPER)
i386_regs thread_regs;
#define sys_ptrace(r, p, a, d) syscall(SYS_ptrace, (r), (p), (a), (d))
@@ -967,7 +977,7 @@ static enum {
// register pointers still being in the registers and not on the stack):
for (void** p = reinterpret_cast<void**>(&thread_regs);
p < reinterpret_cast<void**>(&thread_regs + 1); ++p) {
- RAW_VLOG(3, "Thread register %p", *p);
+ RAW_VLOG(12, "Thread register %p", *p);
thread_registers.push_back(*p);
}
} else {
@@ -982,7 +992,7 @@ static enum {
if (thread_registers.size()) {
// Make thread registers be live heap data sources.
// we rely here on the fact that vector is in one memory chunk:
- RAW_VLOG(2, "Live registers at %p of %"PRIuS" bytes",
+ RAW_VLOG(11, "Live registers at %p of %"PRIuS" bytes",
&thread_registers[0], thread_registers.size() * sizeof(void*));
live_objects->push_back(AllocObject(&thread_registers[0],
thread_registers.size() * sizeof(void*),
@@ -1005,7 +1015,7 @@ static const void* self_thread_stack_top;
void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
RAW_DCHECK(heap_checker_lock.IsHeld(), "");
RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
- RAW_VLOG(2, "Handling self thread with pid %d", self_thread_pid);
+ RAW_VLOG(11, "Handling self thread with pid %d", self_thread_pid);
// Register our own stack:
// Important that all stack ranges (including the one here)
@@ -1019,7 +1029,7 @@ void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
for (IgnoredObjectsMap::const_iterator object = ignored_objects->begin();
object != ignored_objects->end(); ++object) {
const void* ptr = AsPtr(object->first);
- RAW_VLOG(2, "Ignored live object at %p of %"PRIuS" bytes",
+ RAW_VLOG(11, "Ignored live object at %p of %"PRIuS" bytes",
ptr, object->second);
live_objects->
push_back(AllocObject(ptr, object->second, MUST_BE_ON_HEAP));
@@ -1132,10 +1142,10 @@ void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
}
}
// Now get and use live_objects from the final version of l->second:
- if (VLOG_IS_ON(2)) {
+ if (VLOG_IS_ON(11)) {
for (LiveObjectsStack::const_iterator i = l->second.begin();
i != l->second.end(); ++i) {
- RAW_VLOG(2, "Library live region at %p of %"PRIuPTR" bytes",
+ RAW_VLOG(11, "Library live region at %p of %"PRIuPTR" bytes",
i->ptr, i->size);
}
}
@@ -1240,7 +1250,7 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
RAW_LOG(ERROR, "Thread stacks not found for %d threads. "
"Will likely report false leak positives.", r);
} else {
- RAW_VLOG(2, "Thread stacks appear to be found for all threads");
+ RAW_VLOG(11, "Thread stacks appear to be found for all threads");
}
} else {
RAW_LOG(WARNING, "Not looking for thread stacks; "
@@ -1256,7 +1266,7 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
IgnoreNonThreadLiveObjectsLocked();
}
if (live_objects_total) {
- RAW_VLOG(1, "Ignoring %"PRId64" reachable objects of %"PRId64" bytes",
+ RAW_VLOG(10, "Ignoring %"PRId64" reachable objects of %"PRId64" bytes",
live_objects_total, live_bytes_total);
}
// Free these: we made them here and heap_profile never saw them
@@ -1266,7 +1276,8 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
}
// Alignment at which we should consider pointer positions
-// in IgnoreLiveObjectsLocked. Use 1 if any alignment is ok.
+// in IgnoreLiveObjectsLocked. Will normally use the value of
+// FLAGS_heap_check_pointer_source_alignment.
static size_t pointer_source_alignment = kPointerSourceAlignment;
// Global lock for HeapLeakChecker::DoNoLeaks
// to protect pointer_source_alignment.
@@ -1314,7 +1325,7 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
live_object_count += 1;
live_byte_count += size;
}
- RAW_VLOG(4, "Looking for heap pointers in %p of %"PRIuS" bytes",
+ RAW_VLOG(13, "Looking for heap pointers in %p of %"PRIuS" bytes",
object, size);
const char* const whole_object = object;
size_t const whole_size = size;
@@ -1351,7 +1362,7 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
if (can_be_on_heap) {
const void* ptr = reinterpret_cast<const void*>(addr);
// Too expensive (inner loop): manually uncomment when debugging:
- // RAW_VLOG(8, "Trying pointer to %p at %p", ptr, object);
+ // RAW_VLOG(17, "Trying pointer to %p at %p", ptr, object);
size_t object_size;
if (HaveOnHeapLocked(&ptr, &object_size) &&
heap_profile->MarkAsLive(ptr)) {
@@ -1360,15 +1371,15 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
// a heap object which is in fact leaked.
// I.e. in very rare and probably not repeatable/lasting cases
// we might miss some real heap memory leaks.
- RAW_VLOG(5, "Found pointer to %p of %"PRIuS" bytes at %p "
+ RAW_VLOG(14, "Found pointer to %p of %"PRIuS" bytes at %p "
"inside %p of size %"PRIuS"",
ptr, object_size, object, whole_object, whole_size);
- if (VLOG_IS_ON(6)) {
+ if (VLOG_IS_ON(15)) {
// log call stacks to help debug how come something is not a leak
HeapProfileTable::AllocInfo alloc;
- bool r = heap_profile->FindAllocDetails(ptr, &alloc);
- r = r; // suppress compiler warning in non-debug mode
- RAW_DCHECK(r, ""); // sanity
+ if (!heap_profile->FindAllocDetails(ptr, &alloc)) {
+ RAW_LOG(FATAL, "FindAllocDetails failed on ptr %p", ptr);
+ }
RAW_LOG(INFO, "New live %p object's alloc stack:", ptr);
for (int i = 0; i < alloc.stack_depth; ++i) {
RAW_LOG(INFO, " @ %p", alloc.call_stack[i]);
@@ -1386,7 +1397,7 @@ static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
live_objects_total += live_object_count;
live_bytes_total += live_byte_count;
if (live_object_count) {
- RAW_VLOG(1, "Removed %"PRId64" live heap objects of %"PRId64" bytes: %s%s",
+ RAW_VLOG(10, "Removed %"PRId64" live heap objects of %"PRId64" bytes: %s%s",
live_object_count, live_byte_count, name, name2);
}
}
@@ -1408,7 +1419,7 @@ void HeapLeakChecker::IgnoreObject(const void* ptr) {
if (!HaveOnHeapLocked(&ptr, &object_size)) {
RAW_LOG(ERROR, "No live heap object at %p to ignore", ptr);
} else {
- RAW_VLOG(1, "Going to ignore live object at %p of %"PRIuS" bytes",
+ RAW_VLOG(10, "Going to ignore live object at %p of %"PRIuS" bytes",
ptr, object_size);
if (ignored_objects == NULL) {
ignored_objects = new(Allocator::Allocate(sizeof(IgnoredObjectsMap)))
@@ -1434,7 +1445,7 @@ void HeapLeakChecker::UnIgnoreObject(const void* ptr) {
if (object != ignored_objects->end() && object_size == object->second) {
ignored_objects->erase(object);
found = true;
- RAW_VLOG(1, "Now not going to ignore live object "
+ RAW_VLOG(10, "Now not going to ignore live object "
"at %p of %"PRIuS" bytes", ptr, object_size);
}
}
@@ -1483,7 +1494,7 @@ void HeapLeakChecker::Create(const char *name, bool make_start_snapshot) {
const HeapProfileTable::Stats& t = heap_profile->total();
const size_t start_inuse_bytes = t.alloc_size - t.free_size;
const size_t start_inuse_allocs = t.allocs - t.frees;
- RAW_VLOG(1, "Start check \"%s\" profile: %"PRIuS" bytes "
+ RAW_VLOG(10, "Start check \"%s\" profile: %"PRIuS" bytes "
"in %"PRIuS" objects",
name_, start_inuse_bytes, start_inuse_allocs);
} else {
@@ -1612,7 +1623,7 @@ bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
{
// Heap activity in other threads is paused during this function
// (i.e. until we got all profile difference info).
- SpinLockHolder l(&heap_checker_lock);
+ SpinLockHolder hl(&heap_checker_lock);
if (heap_checker_on == false) {
if (name_ != NULL) { // leak checking enabled when created the checker
RAW_LOG(WARNING, "Heap leak checker got turned off after checker "
@@ -1649,6 +1660,8 @@ bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
// Make the heap profile, other threads are locked out.
HeapProfileTable::Snapshot* base =
reinterpret_cast<HeapProfileTable::Snapshot*>(start_snapshot_);
+ RAW_DCHECK(FLAGS_heap_check_pointer_source_alignment > 0, "");
+ pointer_source_alignment = FLAGS_heap_check_pointer_source_alignment;
IgnoreAllLiveObjectsLocked(&a_local_var);
leaks = heap_profile->NonLiveSnapshot(base);
@@ -1668,23 +1681,28 @@ bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
initial_allocs, Allocator::alloc_count());
}
} else if (FLAGS_heap_check_test_pointer_alignment) {
- // Try with reduced pointer aligment
- pointer_source_alignment = 1;
- IgnoreAllLiveObjectsLocked(&a_local_var);
- HeapProfileTable::Snapshot* leaks_wo_align =
- heap_profile->NonLiveSnapshot(base);
- pointer_source_alignment = kPointerSourceAlignment;
- if (leaks_wo_align->Empty()) {
- RAW_LOG(WARNING, "Found no leaks without pointer alignment: "
- "something might be placing pointers at "
- "unaligned addresses! This needs to be fixed.");
+ if (pointer_source_alignment == 1) {
+ RAW_LOG(WARNING, "--heap_check_test_pointer_alignment has no effect: "
+ "--heap_check_pointer_source_alignment was already set to 1");
} else {
- RAW_LOG(INFO, "Found leaks without pointer alignment as well: "
- "unaligned pointers must not be the cause of leaks.");
- RAW_LOG(INFO, "--heap_check_test_pointer_alignment did not help "
- "to diagnose the leaks.");
+ // Try with reduced pointer alignment
+ pointer_source_alignment = 1;
+ IgnoreAllLiveObjectsLocked(&a_local_var);
+ HeapProfileTable::Snapshot* leaks_wo_align =
+ heap_profile->NonLiveSnapshot(base);
+ pointer_source_alignment = FLAGS_heap_check_pointer_source_alignment;
+ if (leaks_wo_align->Empty()) {
+ RAW_LOG(WARNING, "Found no leaks without pointer alignment: "
+ "something might be placing pointers at "
+ "unaligned addresses! This needs to be fixed.");
+ } else {
+ RAW_LOG(INFO, "Found leaks without pointer alignment as well: "
+ "unaligned pointers must not be the cause of leaks.");
+ RAW_LOG(INFO, "--heap_check_test_pointer_alignment did not help "
+ "to diagnose the leaks.");
+ }
+ heap_profile->ReleaseSnapshot(leaks_wo_align);
}
- heap_profile->ReleaseSnapshot(leaks_wo_align);
}
if (leaks != NULL) {
@@ -1741,7 +1759,7 @@ bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
SuggestPprofCommand(pprof_file);
{
- SpinLockHolder l(&heap_checker_lock);
+ SpinLockHolder hl(&heap_checker_lock);
heap_profile->ReleaseSnapshot(leaks);
Allocator::Free(pprof_file);
}
@@ -1874,6 +1892,7 @@ static bool internal_init_start_has_run = false;
}
// Set all flags
+ RAW_DCHECK(FLAGS_heap_check_pointer_source_alignment > 0, "");
if (FLAGS_heap_check == "minimal") {
// The least we can check.
FLAGS_heap_check_before_constructors = false; // from after main
@@ -2043,7 +2062,7 @@ bool HeapLeakChecker::NoGlobalLeaks() {
// we never delete or change main_heap_checker once it's set:
HeapLeakChecker* main_hc = GlobalChecker();
if (main_hc) {
- RAW_VLOG(1, "Checking for whole-program memory leaks");
+ RAW_VLOG(10, "Checking for whole-program memory leaks");
// The program is over, so it's safe to symbolize addresses (which
// requires a fork) because no serious work is expected to be done
// after this. Symbolizing is really useful -- knowing what
@@ -2165,7 +2184,7 @@ void HeapLeakChecker::BeforeConstructorsLocked() {
RAW_CHECK(heap_profile == NULL, "");
heap_profile = new(Allocator::Allocate(sizeof(HeapProfileTable)))
HeapProfileTable(&Allocator::Allocate, &Allocator::Free);
- RAW_VLOG(1, "Starting tracking the heap");
+ RAW_VLOG(10, "Starting tracking the heap");
heap_checker_on = true;
}
@@ -2329,7 +2348,7 @@ void HeapLeakChecker::DisableChecksFromToLocked(const void* start_address,
value.start_address = AsInt(start_address);
value.max_depth = max_depth;
if (disabled_ranges->insert(make_pair(AsInt(end_address), value)).second) {
- RAW_VLOG(1, "Disabling leak checking in stack traces "
+ RAW_VLOG(10, "Disabling leak checking in stack traces "
"under frame addresses between %p..%p",
start_address, end_address);
} else { // check that this is just a verbatim repetition
@@ -2352,7 +2371,7 @@ inline bool HeapLeakChecker::HaveOnHeapLocked(const void** ptr,
const uintptr_t addr = AsInt(*ptr);
if (heap_profile->FindInsideAlloc(
*ptr, max_heap_object_size, ptr, object_size)) {
- RAW_VLOG(7, "Got pointer into %p at +%"PRIuPTR" offset",
+ RAW_VLOG(16, "Got pointer into %p at +%"PRIuPTR" offset",
*ptr, addr - AsInt(*ptr));
return true;
}
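The new flag ultimately sets pointer_source_alignment, which is the stride of the conservative pointer scan; schematically (a sketch of the idea, not the exact loop in IgnoreLiveObjectsLocked):

    // Candidate pointer positions inside a live object are visited at
    // steps of pointer_source_alignment: 1 inspects every byte offset,
    // sizeof(void*) only word-aligned ones.  A larger stride means fewer
    // byte patterns are mistaken for pointers, so fewer leaked objects
    // get wrongly marked live (i.e. fewer missed real leaks).
    for (uintptr_t addr = AsInt(object);
         addr + sizeof(void*) <= AsInt(object) + size;
         addr += pointer_source_alignment) {
      const void* ptr = *reinterpret_cast<void* const*>(addr);
      // ... HaveOnHeapLocked(&ptr, &object_size), MarkAsLive(ptr), etc.,
      // as in the hunks above ...
    }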
diff --git a/third_party/tcmalloc/chromium/src/heap-profile-table.cc b/third_party/tcmalloc/chromium/src/heap-profile-table.cc
index 66e4f20..ecaf75f 100644
--- a/third_party/tcmalloc/chromium/src/heap-profile-table.cc
+++ b/third_party/tcmalloc/chromium/src/heap-profile-table.cc
@@ -99,7 +99,7 @@ const char HeapProfileTable::kFileExt[] = ".heap";
//----------------------------------------------------------------------
static const int kHashTableSize = 179999; // Size for table_.
-/*static*/ const int HeapProfileTable::kMaxStackDepth = 32;
+/*static*/ const int HeapProfileTable::kMaxStackDepth;
//----------------------------------------------------------------------
diff --git a/third_party/tcmalloc/chromium/src/heap-profile-table.h b/third_party/tcmalloc/chromium/src/heap-profile-table.h
index 5403257..c9bee15 100644
--- a/third_party/tcmalloc/chromium/src/heap-profile-table.h
+++ b/third_party/tcmalloc/chromium/src/heap-profile-table.h
@@ -52,8 +52,8 @@ class HeapProfileTable {
// Extension to be used for heap profile files.
static const char kFileExt[];
- // Longest stack trace we record. Defined in the .cc file.
- static const int kMaxStackDepth;
+ // Longest stack trace we record.
+ static const int kMaxStackDepth = 32;
// data types ----------------------------
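The move relies on the usual C++98 rule for static const integral members, sketched below: the in-class initializer makes the constant usable at compile time, while the out-of-line definition (now without the `= 32`, per the companion heap-profile-table.cc hunk) still provides storage in one translation unit in case the member is ever used as an lvalue:

    class HeapProfileTable {
     public:
      static const int kMaxStackDepth = 32;  // usable as a compile-time constant
      // ...
    };
    // heap-profile-table.cc -- definition only; the initializer stays in
    // the header:
    /*static*/ const int HeapProfileTable::kMaxStackDepth;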
diff --git a/third_party/tcmalloc/chromium/src/heap-profiler.cc b/third_party/tcmalloc/chromium/src/heap-profiler.cc
index a1c643a9..3055f4ce 100644
--- a/third_party/tcmalloc/chromium/src/heap-profiler.cc
+++ b/third_party/tcmalloc/chromium/src/heap-profiler.cc
@@ -524,9 +524,9 @@ extern "C" void HeapProfilerStart(const char* prefix) {
filename_prefix[prefix_length] = '\0';
}
-extern "C" bool IsHeapProfilerRunning() {
+extern "C" int IsHeapProfilerRunning() {
SpinLockHolder l(&heap_lock);
- return is_on;
+ return is_on ? 1 : 0; // return an int, because C code doesn't have bool
}
extern "C" void HeapProfilerStop() {
diff --git a/third_party/tcmalloc/chromium/src/internal_logging.h b/third_party/tcmalloc/chromium/src/internal_logging.h
index 0cb9ba2..731b2d9 100644
--- a/third_party/tcmalloc/chromium/src/internal_logging.h
+++ b/third_party/tcmalloc/chromium/src/internal_logging.h
@@ -119,7 +119,9 @@ do { \
#ifndef NDEBUG
#define ASSERT(cond) CHECK_CONDITION(cond)
#else
-#define ASSERT(cond) ((void) 0)
+#define ASSERT(cond) \
+ do { \
+ } while (0 && (cond))
#endif
// Print into buffer
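The release-mode branch above is the standard trick for keeping the asserted expression visible to the compiler without evaluating it; a sketch of the effect:

    // NDEBUG sketch: (cond) is parsed and type-checked, but `0 &&`
    // short-circuits, so the condition is never evaluated at run time.
    #define ASSERT(cond) \
      do {               \
      } while (0 && (cond))

    void Example(int* p) {
      int used_only_in_assert = 42;
      ASSERT(p != 0 && used_only_in_assert == 42);
      // Unlike the old ((void) 0) form, the expression still references
      // its operands, so release builds neither warn about unused
      // variables nor let syntax errors hide inside assert conditions.
    }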
diff --git a/third_party/tcmalloc/chromium/src/malloc_extension.cc b/third_party/tcmalloc/chromium/src/malloc_extension.cc
index 4ce262f..c2f8b54 100644
--- a/third_party/tcmalloc/chromium/src/malloc_extension.cc
+++ b/third_party/tcmalloc/chromium/src/malloc_extension.cc
@@ -187,7 +187,10 @@ MallocExtension* MallocExtension::instance() {
void MallocExtension::Register(MallocExtension* implementation) {
perftools_pthread_once(&module_init, InitModule);
// When running under valgrind, our custom malloc is replaced with
- // valgrind's one and malloc extensions will not work.
+ // valgrind's one and malloc extensions will not work. (Note:
+ // callers should be responsible for checking that they are the
+ // malloc that is really being run, before calling Register. This
+ // is just here as an extra sanity check.)
if (!RunningOnValgrind()) {
current_instance = implementation;
}
diff --git a/third_party/tcmalloc/chromium/src/malloc_hook.cc b/third_party/tcmalloc/chromium/src/malloc_hook.cc
index 2a7f542..4315b86 100644
--- a/third_party/tcmalloc/chromium/src/malloc_hook.cc
+++ b/third_party/tcmalloc/chromium/src/malloc_hook.cc
@@ -326,8 +326,8 @@ extern "C" int MallocHook_GetCallerStackTrace(void** result, int max_depth,
return 0;
for (int i = 0; i < depth; ++i) { // stack[0] is our immediate caller
if (InHookCaller(stack[i])) {
- RAW_VLOG(4, "Found hooked allocator at %d: %p <- %p",
- i, stack[i], stack[i+1]);
+ RAW_VLOG(10, "Found hooked allocator at %d: %p <- %p",
+ i, stack[i], stack[i+1]);
i += 1; // skip hook caller frame
depth -= i; // correct depth
if (depth > max_depth) depth = max_depth;
diff --git a/third_party/tcmalloc/chromium/src/memory_region_map.cc b/third_party/tcmalloc/chromium/src/memory_region_map.cc
index 05fdc06..f6bed45 100644
--- a/third_party/tcmalloc/chromium/src/memory_region_map.cc
+++ b/third_party/tcmalloc/chromium/src/memory_region_map.cc
@@ -181,7 +181,7 @@ static MemoryRegionMap::RegionSetRep regions_rep;
static bool recursive_insert = false;
void MemoryRegionMap::Init(int max_stack_depth) {
- RAW_VLOG(2, "MemoryRegionMap Init");
+ RAW_VLOG(10, "MemoryRegionMap Init");
RAW_CHECK(max_stack_depth >= 0, "");
// Make sure we don't overflow the memory in region stacks:
RAW_CHECK(max_stack_depth <= kMaxStackDepth,
@@ -192,7 +192,7 @@ void MemoryRegionMap::Init(int max_stack_depth) {
if (client_count_ > 1) {
// not first client: already did initialization-proper
Unlock();
- RAW_VLOG(2, "MemoryRegionMap Init increment done");
+ RAW_VLOG(10, "MemoryRegionMap Init increment done");
return;
}
// Set our hooks and make sure no other hooks existed:
@@ -217,17 +217,17 @@ void MemoryRegionMap::Init(int max_stack_depth) {
// recursive_insert = false; as InsertRegionLocked will also construct
// regions_ on demand for us.
Unlock();
- RAW_VLOG(2, "MemoryRegionMap Init done");
+ RAW_VLOG(10, "MemoryRegionMap Init done");
}
bool MemoryRegionMap::Shutdown() {
- RAW_VLOG(2, "MemoryRegionMap Shutdown");
+ RAW_VLOG(10, "MemoryRegionMap Shutdown");
Lock();
RAW_CHECK(client_count_ > 0, "");
client_count_ -= 1;
if (client_count_ != 0) { // not last client; need not really shutdown
Unlock();
- RAW_VLOG(2, "MemoryRegionMap Shutdown decrement done");
+ RAW_VLOG(10, "MemoryRegionMap Shutdown decrement done");
return true;
}
CheckMallocHooks(); // we assume no other hooks
@@ -244,7 +244,7 @@ bool MemoryRegionMap::Shutdown() {
RAW_LOG(WARNING, "Can't delete LowLevelAlloc arena: it's being used");
}
Unlock();
- RAW_VLOG(2, "MemoryRegionMap Shutdown done");
+ RAW_VLOG(10, "MemoryRegionMap Shutdown done");
return deleted_arena;
}
@@ -336,7 +336,7 @@ bool MemoryRegionMap::FindAndMarkStackRegion(uintptr_t stack_top,
Lock();
const Region* region = DoFindRegionLocked(stack_top);
if (region != NULL) {
- RAW_VLOG(2, "Stack at %p is inside region %p..%p",
+ RAW_VLOG(10, "Stack at %p is inside region %p..%p",
reinterpret_cast<void*>(stack_top),
reinterpret_cast<void*>(region->start_addr),
reinterpret_cast<void*>(region->end_addr));
@@ -361,7 +361,7 @@ MemoryRegionMap::RegionIterator MemoryRegionMap::EndRegionLocked() {
}
inline void MemoryRegionMap::DoInsertRegionLocked(const Region& region) {
- RAW_VLOG(4, "Inserting region %p..%p from %p",
+ RAW_VLOG(12, "Inserting region %p..%p from %p",
reinterpret_cast<void*>(region.start_addr),
reinterpret_cast<void*>(region.end_addr),
reinterpret_cast<void*>(region.caller()));
@@ -385,10 +385,10 @@ inline void MemoryRegionMap::DoInsertRegionLocked(const Region& region) {
// This inserts and allocates permanent storage for region
// and its call stack data: it's safe to do it now:
regions_->insert(region);
- RAW_VLOG(4, "Inserted region %p..%p :",
+ RAW_VLOG(12, "Inserted region %p..%p :",
reinterpret_cast<void*>(region.start_addr),
reinterpret_cast<void*>(region.end_addr));
- if (VLOG_IS_ON(4)) LogAllLocked();
+ if (VLOG_IS_ON(12)) LogAllLocked();
}
// These variables are local to MemoryRegionMap::InsertRegionLocked()
@@ -425,7 +425,7 @@ inline void MemoryRegionMap::InsertRegionLocked(const Region& region) {
// and taken into account when the recursion unwinds.
// Do the insert:
if (recursive_insert) { // recursion: save in saved_regions
- RAW_VLOG(4, "Saving recursive insert of region %p..%p from %p",
+ RAW_VLOG(12, "Saving recursive insert of region %p..%p from %p",
reinterpret_cast<void*>(region.start_addr),
reinterpret_cast<void*>(region.end_addr),
reinterpret_cast<void*>(region.caller()));
@@ -436,7 +436,7 @@ inline void MemoryRegionMap::InsertRegionLocked(const Region& region) {
saved_regions[saved_regions_count++] = region;
} else { // not a recursive call
if (regions_ == NULL) { // init regions_
- RAW_VLOG(4, "Initializing region set");
+ RAW_VLOG(12, "Initializing region set");
regions_ = regions_rep.region_set();
recursive_insert = true;
new(regions_) RegionSet();
@@ -470,7 +470,7 @@ void MemoryRegionMap::RecordRegionAddition(const void* start, size_t size) {
max_stack_depth_, kStripFrames + 1)
: 0;
region.set_call_stack_depth(depth); // record stack info fully
- RAW_VLOG(2, "New global region %p..%p from %p",
+ RAW_VLOG(10, "New global region %p..%p from %p",
reinterpret_cast<void*>(region.start_addr),
reinterpret_cast<void*>(region.end_addr),
reinterpret_cast<void*>(region.caller()));
@@ -499,7 +499,7 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
// An exact match, so it's safe to remove.
--saved_regions_count;
--put_pos;
- RAW_VLOG(2, ("Insta-Removing saved region %p..%p; "
+ RAW_VLOG(10, ("Insta-Removing saved region %p..%p; "
"now have %d saved regions"),
reinterpret_cast<void*>(start_addr),
reinterpret_cast<void*>(end_addr),
@@ -523,7 +523,7 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
uintptr_t start_addr = reinterpret_cast<uintptr_t>(start);
uintptr_t end_addr = start_addr + size;
// subtract start_addr, end_addr from all the regions
- RAW_VLOG(2, "Removing global region %p..%p; have %"PRIuS" regions",
+ RAW_VLOG(10, "Removing global region %p..%p; have %"PRIuS" regions",
reinterpret_cast<void*>(start_addr),
reinterpret_cast<void*>(end_addr),
regions_->size());
@@ -533,12 +533,12 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
for (RegionSet::iterator region = regions_->lower_bound(sample);
region != regions_->end() && region->start_addr < end_addr;
/*noop*/) {
- RAW_VLOG(5, "Looking at region %p..%p",
+ RAW_VLOG(13, "Looking at region %p..%p",
reinterpret_cast<void*>(region->start_addr),
reinterpret_cast<void*>(region->end_addr));
if (start_addr <= region->start_addr &&
region->end_addr <= end_addr) { // full deletion
- RAW_VLOG(4, "Deleting region %p..%p",
+ RAW_VLOG(12, "Deleting region %p..%p",
reinterpret_cast<void*>(region->start_addr),
reinterpret_cast<void*>(region->end_addr));
RegionSet::iterator d = region;
@@ -547,7 +547,7 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
continue;
} else if (region->start_addr < start_addr &&
end_addr < region->end_addr) { // cutting-out split
- RAW_VLOG(4, "Splitting region %p..%p in two",
+ RAW_VLOG(12, "Splitting region %p..%p in two",
reinterpret_cast<void*>(region->start_addr),
reinterpret_cast<void*>(region->end_addr));
// Make another region for the start portion:
@@ -560,13 +560,13 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
const_cast<Region&>(*region).set_start_addr(end_addr);
} else if (end_addr > region->start_addr &&
start_addr <= region->start_addr) { // cut from start
- RAW_VLOG(4, "Start-chopping region %p..%p",
+ RAW_VLOG(12, "Start-chopping region %p..%p",
reinterpret_cast<void*>(region->start_addr),
reinterpret_cast<void*>(region->end_addr));
const_cast<Region&>(*region).set_start_addr(end_addr);
} else if (start_addr > region->start_addr &&
start_addr < region->end_addr) { // cut from end
- RAW_VLOG(4, "End-chopping region %p..%p",
+ RAW_VLOG(12, "End-chopping region %p..%p",
reinterpret_cast<void*>(region->start_addr),
reinterpret_cast<void*>(region->end_addr));
// Can't just modify region->end_addr (it's the sorting key):
@@ -582,11 +582,11 @@ void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
}
++region;
}
- RAW_VLOG(4, "Removed region %p..%p; have %"PRIuS" regions",
+ RAW_VLOG(12, "Removed region %p..%p; have %"PRIuS" regions",
reinterpret_cast<void*>(start_addr),
reinterpret_cast<void*>(end_addr),
regions_->size());
- if (VLOG_IS_ON(4)) LogAllLocked();
+ if (VLOG_IS_ON(12)) LogAllLocked();
Unlock();
}
@@ -596,7 +596,7 @@ void MemoryRegionMap::MmapHook(const void* result,
int fd, off_t offset) {
// TODO(maxim): replace all 0x%"PRIxS" by %p when RAW_VLOG uses a safe
// snprintf reimplementation that does not malloc to pretty-print NULL
- RAW_VLOG(2, "MMap = 0x%"PRIxPTR" of %"PRIuS" at %llu "
+ RAW_VLOG(10, "MMap = 0x%"PRIxPTR" of %"PRIuS" at %llu "
"prot %d flags %d fd %d offs %lld",
reinterpret_cast<uintptr_t>(result), size,
reinterpret_cast<uint64>(start), prot, flags, fd,
@@ -607,7 +607,7 @@ void MemoryRegionMap::MmapHook(const void* result,
}
void MemoryRegionMap::MunmapHook(const void* ptr, size_t size) {
- RAW_VLOG(2, "MUnmap of %p %"PRIuS"", ptr, size);
+ RAW_VLOG(10, "MUnmap of %p %"PRIuS"", ptr, size);
if (size != 0) {
RecordRegionRemoval(ptr, size);
}
@@ -617,7 +617,7 @@ void MemoryRegionMap::MremapHook(const void* result,
const void* old_addr, size_t old_size,
size_t new_size, int flags,
const void* new_addr) {
- RAW_VLOG(2, "MRemap = 0x%"PRIxPTR" of 0x%"PRIxPTR" %"PRIuS" "
+ RAW_VLOG(10, "MRemap = 0x%"PRIxPTR" of 0x%"PRIxPTR" %"PRIuS" "
"to %"PRIuS" flags %d new_addr=0x%"PRIxPTR,
(uintptr_t)result, (uintptr_t)old_addr,
old_size, new_size, flags,
@@ -631,7 +631,7 @@ void MemoryRegionMap::MremapHook(const void* result,
extern "C" void* __sbrk(ptrdiff_t increment); // defined in libc
void MemoryRegionMap::SbrkHook(const void* result, ptrdiff_t increment) {
- RAW_VLOG(2, "Sbrk = 0x%"PRIxPTR" of %"PRIdS"", (uintptr_t)result, increment);
+ RAW_VLOG(10, "Sbrk = 0x%"PRIxPTR" of %"PRIdS"", (uintptr_t)result, increment);
if (result != reinterpret_cast<void*>(-1)) {
if (increment > 0) {
void* new_end = sbrk(0);
diff --git a/third_party/tcmalloc/chromium/src/page_heap.cc b/third_party/tcmalloc/chromium/src/page_heap.cc
index 31130e9..a256b64 100644
--- a/third_party/tcmalloc/chromium/src/page_heap.cc
+++ b/third_party/tcmalloc/chromium/src/page_heap.cc
@@ -61,50 +61,65 @@ PageHeap::PageHeap()
}
}
-Span* PageHeap::New(Length n) {
+// Returns the minimum number of pages necessary to ensure that an
+// allocation of size n can be aligned to the given alignment.
+static Length AlignedAllocationSize(Length n, size_t alignment) {
+ ASSERT(alignment >= kPageSize);
+ return n + tcmalloc::pages(alignment - kPageSize);
+}
+
+Span* PageHeap::New(Length n, size_t sc, size_t align) {
ASSERT(Check());
ASSERT(n > 0);
+ if (align < kPageSize) {
+ align = kPageSize;
+ }
+
+ Length aligned_size = AlignedAllocationSize(n, align);
+
// Find first size >= n that has a non-empty list
- for (Length s = n; s < kMaxPages; s++) {
+ for (Length s = aligned_size; s < kMaxPages; s++) {
Span* ll = &free_[s].normal;
// If we're lucky, ll is non-empty, meaning it has a suitable span.
if (!DLL_IsEmpty(ll)) {
ASSERT(ll->next->location == Span::ON_NORMAL_FREELIST);
- return Carve(ll->next, n);
+ return Carve(ll->next, n, sc, align);
}
// Alternatively, maybe there's a usable returned span.
ll = &free_[s].returned;
if (!DLL_IsEmpty(ll)) {
ASSERT(ll->next->location == Span::ON_RETURNED_FREELIST);
- return Carve(ll->next, n);
+ return Carve(ll->next, n, sc, align);
}
// Still no luck, so keep looking in larger classes.
}
- Span* result = AllocLarge(n);
+ Span* result = AllocLarge(n, sc, align);
if (result != NULL) return result;
// Grow the heap and try again
- if (!GrowHeap(n)) {
+ if (!GrowHeap(aligned_size)) {
ASSERT(stats_.unmapped_bytes+ stats_.committed_bytes==stats_.system_bytes);
ASSERT(Check());
return NULL;
}
- return AllocLarge(n);
+ return AllocLarge(n, sc, align);
}
-Span* PageHeap::AllocLarge(Length n) {
- // find the best span (closest to n in size).
+Span* PageHeap::AllocLarge(Length n, size_t sc, size_t align) {
+ // Find the best span (closest to n in size).
// The following loops implement address-ordered best-fit.
Span *best = NULL;
+ Length aligned_size = AlignedAllocationSize(n, align);
+
// Search through normal list
for (Span* span = large_.normal.next;
span != &large_.normal;
span = span->next) {
- if (span->length >= n) {
+ if (span->length >= aligned_size) {
if ((best == NULL)
|| (span->length < best->length)
|| ((span->length == best->length) && (span->start < best->start))) {
@@ -118,7 +133,7 @@ Span* PageHeap::AllocLarge(Length n) {
for (Span* span = large_.returned.next;
span != &large_.returned;
span = span->next) {
- if (span->length >= n) {
+ if (span->length >= aligned_size) {
if ((best == NULL)
|| (span->length < best->length)
|| ((span->length == best->length) && (span->start < best->start))) {
@@ -128,19 +143,18 @@ Span* PageHeap::AllocLarge(Length n) {
}
}
- return best == NULL ? NULL : Carve(best, n);
+ return best == NULL ? NULL : Carve(best, n, sc, align);
}
Span* PageHeap::Split(Span* span, Length n) {
ASSERT(0 < n);
ASSERT(n < span->length);
- ASSERT(span->location == Span::IN_USE);
- ASSERT(span->sizeclass == 0);
+ ASSERT((span->location != Span::IN_USE) || span->sizeclass == 0);
Event(span, 'T', n);
const int extra = span->length - n;
Span* leftover = NewSpan(span->start + n, extra);
- ASSERT(leftover->location == Span::IN_USE);
+ leftover->location = span->location;
Event(leftover, 'U', extra);
RecordSpan(leftover);
pagemap_.set(span->start + n - 1, span); // Update map from pageid to span
@@ -161,43 +175,71 @@ void PageHeap::DecommitSpan(Span* span) {
stats_.committed_bytes -= span->length << kPageShift;
}
-Span* PageHeap::Carve(Span* span, Length n) {
+Span* PageHeap::Carve(Span* span, Length n, size_t sc, size_t align) {
ASSERT(n > 0);
ASSERT(span->location != Span::IN_USE);
+ ASSERT(align >= kPageSize);
const int old_location = span->location;
+
+ Length align_pages = align >> kPageShift;
RemoveFromFreeList(span);
- span->location = Span::IN_USE;
- Event(span, 'A', n);
+
+ if (span->start & (align_pages - 1)) {
+ Length skip_for_alignment = align_pages - (span->start & (align_pages - 1));
+ Span* aligned = Split(span, skip_for_alignment);
+
+ // The next span of |span| was just split off -- no need to
+ // coalesce them. The previous span of |span| was not previously coalesced
+ // with |span|, i.e. is NULL or has location other than |old_location|.
+ const PageID p = span->start;
+ const Length n = span->length;
+ Span* prev = GetDescriptor(p-1);
+ ASSERT(prev == NULL ||
+ prev->location == Span::IN_USE ||
+ prev->location != old_location);
+ PrependToFreeList(span); // Skip coalescing - no candidates possible
+ span = aligned;
+ }
const int extra = span->length - n;
ASSERT(extra >= 0);
if (extra > 0) {
- Span* leftover = NewSpan(span->start + n, extra);
- leftover->location = old_location;
- Event(leftover, 'S', extra);
- RecordSpan(leftover);
-
+ Span* leftover = Split(span, n);
// The previous span of |leftover| was just splitted -- no need to
// coalesce them. The next span of |leftover| was not previously coalesced
- // with |span|, i.e. is NULL or has got location other than |old_location|.
+ // with |span|, i.e. is NULL or has location other than |old_location|.
const PageID p = leftover->start;
const Length len = leftover->length;
Span* next = GetDescriptor(p+len);
ASSERT (next == NULL ||
next->location == Span::IN_USE ||
next->location != leftover->location);
-
- PrependToFreeList(leftover); // Skip coalescing - no candidates possible
- span->length = n;
- pagemap_.set(span->start + n - 1, span);
+ PrependToFreeList(leftover);
}
+
ASSERT(Check());
if (old_location == Span::ON_RETURNED_FREELIST) {
// We need to recommit this address space.
CommitSpan(span);
}
- ASSERT(span->location == Span::IN_USE);
- ASSERT(span->length == n);
+
+ span->location = Span::IN_USE;
+ span->sizeclass = sc;
+ Event(span, 'A', n);
+
+ // Cache sizeclass info eagerly. Locking is not necessary.
+ // (Instead of being eager, we could just replace any stale info
+ // about this span, but that seems to be no better in practice.)
+ CacheSizeClass(span->start, sc);
+
+ if (sc != kLargeSizeClass) {
+ for (Length i = 1; i < n; i++) {
+ pagemap_.set(span->start + i, span);
+ CacheSizeClass(span->start + i, sc);
+ }
+ }
+
ASSERT(stats_.unmapped_bytes+ stats_.committed_bytes==stats_.system_bytes);
return span;
}
@@ -379,18 +421,6 @@ Length PageHeap::ReleaseAtLeastNPages(Length num_pages) {
return released_pages;
}
-void PageHeap::RegisterSizeClass(Span* span, size_t sc) {
- // Associate span object with all interior pages as well
- ASSERT(span->location == Span::IN_USE);
- ASSERT(GetDescriptor(span->start) == span);
- ASSERT(GetDescriptor(span->start+span->length-1) == span);
- Event(span, 'C', sc);
- span->sizeclass = sc;
- for (Length i = 1; i < span->length-1; i++) {
- pagemap_.set(span->start+i, span);
- }
-}
-
static double MB(uint64_t bytes) {
return bytes / 1048576.0;
}
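A worked example of the alignment arithmetic above: AlignedAllocationSize over-allocates by at most align/kPageSize - 1 pages, which guarantees Carve can always split off a misaligned prefix. The prefix length uses the power-of-two mask trick (values below are illustrative):

    // align_pages is a power of two, so `span->start & (align_pages - 1)`
    // is the start page's misalignment.  Example:
    //   span->start == 13, align_pages == 4:
    //     13 & 3 == 1, skip == 4 - 1 == 3, carved span starts at page 16,
    //     which is a multiple of 4 as required.
    Length align_pages = align >> kPageShift;
    if (span->start & (align_pages - 1)) {
      Length skip = align_pages - (span->start & (align_pages - 1));
      Span* aligned = Split(span, skip);  // prefix goes back to a free list
      // ... continue with |aligned|, as in the Carve() hunk above ...
    }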
diff --git a/third_party/tcmalloc/chromium/src/page_heap.h b/third_party/tcmalloc/chromium/src/page_heap.h
index 52acedb..63f21b2 100644
--- a/third_party/tcmalloc/chromium/src/page_heap.h
+++ b/third_party/tcmalloc/chromium/src/page_heap.h
@@ -101,21 +101,49 @@ class PERFTOOLS_DLL_DECL PageHeap {
public:
PageHeap();
- // Allocate a run of "n" pages. Returns zero if out of memory.
- // Caller should not pass "n == 0" -- instead, n should have
- // been rounded up already.
- Span* New(Length n);
+ // Allocate a run of "n" pages. Returns NULL if out of memory.
+ // Caller should not pass "n == 0" -- instead, n should have been
+ // rounded up already. The span will be used for allocating objects
+ // with the specified sizeclass sc (sc must be zero for large
+ // objects). The first page of the span will be aligned to the value
+ // specified by align, which must be a power of two.
+ Span* New(Length n, size_t sc, size_t align);
// Delete the span "[p, p+n-1]".
// REQUIRES: span was returned by earlier call to New() and
// has not yet been deleted.
void Delete(Span* span);
- // Mark an allocated span as being used for small objects of the
- // specified size-class.
- // REQUIRES: span was returned by an earlier call to New()
- // and has not yet been deleted.
- void RegisterSizeClass(Span* span, size_t sc);
+ // Gets either the size class of addr, if it is a small object, or its span.
+ // Return:
+ // if addr is invalid:
+ // leave *out_sc and *out_span unchanged and return false;
+ // if addr is valid and has a small size class:
+ // *out_sc = the size class
+ // *out_span = <undefined>
+ // return true
+ // if addr is valid and has a large size class:
+ // *out_sc = kLargeSizeClass
+ // *out_span = the span pointer
+ // return true
+ bool GetSizeClassOrSpan(void* addr, size_t* out_sc, Span** out_span) {
+ const PageID p = reinterpret_cast<uintptr_t>(addr) >> kPageShift;
+ size_t cl = GetSizeClassIfCached(p);
+ Span* span = NULL;
+
+ if (cl != kLargeSizeClass) {
+ ASSERT(cl == GetDescriptor(p)->sizeclass);
+ } else {
+ span = GetDescriptor(p);
+ if (!span) {
+ return false;
+ }
+ cl = span->sizeclass;
+ }
+ *out_span = span;
+ *out_sc = cl;
+ return true;
+ }
// Split an allocated span into two spans: one of length "n" pages
// followed by another span of length "span->length - n" pages.
@@ -123,14 +151,29 @@ class PERFTOOLS_DLL_DECL PageHeap {
// Returns a pointer to the second span.
//
// REQUIRES: "0 < n < span->length"
- // REQUIRES: span->location == IN_USE
- // REQUIRES: span->sizeclass == 0
+ // REQUIRES: a) the span is free or b) sizeclass == 0
Span* Split(Span* span, Length n);
// Return the descriptor for the specified page. Returns NULL if
// this PageID was not allocated previously.
inline Span* GetDescriptor(PageID p) const {
- return reinterpret_cast<Span*>(pagemap_.get(p));
+ Span* ret = reinterpret_cast<Span*>(pagemap_.get(p));
+#ifndef NDEBUG
+ if (ret != NULL && ret->location == Span::IN_USE) {
+ size_t cl = GetSizeClassIfCached(p);
+ // Three cases:
+ // - The object is not cached
+ // - The object is cached correctly
+ // - It is a large object and we're not looking at the first
+ // page. This happens in coalescing.
+ ASSERT(cl == kLargeSizeClass || cl == ret->sizeclass ||
+ (ret->start != p && ret->sizeclass == kLargeSizeClass));
+ // If the object is sampled, it must be kLargeSizeClass
+ ASSERT(ret->sizeclass == kLargeSizeClass || !ret->sample);
+ }
+#endif
+
+ return ret;
}
// Dump state to stderr
@@ -234,7 +277,7 @@ class PERFTOOLS_DLL_DECL PageHeap {
// length exactly "n" and mark it as non-free so it can be returned
// to the client. After all that, decrease free_pages_ by n and
// return span.
- Span* Carve(Span* span, Length n);
+ Span* Carve(Span* span, Length n, size_t sc, size_t align);
void RecordSpan(Span* span) {
pagemap_.set(span->start, span);
@@ -245,7 +288,7 @@ class PERFTOOLS_DLL_DECL PageHeap {
// Allocate a large span of length == n. If successful, returns a
// span of exactly the specified length. Else, returns NULL.
- Span* AllocLarge(Length n);
+ Span* AllocLarge(Length n, size_t sc, size_t align);
// Coalesce span with neighboring spans if possible, prepend to
// appropriate free list, and adjust stats.
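A hypothetical caller of the new lookup, spelling out the three-way contract documented above (the surrounding names are illustrative):

    size_t cl;
    Span* span;
    if (!heap->GetSizeClassOrSpan(ptr, &cl, &span)) {
      // invalid: ptr was never handed out by this page heap
    } else if (cl != kLargeSizeClass) {
      // small object: cl is the size class; span is undefined here
    } else {
      // large object: cl == kLargeSizeClass and span is its descriptor
    }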
diff --git a/third_party/tcmalloc/chromium/src/page_heap_allocator.h b/third_party/tcmalloc/chromium/src/page_heap_allocator.h
index 20e1ab1..3f75939 100644
--- a/third_party/tcmalloc/chromium/src/page_heap_allocator.h
+++ b/third_party/tcmalloc/chromium/src/page_heap_allocator.h
@@ -44,7 +44,7 @@ class PageHeapAllocator {
// allocated and their constructors might not have run by the time some
// other static variable tries to allocate memory.
void Init() {
- ASSERT(kAlignedSize <= kAllocIncrement);
+ ASSERT(sizeof(T) <= kAllocIncrement);
inuse_ = 0;
free_area_ = NULL;
free_avail_ = 0;
@@ -60,8 +60,9 @@ class PageHeapAllocator {
result = free_list_;
free_list_ = *(reinterpret_cast<void**>(result));
} else {
- if (free_avail_ < kAlignedSize) {
- // Need more room
+ if (free_avail_ < sizeof(T)) {
+ // Need more room. We assume that MetaDataAlloc returns
+ // suitably aligned memory.
free_area_ = reinterpret_cast<char*>(MetaDataAlloc(kAllocIncrement));
if (free_area_ == NULL) {
CRASH("FATAL ERROR: Out of memory trying to allocate internal "
@@ -71,8 +72,8 @@ class PageHeapAllocator {
free_avail_ = kAllocIncrement;
}
result = free_area_;
- free_area_ += kAlignedSize;
- free_avail_ -= kAlignedSize;
+ free_area_ += sizeof(T);
+ free_avail_ -= sizeof(T);
}
inuse_++;
return reinterpret_cast<T*>(result);
@@ -90,10 +91,6 @@ class PageHeapAllocator {
// How much to allocate from system at a time
static const int kAllocIncrement = 128 << 10;
- // Aligned size of T
- static const size_t kAlignedSize
- = (((sizeof(T) + kAlignment - 1) / kAlignment) * kAlignment);
-
// Free area from which to carve new objects
char* free_area_;
size_t free_avail_;
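Dropping kAlignedSize is sound because sizeof(T) is itself always a multiple of T's alignment (array elements must be contiguous), so carving consecutive sizeof(T) chunks out of a suitably aligned free_area_ keeps every object aligned. A sketch of the invariant, using C++11 alignof/static_assert purely for illustration:

    // For any complete type T, sizeof(T) % alignof(T) == 0, so:
    //   free_area_                -- aligned by assumption (MetaDataAlloc)
    //   free_area_ + sizeof(T)    -- still aligned
    //   free_area_ + 2*sizeof(T)  -- still aligned, and so on.
    template <typename T>
    void CheckCarveInvariant() {
      static_assert(sizeof(T) % alignof(T) == 0,
                    "guaranteed by the language for every complete type");
    }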
diff --git a/third_party/tcmalloc/chromium/src/pprof b/third_party/tcmalloc/chromium/src/pprof
index fec0c9e..8aff380 100755
--- a/third_party/tcmalloc/chromium/src/pprof
+++ b/third_party/tcmalloc/chromium/src/pprof
@@ -89,11 +89,10 @@ my %obj_tool_map = (
);
my $DOT = "dot"; # leave non-absolute, since it may be in /usr/local
my $GV = "gv";
+my $KCACHEGRIND = "kcachegrind";
my $PS2PDF = "ps2pdf";
# These are used for dynamic profiles
-my $WGET = "wget";
-my $WGET_FLAGS = "--no-http-keep-alive"; # only supported by some wgets
-my $CURL = "curl";
+my $URL_FETCHER = "curl -s";
# These are the web pages that servers need to support for dynamic profiles
my $HEAP_PAGE = "/pprof/heap";
@@ -107,6 +106,12 @@ my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?";
my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST
my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
+# These are the web pages that can be named on the command line.
+# All the alternatives must begin with /.
+my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" .
+ "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
+ "$FILTEREDPROFILE_PAGE)";
+
# default binary name
my $UNKNOWN_BINARY = "(unknown)";
@@ -175,12 +180,14 @@ Output type:
--text Generate text report
--callgrind Generate callgrind format to stdout
--gv Generate Postscript and display
+ --web Generate SVG and display
--list=<regexp> Generate source listing of matching routines
--disasm=<regexp> Generate disassembly of matching routines
--symbols Print demangled symbol names found at given addresses
--dot Generate DOT file to stdout
   --ps                Generate Postscript to stdout
--pdf Generate PDF to stdout
+ --svg Generate SVG to stdout
--gif Generate GIF to stdout
--raw Generate symbolized pprof data (useful with remote fetch)
@@ -223,6 +230,8 @@ pprof /bin/ls ls.prof
Enters "interactive" mode
pprof --text /bin/ls ls.prof
Outputs one line per procedure
+pprof --web /bin/ls ls.prof
+ Displays annotated call-graph in web browser
pprof --gv /bin/ls ls.prof
Displays annotated call-graph via 'gv'
pprof --gv --focus=Mutex /bin/ls ls.prof
@@ -233,6 +242,9 @@ pprof --list=getdir /bin/ls ls.prof
(Per-line) annotated source listing for getdir()
pprof --disasm=getdir /bin/ls ls.prof
(Per-PC) annotated disassembly for getdir()
+
+pprof http://localhost:1234/
+ Enters "interactive" mode
pprof --text localhost:1234
Outputs one line per procedure for localhost:1234
pprof --raw localhost:1234 > ./local.raw
@@ -292,10 +304,12 @@ sub Init() {
$main::opt_disasm = "";
$main::opt_symbols = 0;
$main::opt_gv = 0;
+ $main::opt_web = 0;
$main::opt_dot = 0;
$main::opt_ps = 0;
$main::opt_pdf = 0;
$main::opt_gif = 0;
+ $main::opt_svg = 0;
$main::opt_raw = 0;
$main::opt_nodecount = 80;
@@ -330,13 +344,16 @@ sub Init() {
# Are we using $SYMBOL_PAGE?
$main::use_symbol_page = 0;
+ # Files returned by TempName.
+ %main::tempnames = ();
+
# Type of profile we are dealing with
# Supported types:
- # cpu
- # heap
- # growth
- # contention
- $main::profile_type = ''; # Empty type means "unknown"
+ # cpu
+ # heap
+ # growth
+ # contention
+ $main::profile_type = ''; # Empty type means "unknown"
GetOptions("help!" => \$main::opt_help,
"version!" => \$main::opt_version,
@@ -355,9 +372,11 @@ sub Init() {
"disasm=s" => \$main::opt_disasm,
"symbols!" => \$main::opt_symbols,
"gv!" => \$main::opt_gv,
+ "web!" => \$main::opt_web,
"dot!" => \$main::opt_dot,
"ps!" => \$main::opt_ps,
"pdf!" => \$main::opt_pdf,
+ "svg!" => \$main::opt_svg,
"gif!" => \$main::opt_gif,
"raw!" => \$main::opt_raw,
"interactive!" => \$main::opt_interactive,
@@ -380,8 +399,8 @@ sub Init() {
"tools=s" => \$main::opt_tools,
"test!" => \$main::opt_test,
"debug!" => \$main::opt_debug,
- # Undocumented flags used only by unittests:
- "test_stride=i" => \$main::opt_test_stride,
+ # Undocumented flags used only by unittests:
+ "test_stride=i" => \$main::opt_test_stride,
) || usage("Invalid option(s)");
# Deal with the standard --help and --version
@@ -433,9 +452,11 @@ sub Init() {
($main::opt_disasm eq '' ? 0 : 1) +
($main::opt_symbols == 0 ? 0 : 1) +
$main::opt_gv +
+ $main::opt_web +
$main::opt_dot +
$main::opt_ps +
$main::opt_pdf +
+ $main::opt_svg +
$main::opt_gif +
$main::opt_raw +
$main::opt_interactive +
@@ -510,20 +531,6 @@ sub Init() {
ConfigureObjTools($main::prog)
}
- # Check what flags our commandline utilities support
- if (open(TFILE, "$WGET $WGET_FLAGS -V 2>&1 |")) {
- my @lines = <TFILE>;
- if (grep(/unrecognized/, @lines) > 0) {
- # grep found 'unrecognized' token from WGET, clear WGET flags
- $WGET_FLAGS = "";
- }
- close(TFILE);
- }
- # TODO(csilvers): check all the other binaries and objtools to see
- # if they are installed and what flags they support, and store that
- # in a data structure here, rather than scattering these tests about.
- # Then, ideally, rewrite code to use wget OR curl OR GET or ...
-
  # Break the opt_lib_prefix into the prefix_list array
@prefix_list = split (',', $main::opt_lib_prefix);
@@ -634,9 +641,24 @@ sub Main() {
} else {
if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
if ($main::opt_gv) {
- RunGV(PsTempName($main::next_tmpfile), "");
+ RunGV(TempName($main::next_tmpfile, "ps"), "");
+ } elsif ($main::opt_web) {
+ my $tmp = TempName($main::next_tmpfile, "svg");
+ RunWeb($tmp);
+        # The command we run might hand the file name off
+        # to an already running browser instance and then exit.
+        # Normally we'd remove $tmp on exit (right now); instead we
+        # fork a child to remove $tmp a little later, so that the
+        # browser has time to load it first.
+ delete $main::tempnames{$tmp};
+ if (fork() == 0) {
+ sleep 5;
+ unlink($tmp);
+ exit(0);
+ }
}
} else {
+ cleanup();
exit(1);
}
}
@@ -667,7 +689,7 @@ sub ReadlineMightFail {
sub RunGV {
my $fname = shift;
- my $bg = shift; # "" or " &" if we should run in background
+ my $bg = shift; # "" or " &" if we should run in background
if (!system("$GV --version >/dev/null 2>&1")) {
# Options using double dash are supported by this gv version.
# Also, turn on noantialias to better handle bug in gv for
@@ -682,6 +704,41 @@ sub RunGV {
}
}
+sub RunWeb {
+ my $fname = shift;
+  print STDERR "Loading web page file://$fname\n";
+
+ if (`uname` =~ /Darwin/) {
+ # OS X: open will use standard preference for SVG files.
+ system("/usr/bin/open", $fname);
+ return;
+ }
+
+ # Some kind of Unix; try generic symlinks, then specific browsers.
+ # (Stop once we find one.)
+ # Works best if the browser is already running.
+ my @alt = (
+ "/etc/alternatives/gnome-www-browser",
+ "/etc/alternatives/x-www-browser",
+ "google-chrome",
+ "firefox",
+ );
+ foreach my $b (@alt) {
+ if (system($b, $fname) == 0) {
+ return;
+ }
+ }
+
+ print STDERR "Could not load web browser.\n";
+}
+
+sub RunKcachegrind {
+ my $fname = shift;
+ my $bg = shift; # "" or " &" if we should run in background
+ print STDERR "Starting '$KCACHEGRIND " . $fname . $bg . "'\n";
+ system("$KCACHEGRIND " . $fname . $bg);
+}
+
##### Interactive helper routines #####
@@ -689,10 +746,11 @@ sub InteractiveMode {
$| = 1; # Make output unbuffered for interactive mode
my ($orig_profile, $symbols, $libs, $total) = @_;
- print "Welcome to pprof! For help, type 'help'.\n";
+ print STDERR "Welcome to pprof! For help, type 'help'.\n";
- # Use ReadLine if it's installed.
- if ( !ReadlineMightFail() &&
+ # Use ReadLine if it's installed and input comes from a console.
+ if ( -t STDIN &&
+ !ReadlineMightFail() &&
defined(eval {require Term::ReadLine}) ) {
my $term = new Term::ReadLine 'pprof';
while ( defined ($_ = $term->readline('(pprof) '))) {
@@ -703,7 +761,7 @@ sub InteractiveMode {
}
} else { # don't have readline
while (1) {
- print "(pprof) ";
+ print STDERR "(pprof) ";
$_ = <STDIN>;
last if ! defined $_ ;
s/\r//g; # turn windows-looking lines into unix-looking lines
@@ -727,13 +785,13 @@ sub InteractiveCommand {
my($orig_profile, $symbols, $libs, $total, $command) = @_;
$_ = $command; # just to make future m//'s easier
if (!defined($_)) {
- print "\n";
+ print STDERR "\n";
return 0;
}
- if (m/^ *quit/) {
+ if (m/^\s*quit/) {
return 0;
}
- if (m/^ *help/) {
+ if (m/^\s*help/) {
InteractiveHelpMessage();
return 1;
}
@@ -745,7 +803,7 @@ sub InteractiveCommand {
$main::opt_gv = 0;
$main::opt_cum = 0;
- if (m/^ *(text|top)(\d*) *(.*)/) {
+ if (m/^\s*(text|top)(\d*)\s*(.*)/) {
$main::opt_text = 1;
my $line_limit = ($2 ne "") ? int($2) : 10;
@@ -764,7 +822,24 @@ sub InteractiveCommand {
PrintText($symbols, $flat, $cumulative, $total, $line_limit);
return 1;
}
- if (m/^ *list *(.+)/) {
+  if (m/^\s*callgrind\s*([^ \n]*)/) {
+    $main::opt_callgrind = 1;
+    # Capture the filename now: regexes run inside ExtractCalls and
+    # PrintCallgrind would clobber $1 before we read it.
+    my $filename = $1;
+
+    # Get derived profiles
+    my $calls = ExtractCalls($symbols, $orig_profile);
+    if ($filename eq '') {
+      # No filename given: write a temp file and hand it to kcachegrind.
+      $filename = TempName($main::next_tmpfile, "callgrind");
+      PrintCallgrind($calls, $filename);
+      RunKcachegrind($filename, " & ");
+      $main::next_tmpfile++;
+    } else {
+      PrintCallgrind($calls, $filename);
+    }
+
+    return 1;
+  }
+ if (m/^\s*list\s*(.+)/) {
$main::opt_list = 1;
my $routine;
@@ -781,7 +856,7 @@ sub InteractiveCommand {
PrintListing($libs, $flat, $cumulative, $routine);
return 1;
}
- if (m/^ *disasm *(.+)/) {
+ if (m/^\s*disasm\s*(.+)/) {
$main::opt_disasm = 1;
my $routine;
@@ -799,12 +874,18 @@ sub InteractiveCommand {
PrintDisassembly($libs, $flat, $cumulative, $routine, $total);
return 1;
}
- if (m/^ *gv *(.*)/) {
- $main::opt_gv = 1;
+ if (m/^\s*(gv|web)\s*(.*)/) {
+ $main::opt_gv = 0;
+ $main::opt_web = 0;
+ if ($1 eq "gv") {
+ $main::opt_gv = 1;
+ } elsif ($1 eq "web") {
+ $main::opt_web = 1;
+ }
my $focus;
my $ignore;
- ($focus, $ignore) = ParseInteractiveArgs($1);
+ ($focus, $ignore) = ParseInteractiveArgs($2);
# Process current profile to account for various settings
my $profile = ProcessProfile($orig_profile, $symbols, $focus, $ignore);
@@ -815,11 +896,19 @@ sub InteractiveCommand {
my $cumulative = CumulativeProfile($reduced);
if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
- RunGV(PsTempName($main::next_tmpfile), " &");
+ if ($main::opt_gv) {
+ RunGV(TempName($main::next_tmpfile, "ps"), " &");
+ } elsif ($main::opt_web) {
+ RunWeb(TempName($main::next_tmpfile, "svg"));
+ }
$main::next_tmpfile++;
}
return 1;
}
+ if (m/^\s*$/) {
+ return 1;
+ }
+ print STDERR "Unknown command: try 'help'.\n";
return 1;
}
@@ -856,7 +945,7 @@ sub ProcessProfile {
}
sub InteractiveHelpMessage {
- print <<ENDOFHELP;
+ print STDERR <<ENDOFHELP;
Interactive pprof mode
Commands:
@@ -868,6 +957,14 @@ Commands:
the "focus" regular expression matches a routine name on the stack
trace.
+ web
+ web [focus] [-ignore1] [-ignore2]
+ Like GV, but displays profile in your web browser instead of using
+ Ghostview. Works best if your web browser is already running.
+ To change the browser that gets used:
+ On Linux, set the /etc/alternatives/gnome-www-browser symlink.
+ On OS X, change the Finder association for SVG files.
+
list [routine_regexp] [-ignore1] [-ignore2]
Show source listing of routines whose names match "routine_regexp"
@@ -882,6 +979,10 @@ Commands:
Show disassembly of routines whose names match "routine_regexp",
annotated with sample counts.
+ callgrind
+ callgrind [filename]
+ Generates callgrind file. If no filename is given, kcachegrind is called.
+
help - This listing
quit or ^D - End pprof
@@ -913,16 +1014,19 @@ sub ParseInteractiveArgs {
}
}
if ($ignore ne "") {
- print "Ignoring samples in call stacks that match '$ignore'\n";
+ print STDERR "Ignoring samples in call stacks that match '$ignore'\n";
}
return ($focus, $ignore);
}
##### Output code #####
-sub PsTempName {
+sub TempName {
my $fnum = shift;
- return "$main::tmpfile_ps" . "." . "$fnum" . ".ps";
+ my $ext = shift;
+ my $file = "$main::tmpfile_ps.$fnum.$ext";
+ $main::tempnames{$file} = 1;
+ return $file;
}
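
TempName generalizes PsTempName to arbitrary extensions and records every name it hands out in %main::tempnames, so cleanup() can unlink them all in one pass. A small usage sketch (the generated paths are hypothetical):

    my $ps  = TempName($main::next_tmpfile, "ps");    # e.g. "/tmp/pprof.0.ps"
    my $svg = TempName($main::next_tmpfile, "svg");   # e.g. "/tmp/pprof.0.svg"
    # ... later, cleanup() removes everything that was registered:
    unlink(keys %main::tempnames);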
# Print profile data in packed binary format (64-bit) to standard out
@@ -1045,7 +1149,15 @@ sub PrintText {
# Print the call graph in a way that's suitable for callgrind.
sub PrintCallgrind {
my $calls = shift;
- printf("events: Hits\n\n");
+ my $filename;
+ if ($main::opt_interactive) {
+ $filename = shift;
+    print STDERR "Writing callgrind file to '$filename'.\n";
+ } else {
+ $filename = "&STDOUT";
+ }
+ open(CG, ">".$filename );
+ printf CG ("events: Hits\n\n");
foreach my $call ( map { $_->[0] }
sort { $a->[1] cmp $b ->[1] ||
$a->[2] <=> $b->[2] }
@@ -1057,13 +1169,15 @@ sub PrintCallgrind {
my ( $caller_file, $caller_line, $caller_function,
$callee_file, $callee_line, $callee_function ) =
( $1, $2, $3, $5, $6, $7 );
- printf("fl=$caller_file\nfn=$caller_function\n");
+
+
+ printf CG ("fl=$caller_file\nfn=$caller_function\n");
if (defined $6) {
- printf("cfl=$callee_file\n");
- printf("cfn=$callee_function\n");
- printf("calls=$count $callee_line\n");
+ printf CG ("cfl=$callee_file\n");
+ printf CG ("cfn=$callee_function\n");
+ printf CG ("calls=$count $callee_line\n");
}
- printf("$caller_line $count\n\n");
+ printf CG ("$caller_line $count\n\n");
}
}
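
For readers unfamiliar with the callgrind format PrintCallgrind emits: each record names the caller's file and function, optionally a callee, and closes with the caller's line number and hit count. A hand-written sketch of one record (file names and counts made up):

    events: Hits

    fl=foo.cc
    fn=Caller
    cfl=bar.cc
    cfn=Callee
    calls=3 42
    17 3

Read as: Caller in foo.cc made 3 sampled calls from its line 17 to Callee at bar.cc line 42.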
@@ -1385,7 +1499,7 @@ sub SourceLine {
return undef;
}
my $lines = [];
- push(@{$lines}, ""); # So we can use 1-based line numbers as indices
+ push(@{$lines}, ""); # So we can use 1-based line numbers as indices
while (<FILE>) {
push(@{$lines}, $_);
}
@@ -1477,8 +1591,8 @@ sub PrintDisassembledFunction {
# Find run of instructions for this range of source lines
my $first_inst = $i;
while (($i <= $#instructions) &&
- ($instructions[$i]->[2] >= $first_line) &&
- ($instructions[$i]->[2] <= $last_line)) {
+ ($instructions[$i]->[2] >= $first_line) &&
+ ($instructions[$i]->[2] <= $last_line)) {
$e = $instructions[$i];
$flat_sum{$e->[2]} += $flat_count[$i];
$cum_sum{$e->[2]} += $cum_count[$i];
@@ -1490,16 +1604,16 @@ sub PrintDisassembledFunction {
for (my $l = $first_line; $l <= $last_line; $l++) {
my $line = SourceLine($current_file, $l);
if (!defined($line)) {
- $line = "?\n";
+ $line = "?\n";
next;
} else {
$line =~ s/^\s+//;
}
printf("%6s %6s %5d: %s",
- UnparseAlt($flat_sum{$l}),
- UnparseAlt($cum_sum{$l}),
- $l,
- $line);
+ UnparseAlt($flat_sum{$l}),
+ UnparseAlt($cum_sum{$l}),
+ $l,
+ $line);
}
# Print disassembly
@@ -1516,9 +1630,9 @@ sub PrintDisassembledFunction {
while ($d =~ s/(\w+)<[^<>]*>/$1/g) { } # Remove template arguments
printf("%6s %6s %8s: %6s\n",
- UnparseAlt($flat_count[$x]),
- UnparseAlt($cum_count[$x]),
- $address,
+ UnparseAlt($flat_count[$x]),
+ UnparseAlt($cum_count[$x]),
+ $address,
$d);
}
}
@@ -1542,7 +1656,7 @@ sub PrintDot {
# Find nodes to include
my @list = (sort { abs(GetEntry($cumulative, $b)) <=>
abs(GetEntry($cumulative, $a))
- || $a cmp $b }
+ || $a cmp $b }
keys(%{$cumulative}));
my $last = $nodecount - 1;
if ($last > $#list) {
@@ -1554,7 +1668,6 @@ sub PrintDot {
}
if ($last < 0) {
print STDERR "No nodes to print\n";
- cleanup();
return 0;
}
@@ -1567,11 +1680,14 @@ sub PrintDot {
# Open DOT output file
my $output;
if ($main::opt_gv) {
- $output = "| $DOT -Tps2 >" . PsTempName($main::next_tmpfile);
+ $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps");
} elsif ($main::opt_ps) {
$output = "| $DOT -Tps2";
} elsif ($main::opt_pdf) {
$output = "| $DOT -Tps2 | $PS2PDF - -";
+ } elsif ($main::opt_web || $main::opt_svg) {
+    # We need to post-process the SVG, so always write to a temporary file.
+ $output = "| $DOT -Tsvg >" . TempName($main::next_tmpfile, "svg");
} elsif ($main::opt_gif) {
$output = "| $DOT -Tgif";
} else {
@@ -1682,7 +1798,10 @@ sub PrintDot {
my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0);
if ($fraction > 1) { $fraction = 1; }
my $w = $fraction * 2;
- #if ($w < 1) { $w = 1; }
+ if ($w < 1 && ($main::opt_web || $main::opt_svg)) {
+ # SVG output treats line widths < 1 poorly.
+ $w = 1;
+ }
# Dot sometimes segfaults if given edge weights that are too large, so
# we cap the weights at a large value
@@ -1706,11 +1825,312 @@ sub PrintDot {
}
print DOT ("}\n");
-
close(DOT);
+
+ if ($main::opt_web || $main::opt_svg) {
+ # Rewrite SVG to be more usable inside web browser.
+ RewriteSvg(TempName($main::next_tmpfile, "svg"));
+ }
+
return 1;
}
+sub RewriteSvg {
+ my $svgfile = shift;
+
+ open(SVG, $svgfile) || die "open temp svg: $!";
+ my @svg = <SVG>;
+ close(SVG);
+ unlink $svgfile;
+ my $svg = join('', @svg);
+
+ # Dot's SVG output is
+ #
+ # <svg width="___" height="___"
+ # viewBox="___" xmlns=...>
+ # <g id="graph0" transform="...">
+ # ...
+ # </g>
+ # </svg>
+ #
+ # Change it to
+ #
+ # <svg width="100%" height="100%"
+ # xmlns=...>
+ # $svg_javascript
+ # <g id="viewport" transform="translate(0,0)">
+ # <g id="graph0" transform="...">
+ # ...
+ # </g>
+ # </g>
+ # </svg>
+
+ # Fix width, height; drop viewBox.
+ $svg =~ s/(?s)<svg width="[^"]+" height="[^"]+"(.*?)viewBox="[^"]+"/<svg width="100%" height="100%"$1/;
+
+ # Insert script, viewport <g> above first <g>
+ my $svg_javascript = SvgJavascript();
+ my $viewport = "<g id=\"viewport\" transform=\"translate(0,0)\">\n";
+ $svg =~ s/<g id="graph\d"/$svg_javascript$viewport$&/;
+
+ # Insert final </g> above </svg>.
+ $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/;
+ $svg =~ s/<g id="graph\d"(.*?)/<g id="viewport"$1/;
+
+ if ($main::opt_svg) {
+ # --svg: write to standard output.
+ print $svg;
+ } else {
+ # Write back to temporary file.
+ open(SVG, ">$svgfile") || die "open $svgfile: $!";
+ print SVG $svg;
+ close(SVG);
+ }
+}
+
+sub SvgJavascript {
+ return <<'EOF';
+<script type="text/ecmascript"><![CDATA[
+// SVGPan
+// http://www.cyberz.org/blog/2009/12/08/svgpan-a-javascript-svg-panzoomdrag-library/
+// Local modification: if(true || ...) below to force panning, never moving.
+
+/**
+ * SVGPan library 1.2
+ * ====================
+ *
+ * Given a unique existing element with id "viewport", including
+ * the library into any SVG adds the following capabilities:
+ *
+ * - Mouse panning
+ * - Mouse zooming (using the wheel)
+ *  - Object dragging
+ *
+ * Known issues:
+ *
+ * - Zooming (while panning) on Safari has still some issues
+ *
+ * Releases:
+ *
+ * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui
+ * Fixed a bug with browser mouse handler interaction
+ *
+ * 1.1, Wed Feb 3 17:39:33 GMT 2010, Zeng Xiaohui
+ * Updated the zoom code to support the mouse wheel on Safari/Chrome
+ *
+ * 1.0, Andrea Leofreddi
+ * First release
+ *
+ * This code is licensed under the following BSD license:
+ *
+ * Copyright 2009-2010 Andrea Leofreddi <a.leofreddi@itcharm.com>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ * of conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Andrea Leofreddi OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation are those of the
+ * authors and should not be interpreted as representing official policies, either expressed
+ * or implied, of Andrea Leofreddi.
+ */
+
+var root = document.documentElement;
+
+var state = 'none', stateTarget, stateOrigin, stateTf;
+
+setupHandlers(root);
+
+/**
+ * Register handlers
+ */
+function setupHandlers(root){
+  setAttributes(root, {
+    "onmousedown" : "handleMouseDown(evt)",
+    "onmousemove" : "handleMouseMove(evt)",
+    "onmouseup" : "handleMouseUp(evt)",
+    //"onmouseout" : "handleMouseUp(evt)", // Uncomment this to stop the pan functionality when dragging out of the SVG element
+  });
+
+ if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0)
+ window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari
+ else
+ window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others
+
+}
+
+/**
+ * Instance an SVGPoint object with given event coordinates.
+ */
+function getEventPoint(evt) {
+ var p = root.createSVGPoint();
+
+ p.x = evt.clientX;
+ p.y = evt.clientY;
+
+ return p;
+}
+
+/**
+ * Sets the current transform matrix of an element.
+ */
+function setCTM(element, matrix) {
+ var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")";
+
+ element.setAttribute("transform", s);
+}
+
+/**
+ * Dumps a matrix to a string (useful for debug).
+ */
+function dumpMatrix(matrix) {
+ var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\n " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\n 0, 0, 1 ]";
+
+ return s;
+}
+
+/**
+ * Sets attributes of an element.
+ */
+function setAttributes(element, attributes){
+ for (i in attributes)
+ element.setAttributeNS(null, i, attributes[i]);
+}
+
+/**
+ * Handle mouse move event.
+ */
+function handleMouseWheel(evt) {
+ if(evt.preventDefault)
+ evt.preventDefault();
+
+ evt.returnValue = false;
+
+ var svgDoc = evt.target.ownerDocument;
+
+ var delta;
+
+ if(evt.wheelDelta)
+ delta = evt.wheelDelta / 3600; // Chrome/Safari
+ else
+ delta = evt.detail / -90; // Mozilla
+
+ var z = 1 + delta; // Zoom factor: 0.9/1.1
+
+ var g = svgDoc.getElementById("viewport");
+
+ var p = getEventPoint(evt);
+
+ p = p.matrixTransform(g.getCTM().inverse());
+
+ // Compute new scale matrix in current mouse position
+ var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y);
+
+ setCTM(g, g.getCTM().multiply(k));
+
+ stateTf = stateTf.multiply(k.inverse());
+}
+
+/**
+ * Handle mouse move event.
+ */
+function handleMouseMove(evt) {
+ if(evt.preventDefault)
+ evt.preventDefault();
+
+ evt.returnValue = false;
+
+ var svgDoc = evt.target.ownerDocument;
+
+ var g = svgDoc.getElementById("viewport");
+
+ if(state == 'pan') {
+ // Pan mode
+ var p = getEventPoint(evt).matrixTransform(stateTf);
+
+ setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y));
+ } else if(state == 'move') {
+ // Move mode
+ var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse());
+
+ setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM()));
+
+ stateOrigin = p;
+ }
+}
+
+/**
+ * Handle click event.
+ */
+function handleMouseDown(evt) {
+ if(evt.preventDefault)
+ evt.preventDefault();
+
+ evt.returnValue = false;
+
+ var svgDoc = evt.target.ownerDocument;
+
+ var g = svgDoc.getElementById("viewport");
+
+ if(true || evt.target.tagName == "svg") {
+ // Pan mode
+ state = 'pan';
+
+ stateTf = g.getCTM().inverse();
+
+ stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+ } else {
+ // Move mode
+ state = 'move';
+
+ stateTarget = evt.target;
+
+ stateTf = g.getCTM().inverse();
+
+ stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+ }
+}
+
+/**
+ * Handle mouse button release event.
+ */
+function handleMouseUp(evt) {
+ if(evt.preventDefault)
+ evt.preventDefault();
+
+ evt.returnValue = false;
+
+ var svgDoc = evt.target.ownerDocument;
+
+ if(state == 'pan' || state == 'move') {
+ // Quit pan mode
+ state = '';
+ }
+}
+
+]]></script>
+EOF
+}
+
# Translate a stack of addresses into a stack of symbols
sub TranslateStack {
my $symbols = shift;
@@ -1806,7 +2226,7 @@ sub Unparse {
}
}
} elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
- return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
+ return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
} else {
return sprintf("%d", $num);
}
@@ -1947,42 +2367,42 @@ sub RemoveUninterestingFrames {
'malloc',
'free',
'memalign',
- 'posix_memalign',
+ 'posix_memalign',
'pvalloc',
'valloc',
'realloc',
- 'tc_calloc',
+ 'tc_calloc',
'tc_cfree',
'tc_malloc',
'tc_free',
'tc_memalign',
- 'tc_posix_memalign',
+ 'tc_posix_memalign',
'tc_pvalloc',
'tc_valloc',
'tc_realloc',
- 'tc_new',
- 'tc_delete',
- 'tc_newarray',
- 'tc_deletearray',
- 'tc_new_nothrow',
- 'tc_newarray_nothrow',
- 'do_malloc',
+ 'tc_new',
+ 'tc_delete',
+ 'tc_newarray',
+ 'tc_deletearray',
+ 'tc_new_nothrow',
+ 'tc_newarray_nothrow',
+ 'do_malloc',
'::do_malloc', # new name -- got moved to an unnamed ns
'::do_malloc_or_cpp_alloc',
'DoSampledAllocation',
- 'simple_alloc::allocate',
- '__malloc_alloc_template::allocate',
+ 'simple_alloc::allocate',
+ '__malloc_alloc_template::allocate',
'__builtin_delete',
'__builtin_new',
'__builtin_vec_delete',
'__builtin_vec_new',
'operator new',
'operator new[]',
- # These mark the beginning/end of our custom sections
- '__start_google_malloc',
- '__stop_google_malloc',
- '__start_malloc_hook',
- '__stop_malloc_hook') {
+ # These mark the beginning/end of our custom sections
+ '__start_google_malloc',
+ '__stop_google_malloc',
+ '__start_malloc_hook',
+ '__stop_malloc_hook') {
$skip{$name} = 1;
$skip{"_" . $name} = 1; # Mach (OS X) adds a _ prefix to everything
}
@@ -1999,11 +2419,11 @@ sub RemoveUninterestingFrames {
# TODO(dpeng): this should not be necessary; it's taken
# care of by the general 2nd-pc mechanism below.
foreach my $name ('ProfileData::Add', # historical
- 'ProfileData::prof_handler', # historical
- 'CpuProfiler::prof_handler',
+ 'ProfileData::prof_handler', # historical
+ 'CpuProfiler::prof_handler',
'__FRAME_END__',
- '__pthread_sighandler',
- '__restore') {
+ '__pthread_sighandler',
+ '__restore') {
$skip{$name} = 1;
}
} else {
@@ -2042,10 +2462,10 @@ sub RemoveUninterestingFrames {
my @path = ();
foreach my $a (@addrs) {
if (exists($symbols->{$a})) {
- my $func = $symbols->{$a}->[0];
- if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
- next;
- }
+ my $func = $symbols->{$a}->[0];
+ if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
+ next;
+ }
}
push(@path, $a);
}
@@ -2070,8 +2490,8 @@ sub ReduceProfile {
# To avoid double-counting due to recursion, skip a stack-trace
# entry if it has already been seen
if (!$seen{$e}) {
- $seen{$e} = 1;
- push(@path, $e);
+ $seen{$e} = 1;
+ push(@path, $e);
}
}
my $reduced_path = join("\n", @path);
@@ -2265,28 +2685,11 @@ sub AddEntries {
AddEntry($profile, (join "\n", @k), $count);
}
-sub IsSymbolizedProfileFile {
- my $file_name = shift;
-
- if (!(-e $file_name) || !(-r $file_name)) {
- return 0;
- }
-
- $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash
- my $symbol_marker = $&;
- # Check if the file contains a symbol-section marker.
- open(TFILE, "<$file_name");
- my @lines = <TFILE>;
- my $result = grep(/^--- *$symbol_marker/, @lines);
- close(TFILE);
- return $result > 0;
-}
-
##### Code to profile a server dynamically #####
sub CheckSymbolPage {
my $url = SymbolPageURL();
- open(SYMBOL, "$WGET $WGET_FLAGS -qO- '$url' |");
+ open(SYMBOL, "$URL_FETCHER '$url' |");
my $line = <SYMBOL>;
$line =~ s/\r//g; # turn windows-looking lines into unix-looking lines
close(SYMBOL);
@@ -2305,33 +2708,45 @@ sub CheckSymbolPage {
sub IsProfileURL {
my $profile_name = shift;
- my ($host, $port, $path) = ParseProfileURL($profile_name);
- return defined($host) and defined($port) and defined($path);
+ if (-f $profile_name) {
+ printf STDERR "Using local file $profile_name.\n";
+ return 0;
+ }
+ return 1;
}
sub ParseProfileURL {
my $profile_name = shift;
- if (defined($profile_name) &&
- $profile_name =~ m,^(http://|)([^/:]+):(\d+)(|\@\d+)(|/|.*($PROFILE_PAGE|$PMUPROFILE_PAGE|$HEAP_PAGE|$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|$FILTEREDPROFILE_PAGE))$,o) {
- # $6 is $PROFILE_PAGE/$HEAP_PAGE/etc. $5 is *everything* after
- # the hostname, as long as that everything is the empty string,
- # a slash, or something ending in $PROFILE_PAGE/$HEAP_PAGE/etc.
- # So "$6 || $5" is $PROFILE_PAGE/etc if there, or else it's "/" or "".
- return ($2, $3, $6 || $5);
- }
- return ();
+
+ if (!defined($profile_name) || $profile_name eq "") {
+ return ();
+ }
+
+  # Split the profile URL; the regex matches any non-empty string, so no need to check the result.
+ $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;
+
+ my $proto = $1 || "http://";
+ my $hostport = $2;
+ my $prefix = $3;
+ my $profile = $4 || "/";
+
+ my $host = $hostport;
+ $host =~ s/:.*//;
+
+ my $baseurl = "$proto$hostport$prefix";
+ return ($host, $baseurl, $profile);
}
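
A couple of representative inputs, assuming the $PROFILES alternation defined near the top of the script (hosts are hypothetical):

    my ($host, $base, $prof) = ParseProfileURL("http://myhost:8080/pprof/heap");
    # -> ("myhost", "http://myhost:8080", "/pprof/heap")
    ($host, $base, $prof) = ParseProfileURL("localhost:1234");
    # -> ("localhost", "http://localhost:1234", "/")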
# We fetch symbols from the first profile argument.
sub SymbolPageURL {
- my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]);
- return "http://$host:$port$SYMBOL_PAGE";
+ my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+ return "$baseURL$SYMBOL_PAGE";
}
sub FetchProgramName() {
- my ($host, $port, $path) = ParseProfileURL($main::pfile_args[0]);
- my $url = "http://$host:$port$PROGRAM_NAME_PAGE";
- my $command_line = "$WGET $WGET_FLAGS -qO- '$url'";
+ my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+ my $url = "$baseURL$PROGRAM_NAME_PAGE";
+ my $command_line = "$URL_FETCHER '$url'";
open(CMDLINE, "$command_line |") or error($command_line);
my $cmdline = <CMDLINE>;
$cmdline =~ s/\r//g; # turn windows-looking lines into unix-looking lines
@@ -2348,7 +2763,7 @@ sub FetchProgramName() {
# curl. Redirection happens on borg hosts.
sub ResolveRedirectionForCurl {
my $url = shift;
- my $command_line = "$CURL -s --head '$url'";
+ my $command_line = "$URL_FETCHER --head '$url'";
open(CMDLINE, "$command_line |") or error($command_line);
while (<CMDLINE>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
@@ -2360,6 +2775,20 @@ sub ResolveRedirectionForCurl {
return $url;
}
+# Add a timeout flag to URL_FETCHER
+sub AddFetchTimeout {
+ my $fetcher = shift;
+ my $timeout = shift;
+ if (defined($timeout)) {
+ if ($fetcher =~ m/\bcurl -s/) {
+ $fetcher .= sprintf(" --max-time %d", $timeout);
+ } elsif ($fetcher =~ m/\brpcget\b/) {
+ $fetcher .= sprintf(" --deadline=%d", $timeout);
+ }
+ }
+ return $fetcher;
+}
+
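
AddFetchTimeout only knows how to spell a deadline for the fetchers it recognizes; anything else comes back untouched. A quick usage sketch:

    my $f = AddFetchTimeout("curl -s", 30);      # -> "curl -s --max-time 30"
    my $g = AddFetchTimeout("curl -s", undef);   # -> "curl -s", unchanged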
# Reads a symbol map from the file handle name given as $1, returning
# the resulting symbol map. Also processes variables relating to symbols.
# Currently, the only variable processed is 'binary=<value>' which updates
@@ -2404,7 +2833,6 @@ sub FetchSymbols {
my $pcset = shift;
my $symbol_map = shift;
-
my %seen = ();
my @pcs = grep { !$seen{$_}++ } keys(%$pcset); # uniq
@@ -2414,12 +2842,16 @@ sub FetchSymbols {
open(POSTFILE, ">$main::tmpfile_sym");
print POSTFILE $post_data;
close(POSTFILE);
-
+
my $url = SymbolPageURL();
- # Here we use curl for sending data via POST since old
- # wget doesn't have --post-file option.
- $url = ResolveRedirectionForCurl($url);
- my $command_line = "$CURL -sd '\@$main::tmpfile_sym' '$url'";
+
+ my $command_line;
+ if ($URL_FETCHER =~ m/\bcurl -s/) {
+ $url = ResolveRedirectionForCurl($url);
+ $command_line = "$URL_FETCHER -d '\@$main::tmpfile_sym' '$url'";
+ } else {
+ $command_line = "$URL_FETCHER --post '$url' < '$main::tmpfile_sym'";
+ }
# We use c++filt in case $SYMBOL_PAGE gives us mangled symbols.
my $cppfilt = $obj_tool_map{"c++filt"};
open(SYMBOL, "$command_line | $cppfilt |") or error($command_line);
@@ -2464,10 +2896,10 @@ sub BaseName {
sub MakeProfileBaseName {
my ($binary_name, $profile_name) = @_;
- my ($host, $port, $path) = ParseProfileURL($profile_name);
+ my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
my $binary_shortname = BaseName($binary_name);
- return sprintf("%s.%s.%s-port%s",
- $binary_shortname, $main::op_time, $host, $port);
+ return sprintf("%s.%s.%s",
+ $binary_shortname, $main::op_time, $host);
}
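
With the port gone from the base name, a fetched profile now lands in a file named like the sketch below (host hypothetical; the timestamp comes from $main::op_time):

    # MakeProfileBaseName("/bin/ls", "http://myhost:8080/pprof/profile")
    # -> "ls.$main::op_time.myhost"   (formerly "ls.$main::op_time.myhost-port8080")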
sub FetchDynamicProfile {
@@ -2479,7 +2911,7 @@ sub FetchDynamicProfile {
if (!IsProfileURL($profile_name)) {
return $profile_name;
} else {
- my ($host, $port, $path) = ParseProfileURL($profile_name);
+ my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
if ($path eq "" || $path eq "/") {
# Missing type specifier defaults to cpu-profile
$path = $PROFILE_PAGE;
@@ -2487,37 +2919,28 @@ sub FetchDynamicProfile {
my $profile_file = MakeProfileBaseName($binary_name, $profile_name);
- my $url;
- my $wget_timeout;
- if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)) {
- if ($path =~ m/$PROFILE_PAGE/) {
- $url = sprintf("http://$host:$port$path?seconds=%d",
- $main::opt_seconds);
+ my $url = "$baseURL$path";
+ my $fetch_timeout = undef;
+ if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) {
+ if ($path =~ m/[?]/) {
+ $url .= "&";
} else {
- if ($profile_name =~ m/[?]/) {
- $profile_name .= "&"
- } else {
- $profile_name .= "?"
- }
- $url = sprintf("http://$profile_name" . "seconds=%d",
- $main::opt_seconds);
+ $url .= "?";
}
- $wget_timeout = sprintf("--timeout=%d",
- int($main::opt_seconds * 1.01 + 60));
+ $url .= sprintf("seconds=%d", $main::opt_seconds);
+ $fetch_timeout = $main::opt_seconds * 1.01 + 60;
} else {
# For non-CPU profiles, we add a type-extension to
# the target profile file name.
my $suffix = $path;
$suffix =~ s,/,.,g;
- $profile_file .= "$suffix";
- $url = "http://$host:$port$path";
- $wget_timeout = "";
+ $profile_file .= $suffix;
}
my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof");
- if (!(-d $profile_dir)) {
+ if (! -d $profile_dir) {
mkdir($profile_dir)
- || die("Unable to create profile directory $profile_dir: $!\n");
+ || die("Unable to create profile directory $profile_dir: $!\n");
}
my $tmp_profile = "$profile_dir/.tmp.$profile_file";
my $real_profile = "$profile_dir/$profile_file";
@@ -2526,14 +2949,15 @@ sub FetchDynamicProfile {
return $real_profile;
}
- my $cmd = "$WGET $WGET_FLAGS $wget_timeout -q -O $tmp_profile '$url'";
- if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)){
+ my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout);
+ my $cmd = "$fetcher '$url' > '$tmp_profile'";
+ if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){
print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n";
if ($encourage_patience) {
print STDERR "Be patient...\n";
}
} else {
- print STDERR "Fetching $path profile from $host:$port to\n ${real_profile}\n";
+ print STDERR "Fetching $path profile from $url to\n ${real_profile}\n";
}
(system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n");
@@ -2580,6 +3004,7 @@ sub FetchDynamicProfilesRecurse {
} else {
$position = 1 | ($position << 1);
TryCollectProfile($maxlevel, $level, $position);
+ cleanup();
exit(0);
}
}
@@ -2603,22 +3028,69 @@ sub TryCollectProfile {
# Provide a small streaming-read module to handle very large
# cpu-profile files. Stream in chunks along a sliding window.
+# Provides an interface to get one 'slot', correctly handling
+# endian-ness differences. A slot is one 32-bit or 64-bit word
+# (depending on the input profile). We determine the endianness and
+# word size of the profile by looking at the first 8 bytes: in cpu profiles,
+# the second slot is always 3 (we'll accept anything that's not 0).
BEGIN {
package CpuProfileStream;
sub new {
- my ($class, $file) = @_;
- my $self = { file => $file,
- base => 0,
- stride => 512 * 1024, # must be a multiple of |long|
- slots => []
+ my ($class, $file, $fname) = @_;
+ my $self = { file => $file,
+ base => 0,
+ stride => 512 * 1024, # must be a multiple of bitsize/8
+ slots => [],
+ unpack_code => "", # N for big-endian, V for little
};
bless $self, $class;
# Let unittests adjust the stride
if ($main::opt_test_stride > 0) {
$self->{stride} = $main::opt_test_stride;
}
- $self->overflow();
+ # Read the first two slots to figure out bitsize and endianness.
+ my $slots = $self->{slots};
+ my $str;
+ read($self->{file}, $str, 8);
+ # Set the global $address_length based on what we see here.
+ # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars).
+ $address_length = ($str eq (chr(0)x8)) ? 16 : 8;
+ if ($address_length == 8) {
+ if (substr($str, 6, 2) eq chr(0)x2) {
+ $self->{unpack_code} = 'V'; # Little-endian.
+ } elsif (substr($str, 4, 2) eq chr(0)x2) {
+ $self->{unpack_code} = 'N'; # Big-endian
+ } else {
+ ::error("$fname: header size >= 2**16\n");
+ }
+ @$slots = unpack($self->{unpack_code} . "*", $str);
+ } else {
+ # If we're a 64-bit profile, make sure we're a 64-bit-capable
+ # perl. Otherwise, each slot will be represented as a float
+ # instead of an int64, losing precision and making all the
+    # 64-bit addresses wrong.  We *could* try to handle this with
+ # software emulation of 64-bit ints, but that's added complexity
+ # for no clear benefit (yet). We use 'Q' to test for 64-bit-ness;
+ # perl docs say it's only available on 64-bit perl systems.
+ my $has_q = 0;
+      eval { pack("Q", "1"); $has_q = 1; };
+ if (!$has_q) {
+ ::error("$fname: need a 64-bit perl to process this 64-bit profile.\n");
+ }
+ read($self->{file}, $str, 8);
+ if (substr($str, 4, 4) eq chr(0)x4) {
+ # We'd love to use 'Q', but it's a) not universal, b) not endian-proof.
+ $self->{unpack_code} = 'V'; # Little-endian.
+ } elsif (substr($str, 0, 4) eq chr(0)x4) {
+ $self->{unpack_code} = 'N'; # Big-endian
+ } else {
+ ::error("$fname: header size >= 2**32\n");
+ }
+ my @pair = unpack($self->{unpack_code} . "*", $str);
+ # Since we know one of the pair is 0, it's fine to just add them.
+ @$slots = (0, $pair[0] + $pair[1]);
+ }
return $self;
}
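
The constructor's sniffing can be checked by hand: a 32-bit little-endian CPU profile begins with the words (0, 3). A self-contained sketch of how those 8 bytes drive the detection above:

    my $str = pack("V2", 0, 3);    # bytes: 00 00 00 00 03 00 00 00
    # Not all-zero, so this is a 32-bit profile ($address_length == 8);
    # bytes 6..7 are zero, so the little-endian unpack code 'V' is picked:
    print( (substr($str, 6, 2) eq chr(0) x 2) ? "V\n" : "N\n" );    # prints "V"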
@@ -2629,7 +3101,25 @@ BEGIN {
$self->{base} += $#$slots + 1; # skip over data we're replacing
my $str;
read($self->{file}, $str, $self->{stride});
- @$slots = unpack("L*", $str);
+ if ($address_length == 8) { # the 32-bit case
+ # This is the easy case: unpack provides 32-bit unpacking primitives.
+ @$slots = unpack($self->{unpack_code} . "*", $str);
+ } else {
+ # We need to unpack 32 bits at a time and combine.
+ my @b32_values = unpack($self->{unpack_code} . "*", $str);
+ my @b64_values = ();
+ for (my $i = 0; $i < $#b32_values; $i += 2) {
+ # TODO(csilvers): if this is a 32-bit perl, the math below
+ # could end up in a too-large int, which perl will promote
+ # to a double, losing necessary precision. Deal with that.
+ if ($self->{unpack_code} eq 'V') { # little-endian
+ push(@b64_values, $b32_values[$i] + $b32_values[$i+1] * (2**32));
+ } else {
+ push(@b64_values, $b32_values[$i] * (2**32) + $b32_values[$i+1]);
+ }
+ }
+ @$slots = @b64_values;
+ }
}
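
For the 64-bit case, overflow() above reassembles each slot from two 32-bit halves. A self-contained sketch of the little-endian combination (exact only on a 64-bit perl, as the TODO in the code notes):

    my ($lo, $hi) = unpack("V2", pack("V2", 0x89abcdef, 0x01234567));
    my $slot = $lo + $hi * (2**32);    # == 0x0123456789abcdef
    printf "%x\n", $slot;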
# Access the i-th long in the file (logically), or -1 at EOF.
@@ -2638,16 +3128,16 @@ BEGIN {
my $slots = $self->{slots};
while ($#$slots >= 0) {
if ($idx < $self->{base}) {
- # The only time we expect a reference to $slots[$i - something]
- # after referencing $slots[$i] is reading the very first header.
- # Since $stride > |header|, that shouldn't cause any lookback
- # errors. And everything after the header is sequential.
- print STDERR "Unexpected look-back reading CPU profile";
- return -1; # shrug, don't know what better to return
+ # The only time we expect a reference to $slots[$i - something]
+ # after referencing $slots[$i] is reading the very first header.
+ # Since $stride > |header|, that shouldn't cause any lookback
+ # errors. And everything after the header is sequential.
+ print STDERR "Unexpected look-back reading CPU profile";
+ return -1; # shrug, don't know what better to return
} elsif ($idx > $self->{base} + $#$slots) {
- $self->overflow();
+ $self->overflow();
} else {
- return $slots->[$idx - $self->{base}];
+ return $slots->[$idx - $self->{base}];
}
}
# If we get here, $slots is [], which means we've reached EOF
@@ -2655,6 +3145,44 @@ BEGIN {
}
}
+# Return the next line from the profile file, assuming it's a text
+# line (which in this case means it doesn't start with a NUL byte). If
+# it's not a text line, return "". At EOF, return undef, like perl does.
+# Input file should be in binmode.
+sub ReadProfileLine {
+ local *PROFILE = shift;
+ my $firstchar = "";
+ my $line = "";
+ read(PROFILE, $firstchar, 1);
+ seek(PROFILE, -1, 1); # unread the firstchar
+ if ($firstchar eq "\0") {
+ return "";
+ }
+ $line = <PROFILE>;
+ if (defined($line)) {
+ $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines
+ }
+ return $line;
+}
+
+sub IsSymbolizedProfileFile {
+ my $file_name = shift;
+ if (!(-e $file_name) || !(-r $file_name)) {
+ return 0;
+ }
+ # Check if the file contains a symbol-section marker.
+ open(TFILE, "<$file_name");
+ binmode TFILE;
+ my $firstline = ReadProfileLine(*TFILE);
+ close(TFILE);
+ if (!$firstline) {
+ return 0;
+ }
+ $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash
+ my $symbol_marker = $&;
+ return $firstline =~ /^--- *$symbol_marker/;
+}
+
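
The symbolized-file test therefore reduces to inspecting a single text line. A sketch, given that $SYMBOL_PAGE is "/pprof/symbol":

    my $symbol_marker = "symbol";          # tail of $SYMBOL_PAGE
    my $firstline = "--- symbol\n";        # first line of a symbolized profile
    print "symbolized profile\n" if $firstline =~ /^--- *$symbol_marker/;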
# Parse profile generated by common/profiler.cc and return a reference
# to a map:
# $result->{version} Version number of profile file
@@ -2689,28 +3217,17 @@ sub ReadProfile {
# whole firstline, since it may be gigabytes(!) of data.
open(PROFILE, "<$fname") || error("$fname: $!\n");
binmode PROFILE; # New perls do UTF-8 processing
- my $firstchar = "";
- my $header = "";
- read(PROFILE, $firstchar, 1);
- seek(PROFILE, -1, 1); # unread the firstchar
- if ($firstchar ne "\0") {
- $header = <PROFILE>;
- $header =~ s/\r//g; # turn windows-looking lines into unix-looking lines
+ my $header = ReadProfileLine(*PROFILE);
+ if (!defined($header)) { # means "at EOF"
+ error("Profile is empty.\n");
}
my $symbols;
if ($header =~ m/^--- *$symbol_marker/o) {
- # read the symbol section of the symbolized profile file
+ # Read the symbol section of the symbolized profile file.
$symbols = ReadSymbols(*PROFILE{IO});
-
- # read the next line to get the header for the remaining profile
- $header = "";
- read(PROFILE, $firstchar, 1);
- seek(PROFILE, -1, 1); # unread the firstchar
- if ($firstchar ne "\0") {
- $header = <PROFILE>;
- $header =~ s/\r//g;
- }
+ # Read the next line to get the header for the remaining profile.
+ $header = ReadProfileLine(*PROFILE) || "";
}
my $result;
@@ -2752,6 +3269,33 @@ sub ReadProfile {
return $result;
}
+# Subtract one from caller pc so we map back to call instr.
+# However, don't do this if we're reading a symbolized profile
+# file, in which case the subtract-one was done when the file
+# was written.
+#
+# We apply the same logic to all readers, though ReadCPUProfile uses an
+# independent implementation.
+sub FixCallerAddresses {
+ my $stack = shift;
+ if ($main::use_symbolized_profile) {
+ return $stack;
+ } else {
+ $stack =~ /(\s)/;
+ my $delimiter = $1;
+ my @addrs = split(' ', $stack);
+ my @fixedaddrs;
+ $#fixedaddrs = $#addrs;
+ if ($#addrs >= 0) {
+ $fixedaddrs[0] = $addrs[0];
+ }
+ for (my $i = 1; $i <= $#addrs; $i++) {
+ $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1");
+ }
+ return join $delimiter, @fixedaddrs;
+ }
+}
+
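
A small model of what FixCallerAddresses does to one stack line when $main::use_symbolized_profile is false (addresses made up; the real code goes through AddressSub, which also handles hex strings wider than a native integer):

    my @addrs = split(' ', "0x400123 0x400456 0x40078a");
    my @fixed = ($addrs[0],
                 map { sprintf("0x%x", hex($_) - 1) } @addrs[1 .. $#addrs]);
    print "@fixed\n";                      # 0x400123 0x400455 0x400789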
# CPU profile reader
sub ReadCPUProfile {
my $prog = shift;
@@ -2763,10 +3307,7 @@ sub ReadCPUProfile {
my $pcs = {};
# Parse string into array of slots.
- # L! cannot be used because with a native 64-bit build, it will cause
- # 1) a valid 64-bit profile to use the 32-bit codepath, and
- # 2) a valid 32-bit profile to be unrecognized.
- my $slots = CpuProfileStream->new(*PROFILE);
+ my $slots = CpuProfileStream->new(*PROFILE, $fname);
# Read header. The current header version is a 5-element structure
# containing:
@@ -2775,108 +3316,50 @@ sub ReadCPUProfile {
# 2: format version (0)
# 3: sampling period (usec)
# 4: unused padding (always 0)
- # The header words are 32-bit or 64-bit depending on the ABI of the program
- # that generated the profile. In the 64-bit case, since our x86-architecture
- # machines are little-endian, the actual value of each of these elements is
- # in the first 32-bit word, and the second is always zero. The @slots array
- # above was read as a sequence of 32-bit words in both cases, so we need to
- # explicitly check for both cases. A typical slot sequence for each is:
- # 32-bit: 0 3 0 100 0
- # 64-bit: 0 0 3 0 0 0 100 0 0 0
- #
if ($slots->get(0) != 0 ) {
error("$fname: not a profile file, or old format profile file\n");
}
- if ($slots->get(1) >= 3) {
- # Normal 32-bit header:
- $version = $slots->get(2);
- $period = $slots->get(3);
- $i = 2 + $slots->get(1);
- $address_length = 8;
-
- # Parse profile
- while ($slots->get($i) != -1) {
- my $n = $slots->get($i++);
- my $d = $slots->get($i++);
- if ($slots->get($i) == 0) {
- # End of profile data marker
- $i += $d;
- last;
- }
-
- # Make key out of the stack entries
- my @k = ();
- for (my $j = 0; $j < $d; $j++) {
- my $pc = sprintf("%08x", $slots->get($i+$j));
- $pcs->{$pc} = 1;
- push @k, $pc;
- }
-
- AddEntry($profile, (join "\n", @k), $n);
+ $i = 2 + $slots->get(1);
+ $version = $slots->get(2);
+ $period = $slots->get(3);
+ # Do some sanity checking on these header values.
+ if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) {
+ error("$fname: not a profile file, or corrupted profile file\n");
+ }
+
+ # Parse profile
+ while ($slots->get($i) != -1) {
+ my $n = $slots->get($i++);
+ my $d = $slots->get($i++);
+ if ($d > (2**16)) { # TODO(csilvers): what's a reasonable max-stack-depth?
+ my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8));
+ print STDERR "At index $i (address $addr):\n";
+      error("$fname: stack trace depth >= 2**16\n");
+ }
+ if ($slots->get($i) == 0) {
+ # End of profile data marker
$i += $d;
+ last;
}
- # Normal 64-bit header: All entries are doubled in size. The first
- # word (little-endian) should contain the real value, the second should
- # be zero.
- } elsif ($slots->get(1) != 0 ||
- $slots->get(2) < 3 ||
- $slots->get(3) != 0 ||
- $slots->get(5) != 0 ||
- $slots->get(7) != 0) {
- error("$fname: not a profile file, or old format profile file\n");
- } else {
- $version = $slots->get(4);
- $period = $slots->get(6);
- $i = 4 + 2 * $slots->get(2);
- $address_length = 16;
-
- # Parse profile
- while ($slots->get($i) != -1) {
- my $n = $slots->get($i++);
- my $nhi = $slots->get($i++);
- # Huge counts may coerce to floating point, keeping scale, not precision
- if ($nhi != 0) { $n += $nhi*(2**32); }
- my $d = $slots->get($i++);
- if ($slots->get($i++) != 0) {
- my $addr = sprintf("%o", 4 * $i);
- print STDERR "At index $i ($addr):\n";
- error("$fname: stack trace depth >= 2**32\n");
+ # Make key out of the stack entries
+ my @k = ();
+ for (my $j = 0; $j < $d; $j++) {
+ my $pc = $slots->get($i+$j);
+ # Subtract one from caller pc so we map back to call instr.
+ # However, don't do this if we're reading a symbolized profile
+ # file, in which case the subtract-one was done when the file
+ # was written.
+ if ($j > 0 && !$main::use_symbolized_profile) {
+ $pc--;
}
- if ($slots->get($i) == 0 && $slots->get($i+1) == 0) {
- # End of profile data marker
- $i += 2 * $d;
- last;
- }
-
- # Make key out of the stack entries
- my @k = ();
- for (my $j = 0; $j < $d; $j++) {
- my $pclo = $slots->get($i++);
- my $pchi = $slots->get($i++);
- if ($pclo == -1 || $pchi == -1) {
- error("$fname: Unexpected EOF when reading stack of depth $d\n");
- }
-
- # Subtract one from caller pc so we map back to call instr.
- # However, don't do this if we're reading a symbolized profile
- # file, in which case the subtract-one was done when the file
- # was written.
- if ($j > 0 && !$main::use_symbolized_profile) {
- if ($pclo == 0) {
- $pchi--;
- $pclo = 0xffffffff;
- } else {
- $pclo--;
- }
- }
-
- my $pc = sprintf("%08x%08x", $pchi, $pclo);
- $pcs->{$pc} = 1;
- push @k, $pc;
- }
- AddEntry($profile, (join "\n", @k), $n);
+ $pc = sprintf("%0*x", $address_length, $pc);
+ $pcs->{$pc} = 1;
+ push @k, $pc;
}
+
+ AddEntry($profile, (join "\n", @k), $n);
+ $i += $d;
}
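
The unified loop reads one flat slot stream regardless of word size. A hand-worked layout of a minimal profile (all values illustrative):

    #   0  3  0  10000  0          <- header: 0, nwords, version, period, padding
    #   5  2  0x40721f 0x4039a4    <- one sample: count, depth, then 'depth' PCs
    #   0  1  0                    <- end-of-data marker; the text map follows
    my $first_sample = 2 + 3;      # $i = 2 + nwords = 5, as computed above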
# Parse map
@@ -2947,18 +3430,18 @@ sub ReadHeapProfile {
# found for profiles generated locally, and the others for
# remote profiles.
if (($type eq "heapprofile") || ($type !~ /heap/) ) {
- # No need to adjust for the sampling rate with heap-profiler-derived data
- $sampling_algorithm = 0;
+ # No need to adjust for the sampling rate with heap-profiler-derived data
+ $sampling_algorithm = 0;
} elsif ($type =~ /_v2/) {
- $sampling_algorithm = 2; # version 2 sampling
+ $sampling_algorithm = 2; # version 2 sampling
if (defined($sample_period) && ($sample_period ne '')) {
- $sample_adjustment = int($sample_period);
- }
+ $sample_adjustment = int($sample_period);
+ }
} else {
- $sampling_algorithm = 1; # version 1 sampling
+ $sampling_algorithm = 1; # version 1 sampling
if (defined($sample_period) && ($sample_period ne '')) {
- $sample_adjustment = int($sample_period)/2;
- }
+ $sample_adjustment = int($sample_period)/2;
+ }
}
} else {
# We detect whether or not this is a remote-heap profile by checking
@@ -2970,7 +3453,7 @@ sub ReadHeapProfile {
my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
if (($n1 == $n2) && ($s1 == $s2)) {
# This is likely to be a remote-heap based sample profile
- $sampling_algorithm = 1;
+ $sampling_algorithm = 1;
}
}
}
@@ -2984,7 +3467,7 @@ sub ReadHeapProfile {
print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
} else {
printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
- $sample_adjustment);
+ $sample_adjustment);
}
if ($sampling_algorithm > 1) {
# We don't bother printing anything for the original version (version 1)
@@ -3001,7 +3484,7 @@ sub ReadHeapProfile {
if (/^MAPPED_LIBRARIES:/) {
# Read the /proc/self/maps data
while (<PROFILE>) {
- s/\r//g; # turn windows-looking lines into unix-looking lines
+ s/\r//g; # turn windows-looking lines into unix-looking lines
$map .= $_;
}
last;
@@ -3011,7 +3494,7 @@ sub ReadHeapProfile {
# Read /proc/self/maps data as formatted by DumpAddressMap()
my $buildvar = "";
while (<PROFILE>) {
- s/\r//g; # turn windows-looking lines into unix-looking lines
+ s/\r//g; # turn windows-looking lines into unix-looking lines
# Parse "build=<dir>" specification if supplied
if (m/^\s*build=(.*)\n/) {
$buildvar = $1;
@@ -3066,7 +3549,7 @@ sub ReadHeapProfile {
}
my @counts = ($n1, $s1, $n2, $s2);
- AddEntries($profile, $pcs, $stack, $counts[$index]);
+ AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
}
}
@@ -3086,7 +3569,7 @@ sub ReadSynchProfile {
my $profile = {};
my $pcs = {};
my $sampling_period = 1;
- my $cyclespernanosec = 2.8; # Default assumption for old binaries
+ my $cyclespernanosec = 2.8; # Default assumption for old binaries
my $seen_clockrate = 0;
my $line;
@@ -3112,7 +3595,7 @@ sub ReadSynchProfile {
$count *= $sampling_period;
my @values = ($cycles, $count, $cycles / $count);
- AddEntries($profile, $pcs, $stack, $values[$index]);
+ AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]);
} elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ ||
$line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
@@ -3127,7 +3610,7 @@ sub ReadSynchProfile {
# Adjust for sampling done by application
$cycles *= $sampling_period;
- AddEntries($profile, $pcs, $stack, $cycles);
+ AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles);
} elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) {
my ($variable, $value) = ($1,$2);
@@ -3308,8 +3791,8 @@ sub ParseTextSectionHeaderFromOtool {
} elsif ($line =~ /segname (\w+)/) {
$segname = $1;
} elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") &&
- $sectname eq "__text" &&
- $segname eq "__TEXT")) {
+ $sectname eq "__text" &&
+ $segname eq "__TEXT")) {
next;
} elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) {
$vma = $1;
@@ -3369,7 +3852,7 @@ sub ParseLibraries {
my $finish;
my $offset;
my $lib;
- if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*)?)$/i) {
+ if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
# Full line from /proc/self/maps. Example:
# 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so
$start = HexExtend($1);
@@ -3675,7 +4158,7 @@ sub MapToSymbols {
if ($debug) { print("---- $image ---\n"); }
for (my $i = 0; $i <= $#{$pclist}; $i++) {
# addr2line always reads hex addresses, and does not need '0x' prefix.
- if ($debug) { printf("%s\n", $pclist->[$i]); }
+ if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); }
printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
if (defined($sep_address)) {
printf ADDRESSES ("%s\n", $sep_address);
@@ -3727,7 +4210,7 @@ sub MapToSymbols {
$symbols->{$pcstr} = $sym;
}
unshift(@{$sym}, $function, $filelinenum, $fullfunction);
- if ($debug) { printf("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
+ if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
if (!defined($sep_address)) {
      # Inlining is off, so this entry ends immediately
$count++;
@@ -3783,7 +4266,7 @@ sub MapSymbolsWithNM {
}
return 1;
}
-
+
sub ShortFunctionName {
my $function = shift;
while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types
@@ -3830,6 +4313,8 @@ sub ConfigureObjTools {
if ($file_type =~ /Mach-O/) {
# OS X uses otool to examine Mach-O files, rather than objdump.
$obj_tool_map{"otool"} = "otool";
+ $obj_tool_map{"addr2line"} = "false"; # no addr2line
+ $obj_tool_map{"objdump"} = "false"; # no objdump
}
# Go fill in %obj_tool_map with the pathnames to use:
@@ -3876,9 +4361,8 @@ sub ConfigureTool {
sub cleanup {
unlink($main::tmpfile_sym);
- for (my $i = 0; $i < $main::next_tmpfile; $i++) {
- unlink(PsTempName($i));
- }
+ unlink(keys %main::tempnames);
+
# We leave any collected profiles in $HOME/pprof in case the user wants
# to look at them later. We print a message informing them of this.
if ((scalar(@main::profile_files) > 0) &&
@@ -3921,7 +4405,7 @@ sub GetProcedureBoundariesViaNm {
my $routine = "";
while (<NM>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
- if (m/^([0-9a-f]+) (.) (..*)/) {
+ if (m/^\s*([0-9a-f]+) (.) (..*)/) {
my $start_val = $1;
my $type = $2;
my $this_routine = $3;
@@ -3942,12 +4426,12 @@ sub GetProcedureBoundariesViaNm {
# we'll just go ahead and process the first entry (which never
# got touched in the queue), and ignore the others.
if ($start_val eq $last_start && $type =~ /t/i) {
- # We are the 'T' symbol at this address, replace previous symbol.
- $routine = $this_routine;
- next;
+ # We are the 'T' symbol at this address, replace previous symbol.
+ $routine = $this_routine;
+ next;
} elsif ($start_val eq $last_start) {
- # We're not the 'T' symbol at this address, so ignore us.
- next;
+ # We're not the 'T' symbol at this address, so ignore us.
+ next;
}
if ($this_routine eq $sep_symbol) {
@@ -3962,7 +4446,7 @@ sub GetProcedureBoundariesViaNm {
if (defined($routine) && $routine =~ m/$regexp/) {
$symbol_table->{$routine} = [HexExtend($last_start),
- HexExtend($start_val)];
+ HexExtend($start_val)];
}
$last_start = $start_val;
$routine = $this_routine;
@@ -3981,9 +4465,8 @@ sub GetProcedureBoundariesViaNm {
# TODO(csilvers): do better here.
if (defined($routine) && $routine =~ m/$regexp/) {
$symbol_table->{$routine} = [HexExtend($last_start),
- HexExtend($last_start)];
+ HexExtend($last_start)];
}
-
return $symbol_table;
}
@@ -4029,9 +4512,13 @@ sub GetProcedureBoundaries {
# -D to at least get *exported* symbols. If we can't use --demangle,
# we use c++filt instead, if it exists on this system.
my @nm_commands = ("$nm -n $flatten_flag $demangle_flag" .
- " $image 2>/dev/null $cppfilt_flag",
- "$nm -D -n $flatten_flag $demangle_flag" .
- " $image 2>/dev/null $cppfilt_flag");
+ " $image 2>/dev/null $cppfilt_flag",
+ "$nm -D -n $flatten_flag $demangle_flag" .
+ " $image 2>/dev/null $cppfilt_flag",
+ # 6nm is for Go binaries
+ "6nm $image 2>/dev/null | sort",
+ );
+
# If the executable is an MS Windows PDB-format executable, we'll
# have set up obj_tool_map("nm_pdb"). In this case, we actually
# want to use both unix nm and windows-specific nm_pdb, since
@@ -4263,4 +4750,3 @@ sub RunUnitTests {
}
exit ($error_count);
}
-
diff --git a/third_party/tcmalloc/chromium/src/span.h b/third_party/tcmalloc/chromium/src/span.h
index ab9a796..b3483ca 100644
--- a/third_party/tcmalloc/chromium/src/span.h
+++ b/third_party/tcmalloc/chromium/src/span.h
@@ -60,6 +60,10 @@ struct Span {
int value[64];
#endif
+ void* start_ptr() {
+ return reinterpret_cast<void*>(start << kPageShift);
+ }
+
// What freelist the span is on: IN_USE if on none, or normal or returned
enum { IN_USE, ON_NORMAL_FREELIST, ON_RETURNED_FREELIST };
};
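
start_ptr() is just the page-ID-to-address shift. A quick arithmetic sketch, assuming tcmalloc's usual 8 KiB pages (kPageShift == 13; the real constant depends on the build):

    my $kPageShift = 13;
    my $start      = 0x2a3;                     # a span's first page ID
    printf "0x%x\n", $start << $kPageShift;     # prints 0x546000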
diff --git a/third_party/tcmalloc/chromium/src/stacktrace.cc b/third_party/tcmalloc/chromium/src/stacktrace.cc
index d158eea..68cb865 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace.cc
+++ b/third_party/tcmalloc/chromium/src/stacktrace.cc
@@ -57,7 +57,45 @@
#include "stacktrace_config.h"
#if defined(STACKTRACE_INL_HEADER)
-# include STACKTRACE_INL_HEADER
+
+#define IS_STACK_FRAMES 0
+#define IS_WITH_CONTEXT 0
+#define GET_STACK_TRACE_OR_FRAMES \
+ GetStackTrace(void **result, int max_depth, int skip_count)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 1
+#define IS_WITH_CONTEXT 0
+#define GET_STACK_TRACE_OR_FRAMES \
+ GetStackFrames(void **result, int *sizes, int max_depth, int skip_count)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 0
+#define IS_WITH_CONTEXT 1
+#define GET_STACK_TRACE_OR_FRAMES \
+ GetStackTraceWithContext(void **result, int max_depth, \
+ int skip_count, const void *ucp)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 1
+#define IS_WITH_CONTEXT 1
+#define GET_STACK_TRACE_OR_FRAMES \
+ GetStackFramesWithContext(void **result, int *sizes, int max_depth, \
+ int skip_count, const void *ucp)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
#elif 0
// This is for the benefit of code analysis tools that may have
// trouble with the computed #include above.
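The rewritten stacktrace.cc generates GetStackTrace, GetStackFrames and their WithContext variants by textually including the same -inl.h body four times under different IS_STACK_FRAMES / IS_WITH_CONTEXT settings. The single-file sketch below approximates the mechanism with a macro standing in for the repeated #include; all names are illustrative:

#include <cstdio>

// Approximation only: tcmalloc keeps this body in stacktrace_*-inl.h and
// #includes it once per variant; a macro plays that role here so the sketch
// fits in one file.
#define DEFINE_GET_STACK(NAME, IS_FRAMES)                                   \
  int NAME(void** result, int* sizes, int max_depth) {                      \
    void* fake[] = {(void*)0x1000, (void*)0x2000, (void*)0x3000};           \
    int n = 0;                                                              \
    for (; n < max_depth && n < 3; ++n) {                                   \
      result[n] = fake[n];          /* stand-in for a captured PC */        \
      if (IS_FRAMES) sizes[n] = 0;  /* only the frames variant fills this */\
    }                                                                       \
    return n;                                                               \
  }

DEFINE_GET_STACK(DemoGetStackTrace, 0)   // expands to the trace-only variant
DEFINE_GET_STACK(DemoGetStackFrames, 1)  // expands to the sizes-recording one

int main() {
  void* pcs[8];
  int sizes[8];
  printf("frames: %d\n", DemoGetStackFrames(pcs, sizes, 8));
  printf("trace:  %d\n", DemoGetStackTrace(pcs, sizes, 8));
  return 0;
}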
diff --git a/third_party/tcmalloc/chromium/src/stacktrace_config.h b/third_party/tcmalloc/chromium/src/stacktrace_config.h
index b58ab1d..18f16ab 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace_config.h
+++ b/third_party/tcmalloc/chromium/src/stacktrace_config.h
@@ -53,6 +53,7 @@
# define STACKTRACE_SKIP_CONTEXT_ROUTINES 1
# elif defined(HAVE_LIBUNWIND_H) // a proxy for having libunwind installed
# define STACKTRACE_INL_HEADER "stacktrace_libunwind-inl.h"
+# define STACKTRACE_USES_LIBUNWIND 1
# elif defined(__linux)
# error Cannot calculate stack trace: need either libunwind or frame-pointers (see INSTALL file)
# else
diff --git a/third_party/tcmalloc/chromium/src/stacktrace_generic-inl.h b/third_party/tcmalloc/chromium/src/stacktrace_generic-inl.h
index 490cd9d..0e72ee7 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace_generic-inl.h
+++ b/third_party/tcmalloc/chromium/src/stacktrace_generic-inl.h
@@ -34,57 +34,32 @@
//
// Note: The glibc implementation may cause a call to malloc.
// This can cause a deadlock in HeapProfiler.
+
+#ifndef BASE_STACKTRACE_GENERIC_INL_H_
+#define BASE_STACKTRACE_GENERIC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
#include <execinfo.h>
#include <string.h>
#include "google/stacktrace.h"
+#endif // BASE_STACKTRACE_GENERIC_INL_H_
-// If you change this function, also change GetStackFrames below.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
- static const int kStackLength = 64;
- void * stack[kStackLength];
- int size;
-
- size = backtrace(stack, kStackLength);
- skip_count++; // we want to skip the current frame as well
- int result_count = size - skip_count;
- if (result_count < 0)
- result_count = 0;
- if (result_count > max_depth)
- result_count = max_depth;
- for (int i = 0; i < result_count; i++)
- result[i] = stack[i + skip_count];
-
- return result_count;
-}
+// Note: this part of the file is included several times.
+// Do not put globals below.
-// If you change this function, also change GetStackTrace above:
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
-//
-// (1) The performance of the refactored-code suffers substantially - the
-// refactored needs to be able to record the stack trace when called
-// from GetStackTrace, and both the stack trace and stack frame sizes,
-// when called from GetStackFrames - this introduces enough new
-// conditionals that GetStackTrace performance can degrade by as much
-// as 50%.
+// The following 4 functions are generated from the code below:
+// GetStack{Trace,Frames}()
+// GetStack{Trace,Frames}WithContext()
//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-// GetStackFrames depends on the compiler, and we can't guarantee the
-// behavior either-way, even with "__attribute__ ((always_inline))"
-// or "__attribute__ ((noinline))". But we need this guarantee or the
-// frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and preprocessor trouble - for example, if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
+// These functions take the following args:
+// void** result: the stack-trace, as an array
+// int* sizes: the size of each stack frame, as an array
+// (GetStackFrames* only)
+// int max_depth: the size of the result (and sizes) array(s)
+// int skip_count: how many stack pointers to skip before storing in result
+// void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
static const int kStackLength = 64;
void * stack[kStackLength];
int size;
@@ -97,10 +72,12 @@ int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
if (result_count > max_depth)
result_count = max_depth;
for (int i = 0; i < result_count; i++)
- pcs[i] = stack[i + skip_count];
+ result[i] = stack[i + skip_count];
+#if IS_STACK_FRAMES
// No implementation for finding out the stack frame sizes yet.
memset(sizes, 0, sizeof(*sizes) * result_count);
+#endif
return result_count;
}
diff --git a/third_party/tcmalloc/chromium/src/stacktrace_libunwind-inl.h b/third_party/tcmalloc/chromium/src/stacktrace_libunwind-inl.h
index d9d829a..a1d5249 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace_libunwind-inl.h
+++ b/third_party/tcmalloc/chromium/src/stacktrace_libunwind-inl.h
@@ -32,6 +32,11 @@
//
// Produce stack trace using libunwind
+#ifndef BASE_STACKTRACE_LIBUNWIND_INL_H_
+#define BASE_STACKTRACE_LIBUNWIND_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
// We only need local unwinder.
#define UNW_LOCAL_ONLY
@@ -52,73 +57,30 @@ extern "C" {
// cases, we return 0 to indicate the situation.
static __thread int recursive;
-// If you change this function, also change GetStackFrames below.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
- void *ip;
- int n = 0;
- unw_cursor_t cursor;
- unw_context_t uc;
+#endif  // BASE_STACKTRACE_LIBUNWIND_INL_H_
- if (recursive) {
- return 0;
- }
- ++recursive;
-
- unw_getcontext(&uc);
- int ret = unw_init_local(&cursor, &uc);
- assert(ret >= 0);
- skip_count++; // Do not include the "GetStackTrace" frame
-
- while (n < max_depth) {
- if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
- break;
- }
- if (skip_count > 0) {
- skip_count--;
- } else {
- result[n++] = ip;
- }
- if (unw_step(&cursor) <= 0) {
- break;
- }
- }
- --recursive;
- return n;
-}
+// Note: this part of the file is included several times.
+// Do not put globals below.
-// If you change this function, also change GetStackTrace above:
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
+// The following 4 functions are generated from the code below:
+// GetStack{Trace,Frames}()
+// GetStack{Trace,Frames}WithContext()
//
-// (1) The performance of the refactored-code suffers substantially - the
-// refactored needs to be able to record the stack trace when called
-// from GetStackTrace, and both the stack trace and stack frame sizes,
-// when called from GetStackFrames - this introduces enough new
-// conditionals that GetStackTrace performance can degrade by as much
-// as 50%.
-//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-// GetStackFrames depends on the compiler, and we can't guarantee the
-// behavior either-way, even with "__attribute__ ((always_inline))"
-// or "__attribute__ ((noinline))". But we need this guarantee or the
-// frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and preprocessor trouble - for example, if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
+// These functions take the following args:
+// void** result: the stack-trace, as an array
+// int* sizes: the size of each stack frame, as an array
+// (GetStackFrames* only)
+// int max_depth: the size of the result (and sizes) array(s)
+// int skip_count: how many stack pointers to skip before storing in result
+// void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
void *ip;
int n = 0;
unw_cursor_t cursor;
unw_context_t uc;
+#if IS_STACK_FRAMES
unw_word_t sp = 0, next_sp = 0;
+#endif
if (recursive) {
return 0;
@@ -126,31 +88,41 @@ int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
++recursive;
unw_getcontext(&uc);
- RAW_CHECK(unw_init_local(&cursor, &uc) >= 0, "unw_init_local failed");
- skip_count++; // Do not include the "GetStackFrames" frame
+ int ret = unw_init_local(&cursor, &uc);
+ assert(ret >= 0);
+ skip_count++; // Do not include current frame
while (skip_count--) {
- if (unw_step(&cursor) <= 0 ||
- unw_get_reg(&cursor, UNW_REG_SP, &next_sp) < 0) {
+ if (unw_step(&cursor) <= 0) {
goto out;
}
+#if IS_STACK_FRAMES
+ if (unw_get_reg(&cursor, UNW_REG_SP, &next_sp)) {
+ goto out;
+ }
+#endif
}
+
while (n < max_depth) {
- sp = next_sp;
- if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0)
+ if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
break;
- if (unw_step(&cursor) <= 0 ||
- unw_get_reg(&cursor, UNW_REG_SP, &next_sp)) {
- // We couldn't step any further (possibly because we reached _start).
- // Provide the last good PC we've got, and get out.
- sizes[n] = 0;
- pcs[n++] = ip;
+ }
+#if IS_STACK_FRAMES
+ sizes[n] = 0;
+#endif
+ result[n++] = ip;
+ if (unw_step(&cursor) <= 0) {
+ break;
+ }
+#if IS_STACK_FRAMES
+ sp = next_sp;
+  if (unw_get_reg(&cursor, UNW_REG_SP, &next_sp) < 0) {
break;
}
- sizes[n] = next_sp - sp;
- pcs[n++] = ip;
+ sizes[n - 1] = next_sp - sp;
+#endif
}
- out:
+out:
--recursive;
return n;
}
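For reference, the libunwind call sequence the merged GET_STACK_TRACE_OR_FRAMES body builds on (capture the context, initialize a local cursor, then loop reading UNW_REG_IP and stepping) in a minimal standalone form; link with -lunwind:

#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <cstdio>

int CaptureBacktrace(void** result, int max_depth) {
  unw_context_t uc;
  unw_cursor_t cursor;
  unw_getcontext(&uc);                  // snapshot the current registers
  if (unw_init_local(&cursor, &uc) < 0) return 0;
  int n = 0;
  while (n < max_depth) {
    unw_word_t ip;
    if (unw_get_reg(&cursor, UNW_REG_IP, &ip) < 0) break;
    result[n++] = reinterpret_cast<void*>(ip);
    if (unw_step(&cursor) <= 0) break;  // reached the outermost frame
  }
  return n;
}

int main() {
  void* pcs[32];
  int n = CaptureBacktrace(pcs, 32);
  for (int i = 0; i < n; ++i) printf("%p\n", pcs[i]);
  return 0;
}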
diff --git a/third_party/tcmalloc/chromium/src/stacktrace_powerpc-inl.h b/third_party/tcmalloc/chromium/src/stacktrace_powerpc-inl.h
index 5631e49..9a07eea 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace_powerpc-inl.h
+++ b/third_party/tcmalloc/chromium/src/stacktrace_powerpc-inl.h
@@ -36,6 +36,11 @@
// http://www.linux-foundation.org/spec/ELF/ppc64/PPC-elf64abi-1.9.html#STACK
// Linux has similar code: http://patchwork.ozlabs.org/linuxppc/patch?id=8882
+#ifndef BASE_STACKTRACE_POWERPC_INL_H_
+#define BASE_STACKTRACE_POWERPC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
#include <stdint.h> // for uintptr_t
#include <stdlib.h> // for NULL
#include <google/stacktrace.h>
@@ -71,9 +76,23 @@ static void **NextStackFrame(void **old_sp) {
// This ensures that GetStackTrace sets up the Link Register properly.
void StacktracePowerPCDummyFunction() __attribute__((noinline));
void StacktracePowerPCDummyFunction() { __asm__ volatile(""); }
+#endif // BASE_STACKTRACE_POWERPC_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
-// If you change this function, also change GetStackFrames below.
-int GetStackTrace(void** result, int max_depth, int skip_count) {
+// The following 4 functions are generated from the code below:
+// GetStack{Trace,Frames}()
+// GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+// void** result: the stack-trace, as an array
+// int* sizes: the size of each stack frame, as an array
+// (GetStackFrames* only)
+// int max_depth: the size of the result (and sizes) array(s)
+// int skip_count: how many stack pointers to skip before storing in result
+// void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
void **sp;
// Apple OS X uses an old version of gnu as -- both Darwin 7.9.0 (Panther)
// and Darwin 8.8.1 (Tiger) use as 1.38. This means we have to use a
@@ -95,11 +114,29 @@ int GetStackTrace(void** result, int max_depth, int skip_count) {
// This routine forces the compiler (at least gcc) to push it anyway.
StacktracePowerPCDummyFunction();
+#if IS_STACK_FRAMES
+ // Note we do *not* increment skip_count here for the SYSV ABI. If
+ // we did, the list of stack frames wouldn't properly match up with
+ // the list of return addresses. Note this means the top pc entry
+ // is probably bogus for linux/ppc (and other SYSV-ABI systems).
+#else
// The LR save area is used by the callee, so the top entry is bogus.
skip_count++;
+#endif
int n = 0;
while (sp && n < max_depth) {
+#if IS_STACK_FRAMES
+ // The GetStackFrames routine is called when we are in some
+ // informational context (the failure signal handler for example).
+ // Use the non-strict unwinding rules to produce a stack trace
+ // that is as complete as possible (even if it contains a few bogus
+ // entries in some rare cases).
+ void **next_sp = NextStackFrame<false>(sp);
+#else
+ void **next_sp = NextStackFrame<true>(sp);
+#endif
+
if (skip_count > 0) {
skip_count--;
} else {
@@ -120,85 +157,15 @@ int GetStackTrace(void** result, int max_depth, int skip_count) {
#else
#error Need to specify the PPC ABI for your architecture.
#endif
- }
- // Use strict unwinding rules.
- sp = NextStackFrame<true>(sp);
- }
- return n;
-}
-
-// If you change this function, also change GetStackTrace above:
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
-//
-// (1) The performance of the refactored-code suffers substantially - the
-// refactored needs to be able to record the stack trace when called
-// from GetStackTrace, and both the stack trace and stack frame sizes,
-// when called from GetStackFrames - this introduces enough new
-// conditionals that GetStackTrace performance can degrade by as much
-// as 50%.
-//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-// GetStackFrames depends on the compiler, and we can't guarantee the
-// behavior either-way, even with "__attribute__ ((always_inline))"
-// or "__attribute__ ((noinline))". But we need this guarantee or the
-// frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and preprocessor trouble - for example, if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int *sizes, int max_depth, int skip_count) {
- void **sp;
-#ifdef __APPLE__
- __asm__ volatile ("mr %0,r1" : "=r" (sp));
-#else
- __asm__ volatile ("mr %0,1" : "=r" (sp));
-#endif
- StacktracePowerPCDummyFunction();
- // Note we do *not* increment skip_count here for the SYSV ABI. If
- // we did, the list of stack frames wouldn't properly match up with
- // the list of return addresses. Note this means the top pc entry
- // is probably bogus for linux/ppc (and other SYSV-ABI systems).
-
- int n = 0;
- while (sp && n < max_depth) {
- // The GetStackFrames routine is called when we are in some
- // informational context (the failure signal handler for example).
- // Use the non-strict unwinding rules to produce a stack trace
- // that is as complete as possible (even if it contains a few bogus
- // entries in some rare cases).
- void **next_sp = NextStackFrame<false>(sp);
- if (skip_count > 0) {
- skip_count--;
- } else {
-#if defined(_CALL_AIX) || defined(_CALL_DARWIN)
- pcs[n++] = *(sp+2);
-#elif defined(_CALL_SYSV)
- pcs[n++] = *(sp+1);
-#elif defined(__APPLE__) || (defined(__linux) && defined(__PPC64__))
- // This check is in case the compiler doesn't define _CALL_AIX/etc.
- pcs[n++] = *(sp+2);
-#elif defined(__linux)
- // This check is in case the compiler doesn't define _CALL_SYSV.
- pcs[n++] = *(sp+1);
-#else
-#error Need to specify the PPC ABI for your archiecture.
-#endif
+#if IS_STACK_FRAMES
if (next_sp > sp) {
sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
} else {
// A frame-size of 0 is used to indicate unknown frame size.
sizes[n] = 0;
}
- n++;
+#endif
}
sp = next_sp;
}
diff --git a/third_party/tcmalloc/chromium/src/stacktrace_win32-inl.h b/third_party/tcmalloc/chromium/src/stacktrace_win32-inl.h
index 892cd7c..bbd4c43 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace_win32-inl.h
+++ b/third_party/tcmalloc/chromium/src/stacktrace_win32-inl.h
@@ -49,6 +49,11 @@
// This code is inspired by a patch from David Vitek:
// http://code.google.com/p/google-perftools/issues/detail?id=83
+#ifndef BASE_STACKTRACE_WIN32_INL_H_
+#define BASE_STACKTRACE_WIN32_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
#include "config.h"
#include <windows.h> // for GetProcAddress and GetModuleHandle
#include <assert.h>
@@ -82,3 +87,5 @@ PERFTOOLS_DLL_DECL int GetStackFrames(void** /* pcs */,
assert(0 == "Not yet implemented");
return 0;
}
+
+#endif // BASE_STACKTRACE_WIN32_INL_H_
diff --git a/third_party/tcmalloc/chromium/src/stacktrace_x86-inl.h b/third_party/tcmalloc/chromium/src/stacktrace_x86-inl.h
index 05701e7..6753fdb 100644
--- a/third_party/tcmalloc/chromium/src/stacktrace_x86-inl.h
+++ b/third_party/tcmalloc/chromium/src/stacktrace_x86-inl.h
@@ -31,17 +31,13 @@
// Author: Sanjay Ghemawat
//
// Produce stack trace
-//
-// NOTE: there is code duplication between
-// GetStackTrace, GetStackTraceWithContext, GetStackFrames and
-// GetStackFramesWithContext. If you update one, update them all.
-//
-// There is no easy way to avoid this, because inlining
-// interferes with skip_count, and there is no portable
-// way to turn inlining off, or force it always on.
-#include "config.h"
+#ifndef BASE_STACKTRACE_X86_INL_H_
+#define BASE_STACKTRACE_X86_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+#include "config.h"
#include <stdlib.h> // for NULL
#include <assert.h>
#if defined(HAVE_SYS_UCONTEXT_H)
@@ -190,8 +186,8 @@ static void **NextStackFrame(void **old_sp, const void *uc) {
const ucontext_t *ucv = static_cast<const ucontext_t *>(uc);
// This kernel does not use frame pointer in its VDSO code,
// and so %ebp is not suitable for unwinding.
- const void **const reg_ebp =
- reinterpret_cast<const void **>(ucv->uc_mcontext.gregs[REG_EBP]);
+ void **const reg_ebp =
+ reinterpret_cast<void **>(ucv->uc_mcontext.gregs[REG_EBP]);
const unsigned char *const reg_eip =
reinterpret_cast<unsigned char *>(ucv->uc_mcontext.gregs[REG_EIP]);
if (new_sp == reg_ebp &&
@@ -269,209 +265,24 @@ static void **NextStackFrame(void **old_sp, const void *uc) {
return new_sp;
}
-// If you change this function, see NOTE at the top of file.
-// Same as above, but with signal ucontext_t pointer.
-int GetStackTraceWithContext(void** result,
- int max_depth,
- int skip_count,
- const void *uc) {
- void **sp;
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
- // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
- // It's always correct on llvm, and the techniques below aren't (in
- // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
- // so we also prefer __builtin_frame_address when running under llvm.
- sp = reinterpret_cast<void**>(__builtin_frame_address(0));
-#elif defined(__i386__)
- // Stack frame format:
- // sp[0] pointer to previous frame
- // sp[1] caller address
- // sp[2] first argument
- // ...
- // NOTE: This will break under llvm, since result is a copy and not in sp[2]
- sp = (void **)&result - 2;
-#elif defined(__x86_64__)
- unsigned long rbp;
- // Move the value of the register %rbp into the local variable rbp.
- // We need 'volatile' to prevent this instruction from getting moved
- // around during optimization to before function prologue is done.
- // An alternative way to achieve this
- // would be (before this __asm__ instruction) to call Noop() defined as
- // static void Noop() __attribute__ ((noinline)); // prevent inlining
- // static void Noop() { asm(""); } // prevent optimizing-away
- __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
- // Arguments are passed in registers on x86-64, so we can't just
- // offset from &result
- sp = (void **) rbp;
-#else
-# error Using stacktrace_x86-inl.h on a non x86 architecture!
-#endif
-
- int n = 0;
- while (sp && n < max_depth) {
- if (*(sp+1) == reinterpret_cast<void *>(0)) {
- // In 64-bit code, we often see a frame that
- // points to itself and has a return address of 0.
- break;
- }
- if (skip_count > 0) {
- skip_count--;
- } else {
- result[n++] = *(sp+1);
- }
- // Use strict unwinding rules.
- sp = NextStackFrame<true, true>(sp, uc);
- }
- return n;
-}
-
-int GetStackTrace(void** result, int max_depth, int skip_count) {
- void **sp;
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
- // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
- // It's always correct on llvm, and the techniques below aren't (in
- // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
- // so we also prefer __builtin_frame_address when running under llvm.
- sp = reinterpret_cast<void**>(__builtin_frame_address(0));
-#elif defined(__i386__)
- // Stack frame format:
- // sp[0] pointer to previous frame
- // sp[1] caller address
- // sp[2] first argument
- // ...
- // NOTE: This will break under llvm, since result is a copy and not in sp[2]
- sp = (void **)&result - 2;
-#elif defined(__x86_64__)
- unsigned long rbp;
- // Move the value of the register %rbp into the local variable rbp.
- // We need 'volatile' to prevent this instruction from getting moved
- // around during optimization to before function prologue is done.
- // An alternative way to achieve this
- // would be (before this __asm__ instruction) to call Noop() defined as
- // static void Noop() __attribute__ ((noinline)); // prevent inlining
- // static void Noop() { asm(""); } // prevent optimizing-away
- __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
- // Arguments are passed in registers on x86-64, so we can't just
- // offset from &result
- sp = (void **) rbp;
-#else
-# error Using stacktrace_x86-inl.h on a non x86 architecture!
-#endif
+#endif // BASE_STACKTRACE_X86_INL_H_
- int n = 0;
- while (sp && n < max_depth) {
- if (*(sp+1) == reinterpret_cast<void *>(0)) {
- // In 64-bit code, we often see a frame that
- // points to itself and has a return address of 0.
- break;
- }
- if (skip_count > 0) {
- skip_count--;
- } else {
- result[n++] = *(sp+1);
- }
- // Use strict unwinding rules.
- sp = NextStackFrame<true, false>(sp, NULL);
- }
- return n;
-}
+// Note: this part of the file is included several times.
+// Do not put globals below.
-// If you change this function, see NOTE at the top of file.
-//
-// This GetStackFrames routine shares a lot of code with GetStackTrace
-// above. This code could have been refactored into a common routine,
-// and then both GetStackTrace/GetStackFrames could call that routine.
-// There are two problems with that:
+// The following 4 functions are generated from the code below:
+// GetStack{Trace,Frames}()
+// GetStack{Trace,Frames}WithContext()
//
-// (1) The performance of the refactored-code suffers substantially - the
-// refactored needs to be able to record the stack trace when called
-// from GetStackTrace, and both the stack trace and stack frame sizes,
-// when called from GetStackFrames - this introduces enough new
-// conditionals that GetStackTrace performance can degrade by as much
-// as 50%.
-//
-// (2) Whether the refactored routine gets inlined into GetStackTrace and
-// GetStackFrames depends on the compiler, and we can't guarantee the
-// behavior either-way, even with "__attribute__ ((always_inline))"
-// or "__attribute__ ((noinline))". But we need this guarantee or the
-// frame counts may be off by one.
-//
-// Both (1) and (2) can be addressed without this code duplication, by
-// clever use of template functions, and by defining GetStackTrace and
-// GetStackFrames as macros that expand to these template functions.
-// However, this approach comes with its own set of problems - namely,
-// macros and preprocessor trouble - for example, if GetStackTrace
-// and/or GetStackFrames is ever defined as a member functions in some
-// class, we are in trouble.
-int GetStackFrames(void** pcs, int* sizes, int max_depth, int skip_count) {
- void **sp;
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
- // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
- // It's always correct on llvm, and the techniques below aren't (in
- // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
- // so we also prefer __builtin_frame_address when running under llvm.
- sp = reinterpret_cast<void**>(__builtin_frame_address(0));
-#elif defined(__i386__)
- // Stack frame format:
- // sp[0] pointer to previous frame
- // sp[1] caller address
- // sp[2] first argument
- // ...
- sp = (void **)&pcs - 2;
-#elif defined(__x86_64__)
- unsigned long rbp;
- // Move the value of the register %rbp into the local variable rbp.
- // We need 'volatile' to prevent this instruction from getting moved
- // around during optimization to before function prologue is done.
- // An alternative way to achieve this
- // would be (before this __asm__ instruction) to call Noop() defined as
- // static void Noop() __attribute__ ((noinline)); // prevent inlining
- // static void Noop() { asm(""); } // prevent optimizing-away
- __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
- // Arguments are passed in registers on x86-64, so we can't just
- // offset from &result
- sp = (void **) rbp;
-#else
-# error Using stacktrace_x86-inl.h on a non x86 architecture!
-#endif
-
- int n = 0;
- while (sp && n < max_depth) {
- if (*(sp+1) == reinterpret_cast<void *>(0)) {
- // In 64-bit code, we often see a frame that
- // points to itself and has a return address of 0.
- break;
- }
- // The GetStackFrames routine is called when we are in some
- // informational context (the failure signal handler for example).
- // Use the non-strict unwinding rules to produce a stack trace
- // that is as complete as possible (even if it contains a few bogus
- // entries in some rare cases).
- void **next_sp = NextStackFrame<false, false>(sp, NULL);
- if (skip_count > 0) {
- skip_count--;
- } else {
- pcs[n] = *(sp+1);
- if (next_sp > sp) {
- sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
- } else {
- // A frame-size of 0 is used to indicate unknown frame size.
- sizes[n] = 0;
- }
- n++;
- }
- sp = next_sp;
- }
- return n;
-}
+// These functions take the following args:
+// void** result: the stack-trace, as an array
+// int* sizes: the size of each stack frame, as an array
+// (GetStackFrames* only)
+// int max_depth: the size of the result (and sizes) array(s)
+// int skip_count: how many stack pointers to skip before storing in result
+// void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
-// If you change this function, see NOTE at the top of file.
-// Same as above, but with signal ucontext_t pointer.
-int GetStackFramesWithContext(void** pcs,
- int* sizes,
- int max_depth,
- int skip_count,
- const void *uc) {
+int GET_STACK_TRACE_OR_FRAMES {
void **sp;
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
// __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
@@ -511,22 +322,22 @@ int GetStackFramesWithContext(void** pcs,
// points to itself and has a return address of 0.
break;
}
- // The GetStackFrames routine is called when we are in some
- // informational context (the failure signal handler for example).
- // Use the non-strict unwinding rules to produce a stack trace
- // that is as complete as possible (even if it contains a few bogus
- // entries in some rare cases).
- void **next_sp = NextStackFrame<false, true>(sp, uc);
+#if !IS_WITH_CONTEXT
+ const void *const ucp = NULL;
+#endif
+ void **next_sp = NextStackFrame<!IS_STACK_FRAMES, IS_WITH_CONTEXT>(sp, ucp);
if (skip_count > 0) {
skip_count--;
} else {
- pcs[n] = *(sp+1);
+ result[n] = *(sp+1);
+#if IS_STACK_FRAMES
if (next_sp > sp) {
sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
} else {
// A frame-size of 0 is used to indicate unknown frame size.
sizes[n] = 0;
}
+#endif
n++;
}
sp = next_sp;
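The x86 variants collapse into the frame-pointer walk kept by the final hunk: each frame stores the caller's frame pointer at sp[0] and the return address at sp[1]. A simplified standalone sketch, assuming the binary keeps frame pointers (gcc/clang with -fno-omit-frame-pointer):

#include <cstdio>

int WalkFramePointers(void** result, int max_depth) {
  void** fp = reinterpret_cast<void**>(__builtin_frame_address(0));
  int n = 0;
  while (fp && n < max_depth) {
    void* ret_addr = fp[1];           // return address sits above the saved fp
    if (ret_addr == nullptr) break;   // e.g. the bogus self-pointing frame
    result[n++] = ret_addr;
    void** next_fp = reinterpret_cast<void**>(fp[0]);
    if (next_fp <= fp) break;         // strict rule: frames must grow upward
    fp = next_fp;
  }
  return n;
}

int main() {
  void* pcs[16];
  printf("walked %d frames\n", WalkFramePointers(pcs, 16));
  return 0;
}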
diff --git a/third_party/tcmalloc/chromium/src/symbolize.cc b/third_party/tcmalloc/chromium/src/symbolize.cc
index 9dd890e..ff45e3e 100644
--- a/third_party/tcmalloc/chromium/src/symbolize.cc
+++ b/third_party/tcmalloc/chromium/src/symbolize.cc
@@ -87,16 +87,40 @@ int SymbolTable::Symbolize() {
#else
// All this work is to do two-way communication. ugh.
extern char* program_invocation_name; // gcc provides this
- int child_in[2]; // file descriptors
- int child_out[2]; // for now, we don't worry about child_err
- if (socketpair(AF_UNIX, SOCK_STREAM, 0, child_in) == -1) {
- return 0;
- }
- if (socketpair(AF_UNIX, SOCK_STREAM, 0, child_out) == -1) {
- close(child_in[0]);
- close(child_in[1]);
- return 0;
+ int *child_in = NULL; // file descriptors
+ int *child_out = NULL; // for now, we don't worry about child_err
+ int child_fds[5][2]; // socketpair may be called up to five times below
+
+  // The client program may close its stdin and/or stdout and/or stderr,
+ // thus allowing socketpair to reuse file descriptors 0, 1 or 2.
+ // In this case the communication between the forked processes may be broken
+ // if either the parent or the child tries to close or duplicate these
+ // descriptors. The loop below produces two pairs of file descriptors, each
+ // greater than 2 (stderr).
+ for (int i = 0; i < 5; i++) {
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, child_fds[i]) == -1) {
+      for (int j = 0; j < i; j++) {
+        close(child_fds[j][0]);
+        close(child_fds[j][1]);
+      }
+      return 0;
+ } else {
+ if ((child_fds[i][0] > 2) && (child_fds[i][1] > 2)) {
+ if (child_in == NULL) {
+ child_in = child_fds[i];
+ } else {
+ child_out = child_fds[i];
+ for (int j = 0; j < i; j++) {
+ if (child_fds[j] == child_in) continue;
+ close(child_fds[j][0]);
+ close(child_fds[j][1]);
+ }
+ break;
+ }
+ }
+ }
}
+
switch (fork()) {
case -1: { // error
close(child_in[0]);
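The socketpair loop above defends against a client that has closed stdin, stdout or stderr, in which case a fresh pair could land on descriptors 0-2 and later be clobbered when the child rewires its stdio. Below is a sketch of the same guarantee for a single pair; the patch allows five attempts because it must harvest two such pairs, and SocketpairAboveStderr is a hypothetical name:

#include <sys/socket.h>
#include <unistd.h>

// Returns true and fills pair_out with two fds, each > 2. A few attempts
// always suffice: each rejected pair keeps holding its low descriptors, so
// fds 0-2 get used up and later pairs must land above them.
bool SocketpairAboveStderr(int pair_out[2]) {
  int tries[5][2];
  int used = 0;
  bool ok = false;
  for (; used < 5; ++used) {
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, tries[used]) == -1) break;
    if (tries[used][0] > 2 && tries[used][1] > 2) {
      pair_out[0] = tries[used][0];
      pair_out[1] = tries[used][1];
      ok = true;
      ++used;
      break;
    }
  }
  // Close every pair except the accepted one.
  for (int i = 0; i < used; ++i) {
    if (ok && tries[i][0] == pair_out[0]) continue;
    close(tries[i][0]);
    close(tries[i][1]);
  }
  return ok;
}

int main() {
  int fds[2];
  if (SocketpairAboveStderr(fds)) {
    // fds[0] and fds[1] are both > 2, safe from stdio redirection games.
    close(fds[0]);
    close(fds[1]);
  }
  return 0;
}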
diff --git a/third_party/tcmalloc/chromium/src/system-alloc.cc b/third_party/tcmalloc/chromium/src/system-alloc.cc
index 21d9b43..29bed80 100644
--- a/third_party/tcmalloc/chromium/src/system-alloc.cc
+++ b/third_party/tcmalloc/chromium/src/system-alloc.cc
@@ -78,7 +78,7 @@ union MemoryAligner {
void* p;
double d;
size_t s;
-};
+} CACHELINE_ALIGNED;
static SpinLock spinlock(SpinLock::LINKER_INITIALIZED);
@@ -150,6 +150,10 @@ bool RegisterSystemAllocator(SysAllocator *a, int priority) {
void* SbrkSysAllocator::Alloc(size_t size, size_t *actual_size,
size_t alignment) {
+#ifndef HAVE_SBRK
+ failed_ = true;
+ return NULL;
+#else
// Check if we should use sbrk allocation.
// FLAGS_malloc_skip_sbrk starts out as false (its uninitialized
// state) and eventually gets initialized to the specified value. Note
@@ -164,16 +168,16 @@ void* SbrkSysAllocator::Alloc(size_t size, size_t *actual_size,
// a strict check here
if (static_cast<ptrdiff_t>(size + alignment) < 0) return NULL;
- // could theoretically return the "extra" bytes here, but this
- // is simple and correct.
- if (actual_size) {
- *actual_size = size;
- }
-
// This doesn't overflow because TCMalloc_SystemAlloc has already
// tested for overflow at the alignment boundary.
size = ((size + alignment - 1) / alignment) * alignment;
+ // "actual_size" indicates that the bytes from the returned pointer
+ // p up to and including (p + actual_size - 1) have been allocated.
+ if (actual_size) {
+ *actual_size = size;
+ }
+
// Check that we're not asking for so much more memory that we'd
// wrap around the end of the virtual address space. (This seems
// like something sbrk() should check for us, and indeed opensolaris
@@ -216,6 +220,7 @@ void* SbrkSysAllocator::Alloc(size_t size, size_t *actual_size,
ptr += alignment - (ptr & (alignment-1));
}
return reinterpret_cast<void*>(ptr);
+#endif // HAVE_SBRK
}
void SbrkSysAllocator::DumpStats(TCMalloc_Printer* printer) {
@@ -238,12 +243,6 @@ void* MmapSysAllocator::Alloc(size_t size, size_t *actual_size,
return NULL;
}
- // could theoretically return the "extra" bytes here, but this
- // is simple and correct.
- if (actual_size) {
- *actual_size = size;
- }
-
// Enforce page alignment
if (pagesize == 0) pagesize = getpagesize();
if (alignment < pagesize) alignment = pagesize;
@@ -253,6 +252,12 @@ void* MmapSysAllocator::Alloc(size_t size, size_t *actual_size,
}
size = aligned_size;
+ // "actual_size" indicates that the bytes from the returned pointer
+ // p up to and including (p + actual_size - 1) have been allocated.
+ if (actual_size) {
+ *actual_size = size;
+ }
+
// Ask for extra memory if alignment > pagesize
size_t extra = 0;
if (alignment > pagesize) {
@@ -328,12 +333,6 @@ void* DevMemSysAllocator::Alloc(size_t size, size_t *actual_size,
initialized = true;
}
- // could theoretically return the "extra" bytes here, but this
- // is simple and correct.
- if (actual_size) {
- *actual_size = size;
- }
-
// Enforce page alignment
if (pagesize == 0) pagesize = getpagesize();
if (alignment < pagesize) alignment = pagesize;
@@ -343,6 +342,12 @@ void* DevMemSysAllocator::Alloc(size_t size, size_t *actual_size,
}
size = aligned_size;
+ // "actual_size" indicates that the bytes from the returned pointer
+ // p up to and including (p + actual_size - 1) have been allocated.
+ if (actual_size) {
+ *actual_size = size;
+ }
+
// Ask for extra memory if alignment > pagesize
size_t extra = 0;
if (alignment > pagesize) {
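All three Alloc implementations now set *actual_size only after rounding, so the value reported covers exactly the bytes from the returned pointer p through p + actual_size - 1. The rounding itself is the usual round-up-to-a-multiple idiom:

#include <cassert>
#include <cstddef>

// The rounding used by the allocators above: bump the request up to a
// multiple of `alignment` and report the full amount via actual_size, since
// every byte in [p, p + actual_size) really is allocated.
size_t RoundUpToAlignment(size_t size, size_t alignment) {
  return ((size + alignment - 1) / alignment) * alignment;
}

int main() {
  assert(RoundUpToAlignment(1, 4096) == 4096);     // one byte -> one page
  assert(RoundUpToAlignment(4096, 4096) == 4096);  // already aligned
  assert(RoundUpToAlignment(4097, 4096) == 8192);  // spills into next page
  size_t actual_size = RoundUpToAlignment(5000, 4096);
  (void)actual_size;  // caller may use the extra bytes beyond the 5000 asked
  return 0;
}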
diff --git a/third_party/tcmalloc/chromium/src/system-alloc.h b/third_party/tcmalloc/chromium/src/system-alloc.h
index 60affed..8d982ef 100644
--- a/third_party/tcmalloc/chromium/src/system-alloc.h
+++ b/third_party/tcmalloc/chromium/src/system-alloc.h
@@ -48,7 +48,11 @@
// may optionally return more bytes than asked for (i.e. return an
// entire "huge" page if a huge page allocator is in use).
//
-// The returned pointer is a multiple of "alignment" if non-zero.
+// The returned pointer is a multiple of "alignment" if non-zero. The
+// returned pointer will always be aligned suitably for holding a
+// void*, double, or size_t. In addition, if this platform defines
+// CACHELINE_ALIGNED, the returned pointer will always be cacheline
+// aligned.
//
// Returns NULL when out of memory.
extern void* TCMalloc_SystemAlloc(size_t bytes, size_t *actual_bytes,
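The alignment guarantee documented above is what the CACHELINE_ALIGNED attribute on the MemoryAligner union in system-alloc.cc provides. A rough standard-C++ analogue, with alignas(64) standing in for the platform-specific macro and 64 bytes assumed as the cacheline size:

#include <cstddef>

// The union members force void*/double/size_t alignment; the alignas raises
// it further to an (assumed) cacheline. CACHELINE_ALIGNED itself is a
// tcmalloc macro whose exact expansion is platform-dependent.
union alignas(64) MemoryAlignerSketch {
  void* p;
  double d;
  size_t s;
};

static_assert(alignof(MemoryAlignerSketch) == 64,
              "the union forces (assumed) 64-byte cacheline alignment");

int main() {
  MemoryAlignerSketch a;  // a stack instance is 64-byte aligned too
  (void)a;
  return 0;
}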
diff --git a/third_party/tcmalloc/chromium/src/tcmalloc.cc b/third_party/tcmalloc/chromium/src/tcmalloc.cc
index 6acead8..79825ce 100644
--- a/third_party/tcmalloc/chromium/src/tcmalloc.cc
+++ b/third_party/tcmalloc/chromium/src/tcmalloc.cc
@@ -228,8 +228,9 @@ extern "C" {
ATTRIBUTE_SECTION(google_malloc);
void* tc_newarray_nothrow(size_t size, const std::nothrow_t&) __THROW
ATTRIBUTE_SECTION(google_malloc);
- // Surprisingly, compilers use a nothrow-delete internally. See, eg:
- // http://www.dinkumware.com/manuals/?manual=compleat&page=new.html
+ // Surprisingly, standard C++ library implementations use a
+ // nothrow-delete internally. See, eg:
+ // http://www.dinkumware.com/manuals/?manual=compleat&page=new.html
void tc_delete_nothrow(void* ptr, const std::nothrow_t&) __THROW
ATTRIBUTE_SECTION(google_malloc);
void tc_deletearray_nothrow(void* ptr, const std::nothrow_t&) __THROW
@@ -253,9 +254,9 @@ extern "C" {
// NOTE: we make many of these symbols weak, but do so in the makefile
// (via objcopy -W) and not here. That ends up being more portable.
# define ALIAS(x) __attribute__ ((alias (x)))
-void* operator new(size_t size) ALIAS("tc_new");
+void* operator new(size_t size) throw (std::bad_alloc) ALIAS("tc_new");
void operator delete(void* p) __THROW ALIAS("tc_delete");
-void* operator new[](size_t size) ALIAS("tc_newarray");
+void* operator new[](size_t size) throw (std::bad_alloc) ALIAS("tc_newarray");
void operator delete[](void* p) __THROW ALIAS("tc_deletearray");
void* operator new(size_t size, const std::nothrow_t&) __THROW
ALIAS("tc_new_nothrow");
@@ -264,7 +265,7 @@ void* operator new[](size_t size, const std::nothrow_t&) __THROW
void operator delete(void* size, const std::nothrow_t&) __THROW
ALIAS("tc_delete_nothrow");
void operator delete[](void* size, const std::nothrow_t&) __THROW
- ALIAS("tc_deletearray_nothrow");
+ ALIAS("tc_deletearray_nothrow");
extern "C" {
void* malloc(size_t size) __THROW ALIAS("tc_malloc");
void free(void* ptr) __THROW ALIAS("tc_free");
@@ -804,7 +805,17 @@ TCMallocGuard::TCMallocGuard() {
tc_free(tc_malloc(1));
ThreadCache::InitTSD();
tc_free(tc_malloc(1));
- MallocExtension::Register(new TCMallocImplementation);
+ // Either we, or debugallocation.cc, or valgrind will control memory
+ // management. We register our extension if we're the winner.
+#ifdef TCMALLOC_FOR_DEBUGALLOCATION
+ // Let debugallocation register its extension.
+#else
+ if (RunningOnValgrind()) {
+    // Let Valgrind use its own malloc (so don't register our extension).
+ } else {
+ MallocExtension::Register(new TCMallocImplementation);
+ }
+#endif
}
}
@@ -826,7 +837,28 @@ static TCMallocGuard module_enter_exit_hook;
// Helpers for the exported routines below
//-------------------------------------------------------------------
-static Span* DoSampledAllocation(size_t size) {
+static inline void* CheckedMallocResult(void *result) {
+ Span* fetched_span;
+ size_t cl;
+
+ if (result != NULL) {
+ ASSERT(Static::pageheap()->GetSizeClassOrSpan(result, &cl, &fetched_span));
+ }
+
+ return result;
+}
+
+static inline void* SpanToMallocResult(Span *span) {
+ Span* fetched_span = NULL;
+ size_t cl = 0;
+ ASSERT(Static::pageheap()->GetSizeClassOrSpan(span->start_ptr(),
+ &cl, &fetched_span));
+ ASSERT(cl == kLargeSizeClass);
+ ASSERT(span == fetched_span);
+ return span->start_ptr();
+}
+
+static void* DoSampledAllocation(size_t size) {
// Grab the stack trace outside the heap lock
StackTrace tmp;
tmp.depth = GetStackTrace(tmp.stack, tcmalloc::kMaxStackDepth, 1);
@@ -834,7 +866,8 @@ static Span* DoSampledAllocation(size_t size) {
SpinLockHolder h(Static::pageheap_lock());
// Allocate span
- Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size));
+ Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size),
+ kLargeSizeClass, kPageSize);
if (span == NULL) {
return NULL;
}
@@ -851,26 +884,7 @@ static Span* DoSampledAllocation(size_t size) {
span->objects = stack;
tcmalloc::DLL_Prepend(Static::sampled_objects(), span);
- return span;
-}
-
-static inline bool CheckCachedSizeClass(void *ptr) {
- PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
- size_t cached_value = Static::pageheap()->GetSizeClassIfCached(p);
- return cached_value == 0 ||
- cached_value == Static::pageheap()->GetDescriptor(p)->sizeclass;
-}
-
-static inline void* CheckedMallocResult(void *result)
-{
- ASSERT(result == 0 || CheckCachedSizeClass(result));
- return result;
-}
-
-static inline void* SpanToMallocResult(Span *span) {
- Static::pageheap()->CacheSizeClass(span->start, 0);
- return
- CheckedMallocResult(reinterpret_cast<void*>(span->start << kPageShift));
+ return SpanToMallocResult(span);
}
// Copy of FLAGS_tcmalloc_large_alloc_report_threshold with
@@ -916,24 +930,39 @@ inline void* do_memalign_or_cpp_memalign(size_t align, size_t size) {
return tc_new_mode ? cpp_memalign(align, size) : do_memalign(align, size);
}
+// Must be called with the page lock held.
+inline bool should_report_large(Length num_pages) {
+ const int64 threshold = large_alloc_threshold;
+ if (threshold > 0 && num_pages >= (threshold >> kPageShift)) {
+ // Increase the threshold by 1/8 every time we generate a report.
+ // We cap the threshold at 8GB to avoid overflow problems.
+ large_alloc_threshold = (threshold + threshold/8 < 8ll<<30
+ ? threshold + threshold/8 : 8ll<<30);
+ return true;
+ }
+ return false;
+}
+
// Helper for do_malloc().
-inline void* do_malloc_pages(Length num_pages) {
- Span *span;
- bool report_large = false;
- {
+inline void* do_malloc_pages(ThreadCache* heap, size_t size) {
+ void* result;
+ bool report_large;
+
+ Length num_pages = tcmalloc::pages(size);
+ size = num_pages << kPageShift;
+
+ if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
+ result = DoSampledAllocation(size);
+
SpinLockHolder h(Static::pageheap_lock());
- span = Static::pageheap()->New(num_pages);
- const int64 threshold = large_alloc_threshold;
- if (threshold > 0 && num_pages >= (threshold >> kPageShift)) {
- // Increase the threshold by 1/8 every time we generate a report.
- // We cap the threshold at 8GB to avoid overflow problems.
- large_alloc_threshold = (threshold + threshold/8 < 8ll<<30
- ? threshold + threshold/8 : 8ll<<30);
- report_large = true;
- }
+ report_large = should_report_large(num_pages);
+ } else {
+ SpinLockHolder h(Static::pageheap_lock());
+ Span* span = Static::pageheap()->New(num_pages, kLargeSizeClass, kPageSize);
+ result = (span == NULL ? NULL : SpanToMallocResult(span));
+ report_large = should_report_large(num_pages);
}
- void* result = (span == NULL ? NULL : SpanToMallocResult(span));
if (report_large) {
ReportLargeAlloc(num_pages, result);
}
@@ -945,17 +974,19 @@ inline void* do_malloc(size_t size) {
// The following call forces module initialization
ThreadCache* heap = ThreadCache::GetCache();
- if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
- Span* span = DoSampledAllocation(size);
- if (span != NULL) {
- ret = SpanToMallocResult(span);
+ if (size <= kMaxSize) {
+ size_t cl = Static::sizemap()->SizeClass(size);
+ size = Static::sizemap()->class_to_size(cl);
+
+ if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
+ ret = DoSampledAllocation(size);
+ } else {
+ // The common case, and also the simplest. This just pops the
+ // size-appropriate freelist, after replenishing it if it's empty.
+ ret = CheckedMallocResult(heap->Allocate(size, cl));
}
- } else if (size <= kMaxSize) {
- // The common case, and also the simplest. This just pops the
- // size-appropriate freelist, after replenishing it if it's empty.
- ret = CheckedMallocResult(heap->Allocate(size));
} else {
- ret = do_malloc_pages(tcmalloc::pages(size));
+ ret = do_malloc_pages(heap, size);
}
if (ret == NULL) errno = ENOMEM;
return ret;
@@ -983,28 +1014,22 @@ static inline ThreadCache* GetCacheIfPresent() {
inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
if (ptr == NULL) return;
ASSERT(Static::pageheap() != NULL); // Should not call free() before malloc()
- const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
- Span* span = NULL;
- size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
-
- if (cl == 0) {
- span = Static::pageheap()->GetDescriptor(p);
- if (!span) {
- // span can be NULL because the pointer passed in is invalid
- // (not something returned by malloc or friends), or because the
- // pointer was allocated with some other allocator besides
- // tcmalloc. The latter can happen if tcmalloc is linked in via
- // a dynamic library, but is not listed last on the link line.
- // In that case, libraries after it on the link line will
- // allocate with libc malloc, but free with tcmalloc's free.
- (*invalid_free_fn)(ptr); // Decide how to handle the bad free request
- return;
- }
- cl = span->sizeclass;
- Static::pageheap()->CacheSizeClass(p, cl);
+ Span* span;
+ size_t cl;
+
+ if (!Static::pageheap()->GetSizeClassOrSpan(ptr, &cl, &span)) {
+ // result can be false because the pointer passed in is invalid
+ // (not something returned by malloc or friends), or because the
+ // pointer was allocated with some other allocator besides
+ // tcmalloc. The latter can happen if tcmalloc is linked in via
+ // a dynamic library, but is not listed last on the link line.
+ // In that case, libraries after it on the link line will
+ // allocate with libc malloc, but free with tcmalloc's free.
+ (*invalid_free_fn)(ptr); // Decide how to handle the bad free request
+ return;
}
- if (cl != 0) {
- ASSERT(!Static::pageheap()->GetDescriptor(p)->sample);
+
+ if (cl != kLargeSizeClass) {
ThreadCache* heap = GetCacheIfPresent();
if (heap != NULL) {
heap->Deallocate(ptr, cl);
@@ -1015,8 +1040,7 @@ inline void do_free_with_callback(void* ptr, void (*invalid_free_fn)(void*)) {
}
} else {
SpinLockHolder h(Static::pageheap_lock());
- ASSERT(reinterpret_cast<uintptr_t>(ptr) % kPageSize == 0);
- ASSERT(span != NULL && span->start == p);
+ ASSERT(span != NULL && ptr == span->start_ptr());
if (span->sample) {
tcmalloc::DLL_Remove(span);
Static::stacktrace_allocator()->Delete(
@@ -1036,20 +1060,17 @@ inline size_t GetSizeWithCallback(void* ptr,
size_t (*invalid_getsize_fn)(void*)) {
if (ptr == NULL)
return 0;
- const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
- size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
- if (cl != 0) {
+
+ Span* span;
+ size_t cl;
+ if (!Static::pageheap()->GetSizeClassOrSpan(ptr, &cl, &span)) {
+ return (*invalid_getsize_fn)(ptr);
+ }
+
+ if (cl != kLargeSizeClass) {
return Static::sizemap()->ByteSizeForClass(cl);
} else {
- Span *span = Static::pageheap()->GetDescriptor(p);
- if (span == NULL) { // means we do not own this memory
- return (*invalid_getsize_fn)(ptr);
- } else if (span->sizeclass != 0) {
- Static::pageheap()->CacheSizeClass(p, span->sizeclass);
- return Static::sizemap()->ByteSizeForClass(span->sizeclass);
- } else {
- return span->length << kPageShift;
- }
+ return span->length << kPageShift;
}
}
@@ -1136,47 +1157,18 @@ void* do_memalign(size_t align, size_t size) {
}
if (cl < kNumClasses) {
ThreadCache* heap = ThreadCache::GetCache();
- return CheckedMallocResult(heap->Allocate(
- Static::sizemap()->class_to_size(cl)));
+ size = Static::sizemap()->class_to_size(cl);
+ return CheckedMallocResult(heap->Allocate(size, cl));
}
}
// We will allocate directly from the page heap
SpinLockHolder h(Static::pageheap_lock());
- if (align <= kPageSize) {
- // Any page-level allocation will be fine
- // TODO: We could put the rest of this page in the appropriate
- // TODO: cache but it does not seem worth it.
- Span* span = Static::pageheap()->New(tcmalloc::pages(size));
- return span == NULL ? NULL : SpanToMallocResult(span);
- }
-
- // Allocate extra pages and carve off an aligned portion
- const Length alloc = tcmalloc::pages(size + align);
- Span* span = Static::pageheap()->New(alloc);
- if (span == NULL) return NULL;
-
- // Skip starting portion so that we end up aligned
- Length skip = 0;
- while ((((span->start+skip) << kPageShift) & (align - 1)) != 0) {
- skip++;
- }
- ASSERT(skip < alloc);
- if (skip > 0) {
- Span* rest = Static::pageheap()->Split(span, skip);
- Static::pageheap()->Delete(span);
- span = rest;
- }
-
- // Skip trailing portion that we do not need to return
- const Length needed = tcmalloc::pages(size);
- ASSERT(span->length >= needed);
- if (span->length > needed) {
- Span* trailer = Static::pageheap()->Split(span, needed);
- Static::pageheap()->Delete(trailer);
- }
- return SpanToMallocResult(span);
+ // Any page-level allocation will be fine
+ Span* span = Static::pageheap()->New(tcmalloc::pages(size),
+ kLargeSizeClass, align);
+ return span == NULL ? NULL : SpanToMallocResult(span);
}
// Helpers for use by exported routines below:
@@ -1392,8 +1384,7 @@ extern "C" PERFTOOLS_DLL_DECL void* tc_new(size_t size) {
return p;
}
-extern "C" PERFTOOLS_DLL_DECL void* tc_new_nothrow(
- size_t size, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size, const std::nothrow_t&) __THROW {
void* p = cpp_alloc(size, true);
MallocHook::InvokeNewHook(p, size);
return p;
@@ -1404,10 +1395,10 @@ extern "C" PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW {
do_free(p);
}
-// Compilers define and use this (via ::operator delete(ptr, nothrow)).
+// Standard C++ library implementations define and use this
+// (via ::operator delete(ptr, nothrow)).
// But it's really the same as normal delete, so we just do the same thing.
-extern "C" PERFTOOLS_DLL_DECL void tc_delete_nothrow(
- void* p, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p, const std::nothrow_t&) __THROW {
MallocHook::InvokeDeleteHook(p);
do_free(p);
}
@@ -1423,8 +1414,8 @@ extern "C" PERFTOOLS_DLL_DECL void* tc_newarray(size_t size) {
return p;
}
-extern "C" PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(
- size_t size, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size, const std::nothrow_t&)
+ __THROW {
void* p = cpp_alloc(size, true);
MallocHook::InvokeNewHook(p, size);
return p;
@@ -1435,8 +1426,7 @@ extern "C" PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW {
do_free(p);
}
-extern "C" PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(
- void* p, const std::nothrow_t&) __THROW {
+extern "C" PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p, const std::nothrow_t&) __THROW {
MallocHook::InvokeDeleteHook(p);
do_free(p);
}
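The extracted should_report_large() preserves the old growth rule: each report raises the threshold by one eighth, saturating at 8GB so the int64 arithmetic cannot overflow. A small standalone reproduction of that arithmetic (the 1GB starting value is hypothetical):

#include <cstdint>
#include <cstdio>

// Mirrors the growth rule in should_report_large().
int64_t NextThreshold(int64_t threshold) {
  const int64_t kCap = 8ll << 30;  // 8GB cap
  return threshold + threshold / 8 < kCap ? threshold + threshold / 8 : kCap;
}

int main() {
  int64_t t = 1 << 30;  // hypothetical starting threshold of 1GB
  for (int i = 0; i < 20; ++i) {
    t = NextThreshold(t);
  }
  // Grows ~12.5% per report: 1GB -> 1.125GB -> ... -> capped at 8GB.
  printf("threshold after 20 reports: %lld bytes\n", (long long)t);
  return 0;
}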
diff --git a/third_party/tcmalloc/chromium/src/tests/debugallocation_test.cc b/third_party/tcmalloc/chromium/src/tests/debugallocation_test.cc
index ca00e36..c482187 100644
--- a/third_party/tcmalloc/chromium/src/tests/debugallocation_test.cc
+++ b/third_party/tcmalloc/chromium/src/tests/debugallocation_test.cc
@@ -75,7 +75,14 @@ static int test_counter = 0; // incremented every time the macro is called
// This flag won't be compiled in in opt mode.
DECLARE_int32(max_free_queue_size);
+// Test match as well as mismatch rules:
TEST(DebugAllocationTest, DeallocMismatch) {
+ // malloc can be matched only by free
+ // new can be matched only by delete and delete(nothrow)
+ // new[] can be matched only by delete[] and delete[](nothrow)
+ // new(nothrow) can be matched only by delete and delete(nothrow)
+ // new(nothrow)[] can be matched only by delete[] and delete[](nothrow)
+
// Allocate with malloc.
{
int* x = static_cast<int*>(malloc(sizeof(*x)));
@@ -88,17 +95,41 @@ TEST(DebugAllocationTest, DeallocMismatch) {
// Allocate with new.
{
int* x = new int;
+ int* y = new int;
IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
delete x;
+ ::operator delete(y, std::nothrow);
}
// Allocate with new[].
{
int* x = new int[1];
+ int* y = new int[1];
+ IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+ IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
+ delete [] x;
+ ::operator delete[](y, std::nothrow);
+ }
+
+ // Allocate with new(nothrow).
+ {
+ int* x = new(std::nothrow) int;
+ int* y = new(std::nothrow) int;
+ IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+ IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
+ delete x;
+ ::operator delete(y, std::nothrow);
+ }
+
+ // Allocate with new(nothrow)[].
+ {
+ int* x = new(std::nothrow) int[1];
+ int* y = new(std::nothrow) int[1];
IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
delete [] x;
+ ::operator delete[](y, std::nothrow);
}
}
diff --git a/third_party/tcmalloc/chromium/src/tests/heap-checker-death_unittest.sh b/third_party/tcmalloc/chromium/src/tests/heap-checker-death_unittest.sh
index 9f0c08c..4a83fc2 100644
--- a/third_party/tcmalloc/chromium/src/tests/heap-checker-death_unittest.sh
+++ b/third_party/tcmalloc/chromium/src/tests/heap-checker-death_unittest.sh
@@ -139,13 +139,13 @@ EARLY_MSG="Starting tracking the heap$"
Test 60 0 "$EARLY_MSG" "" \
HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
- PERFTOOLS_VERBOSE=1 || exit 5
+ PERFTOOLS_VERBOSE=10 || exit 5
Test 60 0 "MemoryRegionMap Init$" "" \
HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
- PERFTOOLS_VERBOSE=2 || exit 6
+ PERFTOOLS_VERBOSE=11 || exit 6
Test 60 0 "" "$EARLY_MSG" \
HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
- PERFTOOLS_VERBOSE=-2 || exit 7
+ PERFTOOLS_VERBOSE=-11 || exit 7
# These invocations should fail with very high probability,
# rather than return 0 or hang (1 == exit(1), 134 == abort(), 139 = SIGSEGV):
@@ -162,10 +162,10 @@ Test 60 1 "MakeALeak" "" \
# Test that very early log messages are present and controllable:
Test 60 1 "Starting tracking the heap$" "" \
- HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=1 \
+ HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=10 \
|| exit 11
Test 60 1 "" "Starting tracking the heap" \
- HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=-1 \
+ HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=-10 \
|| exit 12
cd / # so we're not in TMPDIR when we delete it
diff --git a/third_party/tcmalloc/chromium/src/tests/page_heap_test.cc b/third_party/tcmalloc/chromium/src/tests/page_heap_test.cc
index 9120b78..fd444da 100644
--- a/third_party/tcmalloc/chromium/src/tests/page_heap_test.cc
+++ b/third_party/tcmalloc/chromium/src/tests/page_heap_test.cc
@@ -26,7 +26,7 @@ static void TestPageHeap_Stats() {
CheckStats(ph, 0, 0, 0);
// Allocate a span 's1'
- tcmalloc::Span* s1 = ph->New(256);
+ tcmalloc::Span* s1 = ph->New(256, kLargeSizeClass, kPageSize);
CheckStats(ph, 256, 0, 0);
// Split span 's1' into 's1', 's2'. Delete 's2'
diff --git a/third_party/tcmalloc/chromium/src/tests/profiler_unittest.cc b/third_party/tcmalloc/chromium/src/tests/profiler_unittest.cc
index 1908b03..19371b7 100644
--- a/third_party/tcmalloc/chromium/src/tests/profiler_unittest.cc
+++ b/third_party/tcmalloc/chromium/src/tests/profiler_unittest.cc
@@ -56,12 +56,11 @@ static void test_other_thread() {
int i, m;
char b[128];
+ MutexLock ml(&mutex);
for (m = 0; m < 1000000; ++m) { // run millions of times
for (i = 0; i < g_iters; ++i ) {
- MutexLock ml(&mutex);
result ^= i;
}
- MutexLock ml(&mutex);
snprintf(b, sizeof(b), "%d", result); // get some libc action
}
#endif
@@ -70,12 +69,11 @@ static void test_other_thread() {
static void test_main_thread() {
int i, m;
char b[128];
+ MutexLock ml(&mutex);
for (m = 0; m < 1000000; ++m) { // run millions of times
for (i = 0; i < g_iters; ++i ) {
- MutexLock ml(&mutex);
result ^= i;
}
- MutexLock ml(&mutex);
snprintf(b, sizeof(b), "%d", result); // get some libc action
}
}
diff --git a/third_party/tcmalloc/chromium/src/tests/profiler_unittest.sh b/third_party/tcmalloc/chromium/src/tests/profiler_unittest.sh
index 5766f2e..4668fa7 100644
--- a/third_party/tcmalloc/chromium/src/tests/profiler_unittest.sh
+++ b/third_party/tcmalloc/chromium/src/tests/profiler_unittest.sh
@@ -206,28 +206,27 @@ CPUPROFILE="$TMPDIR/p5" "$PROFILER2" 50 || RegisterFailure
CPUPROFILE="$TMPDIR/p6" "$PROFILER2" 100 || RegisterFailure
VerifySimilar p5 "$PROFILER2_REALNAME" p6 "$PROFILER2_REALNAME" 2
-# When we compile with threads, things take a lot longer even when we only use 1
-CPUPROFILE="$TMPDIR/p5b" "$PROFILER3" 10 || RegisterFailure
-CPUPROFILE="$TMPDIR/p5c" "$PROFILER3" 20 || RegisterFailure
+CPUPROFILE="$TMPDIR/p5b" "$PROFILER3" 30 || RegisterFailure
+CPUPROFILE="$TMPDIR/p5c" "$PROFILER3" 60 || RegisterFailure
VerifySimilar p5b "$PROFILER3_REALNAME" p5c "$PROFILER3_REALNAME" 2
# Now try what happens when we use threads
-"$PROFILER3" 5 2 "$TMPDIR/p7" || RegisterFailure
-"$PROFILER3" 10 2 "$TMPDIR/p8" || RegisterFailure
+"$PROFILER3" 30 2 "$TMPDIR/p7" || RegisterFailure
+"$PROFILER3" 60 2 "$TMPDIR/p8" || RegisterFailure
VerifySimilar p7 "$PROFILER3_REALNAME" p8 "$PROFILER3_REALNAME" 2
-"$PROFILER4" 5 2 "$TMPDIR/p9" || RegisterFailure
-"$PROFILER4" 10 2 "$TMPDIR/p10" || RegisterFailure
+"$PROFILER4" 30 2 "$TMPDIR/p9" || RegisterFailure
+"$PROFILER4" 60 2 "$TMPDIR/p10" || RegisterFailure
VerifySimilar p9 "$PROFILER4_REALNAME" p10 "$PROFILER4_REALNAME" 2
# More threads!
-"$PROFILER4" 2 3 "$TMPDIR/p9" || RegisterFailure
-"$PROFILER4" 4 3 "$TMPDIR/p10" || RegisterFailure
+"$PROFILER4" 25 3 "$TMPDIR/p9" || RegisterFailure
+"$PROFILER4" 50 3 "$TMPDIR/p10" || RegisterFailure
VerifySimilar p9 "$PROFILER4_REALNAME" p10 "$PROFILER4_REALNAME" 2
# Compare how much time the main thread takes compared to the other threads
# Recall the main thread runs twice as long as the other threads, by design.
-"$PROFILER4" 2 4 "$TMPDIR/p11" || RegisterFailure
+"$PROFILER4" 20 4 "$TMPDIR/p11" || RegisterFailure
VerifyAcrossThreads p11 "$PROFILER4_REALNAME" 2
# Test symbol save and restore
@@ -236,14 +235,14 @@ VerifyAcrossThreads p11 "$PROFILER4_REALNAME" 2
>"$TMPDIR/p13" 2>/dev/null || RegisterFailure
VerifyIdentical p12 "$PROFILER1_REALNAME" p13 "" || RegisterFailure
-"$PROFILER3" 5 2 "$TMPDIR/p14" || RegisterFailure
+"$PROFILER3" 30 2 "$TMPDIR/p14" || RegisterFailure
"$PPROF" $PPROF_FLAGS "$PROFILER3_REALNAME" "$TMPDIR/p14" --raw \
>"$TMPDIR/p15" 2>/dev/null || RegisterFailure
VerifyIdentical p14 "$PROFILER3_REALNAME" p15 "" || RegisterFailure
# Test using ITIMER_REAL instead of ITIMER_PROF.
-env CPUPROFILE_REALTIME=1 "$PROFILER3" 5 2 "$TMPDIR/p16" || RegisterFailure
-env CPUPROFILE_REALTIME=1 "$PROFILER3" 10 2 "$TMPDIR/p17" || RegisterFailure
+env CPUPROFILE_REALTIME=1 "$PROFILER3" 30 2 "$TMPDIR/p16" || RegisterFailure
+env CPUPROFILE_REALTIME=1 "$PROFILER3" 60 2 "$TMPDIR/p17" || RegisterFailure
VerifySimilar p16 "$PROFILER3_REALNAME" p17 "$PROFILER3_REALNAME" 2
diff --git a/third_party/tcmalloc/chromium/src/tests/tcmalloc_unittest.cc b/third_party/tcmalloc/chromium/src/tests/tcmalloc_unittest.cc
index 25bfd6a..6b2ec26 100644
--- a/third_party/tcmalloc/chromium/src/tests/tcmalloc_unittest.cc
+++ b/third_party/tcmalloc/chromium/src/tests/tcmalloc_unittest.cc
@@ -977,7 +977,7 @@ static int RunAllTests(int argc, char** argv) {
}
// This code stresses some of the memory allocation via STL.
- // In particular, it calls operator delete(void*, nothrow_t).
+ // It may call operator delete(void*, nothrow_t).
fprintf(LOGSTREAM, "Testing STL use\n");
{
std::vector<int> v;
diff --git a/third_party/tcmalloc/chromium/src/third_party/valgrind.h b/third_party/tcmalloc/chromium/src/third_party/valgrind.h
new file mode 100644
index 0000000..577c59a
--- /dev/null
+++ b/third_party/tcmalloc/chromium/src/third_party/valgrind.h
@@ -0,0 +1,3924 @@
+/* -*- c -*-
+ ----------------------------------------------------------------
+
+ Notice that the following BSD-style license applies to this one
+ file (valgrind.h) only. The rest of Valgrind is licensed under the
+ terms of the GNU General Public License, version 2, unless
+ otherwise indicated. See the COPYING file in the source
+ distribution for details.
+
+ ----------------------------------------------------------------
+
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2000-2008 Julian Seward. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. The origin of this software must not be misrepresented; you must
+ not claim that you wrote the original software. If you use this
+ software in a product, an acknowledgment in the product
+ documentation would be appreciated but is not required.
+
+ 3. Altered source versions must be plainly marked as such, and must
+ not be misrepresented as being the original software.
+
+ 4. The name of the author may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ----------------------------------------------------------------
+
+ Notice that the above BSD-style license applies to this one file
+ (valgrind.h) only. The entire rest of Valgrind is licensed under
+ the terms of the GNU General Public License, version 2. See the
+ COPYING file in the source distribution for details.
+
+ ----------------------------------------------------------------
+*/
+
+
+/* This file is for inclusion into client (your!) code.
+
+ You can use these macros to manipulate and query Valgrind's
+ execution inside your own programs.
+
+ The resulting executables will still run without Valgrind, just a
+ little more slowly, but are otherwise unchanged. When not running
+ on Valgrind, each client request consumes very few (e.g. 7)
+ instructions, so the resulting performance loss is negligible unless
+ you plan to execute client requests millions of times per second.
+ Nevertheless, if that is still a problem, you can compile with the
+ NVALGRIND symbol defined (gcc -DNVALGRIND) so that client requests
+ are not even compiled in. */
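+
+/* For illustration only (the source file name "myprog.c" is
+ hypothetical; the only real knob here is the -DNVALGRIND switch
+ described above), the same client source builds either way:
+
+ gcc -o myprog myprog.c (client requests compiled in)
+ gcc -DNVALGRIND -o myprog myprog.c (client requests compiled out)
+*/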
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi. So
+ we can't use C++ style "//" comments nor the "asm" keyword (instead
+ use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is. Note
+ that in this file we're using the compiler's CPP symbols for
+ identifying architectures, which are different to the ones we use
+ within the rest of Valgrind. Note, __powerpc__ is active for both
+ 32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+ latter (on Linux, that is). */
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#if !defined(_AIX) && defined(__i386__)
+# define PLAT_x86_linux 1
+#elif !defined(_AIX) && defined(__x86_64__)
+# define PLAT_amd64_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && !defined(__powerpc64__)
+# define PLAT_ppc32_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && defined(__powerpc64__)
+# define PLAT_ppc64_linux 1
+#elif defined(_AIX) && defined(__64BIT__)
+# define PLAT_ppc64_aix5 1
+#elif defined(_AIX) && !defined(__64BIT__)
+# define PLAT_ppc32_aix5 1
+#endif
+
+
+/* If we're not compiling for our target platform, don't generate
+ any inline asms. */
+#if !defined(PLAT_x86_linux) && !defined(PLAT_amd64_linux) \
+ && !defined(PLAT_ppc32_linux) && !defined(PLAT_ppc64_linux) \
+ && !defined(PLAT_ppc32_aix5) && !defined(PLAT_ppc64_aix5)
+# if !defined(NVALGRIND)
+# define NVALGRIND 1
+# endif
+#endif
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */
+/* in here of use to end-users -- skip to the next section. */
+/* ------------------------------------------------------------------ */
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+ from the compiled code (analogous to NDEBUG's effects on
+ assert()) */
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ { \
+ (_zzq_rlval) = (_zzq_default); \
+ }
+
+#else /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+ spots and handles magically. Don't look too closely at them as
+ they will rot your brain.
+
+ The assembly code sequences for all architectures are in this one
+ file. This is because this file must be stand-alone, and we don't
+ want to have multiple files.
+
+ For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+ value gets put in the return slot, so that everything works when
+ this is executed outside Valgrind. Args are passed in a memory
+ block, and so there's no intrinsic limit to the number that could
+ be passed, but it's currently five.
+
+ The macro args are:
+ _zzq_rlval result lvalue
+ _zzq_default default value (result returned when running on real CPU)
+ _zzq_request request code
+ _zzq_arg1..5 request params
+
+ The other two macros are used to support function wrapping, and are
+ a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the
+ guest's NRADDR pseudo-register and whatever other information is
+ needed to safely run the original call from the wrapper: on
+ ppc64-linux, the R2 value at the divert point is also needed. This
+ information is abstracted into a user-visible type, OrigFn.
+
+ VALGRIND_CALL_NOREDIR_* behaves the same as the following on the
+ guest, but guarantees that the branch instruction will not be
+ redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64:
+ branch-and-link-to-r11. VALGRIND_CALL_NOREDIR is just text, not a
+ complete inline asm, since it needs to be combined with more magic
+ inline asm stuff to be useful.
+*/
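+
+/* As an illustrative sketch only -- the request code 0x1234 and the
+ variable "res" are invented for this example, not defined by this
+ header -- a call site looks like this, with "res" as _zzq_rlval,
+ the first 0 as _zzq_default, 0x1234 as _zzq_request, and the
+ trailing zeros as _zzq_arg1..5:
+
+ unsigned int res;
+ VALGRIND_DO_CLIENT_REQUEST(res, 0, 0x1234, 0, 0, 0, 0, 0);
+*/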
+
+/* ------------------------- x86-linux ------------------------- */
+
+#if defined(PLAT_x86_linux)
+
+typedef
+ struct {
+ unsigned int nraddr; /* where's the code? */
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "roll $3, %%edi ; roll $13, %%edi\n\t" \
+ "roll $29, %%edi ; roll $19, %%edi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ { volatile unsigned int _zzq_args[6]; \
+ volatile unsigned int _zzq_result; \
+ _zzq_args[0] = (unsigned int)(_zzq_request); \
+ _zzq_args[1] = (unsigned int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned int)(_zzq_arg5); \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %EDX = client_request ( %EAX ) */ \
+ "xchgl %%ebx,%%ebx" \
+ : "=d" (_zzq_result) \
+ : "a" (&_zzq_args[0]), "0" (_zzq_default) \
+ : "cc", "memory" \
+ ); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ volatile unsigned int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %EAX = guest_NRADDR */ \
+ "xchgl %%ecx,%%ecx" \
+ : "=a" (__addr) \
+ : \
+ : "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ }
+
+#define VALGRIND_CALL_NOREDIR_EAX \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* call-noredir *%EAX */ \
+ "xchgl %%edx,%%edx\n\t"
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+typedef
+ struct {
+ unsigned long long int nraddr; /* where's the code? */
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \
+ "rolq $61, %%rdi ; rolq $51, %%rdi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ { volatile unsigned long long int _zzq_args[6]; \
+ volatile unsigned long long int _zzq_result; \
+ _zzq_args[0] = (unsigned long long int)(_zzq_request); \
+ _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %RDX = client_request ( %RAX ) */ \
+ "xchgq %%rbx,%%rbx" \
+ : "=d" (_zzq_result) \
+ : "a" (&_zzq_args[0]), "0" (_zzq_default) \
+ : "cc", "memory" \
+ ); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ volatile unsigned long long int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %RAX = guest_NRADDR */ \
+ "xchgq %%rcx,%%rcx" \
+ : "=a" (__addr) \
+ : \
+ : "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ }
+
+#define VALGRIND_CALL_NOREDIR_RAX \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* call-noredir *%RAX */ \
+ "xchgq %%rdx,%%rdx\n\t"
+#endif /* PLAT_amd64_linux */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+typedef
+ struct {
+ unsigned int nraddr; /* where's the code? */
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \
+ "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ \
+ { unsigned int _zzq_args[6]; \
+ unsigned int _zzq_result; \
+ unsigned int* _zzq_ptr; \
+ _zzq_args[0] = (unsigned int)(_zzq_request); \
+ _zzq_args[1] = (unsigned int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned int)(_zzq_arg5); \
+ _zzq_ptr = _zzq_args; \
+ __asm__ volatile("mr 3,%1\n\t" /*default*/ \
+ "mr 4,%2\n\t" /*ptr*/ \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = client_request ( %R4 ) */ \
+ "or 1,1,1\n\t" \
+ "mr %0,3" /*result*/ \
+ : "=b" (_zzq_result) \
+ : "b" (_zzq_default), "b" (_zzq_ptr) \
+ : "cc", "memory", "r3", "r4"); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ unsigned int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR */ \
+ "or 2,2,2\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "cc", "memory", "r3" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* branch-and-link-to-noredir *%R11 */ \
+ "or 3,3,3\n\t"
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+typedef
+ struct {
+ unsigned long long int nraddr; /* where's the code? */
+ unsigned long long int r2; /* what tocptr do we need? */
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \
+ "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ \
+ { unsigned long long int _zzq_args[6]; \
+ register unsigned long long int _zzq_result __asm__("r3"); \
+ register unsigned long long int* _zzq_ptr __asm__("r4"); \
+ _zzq_args[0] = (unsigned long long int)(_zzq_request); \
+ _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \
+ _zzq_ptr = _zzq_args; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = client_request ( %R4 ) */ \
+ "or 1,1,1" \
+ : "=r" (_zzq_result) \
+ : "0" (_zzq_default), "r" (_zzq_ptr) \
+ : "cc", "memory"); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ register unsigned long long int __addr __asm__("r3"); \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR */ \
+ "or 2,2,2" \
+ : "=r" (__addr) \
+ : \
+ : "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR_GPR2 */ \
+ "or 4,4,4" \
+ : "=r" (__addr) \
+ : \
+ : "cc", "memory" \
+ ); \
+ _zzq_orig->r2 = __addr; \
+ }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* branch-and-link-to-noredir *%R11 */ \
+ "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------ ppc32-aix5 ------------------------- */
+
+#if defined(PLAT_ppc32_aix5)
+
+typedef
+ struct {
+ unsigned int nraddr; /* where's the code? */
+ unsigned int r2; /* what tocptr do we need? */
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \
+ "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ \
+ { unsigned int _zzq_args[7]; \
+ register unsigned int _zzq_result; \
+ register unsigned int* _zzq_ptr; \
+ _zzq_args[0] = (unsigned int)(_zzq_request); \
+ _zzq_args[1] = (unsigned int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned int)(_zzq_arg5); \
+ _zzq_args[6] = (unsigned int)(_zzq_default); \
+ _zzq_ptr = _zzq_args; \
+ __asm__ volatile("mr 4,%1\n\t" \
+ "lwz 3, 24(4)\n\t" \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = client_request ( %R4 ) */ \
+ "or 1,1,1\n\t" \
+ "mr %0,3" \
+ : "=b" (_zzq_result) \
+ : "b" (_zzq_ptr) \
+ : "r3", "r4", "cc", "memory"); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ register unsigned int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR */ \
+ "or 2,2,2\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR_GPR2 */ \
+ "or 4,4,4\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->r2 = __addr; \
+ }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* branch-and-link-to-noredir *%R11 */ \
+ "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc32_aix5 */
+
+/* ------------------------ ppc64-aix5 ------------------------- */
+
+#if defined(PLAT_ppc64_aix5)
+
+typedef
+ struct {
+ unsigned long long int nraddr; /* where's the code? */
+ unsigned long long int r2; /* what tocptr do we need? */
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \
+ "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ \
+ { unsigned long long int _zzq_args[7]; \
+ register unsigned long long int _zzq_result; \
+ register unsigned long long int* _zzq_ptr; \
+ _zzq_args[0] = (unsigned long long int)(_zzq_request); \
+ _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \
+ _zzq_args[6] = (unsigned long long int)(_zzq_default); \
+ _zzq_ptr = _zzq_args; \
+ __asm__ volatile("mr 4,%1\n\t" \
+ "ld 3, 48(4)\n\t" \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = client_request ( %R4 ) */ \
+ "or 1,1,1\n\t" \
+ "mr %0,3" \
+ : "=b" (_zzq_result) \
+ : "b" (_zzq_ptr) \
+ : "r3", "r4", "cc", "memory"); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ register unsigned long long int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR */ \
+ "or 2,2,2\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR_GPR2 */ \
+ "or 4,4,4\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->r2 = __addr; \
+ }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* branch-and-link-to-noredir *%R11 */ \
+ "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */
+/* ugly. It's the least-worst tradeoff I can think of. */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a. appalling-hack) macros for
+ making guaranteed-no-redirection calls, so as to get from function
+ wrappers to the functions they are wrapping. The whole point is to
+ construct standard call sequences, but to do the call itself with a
+ special no-redirect call pseudo-instruction that the JIT
+ understands and handles specially. This section is long and
+ repetitious, and I can't see a way to make it shorter.
+
+ The naming scheme is as follows:
+
+ CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+ 'W' stands for "word" and 'v' for "void". Hence there are
+ different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+ and for each, the possibility of returning a word-typed result, or
+ no result.
+*/
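+
+/* For instance (a sketch; "fn" names a hypothetical target): a
+ function shaped like "long fn(long, long)" is called through
+ CALL_FN_W_WW, while "void fn(long)" goes through CALL_FN_v_W.
+*/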
+
+/* Use these to write the name of your wrapper. NOTE: duplicates
+ VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \
+ _vgwZU_##soname##_##fnname
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \
+ _vgwZZ_##soname##_##fnname
+
+/* Use this macro from within a wrapper function to collect the
+ context (address and possibly other info) of the original function.
+ Once you have that you can then use it in one of the CALL_FN_
+ macros. The type of the argument _lval is OrigFn. */
+#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval)
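+
+/* A minimal wrapper sketch. The library "libfoo.so", the function
+ "int foo(int)" and the doubling of the result are hypothetical;
+ the real pieces are I_WRAP_SONAME_FNNAME_ZU (with '.' Z-encoded
+ as "Zd"), VALGRIND_GET_ORIG_FN and CALL_FN_W_W from this file:
+
+ int I_WRAP_SONAME_FNNAME_ZU(libfooZdso, foo) ( int x )
+ {
+    int    result;
+    OrigFn fn;
+    VALGRIND_GET_ORIG_FN(fn);
+    CALL_FN_W_W(result, fn, x);
+    return 2 * result;
+ }
+*/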
+
+/* Derivatives of the main macros below, for calling functions
+ returning void. */
+
+#define CALL_FN_v_v(fnptr) \
+ do { volatile unsigned long _junk; \
+ CALL_FN_W_v(_junk,fnptr); } while (0)
+
+#define CALL_FN_v_W(fnptr, arg1) \
+ do { volatile unsigned long _junk; \
+ CALL_FN_W_W(_junk,fnptr,arg1); } while (0)
+
+#define CALL_FN_v_WW(fnptr, arg1,arg2) \
+ do { volatile unsigned long _junk; \
+ CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0)
+
+#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \
+ do { volatile unsigned long _junk; \
+ CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0)
+
+/* ------------------------- x86-linux ------------------------- */
+
+#if defined(PLAT_x86_linux)
+
+/* These regs are trashed by the hidden call. No need to mention eax
+ as gcc can already see that; plus, mentioning it causes gcc to bomb. */
+#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx"
+
+/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned
+ long) == 4. */
+
+#define CALL_FN_W_v(lval, orig) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[1]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ __asm__ volatile( \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[2]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ __asm__ volatile( \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $4, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ __asm__ volatile( \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $8, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[4]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ __asm__ volatile( \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $12, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[5]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ __asm__ volatile( \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $16, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[6]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ __asm__ volatile( \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $20, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[7]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ __asm__ volatile( \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $24, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[8]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ __asm__ volatile( \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $28, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[9]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ __asm__ volatile( \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $32, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[10]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ __asm__ volatile( \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $36, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[11]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ __asm__ volatile( \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $40, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \
+ arg6,arg7,arg8,arg9,arg10, \
+ arg11) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[12]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ __asm__ volatile( \
+ "pushl 44(%%eax)\n\t" \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $44, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \
+ arg6,arg7,arg8,arg9,arg10, \
+ arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[13]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ _argvec[12] = (unsigned long)(arg12); \
+ __asm__ volatile( \
+ "pushl 48(%%eax)\n\t" \
+ "pushl 44(%%eax)\n\t" \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $48, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \
+ "rdi", "r8", "r9", "r10", "r11"
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+ long) == 8. */
+
+/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_
+ macros. In order not to trash the stack redzone, we need to drop
+ %rsp by 128 before the hidden call, and restore afterwards. The
+ nastiness is that it is only by luck that the stack still appears
+ to be unwindable during the hidden call - since then the behaviour
+ of any routine using this macro does not match what the CFI data
+ says. Sigh.
+
+ Why is this important? Imagine that a wrapper has a stack
+ allocated local, and passes to the hidden call, a pointer to it.
+ Because gcc does not know about the hidden call, it may allocate
+ that local in the redzone. Unfortunately the hidden call may then
+ trash it before it comes to use it. So we must step clear of the
+ redzone, for the duration of the hidden call, to make it safe.
+
+ Probably the same problem afflicts the other redzone-style ABIs too
+ (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is
+ self-describing (none of this CFI nonsense), so at least messing
+ with the stack pointer doesn't risk making the stack
+ non-unwindable. */
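+
+/* Sketch of that hazard (the wrapper-local "buf" and the call are
+ hypothetical): if a wrapper does
+
+ char buf[8];
+ CALL_FN_W_W(res, fn, (unsigned long)buf);
+
+ gcc may place "buf" in the 128-byte redzone below %rsp, and the
+ hidden call could overwrite it; dropping %rsp by 128 around the
+ call, as these macros do, steps clear of that. */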
+
+#define CALL_FN_W_v(lval, orig) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[1]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[2]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[4]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[5]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[6]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[7]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ "addq $128,%%rsp\n\t" \
+ VALGRIND_CALL_NOREDIR_RAX \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[8]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $8, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[9]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 64(%%rax)\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $16, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[10]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 72(%%rax)\n\t" \
+ "pushq 64(%%rax)\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $24, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[11]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 80(%%rax)\n\t" \
+ "pushq 72(%%rax)\n\t" \
+ "pushq 64(%%rax)\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $32, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[12]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 88(%%rax)\n\t" \
+ "pushq 80(%%rax)\n\t" \
+ "pushq 72(%%rax)\n\t" \
+ "pushq 64(%%rax)\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $40, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[13]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ _argvec[12] = (unsigned long)(arg12); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 96(%%rax)\n\t" \
+ "pushq 88(%%rax)\n\t" \
+ "pushq 80(%%rax)\n\t" \
+ "pushq 72(%%rax)\n\t" \
+ "pushq 64(%%rax)\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $48, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_amd64_linux */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+/* This is useful for finding out about the on-stack stuff:
+
+ extern int f9 ( int,int,int,int,int,int,int,int,int );
+ extern int f10 ( int,int,int,int,int,int,int,int,int,int );
+ extern int f11 ( int,int,int,int,int,int,int,int,int,int,int );
+ extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int );
+
+ int g9 ( void ) {
+ return f9(11,22,33,44,55,66,77,88,99);
+ }
+ int g10 ( void ) {
+ return f10(11,22,33,44,55,66,77,88,99,110);
+ }
+ int g11 ( void ) {
+ return f11(11,22,33,44,55,66,77,88,99,110,121);
+ }
+ int g12 ( void ) {
+ return f12(11,22,33,44,55,66,77,88,99,110,121,132);
+ }
+*/
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS \
+ "lr", "ctr", "xer", \
+ "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \
+ "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \
+ "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc32-linux,
+ sizeof(unsigned long) == 4. */
+
+#define CALL_FN_W_v(lval, orig) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[1]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[2]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[4]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[5]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[6]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[7]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[8]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ _argvec[7] = (unsigned long)arg7; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 9,28(11)\n\t" \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[9]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ _argvec[7] = (unsigned long)arg7; \
+ _argvec[8] = (unsigned long)arg8; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 9,28(11)\n\t" \
+ "lwz 10,32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[10]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ _argvec[7] = (unsigned long)arg7; \
+ _argvec[8] = (unsigned long)arg8; \
+ _argvec[9] = (unsigned long)arg9; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "addi 1,1,-16\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,8(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 9,28(11)\n\t" \
+ "lwz 10,32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "addi 1,1,16\n\t" \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[11]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ _argvec[7] = (unsigned long)arg7; \
+ _argvec[8] = (unsigned long)arg8; \
+ _argvec[9] = (unsigned long)arg9; \
+ _argvec[10] = (unsigned long)arg10; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "addi 1,1,-16\n\t" \
+ /* arg10 */ \
+ "lwz 3,40(11)\n\t" \
+ "stw 3,12(1)\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,8(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 9,28(11)\n\t" \
+ "lwz 10,32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "addi 1,1,16\n\t" \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[12]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ _argvec[7] = (unsigned long)arg7; \
+ _argvec[8] = (unsigned long)arg8; \
+ _argvec[9] = (unsigned long)arg9; \
+ _argvec[10] = (unsigned long)arg10; \
+ _argvec[11] = (unsigned long)arg11; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "addi 1,1,-32\n\t" \
+ /* arg11 */ \
+ "lwz 3,44(11)\n\t" \
+ "stw 3,16(1)\n\t" \
+ /* arg10 */ \
+ "lwz 3,40(11)\n\t" \
+ "stw 3,12(1)\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,8(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 9,28(11)\n\t" \
+ "lwz 10,32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "addi 1,1,32\n\t" \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[13]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)arg1; \
+ _argvec[2] = (unsigned long)arg2; \
+ _argvec[3] = (unsigned long)arg3; \
+ _argvec[4] = (unsigned long)arg4; \
+ _argvec[5] = (unsigned long)arg5; \
+ _argvec[6] = (unsigned long)arg6; \
+ _argvec[7] = (unsigned long)arg7; \
+ _argvec[8] = (unsigned long)arg8; \
+ _argvec[9] = (unsigned long)arg9; \
+ _argvec[10] = (unsigned long)arg10; \
+ _argvec[11] = (unsigned long)arg11; \
+ _argvec[12] = (unsigned long)arg12; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "addi 1,1,-32\n\t" \
+ /* arg12 */ \
+ "lwz 3,48(11)\n\t" \
+ "stw 3,20(1)\n\t" \
+ /* arg11 */ \
+ "lwz 3,44(11)\n\t" \
+ "stw 3,16(1)\n\t" \
+ /* arg10 */ \
+ "lwz 3,40(11)\n\t" \
+ "stw 3,12(1)\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,8(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3,4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4,8(11)\n\t" \
+ "lwz 5,12(11)\n\t" \
+ "lwz 6,16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7,20(11)\n\t" \
+ "lwz 8,24(11)\n\t" \
+ "lwz 9,28(11)\n\t" \
+ "lwz 10,32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11,0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "addi 1,1,32\n\t" \
+ "mr %0,3" \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_ppc32_linux */
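These CALL_FN_W_* macros are the back end of the function-wrapping scheme this header builds: a wrapper fetches the original (non-redirected) entry point with VALGRIND_GET_ORIG_FN and then dispatches through the CALL_FN_ variant matching the callee's arity, which performs the hidden call spelled out in the asm above. A minimal sketch in the shape Valgrind's wrapping documentation uses; `foo`, the soname NONE, and the two int arguments are placeholders, and the I_WRAP_SONAME_FNNAME_ZU / OrigFn machinery is assumed from earlier in this header:

   #include "valgrind.h"

   /* Wrap foo() from any shared object (soname NONE). */
   int I_WRAP_SONAME_FNNAME_ZU(NONE, foo)(int x, int y)
   {
      int    r;
      OrigFn fn;
      VALGRIND_GET_ORIG_FN(fn);    /* fetch the original fn's entry point */
      CALL_FN_W_WW(r, fn, x, y);   /* call the real foo without redirection */
      return r;
   }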
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest in the stack parameter save area) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS \
+ "lr", "ctr", "xer", \
+ "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \
+ "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \
+ "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned
+ long) == 8. */
+
+#define CALL_FN_W_v(lval, orig) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+0]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+1]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+2]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+3]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+4]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+5]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+6]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+7]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+8]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)" /* restore tocptr */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+9]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "addi 1,1,-128\n\t" /* expand stack frame */ \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ "addi 1,1,128" /* restore frame */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+10]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "addi 1,1,-128\n\t" /* expand stack frame */ \
+ /* arg10 */ \
+ "ld 3,80(11)\n\t" \
+ "std 3,120(1)\n\t" \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ "addi 1,1,128" /* restore frame */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+11]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ _argvec[2+11] = (unsigned long)arg11; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "addi 1,1,-144\n\t" /* expand stack frame */ \
+ /* arg11 */ \
+ "ld 3,88(11)\n\t" \
+ "std 3,128(1)\n\t" \
+ /* arg10 */ \
+ "ld 3,80(11)\n\t" \
+ "std 3,120(1)\n\t" \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ "addi 1,1,144" /* restore frame */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+12]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ _argvec[2+11] = (unsigned long)arg11; \
+ _argvec[2+12] = (unsigned long)arg12; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "addi 1,1,-144\n\t" /* expand stack frame */ \
+ /* arg12 */ \
+ "ld 3,96(11)\n\t" \
+ "std 3,136(1)\n\t" \
+ /* arg11 */ \
+ "ld 3,88(11)\n\t" \
+ "std 3,128(1)\n\t" \
+ /* arg10 */ \
+ "ld 3,80(11)\n\t" \
+ "std 3,120(1)\n\t" \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ "addi 1,1,144" /* restore frame */ \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_ppc64_linux */
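The negative offsets above work because the asm is handed r11 == &_argvec[2]: the wrapped function's TOC pointer (_orig.r2) sits at -8(11), the slot used to park the caller's TOC ("std 2,-16(11)") at -16(11), and the arguments at 8, 16, and so on. A small host-side sketch checking just that pointer arithmetic (the variable names are mine; it assumes an LP64 host so that sizeof(unsigned long) == 8, exactly as the macros themselves do):

   #include <stdio.h>

   int main(void)
   {
      unsigned long _argvec[3+2];           /* as in CALL_FN_W_WW */
      char *r11 = (char *)&_argvec[2];      /* what "mr 11,%1" receives */
      printf("%ld\n", (long)((char *)&_argvec[0] - r11)); /* -16: saved caller r2 */
      printf("%ld\n", (long)((char *)&_argvec[1] - r11)); /*  -8: _orig.r2 */
      printf("%ld\n", (long)((char *)&_argvec[3] - r11)); /*   8: arg1 */
      printf("%ld\n", (long)((char *)&_argvec[4] - r11)); /*  16: arg2 */
      return 0;
   }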
+
+/* ------------------------ ppc32-aix5 ------------------------- */
+
+#if defined(PLAT_ppc32_aix5)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest in the stack parameter save area) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS \
+ "lr", "ctr", "xer", \
+ "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \
+ "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \
+ "r11", "r12", "r13"
+
+/* Expand the stack frame, copying enough info that unwinding
+ still works. Trashes r3. */
+
+#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \
+ "addi 1,1,-" #_n_fr "\n\t" \
+ "lwz 3," #_n_fr "(1)\n\t" \
+ "stw 3,0(1)\n\t"
+
+#define VG_CONTRACT_FRAME_BY(_n_fr) \
+ "addi 1,1," #_n_fr "\n\t"
+
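VG_EXPAND_FRAME_BY_trashes_r3 relies on two plain C mechanisms: #_n_fr stringizes the byte count, and adjacent string literals concatenate into one asm template; the lwz/stw pair then copies the old back-chain word to the new frame base so stack unwinding keeps working across the hidden call. A stand-alone sketch of just the preprocessor part (the macro name and the printing are illustrative only):

   #include <stdio.h>

   #define EXPAND_FRAME_BY(_n_fr)     \
      "addi 1,1,-" #_n_fr "\n\t"      \
      "lwz 3," #_n_fr "(1)\n\t"       \
      "stw 3,0(1)\n\t"

   int main(void)
   {
      /* Prints the three instructions that
         VG_EXPAND_FRAME_BY_trashes_r3(512) splices into the template. */
      fputs(EXPAND_FRAME_BY(512), stdout);
      return 0;
   }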
+/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned
+ long) == 4. */
+
+#define CALL_FN_W_v(lval, orig) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+0]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+1]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+2]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+3]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+4]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+5]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+6]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+7]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 9, 28(11)\n\t" /* arg7->r9 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+8]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 9, 28(11)\n\t" /* arg7->r9 */ \
+ "lwz 10, 32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+9]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(64) \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,56(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 9, 28(11)\n\t" /* arg7->r9 */ \
+ "lwz 10, 32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(64) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+10]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(64) \
+ /* arg10 */ \
+ "lwz 3,40(11)\n\t" \
+ "stw 3,60(1)\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,56(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 9, 28(11)\n\t" /* arg7->r9 */ \
+ "lwz 10, 32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(64) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+11]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ _argvec[2+11] = (unsigned long)arg11; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(72) \
+ /* arg11 */ \
+ "lwz 3,44(11)\n\t" \
+ "stw 3,64(1)\n\t" \
+ /* arg10 */ \
+ "lwz 3,40(11)\n\t" \
+ "stw 3,60(1)\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,56(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 9, 28(11)\n\t" /* arg7->r9 */ \
+ "lwz 10, 32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(72) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+12]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ _argvec[2+11] = (unsigned long)arg11; \
+ _argvec[2+12] = (unsigned long)arg12; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "stw 2,-8(11)\n\t" /* save tocptr */ \
+ "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(72) \
+ /* arg12 */ \
+ "lwz 3,48(11)\n\t" \
+ "stw 3,68(1)\n\t" \
+ /* arg11 */ \
+ "lwz 3,44(11)\n\t" \
+ "stw 3,64(1)\n\t" \
+ /* arg10 */ \
+ "lwz 3,40(11)\n\t" \
+ "stw 3,60(1)\n\t" \
+ /* arg9 */ \
+ "lwz 3,36(11)\n\t" \
+ "stw 3,56(1)\n\t" \
+ /* args1-8 */ \
+ "lwz 3, 4(11)\n\t" /* arg1->r3 */ \
+ "lwz 4, 8(11)\n\t" /* arg2->r4 */ \
+ "lwz 5, 12(11)\n\t" /* arg3->r5 */ \
+ "lwz 6, 16(11)\n\t" /* arg4->r6 */ \
+ "lwz 7, 20(11)\n\t" /* arg5->r7 */ \
+ "lwz 8, 24(11)\n\t" /* arg6->r8 */ \
+ "lwz 9, 28(11)\n\t" /* arg7->r9 */ \
+ "lwz 10, 32(11)\n\t" /* arg8->r10 */ \
+ "lwz 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "lwz 2,-8(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(72) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_ppc32_aix5 */
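Comparing the four variants, the slot chosen for arg9 (the first stack-passed argument) is the linkage area plus, where the ABI reserves them, eight register-argument home slots: 8(1) on ppc32-linux (SVR4 reserves no homes), 56(1) on ppc32-aix5 (24 + 8*4), and 112(1) on both 64-bit flavours (48 + 8*8). A small check of that arithmetic; the table values are my reading of the offsets hard-coded in the stw/std lines above and in the ppc64-aix5 macros below:

   #include <assert.h>

   struct abi { const char *plat; int linkage, regslots, wordsz, arg9; };

   int main(void)
   {
      const struct abi t[] = {
         { "ppc32-linux", 8, 0, 4, 8 },   /* SVR4: no reserved homes */
         { "ppc32-aix5", 24, 8, 4, 56 },
         { "ppc64-linux", 48, 8, 8, 112 },
         { "ppc64-aix5", 48, 8, 8, 112 },
      };
      for (unsigned i = 0; i < sizeof t / sizeof t[0]; i++)
         assert(t[i].linkage + t[i].regslots * t[i].wordsz == t[i].arg9);
      return 0;
   }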
+
+/* ------------------------ ppc64-aix5 ------------------------- */
+
+#if defined(PLAT_ppc64_aix5)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest in the stack parameter save area) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS \
+ "lr", "ctr", "xer", \
+ "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \
+ "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \
+ "r11", "r12", "r13"
+
+/* Expand the stack frame, copying enough info that unwinding
+ still works. Trashes r3. */
+
+#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \
+ "addi 1,1,-" #_n_fr "\n\t" \
+ "ld 3," #_n_fr "(1)\n\t" \
+ "std 3,0(1)\n\t"
+
+#define VG_CONTRACT_FRAME_BY(_n_fr) \
+ "addi 1,1," #_n_fr "\n\t"
+
+/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned
+ long) == 8. */
+
+#define CALL_FN_W_v(lval, orig) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+0]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+1]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+2]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+3]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+4]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+5]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+6]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+7]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+8]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+9]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(128) \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(128) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+10]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(128) \
+ /* arg10 */ \
+ "ld 3,80(11)\n\t" \
+ "std 3,120(1)\n\t" \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(128) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+11]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ _argvec[2+11] = (unsigned long)arg11; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(144) \
+ /* arg11 */ \
+ "ld 3,88(11)\n\t" \
+ "std 3,128(1)\n\t" \
+ /* arg10 */ \
+ "ld 3,80(11)\n\t" \
+ "std 3,120(1)\n\t" \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(144) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8,arg9,arg10,arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[3+12]; \
+ volatile unsigned long _res; \
+ /* _argvec[0] holds current r2 across the call */ \
+ _argvec[1] = (unsigned long)_orig.r2; \
+ _argvec[2] = (unsigned long)_orig.nraddr; \
+ _argvec[2+1] = (unsigned long)arg1; \
+ _argvec[2+2] = (unsigned long)arg2; \
+ _argvec[2+3] = (unsigned long)arg3; \
+ _argvec[2+4] = (unsigned long)arg4; \
+ _argvec[2+5] = (unsigned long)arg5; \
+ _argvec[2+6] = (unsigned long)arg6; \
+ _argvec[2+7] = (unsigned long)arg7; \
+ _argvec[2+8] = (unsigned long)arg8; \
+ _argvec[2+9] = (unsigned long)arg9; \
+ _argvec[2+10] = (unsigned long)arg10; \
+ _argvec[2+11] = (unsigned long)arg11; \
+ _argvec[2+12] = (unsigned long)arg12; \
+ __asm__ volatile( \
+ "mr 11,%1\n\t" \
+ VG_EXPAND_FRAME_BY_trashes_r3(512) \
+ "std 2,-16(11)\n\t" /* save tocptr */ \
+ "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \
+ VG_EXPAND_FRAME_BY_trashes_r3(144) \
+ /* arg12 */ \
+ "ld 3,96(11)\n\t" \
+ "std 3,136(1)\n\t" \
+ /* arg11 */ \
+ "ld 3,88(11)\n\t" \
+ "std 3,128(1)\n\t" \
+ /* arg10 */ \
+ "ld 3,80(11)\n\t" \
+ "std 3,120(1)\n\t" \
+ /* arg9 */ \
+ "ld 3,72(11)\n\t" \
+ "std 3,112(1)\n\t" \
+ /* args1-8 */ \
+ "ld 3, 8(11)\n\t" /* arg1->r3 */ \
+ "ld 4, 16(11)\n\t" /* arg2->r4 */ \
+ "ld 5, 24(11)\n\t" /* arg3->r5 */ \
+ "ld 6, 32(11)\n\t" /* arg4->r6 */ \
+ "ld 7, 40(11)\n\t" /* arg5->r7 */ \
+ "ld 8, 48(11)\n\t" /* arg6->r8 */ \
+ "ld 9, 56(11)\n\t" /* arg7->r9 */ \
+ "ld 10, 64(11)\n\t" /* arg8->r10 */ \
+ "ld 11, 0(11)\n\t" /* target->r11 */ \
+ VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ "mr 11,%1\n\t" \
+ "mr %0,3\n\t" \
+ "ld 2,-16(11)\n\t" /* restore tocptr */ \
+ VG_CONTRACT_FRAME_BY(144) \
+ VG_CONTRACT_FRAME_BY(512) \
+ : /*out*/ "=r" (_res) \
+ : /*in*/ "r" (&_argvec[2]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_ppc64_aix5 */
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */
+/* */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes. There are many more of these, but most are not
+ exposed to end-user view. These are the public ones, all of the
+ form 0x1000 + small_number.
+
+ Core ones are in the range 0x00000000--0x0000ffff. The non-public
+ ones start at 0x2000.
+*/
+
+/* These macros are used by tools -- they must be public, but don't
+ embed them into other programs. */
+#define VG_USERREQ_TOOL_BASE(a,b) \
+ ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16))
+#define VG_IS_TOOL_USERREQ(a, b, v) \
+ (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
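+
+/* Usage sketch (illustrative only; 'X','Y' stands for a hypothetical
+   tool code, not one defined here): a tool's request handler can test
+   whether a client-request value belongs to it like so:
+
+     static int is_my_tool_userreq(unsigned int v) {
+        return VG_IS_TOOL_USERREQ('X', 'Y', v);
+     }
+*/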
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
+ This enum comprises an ABI exported by Valgrind to programs
+ which use client requests. DO NOT CHANGE THE ORDER OF THESE
+ ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef
+ enum { VG_USERREQ__RUNNING_ON_VALGRIND = 0x1001,
+ VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+ /* These allow any function to be called from the simulated
+ CPU but run on the real CPU. Nb: the first arg passed to
+ the function is always the ThreadId of the running
+ thread! So CLIENT_CALL0 actually requires a 1 arg
+ function, etc. */
+ VG_USERREQ__CLIENT_CALL0 = 0x1101,
+ VG_USERREQ__CLIENT_CALL1 = 0x1102,
+ VG_USERREQ__CLIENT_CALL2 = 0x1103,
+ VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+ /* Can be useful in regression testing suites -- eg. can
+ send Valgrind's output to /dev/null and still count
+ errors. */
+ VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+ /* These are useful and can be interpreted by any tool that
+ tracks malloc() et al, by using vg_replace_malloc.c. */
+ VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+ VG_USERREQ__FREELIKE_BLOCK = 0x1302,
+ /* Memory pool support. */
+ VG_USERREQ__CREATE_MEMPOOL = 0x1303,
+ VG_USERREQ__DESTROY_MEMPOOL = 0x1304,
+ VG_USERREQ__MEMPOOL_ALLOC = 0x1305,
+ VG_USERREQ__MEMPOOL_FREE = 0x1306,
+ VG_USERREQ__MEMPOOL_TRIM = 0x1307,
+ VG_USERREQ__MOVE_MEMPOOL = 0x1308,
+ VG_USERREQ__MEMPOOL_CHANGE = 0x1309,
+ VG_USERREQ__MEMPOOL_EXISTS = 0x130a,
+
+             /* Allow printfs to the valgrind log. */
+ VG_USERREQ__PRINTF = 0x1401,
+ VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+
+ /* Stack support. */
+ VG_USERREQ__STACK_REGISTER = 0x1501,
+ VG_USERREQ__STACK_DEREGISTER = 0x1502,
+ VG_USERREQ__STACK_CHANGE = 0x1503
+ } Vg_ClientRequest;
+
+#if !defined(__GNUC__)
+# define __extension__ /* */
+#endif
+
+/* Returns the number of Valgrinds this code is running under. That
+ is, 0 if running natively, 1 if running under Valgrind, 2 if
+ running under Valgrind which is running under another Valgrind,
+ etc. */
+#define RUNNING_ON_VALGRIND __extension__ \
+ ({unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */, \
+ VG_USERREQ__RUNNING_ON_VALGRIND, \
+ 0, 0, 0, 0, 0); \
+ _qzz_res; \
+ })
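+
+/* Usage sketch (illustrative only): client code can branch on this
+   macro, e.g. to report the nesting depth it is running at:
+
+     #include <stdio.h>
+     int main(void) {
+        unsigned int depth = RUNNING_ON_VALGRIND;
+        if (depth > 0)
+           printf("under %u level(s) of Valgrind\n", depth);
+        return 0;
+     }
+*/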
+
+
+/* Discard translation of code in the range [_qzz_addr .. _qzz_addr +
+ _qzz_len - 1]. Useful if you are debugging a JITter or some such,
+ since it provides a way to make sure valgrind will retranslate the
+ invalidated area. Returns no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__DISCARD_TRANSLATIONS, \
+ _qzz_addr, _qzz_len, 0, 0, 0); \
+ }
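+
+/* Usage sketch (illustrative only; rewrite_code is a hypothetical JIT
+   step): a JITter that patches generated code in place should
+   invalidate the affected range immediately afterwards:
+
+     static void patch_and_flush(void* buf, unsigned long len) {
+        rewrite_code(buf, len);        // hypothetical: mutate the code
+        VALGRIND_DISCARD_TRANSLATIONS(buf, len);
+     }
+*/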
+
+
+/* These requests are for getting Valgrind itself to print something.
+ Possibly with a backtrace. This is a really ugly hack. */
+
+#if defined(NVALGRIND)
+
+# define VALGRIND_PRINTF(...)
+# define VALGRIND_PRINTF_BACKTRACE(...)
+
+#else /* NVALGRIND */
+
+/* Modern GCC will optimize out the static routine if it is unused,
+   and the "unused" attribute will suppress warnings about it. */
+static int VALGRIND_PRINTF(const char *format, ...)
+ __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF(const char *format, ...)
+{
+ unsigned long _qzz_res;
+ va_list vargs;
+ va_start(vargs, format);
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF,
+ (unsigned long)format, (unsigned long)vargs,
+ 0, 0, 0);
+ va_end(vargs);
+ return (int)_qzz_res;
+}
+
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+ __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+ unsigned long _qzz_res;
+ va_list vargs;
+ va_start(vargs, format);
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF_BACKTRACE,
+ (unsigned long)format, (unsigned long)vargs,
+ 0, 0, 0);
+ va_end(vargs);
+ return (int)_qzz_res;
+}
+
+#endif /* NVALGRIND */
+
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+
+ Note that the current ThreadId is inserted as the first argument.
+ So this call:
+
+ VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+ requires f to have this signature:
+
+ Word f(Word tid, Word arg1, Word arg2)
+
+ where "Word" is a word-sized type.
+
+ Note that these client requests are not entirely reliable. For example,
+ if you call a function with them that subsequently calls printf(),
+   there's a high chance Valgrind will crash.  Generally, these calls
+   are more likely to work if the called function does not refer to
+   any global variables and does not call any libc or other functions
+   (printf et al).  Any kind of entanglement with libc or dynamic
+   linking is likely to have a bad outcome, for tricky reasons which
+   we've grappled with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \
+ __extension__ \
+ ({unsigned long _qyy_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \
+ VG_USERREQ__CLIENT_CALL0, \
+ _qyy_fn, \
+ 0, 0, 0, 0); \
+ _qyy_res; \
+ })
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \
+ __extension__ \
+ ({unsigned long _qyy_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \
+ VG_USERREQ__CLIENT_CALL1, \
+ _qyy_fn, \
+ _qyy_arg1, 0, 0, 0); \
+ _qyy_res; \
+ })
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \
+ __extension__ \
+ ({unsigned long _qyy_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \
+ VG_USERREQ__CLIENT_CALL2, \
+ _qyy_fn, \
+ _qyy_arg1, _qyy_arg2, 0, 0); \
+ _qyy_res; \
+ })
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+ __extension__ \
+ ({unsigned long _qyy_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \
+ VG_USERREQ__CLIENT_CALL3, \
+ _qyy_fn, \
+ _qyy_arg1, _qyy_arg2, \
+ _qyy_arg3, 0); \
+ _qyy_res; \
+ })
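+
+/* Usage sketch (illustrative only): because the ThreadId is inserted
+   as the first argument, a 2-argument call needs a 3-parameter
+   function:
+
+     static long add_pair(long tid, long a, long b) {
+        (void)tid;                     // ThreadId supplied by Valgrind
+        return a + b;
+     }
+     // ... at the call site:
+     // long sum = VALGRIND_NON_SIMD_CALL2(add_pair, 40, 2);
+*/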
+
+
+/* Counts the number of errors that have been recorded by a tool. Nb:
+ the tool must record the errors with VG_(maybe_record_error)() or
+ VG_(unique_error)() for them to be counted. */
+#define VALGRIND_COUNT_ERRORS \
+ __extension__ \
+ ({unsigned int _qyy_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \
+ VG_USERREQ__COUNT_ERRORS, \
+ 0, 0, 0, 0, 0); \
+ _qyy_res; \
+ })
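+
+/* Usage sketch (illustrative only): a regression harness can send
+   Valgrind's output to /dev/null and still fail the run when errors
+   were recorded (needs <stdio.h>):
+
+     unsigned int errs = VALGRIND_COUNT_ERRORS;
+     if (errs != 0)
+        fprintf(stderr, "FAIL: %u error(s) recorded\n", errs);
+*/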
+
+/* Mark a block of memory as having been allocated by a malloc()-like
+ function. `addr' is the start of the usable block (ie. after any
+   redzone).  `rzB' is the redzone size if the allocator can apply
+   redzones; use '0' if not.  Adding redzones makes it more likely
+   Valgrind will spot block overruns.  `is_zeroed' indicates if the
+   memory is zeroed, as it is
+ for calloc(). Put it immediately after the point where a block is
+ allocated.
+
+   If you're using Memcheck, and you allocate memory via superblocks
+   and then hand out small chunks of each superblock without redzones
+   on the small blocks, it's worth marking the superblock with
+   VALGRIND_MAKE_MEM_NOACCESS when it's created, so that block overruns are
+   detected.  But if you can put redzones on, it's probably better not to do
+ this, so that messages for small overruns are described in terms of the
+ small block rather than the superblock (but if you have a big overrun
+ that skips over a redzone, you could miss an error this way). See
+ memcheck/tests/custom_alloc.c for an example.
+
+ WARNING: if your allocator uses malloc() or 'new' to allocate
+ superblocks, rather than mmap() or brk(), this will not work properly --
+ you'll likely get assertion failures during leak detection. This is
+ because Valgrind doesn't like seeing overlapping heap blocks. Sorry.
+
+ Nb: block must be freed via a free()-like function specified
+ with VALGRIND_FREELIKE_BLOCK or mismatch errors will occur. */
+#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MALLOCLIKE_BLOCK, \
+ addr, sizeB, rzB, is_zeroed, 0); \
+ }
+
+/* Mark a block of memory as having been freed by a free()-like function.
+ `rzB' is redzone size; it must match that given to
+ VALGRIND_MALLOCLIKE_BLOCK. Memory not freed will be detected by the leak
+ checker. Put it immediately after the point where the block is freed. */
+#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__FREELIKE_BLOCK, \
+ addr, rzB, 0, 0, 0); \
+ }
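+
+/* Usage sketch (illustrative only; pool_alloc/pool_free are
+   hypothetical): a custom allocator without redzones would annotate
+   its entry points like this, using matching rzB values:
+
+     void* my_alloc(size_t n) {
+        void* p = pool_alloc(n);      // hypothetical backing allocator
+        VALGRIND_MALLOCLIKE_BLOCK(p, n, 0, 0);   // rzB=0, is_zeroed=0
+        return p;
+     }
+     void my_free(void* p) {
+        VALGRIND_FREELIKE_BLOCK(p, 0);           // rzB must match (0)
+        pool_free(p);                 // hypothetical
+     }
+*/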
+
+/* Create a memory pool. */
+#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__CREATE_MEMPOOL, \
+ pool, rzB, is_zeroed, 0, 0); \
+ }
+
+/* Destroy a memory pool. */
+#define VALGRIND_DESTROY_MEMPOOL(pool) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__DESTROY_MEMPOOL, \
+ pool, 0, 0, 0, 0); \
+ }
+
+/* Associate a piece of memory with a memory pool. */
+#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MEMPOOL_ALLOC, \
+ pool, addr, size, 0, 0); \
+ }
+
+/* Disassociate a piece of memory from a memory pool. */
+#define VALGRIND_MEMPOOL_FREE(pool, addr) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MEMPOOL_FREE, \
+ pool, addr, 0, 0, 0); \
+ }
+
+/* Disassociate any pieces outside a particular range. */
+#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MEMPOOL_TRIM, \
+ pool, addr, size, 0, 0); \
+ }
+
+/* Tell the tool that the pool anchored at poolA has moved to poolB. */
+#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MOVE_MEMPOOL, \
+ poolA, poolB, 0, 0, 0); \
+ }
+
+/* Resize and/or move a piece associated with a memory pool. */
+#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MEMPOOL_CHANGE, \
+ pool, addrA, addrB, size, 0); \
+ }
+
+/* Return 1 if a mempool exists, else 0. */
+#define VALGRIND_MEMPOOL_EXISTS(pool) \
+ ({unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__MEMPOOL_EXISTS, \
+ pool, 0, 0, 0, 0); \
+ _qzz_res; \
+ })
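+
+/* Usage sketch (illustrative only): the pool handle is just a stable
+   address; a typical lifecycle over a static arena looks like:
+
+     static char arena[4096];
+     VALGRIND_CREATE_MEMPOOL(arena, 0, 0);       // rzB=0, is_zeroed=0
+     void* obj = arena + 64;                     // carve out a piece
+     VALGRIND_MEMPOOL_ALLOC(arena, obj, 128);    // obj now tracked
+     VALGRIND_MEMPOOL_FREE(arena, obj);
+     VALGRIND_DESTROY_MEMPOOL(arena);
+*/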
+
+/* Mark a piece of memory as being a stack. Returns a stack id. */
+#define VALGRIND_STACK_REGISTER(start, end) \
+ ({unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__STACK_REGISTER, \
+ start, end, 0, 0, 0); \
+ _qzz_res; \
+ })
+
+/* Unmark the piece of memory associated with a stack id as being a
+ stack. */
+#define VALGRIND_STACK_DEREGISTER(id) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__STACK_DEREGISTER, \
+ id, 0, 0, 0, 0); \
+ }
+
+/* Change the start and end address of the stack id. */
+#define VALGRIND_STACK_CHANGE(id, start, end) \
+ {unsigned int _qzz_res; \
+ VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \
+ VG_USERREQ__STACK_CHANGE, \
+ id, start, end, 0, 0); \
+ }
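+
+/* Usage sketch (illustrative only): registering a coroutine or fiber
+   stack so Valgrind can follow execution on it:
+
+     static char fiber_stack[64 * 1024];
+     unsigned int sid = VALGRIND_STACK_REGISTER(
+         fiber_stack, fiber_stack + sizeof(fiber_stack));
+     // ... run the fiber ...
+     VALGRIND_STACK_DEREGISTER(sid);
+*/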
+
+
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#endif /* __VALGRIND_H */
diff --git a/third_party/tcmalloc/chromium/src/thread_cache.h b/third_party/tcmalloc/chromium/src/thread_cache.h
index 4c6a233..1165447 100644
--- a/third_party/tcmalloc/chromium/src/thread_cache.h
+++ b/third_party/tcmalloc/chromium/src/thread_cache.h
@@ -79,7 +79,9 @@ class ThreadCache {
// Total byte size in cache
size_t Size() const { return size_; }
- void* Allocate(size_t size);
+  // Allocate an object of the given size and size class.  The size
+  // given must equal the byte size of class `cl' in the size map.
+ void* Allocate(size_t size, size_t cl);
void Deallocate(void* ptr, size_t size_class);
void Scavenge();
@@ -293,15 +295,18 @@ class ThreadCache {
// across all ThreadCaches. Protected by Static::pageheap_lock.
static ssize_t unclaimed_cache_space_;
- // Warning: the offset of list_ affects performance. On general
- // principles, we don't like list_[x] to span multiple L1 cache
- // lines. However, merely placing list_ at offset 0 here seems to
- // cause cache conflicts.
+ // This class is laid out with the most frequently used fields
+ // first so that hot elements are placed on the same cache line.
size_t size_; // Combined size of data
size_t max_size_; // size_ > max_size_ --> Scavenge()
- pthread_t tid_; // Which thread owns it
+
+ // We sample allocations, biased by the size of the allocation
+ Sampler sampler_; // A sampler
+
FreeList list_[kNumClasses]; // Array indexed by size-class
+
+ pthread_t tid_; // Which thread owns it
bool in_setspecific_; // In call to pthread_setspecific?
// Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
@@ -313,9 +318,10 @@ class ThreadCache {
static void DeleteCache(ThreadCache* heap);
static void RecomputePerThreadCacheSize();
- // We sample allocations, biased by the size of the allocation
- Sampler sampler_; // A sampler
-};
+ // Ensure that this class is cacheline-aligned. This is critical for
+ // performance, as false sharing would negate many of the benefits
+ // of a per-thread cache.
+} CACHELINE_ALIGNED;
// Allocator for thread heaps
// This is logically part of the ThreadCache class, but MSVC, at
@@ -331,15 +337,15 @@ inline bool ThreadCache::SampleAllocation(size_t k) {
return sampler_.SampleAllocation(k);
}
-inline void* ThreadCache::Allocate(size_t size) {
+inline void* ThreadCache::Allocate(size_t size, size_t cl) {
ASSERT(size <= kMaxSize);
- const size_t cl = Static::sizemap()->SizeClass(size);
- const size_t alloc_size = Static::sizemap()->ByteSizeForClass(cl);
+ ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));
+
FreeList* list = &list_[cl];
if (list->empty()) {
- return FetchFromCentralCache(cl, alloc_size);
+ return FetchFromCentralCache(cl, size);
}
- size_ -= alloc_size;
+ size_ -= size;
return list->Pop();
}
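
With this change the caller, not Allocate(), maps the request to a size
class.  A minimal call-site sketch (hedged: `requested' is a
placeholder, and ThreadCache::GetCache() is assumed to be the usual
per-thread accessor; other names are as in this diff):

    // Resolve the size class and its exact byte size once, up front,
    // then allocate from the per-thread cache with both values.
    const size_t cl = Static::sizemap()->SizeClass(requested);
    const size_t size = Static::sizemap()->ByteSizeForClass(cl);
    void* ptr = ThreadCache::GetCache()->Allocate(size, cl);

This lets hot callers that already know the size class skip the
SizeClass() lookup inside Allocate().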
diff --git a/third_party/tcmalloc/chromium/src/windows/addr2line-pdb.c b/third_party/tcmalloc/chromium/src/windows/addr2line-pdb.c
index 97b614b..5c65a03 100644
--- a/third_party/tcmalloc/chromium/src/windows/addr2line-pdb.c
+++ b/third_party/tcmalloc/chromium/src/windows/addr2line-pdb.c
@@ -48,6 +48,12 @@
#define SEARCH_CAP (1024*1024)
#define WEBSYM "SRV*c:\\websymbols*http://msdl.microsoft.com/download/symbols"
+void usage() {
+ fprintf(stderr, "usage: "
+ "addr2line-pdb [-f|--functions] [-C|--demangle] [-e filename]\n");
+ fprintf(stderr, "(Then list the hex addresses on stdin, one per line)\n");
+}
+
int main(int argc, char *argv[]) {
DWORD error;
HANDLE process;
@@ -74,10 +80,11 @@ int main(int argc, char *argv[]) {
}
filename = argv[i+1];
i++; /* to skip over filename too */
+ } else if (strcmp(argv[i], "--help") == 0) {
+ usage();
+ exit(0);
} else {
- fprintf(stderr, "usage: "
- "addr2line-pdb [-f|--functions] [-C|--demangle] [-e filename]\n");
- fprintf(stderr, "(Then list the hex addresses on stdin, one per line)\n");
+ usage();
exit(1);
}
}
diff --git a/third_party/tcmalloc/chromium/src/windows/config.h b/third_party/tcmalloc/chromium/src/windows/config.h
index 99de82c..b5d9bb6 100644
--- a/third_party/tcmalloc/chromium/src/windows/config.h
+++ b/third_party/tcmalloc/chromium/src/windows/config.h
@@ -261,10 +261,12 @@
// ---------------------------------------------------------------------
// Extra stuff not found in config.h.in
-// This must be defined before the windows.h is included. It's needed
-// for mutex.h, to give access to the TryLock method.
+// This must be defined before windows.h is included.  We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
#ifndef _WIN32_WINNT
-# define _WIN32_WINNT 0x0400
+# define _WIN32_WINNT 0x0501
#endif
// We want to make sure not to ever try to #include heap-checker.h
diff --git a/third_party/tcmalloc/chromium/src/windows/google/tcmalloc.h b/third_party/tcmalloc/chromium/src/windows/google/tcmalloc.h
index 4b97b15..663b7f9 100644
--- a/third_party/tcmalloc/chromium/src/windows/google/tcmalloc.h
+++ b/third_party/tcmalloc/chromium/src/windows/google/tcmalloc.h
@@ -61,7 +61,8 @@
#endif
#ifdef __cplusplus
-#include <new> // for nothrow_t
+#include <new> // for std::nothrow_t
+
extern "C" {
#endif
// Returns a human-readable version string. If major, minor,
@@ -92,16 +93,15 @@ extern "C" {
#ifdef __cplusplus
PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
PERFTOOLS_DLL_DECL void* tc_new(size_t size);
- PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
- PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
- PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
-
PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
const std::nothrow_t&) __THROW;
- PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
- const std::nothrow_t&) __THROW;
+ PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
const std::nothrow_t&) __THROW;
+ PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+ PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+ const std::nothrow_t&) __THROW;
+ PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
const std::nothrow_t&) __THROW;
}
diff --git a/third_party/tcmalloc/chromium/src/windows/nm-pdb.c b/third_party/tcmalloc/chromium/src/windows/nm-pdb.c
index 726d345..9beb21d 100644
--- a/third_party/tcmalloc/chromium/src/windows/nm-pdb.c
+++ b/third_party/tcmalloc/chromium/src/windows/nm-pdb.c
@@ -180,6 +180,10 @@ static void ShowSymbolInfo(HANDLE process, ULONG64 module_base) {
#endif
}
+void usage() {
+ fprintf(stderr, "usage: nm-pdb [-C|--demangle] <module or filename>\n");
+}
+
int main(int argc, char *argv[]) {
DWORD error;
HANDLE process;
@@ -195,12 +199,15 @@ int main(int argc, char *argv[]) {
for (i = 1; i < argc; i++) {
if (strcmp(argv[i], "--demangle") == 0 || strcmp(argv[i], "-C") == 0) {
symopts |= SYMOPT_UNDNAME;
+ } else if (strcmp(argv[i], "--help") == 0) {
+ usage();
+ exit(0);
} else {
break;
}
}
if (i != argc - 1) {
- fprintf(stderr, "usage: nm-pdb [-C|--demangle] <module or filename>\n");
+ usage();
exit(1);
}
filename = argv[i];
diff --git a/third_party/tcmalloc/chromium/src/windows/patch_functions.cc b/third_party/tcmalloc/chromium/src/windows/patch_functions.cc
index c1ed37f..deb841b 100644
--- a/third_party/tcmalloc/chromium/src/windows/patch_functions.cc
+++ b/third_party/tcmalloc/chromium/src/windows/patch_functions.cc
@@ -83,6 +83,7 @@
#endif
#include <windows.h>
+#include <stdio.h>
#include <malloc.h> // for _msize and _expand
#include <Psapi.h> // for EnumProcessModules, GetModuleInformation, etc.
#include <set>
@@ -96,8 +97,6 @@
// The maximum number of modules we allow to be in one executable
const int kMaxModules = 8182;
-// The maximum size of a module's basename
-const int kMaxModuleNameSize = 256;
// These are hard-coded, unfortunately. :-( They are also probably
// compiler specific. See get_mangled_names.cc, in this directory,
@@ -145,13 +144,15 @@ class LibcInfo {
LibcInfo() {
memset(this, 0, sizeof(*this)); // easiest way to initialize the array
}
- bool SameAs(const LibcInfo& that) const;
- bool SameAsModuleEntry(const ModuleEntryCopy& module_entry) const;
-
- bool patched() const { return is_valid() && module_name_[0] != '\0'; }
- const char* module_name() const { return is_valid() ? module_name_ : ""; }
+ bool patched() const { return is_valid(); }
void set_is_valid(bool b) { is_valid_ = b; }
+ // According to http://msdn.microsoft.com/en-us/library/ms684229(VS.85).aspx:
+ // "The load address of a module (lpBaseOfDll) is the same as the HMODULE
+ // value."
+ HMODULE hmodule() const {
+ return reinterpret_cast<HMODULE>(const_cast<void*>(module_base_address_));
+ }
// Populates all the windows_fn_[] vars based on our module info.
// Returns false if windows_fn_ is all NULL's, because there's
@@ -167,7 +168,6 @@ class LibcInfo {
memcpy(this->windows_fn_, that.windows_fn_, sizeof(windows_fn_));
this->module_base_address_ = that.module_base_address_;
this->module_base_size_ = that.module_base_size_;
- memcpy(this->module_name_, that.module_name_, sizeof(module_name_));
}
enum {
@@ -207,7 +207,6 @@ class LibcInfo {
const void *module_base_address_;
size_t module_base_size_;
- char module_name_[kMaxModuleNameSize];
public:
// These shouldn't have to be public, since only subclasses of
@@ -285,10 +284,8 @@ template<int> class LibcInfoWithPatchFunctions : public LibcInfo {
// This is a subset of MODULEENTRY32 that we need for patching.
struct ModuleEntryCopy {
- LPVOID modBaseAddr;
+ LPVOID modBaseAddr; // the same as hmodule
DWORD modBaseSize;
- HMODULE hModule;
- TCHAR szModule[kMaxModuleNameSize];
// This is not part of MODULEENTRY32, but is needed to avoid making
// windows syscalls while we're holding patch_all_modules_lock (see
// lock-inversion comments at patch_all_modules_lock definition, below).
@@ -297,26 +294,16 @@ struct ModuleEntryCopy {
ModuleEntryCopy() {
modBaseAddr = NULL;
modBaseSize = 0;
- hModule = NULL;
- strcpy(szModule, "<executable>");
for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
rgProcAddresses[i] = LibcInfo::static_fn(i);
}
- ModuleEntryCopy(HANDLE hprocess, HMODULE hmodule, const MODULEINFO& mi) {
+ ModuleEntryCopy(const MODULEINFO& mi) {
this->modBaseAddr = mi.lpBaseOfDll;
this->modBaseSize = mi.SizeOfImage;
- this->hModule = hmodule;
- // TODO(csilvers): we could make more efficient by calling these
- // lazily (not until the vars are needed, which is often never).
- // However, there's tricky business with calling windows functions
- // inside the patch_all_modules_lock (see the lock inversion
- // comments with the patch_all_modules_lock definition, below), so
- // it's safest to do it all here, where no lock is needed.
- ::GetModuleBaseNameA(hprocess, hmodule,
- this->szModule, sizeof(this->szModule));
for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
- rgProcAddresses[i] =
- (GenericFnPtr)::GetProcAddress(hModule, LibcInfo::function_name(i));
+ rgProcAddresses[i] = (GenericFnPtr)::GetProcAddress(
+ reinterpret_cast<const HMODULE>(mi.lpBaseOfDll),
+ LibcInfo::function_name(i));
}
};
@@ -479,18 +466,6 @@ const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = {
{ "FreeLibrary", NULL, NULL, (GenericFnPtr)&Perftools_FreeLibrary },
};
-bool LibcInfo::SameAs(const LibcInfo& that) const {
- return (is_valid() &&
- module_base_address_ == that.module_base_address_ &&
- module_base_size_ == that.module_base_size_);
-}
-
-bool LibcInfo::SameAsModuleEntry(const ModuleEntryCopy& module_entry) const {
- return (is_valid() &&
- module_base_address_ == module_entry.modBaseAddr &&
- module_base_size_ == module_entry.modBaseSize);
-}
-
bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) {
// First, store the location of the function to patch before
// patching it. If none of these functions are found in the module,
@@ -552,10 +527,9 @@ bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) {
CHECK(windows_fn_[kFree]);
CHECK(windows_fn_[kRealloc]);
- // OK, we successfully patched. Let's store our member information.
+ // OK, we successfully populated. Let's store our member information.
module_base_address_ = module_entry.modBaseAddr;
module_base_size_ = module_entry.modBaseSize;
- strcpy(module_name_, module_entry.szModule);
return true;
}
@@ -636,14 +610,6 @@ void WindowsInfo::Unpatch() {
// You should hold the patch_all_modules_lock when calling this.
void PatchOneModuleLocked(const LibcInfo& me_info) {
- // Double-check we haven't seen this module before.
- for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
- if (g_module_libcs[i]->SameAs(me_info)) {
- fprintf(stderr, "%s:%d: FATAL PERFTOOLS ERROR: %s double-patched somehow.\n",
- __FILE__, __LINE__, g_module_libcs[i]->module_name());
- CHECK(false);
- }
- }
// If we don't already have info on this module, let's add it. This
// is where we're sad that each libcX has a different type, so we
// can't use an array; instead, we have to use a switch statement.
@@ -686,52 +652,70 @@ void PatchMainExecutableLocked() {
// patch_all_modules_lock, inside PatchAllModules().
static SpinLock patch_all_modules_lock(SpinLock::LINKER_INITIALIZED);
+// g_last_loaded: The set of modules that were loaded the last time
+// PatchAllModules was called.  This is an optimization that lets us
+// look only at modules that were added or removed since the last call.
+static std::set<HMODULE> *g_last_loaded;
+
// Iterates over all the modules currently loaded by the executable,
-// and makes sure they're all patched. For ones that aren't, we patch
-// them in. We also check that every module we had patched in the
-// past is still loaded, and update internal data structures if so.
-// We return true if this PatchAllModules did any work, false else.
+// according to windows, and makes sure they're all patched.  Most
+// modules will already be in g_last_loaded, meaning we have already
+// seen them and either patched them or determined they did not need
+// to be patched.  Others will not, which means we need to patch them
+// (if necessary).  Finally, we have to go through the existing
+// g_module_libcs and see if any of those are *not* in the modules
+// currently loaded by the executable.  If so, we need to invalidate
+// them.  Returns true if we did any work (patching or invalidating),
+// false if we were a noop.  May update g_last_loaded as well.
+// NOTE: you must hold the patch_all_modules_lock to access g_last_loaded.
bool PatchAllModules() {
std::vector<ModuleEntryCopy> modules;
bool made_changes = false;
const HANDLE hCurrentProcess = GetCurrentProcess();
- MODULEINFO mi;
- DWORD cbNeeded = 0;
+ DWORD num_modules = 0;
HMODULE hModules[kMaxModules]; // max # of modules we support in one process
- if (::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
- &cbNeeded)) {
- for (int i = 0; i < cbNeeded / sizeof(*hModules); ++i) {
- if (i >= kMaxModules) {
- printf("PERFTOOLS ERROR: Too many modules in this executable to try"
- " to patch them all (if you need to, raise kMaxModules in"
- " patch_functions.cc).\n");
- break;
- }
- if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
- modules.push_back(ModuleEntryCopy(hCurrentProcess, hModules[i], mi));
- }
+ if (!::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
+ &num_modules)) {
+ num_modules = 0;
+ }
+  // EnumProcessModules actually sets the number of bytes written into
+  // hModules, so we divide to turn num_modules into a module count.
+ num_modules /= sizeof(*hModules);
+ if (num_modules >= kMaxModules) {
+ printf("PERFTOOLS ERROR: Too many modules in this executable to try"
+ " to patch them all (if you need to, raise kMaxModules in"
+ " patch_functions.cc).\n");
+ num_modules = kMaxModules;
}
- // Now do the actual patching and unpatching.
+ // Now we handle the unpatching of modules we have in g_module_libcs
+ // but that were not found in EnumProcessModules. We need to
+ // invalidate them. To speed that up, we store the EnumProcessModules
+ // output in a set.
+  // At the same time, we prepare for adding new modules by removing
+ // removing from hModules all the modules we know we've already
+ // patched (or decided don't need to be patched). At the end,
+ // hModules will hold only the modules that we need to consider patching.
+ std::set<HMODULE> currently_loaded_modules;
{
SpinLockHolder h(&patch_all_modules_lock);
- for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
- if (!g_module_libcs[i]->is_valid())
- continue;
- bool still_loaded = false;
- for (std::vector<ModuleEntryCopy>::iterator it = modules.begin();
- it != modules.end(); ++it) {
- if (g_module_libcs[i]->SameAsModuleEntry(*it)) {
- // Both g_module_libcs[i] and it are still valid. Mark it by
- // removing it from the vector; mark g_module_libcs[i] by
- // setting a bool.
- modules.erase(it);
- still_loaded = true;
- break;
- }
+ if (!g_last_loaded) g_last_loaded = new std::set<HMODULE>;
+ // At the end of this loop, currently_loaded_modules contains the
+ // full list of EnumProcessModules, and hModules just the ones we
+ // haven't handled yet.
+ for (int i = 0; i < num_modules; ) {
+ currently_loaded_modules.insert(hModules[i]);
+ if (g_last_loaded->count(hModules[i]) > 0) {
+ hModules[i] = hModules[--num_modules]; // replace element i with tail
+ } else {
+ i++; // keep element i
}
- if (!still_loaded) {
+ }
+ // Now we do the unpatching/invalidation.
+ for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
+ if (g_module_libcs[i]->patched() &&
+ currently_loaded_modules.count(g_module_libcs[i]->hmodule()) == 0) {
// Means g_module_libcs[i] is no longer loaded (no me32 matched).
// We could call Unpatch() here, but why bother? The module
// has gone away, so nobody is going to call into it anyway.
@@ -739,14 +723,28 @@ bool PatchAllModules() {
made_changes = true;
}
}
+ // Update the loaded module cache.
+ g_last_loaded->swap(currently_loaded_modules);
+ }
+
+ // Now that we know what modules are new, let's get the info we'll
+ // need to patch them. Note this *cannot* be done while holding the
+ // lock, since it needs to make windows calls (see the lock-inversion
+ // comments before the definition of patch_all_modules_lock).
+ MODULEINFO mi;
+ for (int i = 0; i < num_modules; i++) {
+ if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
+ modules.push_back(ModuleEntryCopy(mi));
+ }
- // We've handled all the g_module_libcs. Now let's handle the rest
- // of the module-entries: those that haven't already been loaded.
- for (std::vector<ModuleEntryCopy>::const_iterator it = modules.begin();
+ // Now we can do the patching of new modules.
+ {
+ SpinLockHolder h(&patch_all_modules_lock);
+ for (std::vector<ModuleEntryCopy>::iterator it = modules.begin();
it != modules.end(); ++it) {
LibcInfo libc_info;
if (libc_info.PopulateWindowsFn(*it)) { // true==module has libc routines
- PatchOneModuleLocked(libc_info); // updates num_patched_modules
+ PatchOneModuleLocked(libc_info);
made_changes = true;
}
}
@@ -759,6 +757,10 @@ bool PatchAllModules() {
made_changes = true;
}
}
+ // TODO(csilvers): for this to be reliable, we need to also take
+ // into account if we *would* have patched any modules had they not
+ // already been loaded. (That is, made_changes should ignore
+ // g_last_loaded.)
return made_changes;
}
@@ -766,59 +768,9 @@ bool PatchAllModules() {
} // end unnamed namespace
// ---------------------------------------------------------------------
-// PatchWindowsFunctions()
-// This is the function that is exposed to the outside world.
-// It should be called before the program becomes multi-threaded,
-// since main_executable_windows.Patch() is not thread-safe.
-// ---------------------------------------------------------------------
-
-void PatchWindowsFunctions() {
- // This does the libc patching in every module, and the main executable.
- PatchAllModules();
- main_executable_windows.Patch();
-}
-
-#if 0
-// It's possible to unpatch all the functions when we are exiting.
-
-// The idea is to handle properly windows-internal data that is
-// allocated before PatchWindowsFunctions is called. If all
-// destruction happened in reverse order from construction, then we
-// could call UnpatchWindowsFunctions at just the right time, so that
-// that early-allocated data would be freed using the windows
-// allocation functions rather than tcmalloc. The problem is that
-// windows allocates some structures lazily, so it would allocate them
-// late (using tcmalloc) and then try to deallocate them late as well.
-// So instead of unpatching, we just modify all the tcmalloc routines
-// so they call through to the libc rountines if the memory in
-// question doesn't seem to have been allocated with tcmalloc. I keep
-// this unpatch code around for reference.
-
-void UnpatchWindowsFunctions() {
- // We need to go back to the system malloc/etc at global destruct time,
- // so objects that were constructed before tcmalloc, using the system
- // malloc, can destroy themselves using the system free. This depends
- // on DLLs unloading in the reverse order in which they load!
- //
- // We also go back to the default HeapAlloc/etc, just for consistency.
- // Who knows, it may help avoid weird bugs in some situations.
- main_executable_windows.Unpatch();
- main_executable.Unpatch();
- if (libc1.is_valid()) libc1.Unpatch();
- if (libc2.is_valid()) libc2.Unpatch();
- if (libc3.is_valid()) libc3.Unpatch();
- if (libc4.is_valid()) libc4.Unpatch();
- if (libc5.is_valid()) libc5.Unpatch();
- if (libc6.is_valid()) libc6.Unpatch();
- if (libc7.is_valid()) libc7.Unpatch();
- if (libc8.is_valid()) libc8.Unpatch();
-}
-#endif
-
-// ---------------------------------------------------------------------
-// Now that we've done all the patching machinery, let's end the file
-// by actually defining the functions we're patching in. Mostly these
-// are simple wrappers around the do_* routines in tcmalloc.cc.
+// Now that we've done all the patching machinery, let's actually
+// define the functions we're patching in. Mostly these are
+// simple wrappers around the do_* routines in tcmalloc.cc.
//
// In fact, we #include tcmalloc.cc to get at the tcmalloc internal
// do_* functions, the better to write our own hook functions.
@@ -1029,19 +981,107 @@ BOOL WINAPI WindowsInfo::Perftools_UnmapViewOfFile(LPCVOID lpBaseAddress) {
lpBaseAddress);
}
+// g_load_map holds a copy of windows' refcount for how many times
+// each currently loaded module has been loaded and unloaded. We use
+// it as an optimization when the same module is loaded more than
+// once: as long as the refcount stays above 1, we don't need to worry
+// about patching because it's already patched. Likewise, we don't
+// need to unpatch until the refcount drops to 0.  g_load_map is
+// maintained in LoadLibraryExW and FreeLibrary, and only covers
+// modules explicitly loaded/freed via those interfaces.
+static std::map<HMODULE, int>* g_load_map = NULL;
+
HMODULE WINAPI WindowsInfo::Perftools_LoadLibraryExW(LPCWSTR lpFileName,
HANDLE hFile,
DWORD dwFlags) {
- HMODULE rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD))
- function_info_[kLoadLibraryExW].origstub_fn)(
- lpFileName, hFile, dwFlags);
- PatchAllModules();
- return rv;
+ HMODULE rv;
+  // Check to see if the module is already loaded; passing flag 0
+  // takes a reference on it if it is.  If it was already loaded there
+  // is no need to call PatchAllModules; taking that reference keeps
+  // the count consistent with what LoadLibraryExW does internally.
+ if (::GetModuleHandleExW(0, lpFileName, &rv)) {
+ return rv;
+ } else {
+ // Not already loaded, so load it.
+ rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD))
+ function_info_[kLoadLibraryExW].origstub_fn)(
+ lpFileName, hFile, dwFlags);
+ // This will patch any newly loaded libraries, if patching needs
+ // to be done.
+ PatchAllModules();
+
+ return rv;
+ }
}
BOOL WINAPI WindowsInfo::Perftools_FreeLibrary(HMODULE hLibModule) {
BOOL rv = ((BOOL (WINAPI *)(HMODULE))
function_info_[kFreeLibrary].origstub_fn)(hLibModule);
+
+ // Check to see if the module is still loaded by passing the base
+ // address and seeing if it comes back with the same address. If it
+ // is the same address it's still loaded, so the FreeLibrary() call
+ // was a noop, and there's no need to redo the patching.
+ HMODULE owner = NULL;
+ BOOL result = ::GetModuleHandleExW(
+ (GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+ GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT),
+ (LPCWSTR)hLibModule,
+ &owner);
+ if (result && owner == hLibModule)
+ return rv;
+
PatchAllModules(); // this will fix up the list of patched libraries
return rv;
}
+
+
+// ---------------------------------------------------------------------
+// PatchWindowsFunctions()
+// This is the function that is exposed to the outside world.
+// It should be called before the program becomes multi-threaded,
+// since main_executable_windows.Patch() is not thread-safe.
+// ---------------------------------------------------------------------
+
+void PatchWindowsFunctions() {
+ // This does the libc patching in every module, and the main executable.
+ PatchAllModules();
+ main_executable_windows.Patch();
+}
+
+#if 0
+// It's possible to unpatch all the functions when we are exiting.
+
+// The idea is to properly handle windows-internal data that is
+// allocated before PatchWindowsFunctions is called. If all
+// destruction happened in reverse order from construction, then we
+// could call UnpatchWindowsFunctions at just the right time, so that
+// that early-allocated data would be freed using the windows
+// allocation functions rather than tcmalloc. The problem is that
+// windows allocates some structures lazily, so it would allocate them
+// late (using tcmalloc) and then try to deallocate them late as well.
+// So instead of unpatching, we just modify all the tcmalloc routines
+// so they call through to the libc routines if the memory in
+// question doesn't seem to have been allocated with tcmalloc. I keep
+// this unpatch code around for reference.
+
+void UnpatchWindowsFunctions() {
+ // We need to go back to the system malloc/etc at global destruct time,
+ // so objects that were constructed before tcmalloc, using the system
+ // malloc, can destroy themselves using the system free. This depends
+ // on DLLs unloading in the reverse order in which they load!
+ //
+ // We also go back to the default HeapAlloc/etc, just for consistency.
+ // Who knows, it may help avoid weird bugs in some situations.
+ main_executable_windows.Unpatch();
+ main_executable.Unpatch();
+ if (libc1.is_valid()) libc1.Unpatch();
+ if (libc2.is_valid()) libc2.Unpatch();
+ if (libc3.is_valid()) libc3.Unpatch();
+ if (libc4.is_valid()) libc4.Unpatch();
+ if (libc5.is_valid()) libc5.Unpatch();
+ if (libc6.is_valid()) libc6.Unpatch();
+ if (libc7.is_valid()) libc7.Unpatch();
+ if (libc8.is_valid()) libc8.Unpatch();
+}
+#endif