commit 5cdb7458cb1d6fc8fc83dd5a177658f1284f5d29
Author:    David L. Jones <dlj@google.com>  2017-11-10 01:07:01 +0000
Committer: David L. Jones <dlj@google.com>  2017-11-10 01:07:01 +0000
Tree:      1772d0d4f25219059bc98e9c65baeef15d268524 /lib/tsan
Parents:   97e140242d3085562afa1340578d5f531a82ad69
           bcc227ee4af1ef3e63033b35dcb1d5627a3b2941
Creating branches/google/testing and tags/google/testing/ from r317203
git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/branches/google/testing@317856 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/tsan')
51 files changed, 1824 insertions, 775 deletions
diff --git a/lib/tsan/CMakeLists.txt b/lib/tsan/CMakeLists.txt index 195ecb5df..08974a467 100644 --- a/lib/tsan/CMakeLists.txt +++ b/lib/tsan/CMakeLists.txt @@ -100,7 +100,7 @@ set(TSAN_RUNTIME_LIBRARIES) add_compiler_rt_component(tsan) if(APPLE) - set(TSAN_ASM_SOURCES rtl/tsan_rtl_amd64.S) + set(TSAN_ASM_SOURCES rtl/tsan_rtl_amd64.S rtl/tsan_rtl_aarch64.S) # Xcode will try to compile this file as C ('clang -x c'), and that will fail. if (${CMAKE_GENERATOR} STREQUAL "Xcode") enable_language(ASM) @@ -109,6 +109,8 @@ if(APPLE) set_source_files_properties(${TSAN_ASM_SOURCES} PROPERTIES LANGUAGE C) endif() + set(TSAN_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS}) + add_weak_symbols("ubsan" WEAK_SYMBOL_LINK_FLAGS) add_weak_symbols("sanitizer_common" WEAK_SYMBOL_LINK_FLAGS) @@ -122,7 +124,8 @@ if(APPLE) RTSanitizerCommonLibc RTUbsan CFLAGS ${TSAN_RTL_CFLAGS} - LINK_FLAGS ${WEAK_SYMBOL_LINK_FLAGS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${TSAN_LINK_LIBS} PARENT_TARGET tsan) add_compiler_rt_object_libraries(RTTsan_dynamic OS ${TSAN_SUPPORTED_OS} @@ -219,7 +222,8 @@ endif() # Build libcxx instrumented with TSan. if(COMPILER_RT_HAS_LIBCXX_SOURCES AND - COMPILER_RT_TEST_COMPILER_ID STREQUAL "Clang") + COMPILER_RT_TEST_COMPILER_ID STREQUAL "Clang" AND + NOT ANDROID) set(libcxx_tsan_deps) foreach(arch ${TSAN_SUPPORTED_ARCH}) get_target_flags_for_arch(${arch} TARGET_CFLAGS) diff --git a/lib/tsan/check_analyze.sh b/lib/tsan/check_analyze.sh index a5d3632df..9b5abc317 100755 --- a/lib/tsan/check_analyze.sh +++ b/lib/tsan/check_analyze.sh @@ -2,6 +2,14 @@ # # Script that checks that critical functions in TSan runtime have correct number # of push/pop/rsp instructions to verify that runtime is efficient enough. +# +# This test can fail when backend code generation changes the output for various +# tsan interceptors. 
When such a change happens, you can ensure that the +# performance has not regressed by running the following benchmarks before and +# after the breaking change to verify that the values in this file are safe to +# update: +# ./projects/compiler-rt/lib/tsan/tests/rtl/TsanRtlTest +# --gtest_also_run_disabled_tests --gtest_filter=DISABLED_BENCH.Mop* set -u @@ -26,22 +34,16 @@ check() { fi } -for f in write1; do +for f in write1 write2 write4 write8; do check $f rsp 1 check $f push 2 - check $f pop 2 -done - -for f in write2 write4 write8; do - check $f rsp 1 - check $f push 3 - check $f pop 3 + check $f pop 12 done for f in read1 read2 read4 read8; do check $f rsp 1 - check $f push 5 - check $f pop 5 + check $f push 3 + check $f pop 18 done for f in func_entry func_exit; do diff --git a/lib/tsan/dd/CMakeLists.txt b/lib/tsan/dd/CMakeLists.txt index bcff35f20..07fc30053 100644 --- a/lib/tsan/dd/CMakeLists.txt +++ b/lib/tsan/dd/CMakeLists.txt @@ -10,7 +10,8 @@ set(DD_SOURCES dd_interceptors.cc ) -set(DD_LINKLIBS) +set(DD_LINKLIBS ${SANITIZER_CXX_ABI_LIBRARY} ${SANITIZER_COMMON_LINK_LIBS}) + append_list_if(COMPILER_RT_HAS_LIBDL dl DD_LINKLIBS) append_list_if(COMPILER_RT_HAS_LIBRT rt DD_LINKLIBS) append_list_if(COMPILER_RT_HAS_LIBPTHREAD pthread DD_LINKLIBS) @@ -40,6 +41,7 @@ if(CAN_TARGET_x86_64 AND UNIX AND NOT APPLE AND NOT ANDROID) $<TARGET_OBJECTS:RTInterception.${arch}> $<TARGET_OBJECTS:RTSanitizerCommon.${arch}> $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}> + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} LINK_LIBS ${DD_LINKLIBS} PARENT_TARGET dd) endif() diff --git a/lib/tsan/dd/dd_interceptors.cc b/lib/tsan/dd/dd_interceptors.cc index 97c72dd2b..a39218f04 100644 --- a/lib/tsan/dd/dd_interceptors.cc +++ b/lib/tsan/dd/dd_interceptors.cc @@ -270,20 +270,19 @@ namespace __dsan { static void InitDataSeg() { MemoryMappingLayout proc_maps(true); - uptr start, end, offset; char name[128]; + MemoryMappedSegment segment(name, ARRAY_SIZE(name)); bool prev_is_data = false; - while (proc_maps.Next(&start, &end, &offset, name, ARRAY_SIZE(name), - /*protection*/ 0)) { - bool is_data = offset != 0 && name[0] != 0; + while (proc_maps.Next(&segment)) { + bool is_data = segment.offset != 0 && segment.filename[0] != 0; // BSS may get merged with [heap] in /proc/self/maps. This is not very // reliable. 
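The InitDataSeg() hunk here swaps the positional Next() out-parameters for the new MemoryMappedSegment struct while keeping the same guessing heuristic: the first file-backed, non-zero-offset mapping starts the data segment, and an anonymous or [heap] mapping that directly follows it extends the segment as BSS. A standalone sketch of that heuristic over /proc/self/maps (plain C++ without sanitizer internals; function and variable names are illustrative, not from the patch):

#include <cstdio>
#include <cstring>

// Guess [data_start, data_end) the way InitDataSeg() does: a mapping with a
// non-zero file offset and a name counts as data; an anonymous or "[heap]"
// mapping that directly follows a data mapping is treated as merged BSS.
static void GuessDataSeg(unsigned long *data_start, unsigned long *data_end) {
  FILE *f = std::fopen("/proc/self/maps", "r");
  if (!f) return;
  char line[512];
  bool prev_is_data = false;
  while (std::fgets(line, sizeof(line), f)) {
    unsigned long start, end, offset;
    char perms[8], name[256];
    name[0] = 0;  // the pathname column is absent for anonymous mappings
    int n = std::sscanf(line, "%lx-%lx %7s %lx %*s %*s %255s",
                        &start, &end, perms, &offset, name);
    if (n < 4) continue;
    bool is_data = offset != 0 && name[0] != 0;
    bool is_bss = offset == 0 &&
        (name[0] == 0 || !std::strcmp(name, "[heap]")) && prev_is_data;
    if (*data_start == 0 && is_data) *data_start = start;
    if (is_bss) *data_end = end;
    prev_is_data = is_data;
  }
  std::fclose(f);
}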
- bool is_bss = offset == 0 && - (name[0] == 0 || internal_strcmp(name, "[heap]") == 0) && prev_is_data; - if (g_data_start == 0 && is_data) - g_data_start = start; - if (is_bss) - g_data_end = end; + bool is_bss = segment.offset == 0 && + (segment.filename[0] == 0 || + internal_strcmp(segment.filename, "[heap]") == 0) && + prev_is_data; + if (g_data_start == 0 && is_data) g_data_start = segment.start; + if (is_bss) g_data_end = segment.end; prev_is_data = is_data; } VPrintf(1, "guessed data_start=%p data_end=%p\n", g_data_start, g_data_end); diff --git a/lib/tsan/go/buildgo.sh b/lib/tsan/go/buildgo.sh index 42d479064..62ff0fc38 100755 --- a/lib/tsan/go/buildgo.sh +++ b/lib/tsan/go/buildgo.sh @@ -5,6 +5,7 @@ set -e SRCS=" tsan_go.cc ../rtl/tsan_clock.cc + ../rtl/tsan_external.cc ../rtl/tsan_flags.cc ../rtl/tsan_interface_atomic.cc ../rtl/tsan_md5.cc @@ -23,6 +24,7 @@ SRCS=" ../../sanitizer_common/sanitizer_common.cc ../../sanitizer_common/sanitizer_common_libcdep.cc ../../sanitizer_common/sanitizer_deadlock_detector2.cc + ../../sanitizer_common/sanitizer_file.cc ../../sanitizer_common/sanitizer_flag_parser.cc ../../sanitizer_common/sanitizer_flags.cc ../../sanitizer_common/sanitizer_libc.cc @@ -66,9 +68,24 @@ elif [ "`uname -a | grep FreeBSD`" != "" ]; then ../../sanitizer_common/sanitizer_linux_libcdep.cc ../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc " +elif [ "`uname -a | grep NetBSD`" != "" ]; then + SUFFIX="netbsd_amd64" + OSCFLAGS="-fno-strict-aliasing -fPIC -Werror" + OSLDFLAGS="-lpthread -fPIC -fpie" + SRCS=" + $SRCS + ../rtl/tsan_platform_linux.cc + ../../sanitizer_common/sanitizer_posix.cc + ../../sanitizer_common/sanitizer_posix_libcdep.cc + ../../sanitizer_common/sanitizer_procmaps_common.cc + ../../sanitizer_common/sanitizer_procmaps_freebsd.cc + ../../sanitizer_common/sanitizer_linux.cc + ../../sanitizer_common/sanitizer_linux_libcdep.cc + ../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc + " elif [ "`uname -a | grep Darwin`" != "" ]; then SUFFIX="darwin_amd64" - OSCFLAGS="-fPIC -Wno-unused-const-variable -Wno-unknown-warning-option -mmacosx-version-min=10.7" + OSCFLAGS="-fPIC -Wno-unused-const-variable -Wno-unknown-warning-option -isysroot $(xcodebuild -version -sdk macosx Path) -mmacosx-version-min=10.7" OSLDFLAGS="-lpthread -fPIC -fpie -mmacosx-version-min=10.7" SRCS=" $SRCS @@ -125,7 +142,7 @@ if [ "$SILENT" != "1" ]; then fi $CC $DIR/gotsan.cc -c -o $DIR/race_$SUFFIX.syso $FLAGS $CFLAGS -$CC $OSCFLAGS test.c $DIR/race_$SUFFIX.syso -m64 -g -o $DIR/test $OSLDFLAGS +$CC $OSCFLAGS test.c $DIR/race_$SUFFIX.syso -m64 -g -o $DIR/test $OSLDFLAGS $LDFLAGS export GORACE="exitcode=0 atexit_sleep_ms=0" if [ "$SILENT" != "1" ]; then diff --git a/lib/tsan/go/tsan_go.cc b/lib/tsan/go/tsan_go.cc index 7fb4eb2a5..d7a9e0b67 100644 --- a/lib/tsan/go/tsan_go.cc +++ b/lib/tsan/go/tsan_go.cc @@ -247,13 +247,17 @@ void __tsan_finalizer_goroutine(ThreadState *thr) { } void __tsan_mutex_before_lock(ThreadState *thr, uptr addr, uptr write) { + if (write) + MutexPreLock(thr, 0, addr); + else + MutexPreReadLock(thr, 0, addr); } void __tsan_mutex_after_lock(ThreadState *thr, uptr addr, uptr write) { if (write) - MutexLock(thr, 0, addr); + MutexPostLock(thr, 0, addr); else - MutexReadLock(thr, 0, addr); + MutexPostReadLock(thr, 0, addr); } void __tsan_mutex_before_unlock(ThreadState *thr, uptr addr, uptr write) { diff --git a/lib/tsan/rtl/tsan.syms.extra b/lib/tsan/rtl/tsan.syms.extra index 22dfde914..ab5b5a4fc 100644 --- a/lib/tsan/rtl/tsan.syms.extra +++ 
b/lib/tsan/rtl/tsan.syms.extra @@ -9,6 +9,16 @@ __tsan_java* __tsan_unaligned* __tsan_release __tsan_acquire +__tsan_mutex_create +__tsan_mutex_destroy +__tsan_mutex_pre_lock +__tsan_mutex_post_lock +__tsan_mutex_pre_unlock +__tsan_mutex_post_unlock +__tsan_mutex_pre_signal +__tsan_mutex_post_signal +__tsan_mutex_pre_divert +__tsan_mutex_post_divert __ubsan_* Annotate* WTFAnnotate* diff --git a/lib/tsan/rtl/tsan_clock.cc b/lib/tsan/rtl/tsan_clock.cc index 32435adfd..ef984a45c 100644 --- a/lib/tsan/rtl/tsan_clock.cc +++ b/lib/tsan/rtl/tsan_clock.cc @@ -61,20 +61,13 @@ // an exclusive lock; ThreadClock's are private to respective threads and so // do not need any protection. // -// Description of ThreadClock state: -// clk_ - fixed size vector clock. -// nclk_ - effective size of the vector clock (the rest is zeros). -// tid_ - index of the thread associated with the clock ("current thread"). -// last_acquire_ - current thread time when it acquired something from -// other threads. -// // Description of SyncClock state: // clk_ - variable size vector clock, low kClkBits hold timestamp, // the remaining bits hold "acquired" flag (the actual value is thread's // reused counter); // if acquired == thr->reused_, then the respective thread has already -// acquired this clock (except possibly dirty_tids_). -// dirty_tids_ - holds up to two indices in the vector clock that other threads +// acquired this clock (except possibly for dirty elements). +// dirty_ - holds up to two indices in the vector clock that other threads // need to acquire regardless of "acquired" flag value; // release_store_tid_ - denotes that the clock state is a result of // release-store operation by the thread with release_store_tid_ index. @@ -90,18 +83,51 @@ namespace __tsan { +static atomic_uint32_t *ref_ptr(ClockBlock *cb) { + return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]); +} + +// Drop reference to the first level block idx. +static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) { + ClockBlock *cb = ctx->clock_alloc.Map(idx); + atomic_uint32_t *ref = ref_ptr(cb); + u32 v = atomic_load(ref, memory_order_acquire); + for (;;) { + CHECK_GT(v, 0); + if (v == 1) + break; + if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel)) + return; + } + // First level block owns second level blocks, so free them as well.
+ for (uptr i = 0; i < blocks; i++) + ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]); + ctx->clock_alloc.Free(c, idx); +} + ThreadClock::ThreadClock(unsigned tid, unsigned reused) : tid_(tid) - , reused_(reused + 1) { // 0 has special meaning + , reused_(reused + 1) // 0 has special meaning + , cached_idx_() + , cached_size_() + , cached_blocks_() { CHECK_LT(tid, kMaxTidInClock); CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits); nclk_ = tid_ + 1; last_acquire_ = 0; internal_memset(clk_, 0, sizeof(clk_)); - clk_[tid_].reused = reused_; } -void ThreadClock::acquire(ClockCache *c, const SyncClock *src) { +void ThreadClock::ResetCached(ClockCache *c) { + if (cached_idx_) { + UnrefClockBlock(c, cached_idx_, cached_blocks_); + cached_idx_ = 0; + cached_size_ = 0; + cached_blocks_ = 0; + } +} + +void ThreadClock::acquire(ClockCache *c, SyncClock *src) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(src->size_, kMaxTid); CPP_STAT_INC(StatClockAcquire); @@ -113,52 +139,46 @@ void ThreadClock::acquire(ClockCache *c, const SyncClock *src) { return; } - // Check if we've already acquired src after the last release operation on src bool acquired = false; - if (nclk > tid_) { - CPP_STAT_INC(StatClockAcquireLarge); - if (src->elem(tid_).reused == reused_) { - CPP_STAT_INC(StatClockAcquireRepeat); - for (unsigned i = 0; i < kDirtyTids; i++) { - unsigned tid = src->dirty_tids_[i]; - if (tid != kInvalidTid) { - u64 epoch = src->elem(tid).epoch; - if (clk_[tid].epoch < epoch) { - clk_[tid].epoch = epoch; - acquired = true; - } - } - } - if (acquired) { - CPP_STAT_INC(StatClockAcquiredSomething); - last_acquire_ = clk_[tid_].epoch; + for (unsigned i = 0; i < kDirtyTids; i++) { + SyncClock::Dirty dirty = src->dirty_[i]; + unsigned tid = dirty.tid; + if (tid != kInvalidTid) { + if (clk_[tid] < dirty.epoch) { + clk_[tid] = dirty.epoch; + acquired = true; } - return; } } - // O(N) acquire. - CPP_STAT_INC(StatClockAcquireFull); - nclk_ = max(nclk_, nclk); - for (uptr i = 0; i < nclk; i++) { - u64 epoch = src->elem(i).epoch; - if (clk_[i].epoch < epoch) { - clk_[i].epoch = epoch; - acquired = true; + // Check if we've already acquired src after the last release operation on src + if (tid_ >= nclk || src->elem(tid_).reused != reused_) { + // O(N) acquire. + CPP_STAT_INC(StatClockAcquireFull); + nclk_ = max(nclk_, nclk); + u64 *dst_pos = &clk_[0]; + for (ClockElem &src_elem : *src) { + u64 epoch = src_elem.epoch; + if (*dst_pos < epoch) { + *dst_pos = epoch; + acquired = true; + } + dst_pos++; } - } - // Remember that this thread has acquired this clock. - if (nclk > tid_) - src->elem(tid_).reused = reused_; + // Remember that this thread has acquired this clock. + if (nclk > tid_) + src->elem(tid_).reused = reused_; + } if (acquired) { CPP_STAT_INC(StatClockAcquiredSomething); - last_acquire_ = clk_[tid_].epoch; + last_acquire_ = clk_[tid_]; + ResetCached(c); } } -void ThreadClock::release(ClockCache *c, SyncClock *dst) const { +void ThreadClock::release(ClockCache *c, SyncClock *dst) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(dst->size_, kMaxTid); @@ -178,7 +198,7 @@ void ThreadClock::release(ClockCache *c, SyncClock *dst) const { // since the last release on dst. If so, we need to update // only dst->elem(tid_). 
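The rewritten acquire() above always folds in the (at most kDirtyTids = 2) dirty entries first and falls back to the O(N) element-wise merge only when this thread incarnation has not already acquired the clock. A toy model of that control flow, with simplified types (illustrative only, not the runtime's code):

#include <cstdint>
#include <vector>

struct Dirty { unsigned tid; uint64_t epoch; };
const unsigned kNoTid = ~0u;  // stands in for kInvalidTid

struct ToySyncClock {
  std::vector<uint64_t> clk;
  Dirty dirty[2] = {{kNoTid, 0}, {kNoTid, 0}};
};

// Merge src into the thread's vector clock: dirty slots unconditionally,
// full O(N) merge only if the clock was not already acquired.
// Assumes clk is at least as long as src.clk and covers all dirty tids.
bool Acquire(std::vector<uint64_t> &clk, const ToySyncClock &src,
             bool already_acquired) {
  bool acquired = false;
  for (const Dirty &d : src.dirty) {
    if (d.tid != kNoTid && clk[d.tid] < d.epoch) {
      clk[d.tid] = d.epoch;
      acquired = true;
    }
  }
  if (!already_acquired) {
    for (size_t i = 0; i < src.clk.size(); i++) {
      if (clk[i] < src.clk[i]) { clk[i] = src.clk[i]; acquired = true; }
    }
  }
  return acquired;  // caller updates last_acquire_ and drops its cache
}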
if (dst->elem(tid_).epoch > last_acquire_) { - UpdateCurrentThread(dst); + UpdateCurrentThread(c, dst); if (dst->release_store_tid_ != tid_ || dst->release_store_reused_ != reused_) dst->release_store_tid_ = kInvalidTid; @@ -187,23 +207,24 @@ // O(N) release. CPP_STAT_INC(StatClockReleaseFull); + dst->Unshare(c); // First, remember whether we've acquired dst. bool acquired = IsAlreadyAcquired(dst); if (acquired) CPP_STAT_INC(StatClockReleaseAcquired); // Update dst->clk_. - for (uptr i = 0; i < nclk_; i++) { - ClockElem &ce = dst->elem(i); - ce.epoch = max(ce.epoch, clk_[i].epoch); + dst->FlushDirty(); + uptr i = 0; + for (ClockElem &ce : *dst) { + ce.epoch = max(ce.epoch, clk_[i]); ce.reused = 0; + i++; } // Clear 'acquired' flag in the remaining elements. if (nclk_ < dst->size_) CPP_STAT_INC(StatClockReleaseClearTail); for (uptr i = nclk_; i < dst->size_; i++) dst->elem(i).reused = 0; - for (unsigned i = 0; i < kDirtyTids; i++) - dst->dirty_tids_[i] = kInvalidTid; dst->release_store_tid_ = kInvalidTid; dst->release_store_reused_ = 0; // If we've acquired dst, remember this fact, @@ -212,11 +233,37 @@ dst->elem(tid_).reused = reused_; } -void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const { +void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(dst->size_, kMaxTid); CPP_STAT_INC(StatClockStore); + if (dst->size_ == 0 && cached_idx_ != 0) { + // Reuse the cached clock. + // Note: we could reuse/cache the cached clock in more cases: + // we could update the existing clock and cache it, or replace it with the + // currently cached clock and release the old one. And for a shared + // existing clock, we could replace it with the currently cached; + // or unshare, update and cache. But, for simplicity, we currently reuse + // cached clock only when the target clock is empty. + dst->tab_ = ctx->clock_alloc.Map(cached_idx_); + dst->tab_idx_ = cached_idx_; + dst->size_ = cached_size_; + dst->blocks_ = cached_blocks_; + CHECK_EQ(dst->dirty_[0].tid, kInvalidTid); + // The cached clock is shared (immutable), + // so this is where we store the current clock. + dst->dirty_[0].tid = tid_; + dst->dirty_[0].epoch = clk_[tid_]; + dst->release_store_tid_ = tid_; + dst->release_store_reused_ = reused_; + // Remember that we don't need to acquire it in the future. + dst->elem(tid_).reused = reused_; + // Grab a reference. + atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); + return; + } + // Check if we need to resize dst. if (dst->size_ < nclk_) dst->Resize(c, nclk_); @@ -225,32 +272,41 @@ dst->release_store_reused_ == reused_ && dst->elem(tid_).epoch > last_acquire_) { CPP_STAT_INC(StatClockStoreFast); - UpdateCurrentThread(dst); + UpdateCurrentThread(c, dst); return; } // O(N) release-store. CPP_STAT_INC(StatClockStoreFull); - for (uptr i = 0; i < nclk_; i++) { - ClockElem &ce = dst->elem(i); - ce.epoch = clk_[i].epoch; + dst->Unshare(c); + // Note: dst can be larger than this ThreadClock. + // This is fine since clk_ beyond size is all zeros. + uptr i = 0; + for (ClockElem &ce : *dst) { + ce.epoch = clk_[i]; ce.reused = 0; + i++; } - // Clear the tail of dst->clk_. 
- if (nclk_ < dst->size_) { - for (uptr i = nclk_; i < dst->size_; i++) { - ClockElem &ce = dst->elem(i); - ce.epoch = 0; - ce.reused = 0; - } - CPP_STAT_INC(StatClockStoreTail); - } - for (unsigned i = 0; i < kDirtyTids; i++) - dst->dirty_tids_[i] = kInvalidTid; + for (uptr i = 0; i < kDirtyTids; i++) + dst->dirty_[i].tid = kInvalidTid; dst->release_store_tid_ = tid_; dst->release_store_reused_ = reused_; // Remember that we don't need to acquire it in the future. dst->elem(tid_).reused = reused_; + + // If the resulting clock is cachable, cache it for future release operations. + // The clock is always cachable if we released to an empty sync object. + if (cached_idx_ == 0 && dst->Cachable()) { + // Grab a reference to the ClockBlock. + atomic_uint32_t *ref = ref_ptr(dst->tab_); + if (atomic_load(ref, memory_order_acquire) == 1) + atomic_store_relaxed(ref, 2); + else + atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); + cached_idx_ = dst->tab_idx_; + cached_size_ = dst->size_; + cached_blocks_ = dst->blocks_; + } } void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) { @@ -260,157 +316,248 @@ } // Updates only single element related to the current thread in dst->clk_. -void ThreadClock::UpdateCurrentThread(SyncClock *dst) const { +void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const { // Update the threads time, but preserve 'acquired' flag. - dst->elem(tid_).epoch = clk_[tid_].epoch; - for (unsigned i = 0; i < kDirtyTids; i++) { - if (dst->dirty_tids_[i] == tid_) { - CPP_STAT_INC(StatClockReleaseFast1); - return; - } - if (dst->dirty_tids_[i] == kInvalidTid) { - CPP_STAT_INC(StatClockReleaseFast2); - dst->dirty_tids_[i] = tid_; + SyncClock::Dirty *dirty = &dst->dirty_[i]; + const unsigned tid = dirty->tid; + if (tid == tid_ || tid == kInvalidTid) { + CPP_STAT_INC(StatClockReleaseFast); + dirty->tid = tid_; + dirty->epoch = clk_[tid_]; return; } } // Reset all 'acquired' flags, O(N). + // We are going to touch dst elements, so we need to unshare it. + dst->Unshare(c); CPP_STAT_INC(StatClockReleaseSlow); + dst->elem(tid_).epoch = clk_[tid_]; for (uptr i = 0; i < dst->size_; i++) dst->elem(i).reused = 0; - for (unsigned i = 0; i < kDirtyTids; i++) - dst->dirty_tids_[i] = kInvalidTid; + dst->FlushDirty(); } -// Checks whether the current threads has already acquired src. +// Checks whether the current thread has already acquired src. bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const { if (src->elem(tid_).reused != reused_) return false; for (unsigned i = 0; i < kDirtyTids; i++) { - unsigned tid = src->dirty_tids_[i]; - if (tid != kInvalidTid) { - if (clk_[tid].epoch < src->elem(tid).epoch) + SyncClock::Dirty dirty = src->dirty_[i]; + if (dirty.tid != kInvalidTid) { + if (clk_[dirty.tid] < dirty.epoch) return false; } } return true; } +// Sets a single element in the vector clock. +// This function is called only from weird places like AcquireGlobal. +void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) { + DCHECK_LT(tid, kMaxTid); + DCHECK_GE(v, clk_[tid]); + clk_[tid] = v; + if (nclk_ <= tid) + nclk_ = tid + 1; + last_acquire_ = clk_[tid_]; + ResetCached(c); +} + +void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) { + printf("clock=["); + for (uptr i = 0; i < nclk_; i++) + printf("%s%llu", i == 0 ? 
"" : ",", clk_[i]); + printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_); +} + +SyncClock::SyncClock() { + ResetImpl(); +} + +SyncClock::~SyncClock() { + // Reset must be called before dtor. + CHECK_EQ(size_, 0); + CHECK_EQ(blocks_, 0); + CHECK_EQ(tab_, 0); + CHECK_EQ(tab_idx_, 0); +} + +void SyncClock::Reset(ClockCache *c) { + if (size_) + UnrefClockBlock(c, tab_idx_, blocks_); + ResetImpl(); +} + +void SyncClock::ResetImpl() { + tab_ = 0; + tab_idx_ = 0; + size_ = 0; + blocks_ = 0; + release_store_tid_ = kInvalidTid; + release_store_reused_ = 0; + for (uptr i = 0; i < kDirtyTids; i++) + dirty_[i].tid = kInvalidTid; +} + void SyncClock::Resize(ClockCache *c, uptr nclk) { CPP_STAT_INC(StatClockReleaseResize); - if (RoundUpTo(nclk, ClockBlock::kClockCount) <= - RoundUpTo(size_, ClockBlock::kClockCount)) { - // Growing within the same block. + Unshare(c); + if (nclk <= capacity()) { // Memory is already allocated, just increase the size. size_ = nclk; return; } - if (nclk <= ClockBlock::kClockCount) { + if (size_ == 0) { // Grow from 0 to one-level table. CHECK_EQ(size_, 0); + CHECK_EQ(blocks_, 0); CHECK_EQ(tab_, 0); CHECK_EQ(tab_idx_, 0); - size_ = nclk; tab_idx_ = ctx->clock_alloc.Alloc(c); tab_ = ctx->clock_alloc.Map(tab_idx_); internal_memset(tab_, 0, sizeof(*tab_)); - return; - } - // Growing two-level table. - if (size_ == 0) { - // Allocate first level table. - tab_idx_ = ctx->clock_alloc.Alloc(c); - tab_ = ctx->clock_alloc.Map(tab_idx_); - internal_memset(tab_, 0, sizeof(*tab_)); - } else if (size_ <= ClockBlock::kClockCount) { - // Transform one-level table to two-level table. - u32 old = tab_idx_; - tab_idx_ = ctx->clock_alloc.Alloc(c); - tab_ = ctx->clock_alloc.Map(tab_idx_); - internal_memset(tab_, 0, sizeof(*tab_)); - tab_->table[0] = old; + atomic_store_relaxed(ref_ptr(tab_), 1); + size_ = 1; + } else if (size_ > blocks_ * ClockBlock::kClockCount) { + u32 idx = ctx->clock_alloc.Alloc(c); + ClockBlock *new_cb = ctx->clock_alloc.Map(idx); + uptr top = size_ - blocks_ * ClockBlock::kClockCount; + CHECK_LT(top, ClockBlock::kClockCount); + const uptr move = top * sizeof(tab_->clock[0]); + internal_memcpy(&new_cb->clock[0], tab_->clock, move); + internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move); + internal_memset(tab_->clock, 0, move); + append_block(idx); } - // At this point we have first level table allocated. + // At this point we have first level table allocated and all clock elements + // are evacuated from it to a second level block. // Add second level tables as necessary. - for (uptr i = RoundUpTo(size_, ClockBlock::kClockCount); - i < nclk; i += ClockBlock::kClockCount) { + while (nclk > capacity()) { u32 idx = ctx->clock_alloc.Alloc(c); ClockBlock *cb = ctx->clock_alloc.Map(idx); internal_memset(cb, 0, sizeof(*cb)); - CHECK_EQ(tab_->table[i/ClockBlock::kClockCount], 0); - tab_->table[i/ClockBlock::kClockCount] = idx; + append_block(idx); } size_ = nclk; } -// Sets a single element in the vector clock. -// This function is called only from weird places like AcquireGlobal. -void ThreadClock::set(unsigned tid, u64 v) { - DCHECK_LT(tid, kMaxTid); - DCHECK_GE(v, clk_[tid].epoch); - clk_[tid].epoch = v; - if (nclk_ <= tid) - nclk_ = tid + 1; - last_acquire_ = clk_[tid_].epoch; -} - -void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) { - printf("clock=["); - for (uptr i = 0; i < nclk_; i++) - printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch); - printf("] reused=["); - for (uptr i = 0; i < nclk_; i++) - printf("%s%llu", i == 0 ? 
"" : ",", clk_[i].reused); - printf("] tid=%u/%u last_acq=%llu", - tid_, reused_, last_acquire_); +// Flushes all dirty elements into the main clock array. +void SyncClock::FlushDirty() { + for (unsigned i = 0; i < kDirtyTids; i++) { + Dirty *dirty = &dirty_[i]; + if (dirty->tid != kInvalidTid) { + CHECK_LT(dirty->tid, size_); + elem(dirty->tid).epoch = dirty->epoch; + dirty->tid = kInvalidTid; + } + } } -SyncClock::SyncClock() - : release_store_tid_(kInvalidTid) - , release_store_reused_() - , tab_() - , tab_idx_() - , size_() { - for (uptr i = 0; i < kDirtyTids; i++) - dirty_tids_[i] = kInvalidTid; +bool SyncClock::IsShared() const { + if (size_ == 0) + return false; + atomic_uint32_t *ref = ref_ptr(tab_); + u32 v = atomic_load(ref, memory_order_acquire); + CHECK_GT(v, 0); + return v > 1; } -SyncClock::~SyncClock() { - // Reset must be called before dtor. - CHECK_EQ(size_, 0); - CHECK_EQ(tab_, 0); - CHECK_EQ(tab_idx_, 0); +// Unshares the current clock if it's shared. +// Shared clocks are immutable, so they need to be unshared before any updates. +// Note: this does not apply to dirty entries as they are not shared. +void SyncClock::Unshare(ClockCache *c) { + if (!IsShared()) + return; + // First, copy current state into old. + SyncClock old; + old.tab_ = tab_; + old.tab_idx_ = tab_idx_; + old.size_ = size_; + old.blocks_ = blocks_; + old.release_store_tid_ = release_store_tid_; + old.release_store_reused_ = release_store_reused_; + for (unsigned i = 0; i < kDirtyTids; i++) + old.dirty_[i] = dirty_[i]; + // Then, clear current object. + ResetImpl(); + // Allocate brand new clock in the current object. + Resize(c, old.size_); + // Now copy state back into this object. + Iter old_iter(&old); + for (ClockElem &ce : *this) { + ce = *old_iter; + ++old_iter; + } + release_store_tid_ = old.release_store_tid_; + release_store_reused_ = old.release_store_reused_; + for (unsigned i = 0; i < kDirtyTids; i++) + dirty_[i] = old.dirty_[i]; + // Drop reference to old and delete if necessary. + old.Reset(c); } -void SyncClock::Reset(ClockCache *c) { - if (size_ == 0) { - // nothing - } else if (size_ <= ClockBlock::kClockCount) { - // One-level table. - ctx->clock_alloc.Free(c, tab_idx_); - } else { - // Two-level table. - for (uptr i = 0; i < size_; i += ClockBlock::kClockCount) - ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]); - ctx->clock_alloc.Free(c, tab_idx_); +// Can we cache this clock for future release operations? +ALWAYS_INLINE bool SyncClock::Cachable() const { + if (size_ == 0) + return false; + for (unsigned i = 0; i < kDirtyTids; i++) { + if (dirty_[i].tid != kInvalidTid) + return false; } - tab_ = 0; - tab_idx_ = 0; - size_ = 0; - release_store_tid_ = kInvalidTid; - release_store_reused_ = 0; - for (uptr i = 0; i < kDirtyTids; i++) - dirty_tids_[i] = kInvalidTid; + return atomic_load_relaxed(ref_ptr(tab_)) == 1; } -ClockElem &SyncClock::elem(unsigned tid) const { +// elem linearizes the two-level structure into linear array. +// Note: this is used only for one time accesses, vector operations use +// the iterator as it is much faster. 
+ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const { DCHECK_LT(tid, size_); - if (size_ <= ClockBlock::kClockCount) + const uptr block = tid / ClockBlock::kClockCount; + DCHECK_LE(block, blocks_); + tid %= ClockBlock::kClockCount; + if (block == blocks_) return tab_->clock[tid]; - u32 idx = tab_->table[tid / ClockBlock::kClockCount]; + u32 idx = get_block(block); ClockBlock *cb = ctx->clock_alloc.Map(idx); - return cb->clock[tid % ClockBlock::kClockCount]; + return cb->clock[tid]; +} + +ALWAYS_INLINE uptr SyncClock::capacity() const { + if (size_ == 0) + return 0; + uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]); + // How many clock elements we can fit into the first level block. + // +1 for ref counter. + uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio; + return blocks_ * ClockBlock::kClockCount + top; +} + +ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const { + DCHECK(size_); + DCHECK_LT(bi, blocks_); + return tab_->table[ClockBlock::kBlockIdx - bi]; +} + +ALWAYS_INLINE void SyncClock::append_block(u32 idx) { + uptr bi = blocks_++; + CHECK_EQ(get_block(bi), 0); + tab_->table[ClockBlock::kBlockIdx - bi] = idx; +} + +// Used only by tests. +u64 SyncClock::get(unsigned tid) const { + for (unsigned i = 0; i < kDirtyTids; i++) { + Dirty dirty = dirty_[i]; + if (dirty.tid == tid) + return dirty.epoch; + } + return elem(tid).epoch; +} + +// Used only by Iter test. +u64 SyncClock::get_clean(unsigned tid) const { + return elem(tid).epoch; } void SyncClock::DebugDump(int(*printf)(const char *s, ...)) { @@ -420,8 +567,32 @@ void SyncClock::DebugDump(int(*printf)(const char *s, ...)) { printf("] reused=["); for (uptr i = 0; i < size_; i++) printf("%s%llu", i == 0 ? "" : ",", elem(i).reused); - printf("] release_store_tid=%d/%d dirty_tids=%d/%d", + printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]", release_store_tid_, release_store_reused_, - dirty_tids_[0], dirty_tids_[1]); + dirty_[0].tid, dirty_[0].epoch, + dirty_[1].tid, dirty_[1].epoch); +} + +void SyncClock::Iter::Next() { + // Finished with the current block, move on to the next one. + block_++; + if (block_ < parent_->blocks_) { + // Iterate over the next second level block. + u32 idx = parent_->get_block(block_); + ClockBlock *cb = ctx->clock_alloc.Map(idx); + pos_ = &cb->clock[0]; + end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount, + ClockBlock::kClockCount); + return; + } + if (block_ == parent_->blocks_ && + parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) { + // Iterate over elements in the first level block. 
+ pos_ = &parent_->tab_->clock[0]; + end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount, + ClockBlock::kClockCount); + return; + } + parent_ = nullptr; // denotes end } } // namespace __tsan diff --git a/lib/tsan/rtl/tsan_clock.h b/lib/tsan/rtl/tsan_clock.h index 4e352cb81..a891d7bbd 100644 --- a/lib/tsan/rtl/tsan_clock.h +++ b/lib/tsan/rtl/tsan_clock.h @@ -18,25 +18,6 @@ namespace __tsan { -struct ClockElem { - u64 epoch : kClkBits; - u64 reused : 64 - kClkBits; -}; - -struct ClockBlock { - static const uptr kSize = 512; - static const uptr kTableSize = kSize / sizeof(u32); - static const uptr kClockCount = kSize / sizeof(ClockElem); - - union { - u32 table[kTableSize]; - ClockElem clock[kClockCount]; - }; - - ClockBlock() { - } -}; - typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc; typedef DenseSlabAllocCache ClockCache; @@ -46,84 +27,200 @@ class SyncClock { SyncClock(); ~SyncClock(); - uptr size() const { - return size_; - } + uptr size() const; - u64 get(unsigned tid) const { - return elem(tid).epoch; - } + // These are used only in tests. + u64 get(unsigned tid) const; + u64 get_clean(unsigned tid) const; void Resize(ClockCache *c, uptr nclk); void Reset(ClockCache *c); void DebugDump(int(*printf)(const char *s, ...)); + // Clock element iterator. + // Note: it iterates only over the table without regard to dirty entries. + class Iter { + public: + explicit Iter(SyncClock* parent); + Iter& operator++(); + bool operator!=(const Iter& other); + ClockElem &operator*(); + + private: + SyncClock *parent_; + // [pos_, end_) is the current continuous range of clock elements. + ClockElem *pos_; + ClockElem *end_; + int block_; // Current number of second level block. + + NOINLINE void Next(); + }; + + Iter begin(); + Iter end(); + private: - friend struct ThreadClock; + friend class ThreadClock; + friend class Iter; static const uptr kDirtyTids = 2; + struct Dirty { + u64 epoch : kClkBits; + u64 tid : 64 - kClkBits; // kInvalidId if not active + }; + unsigned release_store_tid_; unsigned release_store_reused_; - unsigned dirty_tids_[kDirtyTids]; - // tab_ contains indirect pointer to a 512b block using DenseSlabAlloc. - // If size_ <= 64, then tab_ points to an array with 64 ClockElem's. - // Otherwise, tab_ points to an array with 128 u32 elements, + Dirty dirty_[kDirtyTids]; + // If size_ is 0, tab_ is nullptr. + // If size <= 64 (kClockCount), tab_ contains pointer to an array with + // 64 ClockElem's (ClockBlock::clock). + // Otherwise, tab_ points to an array with up to 127 u32 elements, // each pointing to the second-level 512b block with 64 ClockElem's. + // Unused space in the first level ClockBlock is used to store additional + // clock elements. + // The last u32 element in the first level ClockBlock is always used as + // reference counter. + // + // See the following scheme for details. + // All memory blocks are 512 bytes (allocated from ClockAlloc). + // Clock (clk) elements are 64 bits. + // Idx and ref are 32 bits. + // + // tab_ + // | + // \/ + // +----------------------------------------------------+ + // | clk128 | clk129 | ...unused... | idx1 | idx0 | ref | + // +----------------------------------------------------+ + // | | + // | \/ + // | +----------------+ + // | | clk0 ... clk63 | + // | +----------------+ + // \/ + // +------------------+ + // | clk64 ... clk127 | + // +------------------+ + // + // Note: dirty entries, if active, always override what's stored in the clock. 
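The scheme above stores second-level block indices and the reference counter in the tail of the first-level block, so the number of clock elements that still fit in front shrinks as blocks are appended. A self-contained model of the capacity() arithmetic (constants mirror the 512-byte ClockBlock; an illustration, not the runtime code):

#include <cstdio>

const unsigned kTableSize = 128;   // 512 bytes / sizeof(u32)
const unsigned kClockCount = 64;   // 512 bytes / sizeof(ClockElem)
const unsigned kRatio = 2;         // sizeof(ClockElem) / sizeof(u32)

// With 'blocks' second-level blocks, the trailing (blocks + 1) u32 slots
// (indices plus the ref counter) are reserved, and every kRatio reserved
// u32 slots cost one ClockElem slot at the front of the first-level block.
unsigned Capacity(unsigned blocks) {
  unsigned reserved = ((blocks + 1 + kRatio - 1) / kRatio) * kRatio;  // RoundUpTo
  unsigned top = kClockCount - reserved / kRatio;
  return blocks * kClockCount + top;
}

int main() {
  for (unsigned b : {0u, 1u, 2u, 127u})  // 127 = kBlockIdx + 1 blocks at most
    std::printf("blocks=%3u capacity=%u\n", b, Capacity(b));
}

Capacity(127) comes out as 127 * 64 = 8128, which matches the new kMaxTid = (1 << kTidBits) - ClockBlock::kClockCount introduced in the tsan_defs.h hunk below.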
ClockBlock *tab_; u32 tab_idx_; - u32 size_; - + u16 size_; + u16 blocks_; // Number of second level blocks. + + void Unshare(ClockCache *c); + bool IsShared() const; + bool Cachable() const; + void ResetImpl(); + void FlushDirty(); + uptr capacity() const; + u32 get_block(uptr bi) const; + void append_block(u32 idx); ClockElem &elem(unsigned tid) const; }; // The clock that lives in threads. -struct ThreadClock { +class ThreadClock { public: typedef DenseSlabAllocCache Cache; explicit ThreadClock(unsigned tid, unsigned reused = 0); - u64 get(unsigned tid) const { - DCHECK_LT(tid, kMaxTidInClock); - return clk_[tid].epoch; - } - - void set(unsigned tid, u64 v); - - void set(u64 v) { - DCHECK_GE(v, clk_[tid_].epoch); - clk_[tid_].epoch = v; - } + u64 get(unsigned tid) const; + void set(ClockCache *c, unsigned tid, u64 v); + void set(u64 v); + void tick(); + uptr size() const; - void tick() { - clk_[tid_].epoch++; - } - - uptr size() const { - return nclk_; - } - - void acquire(ClockCache *c, const SyncClock *src); - void release(ClockCache *c, SyncClock *dst) const; + void acquire(ClockCache *c, SyncClock *src); + void release(ClockCache *c, SyncClock *dst); void acq_rel(ClockCache *c, SyncClock *dst); - void ReleaseStore(ClockCache *c, SyncClock *dst) const; + void ReleaseStore(ClockCache *c, SyncClock *dst); + void ResetCached(ClockCache *c); void DebugReset(); void DebugDump(int(*printf)(const char *s, ...)); private: static const uptr kDirtyTids = SyncClock::kDirtyTids; + // Index of the thread associated with the clock ("current thread"). const unsigned tid_; - const unsigned reused_; + const unsigned reused_; // tid_ reuse count. + // Current thread time when it acquired something from other threads. u64 last_acquire_; + + // Cached SyncClock (without dirty entries and release_store_tid_). + // We reuse it for subsequent store-release operations without intervening + // acquire operations. Since it is shared (and thus constant), clock value + // for the current thread is then stored in dirty entries in the SyncClock. + // We hold a reference to the table while it is cached here. + u32 cached_idx_; + u16 cached_size_; + u16 cached_blocks_; + + // Number of active elements in the clk_ table (the rest is zeros). uptr nclk_; - ClockElem clk_[kMaxTidInClock]; + u64 clk_[kMaxTidInClock]; // Fixed size vector clock. 
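The cached_idx_ / cached_size_ / cached_blocks_ members above carry the optimization used by ReleaseStore() earlier in this patch: one immutable, refcounted snapshot of the thread's clock is shared across many sync objects, while the thread's own, possibly newer, epoch travels in a dirty slot. A toy rendition (simplified types; illustrative only):

#include <atomic>
#include <cstdint>

struct Snapshot {
  std::atomic<unsigned> ref{1};
  uint64_t clk[64] = {};
};

struct SyncObj {
  Snapshot *tab = nullptr;
  unsigned dirty_tid = ~0u;  // stands in for kInvalidTid
  uint64_t dirty_epoch = 0;
};

// Fast release-store into an empty sync object: share the cached snapshot
// (immutable while ref > 1) and publish the current epoch as a dirty entry.
void ReleaseStoreCached(SyncObj *dst, Snapshot *cached,
                        unsigned tid, uint64_t epoch) {
  cached->ref.fetch_add(1, std::memory_order_relaxed);
  dst->tab = cached;
  dst->dirty_tid = tid;  // overrides tab->clk[tid] for every reader
  dst->dirty_epoch = epoch;
}

// Readers must honor the dirty entry, which overrides the shared table.
uint64_t GetEpoch(const SyncObj &s, unsigned tid) {
  if (s.dirty_tid == tid) return s.dirty_epoch;
  return s.tab ? s.tab->clk[tid] : 0;
}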
bool IsAlreadyAcquired(const SyncClock *src) const; - void UpdateCurrentThread(SyncClock *dst) const; + void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const; }; +ALWAYS_INLINE u64 ThreadClock::get(unsigned tid) const { + DCHECK_LT(tid, kMaxTidInClock); + return clk_[tid]; +} + +ALWAYS_INLINE void ThreadClock::set(u64 v) { + DCHECK_GE(v, clk_[tid_]); + clk_[tid_] = v; +} + +ALWAYS_INLINE void ThreadClock::tick() { + clk_[tid_]++; +} + +ALWAYS_INLINE uptr ThreadClock::size() const { + return nclk_; +} + +ALWAYS_INLINE SyncClock::Iter SyncClock::begin() { + return Iter(this); +} + +ALWAYS_INLINE SyncClock::Iter SyncClock::end() { + return Iter(nullptr); +} + +ALWAYS_INLINE uptr SyncClock::size() const { + return size_; +} + +ALWAYS_INLINE SyncClock::Iter::Iter(SyncClock* parent) + : parent_(parent) + , pos_(nullptr) + , end_(nullptr) + , block_(-1) { + if (parent) + Next(); +} + +ALWAYS_INLINE SyncClock::Iter& SyncClock::Iter::operator++() { + pos_++; + if (UNLIKELY(pos_ >= end_)) + Next(); + return *this; +} + +ALWAYS_INLINE bool SyncClock::Iter::operator!=(const SyncClock::Iter& other) { + return parent_ != other.parent_; +} + +ALWAYS_INLINE ClockElem &SyncClock::Iter::operator*() { + return *pos_; +} } // namespace __tsan #endif // TSAN_CLOCK_H diff --git a/lib/tsan/rtl/tsan_debugging.cc b/lib/tsan/rtl/tsan_debugging.cc index 06154bc13..a44b13632 100644 --- a/lib/tsan/rtl/tsan_debugging.cc +++ b/lib/tsan/rtl/tsan_debugging.cc @@ -151,7 +151,7 @@ int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr, } SANITIZER_INTERFACE_ATTRIBUTE -int __tsan_get_report_thread(void *report, uptr idx, int *tid, uptr *os_id, +int __tsan_get_report_thread(void *report, uptr idx, int *tid, tid_t *os_id, int *running, const char **name, int *parent_tid, void **trace, uptr trace_size) { const ReportDesc *rep = (ReportDesc *)report; @@ -228,7 +228,7 @@ const char *__tsan_locate_address(uptr addr, char *name, uptr name_size, SANITIZER_INTERFACE_ATTRIBUTE int __tsan_get_alloc_stack(uptr addr, uptr *trace, uptr size, int *thread_id, - uptr *os_id) { + tid_t *os_id) { MBlock *b = 0; Allocator *a = allocator(); if (a->PointerIsMine((void *)addr)) { diff --git a/lib/tsan/rtl/tsan_defs.h b/lib/tsan/rtl/tsan_defs.h index 8a0381e61..3c775debf 100644 --- a/lib/tsan/rtl/tsan_defs.h +++ b/lib/tsan/rtl/tsan_defs.h @@ -38,15 +38,40 @@ namespace __tsan { +const int kClkBits = 42; +const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1; + +struct ClockElem { + u64 epoch : kClkBits; + u64 reused : 64 - kClkBits; // tid reuse count +}; + +struct ClockBlock { + static const uptr kSize = 512; + static const uptr kTableSize = kSize / sizeof(u32); + static const uptr kClockCount = kSize / sizeof(ClockElem); + static const uptr kRefIdx = kTableSize - 1; + static const uptr kBlockIdx = kTableSize - 2; + + union { + u32 table[kTableSize]; + ClockElem clock[kClockCount]; + }; + + ClockBlock() { + } +}; + const int kTidBits = 13; -const unsigned kMaxTid = 1 << kTidBits; +// Reduce kMaxTid by kClockCount because one slot in ClockBlock table is +// occupied by reference counter, so total number of elements we can store +// in SyncClock is kClockCount * (kTableSize - 1). +const unsigned kMaxTid = (1 << kTidBits) - ClockBlock::kClockCount; #if !SANITIZER_GO const unsigned kMaxTidInClock = kMaxTid * 2; // This includes msb 'freed' bit. #else const unsigned kMaxTidInClock = kMaxTid; // Go does not track freed memory. 
#endif -const int kClkBits = 42; -const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1; const uptr kShadowStackSize = 64 * 1024; // Count of shadow values in a shadow cell. @@ -74,7 +99,7 @@ const bool kCollectHistory = false; const bool kCollectHistory = true; #endif -const unsigned kInvalidTid = (unsigned)-1; +const u16 kInvalidTid = kMaxTid + 1; // The following "build consistency" machinery ensures that all source files // are built in the same configuration. Inconsistent builds lead to @@ -157,6 +182,15 @@ struct MBlock { COMPILER_CHECK(sizeof(MBlock) == 16); +enum ExternalTag : uptr { + kExternalTagNone = 0, + kExternalTagSwiftModifyingAccess = 1, + kExternalTagFirstUserAvailable = 2, + kExternalTagMax = 1024, + // Don't set kExternalTagMax over 65,536, since MBlock only stores tags + // as 16-bit values, see tsan_defs.h. +}; + } // namespace __tsan #endif // TSAN_DEFS_H diff --git a/lib/tsan/rtl/tsan_dense_alloc.h b/lib/tsan/rtl/tsan_dense_alloc.h index e9815c90a..16dbdf391 100644 --- a/lib/tsan/rtl/tsan_dense_alloc.h +++ b/lib/tsan/rtl/tsan_dense_alloc.h @@ -39,7 +39,7 @@ class DenseSlabAlloc { typedef DenseSlabAllocCache Cache; typedef typename Cache::IndexT IndexT; - DenseSlabAlloc() { + explicit DenseSlabAlloc(const char *name) { // Check that kL1Size and kL2Size are sane. CHECK_EQ(kL1Size & (kL1Size - 1), 0); CHECK_EQ(kL2Size & (kL2Size - 1), 0); @@ -49,6 +49,7 @@ class DenseSlabAlloc { internal_memset(map_, 0, sizeof(map_)); freelist_ = 0; fillpos_ = 0; + name_ = name; } ~DenseSlabAlloc() { @@ -96,15 +97,19 @@ class DenseSlabAlloc { SpinMutex mtx_; IndexT freelist_; uptr fillpos_; + const char *name_; void Refill(Cache *c) { SpinMutexLock lock(&mtx_); if (freelist_ == 0) { if (fillpos_ == kL1Size) { - Printf("ThreadSanitizer: DenseSlabAllocator overflow. Dying.\n"); + Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", + name_, kL1Size, kL2Size); Die(); } - T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), "DenseSlabAllocator"); + VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", + name_, fillpos_, kL1Size, kL2Size); + T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_); // Reserve 0 as invalid index. IndexT start = fillpos_ == 0 ? 1 : 0; for (IndexT i = start; i < kL2Size; i++) { diff --git a/lib/tsan/rtl/tsan_external.cc b/lib/tsan/rtl/tsan_external.cc index dc8ec6232..6c0e9477e 100644 --- a/lib/tsan/rtl/tsan_external.cc +++ b/lib/tsan/rtl/tsan_external.cc @@ -11,34 +11,91 @@ // //===----------------------------------------------------------------------===// #include "tsan_rtl.h" +#include "tsan_interceptors.h" namespace __tsan { #define CALLERPC ((uptr)__builtin_return_address(0)) -const uptr kMaxTag = 128; // Limited to 65,536, since MBlock only stores tags - // as 16-bit values, see tsan_defs.h. +struct TagData { + const char *object_type; + const char *header; +}; -const char *registered_tags[kMaxTag]; -static atomic_uint32_t used_tags{1}; // Tag 0 means "no tag". NOLINT - -const char *GetObjectTypeFromTag(uptr tag) { - if (tag == 0) return nullptr; +static TagData registered_tags[kExternalTagMax] = { + {}, + {"Swift variable", "Swift access race"}, +}; +static atomic_uint32_t used_tags{kExternalTagFirstUserAvailable}; // NOLINT. +static TagData *GetTagData(uptr tag) { // Invalid/corrupted tag? Better return NULL and let the caller deal with it. 
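From a consumer's point of view the flow in this file is: register a tag once (optionally with a custom report header), then bracket every access to a tagged object with the read/write hooks so races are reported as, e.g., "race on MyTable". A minimal usage sketch against the interface defined here (the wrapper functions and "MyTable" are hypothetical):

// Minimal consumer of the external API below (wrapper names hypothetical).
extern "C" {
void *__tsan_external_register_tag(const char *object_type);
void __tsan_external_register_header(void *tag, const char *header);
void __tsan_external_assign_tag(void *addr, void *tag);
void __tsan_external_read(void *addr, void *caller_pc, void *tag);
void __tsan_external_write(void *addr, void *caller_pc, void *tag);
}

static void *g_table_tag;

void InitTableTag() {
  g_table_tag = __tsan_external_register_tag("MyTable");
  __tsan_external_register_header(g_table_tag, "race on MyTable");
}

// Library entry points report races on this object as "race on MyTable".
void TableGet(void *table) {
  __tsan_external_read(table, __builtin_return_address(0), g_table_tag);
  // ... the actual unsynchronized read happens here ...
}

void TablePut(void *table) {
  __tsan_external_write(table, __builtin_return_address(0), g_table_tag);
  // ... the actual unsynchronized write happens here ...
}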
if (tag >= atomic_load(&used_tags, memory_order_relaxed)) return nullptr; - return registered_tags[tag]; + return &registered_tags[tag]; +} + +const char *GetObjectTypeFromTag(uptr tag) { + TagData *tag_data = GetTagData(tag); + return tag_data ? tag_data->object_type : nullptr; +} + +const char *GetReportHeaderFromTag(uptr tag) { + TagData *tag_data = GetTagData(tag); + return tag_data ? tag_data->header : nullptr; +} + +void InsertShadowStackFrameForTag(ThreadState *thr, uptr tag) { + FuncEntry(thr, (uptr)&registered_tags[tag]); +} + +uptr TagFromShadowStackFrame(uptr pc) { + uptr tag_count = atomic_load(&used_tags, memory_order_relaxed); + void *pc_ptr = (void *)pc; + if (pc_ptr < GetTagData(0) || pc_ptr > GetTagData(tag_count - 1)) + return 0; + return (TagData *)pc_ptr - GetTagData(0); +} + +#if !SANITIZER_GO + +typedef void(*AccessFunc)(ThreadState *, uptr, uptr, int); +void ExternalAccess(void *addr, void *caller_pc, void *tag, AccessFunc access) { + CHECK_LT(tag, atomic_load(&used_tags, memory_order_relaxed)); + ThreadState *thr = cur_thread(); + if (caller_pc) FuncEntry(thr, (uptr)caller_pc); + InsertShadowStackFrameForTag(thr, (uptr)tag); + bool in_ignored_lib; + if (!caller_pc || !libignore()->IsIgnored((uptr)caller_pc, &in_ignored_lib)) { + access(thr, CALLERPC, (uptr)addr, kSizeLog1); + } + FuncExit(thr); + if (caller_pc) FuncExit(thr); } extern "C" { SANITIZER_INTERFACE_ATTRIBUTE void *__tsan_external_register_tag(const char *object_type) { uptr new_tag = atomic_fetch_add(&used_tags, 1, memory_order_relaxed); - CHECK_LT(new_tag, kMaxTag); - registered_tags[new_tag] = internal_strdup(object_type); + CHECK_LT(new_tag, kExternalTagMax); + GetTagData(new_tag)->object_type = internal_strdup(object_type); + char header[127] = {0}; + internal_snprintf(header, sizeof(header), "race on %s", object_type); + GetTagData(new_tag)->header = internal_strdup(header); return (void *)new_tag; } SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_external_register_header(void *tag, const char *header) { + CHECK_GE((uptr)tag, kExternalTagFirstUserAvailable); + CHECK_LT((uptr)tag, kExternalTagMax); + atomic_uintptr_t *header_ptr = + (atomic_uintptr_t *)&GetTagData((uptr)tag)->header; + header = internal_strdup(header); + char *old_header = + (char *)atomic_exchange(header_ptr, (uptr)header, memory_order_seq_cst); + if (old_header) internal_free(old_header); +} + +SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_assign_tag(void *addr, void *tag) { CHECK_LT(tag, atomic_load(&used_tags, memory_order_relaxed)); Allocator *a = allocator(); @@ -54,25 +111,15 @@ void __tsan_external_assign_tag(void *addr, void *tag) { SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_read(void *addr, void *caller_pc, void *tag) { - CHECK_LT(tag, atomic_load(&used_tags, memory_order_relaxed)); - ThreadState *thr = cur_thread(); - thr->external_tag = (uptr)tag; - FuncEntry(thr, (uptr)caller_pc); - MemoryRead(thr, CALLERPC, (uptr)addr, kSizeLog8); - FuncExit(thr); - thr->external_tag = 0; + ExternalAccess(addr, caller_pc, tag, MemoryRead); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_write(void *addr, void *caller_pc, void *tag) { - CHECK_LT(tag, atomic_load(&used_tags, memory_order_relaxed)); - ThreadState *thr = cur_thread(); - thr->external_tag = (uptr)tag; - FuncEntry(thr, (uptr)caller_pc); - MemoryWrite(thr, CALLERPC, (uptr)addr, kSizeLog8); - FuncExit(thr); - thr->external_tag = 0; + ExternalAccess(addr, caller_pc, tag, MemoryWrite); } } // extern "C" +#endif // !SANITIZER_GO + } // namespace __tsan diff --git 
a/lib/tsan/rtl/tsan_fd.cc b/lib/tsan/rtl/tsan_fd.cc index d84df4a64..f13a7432e 100644 --- a/lib/tsan/rtl/tsan_fd.cc +++ b/lib/tsan/rtl/tsan_fd.cc @@ -48,8 +48,8 @@ static bool bogusfd(int fd) { } static FdSync *allocsync(ThreadState *thr, uptr pc) { - FdSync *s = (FdSync*)user_alloc(thr, pc, sizeof(FdSync), kDefaultAlignment, - false); + FdSync *s = (FdSync*)user_alloc_internal(thr, pc, sizeof(FdSync), + kDefaultAlignment, false); atomic_store(&s->rc, 1, memory_order_relaxed); return s; } @@ -79,7 +79,7 @@ static FdDesc *fddesc(ThreadState *thr, uptr pc, int fd) { if (l1 == 0) { uptr size = kTableSizeL2 * sizeof(FdDesc); // We need this to reside in user memory to properly catch races on it. - void *p = user_alloc(thr, pc, size, kDefaultAlignment, false); + void *p = user_alloc_internal(thr, pc, size, kDefaultAlignment, false); internal_memset(p, 0, size); MemoryResetRange(thr, (uptr)&fddesc, (uptr)p, size); if (atomic_compare_exchange_strong(pl1, &l1, (uptr)p, memory_order_acq_rel)) diff --git a/lib/tsan/rtl/tsan_flags.cc b/lib/tsan/rtl/tsan_flags.cc index d8d4746ab..89e22a132 100644 --- a/lib/tsan/rtl/tsan_flags.cc +++ b/lib/tsan/rtl/tsan_flags.cc @@ -21,10 +21,6 @@ namespace __tsan { -Flags *flags() { - return &ctx->flags; -} - // Can be overriden in frontend. #ifdef TSAN_EXTERNAL_HOOKS extern "C" const char* __tsan_default_options(); diff --git a/lib/tsan/rtl/tsan_flags.h b/lib/tsan/rtl/tsan_flags.h index e2f6b3c9f..66740def5 100644 --- a/lib/tsan/rtl/tsan_flags.h +++ b/lib/tsan/rtl/tsan_flags.h @@ -28,7 +28,6 @@ struct Flags : DDFlags { void ParseFromString(const char *str); }; -Flags *flags(); void InitializeFlags(Flags *flags, const char *env); } // namespace __tsan diff --git a/lib/tsan/rtl/tsan_interceptors.cc b/lib/tsan/rtl/tsan_interceptors.cc index 9bf1b28b9..79e243aef 100644 --- a/lib/tsan/rtl/tsan_interceptors.cc +++ b/lib/tsan/rtl/tsan_interceptors.cc @@ -14,10 +14,13 @@ //===----------------------------------------------------------------------===// #include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_errno.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_linux.h" +#include "sanitizer_common/sanitizer_platform_limits_netbsd.h" #include "sanitizer_common/sanitizer_platform_limits_posix.h" #include "sanitizer_common/sanitizer_placement_new.h" +#include "sanitizer_common/sanitizer_posix.h" #include "sanitizer_common/sanitizer_stacktrace.h" #include "sanitizer_common/sanitizer_tls_get_addr.h" #include "interception/interception.h" @@ -29,29 +32,32 @@ #include "tsan_mman.h" #include "tsan_fd.h" -#if SANITIZER_POSIX -#include "sanitizer_common/sanitizer_posix.h" -#endif using namespace __tsan; // NOLINT #if SANITIZER_FREEBSD || SANITIZER_MAC -#define __errno_location __error #define stdout __stdoutp #define stderr __stderrp #endif -#if SANITIZER_ANDROID -#define __errno_location __errno -#define mallopt(a, b) +#if SANITIZER_NETBSD +#define dirfd(dirp) (*(int *)(dirp)) +#define fileno_unlocked fileno + +#if _LP64 +#define __sF_size 152 +#else +#define __sF_size 88 #endif -#if SANITIZER_LINUX || SANITIZER_FREEBSD -#define PTHREAD_CREATE_DETACHED 1 -#elif SANITIZER_MAC -#define PTHREAD_CREATE_DETACHED 2 +#define stdout ((char*)&__sF + (__sF_size * 1)) +#define stderr ((char*)&__sF + (__sF_size * 2)) + #endif +#if SANITIZER_ANDROID +#define mallopt(a, b) +#endif #ifdef __mips__ const int kSigCount = 129; @@ -93,24 +99,26 @@ DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr size) DECLARE_REAL_AND_INTERCEPTOR(void, 
free, void *ptr) extern "C" void *pthread_self(); extern "C" void _exit(int status); -extern "C" int *__errno_location(); extern "C" int fileno_unlocked(void *stream); +#if !SANITIZER_NETBSD extern "C" int dirfd(void *dirp); -#if !SANITIZER_FREEBSD && !SANITIZER_ANDROID +#endif +#if !SANITIZER_FREEBSD && !SANITIZER_ANDROID && !SANITIZER_NETBSD extern "C" int mallopt(int param, int value); #endif +#if SANITIZER_NETBSD +extern __sanitizer_FILE __sF[]; +#else extern __sanitizer_FILE *stdout, *stderr; -#if !SANITIZER_FREEBSD && !SANITIZER_MAC +#endif +#if !SANITIZER_FREEBSD && !SANITIZER_MAC && !SANITIZER_NETBSD const int PTHREAD_MUTEX_RECURSIVE = 1; const int PTHREAD_MUTEX_RECURSIVE_NP = 1; #else const int PTHREAD_MUTEX_RECURSIVE = 2; const int PTHREAD_MUTEX_RECURSIVE_NP = 2; #endif -const int EINVAL = 22; -const int EBUSY = 16; -const int EOWNERDEAD = 130; -#if !SANITIZER_FREEBSD && !SANITIZER_MAC +#if !SANITIZER_FREEBSD && !SANITIZER_MAC && !SANITIZER_NETBSD const int EPOLL_CTL_ADD = 1; #endif const int SIGILL = 4; @@ -119,7 +127,7 @@ const int SIGFPE = 8; const int SIGSEGV = 11; const int SIGPIPE = 13; const int SIGTERM = 15; -#if defined(__mips__) || SANITIZER_FREEBSD || SANITIZER_MAC +#if defined(__mips__) || SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_NETBSD const int SIGBUS = 10; const int SIGSYS = 12; #else @@ -127,7 +135,9 @@ const int SIGBUS = 7; const int SIGSYS = 31; #endif void *const MAP_FAILED = (void*)-1; -#if !SANITIZER_MAC +#if SANITIZER_NETBSD +const int PTHREAD_BARRIER_SERIAL_THREAD = 1234567; +#elif !SANITIZER_MAC const int PTHREAD_BARRIER_SERIAL_THREAD = -1; #endif const int MAP_FIXED = 0x10; @@ -139,8 +149,6 @@ typedef long long_t; // NOLINT # define F_TLOCK 2 /* Test and lock a region for exclusive use. */ # define F_TEST 3 /* Test a region for other processes locks. */ -#define errno (*__errno_location()) - typedef void (*sighandler_t)(int sig); typedef void (*sigactionhandler_t)(int sig, my_siginfo_t *siginfo, void *uctx); @@ -154,6 +162,15 @@ struct sigaction_t { __sanitizer_sigset_t sa_mask; void (*sa_restorer)(); }; +#elif SANITIZER_NETBSD +struct sigaction_t { + union { + sighandler_t sa_handler; + sigactionhandler_t sa_sigaction; + }; + __sanitizer_sigset_t sa_mask; + int sa_flags; +}; #else struct sigaction_t { #ifdef __mips__ @@ -182,7 +199,7 @@ struct sigaction_t { const sighandler_t SIG_DFL = (sighandler_t)0; const sighandler_t SIG_IGN = (sighandler_t)1; const sighandler_t SIG_ERR = (sighandler_t)-1; -#if SANITIZER_FREEBSD || SANITIZER_MAC +#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_NETBSD const int SA_SIGINFO = 0x40; const int SIG_SETMASK = 3; #elif defined(__mips__) @@ -219,7 +236,7 @@ struct ThreadSignalContext { // The object is 64-byte aligned, because we want hot data to be located in // a single cache line if possible (it's accessed in every interceptor). 
static ALIGNED(64) char libignore_placeholder[sizeof(LibIgnore)]; -static LibIgnore *libignore() { +LibIgnore *libignore() { return reinterpret_cast<LibIgnore*>(&libignore_placeholder[0]); } @@ -277,7 +294,8 @@ ScopedInterceptor::~ScopedInterceptor() { void ScopedInterceptor::EnableIgnores() { if (ignoring_) { - ThreadIgnoreBegin(thr_, pc_); + ThreadIgnoreBegin(thr_, pc_, /*save_stack=*/false); + if (flags()->ignore_noninstrumented_modules) thr_->suppress_reports++; if (in_ignored_lib_) { DCHECK(!thr_->in_ignored_lib); thr_->in_ignored_lib = true; @@ -288,6 +306,7 @@ void ScopedInterceptor::EnableIgnores() { void ScopedInterceptor::DisableIgnores() { if (ignoring_) { ThreadIgnoreEnd(thr_, pc_); + if (flags()->ignore_noninstrumented_modules) thr_->suppress_reports--; if (in_ignored_lib_) { DCHECK(thr_->in_ignored_lib); thr_->in_ignored_lib = false; @@ -296,7 +315,7 @@ void ScopedInterceptor::DisableIgnores() { } #define TSAN_INTERCEPT(func) INTERCEPT_FUNCTION(func) -#if SANITIZER_FREEBSD +#if SANITIZER_FREEBSD || SANITIZER_NETBSD # define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION(func) #else # define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION_VER(func, ver) @@ -360,6 +379,11 @@ TSAN_INTERCEPTOR(int, nanosleep, void *req, void *rem) { return res; } +TSAN_INTERCEPTOR(int, pause, int fake) { + SCOPED_TSAN_INTERCEPTOR(pause, fake); + return BLOCK_REAL(pause)(fake); +} + // The sole reason tsan wraps atexit callbacks is to establish synchronization // between callback setup and callback execution. struct AtExitCtx { @@ -473,8 +497,14 @@ static void SetJmp(ThreadState *thr, uptr sp, uptr mangled_sp) { static void LongJmp(ThreadState *thr, uptr *env) { #ifdef __powerpc__ uptr mangled_sp = env[0]; -#elif SANITIZER_FREEBSD || SANITIZER_MAC +#elif SANITIZER_FREEBSD || SANITIZER_NETBSD uptr mangled_sp = env[2]; +#elif SANITIZER_MAC +# ifdef __aarch64__ + uptr mangled_sp = env[13]; +# else + uptr mangled_sp = env[2]; +# endif #elif defined(SANITIZER_LINUX) # ifdef __aarch64__ uptr mangled_sp = env[13]; @@ -592,7 +622,7 @@ TSAN_INTERCEPTOR(void*, malloc, uptr size) { TSAN_INTERCEPTOR(void*, __libc_memalign, uptr align, uptr sz) { SCOPED_TSAN_INTERCEPTOR(__libc_memalign, align, sz); - return user_alloc(thr, pc, sz, align); + return user_memalign(thr, pc, align, sz); } TSAN_INTERCEPTOR(void*, calloc, uptr size, uptr n) { @@ -672,7 +702,7 @@ static bool fix_mmap_addr(void **addr, long_t sz, int flags) { if (*addr) { if (!IsAppMem((uptr)*addr) || !IsAppMem((uptr)*addr + sz - 1)) { if (flags & MAP_FIXED) { - errno = EINVAL; + errno = errno_EINVAL; return false; } else { *addr = 0; @@ -738,7 +768,7 @@ TSAN_INTERCEPTOR(int, munmap, void *addr, long_t sz) { #if SANITIZER_LINUX TSAN_INTERCEPTOR(void*, memalign, uptr align, uptr sz) { SCOPED_INTERCEPTOR_RAW(memalign, align, sz); - return user_alloc(thr, pc, sz, align); + return user_memalign(thr, pc, align, sz); } #define TSAN_MAYBE_INTERCEPT_MEMALIGN TSAN_INTERCEPT(memalign) #else @@ -747,21 +777,20 @@ TSAN_INTERCEPTOR(void*, memalign, uptr align, uptr sz) { #if !SANITIZER_MAC TSAN_INTERCEPTOR(void*, aligned_alloc, uptr align, uptr sz) { - SCOPED_INTERCEPTOR_RAW(memalign, align, sz); - return user_alloc(thr, pc, sz, align); + SCOPED_INTERCEPTOR_RAW(aligned_alloc, align, sz); + return user_aligned_alloc(thr, pc, align, sz); } TSAN_INTERCEPTOR(void*, valloc, uptr sz) { SCOPED_INTERCEPTOR_RAW(valloc, sz); - return user_alloc(thr, pc, sz, GetPageSizeCached()); + return user_valloc(thr, pc, sz); } #endif #if SANITIZER_LINUX TSAN_INTERCEPTOR(void*, 
pvalloc, uptr sz) { SCOPED_INTERCEPTOR_RAW(pvalloc, sz); - sz = RoundUp(sz, GetPageSizeCached()); - return user_alloc(thr, pc, sz, GetPageSizeCached()); + return user_pvalloc(thr, pc, sz); } #define TSAN_MAYBE_INTERCEPT_PVALLOC TSAN_INTERCEPT(pvalloc) #else @@ -771,8 +800,7 @@ TSAN_INTERCEPTOR(void*, pvalloc, uptr sz) { #if !SANITIZER_MAC TSAN_INTERCEPTOR(int, posix_memalign, void **memptr, uptr align, uptr sz) { SCOPED_INTERCEPTOR_RAW(posix_memalign, memptr, align, sz); - *memptr = user_alloc(thr, pc, sz, align); - return 0; + return user_posix_memalign(thr, pc, memptr, align, sz); } #endif @@ -928,8 +956,7 @@ TSAN_INTERCEPTOR(int, pthread_create, ThreadIgnoreEnd(thr, pc); } if (res == 0) { - int tid = ThreadCreate(thr, pc, *(uptr*)th, - detached == PTHREAD_CREATE_DETACHED); + int tid = ThreadCreate(thr, pc, *(uptr*)th, IsStateDetached(detached)); CHECK_NE(tid, 0); // Synchronization on p.tid serves two purposes: // 1. ThreadCreate must finish before the new thread starts. @@ -1025,7 +1052,7 @@ static void cond_mutex_unlock(CondMutexUnlockCtx *arg) { ThreadSignalContext *ctx = SigCtx(arg->thr); CHECK_EQ(atomic_load(&ctx->in_blocking_func, memory_order_relaxed), 1); atomic_store(&ctx->in_blocking_func, 0, memory_order_relaxed); - MutexLock(arg->thr, arg->pc, (uptr)arg->m); + MutexPostLock(arg->thr, arg->pc, (uptr)arg->m, MutexFlagDoPreLockOnPostLock); // Undo BlockingCall ctor effects. arg->thr->ignore_interceptors--; arg->si->~ScopedInterceptor(); @@ -1054,7 +1081,7 @@ static int cond_wait(ThreadState *thr, uptr pc, ScopedInterceptor *si, fn, c, m, t, (void (*)(void *arg))cond_mutex_unlock, &arg); } if (res == errno_EOWNERDEAD) MutexRepair(thr, pc, (uptr)m); - MutexLock(thr, pc, (uptr)m); + MutexPostLock(thr, pc, (uptr)m, MutexFlagDoPreLockOnPostLock); return res; } @@ -1114,14 +1141,15 @@ TSAN_INTERCEPTOR(int, pthread_mutex_init, void *m, void *a) { SCOPED_TSAN_INTERCEPTOR(pthread_mutex_init, m, a); int res = REAL(pthread_mutex_init)(m, a); if (res == 0) { - bool recursive = false; + u32 flagz = 0; if (a) { int type = 0; if (REAL(pthread_mutexattr_gettype)(a, &type) == 0) - recursive = (type == PTHREAD_MUTEX_RECURSIVE - || type == PTHREAD_MUTEX_RECURSIVE_NP); + if (type == PTHREAD_MUTEX_RECURSIVE || + type == PTHREAD_MUTEX_RECURSIVE_NP) + flagz |= MutexFlagWriteReentrant; } - MutexCreate(thr, pc, (uptr)m, false, recursive, false); + MutexCreate(thr, pc, (uptr)m, flagz); } return res; } @@ -1129,7 +1157,7 @@ TSAN_INTERCEPTOR(int, pthread_mutex_init, void *m, void *a) { TSAN_INTERCEPTOR(int, pthread_mutex_destroy, void *m) { SCOPED_TSAN_INTERCEPTOR(pthread_mutex_destroy, m); int res = REAL(pthread_mutex_destroy)(m); - if (res == 0 || res == EBUSY) { + if (res == 0 || res == errno_EBUSY) { MutexDestroy(thr, pc, (uptr)m); } return res; @@ -1138,10 +1166,10 @@ TSAN_INTERCEPTOR(int, pthread_mutex_destroy, void *m) { TSAN_INTERCEPTOR(int, pthread_mutex_trylock, void *m) { SCOPED_TSAN_INTERCEPTOR(pthread_mutex_trylock, m); int res = REAL(pthread_mutex_trylock)(m); - if (res == EOWNERDEAD) + if (res == errno_EOWNERDEAD) MutexRepair(thr, pc, (uptr)m); - if (res == 0 || res == EOWNERDEAD) - MutexLock(thr, pc, (uptr)m, /*rec=*/1, /*try_lock=*/true); + if (res == 0 || res == errno_EOWNERDEAD) + MutexPostLock(thr, pc, (uptr)m, MutexFlagTryLock); return res; } @@ -1150,7 +1178,7 @@ TSAN_INTERCEPTOR(int, pthread_mutex_timedlock, void *m, void *abstime) { SCOPED_TSAN_INTERCEPTOR(pthread_mutex_timedlock, m, abstime); int res = REAL(pthread_mutex_timedlock)(m, abstime); if (res == 0) { - MutexLock(thr, pc, 
@@ -1114,14 +1141,15 @@ TSAN_INTERCEPTOR(int, pthread_mutex_init, void *m, void *a) {
   SCOPED_TSAN_INTERCEPTOR(pthread_mutex_init, m, a);
   int res = REAL(pthread_mutex_init)(m, a);
   if (res == 0) {
-    bool recursive = false;
+    u32 flagz = 0;
     if (a) {
       int type = 0;
       if (REAL(pthread_mutexattr_gettype)(a, &type) == 0)
-        recursive = (type == PTHREAD_MUTEX_RECURSIVE
-            || type == PTHREAD_MUTEX_RECURSIVE_NP);
+        if (type == PTHREAD_MUTEX_RECURSIVE ||
+            type == PTHREAD_MUTEX_RECURSIVE_NP)
+          flagz |= MutexFlagWriteReentrant;
     }
-    MutexCreate(thr, pc, (uptr)m, false, recursive, false);
+    MutexCreate(thr, pc, (uptr)m, flagz);
   }
   return res;
 }
@@ -1129,7 +1157,7 @@ TSAN_INTERCEPTOR(int, pthread_mutex_init, void *m, void *a) {
 TSAN_INTERCEPTOR(int, pthread_mutex_destroy, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_mutex_destroy, m);
   int res = REAL(pthread_mutex_destroy)(m);
-  if (res == 0 || res == EBUSY) {
+  if (res == 0 || res == errno_EBUSY) {
     MutexDestroy(thr, pc, (uptr)m);
   }
   return res;
@@ -1138,10 +1166,10 @@ TSAN_INTERCEPTOR(int, pthread_mutex_destroy, void *m) {
 TSAN_INTERCEPTOR(int, pthread_mutex_trylock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_mutex_trylock, m);
   int res = REAL(pthread_mutex_trylock)(m);
-  if (res == EOWNERDEAD)
+  if (res == errno_EOWNERDEAD)
     MutexRepair(thr, pc, (uptr)m);
-  if (res == 0 || res == EOWNERDEAD)
-    MutexLock(thr, pc, (uptr)m, /*rec=*/1, /*try_lock=*/true);
+  if (res == 0 || res == errno_EOWNERDEAD)
+    MutexPostLock(thr, pc, (uptr)m, MutexFlagTryLock);
   return res;
 }
 
@@ -1150,7 +1178,7 @@ TSAN_INTERCEPTOR(int, pthread_mutex_timedlock, void *m, void *abstime) {
   SCOPED_TSAN_INTERCEPTOR(pthread_mutex_timedlock, m, abstime);
   int res = REAL(pthread_mutex_timedlock)(m, abstime);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m);
+    MutexPostLock(thr, pc, (uptr)m, MutexFlagTryLock);
   }
   return res;
 }
@@ -1161,7 +1189,7 @@ TSAN_INTERCEPTOR(int, pthread_spin_init, void *m, int pshared) {
   SCOPED_TSAN_INTERCEPTOR(pthread_spin_init, m, pshared);
   int res = REAL(pthread_spin_init)(m, pshared);
   if (res == 0) {
-    MutexCreate(thr, pc, (uptr)m, false, false, false);
+    MutexCreate(thr, pc, (uptr)m);
   }
   return res;
 }
@@ -1177,9 +1205,10 @@ TSAN_INTERCEPTOR(int, pthread_spin_destroy, void *m) {
 
 TSAN_INTERCEPTOR(int, pthread_spin_lock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_spin_lock, m);
+  MutexPreLock(thr, pc, (uptr)m);
   int res = REAL(pthread_spin_lock)(m);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m);
+    MutexPostLock(thr, pc, (uptr)m);
   }
   return res;
 }
@@ -1188,7 +1217,7 @@ TSAN_INTERCEPTOR(int, pthread_spin_trylock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_spin_trylock, m);
   int res = REAL(pthread_spin_trylock)(m);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m, /*rec=*/1, /*try_lock=*/true);
+    MutexPostLock(thr, pc, (uptr)m, MutexFlagTryLock);
   }
   return res;
 }
@@ -1205,7 +1234,7 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_init, void *m, void *a) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_init, m, a);
   int res = REAL(pthread_rwlock_init)(m, a);
   if (res == 0) {
-    MutexCreate(thr, pc, (uptr)m, true, false, false);
+    MutexCreate(thr, pc, (uptr)m);
   }
   return res;
 }
@@ -1221,9 +1250,10 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_destroy, void *m) {
 
 TSAN_INTERCEPTOR(int, pthread_rwlock_rdlock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_rdlock, m);
+  MutexPreReadLock(thr, pc, (uptr)m);
   int res = REAL(pthread_rwlock_rdlock)(m);
   if (res == 0) {
-    MutexReadLock(thr, pc, (uptr)m);
+    MutexPostReadLock(thr, pc, (uptr)m);
   }
   return res;
 }
@@ -1232,7 +1262,7 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_tryrdlock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_tryrdlock, m);
   int res = REAL(pthread_rwlock_tryrdlock)(m);
   if (res == 0) {
-    MutexReadLock(thr, pc, (uptr)m, /*try_lock=*/true);
+    MutexPostReadLock(thr, pc, (uptr)m, MutexFlagTryLock);
   }
   return res;
 }
@@ -1242,7 +1272,7 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_timedrdlock, void *m, void *abstime) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_timedrdlock, m, abstime);
   int res = REAL(pthread_rwlock_timedrdlock)(m, abstime);
   if (res == 0) {
-    MutexReadLock(thr, pc, (uptr)m);
+    MutexPostReadLock(thr, pc, (uptr)m);
   }
   return res;
 }
@@ -1250,9 +1280,10 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_timedrdlock, void *m, void *abstime) {
 
 TSAN_INTERCEPTOR(int, pthread_rwlock_wrlock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_wrlock, m);
+  MutexPreLock(thr, pc, (uptr)m);
   int res = REAL(pthread_rwlock_wrlock)(m);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m);
+    MutexPostLock(thr, pc, (uptr)m);
   }
   return res;
 }
@@ -1261,7 +1292,7 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_trywrlock, void *m) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_trywrlock, m);
   int res = REAL(pthread_rwlock_trywrlock)(m);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m, /*rec=*/1, /*try_lock=*/true);
+    MutexPostLock(thr, pc, (uptr)m, MutexFlagTryLock);
   }
   return res;
 }
@@ -1271,7 +1302,7 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_timedwrlock, void *m, void *abstime) {
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_timedwrlock, m, abstime);
   int res = REAL(pthread_rwlock_timedwrlock)(m, abstime);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m);
+    MutexPostLock(thr, pc, (uptr)m, MutexFlagTryLock);
   }
   return res;
 }
@@ -1315,7 +1346,7 @@ TSAN_INTERCEPTOR(int, pthread_barrier_wait, void *b) {
 
 TSAN_INTERCEPTOR(int, pthread_once, void *o, void (*f)()) {
   SCOPED_INTERCEPTOR_RAW(pthread_once, o, f);
   if (o == 0 || f == 0)
-    return EINVAL;
+    return errno_EINVAL;
   atomic_uint32_t *a;
   if (!SANITIZER_MAC)
     a = static_cast<atomic_uint32_t*>(o);
@@ -1352,7 +1383,7 @@ TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) {
 #endif
 
 TSAN_INTERCEPTOR(int, fstat, int fd, void *buf) {
-#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_ANDROID
+#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_ANDROID || SANITIZER_NETBSD
   SCOPED_TSAN_INTERCEPTOR(fstat, fd, buf);
   if (fd > 0)
     FdAccess(thr, pc, fd);
@@ -1644,24 +1675,6 @@ TSAN_INTERCEPTOR(void*, tmpfile64, int fake) {
 #define TSAN_MAYBE_INTERCEPT_TMPFILE64
 #endif
 
-TSAN_INTERCEPTOR(uptr, fread, void *ptr, uptr size, uptr nmemb, void *f) {
-  // libc file streams can call user-supplied functions, see fopencookie.
-  {
-    SCOPED_TSAN_INTERCEPTOR(fread, ptr, size, nmemb, f);
-    MemoryAccessRange(thr, pc, (uptr)ptr, size * nmemb, true);
-  }
-  return REAL(fread)(ptr, size, nmemb, f);
-}
-
-TSAN_INTERCEPTOR(uptr, fwrite, const void *p, uptr size, uptr nmemb, void *f) {
-  // libc file streams can call user-supplied functions, see fopencookie.
-  {
-    SCOPED_TSAN_INTERCEPTOR(fwrite, p, size, nmemb, f);
-    MemoryAccessRange(thr, pc, (uptr)p, size * nmemb, false);
-  }
-  return REAL(fwrite)(p, size, nmemb, f);
-}
-
 static void FlushStreams() {
   // Flushing all the streams here may freeze the process if a child thread is
   // performing file stream operations at the same time.
@@ -1951,7 +1964,7 @@ TSAN_INTERCEPTOR(int, sigaction, int sig, sigaction_t *act, sigaction_t *old) {
     sigactions[sig].sa_flags = *(volatile int*)&act->sa_flags;
     internal_memcpy(&sigactions[sig].sa_mask, &act->sa_mask,
         sizeof(sigactions[sig].sa_mask));
-#if !SANITIZER_FREEBSD && !SANITIZER_MAC
+#if !SANITIZER_FREEBSD && !SANITIZER_MAC && !SANITIZER_NETBSD
     sigactions[sig].sa_restorer = act->sa_restorer;
 #endif
     sigaction_t newact;
@@ -2251,8 +2264,12 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc,
 #define COMMON_INTERCEPTOR_ON_EXIT(ctx) \
   OnExit(((TsanInterceptorContext *) ctx)->thr)
 
-#define COMMON_INTERCEPTOR_MUTEX_LOCK(ctx, m) \
-  MutexLock(((TsanInterceptorContext *)ctx)->thr, \
+#define COMMON_INTERCEPTOR_MUTEX_PRE_LOCK(ctx, m) \
+  MutexPreLock(((TsanInterceptorContext *)ctx)->thr, \
+      ((TsanInterceptorContext *)ctx)->pc, (uptr)m)
+
+#define COMMON_INTERCEPTOR_MUTEX_POST_LOCK(ctx, m) \
+  MutexPostLock(((TsanInterceptorContext *)ctx)->thr, \
      ((TsanInterceptorContext *)ctx)->pc, (uptr)m)
 
 #define COMMON_INTERCEPTOR_MUTEX_UNLOCK(ctx, m) \
@@ -2309,7 +2326,7 @@ struct ScopedSyscall {
   }
 };
 
-#if !SANITIZER_FREEBSD && !SANITIZER_MAC
+#if !SANITIZER_FREEBSD && !SANITIZER_MAC && !SANITIZER_NETBSD
 static void syscall_access_range(uptr pc, uptr p, uptr s, bool write) {
   TSAN_SYSCALL();
   MemoryAccessRange(thr, pc, p, s, write);
@@ -2579,6 +2596,7 @@ void InitializeInterceptors() {
   TSAN_INTERCEPT(sleep);
   TSAN_INTERCEPT(usleep);
   TSAN_INTERCEPT(nanosleep);
+  TSAN_INTERCEPT(pause);
   TSAN_INTERCEPT(gettimeofday);
   TSAN_INTERCEPT(getaddrinfo);
diff --git a/lib/tsan/rtl/tsan_interceptors.h b/lib/tsan/rtl/tsan_interceptors.h
index 72534f4a2..de4746650 100644
--- a/lib/tsan/rtl/tsan_interceptors.h
+++ b/lib/tsan/rtl/tsan_interceptors.h
@@ -19,6 +19,8 @@ class ScopedInterceptor {
   bool ignoring_;
 };
 
+LibIgnore *libignore();
+
 }  // namespace __tsan
 
 #define SCOPED_INTERCEPTOR_RAW(func, ...) \
diff --git a/lib/tsan/rtl/tsan_interceptors_mac.cc b/lib/tsan/rtl/tsan_interceptors_mac.cc
index fc5eb0499..4f1079467 100644
--- a/lib/tsan/rtl/tsan_interceptors_mac.cc
+++ b/lib/tsan/rtl/tsan_interceptors_mac.cc
@@ -21,7 +21,10 @@
 #include "tsan_interface_ann.h"
 
 #include <libkern/OSAtomic.h>
+
+#if defined(__has_include) && __has_include(<xpc/xpc.h>)
 #include <xpc/xpc.h>
+#endif  // #if defined(__has_include) && __has_include(<xpc/xpc.h>)
 
 typedef long long_t;  // NOLINT
 
@@ -235,6 +238,8 @@ TSAN_INTERCEPTOR(void, os_lock_unlock, void *lock) {
   REAL(os_lock_unlock)(lock);
 }
 
+#if defined(__has_include) && __has_include(<xpc/xpc.h>)
+
 TSAN_INTERCEPTOR(void, xpc_connection_set_event_handler,
                  xpc_connection_t connection, xpc_handler_t handler) {
   SCOPED_TSAN_INTERCEPTOR(xpc_connection_set_event_handler, connection,
@@ -281,6 +286,14 @@ TSAN_INTERCEPTOR(void, xpc_connection_send_message_with_reply,
       (connection, message, replyq, new_handler);
 }
 
+TSAN_INTERCEPTOR(void, xpc_connection_cancel, xpc_connection_t connection) {
+  SCOPED_TSAN_INTERCEPTOR(xpc_connection_cancel, connection);
+  Release(thr, pc, (uptr)connection);
+  REAL(xpc_connection_cancel)(connection);
+}
+
+#endif  // #if defined(__has_include) && __has_include(<xpc/xpc.h>)
+
 // On macOS, libc++ is always linked dynamically, so intercepting works the
 // usual way.
 #define STDCXX_INTERCEPTOR TSAN_INTERCEPTOR
diff --git a/lib/tsan/rtl/tsan_interface.h b/lib/tsan/rtl/tsan_interface.h
index 496a8717f..a80a48991 100644
--- a/lib/tsan/rtl/tsan_interface.h
+++ b/lib/tsan/rtl/tsan_interface.h
@@ -18,6 +18,7 @@
 #include <sanitizer_common/sanitizer_internal_defs.h>
 
 using __sanitizer::uptr;
+using __sanitizer::tid_t;
 
 // This header should NOT include any other headers.
 // All functions in this header are extern "C" and start with __tsan_.
@@ -81,6 +82,8 @@ SANITIZER_INTERFACE_ATTRIBUTE void __tsan_ignore_thread_end();
 SANITIZER_INTERFACE_ATTRIBUTE
 void *__tsan_external_register_tag(const char *object_type);
 SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_external_register_header(void *tag, const char *header);
+SANITIZER_INTERFACE_ATTRIBUTE
 void __tsan_external_assign_tag(void *addr, void *tag);
 SANITIZER_INTERFACE_ATTRIBUTE
 void __tsan_external_read(void *addr, void *caller_pc, void *tag);
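The external-tag entry points declared above are meant to be called by library authors. As a hedged usage sketch (everything here except the __tsan_external_* calls is hypothetical, and the declarations are assumed to be visible to user code, e.g. via the runtime interface header above):

    // Hypothetical container type instrumented with the external-race API.
    static void *table_tag;  // one tag per object type

    void InitTableTag() {
      table_tag = __tsan_external_register_tag("CustomTable");
      // Customizes the report header printed for races on these objects.
      __tsan_external_register_header(table_tag, "race on CustomTable");
    }

    void TableReadUnlocked(void *table) {
      // Reports a race if another thread modifies `table` concurrently.
      __tsan_external_read(table, __builtin_return_address(0), table_tag);
      /* ... actual unsynchronized read of the table ... */
    }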
@@ -143,7 +146,7 @@ int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr,
 
 // Returns information about threads included in the report.
 SANITIZER_INTERFACE_ATTRIBUTE
-int __tsan_get_report_thread(void *report, uptr idx, int *tid, uptr *os_id,
+int __tsan_get_report_thread(void *report, uptr idx, int *tid, tid_t *os_id,
                              int *running, const char **name, int *parent_tid,
                              void **trace, uptr trace_size);
 
@@ -160,7 +163,7 @@ const char *__tsan_locate_address(uptr addr, char *name, uptr name_size,
 // Returns the allocation stack for a heap pointer.
 SANITIZER_INTERFACE_ATTRIBUTE
 int __tsan_get_alloc_stack(uptr addr, uptr *trace, uptr size, int *thread_id,
-                           uptr *os_id);
+                           tid_t *os_id);
 
 #endif  // SANITIZER_GO
diff --git a/lib/tsan/rtl/tsan_interface_ann.cc b/lib/tsan/rtl/tsan_interface_ann.cc
index 62db79661..f68a0468d 100644
--- a/lib/tsan/rtl/tsan_interface_ann.cc
+++ b/lib/tsan/rtl/tsan_interface_ann.cc
@@ -31,11 +31,10 @@ namespace __tsan {
 
 class ScopedAnnotation {
  public:
-  ScopedAnnotation(ThreadState *thr, const char *aname, const char *f, int l,
-                   uptr pc)
+  ScopedAnnotation(ThreadState *thr, const char *aname, uptr pc)
       : thr_(thr) {
     FuncEntry(thr_, pc);
-    DPrintf("#%d: annotation %s() %s:%d\n", thr_->tid, aname, f, l);
+    DPrintf("#%d: annotation %s()\n", thr_->tid, aname);
   }
 
   ~ScopedAnnotation() {
@@ -46,18 +45,20 @@ class ScopedAnnotation {
   ThreadState *const thr_;
 };
 
-#define SCOPED_ANNOTATION(typ) \
+#define SCOPED_ANNOTATION_RET(typ, ret) \
     if (!flags()->enable_annotations) \
-      return; \
+      return ret; \
     ThreadState *thr = cur_thread(); \
     const uptr caller_pc = (uptr)__builtin_return_address(0); \
     StatInc(thr, StatAnnotation); \
     StatInc(thr, Stat##typ); \
-    ScopedAnnotation sa(thr, __func__, f, l, caller_pc); \
+    ScopedAnnotation sa(thr, __func__, caller_pc); \
     const uptr pc = StackTrace::GetCurrentPc(); \
     (void)pc; \
 /**/
 
+#define SCOPED_ANNOTATION(typ) SCOPED_ANNOTATION_RET(typ, )
+
 static const int kMaxDescLen = 128;
 
 struct ExpectRace {
@@ -252,12 +253,12 @@ void INTERFACE_ATTRIBUTE AnnotateCondVarWait(char *f, int l, uptr cv,
 
 void INTERFACE_ATTRIBUTE AnnotateRWLockCreate(char *f, int l, uptr m) {
   SCOPED_ANNOTATION(AnnotateRWLockCreate);
-  MutexCreate(thr, pc, m, true, true, false);
+  MutexCreate(thr, pc, m, MutexFlagWriteReentrant);
 }
 
 void INTERFACE_ATTRIBUTE AnnotateRWLockCreateStatic(char *f, int l, uptr m) {
   SCOPED_ANNOTATION(AnnotateRWLockCreateStatic);
-  MutexCreate(thr, pc, m, true, true, true);
+  MutexCreate(thr, pc, m, MutexFlagWriteReentrant | MutexFlagLinkerInit);
 }
 
 void INTERFACE_ATTRIBUTE AnnotateRWLockDestroy(char *f, int l, uptr m) {
@@ -269,9 +270,9 @@ void INTERFACE_ATTRIBUTE AnnotateRWLockAcquired(char *f, int l, uptr m,
                                                 uptr is_w) {
   SCOPED_ANNOTATION(AnnotateRWLockAcquired);
   if (is_w)
-    MutexLock(thr, pc, m);
+    MutexPostLock(thr, pc, m, MutexFlagDoPreLockOnPostLock);
   else
-    MutexReadLock(thr, pc, m);
+    MutexPostReadLock(thr, pc, m, MutexFlagDoPreLockOnPostLock);
 }
 
 void INTERFACE_ATTRIBUTE AnnotateRWLockReleased(char *f, int l, uptr m,
@@ -458,4 +459,95 @@ void INTERFACE_ATTRIBUTE AnnotateMemoryIsInitialized(char *f, int l, uptr mem,
                                                      uptr sz) {}
 void INTERFACE_ATTRIBUTE AnnotateMemoryIsUninitialized(char *f, int l, uptr mem,
                                                        uptr sz) {}
+
+// Note: the parameter is called flagz, because flags is already taken
+// by the global function that returns flags.
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_create(void *m, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_create);
+  MutexCreate(thr, pc, (uptr)m, flagz & MutexCreationFlagMask);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_destroy(void *m, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_destroy);
+  MutexDestroy(thr, pc, (uptr)m, flagz);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_pre_lock(void *m, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_pre_lock);
+  if (!(flagz & MutexFlagTryLock)) {
+    if (flagz & MutexFlagReadLock)
+      MutexPreReadLock(thr, pc, (uptr)m);
+    else
+      MutexPreLock(thr, pc, (uptr)m);
+  }
+  ThreadIgnoreBegin(thr, pc, /*save_stack=*/false);
+  ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_post_lock(void *m, unsigned flagz, int rec) {
+  SCOPED_ANNOTATION(__tsan_mutex_post_lock);
+  ThreadIgnoreSyncEnd(thr, pc);
+  ThreadIgnoreEnd(thr, pc);
+  if (!(flagz & MutexFlagTryLockFailed)) {
+    if (flagz & MutexFlagReadLock)
+      MutexPostReadLock(thr, pc, (uptr)m, flagz);
+    else
+      MutexPostLock(thr, pc, (uptr)m, flagz, rec);
+  }
+}
+
+INTERFACE_ATTRIBUTE
+int __tsan_mutex_pre_unlock(void *m, unsigned flagz) {
+  SCOPED_ANNOTATION_RET(__tsan_mutex_pre_unlock, 0);
+  int ret = 0;
+  if (flagz & MutexFlagReadLock) {
+    CHECK(!(flagz & MutexFlagRecursiveUnlock));
+    MutexReadUnlock(thr, pc, (uptr)m);
+  } else {
+    ret = MutexUnlock(thr, pc, (uptr)m, flagz);
+  }
+  ThreadIgnoreBegin(thr, pc, /*save_stack=*/false);
+  ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false);
+  return ret;
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_post_unlock(void *m, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_post_unlock);
+  ThreadIgnoreSyncEnd(thr, pc);
+  ThreadIgnoreEnd(thr, pc);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_pre_signal(void *addr, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_pre_signal);
+  ThreadIgnoreBegin(thr, pc, /*save_stack=*/false);
+  ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_post_signal(void *addr, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_post_signal);
+  ThreadIgnoreSyncEnd(thr, pc);
+  ThreadIgnoreEnd(thr, pc);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_pre_divert(void *addr, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_pre_divert);
+  // Exit from ignore region started in __tsan_mutex_pre_lock/unlock/signal.
+  ThreadIgnoreSyncEnd(thr, pc);
+  ThreadIgnoreEnd(thr, pc);
+}
+
+INTERFACE_ATTRIBUTE
+void __tsan_mutex_post_divert(void *addr, unsigned flagz) {
+  SCOPED_ANNOTATION(__tsan_mutex_post_divert);
+  ThreadIgnoreBegin(thr, pc, /*save_stack=*/false);
+  ThreadIgnoreSyncBegin(thr, pc, /*save_stack=*/false);
+}
 }  // extern "C"
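The block above is the entire new annotation surface for custom synchronization primitives. As a usage sketch (not part of the patch), a toy spinlock can be made visible to TSan like this, assuming the entry points above are declared to user code (the extern declarations mirror the signatures defined above):

    #include <atomic>

    extern "C" {  // assumed to match the interface exported above
    void __tsan_mutex_create(void *addr, unsigned flagz);
    void __tsan_mutex_destroy(void *addr, unsigned flagz);
    void __tsan_mutex_pre_lock(void *addr, unsigned flagz);
    void __tsan_mutex_post_lock(void *addr, unsigned flagz, int rec);
    int __tsan_mutex_pre_unlock(void *addr, unsigned flagz);
    void __tsan_mutex_post_unlock(void *addr, unsigned flagz);
    }

    class SpinLock {
     public:
      SpinLock() { __tsan_mutex_create(this, 0); }
      ~SpinLock() { __tsan_mutex_destroy(this, 0); }
      void Lock() {
        __tsan_mutex_pre_lock(this, 0);      // may block: announce intent
        while (locked_.exchange(true, std::memory_order_acquire)) {}
        __tsan_mutex_post_lock(this, 0, 0);  // acquired: clocks synchronized
      }
      void Unlock() {
        __tsan_mutex_pre_unlock(this, 0);
        locked_.store(false, std::memory_order_release);
        __tsan_mutex_post_unlock(this, 0);
      }
     private:
      std::atomic<bool> locked_{false};
    };

Note how pre_lock/post_lock bracket the real acquisition, which is also why the implementation above enters an ignore region in between: the spinning loads themselves must not be reported as races.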
diff --git a/lib/tsan/rtl/tsan_interface_atomic.cc b/lib/tsan/rtl/tsan_interface_atomic.cc
index 5238b66a2..d334394f5 100644
--- a/lib/tsan/rtl/tsan_interface_atomic.cc
+++ b/lib/tsan/rtl/tsan_interface_atomic.cc
@@ -220,8 +220,7 @@ static a128 NoTsanAtomicLoad(const volatile a128 *a, morder mo) {
 #endif
 
 template<typename T>
-static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a,
-    morder mo) {
+static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, morder mo) {
   CHECK(IsLoadOrder(mo));
   // This fast-path is critical for performance.
   // Assume the access is atomic.
@@ -229,10 +228,17 @@ static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a,
     MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
     return NoTsanAtomicLoad(a, mo);
   }
-  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, false);
-  AcquireImpl(thr, pc, &s->clock);
+  // Don't create sync object if it does not exist yet. For example, an atomic
+  // pointer is initialized to nullptr and then periodically acquire-loaded.
   T v = NoTsanAtomicLoad(a, mo);
-  s->mtx.ReadUnlock();
+  SyncVar *s = ctx->metamap.GetIfExistsAndLock((uptr)a, false);
+  if (s) {
+    AcquireImpl(thr, pc, &s->clock);
+    // Re-read under sync mutex because we need a consistent snapshot
+    // of the value and the clock we acquire.
+    v = NoTsanAtomicLoad(a, mo);
+    s->mtx.ReadUnlock();
+  }
   MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
   return v;
 }
@@ -450,13 +456,32 @@ static void AtomicFence(ThreadState *thr, uptr pc, morder mo) {
 
 // C/C++
 
+static morder convert_morder(morder mo) {
+  if (flags()->force_seq_cst_atomics)
+    return (morder)mo_seq_cst;
+
+  // Filter out additional memory order flags:
+  // MEMMODEL_SYNC        = 1 << 15
+  // __ATOMIC_HLE_ACQUIRE = 1 << 16
+  // __ATOMIC_HLE_RELEASE = 1 << 17
+  //
+  // HLE is an optimization, and we pretend that elision always fails.
+  // MEMMODEL_SYNC is used when lowering __sync_ atomics,
+  // since we use __sync_ atomics for actual atomic operations,
+  // we can safely ignore it as well. It also subtly affects semantics,
+  // but we don't model the difference.
+  return (morder)(mo & 0x7fff);
+}
+
 #define SCOPED_ATOMIC(func, ...) \
-    const uptr callpc = (uptr)__builtin_return_address(0); \
-    uptr pc = StackTrace::GetCurrentPc(); \
-    mo = flags()->force_seq_cst_atomics ? (morder)mo_seq_cst : mo; \
     ThreadState *const thr = cur_thread(); \
-    if (thr->ignore_interceptors) \
+    if (thr->ignore_sync || thr->ignore_interceptors) { \
+      ProcessPendingSignals(thr); \
       return NoTsanAtomic##func(__VA_ARGS__); \
+    } \
+    const uptr callpc = (uptr)__builtin_return_address(0); \
+    uptr pc = StackTrace::GetCurrentPc(); \
+    mo = convert_morder(mo); \
    AtomicStatInc(thr, sizeof(*a), mo, StatAtomic##func); \
    ScopedAtomic sa(thr, callpc, a, mo, __func__); \
    return Atomic##func(thr, pc, __VA_ARGS__); \
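The AtomicLoad change above is easiest to see with the access pattern its comment names: a pointer that is acquire-loaded on every call but published rarely. Until the first release-store there is no clock to acquire, so the load no longer has to allocate a sync object for the address. An illustrative pattern (ordinary user code, not from the patch):

    #include <atomic>

    struct Config { int verbosity; };
    static std::atomic<Config *> g_config{nullptr};

    Config *GetConfig() {
      // Hot path: with the change above, this load allocates no TSan
      // metadata as long as g_config has never been published.
      return g_config.load(std::memory_order_acquire);
    }

    void PublishConfig(Config *c) {
      // First release-store; the sync object is created on this side.
      g_config.store(c, std::memory_order_release);
    }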
diff --git a/lib/tsan/rtl/tsan_interface_java.cc b/lib/tsan/rtl/tsan_interface_java.cc
index 5bdc04f07..75e960e62 100644
--- a/lib/tsan/rtl/tsan_interface_java.cc
+++ b/lib/tsan/rtl/tsan_interface_java.cc
@@ -180,8 +180,8 @@ void __tsan_java_mutex_lock(jptr addr) {
   CHECK_GE(addr, jctx->heap_begin);
   CHECK_LT(addr, jctx->heap_begin + jctx->heap_size);
 
-  MutexCreate(thr, pc, addr, true, true, true);
-  MutexLock(thr, pc, addr);
+  MutexPostLock(thr, pc, addr, MutexFlagLinkerInit | MutexFlagWriteReentrant |
+      MutexFlagDoPreLockOnPostLock);
 }
 
 void __tsan_java_mutex_unlock(jptr addr) {
@@ -201,8 +201,8 @@ void __tsan_java_mutex_read_lock(jptr addr) {
   CHECK_GE(addr, jctx->heap_begin);
   CHECK_LT(addr, jctx->heap_begin + jctx->heap_size);
 
-  MutexCreate(thr, pc, addr, true, true, true);
-  MutexReadLock(thr, pc, addr);
+  MutexPostReadLock(thr, pc, addr, MutexFlagLinkerInit |
+      MutexFlagWriteReentrant | MutexFlagDoPreLockOnPostLock);
 }
 
 void __tsan_java_mutex_read_unlock(jptr addr) {
@@ -223,8 +223,8 @@ void __tsan_java_mutex_lock_rec(jptr addr, int rec) {
   CHECK_LT(addr, jctx->heap_begin + jctx->heap_size);
   CHECK_GT(rec, 0);
 
-  MutexCreate(thr, pc, addr, true, true, true);
-  MutexLock(thr, pc, addr, rec);
+  MutexPostLock(thr, pc, addr, MutexFlagLinkerInit | MutexFlagWriteReentrant |
+      MutexFlagDoPreLockOnPostLock | MutexFlagRecursiveLock, rec);
 }
 
 int __tsan_java_mutex_unlock_rec(jptr addr) {
@@ -234,7 +234,7 @@ int __tsan_java_mutex_unlock_rec(jptr addr) {
   CHECK_GE(addr, jctx->heap_begin);
   CHECK_LT(addr, jctx->heap_begin + jctx->heap_size);
 
-  return MutexUnlock(thr, pc, addr, true);
+  return MutexUnlock(thr, pc, addr, MutexFlagRecursiveUnlock);
 }
 
 void __tsan_java_acquire(jptr addr) {
diff --git a/lib/tsan/rtl/tsan_libdispatch_mac.cc b/lib/tsan/rtl/tsan_libdispatch_mac.cc
index d8c689ebb..0bd010700 100644
--- a/lib/tsan/rtl/tsan_libdispatch_mac.cc
+++ b/lib/tsan/rtl/tsan_libdispatch_mac.cc
@@ -86,21 +86,23 @@ static tsan_block_context_t *AllocContext(ThreadState *thr, uptr pc,
                                           void *orig_context,
                                           dispatch_function_t orig_work) {
   tsan_block_context_t *new_context =
-      (tsan_block_context_t *)user_alloc(thr, pc, sizeof(tsan_block_context_t));
+      (tsan_block_context_t *)user_alloc_internal(thr, pc,
+                                                  sizeof(tsan_block_context_t));
   new_context->queue = queue;
   new_context->orig_context = orig_context;
   new_context->orig_work = orig_work;
   new_context->free_context_in_callback = true;
   new_context->submitted_synchronously = false;
   new_context->is_barrier_block = false;
+  new_context->non_queue_sync_object = 0;
   return new_context;
 }
 
-#define GET_QUEUE_SYNC_VARS(context, q)                      \
-  bool is_queue_serial = q && IsQueueSerial(q);              \
-  uptr sync_ptr = (uptr)q ?: context->non_queue_sync_object; \
-  uptr serial_sync = (uptr)sync_ptr;                         \
-  uptr concurrent_sync = ((uptr)sync_ptr) + sizeof(uptr);    \
+#define GET_QUEUE_SYNC_VARS(context, q)                                  \
+  bool is_queue_serial = q && IsQueueSerial(q);                          \
+  uptr sync_ptr = (uptr)q ?: context->non_queue_sync_object;             \
+  uptr serial_sync = (uptr)sync_ptr;                                     \
+  uptr concurrent_sync = sync_ptr ? ((uptr)sync_ptr) + sizeof(uptr) : 0; \
   bool serial_task = context->is_barrier_block || is_queue_serial
 
 static void dispatch_sync_pre_execute(ThreadState *thr, uptr pc,
@@ -111,8 +113,8 @@ static void dispatch_sync_pre_execute(ThreadState *thr, uptr pc,
   dispatch_queue_t q = context->queue;
   do {
     GET_QUEUE_SYNC_VARS(context, q);
-    Acquire(thr, pc, serial_sync);
-    if (serial_task) Acquire(thr, pc, concurrent_sync);
+    if (serial_sync) Acquire(thr, pc, serial_sync);
+    if (serial_task && concurrent_sync) Acquire(thr, pc, concurrent_sync);
 
     if (q) q = GetTargetQueueFromQueue(q);
   } while (q);
@@ -126,7 +128,8 @@ static void dispatch_sync_post_execute(ThreadState *thr, uptr pc,
   dispatch_queue_t q = context->queue;
   do {
     GET_QUEUE_SYNC_VARS(context, q);
-    Release(thr, pc, serial_task ? serial_sync : concurrent_sync);
+    if (serial_task && serial_sync) Release(thr, pc, serial_sync);
+    if (!serial_task && concurrent_sync) Release(thr, pc, concurrent_sync);
 
     if (q) q = GetTargetQueueFromQueue(q);
   } while (q);
@@ -174,7 +177,8 @@ static void invoke_and_release_block(void *param) {
 }
 
 #define DISPATCH_INTERCEPT_SYNC_B(name, barrier)                             \
-  TSAN_INTERCEPTOR(void, name, dispatch_queue_t q, dispatch_block_t block) { \
+  TSAN_INTERCEPTOR(void, name, dispatch_queue_t q,                           \
+                   DISPATCH_NOESCAPE dispatch_block_t block) {               \
    SCOPED_TSAN_INTERCEPTOR(name, q, block);                                  \
    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                            \
    dispatch_block_t heap_block = Block_copy(block);                          \
@@ -264,7 +268,7 @@ TSAN_INTERCEPTOR(void, dispatch_after_f, dispatch_time_t when,
 // need to undefine the macro.
 #undef dispatch_once
 TSAN_INTERCEPTOR(void, dispatch_once, dispatch_once_t *predicate,
-                 dispatch_block_t block) {
+                 DISPATCH_NOESCAPE dispatch_block_t block) {
   SCOPED_INTERCEPTOR_RAW(dispatch_once, predicate, block);
   atomic_uint32_t *a = reinterpret_cast<atomic_uint32_t *>(predicate);
   u32 v = atomic_load(a, memory_order_acquire);
@@ -474,7 +478,8 @@ TSAN_INTERCEPTOR(void, dispatch_source_set_registration_handler_f,
 }
 
 TSAN_INTERCEPTOR(void, dispatch_apply, size_t iterations,
-                 dispatch_queue_t queue, void (^block)(size_t)) {
+                 dispatch_queue_t queue,
+                 DISPATCH_NOESCAPE void (^block)(size_t)) {
   SCOPED_TSAN_INTERCEPTOR(dispatch_apply, iterations, queue, block);
 
   void *parent_to_child_sync = nullptr;
diff --git a/lib/tsan/rtl/tsan_malloc_mac.cc b/lib/tsan/rtl/tsan_malloc_mac.cc
index 8d31ccbca..455c95df6 100644
--- a/lib/tsan/rtl/tsan_malloc_mac.cc
+++ b/lib/tsan/rtl/tsan_malloc_mac.cc
@@ -26,7 +26,7 @@ using namespace __tsan;
 #define COMMON_MALLOC_FORCE_UNLOCK()
 #define COMMON_MALLOC_MEMALIGN(alignment, size) \
   void *p =                                     \
-      user_alloc(cur_thread(), StackTrace::GetCurrentPc(), size, alignment)
+      user_memalign(cur_thread(), StackTrace::GetCurrentPc(), alignment, size)
 #define COMMON_MALLOC_MALLOC(size)                             \
   if (cur_thread()->in_symbolizer) return InternalAlloc(size); \
   SCOPED_INTERCEPTOR_RAW(malloc, size);                        \
@@ -43,7 +43,7 @@ using namespace __tsan;
   if (cur_thread()->in_symbolizer)                            \
     return InternalAlloc(size, nullptr, GetPageSizeCached()); \
   SCOPED_INTERCEPTOR_RAW(valloc, size);                       \
-  void *p = user_alloc(thr, pc, size, GetPageSizeCached())
+  void *p = user_valloc(thr, pc, size)
 #define COMMON_MALLOC_FREE(ptr)                              \
   if (cur_thread()->in_symbolizer) return InternalFree(ptr); \
   SCOPED_INTERCEPTOR_RAW(free, ptr);                         \
diff --git a/lib/tsan/rtl/tsan_mman.cc b/lib/tsan/rtl/tsan_mman.cc
index 2dea24915..19680238b 100644
--- a/lib/tsan/rtl/tsan_mman.cc
+++ b/lib/tsan/rtl/tsan_mman.cc
@@ -10,8 +10,10 @@
 // This file is a part of ThreadSanitizer (TSan), a race detector.
 //
 //===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_allocator_checks.h"
 #include "sanitizer_common/sanitizer_allocator_interface.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_errno.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "tsan_mman.h"
 #include "tsan_rtl.h"
@@ -112,9 +114,8 @@ ScopedGlobalProcessor::~ScopedGlobalProcessor() {
 }
 
 void InitializeAllocator() {
-  allocator()->Init(
-      common_flags()->allocator_may_return_null,
-      common_flags()->allocator_release_to_os_interval_ms);
+  SetAllocatorMayReturnNull(common_flags()->allocator_may_return_null);
+  allocator()->Init(common_flags()->allocator_release_to_os_interval_ms);
 }
 
 void InitializeAllocatorLate() {
@@ -149,11 +150,12 @@ static void SignalUnsafeCall(ThreadState *thr, uptr pc) {
   OutputReport(thr, rep);
 }
 
-void *user_alloc(ThreadState *thr, uptr pc, uptr sz, uptr align, bool signal) {
+void *user_alloc_internal(ThreadState *thr, uptr pc, uptr sz, uptr align,
+                          bool signal) {
   if ((sz >= (1ull << 40)) || (align >= (1ull << 40)))
-    return allocator()->ReturnNullOrDieOnBadRequest();
+    return Allocator::FailureHandler::OnBadRequest();
   void *p = allocator()->Allocate(&thr->proc()->alloc_cache, sz, align);
-  if (p == 0)
+  if (UNLIKELY(p == 0))
     return 0;
   if (ctx && ctx->initialized)
     OnUserAlloc(thr, pc, (uptr)p, sz, true);
@@ -162,15 +164,6 @@ void *user_alloc(ThreadState *thr, uptr pc, uptr sz, uptr align, bool signal) {
   return p;
 }
 
-void *user_calloc(ThreadState *thr, uptr pc, uptr size, uptr n) {
-  if (CallocShouldReturnNullDueToOverflow(size, n))
-    return allocator()->ReturnNullOrDieOnBadRequest();
-  void *p = user_alloc(thr, pc, n * size);
-  if (p)
-    internal_memset(p, 0, n * size);
-  return p;
-}
-
 void user_free(ThreadState *thr, uptr pc, void *p, bool signal) {
   ScopedGlobalProcessor sgp;
   if (ctx && ctx->initialized)
@@ -180,6 +173,19 @@ void user_free(ThreadState *thr, uptr pc, void *p, bool signal) {
     SignalUnsafeCall(thr, pc);
 }
 
+void *user_alloc(ThreadState *thr, uptr pc, uptr sz) {
+  return SetErrnoOnNull(user_alloc_internal(thr, pc, sz, kDefaultAlignment));
+}
+
+void *user_calloc(ThreadState *thr, uptr pc, uptr size, uptr n) {
+  if (UNLIKELY(CheckForCallocOverflow(size, n)))
+    return SetErrnoOnNull(Allocator::FailureHandler::OnBadRequest());
+  void *p = user_alloc_internal(thr, pc, n * size);
+  if (p)
+    internal_memset(p, 0, n * size);
+  return SetErrnoOnNull(p);
+}
+
 void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
   DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p);
   ctx->metamap.AllocBlock(thr, pc, p, sz);
@@ -200,15 +206,64 @@ void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) {
 void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz) {
   // FIXME: Handle "shrinking" more efficiently,
   // it seems that some software actually does this.
-  void *p2 = user_alloc(thr, pc, sz);
-  if (p2 == 0)
-    return 0;
-  if (p) {
-    uptr oldsz = user_alloc_usable_size(p);
-    internal_memcpy(p2, p, min(oldsz, sz));
+  if (!p)
+    return SetErrnoOnNull(user_alloc_internal(thr, pc, sz));
+  if (!sz) {
     user_free(thr, pc, p);
+    return nullptr;
+  }
+  void *new_p = user_alloc_internal(thr, pc, sz);
+  if (new_p) {
+    uptr old_sz = user_alloc_usable_size(p);
+    internal_memcpy(new_p, p, min(old_sz, sz));
+    user_free(thr, pc, p);
+  }
+  return SetErrnoOnNull(new_p);
+}
+
+void *user_memalign(ThreadState *thr, uptr pc, uptr align, uptr sz) {
+  if (UNLIKELY(!IsPowerOfTwo(align))) {
+    errno = errno_EINVAL;
+    return Allocator::FailureHandler::OnBadRequest();
+  }
+  return SetErrnoOnNull(user_alloc_internal(thr, pc, sz, align));
+}
+
+int user_posix_memalign(ThreadState *thr, uptr pc, void **memptr, uptr align,
+                        uptr sz) {
+  if (UNLIKELY(!CheckPosixMemalignAlignment(align))) {
+    Allocator::FailureHandler::OnBadRequest();
+    return errno_EINVAL;
+  }
+  void *ptr = user_alloc_internal(thr, pc, sz, align);
+  if (UNLIKELY(!ptr))
+    return errno_ENOMEM;
+  CHECK(IsAligned((uptr)ptr, align));
+  *memptr = ptr;
+  return 0;
+}
+
+void *user_aligned_alloc(ThreadState *thr, uptr pc, uptr align, uptr sz) {
+  if (UNLIKELY(!CheckAlignedAllocAlignmentAndSize(align, sz))) {
+    errno = errno_EINVAL;
+    return Allocator::FailureHandler::OnBadRequest();
+  }
+  return SetErrnoOnNull(user_alloc_internal(thr, pc, sz, align));
+}
+
+void *user_valloc(ThreadState *thr, uptr pc, uptr sz) {
+  return SetErrnoOnNull(user_alloc_internal(thr, pc, sz, GetPageSizeCached()));
+}
+
+void *user_pvalloc(ThreadState *thr, uptr pc, uptr sz) {
+  uptr PageSize = GetPageSizeCached();
+  if (UNLIKELY(CheckForPvallocOverflow(sz, PageSize))) {
+    errno = errno_ENOMEM;
+    return Allocator::FailureHandler::OnBadRequest();
   }
-  return p2;
+  // pvalloc(0) should allocate one page.
+  sz = sz ? RoundUpTo(sz, PageSize) : PageSize;
+  return SetErrnoOnNull(user_alloc_internal(thr, pc, sz, PageSize));
 }
 
 uptr user_alloc_usable_size(const void *p) {
@@ -295,6 +350,8 @@ uptr __sanitizer_get_allocated_size(const void *p) {
 
 void __tsan_on_thread_idle() {
   ThreadState *thr = cur_thread();
+  thr->clock.ResetCached(&thr->proc()->clock_cache);
+  thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache);
   allocator()->SwallowCache(&thr->proc()->alloc_cache);
   internal_allocator()->SwallowCache(&thr->proc()->internal_alloc_cache);
   ctx->metamap.OnProcIdle(thr->proc());
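The user_* helpers above pin down the libc contracts the interceptors now rely on: realloc(p, 0) frees p and returns null, pvalloc(0) still allocates one page, malloc-family failures set errno to ENOMEM, and posix_memalign reports failure through its return value only. A small illustration of the last two contracts (standard POSIX behavior, not code from the patch):

    #include <cerrno>
    #include <cstdlib>

    void errno_contract_demo() {
      void *p = nullptr;
      // posix_memalign reports failure via the return value, not errno.
      int rc = posix_memalign(&p, /*alignment=*/3, 64);  // not a power of two
      // rc == EINVAL here, and p is unchanged.

      // malloc-family failures instead return null and set errno
      // (under TSan this path requires allocator_may_return_null=1,
      // otherwise the failure handler aborts the process).
      void *q = malloc(static_cast<size_t>(-1));
      // q == nullptr && errno == ENOMEM
      free(q);
      (void)rc;
    }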
diff --git a/lib/tsan/rtl/tsan_mman.h b/lib/tsan/rtl/tsan_mman.h
index 8cdeeb35a..6042c5c5d 100644
--- a/lib/tsan/rtl/tsan_mman.h
+++ b/lib/tsan/rtl/tsan_mman.h
@@ -27,13 +27,20 @@ void AllocatorProcFinish(Processor *proc);
 void AllocatorPrintStats();
 
 // For user allocations.
-void *user_alloc(ThreadState *thr, uptr pc, uptr sz,
-                 uptr align = kDefaultAlignment, bool signal = true);
-void *user_calloc(ThreadState *thr, uptr pc, uptr sz, uptr n);
+void *user_alloc_internal(ThreadState *thr, uptr pc, uptr sz,
+                          uptr align = kDefaultAlignment, bool signal = true);
 // Does not accept NULL.
 void user_free(ThreadState *thr, uptr pc, void *p, bool signal = true);
+// Interceptor implementations.
+void *user_alloc(ThreadState *thr, uptr pc, uptr sz);
+void *user_calloc(ThreadState *thr, uptr pc, uptr sz, uptr n);
 void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz);
-void *user_alloc_aligned(ThreadState *thr, uptr pc, uptr sz, uptr align);
+void *user_memalign(ThreadState *thr, uptr pc, uptr align, uptr sz);
+int user_posix_memalign(ThreadState *thr, uptr pc, void **memptr, uptr align,
+                        uptr sz);
+void *user_aligned_alloc(ThreadState *thr, uptr pc, uptr align, uptr sz);
+void *user_valloc(ThreadState *thr, uptr pc, uptr sz);
+void *user_pvalloc(ThreadState *thr, uptr pc, uptr sz);
 uptr user_alloc_usable_size(const void *p);
 
 // Invoking malloc/free hooks that may be installed by the user.
diff --git a/lib/tsan/rtl/tsan_new_delete.cc b/lib/tsan/rtl/tsan_new_delete.cc
index b6478bb08..4d03145c1 100644
--- a/lib/tsan/rtl/tsan_new_delete.cc
+++ b/lib/tsan/rtl/tsan_new_delete.cc
@@ -12,6 +12,7 @@
 // Interceptors for operators new and delete.
 //===----------------------------------------------------------------------===//
 #include "interception/interception.h"
+#include "sanitizer_common/sanitizer_allocator.h"
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "tsan_interceptors.h"
 
@@ -24,13 +25,15 @@ struct nothrow_t {};
 DECLARE_REAL(void *, malloc, uptr size)
 DECLARE_REAL(void, free, void *ptr)
 
-#define OPERATOR_NEW_BODY(mangled_name) \
+// TODO(alekseys): throw std::bad_alloc instead of dying on OOM.
+#define OPERATOR_NEW_BODY(mangled_name, nothrow) \
   if (cur_thread()->in_symbolizer) \
     return InternalAlloc(size); \
   void *p = 0; \
   {  \
     SCOPED_INTERCEPTOR_RAW(mangled_name, size); \
     p = user_alloc(thr, pc, size); \
+    if (!nothrow && UNLIKELY(!p)) DieOnFailure::OnOOM(); \
   }  \
   invoke_malloc_hook(p, size); \
   return p;
@@ -38,25 +41,25 @@ DECLARE_REAL(void, free, void *ptr)
 SANITIZER_INTERFACE_ATTRIBUTE
 void *operator new(__sanitizer::uptr size);
 void *operator new(__sanitizer::uptr size) {
-  OPERATOR_NEW_BODY(_Znwm);
+  OPERATOR_NEW_BODY(_Znwm, false /*nothrow*/);
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
 void *operator new[](__sanitizer::uptr size);
 void *operator new[](__sanitizer::uptr size) {
-  OPERATOR_NEW_BODY(_Znam);
+  OPERATOR_NEW_BODY(_Znam, false /*nothrow*/);
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
 void *operator new(__sanitizer::uptr size, std::nothrow_t const&);
 void *operator new(__sanitizer::uptr size, std::nothrow_t const&) {
-  OPERATOR_NEW_BODY(_ZnwmRKSt9nothrow_t);
+  OPERATOR_NEW_BODY(_ZnwmRKSt9nothrow_t, true /*nothrow*/);
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
 void *operator new[](__sanitizer::uptr size, std::nothrow_t const&);
 void *operator new[](__sanitizer::uptr size, std::nothrow_t const&) {
-  OPERATOR_NEW_BODY(_ZnamRKSt9nothrow_t);
+  OPERATOR_NEW_BODY(_ZnamRKSt9nothrow_t, true /*nothrow*/);
 }
 
 #define OPERATOR_DELETE_BODY(mangled_name) \
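The new nothrow parameter above encodes the C++ contract that a throwing operator new must never return null; since this runtime cannot yet throw bad_alloc (see the TODO), it dies on OOM instead, while the nothrow overloads are still allowed to return null. In user-code terms:

    #include <new>

    void oom_contract_demo() {
      void *a = ::operator new(64);                // never null; on OOM the
                                                   // instrumented runtime dies
      void *b = ::operator new(64, std::nothrow);  // may legitimately be null
      ::operator delete(a);
      if (b) ::operator delete(b);
    }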
diff --git a/lib/tsan/rtl/tsan_platform.h b/lib/tsan/rtl/tsan_platform.h
index 1dd9d91d4..4b9771359 100644
--- a/lib/tsan/rtl/tsan_platform.h
+++ b/lib/tsan/rtl/tsan_platform.h
@@ -42,6 +42,19 @@ C/C++ on linux/x86_64 and freebsd/x86_64
 7b00 0000 0000 - 7c00 0000 0000: heap
 7c00 0000 0000 - 7e80 0000 0000: -
 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack
+
+C/C++ on netbsd/amd64 can reuse the same mapping:
+ * The address space starts from 0x1000 (option with 0x0) and ends with
+   0x7f7ffffff000.
+ * LoAppMem-kHeapMemEnd can be reused as it is.
+ * No VDSO support.
+ * No MidAppMem region.
+ * No additional HeapMem region.
+ * HiAppMem contains the stack, loader, shared libraries and heap.
+ * Stack on NetBSD/amd64 has prereserved 128MB.
+ * Heap grows downwards (top-down).
+ * ASLR must be disabled per-process or globally.
+
 */
 struct Mapping {
   static const uptr kMetaShadowBeg = 0x300000000000ull;
@@ -100,6 +113,37 @@ struct Mapping {
 };
 
 #define TSAN_MID_APP_RANGE 1
+#elif defined(__aarch64__) && defined(__APPLE__)
+/*
+C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM)
+0000 0000 00 - 0100 0000 00: -                                    (4 GB)
+0100 0000 00 - 0200 0000 00: main binary, modules, thread stacks  (4 GB)
+0200 0000 00 - 0300 0000 00: heap                                 (4 GB)
+0300 0000 00 - 0400 0000 00: -                                    (4 GB)
+0400 0000 00 - 0c00 0000 00: shadow memory                       (32 GB)
+0c00 0000 00 - 0d00 0000 00: -                                    (4 GB)
+0d00 0000 00 - 0e00 0000 00: metainfo                             (4 GB)
+0e00 0000 00 - 0f00 0000 00: -                                    (4 GB)
+0f00 0000 00 - 1000 0000 00: traces                               (4 GB)
+*/
+struct Mapping {
+  static const uptr kLoAppMemBeg   = 0x0100000000ull;
+  static const uptr kLoAppMemEnd   = 0x0200000000ull;
+  static const uptr kHeapMemBeg    = 0x0200000000ull;
+  static const uptr kHeapMemEnd    = 0x0300000000ull;
+  static const uptr kShadowBeg     = 0x0400000000ull;
+  static const uptr kShadowEnd     = 0x0c00000000ull;
+  static const uptr kMetaShadowBeg = 0x0d00000000ull;
+  static const uptr kMetaShadowEnd = 0x0e00000000ull;
+  static const uptr kTraceMemBeg   = 0x0f00000000ull;
+  static const uptr kTraceMemEnd   = 0x1000000000ull;
+  static const uptr kHiAppMemBeg   = 0x1000000000ull;
+  static const uptr kHiAppMemEnd   = 0x1000000000ull;
+  static const uptr kAppMemMsk     = 0x0ull;
+  static const uptr kAppMemXor     = 0x0ull;
+  static const uptr kVdsoBeg       = 0x7000000000000000ull;
+};
+
 #elif defined(__aarch64__)
 // AArch64 supports multiple VMA which leads to multiple address transformation
 // functions.  To support these multiple VMAS transformations and mappings TSAN
@@ -389,7 +433,7 @@ uptr MappingImpl(void) {
 
 template<int Type>
 uptr MappingArchImpl(void) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
     case 39: return MappingImpl<Mapping39, Type>();
     case 42: return MappingImpl<Mapping42, Type>();
@@ -542,7 +586,7 @@ bool IsAppMemImpl(uptr mem) {
 
 ALWAYS_INLINE
 bool IsAppMem(uptr mem) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return IsAppMemImpl<Mapping39>(mem);
    case 42: return IsAppMemImpl<Mapping42>(mem);
@@ -569,7 +613,7 @@ bool IsShadowMemImpl(uptr mem) {
 
 ALWAYS_INLINE
 bool IsShadowMem(uptr mem) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return IsShadowMemImpl<Mapping39>(mem);
    case 42: return IsShadowMemImpl<Mapping42>(mem);
@@ -596,7 +640,7 @@ bool IsMetaMemImpl(uptr mem) {
 
 ALWAYS_INLINE
 bool IsMetaMem(uptr mem) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return IsMetaMemImpl<Mapping39>(mem);
    case 42: return IsMetaMemImpl<Mapping42>(mem);
@@ -633,7 +677,7 @@ uptr MemToShadowImpl(uptr x) {
 
 ALWAYS_INLINE
 uptr MemToShadow(uptr x) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return MemToShadowImpl<Mapping39>(x);
    case 42: return MemToShadowImpl<Mapping42>(x);
@@ -672,7 +716,7 @@ u32 *MemToMetaImpl(uptr x) {
 
 ALWAYS_INLINE
 u32 *MemToMeta(uptr x) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return MemToMetaImpl<Mapping39>(x);
    case 42: return MemToMetaImpl<Mapping42>(x);
@@ -724,7 +768,7 @@ uptr ShadowToMemImpl(uptr s) {
 
 ALWAYS_INLINE
 uptr ShadowToMem(uptr s) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return ShadowToMemImpl<Mapping39>(s);
    case 42: return ShadowToMemImpl<Mapping42>(s);
@@ -759,7 +803,7 @@ uptr GetThreadTraceImpl(int tid) {
 
 ALWAYS_INLINE
 uptr GetThreadTrace(int tid) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return GetThreadTraceImpl<Mapping39>(tid);
    case 42: return GetThreadTraceImpl<Mapping42>(tid);
@@ -789,7 +833,7 @@ uptr GetThreadTraceHeaderImpl(int tid) {
 
 ALWAYS_INLINE
 uptr GetThreadTraceHeader(int tid) {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__APPLE__)
  switch (vmaSize) {
    case 39: return GetThreadTraceHeaderImpl<Mapping39>(tid);
    case 42: return GetThreadTraceHeaderImpl<Mapping42>(tid);
@@ -816,6 +860,7 @@ void FlushShadowMemory();
 void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive);
 int ExtractResolvFDs(void *state, int *fds, int nfd);
 int ExtractRecvmsgFDs(void *msg, int *fds, int nfd);
+void ImitateTlsWrite(ThreadState *thr, uptr tls_addr, uptr tls_size);
 
 int call_pthread_cancel_with_cleanup(int(*fn)(void *c, void *m,
     void *abstime), void *c, void *m, void *abstime,
diff --git a/lib/tsan/rtl/tsan_platform_linux.cc b/lib/tsan/rtl/tsan_platform_linux.cc
index 3313288a7..216eef93c 100644
--- a/lib/tsan/rtl/tsan_platform_linux.cc
+++ b/lib/tsan/rtl/tsan_platform_linux.cc
@@ -14,11 +14,12 @@
 
 #include "sanitizer_common/sanitizer_platform.h"
-#if SANITIZER_LINUX || SANITIZER_FREEBSD
+#if SANITIZER_LINUX || SANITIZER_FREEBSD || SANITIZER_NETBSD
 
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_linux.h"
+#include "sanitizer_common/sanitizer_platform_limits_netbsd.h"
 #include "sanitizer_common/sanitizer_platform_limits_posix.h"
 #include "sanitizer_common/sanitizer_posix.h"
 #include "sanitizer_common/sanitizer_procmaps.h"
@@ -47,7 +48,6 @@
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <unistd.h>
-#include <errno.h>
 #include <sched.h>
 #include <dlfcn.h>
 #if SANITIZER_LINUX
@@ -182,17 +182,15 @@ static void MapRodata() {
   }
   // Map the file into shadow of .rodata sections.
   MemoryMappingLayout proc_maps(/*cache_enabled*/true);
-  uptr start, end, offset, prot;
   // Reusing the buffer 'name'.
-  while (proc_maps.Next(&start, &end, &offset, name, ARRAY_SIZE(name), &prot)) {
-    if (name[0] != 0 && name[0] != '['
-        && (prot & MemoryMappingLayout::kProtectionRead)
-        && (prot & MemoryMappingLayout::kProtectionExecute)
-        && !(prot & MemoryMappingLayout::kProtectionWrite)
-        && IsAppMem(start)) {
+  MemoryMappedSegment segment(name, ARRAY_SIZE(name));
+  while (proc_maps.Next(&segment)) {
+    if (segment.filename[0] != 0 && segment.filename[0] != '[' &&
+        segment.IsReadable() && segment.IsExecutable() &&
+        !segment.IsWritable() && IsAppMem(segment.start)) {
       // Assume it's .rodata
-      char *shadow_start = (char*)MemToShadow(start);
-      char *shadow_end = (char*)MemToShadow(end);
+      char *shadow_start = (char *)MemToShadow(segment.start);
+      char *shadow_end = (char *)MemToShadow(segment.end);
       for (char *p = shadow_start; p < shadow_end; p += marker.size()) {
         internal_mmap(p, Min<uptr>(marker.size(), shadow_end - p),
                       PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0);
@@ -289,7 +287,7 @@ void InitializePlatform() {
 int ExtractResolvFDs(void *state, int *fds, int nfd) {
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
   int cnt = 0;
-  __res_state *statp = (__res_state*)state;
+  struct __res_state *statp = (struct __res_state*)state;
   for (int i = 0; i < MAXNS && cnt < nfd; i++) {
     if (statp->_u._ext.nsaddrs[i] && statp->_u._ext.nssocks[i] != -1)
       fds[cnt++] = statp->_u._ext.nssocks[i];
@@ -320,6 +318,20 @@ int ExtractRecvmsgFDs(void *msgp, int *fds, int nfd) {
   return res;
 }
 
+void ImitateTlsWrite(ThreadState *thr, uptr tls_addr, uptr tls_size) {
+  // Check that the thr object is in tls;
+  const uptr thr_beg = (uptr)thr;
+  const uptr thr_end = (uptr)thr + sizeof(*thr);
+  CHECK_GE(thr_beg, tls_addr);
+  CHECK_LE(thr_beg, tls_addr + tls_size);
+  CHECK_GE(thr_end, tls_addr);
+  CHECK_LE(thr_end, tls_addr + tls_size);
+  // Since the thr object is huge, skip it.
+  MemoryRangeImitateWrite(thr, /*pc=*/2, tls_addr, thr_beg - tls_addr);
+  MemoryRangeImitateWrite(thr, /*pc=*/2, thr_end,
+                          tls_addr + tls_size - thr_end);
+}
+
 // Note: this function runs with async signals enabled,
 // so it must not touch any tsan state.
 int call_pthread_cancel_with_cleanup(int(*fn)(void *c, void *m,
@@ -341,36 +353,22 @@ void ReplaceSystemMalloc() { }
 
 #if !SANITIZER_GO
 #if SANITIZER_ANDROID
-
-#if defined(__aarch64__)
-# define __get_tls() \
-    ({ void** __val; __asm__("mrs %0, tpidr_el0" : "=r"(__val)); __val; })
-#elif defined(__x86_64__)
-# define __get_tls() \
-    ({ void** __val; __asm__("mov %%fs:0, %0" : "=r"(__val)); __val; })
-#else
-#error unsupported architecture
-#endif
-
-// On Android, __thread is not supported. So we store the pointer to ThreadState
-// in TLS_SLOT_TSAN, which is the tls slot allocated by Android bionic for tsan.
-static const int TLS_SLOT_TSAN = 8;
 // On Android, one thread can call intercepted functions after
 // DestroyThreadState(), so add a fake thread state for "dead" threads.
 static ThreadState *dead_thread_state = nullptr;
 
 ThreadState *cur_thread() {
-  ThreadState* thr = (ThreadState*)__get_tls()[TLS_SLOT_TSAN];
+  ThreadState* thr = reinterpret_cast<ThreadState*>(*get_android_tls_ptr());
   if (thr == nullptr) {
     __sanitizer_sigset_t emptyset;
     internal_sigfillset(&emptyset);
     __sanitizer_sigset_t oldset;
     CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &emptyset, &oldset));
-    thr = reinterpret_cast<ThreadState*>(__get_tls()[TLS_SLOT_TSAN]);
+    thr = reinterpret_cast<ThreadState*>(*get_android_tls_ptr());
     if (thr == nullptr) {
       thr = reinterpret_cast<ThreadState*>(MmapOrDie(sizeof(ThreadState),
                                                      "ThreadState"));
-      __get_tls()[TLS_SLOT_TSAN] = thr;
+      *get_android_tls_ptr() = reinterpret_cast<uptr>(thr);
       if (dead_thread_state == nullptr) {
         dead_thread_state = reinterpret_cast<ThreadState*>(
             MmapOrDie(sizeof(ThreadState), "ThreadState"));
@@ -392,9 +390,9 @@ void cur_thread_finalize() {
   internal_sigfillset(&emptyset);
   __sanitizer_sigset_t oldset;
   CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &emptyset, &oldset));
-  ThreadState* thr = (ThreadState*)__get_tls()[TLS_SLOT_TSAN];
+  ThreadState* thr = reinterpret_cast<ThreadState*>(*get_android_tls_ptr());
   if (thr != dead_thread_state) {
-    __get_tls()[TLS_SLOT_TSAN] = dead_thread_state;
+    *get_android_tls_ptr() = reinterpret_cast<uptr>(dead_thread_state);
     UnmapOrDie(thr, sizeof(ThreadState));
   }
   CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &oldset, nullptr));
@@ -404,4 +402,4 @@ void cur_thread_finalize() {
 
 }  // namespace __tsan
 
-#endif  // SANITIZER_LINUX || SANITIZER_FREEBSD
+#endif  // SANITIZER_LINUX || SANITIZER_FREEBSD || SANITIZER_NETBSD
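Both ImitateTlsWrite implementations (the Linux one above and the Mac one in the next file) carve the runtime's own thread-state storage out of the imitated TLS range, so the runtime does not imitate writes over its own state. The split is plain interval arithmetic; a standalone sketch with hypothetical values:

    #include <cassert>

    struct Range { unsigned long beg, end; };

    // Split [tls_beg, tls_end) around [skip_beg, skip_end), which is
    // assumed (and CHECKed in the code above) to lie inside the TLS block.
    inline void SplitAroundObject(unsigned long tls_beg, unsigned long tls_end,
                                  unsigned long skip_beg, unsigned long skip_end,
                                  Range out[2]) {
      assert(tls_beg <= skip_beg && skip_end <= tls_end);
      out[0] = {tls_beg, skip_beg};  // imitated: prefix before the object
      out[1] = {skip_end, tls_end};  // imitated: suffix after the object
    }
    // E.g. tls=[0x7000,0x8000) and thr=[0x7400,0x7600) yields the two
    // imitated ranges [0x7000,0x7400) and [0x7600,0x8000).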
diff --git a/lib/tsan/rtl/tsan_platform_mac.cc b/lib/tsan/rtl/tsan_platform_mac.cc
index b8d3d5528..4570286b1 100644
--- a/lib/tsan/rtl/tsan_platform_mac.cc
+++ b/lib/tsan/rtl/tsan_platform_mac.cc
@@ -75,12 +75,18 @@ static void *SignalSafeGetOrAllocate(uptr *dst, uptr size) {
 static uptr main_thread_identity = 0;
 ALIGNED(64) static char main_thread_state[sizeof(ThreadState)];
 
+ThreadState **cur_thread_location() {
+  ThreadState **thread_identity = (ThreadState **)pthread_self();
+  return ((uptr)thread_identity == main_thread_identity) ? nullptr
+                                                         : thread_identity;
+}
+
 ThreadState *cur_thread() {
-  uptr thread_identity = (uptr)pthread_self();
-  if (thread_identity == main_thread_identity || main_thread_identity == 0) {
+  ThreadState **thr_state_loc = cur_thread_location();
+  if (thr_state_loc == nullptr || main_thread_identity == 0) {
     return (ThreadState *)&main_thread_state;
   }
-  ThreadState **fake_tls = (ThreadState **)MemToShadow(thread_identity);
+  ThreadState **fake_tls = (ThreadState **)MemToShadow((uptr)thr_state_loc);
   ThreadState *thr = (ThreadState *)SignalSafeGetOrAllocate(
       (uptr *)fake_tls, sizeof(ThreadState));
   return thr;
@@ -90,13 +96,13 @@ ThreadState *cur_thread() {
 // munmap first and then clear `fake_tls`; if we receive a signal in between,
 // handler will try to access the unmapped ThreadState.
 void cur_thread_finalize() {
-  uptr thread_identity = (uptr)pthread_self();
-  if (thread_identity == main_thread_identity) {
+  ThreadState **thr_state_loc = cur_thread_location();
+  if (thr_state_loc == nullptr) {
     // Calling dispatch_main() or xpc_main() actually invokes pthread_exit to
     // exit the main thread. Let's keep the main thread's ThreadState.
     return;
   }
-  ThreadState **fake_tls = (ThreadState **)MemToShadow(thread_identity);
+  ThreadState **fake_tls = (ThreadState **)MemToShadow((uptr)thr_state_loc);
   internal_munmap(*fake_tls, sizeof(ThreadState));
   *fake_tls = nullptr;
 }
@@ -161,8 +167,8 @@ void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) {
 #else  // !SANITIZER_GO
       "app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
 #endif
-      "stacks: %ld unique IDs, %ld kB allocated\n"
-      "threads: %ld total, %ld live\n"
+      "stacks: %zd unique IDs, %zd kB allocated\n"
+      "threads: %zd total, %zd live\n"
       "------------------------------\n",
       ShadowBeg(), ShadowEnd(), shadow_res / 1024, shadow_dirty / 1024,
       MetaShadowBeg(), MetaShadowEnd(), meta_res / 1024, meta_dirty / 1024,
@@ -224,6 +230,14 @@ static void my_pthread_introspection_hook(unsigned int event, pthread_t thread,
 #endif
 
 void InitializePlatformEarly() {
+#if defined(__aarch64__)
+  uptr max_vm = GetMaxVirtualAddress() + 1;
+  if (max_vm != Mapping::kHiAppMemEnd) {
+    Printf("ThreadSanitizer: unsupported vm address limit %p, expected %p.\n",
+           max_vm, Mapping::kHiAppMemEnd);
+    Die();
+  }
+#endif
 }
 
 void InitializePlatform() {
@@ -240,6 +254,29 @@ void InitializePlatform() {
 }
 
 #if !SANITIZER_GO
+void ImitateTlsWrite(ThreadState *thr, uptr tls_addr, uptr tls_size) {
+  // The pointer to the ThreadState object is stored in the shadow memory
+  // of the tls.
+  uptr tls_end = tls_addr + tls_size;
+  ThreadState **thr_state_loc = cur_thread_location();
+  if (thr_state_loc == nullptr) {
+    MemoryRangeImitateWrite(thr, /*pc=*/2, tls_addr, tls_size);
+  } else {
+    uptr thr_state_start = (uptr)thr_state_loc;
+    uptr thr_state_end = thr_state_start + sizeof(uptr);
+    CHECK_GE(thr_state_start, tls_addr);
+    CHECK_LE(thr_state_start, tls_addr + tls_size);
+    CHECK_GE(thr_state_end, tls_addr);
+    CHECK_LE(thr_state_end, tls_addr + tls_size);
+    MemoryRangeImitateWrite(thr, /*pc=*/2, tls_addr,
+                            thr_state_start - tls_addr);
+    MemoryRangeImitateWrite(thr, /*pc=*/2, thr_state_end,
+                            tls_end - thr_state_end);
+  }
+}
+#endif
+
+#if !SANITIZER_GO
 // Note: this function runs with async signals enabled,
 // so it must not touch any tsan state.
 int call_pthread_cancel_with_cleanup(int(*fn)(void *c, void *m,
diff --git a/lib/tsan/rtl/tsan_platform_posix.cc b/lib/tsan/rtl/tsan_platform_posix.cc
index 0732c83d6..e4f90a811 100644
--- a/lib/tsan/rtl/tsan_platform_posix.cc
+++ b/lib/tsan/rtl/tsan_platform_posix.cc
@@ -46,6 +46,9 @@ void InitializeShadowMemory() {
 #elif defined(__mips64)
   const uptr kMadviseRangeBeg  = 0xff00000000ull;
   const uptr kMadviseRangeSize = 0x0100000000ull;
+#elif defined(__aarch64__) && defined(__APPLE__)
+  uptr kMadviseRangeBeg = LoAppMemBeg();
+  uptr kMadviseRangeSize = LoAppMemEnd() - LoAppMemBeg();
 #elif defined(__aarch64__)
   uptr kMadviseRangeBeg = 0;
   uptr kMadviseRangeSize = 0;
@@ -115,21 +118,24 @@ static void ProtectRange(uptr beg, uptr end) {
 void CheckAndProtect() {
   // Ensure that the binary is indeed compiled with -pie.
   MemoryMappingLayout proc_maps(true);
-  uptr p, end, prot;
-  while (proc_maps.Next(&p, &end, 0, 0, 0, &prot)) {
-    if (IsAppMem(p))
+  MemoryMappedSegment segment;
+  while (proc_maps.Next(&segment)) {
+    if (IsAppMem(segment.start)) continue;
+    if (segment.start >= HeapMemEnd() && segment.start < HeapEnd()) continue;
+    if (segment.protection == 0)  // Zero page or mprotected.
       continue;
-    if (p >= HeapMemEnd() &&
-        p < HeapEnd())
-      continue;
-    if (prot == 0)  // Zero page or mprotected.
-      continue;
-    if (p >= VdsoBeg())  // vdso
+    if (segment.start >= VdsoBeg())  // vdso
       break;
-    Printf("FATAL: ThreadSanitizer: unexpected memory mapping %p-%p\n", p, end);
+    Printf("FATAL: ThreadSanitizer: unexpected memory mapping %p-%p\n",
+           segment.start, segment.end);
     Die();
   }
 
+#if defined(__aarch64__) && defined(__APPLE__)
+  ProtectRange(HeapMemEnd(), ShadowBeg());
+  ProtectRange(ShadowEnd(), MetaShadowBeg());
+  ProtectRange(MetaShadowEnd(), TraceMemBeg());
+#else
   ProtectRange(LoAppMemEnd(), ShadowBeg());
   ProtectRange(ShadowEnd(), MetaShadowBeg());
 #ifdef TSAN_MID_APP_RANGE
@@ -143,6 +149,7 @@ void CheckAndProtect() {
   ProtectRange(TraceMemBeg(), TraceMemEnd());
   ProtectRange(TraceMemEnd(), HeapMemBeg());
   ProtectRange(HeapEnd(), HiAppMemBeg());
+#endif
 }
 #endif
diff --git a/lib/tsan/rtl/tsan_report.cc b/lib/tsan/rtl/tsan_report.cc
index 7de00840c..be5d6b7ea 100644
--- a/lib/tsan/rtl/tsan_report.cc
+++ b/lib/tsan/rtl/tsan_report.cc
@@ -13,6 +13,7 @@
 #include "tsan_report.h"
 #include "tsan_platform.h"
 #include "tsan_rtl.h"
+#include "sanitizer_common/sanitizer_file.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_report_decorator.h"
 #include "sanitizer_common/sanitizer_stacktrace_printer.h"
@@ -38,22 +39,16 @@ ReportLocation *ReportLocation::New(ReportLocationType type) {
 class Decorator: public __sanitizer::SanitizerCommonDecorator {
  public:
   Decorator() : SanitizerCommonDecorator() { }
-  const char *Warning()    { return Red(); }
-  const char *EndWarning() { return Default(); }
   const char *Access()     { return Blue(); }
-  const char *EndAccess()  { return Default(); }
   const char *ThreadDescription()    { return Cyan(); }
-  const char *EndThreadDescription() { return Default(); }
   const char *Location()   { return Green(); }
-  const char *EndLocation() { return Default(); }
   const char *Sleep()   { return Yellow(); }
-  const char *EndSleep() { return Default(); }
   const char *Mutex()   { return Magenta(); }
-  const char *EndMutex() { return Default(); }
 };
 
 ReportDesc::ReportDesc()
-    : stacks(MBlockReportStack)
+    : tag(kExternalTagNone)
+    , stacks(MBlockReportStack)
     , mops(MBlockReportMop)
     , locs(MBlockReportLoc)
     , mutexes(MBlockReportMutex)
@@ -81,7 +76,7 @@ const char *thread_name(char *buf, int tid) {
   return buf;
 }
 
-static const char *ReportTypeString(ReportType typ) {
+static const char *ReportTypeString(ReportType typ, uptr tag) {
   if (typ == ReportTypeRace)
     return "data race";
   if (typ == ReportTypeVptrRace)
@@ -90,8 +85,10 @@ static const char *ReportTypeString(ReportType typ) {
     return "heap-use-after-free";
   if (typ == ReportTypeVptrUseAfterFree)
     return "heap-use-after-free (virtual call vs free)";
-  if (typ == ReportTypeExternalRace)
-    return "race on a library object";
+  if (typ == ReportTypeExternalRace) {
+    const char *str = GetReportHeaderFromTag(tag);
+    return str ? str : "race on external object";
+  }
  if (typ == ReportTypeThreadLeak)
    return "thread leak";
  if (typ == ReportTypeMutexDestroyLocked)
@@ -155,27 +152,29 @@ static const char *MopDesc(bool first, bool write, bool atomic) {
 }
 
 static const char *ExternalMopDesc(bool first, bool write) {
-  return first ? (write ? "Mutating" : "Read-only")
-               : (write ? "Previous mutating" : "Previous read-only");
+  return first ? (write ? "Modifying" : "Read-only")
+               : (write ? "Previous modifying" : "Previous read-only");
 }
 
 static void PrintMop(const ReportMop *mop, bool first) {
   Decorator d;
   char thrbuf[kThreadBufSize];
   Printf("%s", d.Access());
-  const char *object_type = GetObjectTypeFromTag(mop->external_tag);
-  if (!object_type) {
+  if (mop->external_tag == kExternalTagNone) {
     Printf("  %s of size %d at %p by %s",
            MopDesc(first, mop->write, mop->atomic),
            mop->size, (void *)mop->addr, thread_name(thrbuf, mop->tid));
   } else {
-    Printf("  %s access of object %s at %p by %s",
+    const char *object_type = GetObjectTypeFromTag(mop->external_tag);
+    if (object_type == nullptr)
+      object_type = "external object";
+    Printf("  %s access of %s at %p by %s",
           ExternalMopDesc(first, mop->write), object_type,
           (void *)mop->addr, thread_name(thrbuf, mop->tid));
   }
   PrintMutexSet(mop->mset);
   Printf(":\n");
-  Printf("%s", d.EndAccess());
+  Printf("%s", d.Default());
   PrintStack(mop->stack);
 }
@@ -202,7 +201,7 @@ static void PrintLocation(const ReportLocation *loc) {
         loc->heap_chunk_size, loc->heap_chunk_start,
         thread_name(thrbuf, loc->tid));
   } else {
-    Printf("  Location is %s object of size %zu at %p allocated by %s:\n",
+    Printf("  Location is %s of size %zu at %p allocated by %s:\n",
         object_type, loc->heap_chunk_size, loc->heap_chunk_start,
         thread_name(thrbuf, loc->tid));
   }
@@ -216,20 +215,20 @@ static void PrintLocation(const ReportLocation *loc) {
         loc->fd, thread_name(thrbuf, loc->tid));
     print_stack = true;
   }
-  Printf("%s", d.EndLocation());
+  Printf("%s", d.Default());
   if (print_stack)
     PrintStack(loc->stack);
 }
 
 static void PrintMutexShort(const ReportMutex *rm, const char *after) {
   Decorator d;
-  Printf("%sM%zd%s%s", d.Mutex(), rm->id, d.EndMutex(), after);
+  Printf("%sM%zd%s%s", d.Mutex(), rm->id, d.Default(), after);
 }
 
 static void PrintMutexShortWithAddress(const ReportMutex *rm,
                                        const char *after) {
   Decorator d;
-  Printf("%sM%zd (%p)%s%s", d.Mutex(), rm->id, rm->addr, d.EndMutex(), after);
+  Printf("%sM%zd (%p)%s%s", d.Mutex(), rm->id, rm->addr, d.Default(), after);
 }
 
 static void PrintMutex(const ReportMutex *rm) {
@@ -237,11 +236,11 @@ static void PrintMutex(const ReportMutex *rm) {
   if (rm->destroyed) {
     Printf("%s", d.Mutex());
     Printf("  Mutex M%llu is already destroyed.\n\n", rm->id);
-    Printf("%s", d.EndMutex());
+    Printf("%s", d.Default());
   } else {
     Printf("%s", d.Mutex());
     Printf("  Mutex M%llu (%p) created at:\n", rm->id, rm->addr);
-    Printf("%s", d.EndMutex());
+    Printf("%s", d.Default());
     PrintStack(rm->stack);
   }
 }
@@ -259,7 +258,7 @@ static void PrintThread(const ReportThread *rt) {
   if (rt->workerthread) {
     Printf(" (tid=%zu, %s) is a GCD worker thread\n", rt->os_id, thread_status);
     Printf("\n");
-    Printf("%s", d.EndThreadDescription());
+    Printf("%s", d.Default());
     return;
   }
   Printf(" (tid=%zu, %s) created by %s", rt->os_id, thread_status,
@@ -267,7 +266,7 @@ static void PrintThread(const ReportThread *rt) {
   if (rt->stack)
     Printf(" at:");
   Printf("\n");
-  Printf("%s", d.EndThreadDescription());
+  Printf("%s", d.Default());
   PrintStack(rt->stack);
 }
 
@@ -275,7 +274,7 @@ static void PrintSleep(const ReportStack *s) {
   Decorator d;
   Printf("%s", d.Sleep());
   Printf("  As if synchronized via sleep:\n");
-  Printf("%s", d.EndSleep());
+  Printf("%s", d.Default());
   PrintStack(s);
 }
 
@@ -315,11 +314,11 @@ static SymbolizedStack *SkipTsanInternalFrames(SymbolizedStack *frames) {
 void PrintReport(const ReportDesc *rep) {
   Decorator d;
   Printf("==================\n");
-  const char *rep_typ_str = ReportTypeString(rep->typ);
+  const char *rep_typ_str = ReportTypeString(rep->typ, rep->tag);
   Printf("%s", d.Warning());
   Printf("WARNING: ThreadSanitizer: %s (pid=%d)\n", rep_typ_str,
          (int)internal_getpid());
-  Printf("%s", d.EndWarning());
+  Printf("%s", d.Default());
 
   if (rep->typ == ReportTypeDeadlock) {
     char thrbuf[kThreadBufSize];
@@ -337,7 +336,7 @@ void PrintReport(const ReportDesc *rep) {
       PrintMutexShort(rep->mutexes[i], " in ");
       Printf("%s", d.ThreadDescription());
       Printf("%s:\n", thread_name(thrbuf, rep->unique_tids[i]));
-      Printf("%s", d.EndThreadDescription());
+      Printf("%s", d.Default());
       if (flags()->second_deadlock_stack) {
         PrintStack(rep->stacks[2*i]);
         Printf("  Mutex ");
diff --git a/lib/tsan/rtl/tsan_report.h b/lib/tsan/rtl/tsan_report.h
index 8d8ae0fd8..bc1582f90 100644
--- a/lib/tsan/rtl/tsan_report.h
+++ b/lib/tsan/rtl/tsan_report.h
@@ -90,7 +90,7 @@ struct ReportLocation {
 
 struct ReportThread {
   int id;
-  uptr os_id;
+  tid_t os_id;
   bool running;
   bool workerthread;
   char *name;
@@ -108,6 +108,7 @@ struct ReportMutex {
 class ReportDesc {
  public:
   ReportType typ;
+  uptr tag;
   Vector<ReportStack*> stacks;
   Vector<ReportMop*> mops;
   Vector<ReportLocation*> locs;
diff --git a/lib/tsan/rtl/tsan_rtl.cc b/lib/tsan/rtl/tsan_rtl.cc
index bc5991c6e..882e06765 100644
--- a/lib/tsan/rtl/tsan_rtl.cc
+++ b/lib/tsan/rtl/tsan_rtl.cc
@@ -14,6 +14,7 @@
 
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_file.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_stackdepot.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
@@ -104,7 +105,8 @@ Context::Context()
   , racy_stacks(MBlockRacyStacks)
   , racy_addresses(MBlockRacyAddresses)
   , fired_suppressions_mtx(MutexTypeFired, StatMtxFired)
-  , fired_suppressions(8) {
+  , fired_suppressions(8)
+  , clock_alloc("clock allocator") {
 }
 
 // The objects are allocated in TLS, so one may rely on zero-initialization.
@@ -866,9 +868,8 @@ static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size,
   // Don't want to touch lots of shadow memory.
   // If a program maps 10MB stack, there is no need reset the whole range.
   size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1);
-  // UnmapOrDie/MmapFixedNoReserve does not work on Windows,
-  // so we do it only for C/C++.
-  if (SANITIZER_GO || size < common_flags()->clear_shadow_mmap_threshold) {
+  // UnmapOrDie/MmapFixedNoReserve does not work on Windows.
+ if (SANITIZER_WINDOWS || size < common_flags()->clear_shadow_mmap_threshold) { u64 *p = (u64*)MemToShadow(addr); CHECK(IsShadowMem((uptr)p)); CHECK(IsShadowMem((uptr)(p + size * kShadowCnt / kShadowCell - 1))); @@ -980,21 +981,21 @@ void FuncExit(ThreadState *thr) { thr->shadow_stack_pos--; } -void ThreadIgnoreBegin(ThreadState *thr, uptr pc) { +void ThreadIgnoreBegin(ThreadState *thr, uptr pc, bool save_stack) { DPrintf("#%d: ThreadIgnoreBegin\n", thr->tid); thr->ignore_reads_and_writes++; CHECK_GT(thr->ignore_reads_and_writes, 0); thr->fast_state.SetIgnoreBit(); #if !SANITIZER_GO - if (!ctx->after_multithreaded_fork) + if (save_stack && !ctx->after_multithreaded_fork) thr->mop_ignore_set.Add(CurrentStackId(thr, pc)); #endif } void ThreadIgnoreEnd(ThreadState *thr, uptr pc) { DPrintf("#%d: ThreadIgnoreEnd\n", thr->tid); + CHECK_GT(thr->ignore_reads_and_writes, 0); thr->ignore_reads_and_writes--; - CHECK_GE(thr->ignore_reads_and_writes, 0); if (thr->ignore_reads_and_writes == 0) { thr->fast_state.ClearIgnoreBit(); #if !SANITIZER_GO @@ -1011,20 +1012,20 @@ uptr __tsan_testonly_shadow_stack_current_size() { } #endif -void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc) { +void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc, bool save_stack) { DPrintf("#%d: ThreadIgnoreSyncBegin\n", thr->tid); thr->ignore_sync++; CHECK_GT(thr->ignore_sync, 0); #if !SANITIZER_GO - if (!ctx->after_multithreaded_fork) + if (save_stack && !ctx->after_multithreaded_fork) thr->sync_ignore_set.Add(CurrentStackId(thr, pc)); #endif } void ThreadIgnoreSyncEnd(ThreadState *thr, uptr pc) { DPrintf("#%d: ThreadIgnoreSyncEnd\n", thr->tid); + CHECK_GT(thr->ignore_sync, 0); thr->ignore_sync--; - CHECK_GE(thr->ignore_sync, 0); #if !SANITIZER_GO if (thr->ignore_sync == 0) thr->sync_ignore_set.Reset(); diff --git a/lib/tsan/rtl/tsan_rtl.h b/lib/tsan/rtl/tsan_rtl.h index 88539414c..99c4d2529 100644 --- a/lib/tsan/rtl/tsan_rtl.h +++ b/lib/tsan/rtl/tsan_rtl.h @@ -55,16 +55,22 @@ namespace __tsan { #if !SANITIZER_GO struct MapUnmapCallback; #if defined(__mips64) || defined(__aarch64__) || defined(__powerpc__) -static const uptr kAllocatorSpace = 0; -static const uptr kAllocatorSize = SANITIZER_MMAP_RANGE_SIZE; static const uptr kAllocatorRegionSizeLog = 20; static const uptr kAllocatorNumRegions = - kAllocatorSize >> kAllocatorRegionSizeLog; + SANITIZER_MMAP_RANGE_SIZE >> kAllocatorRegionSizeLog; typedef TwoLevelByteMap<(kAllocatorNumRegions >> 12), 1 << 12, MapUnmapCallback> ByteMap; -typedef SizeClassAllocator32<kAllocatorSpace, kAllocatorSize, 0, - CompactSizeClassMap, kAllocatorRegionSizeLog, ByteMap, - MapUnmapCallback> PrimaryAllocator; +struct AP32 { + static const uptr kSpaceBeg = 0; + static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE; + static const uptr kMetadataSize = 0; + typedef __sanitizer::CompactSizeClassMap SizeClassMap; + static const uptr kRegionSizeLog = kAllocatorRegionSizeLog; + typedef __tsan::ByteMap ByteMap; + typedef __tsan::MapUnmapCallback MapUnmapCallback; + static const uptr kFlags = 0; +}; +typedef SizeClassAllocator32<AP32> PrimaryAllocator; #else struct AP64 { // Allocator64 parameters. Deliberately using a short name. static const uptr kSpaceBeg = Mapping::kHeapMemBeg; @@ -381,6 +387,7 @@ struct ThreadState { // for better performance. int ignore_reads_and_writes; int ignore_sync; + int suppress_reports; // Go does not support ignores. 
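The tsan_rtl.h hunk above converts the 32-bit primary allocator to the same configuration style as the AP64 struct in the unchanged code below it: SizeClassAllocator32 is instantiated from a single params struct (AP32) rather than a long positional template-argument list, so a new knob can be added without editing every instantiation site. A self-contained sketch of the pattern, with toy names and illustrative values only:

#include <cstdint>

// The allocator template reads its configuration from one traits struct.
template <class Params>
struct ToySizeClassAllocator {
  static const uint64_t kSpaceSize = Params::kSpaceSize;
  static const uintptr_t kRegionSizeLog = Params::kRegionSizeLog;
  static const uintptr_t kNumRegions =
      (uintptr_t)(kSpaceSize >> kRegionSizeLog);
  // ... allocation logic elided ...
};

struct ToyAP32 {  // analogous to AP32 above; values are illustrative
  static const uint64_t kSpaceSize = 1ULL << 32;
  static const uintptr_t kRegionSizeLog = 20;
};

typedef ToySizeClassAllocator<ToyAP32> ToyPrimaryAllocator;

The AP64 comment retained in this file ("Allocator64 parameters. Deliberately using a short name.") documents the same convention.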
#if !SANITIZER_GO IgnoreSet mop_ignore_set; @@ -410,7 +417,6 @@ struct ThreadState { bool is_dead; bool is_freeing; bool is_vptr_access; - uptr external_tag; const uptr stk_addr; const uptr stk_size; const uptr tls_addr; @@ -546,6 +552,10 @@ struct Context { extern Context *ctx; // The one and the only global runtime context. +ALWAYS_INLINE Flags *flags() { + return &ctx->flags; +} + struct ScopedIgnoreInterceptors { ScopedIgnoreInterceptors() { #if !SANITIZER_GO @@ -560,9 +570,13 @@ struct ScopedIgnoreInterceptors { } }; +const char *GetObjectTypeFromTag(uptr tag); +const char *GetReportHeaderFromTag(uptr tag); +uptr TagFromShadowStackFrame(uptr pc); + class ScopedReport { public: - explicit ScopedReport(ReportType typ); + explicit ScopedReport(ReportType typ, uptr tag = kExternalTagNone); ~ScopedReport(); void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, StackTrace stack, @@ -593,10 +607,26 @@ class ScopedReport { ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack); void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, - MutexSet *mset); + MutexSet *mset, uptr *tag = nullptr); + +// The stack could look like: +// <start> | <main> | <foo> | tag | <bar> +// This will extract the tag and keep: +// <start> | <main> | <foo> | <bar> +template<typename StackTraceTy> +void ExtractTagFromStack(StackTraceTy *stack, uptr *tag = nullptr) { + if (stack->size < 2) return; + uptr possible_tag_pc = stack->trace[stack->size - 2]; + uptr possible_tag = TagFromShadowStackFrame(possible_tag_pc); + if (possible_tag == kExternalTagNone) return; + stack->trace_buffer[stack->size - 2] = stack->trace_buffer[stack->size - 1]; + stack->size -= 1; + if (tag) *tag = possible_tag; +} template<typename StackTraceTy> -void ObtainCurrentStack(ThreadState *thr, uptr toppc, StackTraceTy *stack) { +void ObtainCurrentStack(ThreadState *thr, uptr toppc, StackTraceTy *stack, + uptr *tag = nullptr) { uptr size = thr->shadow_stack_pos - thr->shadow_stack; uptr start = 0; if (size + !!toppc > kStackTraceMax) { @@ -604,6 +634,7 @@ void ObtainCurrentStack(ThreadState *thr, uptr toppc, StackTraceTy *stack) { size = kStackTraceMax - !!toppc; } stack->Init(&thr->shadow_stack[start], size, toppc); + ExtractTagFromStack(stack, tag); } @@ -641,8 +672,6 @@ bool IsFiredSuppression(Context *ctx, ReportType type, StackTrace trace); bool IsExpectedReport(uptr addr, uptr size); void PrintMatchedBenignRaces(); -const char *GetObjectTypeFromTag(uptr tag); - #if defined(TSAN_DEBUG_OUTPUT) && TSAN_DEBUG_OUTPUT >= 1 # define DPrintf Printf #else @@ -707,16 +736,16 @@ void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size); void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size); void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size); -void ThreadIgnoreBegin(ThreadState *thr, uptr pc); +void ThreadIgnoreBegin(ThreadState *thr, uptr pc, bool save_stack = true); void ThreadIgnoreEnd(ThreadState *thr, uptr pc); -void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc); +void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc, bool save_stack = true); void ThreadIgnoreSyncEnd(ThreadState *thr, uptr pc); void FuncEntry(ThreadState *thr, uptr pc); void FuncExit(ThreadState *thr); int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached); -void ThreadStart(ThreadState *thr, int tid, uptr os_id, bool workerthread); +void ThreadStart(ThreadState *thr, int tid, tid_t os_id, bool workerthread); void ThreadFinish(ThreadState *thr); int ThreadTid(ThreadState *thr, uptr 
pc, uptr uid); void ThreadJoin(ThreadState *thr, uptr pc, int tid); @@ -731,13 +760,16 @@ void ProcDestroy(Processor *proc); void ProcWire(Processor *proc, ThreadState *thr); void ProcUnwire(Processor *proc, ThreadState *thr); -void MutexCreate(ThreadState *thr, uptr pc, uptr addr, - bool rw, bool recursive, bool linker_init); -void MutexDestroy(ThreadState *thr, uptr pc, uptr addr); -void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec = 1, - bool try_lock = false); -int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all = false); -void MutexReadLock(ThreadState *thr, uptr pc, uptr addr, bool try_lock = false); +// Note: the parameter is called flagz, because flags is already taken +// by the global function that returns flags. +void MutexCreate(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0); +void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0); +void MutexPreLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0); +void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0, + int rec = 1); +int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0); +void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0); +void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz = 0); void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr); void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr); void MutexRepair(ThreadState *thr, uptr pc, uptr addr); // call on EOWNERDEAD @@ -793,7 +825,7 @@ void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs, return; DCHECK_GE((int)typ, 0); DCHECK_LE((int)typ, 7); - DCHECK_EQ(GetLsb(addr, 61), addr); + DCHECK_EQ(GetLsb(addr, kEventPCBits), addr); StatInc(thr, StatEvents); u64 pos = fs.GetTracePos(); if (UNLIKELY((pos % kTracePartSize) == 0)) { @@ -805,7 +837,7 @@ void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs, } Event *trace = (Event*)GetThreadTrace(fs.tid()); Event *evp = &trace[pos]; - Event ev = (u64)addr | ((u64)typ << 61); + Event ev = (u64)addr | ((u64)typ << kEventPCBits); *evp = ev; } diff --git a/lib/tsan/rtl/tsan_rtl_aarch64.S b/lib/tsan/rtl/tsan_rtl_aarch64.S index ef06f0444..61171d635 100644 --- a/lib/tsan/rtl/tsan_rtl_aarch64.S +++ b/lib/tsan/rtl/tsan_rtl_aarch64.S @@ -1,13 +1,46 @@ +// The content of this file is AArch64-only: +#if defined(__aarch64__) + #include "sanitizer_common/sanitizer_asm.h" +#if !defined(__APPLE__) .section .bss .type __tsan_pointer_chk_guard, %object -.size __tsan_pointer_chk_guard, 8 +ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(__tsan_pointer_chk_guard)) __tsan_pointer_chk_guard: .zero 8 +#endif + +#if defined(__APPLE__) +.align 2 + +.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers +.long _setjmp$non_lazy_ptr +_setjmp$non_lazy_ptr: +.indirect_symbol _setjmp +.long 0 + +.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers +.long __setjmp$non_lazy_ptr +__setjmp$non_lazy_ptr: +.indirect_symbol __setjmp +.long 0 + +.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers +.long _sigsetjmp$non_lazy_ptr +_sigsetjmp$non_lazy_ptr: +.indirect_symbol _sigsetjmp +.long 0 +#endif +#if !defined(__APPLE__) .section .text +#else +.section __TEXT,__text +.align 3 +#endif +#if !defined(__APPLE__) // GLIBC mangles the function pointers in jmp_buf (used in {set,long}*jmp // functions) by XORing them with a random guard pointer. For AArch64 it is a // global variable rather than a TCB one (as for x86_64/powerpc) and althought @@ -16,9 +49,9 @@ __tsan_pointer_chk_guard: // not stable). 
So InitializeGuardPtr obtains the pointer guard value by // issuing a setjmp and checking the resulting pointers values against the // original ones. -.hidden _Z18InitializeGuardPtrv +ASM_HIDDEN(_Z18InitializeGuardPtrv) .global _Z18InitializeGuardPtrv -.type _Z18InitializeGuardPtrv, @function +ASM_TYPE_FUNCTION(ASM_TSAN_SYMBOL_INTERCEPTOR(_Z18InitializeGuardPtrv)) _Z18InitializeGuardPtrv: CFI_STARTPROC // Allocates a jmp_buf for the setjmp call. @@ -55,12 +88,14 @@ _Z18InitializeGuardPtrv: CFI_DEF_CFA (31, 0) ret CFI_ENDPROC -.size _Z18InitializeGuardPtrv, .-_Z18InitializeGuardPtrv +ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(_Z18InitializeGuardPtrv)) +#endif -.hidden __tsan_setjmp +ASM_HIDDEN(__tsan_setjmp) .comm _ZN14__interception11real_setjmpE,8,8 -.type setjmp, @function -setjmp: +.globl ASM_TSAN_SYMBOL_INTERCEPTOR(setjmp) +ASM_TYPE_FUNCTION(ASM_TSAN_SYMBOL_INTERCEPTOR(setjmp)) +ASM_TSAN_SYMBOL_INTERCEPTOR(setjmp): CFI_STARTPROC // save env parameters for function call @@ -78,14 +113,19 @@ setjmp: CFI_OFFSET (19, -16) mov x19, x0 +#if !defined(__APPLE__) // SP pointer mangling (see glibc setjmp) adrp x2, __tsan_pointer_chk_guard ldr x2, [x2, #:lo12:__tsan_pointer_chk_guard] add x0, x29, 32 eor x1, x2, x0 +#else + add x0, x29, 32 + mov x1, x0 +#endif // call tsan interceptor - bl __tsan_setjmp + bl ASM_TSAN_SYMBOL(__tsan_setjmp) // restore env parameter mov x0, x19 @@ -96,18 +136,24 @@ setjmp: CFI_DEF_CFA (31, 0) // tail jump to libc setjmp +#if !defined(__APPLE__) adrp x1, :got:_ZN14__interception11real_setjmpE ldr x1, [x1, #:got_lo12:_ZN14__interception11real_setjmpE] ldr x1, [x1] +#else + adrp x1, _setjmp$non_lazy_ptr@page + add x1, x1, _setjmp$non_lazy_ptr@pageoff + ldr x1, [x1] +#endif br x1 CFI_ENDPROC -.size setjmp, .-setjmp +ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(setjmp)) .comm _ZN14__interception12real__setjmpE,8,8 -.globl _setjmp -.type _setjmp, @function -_setjmp: +.globl ASM_TSAN_SYMBOL_INTERCEPTOR(_setjmp) +ASM_TYPE_FUNCTION(ASM_TSAN_SYMBOL_INTERCEPTOR(_setjmp)) +ASM_TSAN_SYMBOL_INTERCEPTOR(_setjmp): CFI_STARTPROC // save env parameters for function call @@ -125,14 +171,19 @@ _setjmp: CFI_OFFSET (19, -16) mov x19, x0 +#if !defined(__APPLE__) // SP pointer mangling (see glibc setjmp) adrp x2, __tsan_pointer_chk_guard ldr x2, [x2, #:lo12:__tsan_pointer_chk_guard] add x0, x29, 32 eor x1, x2, x0 +#else + add x0, x29, 32 + mov x1, x0 +#endif // call tsan interceptor - bl __tsan_setjmp + bl ASM_TSAN_SYMBOL(__tsan_setjmp) // Restore jmp_buf parameter mov x0, x19 @@ -143,18 +194,24 @@ _setjmp: CFI_DEF_CFA (31, 0) // tail jump to libc setjmp +#if !defined(__APPLE__) adrp x1, :got:_ZN14__interception12real__setjmpE ldr x1, [x1, #:got_lo12:_ZN14__interception12real__setjmpE] ldr x1, [x1] +#else + adrp x1, __setjmp$non_lazy_ptr@page + add x1, x1, __setjmp$non_lazy_ptr@pageoff + ldr x1, [x1] +#endif br x1 CFI_ENDPROC -.size _setjmp, .-_setjmp +ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(_setjmp)) .comm _ZN14__interception14real_sigsetjmpE,8,8 -.globl sigsetjmp -.type sigsetjmp, @function -sigsetjmp: +.globl ASM_TSAN_SYMBOL_INTERCEPTOR(sigsetjmp) +ASM_TYPE_FUNCTION(ASM_TSAN_SYMBOL_INTERCEPTOR(sigsetjmp)) +ASM_TSAN_SYMBOL_INTERCEPTOR(sigsetjmp): CFI_STARTPROC // save env parameters for function call @@ -174,14 +231,19 @@ sigsetjmp: mov w20, w1 mov x19, x0 +#if !defined(__APPLE__) // SP pointer mangling (see glibc setjmp) adrp x2, __tsan_pointer_chk_guard ldr x2, [x2, #:lo12:__tsan_pointer_chk_guard] add x0, x29, 32 eor x1, x2, x0 +#else + add x0, x29, 32 + mov x1, x0 +#endif // call tsan interceptor - 
bl __tsan_setjmp + bl ASM_TSAN_SYMBOL(__tsan_setjmp) // restore env parameter mov w1, w20 @@ -195,17 +257,24 @@ sigsetjmp: CFI_DEF_CFA (31, 0) // tail jump to libc sigsetjmp +#if !defined(__APPLE__) adrp x2, :got:_ZN14__interception14real_sigsetjmpE ldr x2, [x2, #:got_lo12:_ZN14__interception14real_sigsetjmpE] ldr x2, [x2] +#else + adrp x2, _sigsetjmp$non_lazy_ptr@page + add x2, x2, _sigsetjmp$non_lazy_ptr@pageoff + ldr x2, [x2] +#endif br x2 CFI_ENDPROC -.size sigsetjmp, .-sigsetjmp +ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(sigsetjmp)) +#if !defined(__APPLE__) .comm _ZN14__interception16real___sigsetjmpE,8,8 -.globl __sigsetjmp -.type __sigsetjmp, @function -__sigsetjmp: +.globl ASM_TSAN_SYMBOL_INTERCEPTOR(__sigsetjmp) +ASM_TYPE_FUNCTION(ASM_TSAN_SYMBOL_INTERCEPTOR(__sigsetjmp)) +ASM_TSAN_SYMBOL_INTERCEPTOR(__sigsetjmp): CFI_STARTPROC // save env parameters for function call @@ -225,14 +294,16 @@ __sigsetjmp: mov w20, w1 mov x19, x0 +#if !defined(__APPLE__) // SP pointer mangling (see glibc setjmp) adrp x2, __tsan_pointer_chk_guard ldr x2, [x2, #:lo12:__tsan_pointer_chk_guard] add x0, x29, 32 eor x1, x2, x0 +#endif // call tsan interceptor - bl __tsan_setjmp + bl ASM_TSAN_SYMBOL(__tsan_setjmp) mov w1, w20 mov x0, x19 @@ -245,14 +316,22 @@ __sigsetjmp: CFI_DEF_CFA (31, 0) // tail jump to libc __sigsetjmp +#if !defined(__APPLE__) adrp x2, :got:_ZN14__interception16real___sigsetjmpE ldr x2, [x2, #:got_lo12:_ZN14__interception16real___sigsetjmpE] ldr x2, [x2] +#else + adrp x2, ASM_TSAN_SYMBOL(__sigsetjmp)@page + add x2, x2, ASM_TSAN_SYMBOL(__sigsetjmp)@pageoff +#endif br x2 CFI_ENDPROC -.size __sigsetjmp, .-__sigsetjmp +ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(__sigsetjmp)) +#endif #if defined(__linux__) /* We do not need executable stack. */ .section .note.GNU-stack,"",@progbits #endif + +#endif diff --git a/lib/tsan/rtl/tsan_rtl_amd64.S b/lib/tsan/rtl/tsan_rtl_amd64.S index caa832375..98947fd2a 100644 --- a/lib/tsan/rtl/tsan_rtl_amd64.S +++ b/lib/tsan/rtl/tsan_rtl_amd64.S @@ -1,4 +1,8 @@ +// The content of this file is x86_64-only: +#if defined(__x86_64__) + #include "sanitizer_common/sanitizer_asm.h" + #if !defined(__APPLE__) .section .text #else @@ -357,3 +361,5 @@ ASM_SIZE(ASM_TSAN_SYMBOL_INTERCEPTOR(__sigsetjmp)) /* We do not need executable stack. 
*/ .section .note.GNU-stack,"",@progbits #endif + +#endif diff --git a/lib/tsan/rtl/tsan_rtl_mutex.cc b/lib/tsan/rtl/tsan_rtl_mutex.cc index f3b51c30f..152b965ad 100644 --- a/lib/tsan/rtl/tsan_rtl_mutex.cc +++ b/lib/tsan/rtl/tsan_rtl_mutex.cc @@ -62,32 +62,31 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ, OutputReport(thr, rep); } -void MutexCreate(ThreadState *thr, uptr pc, uptr addr, - bool rw, bool recursive, bool linker_init) { - DPrintf("#%d: MutexCreate %zx\n", thr->tid, addr); +void MutexCreate(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { + DPrintf("#%d: MutexCreate %zx flagz=0x%x\n", thr->tid, addr, flagz); StatInc(thr, StatMutexCreate); - if (!linker_init && IsAppMem(addr)) { + if (!(flagz & MutexFlagLinkerInit) && IsAppMem(addr)) { CHECK(!thr->is_freeing); thr->is_freeing = true; MemoryWrite(thr, pc, addr, kSizeLog1); thr->is_freeing = false; } SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); - s->is_rw = rw; - s->is_recursive = recursive; - s->is_linker_init = linker_init; + s->SetFlags(flagz & MutexCreationFlagMask); if (!SANITIZER_GO && s->creation_stack_id == 0) s->creation_stack_id = CurrentStackId(thr, pc); s->mtx.Unlock(); } -void MutexDestroy(ThreadState *thr, uptr pc, uptr addr) { +void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexDestroy %zx\n", thr->tid, addr); StatInc(thr, StatMutexDestroy); SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true); if (s == 0) return; - if (s->is_linker_init) { + if ((flagz & MutexFlagLinkerInit) + || s->IsFlagSet(MutexFlagLinkerInit) + || ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) { // Destroy is no-op for linker-initialized mutexes. s->mtx.Unlock(); return; @@ -100,8 +99,8 @@ void MutexDestroy(ThreadState *thr, uptr pc, uptr addr) { bool unlock_locked = false; if (flags()->report_destroy_locked && s->owner_tid != SyncVar::kInvalidTid - && !s->is_broken) { - s->is_broken = true; + && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); unlock_locked = true; } u64 mid = s->GetId(); @@ -141,12 +140,33 @@ void MutexDestroy(ThreadState *thr, uptr pc, uptr addr) { // s will be destroyed and freed in MetaMap::FreeBlock. 
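MutexCreate and MutexDestroy above now take a u32 flag word mirroring the public __tsan_mutex_* constants (the full set appears in the tsan_sync.h hunk further down), instead of separate rw/recursive/linker_init bools. A sketch of how a hand-rolled lock might drive these entry points through the annotation functions, assuming the declarations live in <sanitizer/tsan_interface.h>; the spinlock itself is hypothetical:

#include <atomic>
#include <sanitizer/tsan_interface.h>  // assumed location of the annotations

class SpinLock {
 public:
  SpinLock() { __tsan_mutex_create(this, __tsan_mutex_not_static); }
  ~SpinLock() { __tsan_mutex_destroy(this, __tsan_mutex_not_static); }
  void Lock() {
    __tsan_mutex_pre_lock(this, 0);      // routed to MutexPreLock
    while (locked_.exchange(true, std::memory_order_acquire)) {
    }
    __tsan_mutex_post_lock(this, 0, 0);  // routed to MutexPostLock
  }
  void Unlock() {
    __tsan_mutex_pre_unlock(this, 0);    // routed to MutexUnlock
    locked_.store(false, std::memory_order_release);
    __tsan_mutex_post_unlock(this, 0);
  }
 private:
  std::atomic<bool> locked_{false};
};

Built with -fsanitize=thread, the annotations make TSan model Lock/Unlock as acquire/release on this address and let it diagnose misuse (for example a double lock) with the same machinery it uses for intercepted pthread mutexes.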
} -void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) { - DPrintf("#%d: MutexLock %zx rec=%d\n", thr->tid, addr, rec); - CHECK_GT(rec, 0); +void MutexPreLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { + DPrintf("#%d: MutexPreLock %zx flagz=0x%x\n", thr->tid, addr, flagz); + if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) { + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); + s->UpdateFlags(flagz); + if (s->owner_tid != thr->tid) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeLock(&cb, &s->dd, true); + s->mtx.ReadUnlock(); + ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); + } else { + s->mtx.ReadUnlock(); + } + } +} + +void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) { + DPrintf("#%d: MutexPostLock %zx flag=0x%x rec=%d\n", + thr->tid, addr, flagz, rec); + if (flagz & MutexFlagRecursiveLock) + CHECK_GT(rec, 0); + else + rec = 1; if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + s->UpdateFlags(flagz); thr->fast_state.IncrementEpoch(); TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId()); bool report_double_lock = false; @@ -156,38 +176,43 @@ void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) { s->last_lock = thr->fast_state.raw(); } else if (s->owner_tid == thr->tid) { CHECK_GT(s->recursion, 0); - } else if (flags()->report_mutex_bugs && !s->is_broken) { - s->is_broken = true; + } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); report_double_lock = true; } - if (s->recursion == 0) { + const bool first = s->recursion == 0; + s->recursion += rec; + if (first) { StatInc(thr, StatMutexLock); AcquireImpl(thr, pc, &s->clock); AcquireImpl(thr, pc, &s->read_clock); - } else if (!s->is_recursive) { + } else if (!s->IsFlagSet(MutexFlagWriteReentrant)) { StatInc(thr, StatMutexRecLock); } - s->recursion += rec; thr->mset.Add(s->GetId(), true, thr->fast_state.epoch()); - if (common_flags()->detect_deadlocks && (s->recursion - rec) == 0) { + bool pre_lock = false; + if (first && common_flags()->detect_deadlocks) { + pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) && + !(flagz & MutexFlagTryLock); Callback cb(thr, pc); - if (!try_lock) + if (pre_lock) ctx->dd->MutexBeforeLock(&cb, &s->dd, true); - ctx->dd->MutexAfterLock(&cb, &s->dd, true, try_lock); + ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock); } u64 mid = s->GetId(); s->mtx.Unlock(); // Can't touch s after this point. 
+ s = 0; if (report_double_lock) ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, mid); - if (common_flags()->detect_deadlocks) { + if (first && pre_lock && common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } } -int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all) { - DPrintf("#%d: MutexUnlock %zx all=%d\n", thr->tid, addr, all); +int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { + DPrintf("#%d: MutexUnlock %zx flagz=0x%x\n", thr->tid, addr, flagz); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); @@ -196,12 +221,12 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all) { int rec = 0; bool report_bad_unlock = false; if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) { - if (flags()->report_mutex_bugs && !s->is_broken) { - s->is_broken = true; + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); report_bad_unlock = true; } } else { - rec = all ? s->recursion : 1; + rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1; s->recursion -= rec; if (s->recursion == 0) { StatInc(thr, StatMutexUnlock); @@ -229,36 +254,53 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all) { return rec; } -void MutexReadLock(ThreadState *thr, uptr pc, uptr addr, bool trylock) { - DPrintf("#%d: MutexReadLock %zx\n", thr->tid, addr); +void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { + DPrintf("#%d: MutexPreReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz); + if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) { + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); + s->UpdateFlags(flagz); + Callback cb(thr, pc); + ctx->dd->MutexBeforeLock(&cb, &s->dd, false); + s->mtx.ReadUnlock(); + ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); + } +} + +void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { + DPrintf("#%d: MutexPostReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz); StatInc(thr, StatMutexReadLock); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); + s->UpdateFlags(flagz); thr->fast_state.IncrementEpoch(); TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId()); bool report_bad_lock = false; if (s->owner_tid != SyncVar::kInvalidTid) { - if (flags()->report_mutex_bugs && !s->is_broken) { - s->is_broken = true; + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); report_bad_lock = true; } } AcquireImpl(thr, pc, &s->clock); s->last_lock = thr->fast_state.raw(); thr->mset.Add(s->GetId(), false, thr->fast_state.epoch()); - if (common_flags()->detect_deadlocks && s->recursion == 0) { + bool pre_lock = false; + if (common_flags()->detect_deadlocks) { + pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) && + !(flagz & MutexFlagTryLock); Callback cb(thr, pc); - if (!trylock) + if (pre_lock) ctx->dd->MutexBeforeLock(&cb, &s->dd, false); - ctx->dd->MutexAfterLock(&cb, &s->dd, false, trylock); + ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock); } u64 mid = s->GetId(); s->mtx.ReadUnlock(); // Can't touch s after this point. 
+ s = 0; if (report_bad_lock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, mid); - if (common_flags()->detect_deadlocks) { + if (pre_lock && common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } @@ -274,8 +316,8 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); bool report_bad_unlock = false; if (s->owner_tid != SyncVar::kInvalidTid) { - if (flags()->report_mutex_bugs && !s->is_broken) { - s->is_broken = true; + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); report_bad_unlock = true; } } @@ -323,8 +365,8 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { } else { StatInc(thr, StatMutexRecUnlock); } - } else if (!s->is_broken) { - s->is_broken = true; + } else if (!s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); report_bad_unlock = true; } thr->mset.Del(s->GetId(), write); @@ -373,10 +415,10 @@ void Acquire(ThreadState *thr, uptr pc, uptr addr) { static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) { ThreadState *thr = reinterpret_cast<ThreadState*>(arg); ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base); + u64 epoch = tctx->epoch1; if (tctx->status == ThreadStatusRunning) - thr->clock.set(tctx->tid, tctx->thr->fast_state.epoch()); - else - thr->clock.set(tctx->tid, tctx->epoch1); + epoch = tctx->thr->fast_state.epoch(); + thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch); } void AcquireGlobal(ThreadState *thr, uptr pc) { @@ -416,10 +458,10 @@ void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) { static void UpdateSleepClockCallback(ThreadContextBase *tctx_base, void *arg) { ThreadState *thr = reinterpret_cast<ThreadState*>(arg); ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base); + u64 epoch = tctx->epoch1; if (tctx->status == ThreadStatusRunning) - thr->last_sleep_clock.set(tctx->tid, tctx->thr->fast_state.epoch()); - else - thr->last_sleep_clock.set(tctx->tid, tctx->epoch1); + epoch = tctx->thr->fast_state.epoch(); + thr->last_sleep_clock.set(&thr->proc()->clock_cache, tctx->tid, epoch); } void AfterSleep(ThreadState *thr, uptr pc) { diff --git a/lib/tsan/rtl/tsan_rtl_report.cc b/lib/tsan/rtl/tsan_rtl_report.cc index 31b9e9789..c1d2cc4b5 100644 --- a/lib/tsan/rtl/tsan_rtl_report.cc +++ b/lib/tsan/rtl/tsan_rtl_report.cc @@ -143,11 +143,12 @@ static ReportStack *SymbolizeStack(StackTrace trace) { return stack; } -ScopedReport::ScopedReport(ReportType typ) { +ScopedReport::ScopedReport(ReportType typ, uptr tag) { ctx->thread_registry->CheckLocked(); void *mem = internal_alloc(MBlockReport, sizeof(ReportDesc)); rep_ = new(mem) ReportDesc; rep_->typ = typ; + rep_->tag = tag; ctx->report_mtx.Lock(); CommonSanitizerReportMutex.Lock(); } @@ -313,7 +314,7 @@ void ScopedReport::AddLocation(uptr addr, uptr size) { return; #if !SANITIZER_GO int fd = -1; - int creat_tid = -1; + int creat_tid = kInvalidTid; u32 creat_stack = 0; if (FdLocation(addr, &fd, &creat_tid, &creat_stack)) { ReportLocation *loc = ReportLocation::New(ReportLocationFD); @@ -377,7 +378,7 @@ const ReportDesc *ScopedReport::GetReport() const { } void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, - MutexSet *mset) { + MutexSet *mset, uptr *tag) { // This function restores stack trace and mutex set for the thread/epoch. 
// It does so by getting stack trace and mutex set at the beginning of // trace part, and then replaying the trace till the given epoch. @@ -405,8 +406,8 @@ void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, Event *events = (Event*)GetThreadTrace(tid); for (uptr i = ebegin; i <= eend; i++) { Event ev = events[i]; - EventType typ = (EventType)(ev >> 61); - uptr pc = (uptr)(ev & ((1ull << 61) - 1)); + EventType typ = (EventType)(ev >> kEventPCBits); + uptr pc = (uptr)(ev & ((1ull << kEventPCBits) - 1)); DPrintf2(" %zu typ=%d pc=%zx\n", i, typ, pc); if (typ == EventTypeMop) { stack[pos] = pc; @@ -436,6 +437,7 @@ void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, return; pos++; stk->Init(&stack[0], pos); + ExtractTagFromStack(stk, tag); } static bool HandleRacyStacks(ThreadState *thr, VarSizeStackTrace traces[2], @@ -500,7 +502,7 @@ static void AddRacyStacks(ThreadState *thr, VarSizeStackTrace traces[2], } bool OutputReport(ThreadState *thr, const ScopedReport &srep) { - if (!flags()->report_bugs) + if (!flags()->report_bugs || thr->suppress_reports) return false; atomic_store_relaxed(&ctx->last_symbolize_time_ns, NanoTime()); const ReportDesc *rep = srep.GetReport(); @@ -625,16 +627,35 @@ void ReportRace(ThreadState *thr) { typ = ReportTypeVptrRace; else if (freed) typ = ReportTypeUseAfterFree; - else if (thr->external_tag > 0) - typ = ReportTypeExternalRace; if (IsFiredSuppression(ctx, typ, addr)) return; const uptr kMop = 2; VarSizeStackTrace traces[kMop]; - const uptr toppc = TraceTopPC(thr); - ObtainCurrentStack(thr, toppc, &traces[0]); + uptr tags[kMop] = {kExternalTagNone}; + uptr toppc = TraceTopPC(thr); + if (toppc >> kEventPCBits) { + // This is a work-around for a known issue. + // The scenario where this happens is rather elaborate and requires + // an instrumented __sanitizer_report_error_summary callback and + // a __tsan_symbolize_external callback and a race during a range memory + // access larger than 8 bytes. MemoryAccessRange adds the current PC to + // the trace and starts processing memory accesses. A first memory access + // triggers a race, we report it and call the instrumented + // __sanitizer_report_error_summary, which adds more stuff to the trace + // since it is instrumented. Then a second memory access in MemoryAccessRange + // also triggers a race and we get here and call TraceTopPC to get the + // current PC; however, it now contains some unrelated events from the + // callback. Most likely, TraceTopPC will now return an EventTypeFuncExit + // event. Later we subtract 1 from it (in GetPreviousInstructionPc) + // and the resulting PC has kExternalPCBit set, so we pass it to + // __tsan_symbolize_external. __tsan_symbolize_external is within its rights + // to crash since the PC is completely bogus. + // test/tsan/double_race.cc contains a test case for this. + toppc = 0; + } + ObtainCurrentStack(thr, toppc, &traces[0], &tags[0]); if (IsFiredSuppression(ctx, typ, traces[0])) return; @@ -644,18 +665,28 @@ void ReportRace(ThreadState *thr) { MutexSet *mset2 = new(&mset_buffer[0]) MutexSet(); Shadow s2(thr->racy_state[1]); - RestoreStack(s2.tid(), s2.epoch(), &traces[1], mset2); + RestoreStack(s2.tid(), s2.epoch(), &traces[1], mset2, &tags[1]); if (IsFiredSuppression(ctx, typ, traces[1])) return; if (HandleRacyStacks(thr, traces, addr_min, addr_max)) return; + // If any of the accesses has a tag, treat this as an "external" race.
+ uptr tag = kExternalTagNone; + for (uptr i = 0; i < kMop; i++) { + if (tags[i] != kExternalTagNone) { + typ = ReportTypeExternalRace; + tag = tags[i]; + break; + } + } + ThreadRegistryLock l0(ctx->thread_registry); - ScopedReport rep(typ); + ScopedReport rep(typ, tag); for (uptr i = 0; i < kMop; i++) { Shadow s(thr->racy_state[i]); - rep.AddMemoryAccess(addr, thr->external_tag, s, traces[i], + rep.AddMemoryAccess(addr, tags[i], s, traces[i], i == 0 ? &thr->mset : mset2); } diff --git a/lib/tsan/rtl/tsan_rtl_thread.cc b/lib/tsan/rtl/tsan_rtl_thread.cc index 7357d97a2..83fab082a 100644 --- a/lib/tsan/rtl/tsan_rtl_thread.cc +++ b/lib/tsan/rtl/tsan_rtl_thread.cc @@ -142,6 +142,10 @@ void ThreadContext::OnFinished() { if (common_flags()->detect_deadlocks) ctx->dd->DestroyLogicalThread(thr->dd_lt); + thr->clock.ResetCached(&thr->proc()->clock_cache); +#if !SANITIZER_GO + thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache); +#endif thr->~ThreadState(); #if TSAN_COLLECT_STATS StatAggregate(ctx->stat, thr->stat); @@ -236,7 +240,7 @@ int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) { return tid; } -void ThreadStart(ThreadState *thr, int tid, uptr os_id, bool workerthread) { +void ThreadStart(ThreadState *thr, int tid, tid_t os_id, bool workerthread) { uptr stk_addr = 0; uptr stk_size = 0; uptr tls_addr = 0; @@ -248,19 +252,7 @@ void ThreadStart(ThreadState *thr, int tid, uptr os_id, bool workerthread) { if (stk_addr && stk_size) MemoryRangeImitateWrite(thr, /*pc=*/ 1, stk_addr, stk_size); - if (tls_addr && tls_size) { - // Check that the thr object is in tls; - const uptr thr_beg = (uptr)thr; - const uptr thr_end = (uptr)thr + sizeof(*thr); - CHECK_GE(thr_beg, tls_addr); - CHECK_LE(thr_beg, tls_addr + tls_size); - CHECK_GE(thr_end, tls_addr); - CHECK_LE(thr_end, tls_addr + tls_size); - // Since the thr object is huge, skip it. - MemoryRangeImitateWrite(thr, /*pc=*/ 2, tls_addr, thr_beg - tls_addr); - MemoryRangeImitateWrite(thr, /*pc=*/ 2, - thr_end, tls_addr + tls_size - thr_end); - } + if (tls_addr && tls_size) ImitateTlsWrite(thr, tls_addr, tls_size); } #endif @@ -357,6 +349,7 @@ void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, StatInc(thr, StatMopRange); if (*shadow_mem == kShadowRodata) { + DCHECK(!is_write); // Access to .rodata section, no races here. // Measurements show that it can be 10-20% of all memory accesses. 
StatInc(thr, StatMopRangeRodata); diff --git a/lib/tsan/rtl/tsan_stat.cc b/lib/tsan/rtl/tsan_stat.cc index d1d6ed24d..18c83d5c6 100644 --- a/lib/tsan/rtl/tsan_stat.cc +++ b/lib/tsan/rtl/tsan_stat.cc @@ -75,14 +75,11 @@ void StatOutput(u64 *stat) { name[StatClockAcquire] = "Clock acquire "; name[StatClockAcquireEmpty] = " empty clock "; name[StatClockAcquireFastRelease] = " fast from release-store "; - name[StatClockAcquireLarge] = " contains my tid "; - name[StatClockAcquireRepeat] = " repeated (fast) "; name[StatClockAcquireFull] = " full (slow) "; name[StatClockAcquiredSomething] = " acquired something "; name[StatClockRelease] = "Clock release "; name[StatClockReleaseResize] = " resize "; - name[StatClockReleaseFast1] = " fast1 "; - name[StatClockReleaseFast2] = " fast2 "; + name[StatClockReleaseFast] = " fast "; name[StatClockReleaseSlow] = " dirty overflow (slow) "; name[StatClockReleaseFull] = " full (slow) "; name[StatClockReleaseAcquired] = " was acquired "; @@ -153,6 +150,16 @@ void StatOutput(u64 *stat) { name[StatAnnotatePublishMemoryRange] = " PublishMemoryRange "; name[StatAnnotateUnpublishMemoryRange] = " UnpublishMemoryRange "; name[StatAnnotateThreadName] = " ThreadName "; + name[Stat__tsan_mutex_create] = " __tsan_mutex_create "; + name[Stat__tsan_mutex_destroy] = " __tsan_mutex_destroy "; + name[Stat__tsan_mutex_pre_lock] = " __tsan_mutex_pre_lock "; + name[Stat__tsan_mutex_post_lock] = " __tsan_mutex_post_lock "; + name[Stat__tsan_mutex_pre_unlock] = " __tsan_mutex_pre_unlock "; + name[Stat__tsan_mutex_post_unlock] = " __tsan_mutex_post_unlock "; + name[Stat__tsan_mutex_pre_signal] = " __tsan_mutex_pre_signal "; + name[Stat__tsan_mutex_post_signal] = " __tsan_mutex_post_signal "; + name[Stat__tsan_mutex_pre_divert] = " __tsan_mutex_pre_divert "; + name[Stat__tsan_mutex_post_divert] = " __tsan_mutex_post_divert "; name[StatMtxTotal] = "Contentionz "; name[StatMtxTrace] = " Trace "; diff --git a/lib/tsan/rtl/tsan_stat.h b/lib/tsan/rtl/tsan_stat.h index 8447dd84f..42d6a2b63 100644 --- a/lib/tsan/rtl/tsan_stat.h +++ b/lib/tsan/rtl/tsan_stat.h @@ -74,15 +74,12 @@ enum StatType { StatClockAcquire, StatClockAcquireEmpty, StatClockAcquireFastRelease, - StatClockAcquireLarge, - StatClockAcquireRepeat, StatClockAcquireFull, StatClockAcquiredSomething, // Clocks - release. StatClockRelease, StatClockReleaseResize, - StatClockReleaseFast1, - StatClockReleaseFast2, + StatClockReleaseFast, StatClockReleaseSlow, StatClockReleaseFull, StatClockReleaseAcquired, @@ -157,6 +154,16 @@ enum StatType { StatAnnotatePublishMemoryRange, StatAnnotateUnpublishMemoryRange, StatAnnotateThreadName, + Stat__tsan_mutex_create, + Stat__tsan_mutex_destroy, + Stat__tsan_mutex_pre_lock, + Stat__tsan_mutex_post_lock, + Stat__tsan_mutex_pre_unlock, + Stat__tsan_mutex_post_unlock, + Stat__tsan_mutex_pre_signal, + Stat__tsan_mutex_post_signal, + Stat__tsan_mutex_pre_divert, + Stat__tsan_mutex_post_divert, // Internal mutex contentionz. 
StatMtxTotal, diff --git a/lib/tsan/rtl/tsan_sync.cc b/lib/tsan/rtl/tsan_sync.cc index 2be047462..44ae558fa 100644 --- a/lib/tsan/rtl/tsan_sync.cc +++ b/lib/tsan/rtl/tsan_sync.cc @@ -42,10 +42,7 @@ void SyncVar::Reset(Processor *proc) { owner_tid = kInvalidTid; last_lock = 0; recursion = 0; - is_rw = 0; - is_recursive = 0; - is_broken = 0; - is_linker_init = 0; + atomic_store_relaxed(&flags, 0); if (proc == 0) { CHECK_EQ(clock.size(), 0); @@ -56,7 +53,9 @@ void SyncVar::Reset(Processor *proc) { } } -MetaMap::MetaMap() { +MetaMap::MetaMap() + : block_alloc_("heap block allocator") + , sync_alloc_("sync allocator") { atomic_store(&uid_gen_, 0, memory_order_relaxed); } diff --git a/lib/tsan/rtl/tsan_sync.h b/lib/tsan/rtl/tsan_sync.h index 86e6bbd55..9039970bc 100644 --- a/lib/tsan/rtl/tsan_sync.h +++ b/lib/tsan/rtl/tsan_sync.h @@ -23,6 +23,31 @@ namespace __tsan { +// These need to match __tsan_mutex_* flags defined in tsan_interface.h. +// See documentation there as well. +enum MutexFlags { + MutexFlagLinkerInit = 1 << 0, // __tsan_mutex_linker_init + MutexFlagWriteReentrant = 1 << 1, // __tsan_mutex_write_reentrant + MutexFlagReadReentrant = 1 << 2, // __tsan_mutex_read_reentrant + MutexFlagReadLock = 1 << 3, // __tsan_mutex_read_lock + MutexFlagTryLock = 1 << 4, // __tsan_mutex_try_lock + MutexFlagTryLockFailed = 1 << 5, // __tsan_mutex_try_lock_failed + MutexFlagRecursiveLock = 1 << 6, // __tsan_mutex_recursive_lock + MutexFlagRecursiveUnlock = 1 << 7, // __tsan_mutex_recursive_unlock + MutexFlagNotStatic = 1 << 8, // __tsan_mutex_not_static + + // The following flags are runtime private. + // Mutex API misuse was detected, so don't report any more. + MutexFlagBroken = 1 << 30, + // We did not intercept pre lock event, so handle it on post lock. + MutexFlagDoPreLockOnPostLock = 1 << 29, + // Must list all mutex creation flags. + MutexCreationFlagMask = MutexFlagLinkerInit | + MutexFlagWriteReentrant | + MutexFlagReadReentrant | + MutexFlagNotStatic, +}; + struct SyncVar { SyncVar(); @@ -35,10 +60,7 @@ struct SyncVar { int owner_tid; // Set only by exclusive owners. u64 last_lock; int recursion; - bool is_rw; - bool is_recursive; - bool is_broken; - bool is_linker_init; + atomic_uint32_t flags; u32 next; // in MetaMap DDMutex dd; SyncClock read_clock; // Used for rw mutexes only. @@ -61,6 +83,26 @@ struct SyncVar { *uid = id >> 48; return (uptr)GetLsb(id, 48); } + + bool IsFlagSet(u32 f) const { + return atomic_load_relaxed(&flags) & f; + } + + void SetFlags(u32 f) { + atomic_store_relaxed(&flags, atomic_load_relaxed(&flags) | f); + } + + void UpdateFlags(u32 flagz) { + // Filter out operation flags. + if (!(flagz & MutexCreationFlagMask)) + return; + u32 current = atomic_load_relaxed(&flags); + if (current & MutexCreationFlagMask) + return; + // Note: this can be called from MutexPostReadLock which holds only read + // lock on the SyncVar. + atomic_store_relaxed(&flags, current | (flagz & MutexCreationFlagMask)); + } }; /* MetaMap allows to map arbitrary user pointers onto various descriptors. diff --git a/lib/tsan/rtl/tsan_trace.h b/lib/tsan/rtl/tsan_trace.h index 96a18ac41..9aef375cb 100644 --- a/lib/tsan/rtl/tsan_trace.h +++ b/lib/tsan/rtl/tsan_trace.h @@ -41,6 +41,8 @@ enum EventType { // u64 addr : 61; // Associated pc. typedef u64 Event; +const uptr kEventPCBits = 61; + struct TraceHeader { #if !SANITIZER_GO BufferedStackTrace stack0; // Start stack for the trace. 
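The new kEventPCBits constant replaces the bare 61s that were previously hard-coded in TraceAddEvent (tsan_rtl.h, earlier in this diff) and RestoreStack (tsan_rtl_report.cc): a trace Event is a u64 holding the EventType in the top 3 bits and the PC in the low 61 bits. A standalone sketch of the pack/unpack arithmetic those two sites share:

#include <cassert>
#include <cstdint>

typedef uint64_t Event;
const uint64_t kEventPCBits = 61;

// Packing, as in TraceAddEvent: type in bits 63..61, PC in bits 60..0.
Event PackEvent(unsigned typ, uint64_t pc) {
  assert(typ <= 7);                   // EventType must fit in 3 bits
  assert((pc >> kEventPCBits) == 0);  // PC must fit in 61 bits
  return pc | ((Event)typ << kEventPCBits);
}

// Unpacking, as in RestoreStack: the inverse split of the same word.
void UnpackEvent(Event ev, unsigned *typ, uint64_t *pc) {
  *typ = (unsigned)(ev >> kEventPCBits);
  *pc = ev & ((1ull << kEventPCBits) - 1);
}

The "if (toppc >> kEventPCBits)" guard added to ReportRace above relies on the same layout: a value with any of the top three bits set cannot be a plain PC, so it is discarded rather than handed to __tsan_symbolize_external.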
diff --git a/lib/tsan/tests/CMakeLists.txt b/lib/tsan/tests/CMakeLists.txt index 87e14174a..ad8d02ed3 100644 --- a/lib/tsan/tests/CMakeLists.txt +++ b/lib/tsan/tests/CMakeLists.txt @@ -2,18 +2,40 @@ include_directories(../rtl) add_custom_target(TsanUnitTests) set_target_properties(TsanUnitTests PROPERTIES - FOLDER "TSan unittests") + FOLDER "Compiler-RT Tests") set(TSAN_UNITTEST_CFLAGS ${TSAN_CFLAGS} ${COMPILER_RT_UNITTEST_CFLAGS} ${COMPILER_RT_GTEST_CFLAGS} + -I${COMPILER_RT_SOURCE_DIR}/include -I${COMPILER_RT_SOURCE_DIR}/lib -I${COMPILER_RT_SOURCE_DIR}/lib/tsan/rtl -DGTEST_HAS_RTTI=0) +set(TSAN_TEST_ARCH ${TSAN_SUPPORTED_ARCH}) if(APPLE) + + # Create a static library for test dependencies. + set(TSAN_TEST_RUNTIME_OBJECTS + $<TARGET_OBJECTS:RTTsan_dynamic.osx> + $<TARGET_OBJECTS:RTInterception.osx> + $<TARGET_OBJECTS:RTSanitizerCommon.osx> + $<TARGET_OBJECTS:RTSanitizerCommonLibc.osx> + $<TARGET_OBJECTS:RTUbsan.osx>) + set(TSAN_TEST_RUNTIME RTTsanTest) + add_library(${TSAN_TEST_RUNTIME} STATIC ${TSAN_TEST_RUNTIME_OBJECTS}) + set_target_properties(${TSAN_TEST_RUNTIME} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + darwin_filter_host_archs(TSAN_SUPPORTED_ARCH TSAN_TEST_ARCH) list(APPEND TSAN_UNITTEST_CFLAGS ${DARWIN_osx_CFLAGS}) + + set(LINK_FLAGS "-lc++") + add_weak_symbols("ubsan" LINK_FLAGS) + add_weak_symbols("sanitizer_common" LINK_FLAGS) +else() + set(LINK_FLAGS "-fsanitize=thread;-lstdc++;-lm") endif() set(TSAN_RTL_HEADERS) @@ -21,79 +43,27 @@ foreach (header ${TSAN_HEADERS}) list(APPEND TSAN_RTL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../${header}) endforeach() -# tsan_compile(obj_list, source, arch, {headers}) -macro(tsan_compile obj_list source arch) - get_filename_component(basename ${source} NAME) - set(output_obj "${basename}.${arch}.o") - get_target_flags_for_arch(${arch} TARGET_CFLAGS) - set(COMPILE_DEPS ${TSAN_RTL_HEADERS} ${ARGN}) - if(NOT COMPILER_RT_STANDALONE_BUILD) - list(APPEND COMPILE_DEPS gtest tsan) - endif() - clang_compile(${output_obj} ${source} - CFLAGS ${TSAN_UNITTEST_CFLAGS} ${TARGET_CFLAGS} - DEPS ${COMPILE_DEPS}) - list(APPEND ${obj_list} ${output_obj}) -endmacro() - +# add_tsan_unittest(<name> +# SOURCES <sources list> +# HEADERS <extra headers list>) macro(add_tsan_unittest testname) - set(TSAN_TEST_ARCH ${TSAN_SUPPORTED_ARCH}) - if(APPLE) - darwin_filter_host_archs(TSAN_SUPPORTED_ARCH TSAN_TEST_ARCH) - endif() + cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN}) if(UNIX) foreach(arch ${TSAN_TEST_ARCH}) - cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN}) - set(TEST_OBJECTS) - foreach(SOURCE ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}) - tsan_compile(TEST_OBJECTS ${SOURCE} ${arch} ${TEST_HEADERS}) - endforeach() - get_target_flags_for_arch(${arch} TARGET_LINK_FLAGS) - set(TEST_DEPS ${TEST_OBJECTS}) - if(NOT COMPILER_RT_STANDALONE_BUILD) - list(APPEND TEST_DEPS tsan) - endif() - if(NOT APPLE) - # FIXME: Looks like we should link TSan with just-built runtime, - # and not rely on -fsanitize=thread, as these tests are essentially - # unit tests. 
- add_compiler_rt_test(TsanUnitTests ${testname} - OBJECTS ${TEST_OBJECTS} - DEPS ${TEST_DEPS} - LINK_FLAGS ${TARGET_LINK_FLAGS} - -fsanitize=thread - -lstdc++ -lm) - else() - set(TSAN_TEST_RUNTIME_OBJECTS - $<TARGET_OBJECTS:RTTsan_dynamic.osx> - $<TARGET_OBJECTS:RTInterception.osx> - $<TARGET_OBJECTS:RTSanitizerCommon.osx> - $<TARGET_OBJECTS:RTSanitizerCommonLibc.osx> - $<TARGET_OBJECTS:RTUbsan.osx>) - set(TSAN_TEST_RUNTIME RTTsanTest.${testname}.${arch}) - add_library(${TSAN_TEST_RUNTIME} STATIC ${TSAN_TEST_RUNTIME_OBJECTS}) - set_target_properties(${TSAN_TEST_RUNTIME} PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND TEST_OBJECTS lib${TSAN_TEST_RUNTIME}.a) - list(APPEND TEST_DEPS ${TSAN_TEST_RUNTIME}) - - add_weak_symbols("ubsan" WEAK_SYMBOL_LINK_FLAGS) - add_weak_symbols("sanitizer_common" WEAK_SYMBOL_LINK_FLAGS) - - # Intentionally do *not* link with `-fsanitize=thread`. We already link - # against a static version of the runtime, and we don't want the dynamic - # one. - add_compiler_rt_test(TsanUnitTests "${testname}-${arch}-Test" - OBJECTS ${TEST_OBJECTS} - DEPS ${TEST_DEPS} - LINK_FLAGS ${TARGET_LINK_FLAGS} ${DARWIN_osx_LINK_FLAGS} - ${WEAK_SYMBOL_LINK_FLAGS} -lc++) - endif() + set(TsanUnitTestsObjects) + generate_compiler_rt_tests(TsanUnitTestsObjects TsanUnitTests + "${testname}-${arch}-Test" ${arch} + SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE} + RUNTIME ${TSAN_TEST_RUNTIME} + COMPILE_DEPS ${TEST_HEADERS} ${TSAN_RTL_HEADERS} + DEPS gtest tsan + CFLAGS ${TSAN_UNITTEST_CFLAGS} + LINK_FLAGS ${LINK_FLAGS}) endforeach() endif() endmacro() -if(COMPILER_RT_CAN_EXECUTE_TESTS) +if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID) add_subdirectory(rtl) add_subdirectory(unit) endif() diff --git a/lib/tsan/tests/rtl/tsan_posix.cc b/lib/tsan/tests/rtl/tsan_posix.cc index 9c0e013e5..e66dab609 100644 --- a/lib/tsan/tests/rtl/tsan_posix.cc +++ b/lib/tsan/tests/rtl/tsan_posix.cc @@ -94,8 +94,9 @@ TEST(Posix, ThreadLocalAccesses) { // The test is failing with high thread count for aarch64. // FIXME: track down the issue and re-enable the test. // On Darwin, we're running unit tests without interceptors and __thread is -// using malloc and free, which causes false data race reports. -#if !defined(__aarch64__) && !defined(__APPLE__) +// using malloc and free, which causes false data race reports. On rare +// occasions on powerpc64le this test also fails. +#if !defined(__aarch64__) && !defined(__APPLE__) && !defined(powerpc64le) local_thread((void*)2); #endif } diff --git a/lib/tsan/tests/rtl/tsan_test_util_posix.cc b/lib/tsan/tests/rtl/tsan_test_util_posix.cc index 834a271aa..d00e26dd5 100644 --- a/lib/tsan/tests/rtl/tsan_test_util_posix.cc +++ b/lib/tsan/tests/rtl/tsan_test_util_posix.cc @@ -9,7 +9,7 @@ // // This file is a part of ThreadSanitizer (TSan), a race detector. // -// Test utils, Linux, FreeBSD and Darwin implementation. +// Test utils, Linux, FreeBSD, NetBSD and Darwin implementation. 
//===----------------------------------------------------------------------===// #include "sanitizer_common/sanitizer_atomic.h" @@ -270,7 +270,7 @@ void ScopedThread::Impl::HandleEvent(Event *ev) { } } CHECK_NE(tsan_mop, 0); -#if defined(__FreeBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__APPLE__) || defined(__NetBSD__) const int ErrCode = ESOCKTNOSUPPORT; #else const int ErrCode = ECHRNG; diff --git a/lib/tsan/tests/unit/tsan_clock_test.cc b/lib/tsan/tests/unit/tsan_clock_test.cc index 83e25fb5a..f6230e1be 100644 --- a/lib/tsan/tests/unit/tsan_clock_test.cc +++ b/lib/tsan/tests/unit/tsan_clock_test.cc @@ -26,13 +26,13 @@ TEST(Clock, VectorBasic) { clk.tick(); ASSERT_EQ(clk.size(), 1U); ASSERT_EQ(clk.get(0), 1U); - clk.set(3, clk.get(3) + 1); + clk.set(&cache, 3, clk.get(3) + 1); ASSERT_EQ(clk.size(), 4U); ASSERT_EQ(clk.get(0), 1U); ASSERT_EQ(clk.get(1), 0U); ASSERT_EQ(clk.get(2), 0U); ASSERT_EQ(clk.get(3), 1U); - clk.set(3, clk.get(3) + 1); + clk.set(&cache, 3, clk.get(3) + 1); ASSERT_EQ(clk.get(3), 2U); } @@ -53,6 +53,31 @@ TEST(Clock, ChunkedBasic) { chunked.Reset(&cache); } +static const uptr interesting_sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66, + 100, 124, 125, 126, 127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254, + 255}; + +TEST(Clock, Iter) { + const uptr n = ARRAY_SIZE(interesting_sizes); + for (uptr fi = 0; fi < n; fi++) { + const uptr size = interesting_sizes[fi]; + SyncClock sync; + ThreadClock vector(0); + for (uptr i = 0; i < size; i++) + vector.set(&cache, i, i + 1); + if (size != 0) + vector.release(&cache, &sync); + uptr i = 0; + for (ClockElem &ce : sync) { + ASSERT_LT(i, size); + ASSERT_EQ(sync.get_clean(i), ce.epoch); + i++; + } + ASSERT_EQ(i, size); + sync.Reset(&cache); + } +} + TEST(Clock, AcquireRelease) { ThreadClock vector1(100); vector1.tick(); @@ -86,24 +111,26 @@ TEST(Clock, RepeatedAcquire) { TEST(Clock, ManyThreads) { SyncClock chunked; - for (unsigned i = 0; i < 100; i++) { + for (unsigned i = 0; i < 200; i++) { ThreadClock vector(0); vector.tick(); - vector.set(i, 1); + vector.set(&cache, i, i + 1); vector.release(&cache, &chunked); ASSERT_EQ(i + 1, chunked.size()); vector.acquire(&cache, &chunked); ASSERT_EQ(i + 1, vector.size()); } - for (unsigned i = 0; i < 100; i++) - ASSERT_EQ(1U, chunked.get(i)); + for (unsigned i = 0; i < 200; i++) { + printf("i=%d\n", i); + ASSERT_EQ(i + 1, chunked.get(i)); + } ThreadClock vector(1); vector.acquire(&cache, &chunked); - ASSERT_EQ(100U, vector.size()); - for (unsigned i = 0; i < 100; i++) - ASSERT_EQ(1U, vector.get(i)); + ASSERT_EQ(200U, vector.size()); + for (unsigned i = 0; i < 200; i++) + ASSERT_EQ(i + 1, vector.get(i)); chunked.Reset(&cache); } @@ -151,7 +178,7 @@ TEST(Clock, Growth) { { ThreadClock vector(10); vector.tick(); - vector.set(5, 42); + vector.set(&cache, 5, 42); SyncClock sync; vector.release(&cache, &sync); ASSERT_EQ(sync.size(), 11U); @@ -180,8 +207,8 @@ TEST(Clock, Growth) { { ThreadClock vector(100); vector.tick(); - vector.set(5, 42); - vector.set(90, 84); + vector.set(&cache, 5, 42); + vector.set(&cache, 90, 84); SyncClock sync; vector.release(&cache, &sync); ASSERT_EQ(sync.size(), 101U); @@ -212,6 +239,40 @@ TEST(Clock, Growth) { } } +TEST(Clock, Growth2) { + // Test clock growth for every pair of sizes: + const uptr n = ARRAY_SIZE(interesting_sizes); + for (uptr fi = 0; fi < n; fi++) { + for (uptr ti = fi + 1; ti < n; ti++) { + const uptr from = interesting_sizes[fi]; + const uptr to = interesting_sizes[ti]; + SyncClock sync; + ThreadClock vector(0); + for 
(uptr i = 0; i < from; i++) + vector.set(&cache, i, i + 1); + if (from != 0) + vector.release(&cache, &sync); + ASSERT_EQ(sync.size(), from); + for (uptr i = 0; i < from; i++) + ASSERT_EQ(sync.get(i), i + 1); + for (uptr i = 0; i < to; i++) + vector.set(&cache, i, i + 1); + vector.release(&cache, &sync); + ASSERT_EQ(sync.size(), to); + for (uptr i = 0; i < to; i++) + ASSERT_EQ(sync.get(i), i + 1); + vector.set(&cache, to + 1, to + 1); + vector.release(&cache, &sync); + ASSERT_EQ(sync.size(), to + 2); + for (uptr i = 0; i < to; i++) + ASSERT_EQ(sync.get(i), i + 1); + ASSERT_EQ(sync.get(to), 0U); + ASSERT_EQ(sync.get(to + 1), to + 1); + sync.Reset(&cache); + } + } +} + const uptr kThreads = 4; const uptr kClocks = 4; diff --git a/lib/tsan/tests/unit/tsan_mman_test.cc b/lib/tsan/tests/unit/tsan_mman_test.cc index 60dea3d43..05ae42867 100644 --- a/lib/tsan/tests/unit/tsan_mman_test.cc +++ b/lib/tsan/tests/unit/tsan_mman_test.cc @@ -56,6 +56,7 @@ TEST(Mman, UserRealloc) { // Realloc(NULL, N) is equivalent to malloc(N), thus must return // non-NULL pointer. EXPECT_NE(p, (void*)0); + user_free(thr, pc, p); } { void *p = user_realloc(thr, pc, 0, 100); @@ -67,8 +68,9 @@ TEST(Mman, UserRealloc) { void *p = user_alloc(thr, pc, 100); EXPECT_NE(p, (void*)0); memset(p, 0xde, 100); + // Realloc(P, 0) is equivalent to free(P) and returns NULL. void *p2 = user_realloc(thr, pc, p, 0); - EXPECT_NE(p2, (void*)0); + EXPECT_EQ(p2, (void*)0); } { void *p = user_realloc(thr, pc, 0, 100); @@ -135,12 +137,34 @@ TEST(Mman, Stats) { EXPECT_EQ(unmapped0, __sanitizer_get_unmapped_bytes()); } +TEST(Mman, Valloc) { + ThreadState *thr = cur_thread(); + uptr page_size = GetPageSizeCached(); + + void *p = user_valloc(thr, 0, 100); + EXPECT_NE(p, (void*)0); + user_free(thr, 0, p); + + p = user_pvalloc(thr, 0, 100); + EXPECT_NE(p, (void*)0); + user_free(thr, 0, p); + + p = user_pvalloc(thr, 0, 0); + EXPECT_NE(p, (void*)0); + EXPECT_EQ(page_size, __sanitizer_get_allocated_size(p)); + user_free(thr, 0, p); + + EXPECT_DEATH(p = user_pvalloc(thr, 0, (uptr)-(page_size - 1)), + "allocator is terminating the process instead of returning 0"); + EXPECT_DEATH(p = user_pvalloc(thr, 0, (uptr)-1), + "allocator is terminating the process instead of returning 0"); +} + +#if !SANITIZER_DEBUG +// EXPECT_DEATH clones a thread with 4K stack, +// which is overflown by tsan memory accesses functions in debug mode. + TEST(Mman, CallocOverflow) { -#if SANITIZER_DEBUG - // EXPECT_DEATH clones a thread with 4K stack, - // which is overflown by tsan memory accesses functions in debug mode. - return; -#endif ThreadState *thr = cur_thread(); uptr pc = 0; size_t kArraySize = 4096; @@ -152,4 +176,57 @@ TEST(Mman, CallocOverflow) { EXPECT_EQ(0L, p); } +TEST(Mman, Memalign) { + ThreadState *thr = cur_thread(); + + void *p = user_memalign(thr, 0, 8, 100); + EXPECT_NE(p, (void*)0); + user_free(thr, 0, p); + + p = NULL; + EXPECT_DEATH(p = user_memalign(thr, 0, 7, 100), + "allocator is terminating the process instead of returning 0"); + EXPECT_EQ(0L, p); +} + +TEST(Mman, PosixMemalign) { + ThreadState *thr = cur_thread(); + + void *p = NULL; + int res = user_posix_memalign(thr, 0, &p, 8, 100); + EXPECT_NE(p, (void*)0); + EXPECT_EQ(res, 0); + user_free(thr, 0, p); + + p = NULL; + // Alignment is not a power of two, although is a multiple of sizeof(void*). 
+ EXPECT_DEATH(res = user_posix_memalign(thr, 0, &p, 3 * sizeof(p), 100), + "allocator is terminating the process instead of returning 0"); + EXPECT_EQ(0L, p); + // Alignment is not a multiple of sizeof(void*), although it is a power of 2. + EXPECT_DEATH(res = user_posix_memalign(thr, 0, &p, 2, 100), + "allocator is terminating the process instead of returning 0"); + EXPECT_EQ(0L, p); +} + +TEST(Mman, AlignedAlloc) { + ThreadState *thr = cur_thread(); + + void *p = user_aligned_alloc(thr, 0, 8, 64); + EXPECT_NE(p, (void*)0); + user_free(thr, 0, p); + + p = NULL; + // Alignment is not a power of 2. + EXPECT_DEATH(p = user_aligned_alloc(thr, 0, 7, 100), + "allocator is terminating the process instead of returning 0"); + EXPECT_EQ(0L, p); + // Size is not a multiple of alignment. + EXPECT_DEATH(p = user_aligned_alloc(thr, 0, 8, 100), + "allocator is terminating the process instead of returning 0"); + EXPECT_EQ(0L, p); +} + +#endif + } // namespace __tsan
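The new Memalign, PosixMemalign, and AlignedAlloc tests pin down which argument combinations the allocator treats as fatal: memalign and posix_memalign require a power-of-two alignment, posix_memalign additionally requires a multiple of sizeof(void*), and aligned_alloc requires the size to be a multiple of the alignment (the C11 rule). A hypothetical checker restating the invariants the EXPECT_DEATH cases above exercise:

#include <cstddef>

// Hypothetical helpers; they restate the tested contracts, not TSan internals.
bool IsValidPosixMemalignAlignment(size_t align) {
  // Power of two and a multiple of sizeof(void*).
  return align != 0 && (align & (align - 1)) == 0 &&
         align % sizeof(void *) == 0;
}

bool IsValidAlignedAllocRequest(size_t align, size_t size) {
  // Power of two, and the size must be a multiple of the alignment.
  return align != 0 && (align & (align - 1)) == 0 && size % align == 0;
}

Under these rules user_aligned_alloc(thr, 0, 8, 64) succeeds above, while the (7, 100) call fails the power-of-two check and the (8, 100) call fails the size-multiple check, matching the one EXPECT_NE and two EXPECT_DEATH outcomes in TEST(Mman, AlignedAlloc).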