//===-- tsan_clock.cpp ----------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file is a part of ThreadSanitizer (TSan), a race detector. // //===----------------------------------------------------------------------===// #include "tsan_clock.h" #include "tsan_rtl.h" #include "sanitizer_common/sanitizer_placement_new.h" // SyncClock and ThreadClock implement vector clocks for sync variables // (mutexes, atomic variables, file descriptors, etc) and threads, respectively. // ThreadClock contains fixed-size vector clock for maximum number of threads. // SyncClock contains growable vector clock for currently necessary number of // threads. // Together they implement very simple model of operations, namely: // // void ThreadClock::acquire(const SyncClock *src) { // for (int i = 0; i < kMaxThreads; i++) // clock[i] = max(clock[i], src->clock[i]); // } // // void ThreadClock::release(SyncClock *dst) const { // for (int i = 0; i < kMaxThreads; i++) // dst->clock[i] = max(dst->clock[i], clock[i]); // } // // void ThreadClock::ReleaseStore(SyncClock *dst) const { // for (int i = 0; i < kMaxThreads; i++) // dst->clock[i] = clock[i]; // } // // void ThreadClock::acq_rel(SyncClock *dst) { // acquire(dst); // release(dst); // } // // Conformance to this model is extensively verified in tsan_clock_test.cc. // However, the implementation is significantly more complex. The complexity // allows to implement important classes of use cases in O(1) instead of O(N). // // The use cases are: // 1. Singleton/once atomic that has a single release-store operation followed // by zillions of acquire-loads (the acquire-load is O(1)). // 2. Thread-local mutex (both lock and unlock can be O(1)). // 3. Leaf mutex (unlock is O(1)). // 4. A mutex shared by 2 threads (both lock and unlock can be O(1)). // 5. An atomic with a single writer (writes can be O(1)). // The implementation dynamically adopts to workload. So if an atomic is in // read-only phase, these reads will be O(1); if it later switches to read/write // phase, the implementation will correctly handle that by switching to O(N). // // Thread-safety note: all const operations on SyncClock's are conducted under // a shared lock; all non-const operations on SyncClock's are conducted under // an exclusive lock; ThreadClock's are private to respective threads and so // do not need any protection. // // Description of SyncClock state: // clk_ - variable size vector clock, low kClkBits hold timestamp, // the remaining bits hold "acquired" flag (the actual value is thread's // reused counter); // if acquried == thr->reused_, then the respective thread has already // acquired this clock (except possibly for dirty elements). // dirty_ - holds up to two indeces in the vector clock that other threads // need to acquire regardless of "acquired" flag value; // release_store_tid_ - denotes that the clock state is a result of // release-store operation by the thread with release_store_tid_ index. // release_store_reused_ - reuse count of release_store_tid_. // We don't have ThreadState in these methods, so this is an ugly hack that // works only in C++. #if !SANITIZER_GO # define CPP_STAT_INC(typ) StatInc(cur_thread(), typ) #else # define CPP_STAT_INC(typ) (void)0 #endif namespace __tsan { static atomic_uint32_t *ref_ptr(ClockBlock *cb) { return reinterpret_cast(&cb->table[ClockBlock::kRefIdx]); } // Drop reference to the first level block idx. static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) { ClockBlock *cb = ctx->clock_alloc.Map(idx); atomic_uint32_t *ref = ref_ptr(cb); u32 v = atomic_load(ref, memory_order_acquire); for (;;) { CHECK_GT(v, 0); if (v == 1) break; if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel)) return; } // First level block owns second level blocks, so them as well. for (uptr i = 0; i < blocks; i++) ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]); ctx->clock_alloc.Free(c, idx); } ThreadClock::ThreadClock(unsigned tid, unsigned reused) : tid_(tid) , reused_(reused + 1) // 0 has special meaning , cached_idx_() , cached_size_() , cached_blocks_() { CHECK_LT(tid, kMaxTidInClock); CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits); nclk_ = tid_ + 1; last_acquire_ = 0; internal_memset(clk_, 0, sizeof(clk_)); } void ThreadClock::ResetCached(ClockCache *c) { if (cached_idx_) { UnrefClockBlock(c, cached_idx_, cached_blocks_); cached_idx_ = 0; cached_size_ = 0; cached_blocks_ = 0; } } void ThreadClock::acquire(ClockCache *c, SyncClock *src) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(src->size_, kMaxTid); CPP_STAT_INC(StatClockAcquire); // Check if it's empty -> no need to do anything. const uptr nclk = src->size_; if (nclk == 0) { CPP_STAT_INC(StatClockAcquireEmpty); return; } bool acquired = false; for (unsigned i = 0; i < kDirtyTids; i++) { SyncClock::Dirty dirty = src->dirty_[i]; unsigned tid = dirty.tid; if (tid != kInvalidTid) { if (clk_[tid] < dirty.epoch) { clk_[tid] = dirty.epoch; acquired = true; } } } // Check if we've already acquired src after the last release operation on src if (tid_ >= nclk || src->elem(tid_).reused != reused_) { // O(N) acquire. CPP_STAT_INC(StatClockAcquireFull); nclk_ = max(nclk_, nclk); u64 *dst_pos = &clk_[0]; for (ClockElem &src_elem : *src) { u64 epoch = src_elem.epoch; if (*dst_pos < epoch) { *dst_pos = epoch; acquired = true; } dst_pos++; } // Remember that this thread has acquired this clock. if (nclk > tid_) src->elem(tid_).reused = reused_; } if (acquired) { CPP_STAT_INC(StatClockAcquiredSomething); last_acquire_ = clk_[tid_]; ResetCached(c); } } void ThreadClock::release(ClockCache *c, SyncClock *dst) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(dst->size_, kMaxTid); if (dst->size_ == 0) { // ReleaseStore will correctly set release_store_tid_, // which can be important for future operations. ReleaseStore(c, dst); return; } CPP_STAT_INC(StatClockRelease); // Check if we need to resize dst. if (dst->size_ < nclk_) dst->Resize(c, nclk_); // Check if we had not acquired anything from other threads // since the last release on dst. If so, we need to update // only dst->elem(tid_). if (dst->elem(tid_).epoch > last_acquire_) { UpdateCurrentThread(c, dst); if (dst->release_store_tid_ != tid_ || dst->release_store_reused_ != reused_) dst->release_store_tid_ = kInvalidTid; return; } // O(N) release. CPP_STAT_INC(StatClockReleaseFull); dst->Unshare(c); // First, remember whether we've acquired dst. bool acquired = IsAlreadyAcquired(dst); if (acquired) CPP_STAT_INC(StatClockReleaseAcquired); // Update dst->clk_. dst->FlushDirty(); uptr i = 0; for (ClockElem &ce : *dst) { ce.epoch = max(ce.epoch, clk_[i]); ce.reused = 0; i++; } // Clear 'acquired' flag in the remaining elements. if (nclk_ < dst->size_) CPP_STAT_INC(StatClockReleaseClearTail); for (uptr i = nclk_; i < dst->size_; i++) dst->elem(i).reused = 0; dst->release_store_tid_ = kInvalidTid; dst->release_store_reused_ = 0; // If we've acquired dst, remember this fact, // so that we don't need to acquire it on next acquire. if (acquired) dst->elem(tid_).reused = reused_; } void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(dst->size_, kMaxTid); CPP_STAT_INC(StatClockStore); if (dst->size_ == 0 && cached_idx_ != 0) { // Reuse the cached clock. // Note: we could reuse/cache the cached clock in more cases: // we could update the existing clock and cache it, or replace it with the // currently cached clock and release the old one. And for a shared // existing clock, we could replace it with the currently cached; // or unshare, update and cache. But, for simplicity, we currnetly reuse // cached clock only when the target clock is empty. dst->tab_ = ctx->clock_alloc.Map(cached_idx_); dst->tab_idx_ = cached_idx_; dst->size_ = cached_size_; dst->blocks_ = cached_blocks_; CHECK_EQ(dst->dirty_[0].tid, kInvalidTid); // The cached clock is shared (immutable), // so this is where we store the current clock. dst->dirty_[0].tid = tid_; dst->dirty_[0].epoch = clk_[tid_]; dst->release_store_tid_ = tid_; dst->release_store_reused_ = reused_; // Rememeber that we don't need to acquire it in future. dst->elem(tid_).reused = reused_; // Grab a reference. atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); return; } // Check if we need to resize dst. if (dst->size_ < nclk_) dst->Resize(c, nclk_); if (dst->release_store_tid_ == tid_ && dst->release_store_reused_ == reused_ && dst->elem(tid_).epoch > last_acquire_) { CPP_STAT_INC(StatClockStoreFast); UpdateCurrentThread(c, dst); return; } // O(N) release-store. CPP_STAT_INC(StatClockStoreFull); dst->Unshare(c); // Note: dst can be larger than this ThreadClock. // This is fine since clk_ beyond size is all zeros. uptr i = 0; for (ClockElem &ce : *dst) { ce.epoch = clk_[i]; ce.reused = 0; i++; } for (uptr i = 0; i < kDirtyTids; i++) dst->dirty_[i].tid = kInvalidTid; dst->release_store_tid_ = tid_; dst->release_store_reused_ = reused_; // Rememeber that we don't need to acquire it in future. dst->elem(tid_).reused = reused_; // If the resulting clock is cachable, cache it for future release operations. // The clock is always cachable if we released to an empty sync object. if (cached_idx_ == 0 && dst->Cachable()) { // Grab a reference to the ClockBlock. atomic_uint32_t *ref = ref_ptr(dst->tab_); if (atomic_load(ref, memory_order_acquire) == 1) atomic_store_relaxed(ref, 2); else atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); cached_idx_ = dst->tab_idx_; cached_size_ = dst->size_; cached_blocks_ = dst->blocks_; } } void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) { CPP_STAT_INC(StatClockAcquireRelease); acquire(c, dst); ReleaseStore(c, dst); } // Updates only single element related to the current thread in dst->clk_. void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const { // Update the threads time, but preserve 'acquired' flag. for (unsigned i = 0; i < kDirtyTids; i++) { SyncClock::Dirty *dirty = &dst->dirty_[i]; const unsigned tid = dirty->tid; if (tid == tid_ || tid == kInvalidTid) { CPP_STAT_INC(StatClockReleaseFast); dirty->tid = tid_; dirty->epoch = clk_[tid_]; return; } } // Reset all 'acquired' flags, O(N). // We are going to touch dst elements, so we need to unshare it. dst->Unshare(c); CPP_STAT_INC(StatClockReleaseSlow); dst->elem(tid_).epoch = clk_[tid_]; for (uptr i = 0; i < dst->size_; i++) dst->elem(i).reused = 0; dst->FlushDirty(); } // Checks whether the current thread has already acquired src. bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const { if (src->elem(tid_).reused != reused_) return false; for (unsigned i = 0; i < kDirtyTids; i++) { SyncClock::Dirty dirty = src->dirty_[i]; if (dirty.tid != kInvalidTid) { if (clk_[dirty.tid] < dirty.epoch) return false; } } return true; } // Sets a single element in the vector clock. // This function is called only from weird places like AcquireGlobal. void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) { DCHECK_LT(tid, kMaxTid); DCHECK_GE(v, clk_[tid]); clk_[tid] = v; if (nclk_ <= tid) nclk_ = tid + 1; last_acquire_ = clk_[tid_]; ResetCached(c); } void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) { printf("clock=["); for (uptr i = 0; i < nclk_; i++) printf("%s%llu", i == 0 ? "" : ",", clk_[i]); printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_); } SyncClock::SyncClock() { ResetImpl(); } SyncClock::~SyncClock() { // Reset must be called before dtor. CHECK_EQ(size_, 0); CHECK_EQ(blocks_, 0); CHECK_EQ(tab_, 0); CHECK_EQ(tab_idx_, 0); } void SyncClock::Reset(ClockCache *c) { if (size_) UnrefClockBlock(c, tab_idx_, blocks_); ResetImpl(); } void SyncClock::ResetImpl() { tab_ = 0; tab_idx_ = 0; size_ = 0; blocks_ = 0; release_store_tid_ = kInvalidTid; release_store_reused_ = 0; for (uptr i = 0; i < kDirtyTids; i++) dirty_[i].tid = kInvalidTid; } void SyncClock::Resize(ClockCache *c, uptr nclk) { CPP_STAT_INC(StatClockReleaseResize); Unshare(c); if (nclk <= capacity()) { // Memory is already allocated, just increase the size. size_ = nclk; return; } if (size_ == 0) { // Grow from 0 to one-level table. CHECK_EQ(size_, 0); CHECK_EQ(blocks_, 0); CHECK_EQ(tab_, 0); CHECK_EQ(tab_idx_, 0); tab_idx_ = ctx->clock_alloc.Alloc(c); tab_ = ctx->clock_alloc.Map(tab_idx_); internal_memset(tab_, 0, sizeof(*tab_)); atomic_store_relaxed(ref_ptr(tab_), 1); size_ = 1; } else if (size_ > blocks_ * ClockBlock::kClockCount) { u32 idx = ctx->clock_alloc.Alloc(c); ClockBlock *new_cb = ctx->clock_alloc.Map(idx); uptr top = size_ - blocks_ * ClockBlock::kClockCount; CHECK_LT(top, ClockBlock::kClockCount); const uptr move = top * sizeof(tab_->clock[0]); internal_memcpy(&new_cb->clock[0], tab_->clock, move); internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move); internal_memset(tab_->clock, 0, move); append_block(idx); } // At this point we have first level table allocated and all clock elements // are evacuated from it to a second level block. // Add second level tables as necessary. while (nclk > capacity()) { u32 idx = ctx->clock_alloc.Alloc(c); ClockBlock *cb = ctx->clock_alloc.Map(idx); internal_memset(cb, 0, sizeof(*cb)); append_block(idx); } size_ = nclk; } // Flushes all dirty elements into the main clock array. void SyncClock::FlushDirty() { for (unsigned i = 0; i < kDirtyTids; i++) { Dirty *dirty = &dirty_[i]; if (dirty->tid != kInvalidTid) { CHECK_LT(dirty->tid, size_); elem(dirty->tid).epoch = dirty->epoch; dirty->tid = kInvalidTid; } } } bool SyncClock::IsShared() const { if (size_ == 0) return false; atomic_uint32_t *ref = ref_ptr(tab_); u32 v = atomic_load(ref, memory_order_acquire); CHECK_GT(v, 0); return v > 1; } // Unshares the current clock if it's shared. // Shared clocks are immutable, so they need to be unshared before any updates. // Note: this does not apply to dirty entries as they are not shared. void SyncClock::Unshare(ClockCache *c) { if (!IsShared()) return; // First, copy current state into old. SyncClock old; old.tab_ = tab_; old.tab_idx_ = tab_idx_; old.size_ = size_; old.blocks_ = blocks_; old.release_store_tid_ = release_store_tid_; old.release_store_reused_ = release_store_reused_; for (unsigned i = 0; i < kDirtyTids; i++) old.dirty_[i] = dirty_[i]; // Then, clear current object. ResetImpl(); // Allocate brand new clock in the current object. Resize(c, old.size_); // Now copy state back into this object. Iter old_iter(&old); for (ClockElem &ce : *this) { ce = *old_iter; ++old_iter; } release_store_tid_ = old.release_store_tid_; release_store_reused_ = old.release_store_reused_; for (unsigned i = 0; i < kDirtyTids; i++) dirty_[i] = old.dirty_[i]; // Drop reference to old and delete if necessary. old.Reset(c); } // Can we cache this clock for future release operations? ALWAYS_INLINE bool SyncClock::Cachable() const { if (size_ == 0) return false; for (unsigned i = 0; i < kDirtyTids; i++) { if (dirty_[i].tid != kInvalidTid) return false; } return atomic_load_relaxed(ref_ptr(tab_)) == 1; } // elem linearizes the two-level structure into linear array. // Note: this is used only for one time accesses, vector operations use // the iterator as it is much faster. ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const { DCHECK_LT(tid, size_); const uptr block = tid / ClockBlock::kClockCount; DCHECK_LE(block, blocks_); tid %= ClockBlock::kClockCount; if (block == blocks_) return tab_->clock[tid]; u32 idx = get_block(block); ClockBlock *cb = ctx->clock_alloc.Map(idx); return cb->clock[tid]; } ALWAYS_INLINE uptr SyncClock::capacity() const { if (size_ == 0) return 0; uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]); // How many clock elements we can fit into the first level block. // +1 for ref counter. uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio; return blocks_ * ClockBlock::kClockCount + top; } ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const { DCHECK(size_); DCHECK_LT(bi, blocks_); return tab_->table[ClockBlock::kBlockIdx - bi]; } ALWAYS_INLINE void SyncClock::append_block(u32 idx) { uptr bi = blocks_++; CHECK_EQ(get_block(bi), 0); tab_->table[ClockBlock::kBlockIdx - bi] = idx; } // Used only by tests. u64 SyncClock::get(unsigned tid) const { for (unsigned i = 0; i < kDirtyTids; i++) { Dirty dirty = dirty_[i]; if (dirty.tid == tid) return dirty.epoch; } return elem(tid).epoch; } // Used only by Iter test. u64 SyncClock::get_clean(unsigned tid) const { return elem(tid).epoch; } void SyncClock::DebugDump(int(*printf)(const char *s, ...)) { printf("clock=["); for (uptr i = 0; i < size_; i++) printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch); printf("] reused=["); for (uptr i = 0; i < size_; i++) printf("%s%llu", i == 0 ? "" : ",", elem(i).reused); printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]", release_store_tid_, release_store_reused_, dirty_[0].tid, dirty_[0].epoch, dirty_[1].tid, dirty_[1].epoch); } void SyncClock::Iter::Next() { // Finished with the current block, move on to the next one. block_++; if (block_ < parent_->blocks_) { // Iterate over the next second level block. u32 idx = parent_->get_block(block_); ClockBlock *cb = ctx->clock_alloc.Map(idx); pos_ = &cb->clock[0]; end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount, ClockBlock::kClockCount); return; } if (block_ == parent_->blocks_ && parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) { // Iterate over elements in the first level block. pos_ = &parent_->tab_->clock[0]; end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount, ClockBlock::kClockCount); return; } parent_ = nullptr; // denotes end } } // namespace __tsan