summaryrefslogtreecommitdiff
path: root/chromium/base/threading/hang_watcher.cc
blob: 9a4cddea653d4f577fd1f9bd008967fac283e1bc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/threading/hang_watcher.h"

#include <algorithm>
#include <atomic>
#include <utility>

#include "base/bind.h"
#include "base/callback_helpers.h"
#include "base/debug/alias.h"
#include "base/debug/crash_logging.h"
#include "base/debug/dump_without_crashing.h"
#include "base/feature_list.h"
#include "base/no_destructor.h"
#include "base/strings/string_number_conversions.h"
#include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h"
#include "base/threading/platform_thread.h"
#include "base/threading/thread_checker.h"
#include "base/threading/thread_restrictions.h"
#include "base/time/default_tick_clock.h"
#include "base/time/time.h"
#include "build/build_config.h"

namespace base {

// static
constexpr base::Feature HangWatcher::kEnableHangWatcher{
    "EnableHangWatcher", base::FEATURE_DISABLED_BY_DEFAULT};

constexpr base::TimeDelta HangWatchScope::kDefaultHangWatchTime =
    base::TimeDelta::FromSeconds(10);

namespace {
HangWatcher* g_instance = nullptr;
}

constexpr const char* kThreadName = "HangWatcher";

// The time that the HangWatcher thread will sleep for between calls to
// Monitor(). Increasing or decreasing this does not modify the type of hangs
// that can be detected. It instead increases the probability that a call to
// Monitor() will happen at the right time to catch a hang. This has to be
// balanced with power/cpu use concerns as busy looping would catch amost all
// hangs but present unacceptable overhead.
const base::TimeDelta kMonitoringPeriod = base::TimeDelta::FromSeconds(10);

HangWatchScope::HangWatchScope(TimeDelta timeout) {
  internal::HangWatchState* current_hang_watch_state =
      internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();

  DCHECK(timeout >= base::TimeDelta()) << "Negative timeouts are invalid.";

  // TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are
  // monitored. Thread is not monitored, noop.
  if (!current_hang_watch_state) {
    return;
  }

  DCHECK(current_hang_watch_state)
      << "A scope can only be used on a thread that "
         "registered for hang watching with HangWatcher::RegisterThread.";

#if DCHECK_IS_ON()
  previous_scope_ = current_hang_watch_state->GetCurrentHangWatchScope();
  current_hang_watch_state->SetCurrentHangWatchScope(this);
#endif

  // TODO(crbug.com/1034046): Check whether we are over deadline already for the
  // previous scope here by issuing only one TimeTicks::Now() and resuing the
  // value.

  previous_deadline_ = current_hang_watch_state->GetDeadline();
  TimeTicks deadline = TimeTicks::Now() + timeout;
  current_hang_watch_state->SetDeadline(deadline);
}

HangWatchScope::~HangWatchScope() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);

  internal::HangWatchState* current_hang_watch_state =
      internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();

  // TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are
  // monitored. Thread is not monitored, noop.
  if (!current_hang_watch_state) {
    return;
  }

  // If a hang is currently being captured we should block here so execution
  // stops and the relevant stack frames are recorded.
  base::HangWatcher::GetInstance()->BlockIfCaptureInProgress();

#if DCHECK_IS_ON()
  // Verify that no Scope was destructed out of order.
  DCHECK_EQ(this, current_hang_watch_state->GetCurrentHangWatchScope());
  current_hang_watch_state->SetCurrentHangWatchScope(previous_scope_);
#endif

  // Reset the deadline to the value it had before entering this scope.
  current_hang_watch_state->SetDeadline(previous_deadline_);
  // TODO(crbug.com/1034046): Log when a HangWatchScope exits after its deadline
  // and that went undetected by the HangWatcher.
}

HangWatcher::HangWatcher()
    : monitor_period_(kMonitoringPeriod),
      should_monitor_(WaitableEvent::ResetPolicy::AUTOMATIC),
      thread_(this, kThreadName),
      tick_clock_(base::DefaultTickClock::GetInstance()) {
  // |thread_checker_| should not be bound to the constructing thread.
  DETACH_FROM_THREAD(hang_watcher_thread_checker_);

  should_monitor_.declare_only_used_while_idle();

  DCHECK(!g_instance);
  g_instance = this;
  Start();
}

HangWatcher::~HangWatcher() {
  DCHECK_EQ(g_instance, this);
  DCHECK(watch_states_.empty());
  g_instance = nullptr;
  Stop();
}

void HangWatcher::Start() {
  thread_.Start();
}

void HangWatcher::Stop() {
  keep_monitoring_.store(false, std::memory_order_relaxed);
  should_monitor_.Signal();
  thread_.Join();
}

bool HangWatcher::IsWatchListEmpty() {
  AutoLock auto_lock(watch_state_lock_);
  return watch_states_.empty();
}

void HangWatcher::Wait() {
  while (true) {
    // Amount by which the actual time spent sleeping can deviate from
    // the target time and still be considered timely.
    constexpr base::TimeDelta wait_drift_tolerance =
        base::TimeDelta::FromMilliseconds(100);

    base::TimeTicks time_before_wait = tick_clock_->NowTicks();

    // Sleep until next scheduled monitoring or until signaled.
    bool was_signaled = should_monitor_.TimedWait(monitor_period_);

    if (after_wait_callback_) {
      after_wait_callback_.Run(time_before_wait);
    }

    base::TimeTicks time_after_wait = tick_clock_->NowTicks();
    base::TimeDelta wait_time = time_after_wait - time_before_wait;
    bool wait_was_normal =
        wait_time <= (monitor_period_ + wait_drift_tolerance);

    if (!wait_was_normal) {
      // If the time spent waiting was too high it might indicate the machine is
      // very slow or that that it went to sleep. In any case we can't trust the
      // hang watch scopes that are currently live. Update the ignore threshold
      // to make sure they don't trigger a hang on subsequent monitors then keep
      // waiting.

      base::AutoLock auto_lock(watch_state_lock_);

      // Find the latest deadline among the live watch states. They might change
      // atomically while iterating but that's fine because if they do that
      // means the new HangWatchScope was constructed very soon after the
      // abnormal sleep happened and might be affected by the root cause still.
      // Ignoring it is cautious and harmless.
      base::TimeTicks latest_deadline;
      for (const auto& state : watch_states_) {
        base::TimeTicks deadline = state->GetDeadline();
        if (deadline > latest_deadline) {
          latest_deadline = deadline;
        }
      }

      deadline_ignore_threshold_ = latest_deadline;
    }

    // Stop waiting.
    if (wait_was_normal || was_signaled) {
      return;
    }
  }
}

void HangWatcher::Run() {
  // Monitor() should only run on |thread_|. Bind |thread_checker_| here to make
  // sure of that.
  DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);

  while (keep_monitoring_.load(std::memory_order_relaxed)) {
    // If there is nothing to watch sleep until there is.
    if (IsWatchListEmpty()) {
      should_monitor_.Wait();
    } else {
      Monitor();

      if (after_monitor_closure_for_testing_) {
        after_monitor_closure_for_testing_.Run();
      }
    }

    if (keep_monitoring_.load(std::memory_order_relaxed)) {
      Wait();
    }
  }
}

// static
HangWatcher* HangWatcher::GetInstance() {
  return g_instance;
}

// static
void HangWatcher::RecordHang() {
  base::debug::DumpWithoutCrashing();
  // Inhibit code folding.
  const int line_number = __LINE__;
  base::debug::Alias(&line_number);
}

ScopedClosureRunner HangWatcher::RegisterThread() {
  AutoLock auto_lock(watch_state_lock_);

  watch_states_.push_back(
      internal::HangWatchState::CreateHangWatchStateForCurrentThread());

  // Now that there is a thread to monitor we wake the HangWatcher thread.
  if (watch_states_.size() == 1) {
    should_monitor_.Signal();
  }

  return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread,
                                      Unretained(HangWatcher::GetInstance())));
}

base::TimeTicks HangWatcher::WatchStateSnapShot::GetHighestDeadline() const {
  DCHECK(!hung_watch_state_copies_.empty());
  // Since entries are sorted in increasing order the last entry is the largest
  // one.
  return hung_watch_state_copies_.back().deadline;
}

HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
    const HangWatchStates& watch_states,
    base::TimeTicks snapshot_time,
    base::TimeTicks deadline_ignore_threshold)
    : snapshot_time_(snapshot_time) {
  // Initial copy of the values.
  for (const auto& watch_state : watch_states) {
    base::TimeTicks deadline = watch_state.get()->GetDeadline();

    if (deadline <= deadline_ignore_threshold) {
      hung_watch_state_copies_.clear();
      return;
    }

    // Only copy hung threads.
    if (deadline <= snapshot_time) {
      hung_watch_state_copies_.push_back(
          WatchStateCopy{deadline, watch_state.get()->GetThreadID()});
    }
  }

  // Sort |hung_watch_state_copies_| by order of decreasing hang severity so the
  // most severe hang is first in the list.
  std::sort(hung_watch_state_copies_.begin(), hung_watch_state_copies_.end(),
            [](const WatchStateCopy& lhs, const WatchStateCopy& rhs) {
              return lhs.deadline < rhs.deadline;
            });
}

HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
    const WatchStateSnapShot& other) = default;

HangWatcher::WatchStateSnapShot::~WatchStateSnapShot() = default;

std::string HangWatcher::WatchStateSnapShot::PrepareHungThreadListCrashKey()
    const {
  // Build a crash key string that contains the ids of the hung threads.
  constexpr char kSeparator{'|'};
  std::string list_of_hung_thread_ids;

  // Add as many thread ids to the crash key as possible.
  for (const WatchStateCopy& copy : hung_watch_state_copies_) {
    std::string fragment = base::NumberToString(copy.thread_id) + kSeparator;
    if (list_of_hung_thread_ids.size() + fragment.size() <
        static_cast<std::size_t>(debug::CrashKeySize::Size256)) {
      list_of_hung_thread_ids += fragment;
    } else {
      // Respect the by priority ordering of thread ids in the crash key by
      // stopping the construction as soon as one does not fit. This avoids
      // including lesser priority ids while omitting more important ones.
      break;
    }
  }

  return list_of_hung_thread_ids;
}

HangWatcher::WatchStateSnapShot HangWatcher::GrabWatchStateSnapshotForTesting()
    const {
  WatchStateSnapShot snapshot(watch_states_, base::TimeTicks::Now(),
                              deadline_ignore_threshold_);
  return snapshot;
}

void HangWatcher::Monitor() {
  DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
  AutoLock auto_lock(watch_state_lock_);

  // If all threads unregistered since this function was invoked there's
  // nothing to do anymore.
  if (watch_states_.empty())
    return;

  const base::TimeTicks now = base::TimeTicks::Now();

  // See if any thread hung. We're holding |watch_state_lock_| so threads
  // can't register or unregister but their deadline still can change
  // atomically. This is fine. Detecting a hang is generally best effort and
  // if a thread resumes from hang in the time it takes to move on to
  // capturing then its ID will be absent from the crash keys.
  bool any_thread_hung = std::any_of(
      watch_states_.cbegin(), watch_states_.cend(),
      [this, now](const std::unique_ptr<internal::HangWatchState>& state) {
        base::TimeTicks deadline = state->GetDeadline();
        return deadline > deadline_ignore_threshold_ && deadline < now;
      });

  // If at least a thread is hung we need to capture.
  if (any_thread_hung)
    CaptureHang(now);
}

void HangWatcher::CaptureHang(base::TimeTicks capture_time) {
  capture_in_progress.store(true, std::memory_order_relaxed);
  base::AutoLock scope_lock(capture_lock_);

  WatchStateSnapShot watch_state_snapshot(watch_states_, capture_time,
                                          deadline_ignore_threshold_);

  // The hung thread(s) could detected at the start of Monitor() could have
  // moved on from their scopes. If that happened and there are no more hung
  // threads then abort capture.
  std::string list_of_hung_thread_ids =
      watch_state_snapshot.PrepareHungThreadListCrashKey();
  if (list_of_hung_thread_ids.empty())
    return;

#if not defined(OS_NACL)
  static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
      "list-of-hung-threads", debug::CrashKeySize::Size256);
  debug::ScopedCrashKeyString list_of_hung_threads_crash_key_string(
      crash_key, list_of_hung_thread_ids);
#endif

  // To avoid capturing more than one hang that blames a subset of the same
  // threads it's necessary to keep track of what is the furthest deadline
  // that contributed to declaring a hang. Only once
  // all threads have deadlines past this point can we be sure that a newly
  // discovered hang is not directly related.
  // Example:
  // **********************************************************************
  // Timeline A : L------1-------2----------3-------4----------N-----------
  // Timeline B : -------2----------3-------4----------L----5------N-------
  // Timeline C : L----------------------------5------6----7---8------9---N
  // **********************************************************************
  // In the example when a Monitor() happens during timeline A
  // |deadline_ignore_threshold_| (L) is at time zero and deadlines (1-4)
  // are before Now() (N) . A hang is captured and L is updated. During
  // the next Monitor() (timeline B) a new deadline is over but we can't
  // capture a hang because deadlines 2-4 are still live and already counted
  // toward a hang. During a third monitor (timeline C) all live deadlines
  // are now after L and a second hang can be recorded.
  base::TimeTicks latest_expired_deadline =
      watch_state_snapshot.GetHighestDeadline();

  if (on_hang_closure_for_testing_)
    on_hang_closure_for_testing_.Run();
  else
    RecordHang();

  // Update after running the actual capture.
  deadline_ignore_threshold_ = latest_expired_deadline;

  capture_in_progress.store(false, std::memory_order_relaxed);
}

void HangWatcher::SetAfterMonitorClosureForTesting(
    base::RepeatingClosure closure) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  after_monitor_closure_for_testing_ = std::move(closure);
}

void HangWatcher::SetOnHangClosureForTesting(base::RepeatingClosure closure) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  on_hang_closure_for_testing_ = std::move(closure);
}

void HangWatcher::SetMonitoringPeriodForTesting(base::TimeDelta period) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  monitor_period_ = period;
}

void HangWatcher::SetAfterWaitCallbackForTesting(
    RepeatingCallback<void(TimeTicks)> callback) {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  after_wait_callback_ = callback;
}

void HangWatcher::SignalMonitorEventForTesting() {
  DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
  should_monitor_.Signal();
}

void HangWatcher::StopMonitoringForTesting() {
  keep_monitoring_.store(false, std::memory_order_relaxed);
}

void HangWatcher::SetTickClockForTesting(const base::TickClock* tick_clock) {
  tick_clock_ = tick_clock;
}

void HangWatcher::BlockIfCaptureInProgress() {
  // Makes a best-effort attempt to block execution if a hang is currently being
  // captured.Only block on |capture_lock| if |capture_in_progress| hints that
  // it's already held to avoid serializing all threads on this function when no
  // hang capture is in-progress.
  if (capture_in_progress.load(std::memory_order_relaxed)) {
    base::AutoLock hang_lock(capture_lock_);
  }
}

void HangWatcher::UnregisterThread() {
  AutoLock auto_lock(watch_state_lock_);

  internal::HangWatchState* current_hang_watch_state =
      internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();

  auto it =
      std::find_if(watch_states_.cbegin(), watch_states_.cend(),
                   [current_hang_watch_state](
                       const std::unique_ptr<internal::HangWatchState>& state) {
                     return state.get() == current_hang_watch_state;
                   });

  // Thread should be registered to get unregistered.
  DCHECK(it != watch_states_.end());

  watch_states_.erase(it);
}

namespace internal {

// |deadline_| starts at Max() to avoid validation problems
// when setting the first legitimate value.
HangWatchState::HangWatchState() : thread_id_(PlatformThread::CurrentId()) {
  // There should not exist a state object for this thread already.
  DCHECK(!GetHangWatchStateForCurrentThread()->Get());

  // Bind the new instance to this thread.
  GetHangWatchStateForCurrentThread()->Set(this);
}

HangWatchState::~HangWatchState() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);

  DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), this);
  GetHangWatchStateForCurrentThread()->Set(nullptr);

#if DCHECK_IS_ON()
  // Destroying the HangWatchState should not be done if there are live
  // HangWatchScopes.
  DCHECK(!current_hang_watch_scope_);
#endif
}

// static
std::unique_ptr<HangWatchState>
HangWatchState::CreateHangWatchStateForCurrentThread() {

  // Allocate a watch state object for this thread.
  std::unique_ptr<HangWatchState> hang_state =
      std::make_unique<HangWatchState>();

  // Setting the thread local worked.
  DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), hang_state.get());

  // Transfer ownership to caller.
  return hang_state;
}

TimeTicks HangWatchState::GetDeadline() const {
  return deadline_.load(std::memory_order_relaxed);
}

void HangWatchState::SetDeadline(TimeTicks deadline) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  deadline_.store(deadline, std::memory_order_relaxed);
}

bool HangWatchState::IsOverDeadline() const {
  return TimeTicks::Now() > deadline_.load(std::memory_order_relaxed);
}

#if DCHECK_IS_ON()
void HangWatchState::SetCurrentHangWatchScope(HangWatchScope* scope) {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  current_hang_watch_scope_ = scope;
}

HangWatchScope* HangWatchState::GetCurrentHangWatchScope() {
  DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
  return current_hang_watch_scope_;
}
#endif

// static
ThreadLocalPointer<HangWatchState>*
HangWatchState::GetHangWatchStateForCurrentThread() {
  static NoDestructor<ThreadLocalPointer<HangWatchState>> hang_watch_state;
  return hang_watch_state.get();
}

PlatformThreadId HangWatchState::GetThreadID() const {
  return thread_id_;
}

}  // namespace internal

}  // namespace base