thread_watcher.cc revision f2477e01787aa58f445919b809d89e252beef54f
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/metrics/thread_watcher.h"
6
7#include <math.h>  // ceil
8
9#include "base/bind.h"
10#include "base/compiler_specific.h"
11#include "base/debug/alias.h"
12#include "base/lazy_instance.h"
13#include "base/strings/string_number_conversions.h"
14#include "base/strings/string_split.h"
15#include "base/strings/string_tokenizer.h"
16#include "base/strings/stringprintf.h"
17#include "base/threading/thread_restrictions.h"
18#include "build/build_config.h"
19#include "chrome/browser/metrics/metrics_service.h"
20#include "chrome/common/chrome_switches.h"
21#include "chrome/common/chrome_version_info.h"
22#include "chrome/common/dump_without_crashing.h"
23#include "chrome/common/logging_chrome.h"
24
25#if defined(OS_WIN)
26#include "base/win/windows_version.h"
27#endif
28
29using content::BrowserThread;
30
31namespace {
32
33// The following are unique function names for forcing the crash when a thread
34// is unresponsive. This makes it possible to tell from the callstack alone what
35// thread was unresponsive.
36//
37// We disable optimizations for this block of functions so the compiler doesn't
38// merge them all together.
39MSVC_DISABLE_OPTIMIZE()
40MSVC_PUSH_DISABLE_WARNING(4748)
41
42#ifndef NDEBUG
// Hands back a null int pointer; NullPointerCrash() writes through it to force
// a crash.
int* NullPointer() {
  int* const null_address = reinterpret_cast<int*>(NULL);
  return null_address;
}
46#endif
47
48void NullPointerCrash(int line_number) {
49#ifndef NDEBUG
50  *NullPointer() = line_number;  // Crash.
51#else
52  logging::DumpWithoutCrashing();
53#endif
54}
55
56NOINLINE void ShutdownCrash() {
57  NullPointerCrash(__LINE__);
58}
59
60NOINLINE void ThreadUnresponsive_UI() {
61  NullPointerCrash(__LINE__);
62}
63
64NOINLINE void ThreadUnresponsive_DB() {
65  NullPointerCrash(__LINE__);
66}
67
68NOINLINE void ThreadUnresponsive_FILE() {
69  NullPointerCrash(__LINE__);
70}
71
72NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
73  NullPointerCrash(__LINE__);
74}
75
76NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
77  NullPointerCrash(__LINE__);
78}
79
80NOINLINE void ThreadUnresponsive_CACHE() {
81  NullPointerCrash(__LINE__);
82}
83
84NOINLINE void ThreadUnresponsive_IO() {
85  NullPointerCrash(__LINE__);
86}
87
88MSVC_POP_WARNING()
89MSVC_ENABLE_OPTIMIZE();
90
91void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
92  base::debug::Alias(&thread_id);
93
94  switch (thread_id) {
95    case BrowserThread::UI:
96      return ThreadUnresponsive_UI();
97    case BrowserThread::DB:
98      return ThreadUnresponsive_DB();
99    case BrowserThread::FILE:
100      return ThreadUnresponsive_FILE();
101    case BrowserThread::FILE_USER_BLOCKING:
102      return ThreadUnresponsive_FILE_USER_BLOCKING();
103    case BrowserThread::PROCESS_LAUNCHER:
104      return ThreadUnresponsive_PROCESS_LAUNCHER();
105    case BrowserThread::CACHE:
106      return ThreadUnresponsive_CACHE();
107    case BrowserThread::IO:
108      return ThreadUnresponsive_IO();
109    case BrowserThread::ID_COUNT:
110      CHECK(false);  // This shouldn't actually be reached!
111      break;
112
113    // Omission of the default hander is intentional -- that way the compiler
114    // should warn if our switch becomes outdated.
115  }
116
117  CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
118}
119
120}  // namespace
121
122// ThreadWatcher methods and members.
123ThreadWatcher::ThreadWatcher(const WatchingParams& params)
124    : thread_id_(params.thread_id),
125      thread_name_(params.thread_name),
126      watched_loop_(
127          BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
128      sleep_time_(params.sleep_time),
129      unresponsive_time_(params.unresponsive_time),
130      ping_time_(base::TimeTicks::Now()),
131      pong_time_(ping_time_),
132      ping_sequence_number_(0),
133      active_(false),
134      ping_count_(params.unresponsive_threshold),
135      response_time_histogram_(NULL),
136      unresponsive_time_histogram_(NULL),
137      unresponsive_count_(0),
138      hung_processing_complete_(false),
139      unresponsive_threshold_(params.unresponsive_threshold),
140      crash_on_hang_(params.crash_on_hang),
141      live_threads_threshold_(params.live_threads_threshold),
142      weak_ptr_factory_(this) {
143  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
144  Initialize();
145}
146
147ThreadWatcher::~ThreadWatcher() {}
148
149// static
150void ThreadWatcher::StartWatching(const WatchingParams& params) {
151  DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
152  DCHECK_GE(params.unresponsive_time.InMilliseconds(),
153            params.sleep_time.InMilliseconds());
154
155  // If we are not on WatchDogThread, then post a task to call StartWatching on
156  // WatchDogThread.
157  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
158    WatchDogThread::PostTask(
159        FROM_HERE,
160        base::Bind(&ThreadWatcher::StartWatching, params));
161    return;
162  }
163
164  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
165
166  // Create a new thread watcher object for the given thread and activate it.
167  ThreadWatcher* watcher = new ThreadWatcher(params);
168
169  DCHECK(watcher);
170  // If we couldn't register the thread watcher object, we are shutting down,
171  // then don't activate thread watching.
172  if (!ThreadWatcherList::IsRegistered(params.thread_id))
173    return;
174  watcher->ActivateThreadWatching();
175}
176
177void ThreadWatcher::ActivateThreadWatching() {
178  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
179  if (active_) return;
180  active_ = true;
181  ping_count_ = unresponsive_threshold_;
182  ResetHangCounters();
183  base::MessageLoop::current()->PostTask(
184      FROM_HERE,
185      base::Bind(&ThreadWatcher::PostPingMessage,
186                 weak_ptr_factory_.GetWeakPtr()));
187}
188
189void ThreadWatcher::DeActivateThreadWatching() {
190  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
191  active_ = false;
192  ping_count_ = 0;
193  weak_ptr_factory_.InvalidateWeakPtrs();
194}
195
196void ThreadWatcher::WakeUp() {
197  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
198  // There is some user activity, PostPingMessage task of thread watcher if
199  // needed.
200  if (!active_) return;
201
202  // Throw away the previous |unresponsive_count_| and start over again. Just
203  // before going to sleep, |unresponsive_count_| could be very close to
204  // |unresponsive_threshold_| and when user becomes active,
205  // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
206  // response for ping messages. Reset |unresponsive_count_| to start measuring
207  // the unresponsiveness of the threads when system becomes active.
208  unresponsive_count_ = 0;
209
210  if (ping_count_ <= 0) {
211    ping_count_ = unresponsive_threshold_;
212    ResetHangCounters();
213    PostPingMessage();
214  } else {
215    ping_count_ = unresponsive_threshold_;
216  }
217}
218
219void ThreadWatcher::PostPingMessage() {
220  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
221  // If we have stopped watching or if the user is idle, then stop sending
222  // ping messages.
223  if (!active_ || ping_count_ <= 0)
224    return;
225
226  // Save the current time when we have sent ping message.
227  ping_time_ = base::TimeTicks::Now();
228
229  // Send a ping message to the watched thread. Callback will be called on
230  // the WatchDogThread.
231  base::Closure callback(
232      base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
233                 ping_sequence_number_));
234  if (watched_loop_->PostTask(
235          FROM_HERE,
236          base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
237                     callback))) {
238      // Post a task to check the responsiveness of watched thread.
239      base::MessageLoop::current()->PostDelayedTask(
240          FROM_HERE,
241          base::Bind(&ThreadWatcher::OnCheckResponsiveness,
242                     weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
243          unresponsive_time_);
244  } else {
245    // Watched thread might have gone away, stop watching it.
246    DeActivateThreadWatching();
247  }
248}
249
250void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
251  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
252
253  // Record watched thread's response time.
254  base::TimeTicks now = base::TimeTicks::Now();
255  base::TimeDelta response_time = now - ping_time_;
256  response_time_histogram_->AddTime(response_time);
257
258  // Save the current time when we have got pong message.
259  pong_time_ = now;
260
261  // Check if there are any extra pings in flight.
262  DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
263  if (ping_sequence_number_ != ping_sequence_number)
264    return;
265
266  // Increment sequence number for the next ping message to indicate watched
267  // thread is responsive.
268  ++ping_sequence_number_;
269
270  // If we have stopped watching or if the user is idle, then stop sending
271  // ping messages.
272  if (!active_ || --ping_count_ <= 0)
273    return;
274
275  base::MessageLoop::current()->PostDelayedTask(
276      FROM_HERE,
277      base::Bind(&ThreadWatcher::PostPingMessage,
278                 weak_ptr_factory_.GetWeakPtr()),
279      sleep_time_);
280}
281
282void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
283  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
284  // If we have stopped watching then consider thread as responding.
285  if (!active_) {
286    responsive_ = true;
287    return;
288  }
289  // If the latest ping_sequence_number_ is not same as the ping_sequence_number
290  // that is passed in, then we can assume OnPongMessage was called.
291  // OnPongMessage increments ping_sequence_number_.
292  if (ping_sequence_number_ != ping_sequence_number) {
293    // Reset unresponsive_count_ to zero because we got a response from the
294    // watched thread.
295    ResetHangCounters();
296
297    responsive_ = true;
298    return;
299  }
300  // Record that we got no response from watched thread.
301  GotNoResponse();
302
303  // Post a task to check the responsiveness of watched thread.
304  base::MessageLoop::current()->PostDelayedTask(
305      FROM_HERE,
306      base::Bind(&ThreadWatcher::OnCheckResponsiveness,
307                 weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
308      unresponsive_time_);
309  responsive_ = false;
310}
311
312void ThreadWatcher::Initialize() {
313  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
314  ThreadWatcherList::Register(this);
315
316  const std::string response_time_histogram_name =
317      "ThreadWatcher.ResponseTime." + thread_name_;
318  response_time_histogram_ = base::Histogram::FactoryTimeGet(
319      response_time_histogram_name,
320      base::TimeDelta::FromMilliseconds(1),
321      base::TimeDelta::FromSeconds(100), 50,
322      base::Histogram::kUmaTargetedHistogramFlag);
323
324  const std::string unresponsive_time_histogram_name =
325      "ThreadWatcher.Unresponsive." + thread_name_;
326  unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
327      unresponsive_time_histogram_name,
328      base::TimeDelta::FromMilliseconds(1),
329      base::TimeDelta::FromSeconds(100), 50,
330      base::Histogram::kUmaTargetedHistogramFlag);
331
332  const std::string responsive_count_histogram_name =
333      "ThreadWatcher.ResponsiveThreads." + thread_name_;
334  responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
335      responsive_count_histogram_name, 1, 10, 11,
336      base::Histogram::kUmaTargetedHistogramFlag);
337
338  const std::string unresponsive_count_histogram_name =
339      "ThreadWatcher.UnresponsiveThreads." + thread_name_;
340  unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
341      unresponsive_count_histogram_name, 1, 10, 11,
342      base::Histogram::kUmaTargetedHistogramFlag);
343}
344
345// static
346void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
347                                  const base::Closure& callback_task) {
348  // This method is called on watched thread.
349  DCHECK(BrowserThread::CurrentlyOn(thread_id));
350  WatchDogThread::PostTask(FROM_HERE, callback_task);
351}
352
353void ThreadWatcher::ResetHangCounters() {
354  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
355  unresponsive_count_ = 0;
356  hung_processing_complete_ = false;
357}
358
359void ThreadWatcher::GotNoResponse() {
360  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
361
362  ++unresponsive_count_;
363  if (!IsVeryUnresponsive())
364    return;
365
366  // Record total unresponsive_time since last pong message.
367  base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
368  unresponsive_time_histogram_->AddTime(unresponse_time);
369
370  // We have already collected stats for the non-responding watched thread.
371  if (hung_processing_complete_)
372    return;
373
374  // Record how other threads are responding.
375  uint32 responding_thread_count = 0;
376  uint32 unresponding_thread_count = 0;
377  ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
378                                        &unresponding_thread_count);
379
380  // Record how many watched threads are responding.
381  responsive_count_histogram_->Add(responding_thread_count);
382
383  // Record how many watched threads are not responding.
384  unresponsive_count_histogram_->Add(unresponding_thread_count);
385
386  // Crash the browser if the watched thread is to be crashed on hang and if the
387  // number of other threads responding is less than or equal to
388  // live_threads_threshold_ and at least one other thread is responding.
389  if (crash_on_hang_ &&
390      responding_thread_count > 0 &&
391      responding_thread_count <= live_threads_threshold_) {
392    static bool crashed_once = false;
393    if (!crashed_once) {
394      crashed_once = true;
395      CrashBecauseThreadWasUnresponsive(thread_id_);
396    }
397  }
398
399  hung_processing_complete_ = true;
400}
401
402bool ThreadWatcher::IsVeryUnresponsive() {
403  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
404  return unresponsive_count_ >= unresponsive_threshold_;
405}
406
407// ThreadWatcherList methods and members.
408//
409// static
410ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
411// static
412const int ThreadWatcherList::kSleepSeconds = 1;
413// static
414const int ThreadWatcherList::kUnresponsiveSeconds = 2;
415// static
416const int ThreadWatcherList::kUnresponsiveCount = 9;
417// static
418const int ThreadWatcherList::kLiveThreadsThreshold = 2;
419
420ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
421    uint32 live_threads_threshold,
422    uint32 unresponsive_threshold)
423    : live_threads_threshold(live_threads_threshold),
424      unresponsive_threshold(unresponsive_threshold) {
425}
426
427ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
428    : live_threads_threshold(kLiveThreadsThreshold),
429      unresponsive_threshold(kUnresponsiveCount) {
430}
431
432// static
433void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
434  // TODO(rtenneti): Enable ThreadWatcher.
435  uint32 unresponsive_threshold;
436  CrashOnHangThreadMap crash_on_hang_threads;
437  ParseCommandLine(command_line,
438                   &unresponsive_threshold,
439                   &crash_on_hang_threads);
440
441  ThreadWatcherObserver::SetupNotifications(
442      base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
443
444  WatchDogThread::PostDelayedTask(
445      FROM_HERE,
446      base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
447                 unresponsive_threshold,
448                 crash_on_hang_threads),
449      base::TimeDelta::FromSeconds(120));
450}
451
452// static
453void ThreadWatcherList::StopWatchingAll() {
454  // TODO(rtenneti): Enable ThreadWatcher.
455  ThreadWatcherObserver::RemoveNotifications();
456  DeleteAll();
457}
458
459// static
460void ThreadWatcherList::Register(ThreadWatcher* watcher) {
461  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
462  if (!g_thread_watcher_list_)
463    return;
464  DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
465  g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
466}
467
468// static
469bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
470  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
471  return NULL != ThreadWatcherList::Find(thread_id);
472}
473
474// static
475void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
476                                           uint32* unresponding_thread_count) {
477  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
478  *responding_thread_count = 0;
479  *unresponding_thread_count = 0;
480  if (!g_thread_watcher_list_)
481    return;
482
483  for (RegistrationList::iterator it =
484           g_thread_watcher_list_->registered_.begin();
485       g_thread_watcher_list_->registered_.end() != it;
486       ++it) {
487    if (it->second->IsVeryUnresponsive())
488      ++(*unresponding_thread_count);
489    else
490      ++(*responding_thread_count);
491  }
492}
493
494// static
495void ThreadWatcherList::WakeUpAll() {
496  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
497  if (!g_thread_watcher_list_)
498    return;
499
500  for (RegistrationList::iterator it =
501           g_thread_watcher_list_->registered_.begin();
502       g_thread_watcher_list_->registered_.end() != it;
503       ++it)
504    it->second->WakeUp();
505}
506
507ThreadWatcherList::ThreadWatcherList() {
508  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
509  CHECK(!g_thread_watcher_list_);
510  g_thread_watcher_list_ = this;
511}
512
513ThreadWatcherList::~ThreadWatcherList() {
514  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
515  DCHECK(this == g_thread_watcher_list_);
516  g_thread_watcher_list_ = NULL;
517}
518
519// static
520void ThreadWatcherList::ParseCommandLine(
521    const CommandLine& command_line,
522    uint32* unresponsive_threshold,
523    CrashOnHangThreadMap* crash_on_hang_threads) {
524  // Initialize |unresponsive_threshold| to a default value.
525  *unresponsive_threshold = kUnresponsiveCount;
526
527  // Increase the unresponsive_threshold on the Stable and Beta channels to
528  // reduce the number of crashes due to ThreadWatcher.
529  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
530  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
531    *unresponsive_threshold *= 4;
532  } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
533    *unresponsive_threshold *= 2;
534  }
535
536#if defined(OS_WIN)
537  // For Windows XP (old systems), double the unresponsive_threshold to give
538  // the OS a chance to schedule UI/IO threads a time slice to respond with a
539  // pong message (to get around limitations with the OS).
540  if (base::win::GetVersion() <= base::win::VERSION_XP)
541    *unresponsive_threshold *= 2;
542#endif
543
544  uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
545  std::string crash_on_hang_thread_names;
546  bool has_command_line_overwrite = false;
547  if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
548    crash_on_hang_thread_names =
549        command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
550    has_command_line_overwrite = true;
551  } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
552    // Default to crashing the browser if UI or IO or FILE threads are not
553    // responsive except in stable channel.
554    crash_on_hang_thread_names = base::StringPrintf(
555        "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
556        kLiveThreadsThreshold, crash_seconds,
557        kLiveThreadsThreshold, crash_seconds,
558        kLiveThreadsThreshold, crash_seconds * 5);
559  }
560
561  ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
562                                     kLiveThreadsThreshold,
563                                     crash_seconds,
564                                     crash_on_hang_threads);
565
566  if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
567      has_command_line_overwrite) {
568    return;
569  }
570
571  // Set up a field trial for 100% of the users to crash if either UI or IO
572  // thread is not responsive for 30 seconds (or 15 pings).
573  scoped_refptr<base::FieldTrial> field_trial(
574      base::FieldTrialList::FactoryGetFieldTrial(
575          "ThreadWatcher", 100, "default_hung_threads",
576          2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
577  int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
578  if (field_trial->group() == hung_thread_group) {
579    for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
580         crash_on_hang_threads->end() != it;
581         ++it) {
582      if (it->first == "FILE")
583        continue;
584      it->second.live_threads_threshold = INT_MAX;
585      if (it->first == "UI") {
586        // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
587        // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
588        // it to a more reasonable time ala IO thread.
589        it->second.unresponsive_threshold = 60;
590      } else {
591        it->second.unresponsive_threshold = 15;
592      }
593    }
594  }
595}
596
597// static
598void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
599    const std::string& crash_on_hang_thread_names,
600    uint32 default_live_threads_threshold,
601    uint32 default_crash_seconds,
602    CrashOnHangThreadMap* crash_on_hang_threads) {
603  base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
604  std::vector<std::string> values;
605  while (tokens.GetNext()) {
606    const std::string& token = tokens.token();
607    base::SplitString(token, ':', &values);
608    std::string thread_name = values[0];
609
610    uint32 live_threads_threshold = default_live_threads_threshold;
611    uint32 crash_seconds = default_crash_seconds;
612    if (values.size() >= 2 &&
613        (!base::StringToUint(values[1], &live_threads_threshold))) {
614      continue;
615    }
616    if (values.size() >= 3 &&
617        (!base::StringToUint(values[2], &crash_seconds))) {
618      continue;
619    }
620    uint32 unresponsive_threshold = static_cast<uint32>(
621        ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
622
623    CrashDataThresholds crash_data(live_threads_threshold,
624                                   unresponsive_threshold);
625    // Use the last specifier.
626    (*crash_on_hang_threads)[thread_name] = crash_data;
627  }
628}
629
630// static
631void ThreadWatcherList::InitializeAndStartWatching(
632    uint32 unresponsive_threshold,
633    const CrashOnHangThreadMap& crash_on_hang_threads) {
634  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
635
636  ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
637  CHECK(thread_watcher_list);
638
639  BrowserThread::PostTask(
640      BrowserThread::UI,
641      FROM_HERE,
642      base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
643
644  const base::TimeDelta kSleepTime =
645      base::TimeDelta::FromSeconds(kSleepSeconds);
646  const base::TimeDelta kUnresponsiveTime =
647      base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
648
649  StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
650                unresponsive_threshold, crash_on_hang_threads);
651  StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
652                unresponsive_threshold, crash_on_hang_threads);
653  StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
654                unresponsive_threshold, crash_on_hang_threads);
655  StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
656                unresponsive_threshold, crash_on_hang_threads);
657  StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
658                unresponsive_threshold, crash_on_hang_threads);
659}
660
661// static
662void ThreadWatcherList::StartWatching(
663    const BrowserThread::ID& thread_id,
664    const std::string& thread_name,
665    const base::TimeDelta& sleep_time,
666    const base::TimeDelta& unresponsive_time,
667    uint32 unresponsive_threshold,
668    const CrashOnHangThreadMap& crash_on_hang_threads) {
669  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
670
671  CrashOnHangThreadMap::const_iterator it =
672      crash_on_hang_threads.find(thread_name);
673  bool crash_on_hang = false;
674  uint32 live_threads_threshold = 0;
675  if (it != crash_on_hang_threads.end()) {
676    crash_on_hang = true;
677    live_threads_threshold = it->second.live_threads_threshold;
678    unresponsive_threshold = it->second.unresponsive_threshold;
679  }
680
681  ThreadWatcher::StartWatching(
682      ThreadWatcher::WatchingParams(thread_id,
683                                    thread_name,
684                                    sleep_time,
685                                    unresponsive_time,
686                                    unresponsive_threshold,
687                                    crash_on_hang,
688                                    live_threads_threshold));
689}
690
691// static
692void ThreadWatcherList::DeleteAll() {
693  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
694    WatchDogThread::PostTask(
695        FROM_HERE,
696        base::Bind(&ThreadWatcherList::DeleteAll));
697    return;
698  }
699
700  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
701  if (!g_thread_watcher_list_)
702    return;
703
704  // Delete all thread watcher objects.
705  while (!g_thread_watcher_list_->registered_.empty()) {
706    RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
707    delete it->second;
708    g_thread_watcher_list_->registered_.erase(it);
709  }
710
711  delete g_thread_watcher_list_;
712}
713
714// static
715ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
716  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
717  if (!g_thread_watcher_list_)
718    return NULL;
719  RegistrationList::iterator it =
720      g_thread_watcher_list_->registered_.find(thread_id);
721  if (g_thread_watcher_list_->registered_.end() == it)
722    return NULL;
723  return it->second;
724}
725
726// ThreadWatcherObserver methods and members.
727//
728// static
729ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
730
731ThreadWatcherObserver::ThreadWatcherObserver(
732    const base::TimeDelta& wakeup_interval)
733    : last_wakeup_time_(base::TimeTicks::Now()),
734      wakeup_interval_(wakeup_interval) {
735  CHECK(!g_thread_watcher_observer_);
736  g_thread_watcher_observer_ = this;
737}
738
739ThreadWatcherObserver::~ThreadWatcherObserver() {
740  DCHECK(this == g_thread_watcher_observer_);
741  g_thread_watcher_observer_ = NULL;
742}
743
744// static
745void ThreadWatcherObserver::SetupNotifications(
746    const base::TimeDelta& wakeup_interval) {
747  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
748  ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
749  MetricsService::SetUpNotifications(&observer->registrar_, observer);
750}
751
752// static
753void ThreadWatcherObserver::RemoveNotifications() {
754  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
755  if (!g_thread_watcher_observer_)
756    return;
757  g_thread_watcher_observer_->registrar_.RemoveAll();
758  delete g_thread_watcher_observer_;
759}
760
761void ThreadWatcherObserver::Observe(
762    int type,
763    const content::NotificationSource& source,
764    const content::NotificationDetails& details) {
765  // There is some user activity, see if thread watchers are to be awakened.
766  base::TimeTicks now = base::TimeTicks::Now();
767  if ((now - last_wakeup_time_) < wakeup_interval_)
768    return;
769  last_wakeup_time_ = now;
770  WatchDogThread::PostTask(
771      FROM_HERE,
772      base::Bind(&ThreadWatcherList::WakeUpAll));
773}
774
775// WatchDogThread methods and members.
776
777// This lock protects g_watchdog_thread.
778static base::LazyInstance<base::Lock>::Leaky
779    g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;
780
781// The singleton of this class.
782static WatchDogThread* g_watchdog_thread = NULL;
783
784WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
785}
786
787WatchDogThread::~WatchDogThread() {
788  Stop();
789}
790
791// static
792bool WatchDogThread::CurrentlyOnWatchDogThread() {
793  base::AutoLock lock(g_watchdog_lock.Get());
794  return g_watchdog_thread &&
795      g_watchdog_thread->message_loop() == base::MessageLoop::current();
796}
797
798// static
799bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
800                              const base::Closure& task) {
801  return PostTaskHelper(from_here, task, base::TimeDelta());
802}
803
804// static
805bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
806                                     const base::Closure& task,
807                                     base::TimeDelta delay) {
808  return PostTaskHelper(from_here, task, delay);
809}
810
811// static
812bool WatchDogThread::PostTaskHelper(
813    const tracked_objects::Location& from_here,
814    const base::Closure& task,
815    base::TimeDelta delay) {
816  {
817    base::AutoLock lock(g_watchdog_lock.Get());
818
819    base::MessageLoop* message_loop = g_watchdog_thread ?
820        g_watchdog_thread->message_loop() : NULL;
821    if (message_loop) {
822      message_loop->PostDelayedTask(from_here, task, delay);
823      return true;
824    }
825  }
826
827  return false;
828}
829
830void WatchDogThread::Init() {
831  // This thread shouldn't be allowed to perform any blocking disk I/O.
832  base::ThreadRestrictions::SetIOAllowed(false);
833
834  base::AutoLock lock(g_watchdog_lock.Get());
835  CHECK(!g_watchdog_thread);
836  g_watchdog_thread = this;
837}
838
839void WatchDogThread::CleanUp() {
840  base::AutoLock lock(g_watchdog_lock.Get());
841  g_watchdog_thread = NULL;
842}
843
844namespace {
845
846// StartupWatchDogThread methods and members.
847//
848// Class for detecting hangs during startup.
849class StartupWatchDogThread : public base::Watchdog {
850 public:
851  // Constructor specifies how long the StartupWatchDogThread will wait before
852  // alarming.
853  explicit StartupWatchDogThread(const base::TimeDelta& duration)
854      : base::Watchdog(duration, "Startup watchdog thread", true) {
855  }
856
857  // Alarm is called if the time expires after an Arm() without someone calling
858  // Disarm(). When Alarm goes off, in release mode we get the crash dump
859  // without crashing and in debug mode we break into the debugger.
860  virtual void Alarm() OVERRIDE {
861#ifndef NDEBUG
862    DCHECK(false);
863#else
864    logging::DumpWithoutCrashing();
865#endif
866  }
867
868  DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
869};
870
871// ShutdownWatchDogThread methods and members.
872//
873// Class for detecting hangs during shutdown.
874class ShutdownWatchDogThread : public base::Watchdog {
875 public:
876  // Constructor specifies how long the ShutdownWatchDogThread will wait before
877  // alarming.
878  explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
879      : base::Watchdog(duration, "Shutdown watchdog thread", true) {
880  }
881
882  // Alarm is called if the time expires after an Arm() without someone calling
883  // Disarm(). We crash the browser if this method is called.
884  virtual void Alarm() OVERRIDE {
885    ShutdownCrash();
886  }
887
888  DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
889};
890}  // namespace
891
892// StartupTimeBomb methods and members.
893//
894// static
895StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
896
897StartupTimeBomb::StartupTimeBomb()
898    : startup_watchdog_(NULL),
899      thread_id_(base::PlatformThread::CurrentId()) {
900  CHECK(!g_startup_timebomb_);
901  g_startup_timebomb_ = this;
902}
903
904StartupTimeBomb::~StartupTimeBomb() {
905  DCHECK(this == g_startup_timebomb_);
906  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
907  if (startup_watchdog_)
908    Disarm();
909  g_startup_timebomb_ = NULL;
910}
911
912void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
913  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
914  DCHECK(!startup_watchdog_);
915  startup_watchdog_ = new StartupWatchDogThread(duration);
916  startup_watchdog_->Arm();
917  return;
918}
919
920void StartupTimeBomb::Disarm() {
921  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
922  if (startup_watchdog_) {
923    startup_watchdog_->Disarm();
924    startup_watchdog_->Cleanup();
925    DeleteStartupWatchdog();
926  }
927}
928
929void StartupTimeBomb::DeleteStartupWatchdog() {
930  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
931  if (startup_watchdog_->IsJoinable()) {
932    // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
933    // very fast.
934    base::ThreadRestrictions::SetIOAllowed(true);
935    delete startup_watchdog_;
936    startup_watchdog_ = NULL;
937    return;
938  }
939  base::MessageLoop::current()->PostDelayedTask(
940      FROM_HERE,
941      base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
942                 base::Unretained(this)),
943      base::TimeDelta::FromSeconds(10));
944}
945
946// static
947void StartupTimeBomb::DisarmStartupTimeBomb() {
948  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
949  if (g_startup_timebomb_)
950    g_startup_timebomb_->Disarm();
951}
952
953// ShutdownWatcherHelper methods and members.
954//
955// ShutdownWatcherHelper is a wrapper class for detecting hangs during
956// shutdown.
957ShutdownWatcherHelper::ShutdownWatcherHelper()
958    : shutdown_watchdog_(NULL),
959      thread_id_(base::PlatformThread::CurrentId()) {
960}
961
962ShutdownWatcherHelper::~ShutdownWatcherHelper() {
963  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
964  if (shutdown_watchdog_) {
965    shutdown_watchdog_->Disarm();
966    delete shutdown_watchdog_;
967    shutdown_watchdog_ = NULL;
968  }
969}
970
971void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
972  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
973  DCHECK(!shutdown_watchdog_);
974  base::TimeDelta actual_duration = duration;
975
976  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
977  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
978    actual_duration *= 20;
979  } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
980             channel == chrome::VersionInfo::CHANNEL_DEV) {
981    actual_duration *= 10;
982  }
983
984#if defined(OS_WIN)
985  // On Windows XP, give twice the time for shutdown.
986  if (base::win::GetVersion() <= base::win::VERSION_XP)
987    actual_duration *= 2;
988#endif
989
990  shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
991  shutdown_watchdog_->Arm();
992}
993