thread_watcher.cc revision 4e180b6a0b4720a9b8e9e959a882386f690f08ff
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/metrics/thread_watcher.h"
6
7#include <math.h>  // ceil
8
9#include "base/bind.h"
10#include "base/compiler_specific.h"
11#include "base/debug/alias.h"
12#include "base/lazy_instance.h"
13#include "base/strings/string_number_conversions.h"
14#include "base/strings/string_split.h"
15#include "base/strings/string_tokenizer.h"
16#include "base/strings/stringprintf.h"
17#include "base/threading/thread_restrictions.h"
18#include "build/build_config.h"
19#include "chrome/browser/metrics/metrics_service.h"
20#include "chrome/common/chrome_switches.h"
21#include "chrome/common/chrome_version_info.h"
22#include "chrome/common/dump_without_crashing.h"
23#include "chrome/common/logging_chrome.h"
24
25#if defined(OS_WIN)
26#include "base/win/windows_version.h"
27#endif
28
29using content::BrowserThread;
30
31namespace {
32
33// The following are unique function names for forcing the crash when a thread
34// is unresponsive. This makes it possible to tell from the callstack alone what
35// thread was unresponsive.
36//
37// We disable optimizations for this block of functions so the compiler doesn't
38// merge them all together.
39MSVC_DISABLE_OPTIMIZE()
40MSVC_PUSH_DISABLE_WARNING(4748)
41
// Returns a null |int*|. NullPointerCrash() writes through the result to force
// a crash in debug builds.
int* NullPointer() {
  int* const null_target = reinterpret_cast<int*>(NULL);
  return null_target;
}
45
46void NullPointerCrash(int line_number) {
47#ifndef NDEBUG
48  *NullPointer() = line_number;  // Crash.
49#else
50  logging::DumpWithoutCrashing();
51#endif
52}
53
54NOINLINE void ShutdownCrash() {
55  NullPointerCrash(__LINE__);
56}
57
58NOINLINE void ThreadUnresponsive_UI() {
59  NullPointerCrash(__LINE__);
60}
61
62NOINLINE void ThreadUnresponsive_DB() {
63  NullPointerCrash(__LINE__);
64}
65
66NOINLINE void ThreadUnresponsive_FILE() {
67  NullPointerCrash(__LINE__);
68}
69
70NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
71  NullPointerCrash(__LINE__);
72}
73
74NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
75  NullPointerCrash(__LINE__);
76}
77
78NOINLINE void ThreadUnresponsive_CACHE() {
79  NullPointerCrash(__LINE__);
80}
81
82NOINLINE void ThreadUnresponsive_IO() {
83  NullPointerCrash(__LINE__);
84}
85
86MSVC_POP_WARNING()
87MSVC_ENABLE_OPTIMIZE();
88
89void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
90  base::debug::Alias(&thread_id);
91
92  switch (thread_id) {
93    case BrowserThread::UI:
94      return ThreadUnresponsive_UI();
95    case BrowserThread::DB:
96      return ThreadUnresponsive_DB();
97    case BrowserThread::FILE:
98      return ThreadUnresponsive_FILE();
99    case BrowserThread::FILE_USER_BLOCKING:
100      return ThreadUnresponsive_FILE_USER_BLOCKING();
101    case BrowserThread::PROCESS_LAUNCHER:
102      return ThreadUnresponsive_PROCESS_LAUNCHER();
103    case BrowserThread::CACHE:
104      return ThreadUnresponsive_CACHE();
105    case BrowserThread::IO:
106      return ThreadUnresponsive_IO();
107    case BrowserThread::ID_COUNT:
108      CHECK(false);  // This shouldn't actually be reached!
109      break;
110
111    // Omission of the default hander is intentional -- that way the compiler
112    // should warn if our switch becomes outdated.
113  }
114
115  CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
116}
117
118}  // namespace
119
120// ThreadWatcher methods and members.
121ThreadWatcher::ThreadWatcher(const WatchingParams& params)
122    : thread_id_(params.thread_id),
123      thread_name_(params.thread_name),
124      watched_loop_(
125          BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
126      sleep_time_(params.sleep_time),
127      unresponsive_time_(params.unresponsive_time),
128      ping_time_(base::TimeTicks::Now()),
129      pong_time_(ping_time_),
130      ping_sequence_number_(0),
131      active_(false),
132      ping_count_(params.unresponsive_threshold),
133      response_time_histogram_(NULL),
134      unresponsive_time_histogram_(NULL),
135      unresponsive_count_(0),
136      hung_processing_complete_(false),
137      unresponsive_threshold_(params.unresponsive_threshold),
138      crash_on_hang_(params.crash_on_hang),
139      live_threads_threshold_(params.live_threads_threshold),
140      weak_ptr_factory_(this) {
141  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
142  Initialize();
143}
144
145ThreadWatcher::~ThreadWatcher() {}
146
147// static
148void ThreadWatcher::StartWatching(const WatchingParams& params) {
149  DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
150  DCHECK_GE(params.unresponsive_time.InMilliseconds(),
151            params.sleep_time.InMilliseconds());
152
153  // If we are not on WatchDogThread, then post a task to call StartWatching on
154  // WatchDogThread.
155  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
156    WatchDogThread::PostTask(
157        FROM_HERE,
158        base::Bind(&ThreadWatcher::StartWatching, params));
159    return;
160  }
161
162  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
163
164  // Create a new thread watcher object for the given thread and activate it.
165  ThreadWatcher* watcher = new ThreadWatcher(params);
166
167  DCHECK(watcher);
168  // If we couldn't register the thread watcher object, we are shutting down,
169  // then don't activate thread watching.
170  if (!ThreadWatcherList::IsRegistered(params.thread_id))
171    return;
172  watcher->ActivateThreadWatching();
173}
174
175void ThreadWatcher::ActivateThreadWatching() {
176  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
177  if (active_) return;
178  active_ = true;
179  ping_count_ = unresponsive_threshold_;
180  ResetHangCounters();
181  base::MessageLoop::current()->PostTask(
182      FROM_HERE,
183      base::Bind(&ThreadWatcher::PostPingMessage,
184                 weak_ptr_factory_.GetWeakPtr()));
185}
186
187void ThreadWatcher::DeActivateThreadWatching() {
188  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
189  active_ = false;
190  ping_count_ = 0;
191  weak_ptr_factory_.InvalidateWeakPtrs();
192}
193
194void ThreadWatcher::WakeUp() {
195  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
196  // There is some user activity, PostPingMessage task of thread watcher if
197  // needed.
198  if (!active_) return;
199
200  // Throw away the previous |unresponsive_count_| and start over again. Just
201  // before going to sleep, |unresponsive_count_| could be very close to
202  // |unresponsive_threshold_| and when user becomes active,
203  // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
204  // response for ping messages. Reset |unresponsive_count_| to start measuring
205  // the unresponsiveness of the threads when system becomes active.
206  unresponsive_count_ = 0;
207
208  if (ping_count_ <= 0) {
209    ping_count_ = unresponsive_threshold_;
210    ResetHangCounters();
211    PostPingMessage();
212  } else {
213    ping_count_ = unresponsive_threshold_;
214  }
215}
216
217void ThreadWatcher::PostPingMessage() {
218  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
219  // If we have stopped watching or if the user is idle, then stop sending
220  // ping messages.
221  if (!active_ || ping_count_ <= 0)
222    return;
223
224  // Save the current time when we have sent ping message.
225  ping_time_ = base::TimeTicks::Now();
226
227  // Send a ping message to the watched thread. Callback will be called on
228  // the WatchDogThread.
229  base::Closure callback(
230      base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
231                 ping_sequence_number_));
232  if (watched_loop_->PostTask(
233          FROM_HERE,
234          base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
235                     callback))) {
236      // Post a task to check the responsiveness of watched thread.
237      base::MessageLoop::current()->PostDelayedTask(
238          FROM_HERE,
239          base::Bind(&ThreadWatcher::OnCheckResponsiveness,
240                     weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
241          unresponsive_time_);
242  } else {
243    // Watched thread might have gone away, stop watching it.
244    DeActivateThreadWatching();
245  }
246}
247
248void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
249  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
250
251  // Record watched thread's response time.
252  base::TimeTicks now = base::TimeTicks::Now();
253  base::TimeDelta response_time = now - ping_time_;
254  response_time_histogram_->AddTime(response_time);
255
256  // Save the current time when we have got pong message.
257  pong_time_ = now;
258
259  // Check if there are any extra pings in flight.
260  DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
261  if (ping_sequence_number_ != ping_sequence_number)
262    return;
263
264  // Increment sequence number for the next ping message to indicate watched
265  // thread is responsive.
266  ++ping_sequence_number_;
267
268  // If we have stopped watching or if the user is idle, then stop sending
269  // ping messages.
270  if (!active_ || --ping_count_ <= 0)
271    return;
272
273  base::MessageLoop::current()->PostDelayedTask(
274      FROM_HERE,
275      base::Bind(&ThreadWatcher::PostPingMessage,
276                 weak_ptr_factory_.GetWeakPtr()),
277      sleep_time_);
278}
279
280void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
281  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
282  // If we have stopped watching then consider thread as responding.
283  if (!active_) {
284    responsive_ = true;
285    return;
286  }
287  // If the latest ping_sequence_number_ is not same as the ping_sequence_number
288  // that is passed in, then we can assume OnPongMessage was called.
289  // OnPongMessage increments ping_sequence_number_.
290  if (ping_sequence_number_ != ping_sequence_number) {
291    // Reset unresponsive_count_ to zero because we got a response from the
292    // watched thread.
293    ResetHangCounters();
294
295    responsive_ = true;
296    return;
297  }
298  // Record that we got no response from watched thread.
299  GotNoResponse();
300
301  // Post a task to check the responsiveness of watched thread.
302  base::MessageLoop::current()->PostDelayedTask(
303      FROM_HERE,
304      base::Bind(&ThreadWatcher::OnCheckResponsiveness,
305                 weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
306      unresponsive_time_);
307  responsive_ = false;
308}
309
310void ThreadWatcher::Initialize() {
311  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
312  ThreadWatcherList::Register(this);
313
314  const std::string response_time_histogram_name =
315      "ThreadWatcher.ResponseTime." + thread_name_;
316  response_time_histogram_ = base::Histogram::FactoryTimeGet(
317      response_time_histogram_name,
318      base::TimeDelta::FromMilliseconds(1),
319      base::TimeDelta::FromSeconds(100), 50,
320      base::Histogram::kUmaTargetedHistogramFlag);
321
322  const std::string unresponsive_time_histogram_name =
323      "ThreadWatcher.Unresponsive." + thread_name_;
324  unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
325      unresponsive_time_histogram_name,
326      base::TimeDelta::FromMilliseconds(1),
327      base::TimeDelta::FromSeconds(100), 50,
328      base::Histogram::kUmaTargetedHistogramFlag);
329
330  const std::string responsive_count_histogram_name =
331      "ThreadWatcher.ResponsiveThreads." + thread_name_;
332  responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
333      responsive_count_histogram_name, 1, 10, 11,
334      base::Histogram::kUmaTargetedHistogramFlag);
335
336  const std::string unresponsive_count_histogram_name =
337      "ThreadWatcher.UnresponsiveThreads." + thread_name_;
338  unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
339      unresponsive_count_histogram_name, 1, 10, 11,
340      base::Histogram::kUmaTargetedHistogramFlag);
341}
342
343// static
344void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
345                                  const base::Closure& callback_task) {
346  // This method is called on watched thread.
347  DCHECK(BrowserThread::CurrentlyOn(thread_id));
348  WatchDogThread::PostTask(FROM_HERE, callback_task);
349}
350
351void ThreadWatcher::ResetHangCounters() {
352  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
353  unresponsive_count_ = 0;
354  hung_processing_complete_ = false;
355}
356
357void ThreadWatcher::GotNoResponse() {
358  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
359
360  ++unresponsive_count_;
361  if (!IsVeryUnresponsive())
362    return;
363
364  // Record total unresponsive_time since last pong message.
365  base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
366  unresponsive_time_histogram_->AddTime(unresponse_time);
367
368  // We have already collected stats for the non-responding watched thread.
369  if (hung_processing_complete_)
370    return;
371
372  // Record how other threads are responding.
373  uint32 responding_thread_count = 0;
374  uint32 unresponding_thread_count = 0;
375  ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
376                                        &unresponding_thread_count);
377
378  // Record how many watched threads are responding.
379  responsive_count_histogram_->Add(responding_thread_count);
380
381  // Record how many watched threads are not responding.
382  unresponsive_count_histogram_->Add(unresponding_thread_count);
383
384  // Crash the browser if the watched thread is to be crashed on hang and if the
385  // number of other threads responding is less than or equal to
386  // live_threads_threshold_ and at least one other thread is responding.
387  if (crash_on_hang_ &&
388      responding_thread_count > 0 &&
389      responding_thread_count <= live_threads_threshold_) {
390    static bool crashed_once = false;
391    if (!crashed_once) {
392      crashed_once = true;
393      CrashBecauseThreadWasUnresponsive(thread_id_);
394    }
395  }
396
397  hung_processing_complete_ = true;
398}
399
400bool ThreadWatcher::IsVeryUnresponsive() {
401  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
402  return unresponsive_count_ >= unresponsive_threshold_;
403}
404
405// ThreadWatcherList methods and members.
406//
407// static
408ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
409// static
410const int ThreadWatcherList::kSleepSeconds = 1;
411// static
412const int ThreadWatcherList::kUnresponsiveSeconds = 2;
413// static
414const int ThreadWatcherList::kUnresponsiveCount = 9;
415// static
416const int ThreadWatcherList::kLiveThreadsThreshold = 2;
417
418ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
419    uint32 live_threads_threshold,
420    uint32 unresponsive_threshold)
421    : live_threads_threshold(live_threads_threshold),
422      unresponsive_threshold(unresponsive_threshold) {
423}
424
425ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
426    : live_threads_threshold(kLiveThreadsThreshold),
427      unresponsive_threshold(kUnresponsiveCount) {
428}
429
430// static
431void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
432  // TODO(rtenneti): Enable ThreadWatcher.
433  uint32 unresponsive_threshold;
434  CrashOnHangThreadMap crash_on_hang_threads;
435  ParseCommandLine(command_line,
436                   &unresponsive_threshold,
437                   &crash_on_hang_threads);
438
439  ThreadWatcherObserver::SetupNotifications(
440      base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
441
442  WatchDogThread::PostDelayedTask(
443      FROM_HERE,
444      base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
445                 unresponsive_threshold,
446                 crash_on_hang_threads),
447      base::TimeDelta::FromSeconds(120));
448}
449
450// static
451void ThreadWatcherList::StopWatchingAll() {
452  // TODO(rtenneti): Enable ThreadWatcher.
453  ThreadWatcherObserver::RemoveNotifications();
454  DeleteAll();
455}
456
457// static
458void ThreadWatcherList::Register(ThreadWatcher* watcher) {
459  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
460  if (!g_thread_watcher_list_)
461    return;
462  DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
463  g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
464}
465
466// static
467bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
468  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
469  return NULL != ThreadWatcherList::Find(thread_id);
470}
471
472// static
473void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
474                                           uint32* unresponding_thread_count) {
475  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
476  *responding_thread_count = 0;
477  *unresponding_thread_count = 0;
478  if (!g_thread_watcher_list_)
479    return;
480
481  for (RegistrationList::iterator it =
482           g_thread_watcher_list_->registered_.begin();
483       g_thread_watcher_list_->registered_.end() != it;
484       ++it) {
485    if (it->second->IsVeryUnresponsive())
486      ++(*unresponding_thread_count);
487    else
488      ++(*responding_thread_count);
489  }
490}
491
492// static
493void ThreadWatcherList::WakeUpAll() {
494  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
495  if (!g_thread_watcher_list_)
496    return;
497
498  for (RegistrationList::iterator it =
499           g_thread_watcher_list_->registered_.begin();
500       g_thread_watcher_list_->registered_.end() != it;
501       ++it)
502    it->second->WakeUp();
503}
504
505ThreadWatcherList::ThreadWatcherList() {
506  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
507  CHECK(!g_thread_watcher_list_);
508  g_thread_watcher_list_ = this;
509}
510
511ThreadWatcherList::~ThreadWatcherList() {
512  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
513  DCHECK(this == g_thread_watcher_list_);
514  g_thread_watcher_list_ = NULL;
515}
516
517// static
518void ThreadWatcherList::ParseCommandLine(
519    const CommandLine& command_line,
520    uint32* unresponsive_threshold,
521    CrashOnHangThreadMap* crash_on_hang_threads) {
522  // Initialize |unresponsive_threshold| to a default value.
523  *unresponsive_threshold = kUnresponsiveCount;
524
525  // Increase the unresponsive_threshold on the Stable and Beta channels to
526  // reduce the number of crashes due to ThreadWatcher.
527  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
528  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
529    *unresponsive_threshold *= 4;
530  } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
531    *unresponsive_threshold *= 2;
532  }
533
534#if defined(OS_WIN)
535  // For Windows XP (old systems), double the unresponsive_threshold to give
536  // the OS a chance to schedule UI/IO threads a time slice to respond with a
537  // pong message (to get around limitations with the OS).
538  if (base::win::GetVersion() <= base::win::VERSION_XP)
539    *unresponsive_threshold *= 2;
540#endif
541
542  uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
543  std::string crash_on_hang_thread_names;
544  bool has_command_line_overwrite = false;
545  if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
546    crash_on_hang_thread_names =
547        command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
548    has_command_line_overwrite = true;
549  } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
550    // Default to crashing the browser if UI or IO or FILE threads are not
551    // responsive except in stable channel.
552    crash_on_hang_thread_names = base::StringPrintf(
553        "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
554        kLiveThreadsThreshold, crash_seconds,
555        kLiveThreadsThreshold, crash_seconds,
556        kLiveThreadsThreshold, crash_seconds * 5);
557  }
558
559  ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
560                                     kLiveThreadsThreshold,
561                                     crash_seconds,
562                                     crash_on_hang_threads);
563
564  if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
565      has_command_line_overwrite) {
566    return;
567  }
568
569  // Set up a field trial for 100% of the users to crash if either UI or IO
570  // thread is not responsive for 30 seconds (or 15 pings).
571  scoped_refptr<base::FieldTrial> field_trial(
572      base::FieldTrialList::FactoryGetFieldTrial(
573          "ThreadWatcher", 100, "default_hung_threads",
574          2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
575  int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
576  if (field_trial->group() == hung_thread_group) {
577    for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
578         crash_on_hang_threads->end() != it;
579         ++it) {
580      if (it->first == "FILE")
581        continue;
582      it->second.live_threads_threshold = INT_MAX;
583      if (it->first == "UI") {
584        // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
585        // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
586        // it to a more reasonable time ala IO thread.
587        it->second.unresponsive_threshold = 60;
588      } else {
589        it->second.unresponsive_threshold = 15;
590      }
591    }
592  }
593}
594
595// static
596void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
597    const std::string& crash_on_hang_thread_names,
598    uint32 default_live_threads_threshold,
599    uint32 default_crash_seconds,
600    CrashOnHangThreadMap* crash_on_hang_threads) {
601  base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
602  std::vector<std::string> values;
603  while (tokens.GetNext()) {
604    const std::string& token = tokens.token();
605    base::SplitString(token, ':', &values);
606    std::string thread_name = values[0];
607
608    uint32 live_threads_threshold = default_live_threads_threshold;
609    uint32 crash_seconds = default_crash_seconds;
610    if (values.size() >= 2 &&
611        (!base::StringToUint(values[1], &live_threads_threshold))) {
612      continue;
613    }
614    if (values.size() >= 3 &&
615        (!base::StringToUint(values[2], &crash_seconds))) {
616      continue;
617    }
618    uint32 unresponsive_threshold = static_cast<uint32>(
619        ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
620
621    CrashDataThresholds crash_data(live_threads_threshold,
622                                   unresponsive_threshold);
623    // Use the last specifier.
624    (*crash_on_hang_threads)[thread_name] = crash_data;
625  }
626}
627
628// static
629void ThreadWatcherList::InitializeAndStartWatching(
630    uint32 unresponsive_threshold,
631    const CrashOnHangThreadMap& crash_on_hang_threads) {
632  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
633
634  ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
635  CHECK(thread_watcher_list);
636
637  BrowserThread::PostTask(
638      BrowserThread::UI,
639      FROM_HERE,
640      base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
641
642  const base::TimeDelta kSleepTime =
643      base::TimeDelta::FromSeconds(kSleepSeconds);
644  const base::TimeDelta kUnresponsiveTime =
645      base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
646
647  StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
648                unresponsive_threshold, crash_on_hang_threads);
649  StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
650                unresponsive_threshold, crash_on_hang_threads);
651  StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
652                unresponsive_threshold, crash_on_hang_threads);
653  StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
654                unresponsive_threshold, crash_on_hang_threads);
655  StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
656                unresponsive_threshold, crash_on_hang_threads);
657}
658
659// static
660void ThreadWatcherList::StartWatching(
661    const BrowserThread::ID& thread_id,
662    const std::string& thread_name,
663    const base::TimeDelta& sleep_time,
664    const base::TimeDelta& unresponsive_time,
665    uint32 unresponsive_threshold,
666    const CrashOnHangThreadMap& crash_on_hang_threads) {
667  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
668
669  CrashOnHangThreadMap::const_iterator it =
670      crash_on_hang_threads.find(thread_name);
671  bool crash_on_hang = false;
672  uint32 live_threads_threshold = 0;
673  if (it != crash_on_hang_threads.end()) {
674    crash_on_hang = true;
675    live_threads_threshold = it->second.live_threads_threshold;
676    unresponsive_threshold = it->second.unresponsive_threshold;
677  }
678
679  ThreadWatcher::StartWatching(
680      ThreadWatcher::WatchingParams(thread_id,
681                                    thread_name,
682                                    sleep_time,
683                                    unresponsive_time,
684                                    unresponsive_threshold,
685                                    crash_on_hang,
686                                    live_threads_threshold));
687}
688
689// static
690void ThreadWatcherList::DeleteAll() {
691  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
692    WatchDogThread::PostTask(
693        FROM_HERE,
694        base::Bind(&ThreadWatcherList::DeleteAll));
695    return;
696  }
697
698  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
699  if (!g_thread_watcher_list_)
700    return;
701
702  // Delete all thread watcher objects.
703  while (!g_thread_watcher_list_->registered_.empty()) {
704    RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
705    delete it->second;
706    g_thread_watcher_list_->registered_.erase(it);
707  }
708
709  delete g_thread_watcher_list_;
710}
711
712// static
713ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
714  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
715  if (!g_thread_watcher_list_)
716    return NULL;
717  RegistrationList::iterator it =
718      g_thread_watcher_list_->registered_.find(thread_id);
719  if (g_thread_watcher_list_->registered_.end() == it)
720    return NULL;
721  return it->second;
722}
723
724// ThreadWatcherObserver methods and members.
725//
726// static
727ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
728
729ThreadWatcherObserver::ThreadWatcherObserver(
730    const base::TimeDelta& wakeup_interval)
731    : last_wakeup_time_(base::TimeTicks::Now()),
732      wakeup_interval_(wakeup_interval) {
733  CHECK(!g_thread_watcher_observer_);
734  g_thread_watcher_observer_ = this;
735}
736
737ThreadWatcherObserver::~ThreadWatcherObserver() {
738  DCHECK(this == g_thread_watcher_observer_);
739  g_thread_watcher_observer_ = NULL;
740}
741
742// static
743void ThreadWatcherObserver::SetupNotifications(
744    const base::TimeDelta& wakeup_interval) {
745  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
746  ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
747  MetricsService::SetUpNotifications(&observer->registrar_, observer);
748}
749
750// static
751void ThreadWatcherObserver::RemoveNotifications() {
752  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
753  if (!g_thread_watcher_observer_)
754    return;
755  g_thread_watcher_observer_->registrar_.RemoveAll();
756  delete g_thread_watcher_observer_;
757}
758
759void ThreadWatcherObserver::Observe(
760    int type,
761    const content::NotificationSource& source,
762    const content::NotificationDetails& details) {
763  // There is some user activity, see if thread watchers are to be awakened.
764  base::TimeTicks now = base::TimeTicks::Now();
765  if ((now - last_wakeup_time_) < wakeup_interval_)
766    return;
767  last_wakeup_time_ = now;
768  WatchDogThread::PostTask(
769      FROM_HERE,
770      base::Bind(&ThreadWatcherList::WakeUpAll));
771}
772
773// WatchDogThread methods and members.
774
775// This lock protects g_watchdog_thread.
776static base::LazyInstance<base::Lock>::Leaky
777    g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;
778
779// The singleton of this class.
780static WatchDogThread* g_watchdog_thread = NULL;
781
782WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
783}
784
785WatchDogThread::~WatchDogThread() {
786  Stop();
787}
788
789// static
790bool WatchDogThread::CurrentlyOnWatchDogThread() {
791  base::AutoLock lock(g_watchdog_lock.Get());
792  return g_watchdog_thread &&
793      g_watchdog_thread->message_loop() == base::MessageLoop::current();
794}
795
796// static
797bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
798                              const base::Closure& task) {
799  return PostTaskHelper(from_here, task, base::TimeDelta());
800}
801
802// static
803bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
804                                     const base::Closure& task,
805                                     base::TimeDelta delay) {
806  return PostTaskHelper(from_here, task, delay);
807}
808
809// static
810bool WatchDogThread::PostTaskHelper(
811    const tracked_objects::Location& from_here,
812    const base::Closure& task,
813    base::TimeDelta delay) {
814  {
815    base::AutoLock lock(g_watchdog_lock.Get());
816
817    base::MessageLoop* message_loop = g_watchdog_thread ?
818        g_watchdog_thread->message_loop() : NULL;
819    if (message_loop) {
820      message_loop->PostDelayedTask(from_here, task, delay);
821      return true;
822    }
823  }
824
825  return false;
826}
827
828void WatchDogThread::Init() {
829  // This thread shouldn't be allowed to perform any blocking disk I/O.
830  base::ThreadRestrictions::SetIOAllowed(false);
831
832  base::AutoLock lock(g_watchdog_lock.Get());
833  CHECK(!g_watchdog_thread);
834  g_watchdog_thread = this;
835}
836
837void WatchDogThread::CleanUp() {
838  base::AutoLock lock(g_watchdog_lock.Get());
839  g_watchdog_thread = NULL;
840}
841
842namespace {
843
844// StartupWatchDogThread methods and members.
845//
846// Class for detecting hangs during startup.
847class StartupWatchDogThread : public base::Watchdog {
848 public:
849  // Constructor specifies how long the StartupWatchDogThread will wait before
850  // alarming.
851  explicit StartupWatchDogThread(const base::TimeDelta& duration)
852      : base::Watchdog(duration, "Startup watchdog thread", true) {
853  }
854
855  // Alarm is called if the time expires after an Arm() without someone calling
856  // Disarm(). When Alarm goes off, in release mode we get the crash dump
857  // without crashing and in debug mode we break into the debugger.
858  virtual void Alarm() OVERRIDE {
859#ifndef NDEBUG
860    DCHECK(false);
861#else
862    logging::DumpWithoutCrashing();
863#endif
864  }
865
866  DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
867};
868
869// ShutdownWatchDogThread methods and members.
870//
871// Class for detecting hangs during shutdown.
872class ShutdownWatchDogThread : public base::Watchdog {
873 public:
874  // Constructor specifies how long the ShutdownWatchDogThread will wait before
875  // alarming.
876  explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
877      : base::Watchdog(duration, "Shutdown watchdog thread", true) {
878  }
879
880  // Alarm is called if the time expires after an Arm() without someone calling
881  // Disarm(). We crash the browser if this method is called.
882  virtual void Alarm() OVERRIDE {
883    ShutdownCrash();
884  }
885
886  DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
887};
888}  // namespace
889
890// StartupTimeBomb methods and members.
891//
892// static
893StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
894
895StartupTimeBomb::StartupTimeBomb()
896    : startup_watchdog_(NULL),
897      thread_id_(base::PlatformThread::CurrentId()) {
898  CHECK(!g_startup_timebomb_);
899  g_startup_timebomb_ = this;
900}
901
902StartupTimeBomb::~StartupTimeBomb() {
903  DCHECK(this == g_startup_timebomb_);
904  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
905  if (startup_watchdog_)
906    Disarm();
907  g_startup_timebomb_ = NULL;
908}
909
910void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
911  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
912  DCHECK(!startup_watchdog_);
913  startup_watchdog_ = new StartupWatchDogThread(duration);
914  startup_watchdog_->Arm();
915  return;
916}
917
918void StartupTimeBomb::Disarm() {
919  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
920  if (startup_watchdog_) {
921    startup_watchdog_->Disarm();
922    startup_watchdog_->Cleanup();
923    DeleteStartupWatchdog();
924  }
925}
926
927void StartupTimeBomb::DeleteStartupWatchdog() {
928  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
929  if (startup_watchdog_->IsJoinable()) {
930    // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
931    // very fast.
932    base::ThreadRestrictions::SetIOAllowed(true);
933    delete startup_watchdog_;
934    startup_watchdog_ = NULL;
935    return;
936  }
937  base::MessageLoop::current()->PostDelayedTask(
938      FROM_HERE,
939      base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
940                 base::Unretained(this)),
941      base::TimeDelta::FromSeconds(10));
942}
943
944// static
945void StartupTimeBomb::DisarmStartupTimeBomb() {
946  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
947  if (g_startup_timebomb_)
948    g_startup_timebomb_->Disarm();
949}
950
951// ShutdownWatcherHelper methods and members.
952//
953// ShutdownWatcherHelper is a wrapper class for detecting hangs during
954// shutdown.
955ShutdownWatcherHelper::ShutdownWatcherHelper()
956    : shutdown_watchdog_(NULL),
957      thread_id_(base::PlatformThread::CurrentId()) {
958}
959
960ShutdownWatcherHelper::~ShutdownWatcherHelper() {
961  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
962  if (shutdown_watchdog_) {
963    shutdown_watchdog_->Disarm();
964    delete shutdown_watchdog_;
965    shutdown_watchdog_ = NULL;
966  }
967}
968
969void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
970  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
971  DCHECK(!shutdown_watchdog_);
972  base::TimeDelta actual_duration = duration;
973
974  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
975  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
976    actual_duration *= 20;
977  } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
978             channel == chrome::VersionInfo::CHANNEL_DEV) {
979    actual_duration *= 10;
980  }
981
982#if defined(OS_WIN)
983  // On Windows XP, give twice the time for shutdown.
984  if (base::win::GetVersion() <= base::win::VERSION_XP)
985    actual_duration *= 2;
986#endif
987
988  shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
989  shutdown_watchdog_->Arm();
990}
991