thread_watcher.cc revision 3551c9c881056c480085172ff9840cab31610854
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/metrics/thread_watcher.h"
6
7#include <math.h>  // ceil
8
9#include "base/bind.h"
10#include "base/compiler_specific.h"
11#include "base/debug/alias.h"
12#include "base/lazy_instance.h"
13#include "base/strings/string_number_conversions.h"
14#include "base/strings/string_split.h"
15#include "base/strings/string_tokenizer.h"
16#include "base/strings/stringprintf.h"
17#include "base/threading/thread_restrictions.h"
18#include "build/build_config.h"
19#include "chrome/browser/metrics/metrics_service.h"
20#include "chrome/common/chrome_switches.h"
21#include "chrome/common/chrome_version_info.h"
22#include "chrome/common/dump_without_crashing.h"
23#include "chrome/common/logging_chrome.h"
24
25#if defined(OS_WIN)
26#include "base/win/windows_version.h"
27#endif
28
29using content::BrowserThread;
30
31namespace {
32
33// The following are unique function names for forcing the crash when a thread
34// is unresponsive. This makes it possible to tell from the callstack alone what
35// thread was unresponsive.
36//
37// We disable optimizations for this block of functions so the compiler doesn't
38// merge them all together.
39MSVC_DISABLE_OPTIMIZE()
40MSVC_PUSH_DISABLE_WARNING(4748)
41
// Returns a null int pointer. NullPointerCrash() writes through the result to
// force a crash in debug builds.
int* NullPointer() {
  int* const null_target = reinterpret_cast<int*>(NULL);
  return null_target;
}
45
46void NullPointerCrash(int line_number) {
47#ifndef NDEBUG
48  *NullPointer() = line_number;  // Crash.
49#else
50  logging::DumpWithoutCrashing();
51#endif
52}
53
54NOINLINE void ShutdownCrash() {
55  NullPointerCrash(__LINE__);
56}
57
58NOINLINE void ThreadUnresponsive_UI() {
59  NullPointerCrash(__LINE__);
60}
61
62NOINLINE void ThreadUnresponsive_DB() {
63  NullPointerCrash(__LINE__);
64}
65
66NOINLINE void ThreadUnresponsive_FILE() {
67  NullPointerCrash(__LINE__);
68}
69
70NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
71  NullPointerCrash(__LINE__);
72}
73
74NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
75  NullPointerCrash(__LINE__);
76}
77
78NOINLINE void ThreadUnresponsive_CACHE() {
79  NullPointerCrash(__LINE__);
80}
81
82NOINLINE void ThreadUnresponsive_IO() {
83  NullPointerCrash(__LINE__);
84}
85
86MSVC_POP_WARNING()
87MSVC_ENABLE_OPTIMIZE();
88
89void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
90  base::debug::Alias(&thread_id);
91
92  switch (thread_id) {
93    case BrowserThread::UI:
94      return ThreadUnresponsive_UI();
95    case BrowserThread::DB:
96      return ThreadUnresponsive_DB();
97    case BrowserThread::FILE:
98      return ThreadUnresponsive_FILE();
99    case BrowserThread::FILE_USER_BLOCKING:
100      return ThreadUnresponsive_FILE_USER_BLOCKING();
101    case BrowserThread::PROCESS_LAUNCHER:
102      return ThreadUnresponsive_PROCESS_LAUNCHER();
103    case BrowserThread::CACHE:
104      return ThreadUnresponsive_CACHE();
105    case BrowserThread::IO:
106      return ThreadUnresponsive_IO();
107    case BrowserThread::ID_COUNT:
108      CHECK(false);  // This shouldn't actually be reached!
109      break;
110
111    // Omission of the default hander is intentional -- that way the compiler
112    // should warn if our switch becomes outdated.
113  }
114
115  CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
116}
117
118}  // namespace
119
120// ThreadWatcher methods and members.
// Builds a watcher for the thread described by |params|. The ping and pong
// timestamps both start at the current time, the watcher starts out inactive
// with zeroed hang counters, and Initialize() then registers it with
// ThreadWatcherList and looks up its histograms. Must be constructed on the
// WatchDogThread.
ThreadWatcher::ThreadWatcher(const WatchingParams& params)
    : thread_id_(params.thread_id),
      thread_name_(params.thread_name),
      watched_loop_(
          BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
      sleep_time_(params.sleep_time),
      unresponsive_time_(params.unresponsive_time),
      ping_time_(base::TimeTicks::Now()),
      pong_time_(ping_time_),
      ping_sequence_number_(0),
      active_(false),
      // Number of pings left to send before the watcher goes idle.
      ping_count_(params.unresponsive_threshold),
      response_time_histogram_(NULL),
      unresponsive_time_histogram_(NULL),
      unresponsive_count_(0),
      hung_processing_complete_(false),
      unresponsive_threshold_(params.unresponsive_threshold),
      crash_on_hang_(params.crash_on_hang),
      live_threads_threshold_(params.live_threads_threshold),
      weak_ptr_factory_(this) {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  Initialize();
}
144
// Nothing to release explicitly; the body is intentionally empty.
ThreadWatcher::~ThreadWatcher() {}
146
147// static
148void ThreadWatcher::StartWatching(const WatchingParams& params) {
149  DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
150  DCHECK_GE(params.unresponsive_time.InMilliseconds(),
151            params.sleep_time.InMilliseconds());
152
153  // If we are not on WatchDogThread, then post a task to call StartWatching on
154  // WatchDogThread.
155  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
156    WatchDogThread::PostTask(
157        FROM_HERE,
158        base::Bind(&ThreadWatcher::StartWatching, params));
159    return;
160  }
161
162  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
163
164  // Create a new thread watcher object for the given thread and activate it.
165  ThreadWatcher* watcher = new ThreadWatcher(params);
166
167  DCHECK(watcher);
168  // If we couldn't register the thread watcher object, we are shutting down,
169  // then don't activate thread watching.
170  if (!ThreadWatcherList::IsRegistered(params.thread_id))
171    return;
172  watcher->ActivateThreadWatching();
173}
174
175void ThreadWatcher::ActivateThreadWatching() {
176  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
177  if (active_) return;
178  active_ = true;
179  ping_count_ = unresponsive_threshold_;
180  ResetHangCounters();
181  base::MessageLoop::current()->PostTask(
182      FROM_HERE,
183      base::Bind(&ThreadWatcher::PostPingMessage,
184                 weak_ptr_factory_.GetWeakPtr()));
185}
186
// Turns watching off: marks the watcher inactive, drops the remaining ping
// budget and invalidates the weak pointers bound into tasks this watcher
// posted earlier.
void ThreadWatcher::DeActivateThreadWatching() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  active_ = false;
  ping_count_ = 0;
  weak_ptr_factory_.InvalidateWeakPtrs();
}
193
194void ThreadWatcher::WakeUp() {
195  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
196  // There is some user activity, PostPingMessage task of thread watcher if
197  // needed.
198  if (!active_) return;
199
200  // Throw away the previous |unresponsive_count_| and start over again. Just
201  // before going to sleep, |unresponsive_count_| could be very close to
202  // |unresponsive_threshold_| and when user becomes active,
203  // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
204  // response for ping messages. Reset |unresponsive_count_| to start measuring
205  // the unresponsiveness of the threads when system becomes active.
206  unresponsive_count_ = 0;
207
208  if (ping_count_ <= 0) {
209    ping_count_ = unresponsive_threshold_;
210    ResetHangCounters();
211    PostPingMessage();
212  } else {
213    ping_count_ = unresponsive_threshold_;
214  }
215}
216
217void ThreadWatcher::PostPingMessage() {
218  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
219  // If we have stopped watching or if the user is idle, then stop sending
220  // ping messages.
221  if (!active_ || ping_count_ <= 0)
222    return;
223
224  // Save the current time when we have sent ping message.
225  ping_time_ = base::TimeTicks::Now();
226
227  // Send a ping message to the watched thread. Callback will be called on
228  // the WatchDogThread.
229  base::Closure callback(
230      base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
231                 ping_sequence_number_));
232  if (watched_loop_->PostTask(
233          FROM_HERE,
234          base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
235                     callback))) {
236      // Post a task to check the responsiveness of watched thread.
237      base::MessageLoop::current()->PostDelayedTask(
238          FROM_HERE,
239          base::Bind(&ThreadWatcher::OnCheckResponsiveness,
240                     weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
241          unresponsive_time_);
242  } else {
243    // Watched thread might have gone away, stop watching it.
244    DeActivateThreadWatching();
245  }
246}
247
248void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
249  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
250
251  // Record watched thread's response time.
252  base::TimeTicks now = base::TimeTicks::Now();
253  base::TimeDelta response_time = now - ping_time_;
254  response_time_histogram_->AddTime(response_time);
255
256  // Save the current time when we have got pong message.
257  pong_time_ = now;
258
259  // Check if there are any extra pings in flight.
260  DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
261  if (ping_sequence_number_ != ping_sequence_number)
262    return;
263
264  // Increment sequence number for the next ping message to indicate watched
265  // thread is responsive.
266  ++ping_sequence_number_;
267
268  // If we have stopped watching or if the user is idle, then stop sending
269  // ping messages.
270  if (!active_ || --ping_count_ <= 0)
271    return;
272
273  base::MessageLoop::current()->PostDelayedTask(
274      FROM_HERE,
275      base::Bind(&ThreadWatcher::PostPingMessage,
276                 weak_ptr_factory_.GetWeakPtr()),
277      sleep_time_);
278}
279
280void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
281  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
282  // If we have stopped watching then consider thread as responding.
283  if (!active_) {
284    responsive_ = true;
285    return;
286  }
287  // If the latest ping_sequence_number_ is not same as the ping_sequence_number
288  // that is passed in, then we can assume OnPongMessage was called.
289  // OnPongMessage increments ping_sequence_number_.
290  if (ping_sequence_number_ != ping_sequence_number) {
291    // Reset unresponsive_count_ to zero because we got a response from the
292    // watched thread.
293    ResetHangCounters();
294
295    responsive_ = true;
296    return;
297  }
298  // Record that we got no response from watched thread.
299  GotNoResponse();
300
301  // Post a task to check the responsiveness of watched thread.
302  base::MessageLoop::current()->PostDelayedTask(
303      FROM_HERE,
304      base::Bind(&ThreadWatcher::OnCheckResponsiveness,
305                 weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
306      unresponsive_time_);
307  responsive_ = false;
308}
309
310void ThreadWatcher::Initialize() {
311  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
312  ThreadWatcherList::Register(this);
313
314  const std::string response_time_histogram_name =
315      "ThreadWatcher.ResponseTime." + thread_name_;
316  response_time_histogram_ = base::Histogram::FactoryTimeGet(
317      response_time_histogram_name,
318      base::TimeDelta::FromMilliseconds(1),
319      base::TimeDelta::FromSeconds(100), 50,
320      base::Histogram::kUmaTargetedHistogramFlag);
321
322  const std::string unresponsive_time_histogram_name =
323      "ThreadWatcher.Unresponsive." + thread_name_;
324  unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
325      unresponsive_time_histogram_name,
326      base::TimeDelta::FromMilliseconds(1),
327      base::TimeDelta::FromSeconds(100), 50,
328      base::Histogram::kUmaTargetedHistogramFlag);
329
330  const std::string responsive_count_histogram_name =
331      "ThreadWatcher.ResponsiveThreads." + thread_name_;
332  responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
333      responsive_count_histogram_name, 1, 10, 11,
334      base::Histogram::kUmaTargetedHistogramFlag);
335
336  const std::string unresponsive_count_histogram_name =
337      "ThreadWatcher.UnresponsiveThreads." + thread_name_;
338  unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
339      unresponsive_count_histogram_name, 1, 10, 11,
340      base::Histogram::kUmaTargetedHistogramFlag);
341}
342
343// static
// Ping handler that runs on the watched thread |thread_id|. It answers by
// posting |callback_task| to the WatchDogThread.
void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
                                  const base::Closure& callback_task) {
  // This method is called on watched thread.
  DCHECK(BrowserThread::CurrentlyOn(thread_id));
  WatchDogThread::PostTask(FROM_HERE, callback_task);
}
350
// Clears the count of missed responsiveness checks and re-arms the one-time
// hang processing done by GotNoResponse().
void ThreadWatcher::ResetHangCounters() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  unresponsive_count_ = 0;
  hung_processing_complete_ = false;
}
356
357void ThreadWatcher::GotNoResponse() {
358  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
359
360  ++unresponsive_count_;
361  if (!IsVeryUnresponsive())
362    return;
363
364  // Record total unresponsive_time since last pong message.
365  base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
366  unresponsive_time_histogram_->AddTime(unresponse_time);
367
368  // We have already collected stats for the non-responding watched thread.
369  if (hung_processing_complete_)
370    return;
371
372  // Record how other threads are responding.
373  uint32 responding_thread_count = 0;
374  uint32 unresponding_thread_count = 0;
375  ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
376                                        &unresponding_thread_count);
377
378  // Record how many watched threads are responding.
379  responsive_count_histogram_->Add(responding_thread_count);
380
381  // Record how many watched threads are not responding.
382  unresponsive_count_histogram_->Add(unresponding_thread_count);
383
384  // Crash the browser if the watched thread is to be crashed on hang and if the
385  // number of other threads responding is less than or equal to
386  // live_threads_threshold_ and at least one other thread is responding.
387  if (crash_on_hang_ &&
388      responding_thread_count > 0 &&
389      responding_thread_count <= live_threads_threshold_) {
390    static bool crashed_once = false;
391    if (!crashed_once) {
392      crashed_once = true;
393      CrashBecauseThreadWasUnresponsive(thread_id_);
394    }
395  }
396
397  hung_processing_complete_ = true;
398}
399
400bool ThreadWatcher::IsVeryUnresponsive() {
401  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
402  return unresponsive_count_ >= unresponsive_threshold_;
403}
404
405// ThreadWatcherList methods and members.
406//
407// static
// The single ThreadWatcherList instance; set by the constructor and cleared by
// the destructor.
ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
// static
// Seconds a watcher waits after a pong before sending the next ping (passed as
// the sleep time by InitializeAndStartWatching()).
const int ThreadWatcherList::kSleepSeconds = 1;
// static
// Seconds between a ping and the check for its pong (passed as the
// unresponsive time by InitializeAndStartWatching()).
const int ThreadWatcherList::kUnresponsiveSeconds = 2;
// static
// Default number of missed checks before a thread counts as very unresponsive.
const int ThreadWatcherList::kUnresponsiveCount = 9;
// static
// Default upper bound on the number of still-responding threads for a hang to
// trigger a crash.
const int ThreadWatcherList::kLiveThreadsThreshold = 2;
417
// Builds crash thresholds from explicit values.
ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
    uint32 live_threads_threshold,
    uint32 unresponsive_threshold)
    : live_threads_threshold(live_threads_threshold),
      unresponsive_threshold(unresponsive_threshold) {
}
424
// Builds crash thresholds from the defaults kLiveThreadsThreshold and
// kUnresponsiveCount.
ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
    : live_threads_threshold(kLiveThreadsThreshold),
      unresponsive_threshold(kUnresponsiveCount) {
}
429
430// static
431void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
432  uint32 unresponsive_threshold;
433  CrashOnHangThreadMap crash_on_hang_threads;
434  ParseCommandLine(command_line,
435                   &unresponsive_threshold,
436                   &crash_on_hang_threads);
437
438  ThreadWatcherObserver::SetupNotifications(
439      base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
440
441  WatchDogThread::PostDelayedTask(
442      FROM_HERE,
443      base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
444                 unresponsive_threshold,
445                 crash_on_hang_threads),
446      base::TimeDelta::FromSeconds(120));
447}
448
449// static
// Stops reacting to user-activity notifications, then deletes every watcher
// and the list itself (DeleteAll() hops to the WatchDogThread if needed).
void ThreadWatcherList::StopWatchingAll() {
  ThreadWatcherObserver::RemoveNotifications();
  DeleteAll();
}
454
455// static
456void ThreadWatcherList::Register(ThreadWatcher* watcher) {
457  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
458  if (!g_thread_watcher_list_)
459    return;
460  DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
461  g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
462}
463
464// static
465bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
466  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
467  return NULL != ThreadWatcherList::Find(thread_id);
468}
469
470// static
471void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
472                                           uint32* unresponding_thread_count) {
473  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
474  *responding_thread_count = 0;
475  *unresponding_thread_count = 0;
476  if (!g_thread_watcher_list_)
477    return;
478
479  for (RegistrationList::iterator it =
480           g_thread_watcher_list_->registered_.begin();
481       g_thread_watcher_list_->registered_.end() != it;
482       ++it) {
483    if (it->second->IsVeryUnresponsive())
484      ++(*unresponding_thread_count);
485    else
486      ++(*responding_thread_count);
487  }
488}
489
490// static
491void ThreadWatcherList::WakeUpAll() {
492  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
493  if (!g_thread_watcher_list_)
494    return;
495
496  for (RegistrationList::iterator it =
497           g_thread_watcher_list_->registered_.begin();
498       g_thread_watcher_list_->registered_.end() != it;
499       ++it)
500    it->second->WakeUp();
501}
502
// Installs this object as the global list. Only one instance may exist at a
// time (enforced by the CHECK).
ThreadWatcherList::ThreadWatcherList() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  CHECK(!g_thread_watcher_list_);
  g_thread_watcher_list_ = this;
}
508
// Clears the global list pointer. Registered watchers are not deleted here;
// DeleteAll() deletes them before it deletes the list.
ThreadWatcherList::~ThreadWatcherList() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  DCHECK(this == g_thread_watcher_list_);
  g_thread_watcher_list_ = NULL;
}
514
515// static
516void ThreadWatcherList::ParseCommandLine(
517    const CommandLine& command_line,
518    uint32* unresponsive_threshold,
519    CrashOnHangThreadMap* crash_on_hang_threads) {
520  // Initialize |unresponsive_threshold| to a default value.
521  *unresponsive_threshold = kUnresponsiveCount;
522
523  // Increase the unresponsive_threshold on the Stable and Beta channels to
524  // reduce the number of crashes due to ThreadWatcher.
525  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
526  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
527    *unresponsive_threshold *= 4;
528  } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
529    *unresponsive_threshold *= 2;
530  }
531
532#if defined(OS_WIN)
533  // For Windows XP (old systems), double the unresponsive_threshold to give
534  // the OS a chance to schedule UI/IO threads a time slice to respond with a
535  // pong message (to get around limitations with the OS).
536  if (base::win::GetVersion() <= base::win::VERSION_XP)
537    *unresponsive_threshold *= 2;
538#endif
539
540  uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
541  std::string crash_on_hang_thread_names;
542  bool has_command_line_overwrite = false;
543  if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
544    crash_on_hang_thread_names =
545        command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
546    has_command_line_overwrite = true;
547  } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
548    // Default to crashing the browser if UI or IO or FILE threads are not
549    // responsive except in stable channel.
550    crash_on_hang_thread_names = base::StringPrintf(
551        "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
552        kLiveThreadsThreshold, crash_seconds,
553        kLiveThreadsThreshold, crash_seconds,
554        kLiveThreadsThreshold, crash_seconds * 5);
555  }
556
557  ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
558                                     kLiveThreadsThreshold,
559                                     crash_seconds,
560                                     crash_on_hang_threads);
561
562  if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
563      has_command_line_overwrite) {
564    return;
565  }
566
567  // Set up a field trial for 100% of the users to crash if either UI or IO
568  // thread is not responsive for 30 seconds (or 15 pings).
569  scoped_refptr<base::FieldTrial> field_trial(
570      base::FieldTrialList::FactoryGetFieldTrial(
571          "ThreadWatcher", 100, "default_hung_threads",
572          2013, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
573  int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
574  if (field_trial->group() == hung_thread_group) {
575    for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
576         crash_on_hang_threads->end() != it;
577         ++it) {
578      if (it->first != "IO")
579        continue;
580      it->second.live_threads_threshold = INT_MAX;
581      it->second.unresponsive_threshold = 15;
582    }
583  }
584}
585
586// static
587void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
588    const std::string& crash_on_hang_thread_names,
589    uint32 default_live_threads_threshold,
590    uint32 default_crash_seconds,
591    CrashOnHangThreadMap* crash_on_hang_threads) {
592  base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
593  std::vector<std::string> values;
594  while (tokens.GetNext()) {
595    const std::string& token = tokens.token();
596    base::SplitString(token, ':', &values);
597    std::string thread_name = values[0];
598
599    uint32 live_threads_threshold = default_live_threads_threshold;
600    uint32 crash_seconds = default_crash_seconds;
601    if (values.size() >= 2 &&
602        (!base::StringToUint(values[1], &live_threads_threshold))) {
603      continue;
604    }
605    if (values.size() >= 3 &&
606        (!base::StringToUint(values[2], &crash_seconds))) {
607      continue;
608    }
609    uint32 unresponsive_threshold = static_cast<uint32>(
610        ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
611
612    CrashDataThresholds crash_data(live_threads_threshold,
613                                   unresponsive_threshold);
614    // Use the last specifier.
615    (*crash_on_hang_threads)[thread_name] = crash_data;
616  }
617}
618
619// static
620void ThreadWatcherList::InitializeAndStartWatching(
621    uint32 unresponsive_threshold,
622    const CrashOnHangThreadMap& crash_on_hang_threads) {
623  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
624
625  ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
626  CHECK(thread_watcher_list);
627
628  BrowserThread::PostTask(
629      BrowserThread::UI,
630      FROM_HERE,
631      base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
632
633  const base::TimeDelta kSleepTime =
634      base::TimeDelta::FromSeconds(kSleepSeconds);
635  const base::TimeDelta kUnresponsiveTime =
636      base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
637
638  StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
639                unresponsive_threshold, crash_on_hang_threads);
640  StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
641                unresponsive_threshold, crash_on_hang_threads);
642  StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
643                unresponsive_threshold, crash_on_hang_threads);
644  StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
645                unresponsive_threshold, crash_on_hang_threads);
646  StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
647                unresponsive_threshold, crash_on_hang_threads);
648}
649
650// static
651void ThreadWatcherList::StartWatching(
652    const BrowserThread::ID& thread_id,
653    const std::string& thread_name,
654    const base::TimeDelta& sleep_time,
655    const base::TimeDelta& unresponsive_time,
656    uint32 unresponsive_threshold,
657    const CrashOnHangThreadMap& crash_on_hang_threads) {
658  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
659
660  CrashOnHangThreadMap::const_iterator it =
661      crash_on_hang_threads.find(thread_name);
662  bool crash_on_hang = false;
663  uint32 live_threads_threshold = 0;
664  if (it != crash_on_hang_threads.end()) {
665    crash_on_hang = true;
666    live_threads_threshold = it->second.live_threads_threshold;
667    unresponsive_threshold = it->second.unresponsive_threshold;
668  }
669
670  ThreadWatcher::StartWatching(
671      ThreadWatcher::WatchingParams(thread_id,
672                                    thread_name,
673                                    sleep_time,
674                                    unresponsive_time,
675                                    unresponsive_threshold,
676                                    crash_on_hang,
677                                    live_threads_threshold));
678}
679
680// static
681void ThreadWatcherList::DeleteAll() {
682  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
683    WatchDogThread::PostTask(
684        FROM_HERE,
685        base::Bind(&ThreadWatcherList::DeleteAll));
686    return;
687  }
688
689  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
690  if (!g_thread_watcher_list_)
691    return;
692
693  // Delete all thread watcher objects.
694  while (!g_thread_watcher_list_->registered_.empty()) {
695    RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
696    delete it->second;
697    g_thread_watcher_list_->registered_.erase(it);
698  }
699
700  delete g_thread_watcher_list_;
701}
702
703// static
704ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
705  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
706  if (!g_thread_watcher_list_)
707    return NULL;
708  RegistrationList::iterator it =
709      g_thread_watcher_list_->registered_.find(thread_id);
710  if (g_thread_watcher_list_->registered_.end() == it)
711    return NULL;
712  return it->second;
713}
714
715// ThreadWatcherObserver methods and members.
716//
717// static
// The single ThreadWatcherObserver instance; set by the constructor and
// cleared by the destructor.
ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
719
// Creates the observer and installs it as the global instance.
// |wakeup_interval| is the minimum time between two wake-ups forwarded to the
// watchers (see Observe()). Only one instance may exist at a time.
ThreadWatcherObserver::ThreadWatcherObserver(
    const base::TimeDelta& wakeup_interval)
    : last_wakeup_time_(base::TimeTicks::Now()),
      wakeup_interval_(wakeup_interval) {
  CHECK(!g_thread_watcher_observer_);
  g_thread_watcher_observer_ = this;
}
727
// Clears the global observer pointer.
ThreadWatcherObserver::~ThreadWatcherObserver() {
  DCHECK(this == g_thread_watcher_observer_);
  g_thread_watcher_observer_ = NULL;
}
732
733// static
// Creates the global observer on the UI thread and has MetricsService register
// it for notifications through the observer's own registrar. The observer is
// deleted later by RemoveNotifications().
void ThreadWatcherObserver::SetupNotifications(
    const base::TimeDelta& wakeup_interval) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
  MetricsService::SetUpNotifications(&observer->registrar_, observer);
}
740
741// static
742void ThreadWatcherObserver::RemoveNotifications() {
743  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
744  if (!g_thread_watcher_observer_)
745    return;
746  g_thread_watcher_observer_->registrar_.RemoveAll();
747  delete g_thread_watcher_observer_;
748}
749
750void ThreadWatcherObserver::Observe(
751    int type,
752    const content::NotificationSource& source,
753    const content::NotificationDetails& details) {
754  // There is some user activity, see if thread watchers are to be awakened.
755  base::TimeTicks now = base::TimeTicks::Now();
756  if ((now - last_wakeup_time_) < wakeup_interval_)
757    return;
758  last_wakeup_time_ = now;
759  WatchDogThread::PostTask(
760      FROM_HERE,
761      base::Bind(&ThreadWatcherList::WakeUpAll));
762}
763
764// WatchDogThread methods and members.
765
766// This lock protects g_watchdog_thread.
// Taken by CurrentlyOnWatchDogThread(), PostTaskHelper(), Init() and CleanUp()
// around every access to |g_watchdog_thread|.
static base::LazyInstance<base::Lock>::Leaky
    g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;

// The singleton of this class. Set in Init() and cleared in CleanUp().
static WatchDogThread* g_watchdog_thread = NULL;
772
// Names the underlying thread "BrowserWatchdog". The global pointer is set
// later, in Init().
WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
}
775
// Stops the thread before the object goes away.
WatchDogThread::~WatchDogThread() {
  Stop();
}
779
780// static
781bool WatchDogThread::CurrentlyOnWatchDogThread() {
782  base::AutoLock lock(g_watchdog_lock.Get());
783  return g_watchdog_thread &&
784      g_watchdog_thread->message_loop() == base::MessageLoop::current();
785}
786
787// static
788bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
789                              const base::Closure& task) {
790  return PostTaskHelper(from_here, task, base::TimeDelta());
791}
792
793// static
// Posts |task| to the WatchDogThread to run after |delay|. Returns false if
// the task could not be posted (see PostTaskHelper()).
bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
                                     const base::Closure& task,
                                     base::TimeDelta delay) {
  return PostTaskHelper(from_here, task, delay);
}
799
800// static
801bool WatchDogThread::PostTaskHelper(
802    const tracked_objects::Location& from_here,
803    const base::Closure& task,
804    base::TimeDelta delay) {
805  {
806    base::AutoLock lock(g_watchdog_lock.Get());
807
808    base::MessageLoop* message_loop = g_watchdog_thread ?
809        g_watchdog_thread->message_loop() : NULL;
810    if (message_loop) {
811      message_loop->PostDelayedTask(from_here, task, delay);
812      return true;
813    }
814  }
815
816  return false;
817}
818
// Disallows IO on the calling thread and publishes this object as the global
// WatchDogThread. Only one instance may be live at a time (enforced by the
// CHECK).
// NOTE(review): presumably invoked on the new thread by the Thread base class
// when it starts -- confirm against base/threading/thread.h.
void WatchDogThread::Init() {
  // This thread shouldn't be allowed to perform any blocking disk I/O.
  base::ThreadRestrictions::SetIOAllowed(false);

  base::AutoLock lock(g_watchdog_lock.Get());
  CHECK(!g_watchdog_thread);
  g_watchdog_thread = this;
}
827
// Clears the global WatchDogThread pointer under the lock, so later posts are
// rejected by PostTaskHelper().
void WatchDogThread::CleanUp() {
  base::AutoLock lock(g_watchdog_lock.Get());
  g_watchdog_thread = NULL;
}
832
833namespace {
834
835// StartupWatchDogThread methods and members.
836//
837// Class for detecting hangs during startup.
838class StartupWatchDogThread : public base::Watchdog {
839 public:
840  // Constructor specifies how long the StartupWatchDogThread will wait before
841  // alarming.
842  explicit StartupWatchDogThread(const base::TimeDelta& duration)
843      : base::Watchdog(duration, "Startup watchdog thread", true) {
844  }
845
846  // Alarm is called if the time expires after an Arm() without someone calling
847  // Disarm(). When Alarm goes off, in release mode we get the crash dump
848  // without crashing and in debug mode we break into the debugger.
849  virtual void Alarm() OVERRIDE {
850#ifndef NDEBUG
851    DCHECK(false);
852#else
853    logging::DumpWithoutCrashing();
854#endif
855  }
856
857  DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
858};
859
860// ShutdownWatchDogThread methods and members.
861//
862// Class for detecting hangs during shutdown.
// Watchdog that detects hangs during shutdown.
class ShutdownWatchDogThread : public base::Watchdog {
 public:
  // Constructor specifies how long the ShutdownWatchDogThread will wait before
  // alarming.
  explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
      : base::Watchdog(duration, "Shutdown watchdog thread", true) {
  }

  // Alarm is called if the time expires after an Arm() without someone calling
  // Disarm(). It reports the hang through ShutdownCrash(), whose name shows up
  // on the call stack.
  virtual void Alarm() OVERRIDE {
    ShutdownCrash();
  }

  DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
};
879}  // namespace
880
881// StartupTimeBomb methods and members.
882//
883// static
// The single StartupTimeBomb instance; set by the constructor and cleared by
// the destructor.
StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
885
// Installs this object as the global time bomb and remembers the constructing
// thread's id, which the other methods DCHECK against. Only one instance may
// exist at a time.
StartupTimeBomb::StartupTimeBomb()
    : startup_watchdog_(NULL),
      thread_id_(base::PlatformThread::CurrentId()) {
  CHECK(!g_startup_timebomb_);
  g_startup_timebomb_ = this;
}
892
// Disarms a still-armed watchdog and clears the global pointer.
// NOTE(review): Disarm() can leave a delayed DeleteStartupWatchdog() task
// bound to |this| when the watchdog is not yet joinable; that task would
// outlive this object -- confirm this cannot happen at destruction time.
StartupTimeBomb::~StartupTimeBomb() {
  DCHECK(this == g_startup_timebomb_);
  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
  if (startup_watchdog_)
    Disarm();
  g_startup_timebomb_ = NULL;
}
900
901void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
902  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
903  DCHECK(!startup_watchdog_);
904  startup_watchdog_ = new StartupWatchDogThread(duration);
905  startup_watchdog_->Arm();
906  return;
907}
908
909void StartupTimeBomb::Disarm() {
910  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
911  if (startup_watchdog_) {
912    startup_watchdog_->Disarm();
913    startup_watchdog_->Cleanup();
914    DeleteStartupWatchdog();
915  }
916}
917
918void StartupTimeBomb::DeleteStartupWatchdog() {
919  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
920  if (startup_watchdog_->IsJoinable()) {
921    // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
922    // very fast.
923    base::ThreadRestrictions::SetIOAllowed(true);
924    delete startup_watchdog_;
925    startup_watchdog_ = NULL;
926    return;
927  }
928  base::MessageLoop::current()->PostDelayedTask(
929      FROM_HERE,
930      base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
931                 base::Unretained(this)),
932      base::TimeDelta::FromSeconds(10));
933}
934
935// static
936void StartupTimeBomb::DisarmStartupTimeBomb() {
937  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
938  if (g_startup_timebomb_)
939    g_startup_timebomb_->Disarm();
940}
941
942// ShutdownWatcherHelper methods and members.
943//
944// ShutdownWatcherHelper is a wrapper class for detecting hangs during
945// shutdown.
// Starts with no watchdog and remembers the constructing thread's id, which
// Arm() and the destructor DCHECK against.
ShutdownWatcherHelper::ShutdownWatcherHelper()
    : shutdown_watchdog_(NULL),
      thread_id_(base::PlatformThread::CurrentId()) {
}
950
951ShutdownWatcherHelper::~ShutdownWatcherHelper() {
952  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
953  if (shutdown_watchdog_) {
954    shutdown_watchdog_->Disarm();
955    delete shutdown_watchdog_;
956    shutdown_watchdog_ = NULL;
957  }
958}
959
960void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
961  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
962  DCHECK(!shutdown_watchdog_);
963  base::TimeDelta actual_duration = duration;
964
965  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
966  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
967    actual_duration *= 20;
968  } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
969             channel == chrome::VersionInfo::CHANNEL_DEV) {
970    actual_duration *= 10;
971  }
972
973#if defined(OS_WIN)
974  // On Windows XP, give twice the time for shutdown.
975  if (base::win::GetVersion() <= base::win::VERSION_XP)
976    actual_duration *= 2;
977#endif
978
979  shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
980  shutdown_watchdog_->Arm();
981}
982