thread_watcher.cc revision 1320f92c476a1ad9d19dba2a48c72b75566198e9
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/metrics/thread_watcher.h"
6
7#include <math.h>  // ceil
8
9#include "base/bind.h"
10#include "base/compiler_specific.h"
11#include "base/debug/alias.h"
12#include "base/debug/debugger.h"
13#include "base/debug/dump_without_crashing.h"
14#include "base/lazy_instance.h"
15#include "base/metrics/field_trial.h"
16#include "base/strings/string_number_conversions.h"
17#include "base/strings/string_split.h"
18#include "base/strings/string_tokenizer.h"
19#include "base/strings/stringprintf.h"
20#include "base/threading/thread_restrictions.h"
21#include "build/build_config.h"
22#include "chrome/browser/chrome_notification_types.h"
23#include "chrome/common/chrome_switches.h"
24#include "chrome/common/chrome_version_info.h"
25#include "chrome/common/logging_chrome.h"
26#include "content/public/browser/notification_service.h"
27
28#if defined(OS_WIN)
29#include "base/win/windows_version.h"
30#endif
31
32using content::BrowserThread;
33
34namespace {
35
36// The following are unique function names for forcing the crash when a thread
37// is unresponsive. This makes it possible to tell from the callstack alone what
38// thread was unresponsive.
39//
40// We disable optimizations for this block of functions so the compiler doesn't
41// merge them all together.
42MSVC_DISABLE_OPTIMIZE()
43MSVC_PUSH_DISABLE_WARNING(4748)
44
45void ReportThreadHang() {
46#if defined(NDEBUG)
47  base::debug::DumpWithoutCrashing();
48#else
49  base::debug::BreakDebugger();
50#endif
51}
52
53#if !defined(OS_ANDROID) || !defined(NDEBUG)
54// TODO(rtenneti): Enabled crashing, after getting data.
55NOINLINE void StartupHang() {
56  ReportThreadHang();
57}
58#endif  // OS_ANDROID
59
60NOINLINE void ShutdownHang() {
61  ReportThreadHang();
62}
63
64NOINLINE void ThreadUnresponsive_UI() {
65  ReportThreadHang();
66}
67
68NOINLINE void ThreadUnresponsive_DB() {
69  ReportThreadHang();
70}
71
72NOINLINE void ThreadUnresponsive_FILE() {
73  ReportThreadHang();
74}
75
76NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
77  ReportThreadHang();
78}
79
80NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
81  ReportThreadHang();
82}
83
84NOINLINE void ThreadUnresponsive_CACHE() {
85  ReportThreadHang();
86}
87
88NOINLINE void ThreadUnresponsive_IO() {
89  ReportThreadHang();
90}
91
92MSVC_POP_WARNING()
93MSVC_ENABLE_OPTIMIZE();
94
95void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
96  base::debug::Alias(&thread_id);
97
98  switch (thread_id) {
99    case BrowserThread::UI:
100      return ThreadUnresponsive_UI();
101    case BrowserThread::DB:
102      return ThreadUnresponsive_DB();
103    case BrowserThread::FILE:
104      return ThreadUnresponsive_FILE();
105    case BrowserThread::FILE_USER_BLOCKING:
106      return ThreadUnresponsive_FILE_USER_BLOCKING();
107    case BrowserThread::PROCESS_LAUNCHER:
108      return ThreadUnresponsive_PROCESS_LAUNCHER();
109    case BrowserThread::CACHE:
110      return ThreadUnresponsive_CACHE();
111    case BrowserThread::IO:
112      return ThreadUnresponsive_IO();
113    case BrowserThread::ID_COUNT:
114      CHECK(false);  // This shouldn't actually be reached!
115      break;
116
117    // Omission of the default hander is intentional -- that way the compiler
118    // should warn if our switch becomes outdated.
119  }
120
121  CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
122}
123
124}  // namespace
125
126// ThreadWatcher methods and members.
127ThreadWatcher::ThreadWatcher(const WatchingParams& params)
128    : thread_id_(params.thread_id),
129      thread_name_(params.thread_name),
130      watched_loop_(
131          BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
132      sleep_time_(params.sleep_time),
133      unresponsive_time_(params.unresponsive_time),
134      ping_time_(base::TimeTicks::Now()),
135      pong_time_(ping_time_),
136      ping_sequence_number_(0),
137      active_(false),
138      ping_count_(params.unresponsive_threshold),
139      response_time_histogram_(NULL),
140      unresponsive_time_histogram_(NULL),
141      unresponsive_count_(0),
142      hung_processing_complete_(false),
143      unresponsive_threshold_(params.unresponsive_threshold),
144      crash_on_hang_(params.crash_on_hang),
145      live_threads_threshold_(params.live_threads_threshold),
146      weak_ptr_factory_(this) {
147  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
148  Initialize();
149}
150
151ThreadWatcher::~ThreadWatcher() {}
152
153// static
154void ThreadWatcher::StartWatching(const WatchingParams& params) {
155  DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
156  DCHECK_GE(params.unresponsive_time.InMilliseconds(),
157            params.sleep_time.InMilliseconds());
158
159  // If we are not on WatchDogThread, then post a task to call StartWatching on
160  // WatchDogThread.
161  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
162    WatchDogThread::PostTask(
163        FROM_HERE,
164        base::Bind(&ThreadWatcher::StartWatching, params));
165    return;
166  }
167
168  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
169
170  // Create a new thread watcher object for the given thread and activate it.
171  ThreadWatcher* watcher = new ThreadWatcher(params);
172
173  DCHECK(watcher);
174  // If we couldn't register the thread watcher object, we are shutting down,
175  // then don't activate thread watching.
176  if (!ThreadWatcherList::IsRegistered(params.thread_id))
177    return;
178  watcher->ActivateThreadWatching();
179}
180
181void ThreadWatcher::ActivateThreadWatching() {
182  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
183  if (active_) return;
184  active_ = true;
185  ping_count_ = unresponsive_threshold_;
186  ResetHangCounters();
187  base::MessageLoop::current()->PostTask(
188      FROM_HERE,
189      base::Bind(&ThreadWatcher::PostPingMessage,
190                 weak_ptr_factory_.GetWeakPtr()));
191}
192
193void ThreadWatcher::DeActivateThreadWatching() {
194  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
195  active_ = false;
196  ping_count_ = 0;
197  weak_ptr_factory_.InvalidateWeakPtrs();
198}
199
200void ThreadWatcher::WakeUp() {
201  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
202  // There is some user activity, PostPingMessage task of thread watcher if
203  // needed.
204  if (!active_) return;
205
206  // Throw away the previous |unresponsive_count_| and start over again. Just
207  // before going to sleep, |unresponsive_count_| could be very close to
208  // |unresponsive_threshold_| and when user becomes active,
209  // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
210  // response for ping messages. Reset |unresponsive_count_| to start measuring
211  // the unresponsiveness of the threads when system becomes active.
212  unresponsive_count_ = 0;
213
214  if (ping_count_ <= 0) {
215    ping_count_ = unresponsive_threshold_;
216    ResetHangCounters();
217    PostPingMessage();
218  } else {
219    ping_count_ = unresponsive_threshold_;
220  }
221}
222
223void ThreadWatcher::PostPingMessage() {
224  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
225  // If we have stopped watching or if the user is idle, then stop sending
226  // ping messages.
227  if (!active_ || ping_count_ <= 0)
228    return;
229
230  // Save the current time when we have sent ping message.
231  ping_time_ = base::TimeTicks::Now();
232
233  // Send a ping message to the watched thread. Callback will be called on
234  // the WatchDogThread.
235  base::Closure callback(
236      base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
237                 ping_sequence_number_));
238  if (watched_loop_->PostTask(
239          FROM_HERE,
240          base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
241                     callback))) {
242      // Post a task to check the responsiveness of watched thread.
243      base::MessageLoop::current()->PostDelayedTask(
244          FROM_HERE,
245          base::Bind(&ThreadWatcher::OnCheckResponsiveness,
246                     weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
247          unresponsive_time_);
248  } else {
249    // Watched thread might have gone away, stop watching it.
250    DeActivateThreadWatching();
251  }
252}
253
254void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
255  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
256
257  // Record watched thread's response time.
258  base::TimeTicks now = base::TimeTicks::Now();
259  base::TimeDelta response_time = now - ping_time_;
260  response_time_histogram_->AddTime(response_time);
261
262  // Save the current time when we have got pong message.
263  pong_time_ = now;
264
265  // Check if there are any extra pings in flight.
266  DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
267  if (ping_sequence_number_ != ping_sequence_number)
268    return;
269
270  // Increment sequence number for the next ping message to indicate watched
271  // thread is responsive.
272  ++ping_sequence_number_;
273
274  // If we have stopped watching or if the user is idle, then stop sending
275  // ping messages.
276  if (!active_ || --ping_count_ <= 0)
277    return;
278
279  base::MessageLoop::current()->PostDelayedTask(
280      FROM_HERE,
281      base::Bind(&ThreadWatcher::PostPingMessage,
282                 weak_ptr_factory_.GetWeakPtr()),
283      sleep_time_);
284}
285
286void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
287  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
288  // If we have stopped watching then consider thread as responding.
289  if (!active_) {
290    responsive_ = true;
291    return;
292  }
293  // If the latest ping_sequence_number_ is not same as the ping_sequence_number
294  // that is passed in, then we can assume OnPongMessage was called.
295  // OnPongMessage increments ping_sequence_number_.
296  if (ping_sequence_number_ != ping_sequence_number) {
297    // Reset unresponsive_count_ to zero because we got a response from the
298    // watched thread.
299    ResetHangCounters();
300
301    responsive_ = true;
302    return;
303  }
304  // Record that we got no response from watched thread.
305  GotNoResponse();
306
307  // Post a task to check the responsiveness of watched thread.
308  base::MessageLoop::current()->PostDelayedTask(
309      FROM_HERE,
310      base::Bind(&ThreadWatcher::OnCheckResponsiveness,
311                 weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
312      unresponsive_time_);
313  responsive_ = false;
314}
315
316void ThreadWatcher::Initialize() {
317  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
318  ThreadWatcherList::Register(this);
319
320  const std::string response_time_histogram_name =
321      "ThreadWatcher.ResponseTime." + thread_name_;
322  response_time_histogram_ = base::Histogram::FactoryTimeGet(
323      response_time_histogram_name,
324      base::TimeDelta::FromMilliseconds(1),
325      base::TimeDelta::FromSeconds(100), 50,
326      base::Histogram::kUmaTargetedHistogramFlag);
327
328  const std::string unresponsive_time_histogram_name =
329      "ThreadWatcher.Unresponsive." + thread_name_;
330  unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
331      unresponsive_time_histogram_name,
332      base::TimeDelta::FromMilliseconds(1),
333      base::TimeDelta::FromSeconds(100), 50,
334      base::Histogram::kUmaTargetedHistogramFlag);
335
336  const std::string responsive_count_histogram_name =
337      "ThreadWatcher.ResponsiveThreads." + thread_name_;
338  responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
339      responsive_count_histogram_name, 1, 10, 11,
340      base::Histogram::kUmaTargetedHistogramFlag);
341
342  const std::string unresponsive_count_histogram_name =
343      "ThreadWatcher.UnresponsiveThreads." + thread_name_;
344  unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
345      unresponsive_count_histogram_name, 1, 10, 11,
346      base::Histogram::kUmaTargetedHistogramFlag);
347}
348
349// static
350void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
351                                  const base::Closure& callback_task) {
352  // This method is called on watched thread.
353  DCHECK(BrowserThread::CurrentlyOn(thread_id));
354  WatchDogThread::PostTask(FROM_HERE, callback_task);
355}
356
357void ThreadWatcher::ResetHangCounters() {
358  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
359  unresponsive_count_ = 0;
360  hung_processing_complete_ = false;
361}
362
363void ThreadWatcher::GotNoResponse() {
364  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
365
366  ++unresponsive_count_;
367  if (!IsVeryUnresponsive())
368    return;
369
370  // Record total unresponsive_time since last pong message.
371  base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
372  unresponsive_time_histogram_->AddTime(unresponse_time);
373
374  // We have already collected stats for the non-responding watched thread.
375  if (hung_processing_complete_)
376    return;
377
378  // Record how other threads are responding.
379  uint32 responding_thread_count = 0;
380  uint32 unresponding_thread_count = 0;
381  ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
382                                        &unresponding_thread_count);
383
384  // Record how many watched threads are responding.
385  responsive_count_histogram_->Add(responding_thread_count);
386
387  // Record how many watched threads are not responding.
388  unresponsive_count_histogram_->Add(unresponding_thread_count);
389
390  // Crash the browser if the watched thread is to be crashed on hang and if the
391  // number of other threads responding is less than or equal to
392  // live_threads_threshold_ and at least one other thread is responding.
393  if (crash_on_hang_ &&
394      responding_thread_count > 0 &&
395      responding_thread_count <= live_threads_threshold_) {
396    static bool crashed_once = false;
397    if (!crashed_once) {
398      crashed_once = true;
399      CrashBecauseThreadWasUnresponsive(thread_id_);
400    }
401  }
402
403  hung_processing_complete_ = true;
404}
405
406bool ThreadWatcher::IsVeryUnresponsive() {
407  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
408  return unresponsive_count_ >= unresponsive_threshold_;
409}
410
411// ThreadWatcherList methods and members.
412//
413// static
414ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
415// static
416bool ThreadWatcherList::g_stopped_ = false;
417// static
418const int ThreadWatcherList::kSleepSeconds = 1;
419// static
420const int ThreadWatcherList::kUnresponsiveSeconds = 2;
421// static
422const int ThreadWatcherList::kUnresponsiveCount = 9;
423// static
424const int ThreadWatcherList::kLiveThreadsThreshold = 2;
425// static, non-const for tests.
426int ThreadWatcherList::g_initialize_delay_seconds = 120;
427
428ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
429    uint32 live_threads_threshold,
430    uint32 unresponsive_threshold)
431    : live_threads_threshold(live_threads_threshold),
432      unresponsive_threshold(unresponsive_threshold) {
433}
434
435ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
436    : live_threads_threshold(kLiveThreadsThreshold),
437      unresponsive_threshold(kUnresponsiveCount) {
438}
439
440// static
441void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
442  // TODO(rtenneti): Enable ThreadWatcher.
443  uint32 unresponsive_threshold;
444  CrashOnHangThreadMap crash_on_hang_threads;
445  ParseCommandLine(command_line,
446                   &unresponsive_threshold,
447                   &crash_on_hang_threads);
448
449  ThreadWatcherObserver::SetupNotifications(
450      base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
451
452  WatchDogThread::PostTask(
453      FROM_HERE,
454      base::Bind(&ThreadWatcherList::SetStopped, false));
455
456  WatchDogThread::PostDelayedTask(
457      FROM_HERE,
458      base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
459                 unresponsive_threshold,
460                 crash_on_hang_threads),
461      base::TimeDelta::FromSeconds(g_initialize_delay_seconds));
462}
463
464// static
465void ThreadWatcherList::StopWatchingAll() {
466  // TODO(rtenneti): Enable ThreadWatcher.
467  ThreadWatcherObserver::RemoveNotifications();
468  DeleteAll();
469}
470
471// static
472void ThreadWatcherList::Register(ThreadWatcher* watcher) {
473  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
474  if (!g_thread_watcher_list_)
475    return;
476  DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
477  g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
478}
479
480// static
481bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
482  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
483  return NULL != ThreadWatcherList::Find(thread_id);
484}
485
486// static
487void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
488                                           uint32* unresponding_thread_count) {
489  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
490  *responding_thread_count = 0;
491  *unresponding_thread_count = 0;
492  if (!g_thread_watcher_list_)
493    return;
494
495  for (RegistrationList::iterator it =
496           g_thread_watcher_list_->registered_.begin();
497       g_thread_watcher_list_->registered_.end() != it;
498       ++it) {
499    if (it->second->IsVeryUnresponsive())
500      ++(*unresponding_thread_count);
501    else
502      ++(*responding_thread_count);
503  }
504}
505
506// static
507void ThreadWatcherList::WakeUpAll() {
508  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
509  if (!g_thread_watcher_list_)
510    return;
511
512  for (RegistrationList::iterator it =
513           g_thread_watcher_list_->registered_.begin();
514       g_thread_watcher_list_->registered_.end() != it;
515       ++it)
516    it->second->WakeUp();
517}
518
519ThreadWatcherList::ThreadWatcherList() {
520  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
521  CHECK(!g_thread_watcher_list_);
522  g_thread_watcher_list_ = this;
523}
524
525ThreadWatcherList::~ThreadWatcherList() {
526  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
527  DCHECK(this == g_thread_watcher_list_);
528  g_thread_watcher_list_ = NULL;
529}
530
531// static
532void ThreadWatcherList::ParseCommandLine(
533    const CommandLine& command_line,
534    uint32* unresponsive_threshold,
535    CrashOnHangThreadMap* crash_on_hang_threads) {
536  // Initialize |unresponsive_threshold| to a default value.
537  // TODO(rtenneti): Changed the default value to 4 times, until we can triage
538  // hangs automatically (and to reduce the crash dumps).
539  *unresponsive_threshold = kUnresponsiveCount * 4;
540
541  // Increase the unresponsive_threshold on the Stable and Beta channels to
542  // reduce the number of crashes due to ThreadWatcher.
543  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
544  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
545    *unresponsive_threshold *= 4;
546  } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
547    *unresponsive_threshold *= 2;
548  }
549
550#if defined(OS_WIN)
551  // For Windows XP (old systems), double the unresponsive_threshold to give
552  // the OS a chance to schedule UI/IO threads a time slice to respond with a
553  // pong message (to get around limitations with the OS).
554  if (base::win::GetVersion() <= base::win::VERSION_XP)
555    *unresponsive_threshold *= 2;
556#endif
557
558  uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
559  std::string crash_on_hang_thread_names;
560  bool has_command_line_overwrite = false;
561  if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
562    crash_on_hang_thread_names =
563        command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
564    has_command_line_overwrite = true;
565  } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
566    // Default to crashing the browser if UI or IO or FILE threads are not
567    // responsive except in stable channel.
568    crash_on_hang_thread_names = base::StringPrintf(
569        "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
570        kLiveThreadsThreshold, crash_seconds,
571        kLiveThreadsThreshold, crash_seconds,
572        kLiveThreadsThreshold, crash_seconds * 5);
573  }
574
575  ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
576                                     kLiveThreadsThreshold,
577                                     crash_seconds,
578                                     crash_on_hang_threads);
579
580  if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
581      has_command_line_overwrite) {
582    return;
583  }
584
585  const char* kFieldTrialName = "ThreadWatcher";
586
587  // Nothing else to be done if the trial has already been set (i.e., when
588  // StartWatchingAll() has been already called once).
589  if (base::FieldTrialList::TrialExists(kFieldTrialName))
590    return;
591
592  // Set up a field trial for 100% of the users to crash if either UI or IO
593  // thread is not responsive for 30 seconds (or 15 pings).
594  scoped_refptr<base::FieldTrial> field_trial(
595      base::FieldTrialList::FactoryGetFieldTrial(
596          kFieldTrialName, 100, "default_hung_threads",
597          2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
598  int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
599  if (field_trial->group() == hung_thread_group) {
600    for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
601         crash_on_hang_threads->end() != it;
602         ++it) {
603      if (it->first == "FILE")
604        continue;
605      it->second.live_threads_threshold = INT_MAX;
606      if (it->first == "UI") {
607        // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
608        // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
609        // it to a more reasonable time ala IO thread.
610        it->second.unresponsive_threshold = 60;
611      } else {
612        it->second.unresponsive_threshold = 15;
613      }
614    }
615  }
616}
617
618// static
619void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
620    const std::string& crash_on_hang_thread_names,
621    uint32 default_live_threads_threshold,
622    uint32 default_crash_seconds,
623    CrashOnHangThreadMap* crash_on_hang_threads) {
624  base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
625  std::vector<std::string> values;
626  while (tokens.GetNext()) {
627    const std::string& token = tokens.token();
628    base::SplitString(token, ':', &values);
629    std::string thread_name = values[0];
630
631    uint32 live_threads_threshold = default_live_threads_threshold;
632    uint32 crash_seconds = default_crash_seconds;
633    if (values.size() >= 2 &&
634        (!base::StringToUint(values[1], &live_threads_threshold))) {
635      continue;
636    }
637    if (values.size() >= 3 &&
638        (!base::StringToUint(values[2], &crash_seconds))) {
639      continue;
640    }
641    uint32 unresponsive_threshold = static_cast<uint32>(
642        ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
643
644    CrashDataThresholds crash_data(live_threads_threshold,
645                                   unresponsive_threshold);
646    // Use the last specifier.
647    (*crash_on_hang_threads)[thread_name] = crash_data;
648  }
649}
650
651// static
652void ThreadWatcherList::InitializeAndStartWatching(
653    uint32 unresponsive_threshold,
654    const CrashOnHangThreadMap& crash_on_hang_threads) {
655  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
656
657  // Disarm the startup timebomb, even if stop has been called.
658  BrowserThread::PostTask(
659      BrowserThread::UI,
660      FROM_HERE,
661      base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
662
663  // This method is deferred in relationship to its StopWatchingAll()
664  // counterpart. If a previous initialization has already happened, or if
665  // stop has been called, there's nothing left to do here.
666  if (g_thread_watcher_list_ || g_stopped_)
667    return;
668
669  ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
670  CHECK(thread_watcher_list);
671
672  const base::TimeDelta kSleepTime =
673      base::TimeDelta::FromSeconds(kSleepSeconds);
674  const base::TimeDelta kUnresponsiveTime =
675      base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
676
677  StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
678                unresponsive_threshold, crash_on_hang_threads);
679  StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
680                unresponsive_threshold, crash_on_hang_threads);
681  StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
682                unresponsive_threshold, crash_on_hang_threads);
683  StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
684                unresponsive_threshold, crash_on_hang_threads);
685  StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
686                unresponsive_threshold, crash_on_hang_threads);
687}
688
689// static
690void ThreadWatcherList::StartWatching(
691    const BrowserThread::ID& thread_id,
692    const std::string& thread_name,
693    const base::TimeDelta& sleep_time,
694    const base::TimeDelta& unresponsive_time,
695    uint32 unresponsive_threshold,
696    const CrashOnHangThreadMap& crash_on_hang_threads) {
697  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
698
699  CrashOnHangThreadMap::const_iterator it =
700      crash_on_hang_threads.find(thread_name);
701  bool crash_on_hang = false;
702  uint32 live_threads_threshold = 0;
703  if (it != crash_on_hang_threads.end()) {
704    crash_on_hang = true;
705    live_threads_threshold = it->second.live_threads_threshold;
706    unresponsive_threshold = it->second.unresponsive_threshold;
707  }
708
709  ThreadWatcher::StartWatching(
710      ThreadWatcher::WatchingParams(thread_id,
711                                    thread_name,
712                                    sleep_time,
713                                    unresponsive_time,
714                                    unresponsive_threshold,
715                                    crash_on_hang,
716                                    live_threads_threshold));
717}
718
719// static
720void ThreadWatcherList::DeleteAll() {
721  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
722    WatchDogThread::PostTask(
723        FROM_HERE,
724        base::Bind(&ThreadWatcherList::DeleteAll));
725    return;
726  }
727
728  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
729
730  SetStopped(true);
731
732  if (!g_thread_watcher_list_)
733    return;
734
735  // Delete all thread watcher objects.
736  while (!g_thread_watcher_list_->registered_.empty()) {
737    RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
738    delete it->second;
739    g_thread_watcher_list_->registered_.erase(it);
740  }
741
742  delete g_thread_watcher_list_;
743}
744
745// static
746ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
747  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
748  if (!g_thread_watcher_list_)
749    return NULL;
750  RegistrationList::iterator it =
751      g_thread_watcher_list_->registered_.find(thread_id);
752  if (g_thread_watcher_list_->registered_.end() == it)
753    return NULL;
754  return it->second;
755}
756
757// static
758void ThreadWatcherList::SetStopped(bool stopped) {
759  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
760  g_stopped_ = stopped;
761}
762
763// ThreadWatcherObserver methods and members.
764//
765// static
766ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
767
768ThreadWatcherObserver::ThreadWatcherObserver(
769    const base::TimeDelta& wakeup_interval)
770    : last_wakeup_time_(base::TimeTicks::Now()),
771      wakeup_interval_(wakeup_interval) {
772  CHECK(!g_thread_watcher_observer_);
773  g_thread_watcher_observer_ = this;
774}
775
776ThreadWatcherObserver::~ThreadWatcherObserver() {
777  DCHECK(this == g_thread_watcher_observer_);
778  g_thread_watcher_observer_ = NULL;
779}
780
781// static
782void ThreadWatcherObserver::SetupNotifications(
783    const base::TimeDelta& wakeup_interval) {
784  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
785  ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
786  observer->registrar_.Add(
787      observer,
788      chrome::NOTIFICATION_BROWSER_OPENED,
789      content::NotificationService::AllBrowserContextsAndSources());
790  observer->registrar_.Add(observer,
791                           chrome::NOTIFICATION_BROWSER_CLOSED,
792                           content::NotificationService::AllSources());
793  observer->registrar_.Add(observer,
794                           chrome::NOTIFICATION_TAB_PARENTED,
795                           content::NotificationService::AllSources());
796  observer->registrar_.Add(observer,
797                           chrome::NOTIFICATION_TAB_CLOSING,
798                           content::NotificationService::AllSources());
799  observer->registrar_.Add(observer,
800                           content::NOTIFICATION_LOAD_START,
801                           content::NotificationService::AllSources());
802  observer->registrar_.Add(observer,
803                           content::NOTIFICATION_LOAD_STOP,
804                           content::NotificationService::AllSources());
805  observer->registrar_.Add(observer,
806                           content::NOTIFICATION_RENDERER_PROCESS_CLOSED,
807                           content::NotificationService::AllSources());
808  observer->registrar_.Add(observer,
809                           content::NOTIFICATION_RENDER_WIDGET_HOST_HANG,
810                           content::NotificationService::AllSources());
811  observer->registrar_.Add(observer,
812                           chrome::NOTIFICATION_OMNIBOX_OPENED_URL,
813                           content::NotificationService::AllSources());
814}
815
816// static
817void ThreadWatcherObserver::RemoveNotifications() {
818  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
819  if (!g_thread_watcher_observer_)
820    return;
821  g_thread_watcher_observer_->registrar_.RemoveAll();
822  delete g_thread_watcher_observer_;
823}
824
825void ThreadWatcherObserver::Observe(
826    int type,
827    const content::NotificationSource& source,
828    const content::NotificationDetails& details) {
829  // There is some user activity, see if thread watchers are to be awakened.
830  base::TimeTicks now = base::TimeTicks::Now();
831  if ((now - last_wakeup_time_) < wakeup_interval_)
832    return;
833  last_wakeup_time_ = now;
834  WatchDogThread::PostTask(
835      FROM_HERE,
836      base::Bind(&ThreadWatcherList::WakeUpAll));
837}
838
839// WatchDogThread methods and members.
840
841// This lock protects g_watchdog_thread.
842static base::LazyInstance<base::Lock>::Leaky
843    g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;
844
845// The singleton of this class.
846static WatchDogThread* g_watchdog_thread = NULL;
847
848WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
849}
850
851WatchDogThread::~WatchDogThread() {
852  Stop();
853}
854
855// static
856bool WatchDogThread::CurrentlyOnWatchDogThread() {
857  base::AutoLock lock(g_watchdog_lock.Get());
858  return g_watchdog_thread &&
859      g_watchdog_thread->message_loop() == base::MessageLoop::current();
860}
861
862// static
863bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
864                              const base::Closure& task) {
865  return PostTaskHelper(from_here, task, base::TimeDelta());
866}
867
868// static
869bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
870                                     const base::Closure& task,
871                                     base::TimeDelta delay) {
872  return PostTaskHelper(from_here, task, delay);
873}
874
875// static
876bool WatchDogThread::PostTaskHelper(
877    const tracked_objects::Location& from_here,
878    const base::Closure& task,
879    base::TimeDelta delay) {
880  {
881    base::AutoLock lock(g_watchdog_lock.Get());
882
883    base::MessageLoop* message_loop = g_watchdog_thread ?
884        g_watchdog_thread->message_loop() : NULL;
885    if (message_loop) {
886      message_loop->PostDelayedTask(from_here, task, delay);
887      return true;
888    }
889  }
890
891  return false;
892}
893
894void WatchDogThread::Init() {
895  // This thread shouldn't be allowed to perform any blocking disk I/O.
896  base::ThreadRestrictions::SetIOAllowed(false);
897
898  base::AutoLock lock(g_watchdog_lock.Get());
899  CHECK(!g_watchdog_thread);
900  g_watchdog_thread = this;
901}
902
903void WatchDogThread::CleanUp() {
904  base::AutoLock lock(g_watchdog_lock.Get());
905  g_watchdog_thread = NULL;
906}
907
908namespace {
909
910// StartupWatchDogThread methods and members.
911//
912// Class for detecting hangs during startup.
913class StartupWatchDogThread : public base::Watchdog {
914 public:
915  // Constructor specifies how long the StartupWatchDogThread will wait before
916  // alarming.
917  explicit StartupWatchDogThread(const base::TimeDelta& duration)
918      : base::Watchdog(duration, "Startup watchdog thread", true) {
919#if defined(OS_ANDROID)
920    // TODO(rtenneti): Delete this code, after getting data.
921    start_time_clock_= base::Time::Now();
922    start_time_monotonic_ = base::TimeTicks::Now();
923    start_time_thread_now_ = base::TimeTicks::IsThreadNowSupported()
924        ? base::TimeTicks::ThreadNow() : base::TimeTicks::Now();
925#endif  // OS_ANDROID
926  }
927
928  // Alarm is called if the time expires after an Arm() without someone calling
929  // Disarm(). When Alarm goes off, in release mode we get the crash dump
930  // without crashing and in debug mode we break into the debugger.
931  virtual void Alarm() OVERRIDE {
932#if !defined(NDEBUG)
933    StartupHang();
934    return;
935#elif !defined(OS_ANDROID)
936    WatchDogThread::PostTask(FROM_HERE, base::Bind(&StartupHang));
937    return;
938#else  // Android release: gather stats to figure out when to crash.
939    // TODO(rtenneti): Delete this code, after getting data.
940    UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeDuration",
941                        base::Time::Now() - start_time_clock_);
942    UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeTicksDuration",
943                        base::TimeTicks::Now() - start_time_monotonic_);
944    if (base::TimeTicks::IsThreadNowSupported()) {
945      UMA_HISTOGRAM_TIMES(
946          "StartupTimeBomb.Alarm.ThreadNowDuration",
947          base::TimeTicks::ThreadNow() - start_time_thread_now_);
948    }
949    return;
950#endif  // OS_ANDROID
951  }
952
953 private:
954#if defined(OS_ANDROID)
955  // TODO(rtenneti): Delete this code, after getting data.
956  base::Time start_time_clock_;
957  base::TimeTicks start_time_monotonic_;
958  base::TimeTicks start_time_thread_now_;
959#endif  // OS_ANDROID
960
961  DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
962};
963
964// ShutdownWatchDogThread methods and members.
965//
966// Class for detecting hangs during shutdown.
967class ShutdownWatchDogThread : public base::Watchdog {
968 public:
969  // Constructor specifies how long the ShutdownWatchDogThread will wait before
970  // alarming.
971  explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
972      : base::Watchdog(duration, "Shutdown watchdog thread", true) {
973  }
974
975  // Alarm is called if the time expires after an Arm() without someone calling
976  // Disarm(). We crash the browser if this method is called.
977  virtual void Alarm() OVERRIDE {
978    ShutdownHang();
979  }
980
981 private:
982  DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
983};
984}  // namespace
985
986// StartupTimeBomb methods and members.
987//
988// static
989StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
990
991StartupTimeBomb::StartupTimeBomb()
992    : startup_watchdog_(NULL),
993      thread_id_(base::PlatformThread::CurrentId()) {
994  CHECK(!g_startup_timebomb_);
995  g_startup_timebomb_ = this;
996}
997
998StartupTimeBomb::~StartupTimeBomb() {
999  DCHECK(this == g_startup_timebomb_);
1000  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1001  if (startup_watchdog_)
1002    Disarm();
1003  g_startup_timebomb_ = NULL;
1004}
1005
1006void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
1007  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1008  DCHECK(!startup_watchdog_);
1009  startup_watchdog_ = new StartupWatchDogThread(duration);
1010  startup_watchdog_->Arm();
1011  return;
1012}
1013
1014void StartupTimeBomb::Disarm() {
1015  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1016  if (startup_watchdog_) {
1017    startup_watchdog_->Disarm();
1018    startup_watchdog_->Cleanup();
1019    DeleteStartupWatchdog();
1020  }
1021}
1022
1023void StartupTimeBomb::DeleteStartupWatchdog() {
1024  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1025  if (startup_watchdog_->IsJoinable()) {
1026    // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
1027    // very fast.
1028    base::ThreadRestrictions::SetIOAllowed(true);
1029    delete startup_watchdog_;
1030    startup_watchdog_ = NULL;
1031    return;
1032  }
1033  base::MessageLoop::current()->PostDelayedTask(
1034      FROM_HERE,
1035      base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
1036                 base::Unretained(this)),
1037      base::TimeDelta::FromSeconds(10));
1038}
1039
1040// static
1041void StartupTimeBomb::DisarmStartupTimeBomb() {
1042  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
1043  if (g_startup_timebomb_)
1044    g_startup_timebomb_->Disarm();
1045}
1046
1047// ShutdownWatcherHelper methods and members.
1048//
1049// ShutdownWatcherHelper is a wrapper class for detecting hangs during
1050// shutdown.
1051ShutdownWatcherHelper::ShutdownWatcherHelper()
1052    : shutdown_watchdog_(NULL),
1053      thread_id_(base::PlatformThread::CurrentId()) {
1054}
1055
1056ShutdownWatcherHelper::~ShutdownWatcherHelper() {
1057  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1058  if (shutdown_watchdog_) {
1059    shutdown_watchdog_->Disarm();
1060    delete shutdown_watchdog_;
1061    shutdown_watchdog_ = NULL;
1062  }
1063}
1064
1065void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
1066  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1067  DCHECK(!shutdown_watchdog_);
1068  base::TimeDelta actual_duration = duration;
1069
1070  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
1071  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
1072    actual_duration *= 20;
1073  } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
1074             channel == chrome::VersionInfo::CHANNEL_DEV) {
1075    actual_duration *= 10;
1076  }
1077
1078#if defined(OS_WIN)
1079  // On Windows XP, give twice the time for shutdown.
1080  if (base::win::GetVersion() <= base::win::VERSION_XP)
1081    actual_duration *= 2;
1082#endif
1083
1084  shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
1085  shutdown_watchdog_->Arm();
1086}
1087