1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/metrics/thread_watcher.h"
6
7#include <math.h>  // ceil
8
9#include "base/bind.h"
10#include "base/compiler_specific.h"
11#include "base/debug/alias.h"
12#include "base/debug/dump_without_crashing.h"
13#include "base/lazy_instance.h"
14#include "base/metrics/field_trial.h"
15#include "base/strings/string_number_conversions.h"
16#include "base/strings/string_split.h"
17#include "base/strings/string_tokenizer.h"
18#include "base/strings/stringprintf.h"
19#include "base/threading/thread_restrictions.h"
20#include "build/build_config.h"
21#include "chrome/browser/chrome_notification_types.h"
22#include "chrome/common/chrome_switches.h"
23#include "chrome/common/chrome_version_info.h"
24#include "chrome/common/logging_chrome.h"
25#include "content/public/browser/notification_service.h"
26
27#if defined(OS_WIN)
28#include "base/win/windows_version.h"
29#endif
30
31using content::BrowserThread;
32
33namespace {
34
35// The following are unique function names for forcing the crash when a thread
36// is unresponsive. This makes it possible to tell from the callstack alone what
37// thread was unresponsive.
38//
39// We disable optimizations for this block of functions so the compiler doesn't
40// merge them all together.
41MSVC_DISABLE_OPTIMIZE()
42MSVC_PUSH_DISABLE_WARNING(4748)
43
#ifndef NDEBUG
// Returns a null int pointer. Only compiled when NDEBUG is not defined, where
// NullPointerCrash() writes through the result to force a crash.
int* NullPointer() {
  return reinterpret_cast<int*>(NULL);
}
#endif
49
// Common body of the uniquely named crash functions below.
//
// Without NDEBUG: stores |line_number| through a null pointer, which crashes
// the process. With NDEBUG: calls base::debug::DumpWithoutCrashing() instead
// and returns; |line_number| is unused in that configuration.
void NullPointerCrash(int line_number) {
#ifndef NDEBUG
  *NullPointer() = line_number;  // Crash.
#else
  base::debug::DumpWithoutCrashing();
#endif
}
57
#if !defined(OS_ANDROID) || !defined(NDEBUG)
// TODO(rtenneti): Enabled crashing, after getting data.
// Uniquely named entry into NullPointerCrash(). Compiled everywhere except
// Android builds that define NDEBUG.
// NOTE(review): presumably called when a hang is detected during startup --
// the caller is not visible in this part of the file.
NOINLINE void StartupCrash() {
  NullPointerCrash(__LINE__);
}
#endif  // !defined(OS_ANDROID) || !defined(NDEBUG)
64
// Uniquely named entry into NullPointerCrash().
// NOTE(review): presumably used when a hang is detected during shutdown --
// the caller is not visible in this part of the file.
NOINLINE void ShutdownCrash() {
  NullPointerCrash(__LINE__);
}
68
// One function per watched BrowserThread. Each simply forwards to
// NullPointerCrash(); the distinct, non-inlined function names are what make
// the hung thread identifiable from the call stack. They are dispatched from
// CrashBecauseThreadWasUnresponsive().

// BrowserThread::UI was unresponsive.
NOINLINE void ThreadUnresponsive_UI() {
  NullPointerCrash(__LINE__);
}

// BrowserThread::DB was unresponsive.
NOINLINE void ThreadUnresponsive_DB() {
  NullPointerCrash(__LINE__);
}

// BrowserThread::FILE was unresponsive.
NOINLINE void ThreadUnresponsive_FILE() {
  NullPointerCrash(__LINE__);
}

// BrowserThread::FILE_USER_BLOCKING was unresponsive.
NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
  NullPointerCrash(__LINE__);
}

// BrowserThread::PROCESS_LAUNCHER was unresponsive.
NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
  NullPointerCrash(__LINE__);
}

// BrowserThread::CACHE was unresponsive.
NOINLINE void ThreadUnresponsive_CACHE() {
  NullPointerCrash(__LINE__);
}

// BrowserThread::IO was unresponsive.
NOINLINE void ThreadUnresponsive_IO() {
  NullPointerCrash(__LINE__);
}
96
97MSVC_POP_WARNING()
98MSVC_ENABLE_OPTIMIZE();
99
100void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
101  base::debug::Alias(&thread_id);
102
103  switch (thread_id) {
104    case BrowserThread::UI:
105      return ThreadUnresponsive_UI();
106    case BrowserThread::DB:
107      return ThreadUnresponsive_DB();
108    case BrowserThread::FILE:
109      return ThreadUnresponsive_FILE();
110    case BrowserThread::FILE_USER_BLOCKING:
111      return ThreadUnresponsive_FILE_USER_BLOCKING();
112    case BrowserThread::PROCESS_LAUNCHER:
113      return ThreadUnresponsive_PROCESS_LAUNCHER();
114    case BrowserThread::CACHE:
115      return ThreadUnresponsive_CACHE();
116    case BrowserThread::IO:
117      return ThreadUnresponsive_IO();
118    case BrowserThread::ID_COUNT:
119      CHECK(false);  // This shouldn't actually be reached!
120      break;
121
122    // Omission of the default hander is intentional -- that way the compiler
123    // should warn if our switch becomes outdated.
124  }
125
126  CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
127}
128
129}  // namespace
130
131// ThreadWatcher methods and members.
// Builds a watcher for |params.thread_id|. Must be constructed on the
// WatchDogThread. The watcher starts inactive with both ping and pong
// timestamps set to "now", and |ping_count_| preloaded with the unresponsive
// threshold. Initialize() then registers the object with ThreadWatcherList
// and creates its histograms.
//
// NOTE(review): |responsive_| and the two *_count_histogram_ members are not
// listed in the initializer list; the histograms are assigned in
// Initialize(), |responsive_| is first written in OnCheckResponsiveness().
ThreadWatcher::ThreadWatcher(const WatchingParams& params)
    : thread_id_(params.thread_id),
      thread_name_(params.thread_name),
      watched_loop_(
          BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
      sleep_time_(params.sleep_time),
      unresponsive_time_(params.unresponsive_time),
      ping_time_(base::TimeTicks::Now()),
      pong_time_(ping_time_),
      ping_sequence_number_(0),
      active_(false),
      ping_count_(params.unresponsive_threshold),
      response_time_histogram_(NULL),
      unresponsive_time_histogram_(NULL),
      unresponsive_count_(0),
      hung_processing_complete_(false),
      unresponsive_threshold_(params.unresponsive_threshold),
      crash_on_hang_(params.crash_on_hang),
      live_threads_threshold_(params.live_threads_threshold),
      weak_ptr_factory_(this) {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  Initialize();
}

// Nothing to release explicitly; members clean up after themselves.
ThreadWatcher::~ThreadWatcher() {}
157
158// static
159void ThreadWatcher::StartWatching(const WatchingParams& params) {
160  DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
161  DCHECK_GE(params.unresponsive_time.InMilliseconds(),
162            params.sleep_time.InMilliseconds());
163
164  // If we are not on WatchDogThread, then post a task to call StartWatching on
165  // WatchDogThread.
166  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
167    WatchDogThread::PostTask(
168        FROM_HERE,
169        base::Bind(&ThreadWatcher::StartWatching, params));
170    return;
171  }
172
173  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
174
175  // Create a new thread watcher object for the given thread and activate it.
176  ThreadWatcher* watcher = new ThreadWatcher(params);
177
178  DCHECK(watcher);
179  // If we couldn't register the thread watcher object, we are shutting down,
180  // then don't activate thread watching.
181  if (!ThreadWatcherList::IsRegistered(params.thread_id))
182    return;
183  watcher->ActivateThreadWatching();
184}
185
186void ThreadWatcher::ActivateThreadWatching() {
187  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
188  if (active_) return;
189  active_ = true;
190  ping_count_ = unresponsive_threshold_;
191  ResetHangCounters();
192  base::MessageLoop::current()->PostTask(
193      FROM_HERE,
194      base::Bind(&ThreadWatcher::PostPingMessage,
195                 weak_ptr_factory_.GetWeakPtr()));
196}
197
198void ThreadWatcher::DeActivateThreadWatching() {
199  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
200  active_ = false;
201  ping_count_ = 0;
202  weak_ptr_factory_.InvalidateWeakPtrs();
203}
204
205void ThreadWatcher::WakeUp() {
206  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
207  // There is some user activity, PostPingMessage task of thread watcher if
208  // needed.
209  if (!active_) return;
210
211  // Throw away the previous |unresponsive_count_| and start over again. Just
212  // before going to sleep, |unresponsive_count_| could be very close to
213  // |unresponsive_threshold_| and when user becomes active,
214  // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
215  // response for ping messages. Reset |unresponsive_count_| to start measuring
216  // the unresponsiveness of the threads when system becomes active.
217  unresponsive_count_ = 0;
218
219  if (ping_count_ <= 0) {
220    ping_count_ = unresponsive_threshold_;
221    ResetHangCounters();
222    PostPingMessage();
223  } else {
224    ping_count_ = unresponsive_threshold_;
225  }
226}
227
228void ThreadWatcher::PostPingMessage() {
229  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
230  // If we have stopped watching or if the user is idle, then stop sending
231  // ping messages.
232  if (!active_ || ping_count_ <= 0)
233    return;
234
235  // Save the current time when we have sent ping message.
236  ping_time_ = base::TimeTicks::Now();
237
238  // Send a ping message to the watched thread. Callback will be called on
239  // the WatchDogThread.
240  base::Closure callback(
241      base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
242                 ping_sequence_number_));
243  if (watched_loop_->PostTask(
244          FROM_HERE,
245          base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
246                     callback))) {
247      // Post a task to check the responsiveness of watched thread.
248      base::MessageLoop::current()->PostDelayedTask(
249          FROM_HERE,
250          base::Bind(&ThreadWatcher::OnCheckResponsiveness,
251                     weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
252          unresponsive_time_);
253  } else {
254    // Watched thread might have gone away, stop watching it.
255    DeActivateThreadWatching();
256  }
257}
258
259void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
260  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
261
262  // Record watched thread's response time.
263  base::TimeTicks now = base::TimeTicks::Now();
264  base::TimeDelta response_time = now - ping_time_;
265  response_time_histogram_->AddTime(response_time);
266
267  // Save the current time when we have got pong message.
268  pong_time_ = now;
269
270  // Check if there are any extra pings in flight.
271  DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
272  if (ping_sequence_number_ != ping_sequence_number)
273    return;
274
275  // Increment sequence number for the next ping message to indicate watched
276  // thread is responsive.
277  ++ping_sequence_number_;
278
279  // If we have stopped watching or if the user is idle, then stop sending
280  // ping messages.
281  if (!active_ || --ping_count_ <= 0)
282    return;
283
284  base::MessageLoop::current()->PostDelayedTask(
285      FROM_HERE,
286      base::Bind(&ThreadWatcher::PostPingMessage,
287                 weak_ptr_factory_.GetWeakPtr()),
288      sleep_time_);
289}
290
291void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
292  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
293  // If we have stopped watching then consider thread as responding.
294  if (!active_) {
295    responsive_ = true;
296    return;
297  }
298  // If the latest ping_sequence_number_ is not same as the ping_sequence_number
299  // that is passed in, then we can assume OnPongMessage was called.
300  // OnPongMessage increments ping_sequence_number_.
301  if (ping_sequence_number_ != ping_sequence_number) {
302    // Reset unresponsive_count_ to zero because we got a response from the
303    // watched thread.
304    ResetHangCounters();
305
306    responsive_ = true;
307    return;
308  }
309  // Record that we got no response from watched thread.
310  GotNoResponse();
311
312  // Post a task to check the responsiveness of watched thread.
313  base::MessageLoop::current()->PostDelayedTask(
314      FROM_HERE,
315      base::Bind(&ThreadWatcher::OnCheckResponsiveness,
316                 weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
317      unresponsive_time_);
318  responsive_ = false;
319}
320
321void ThreadWatcher::Initialize() {
322  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
323  ThreadWatcherList::Register(this);
324
325  const std::string response_time_histogram_name =
326      "ThreadWatcher.ResponseTime." + thread_name_;
327  response_time_histogram_ = base::Histogram::FactoryTimeGet(
328      response_time_histogram_name,
329      base::TimeDelta::FromMilliseconds(1),
330      base::TimeDelta::FromSeconds(100), 50,
331      base::Histogram::kUmaTargetedHistogramFlag);
332
333  const std::string unresponsive_time_histogram_name =
334      "ThreadWatcher.Unresponsive." + thread_name_;
335  unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
336      unresponsive_time_histogram_name,
337      base::TimeDelta::FromMilliseconds(1),
338      base::TimeDelta::FromSeconds(100), 50,
339      base::Histogram::kUmaTargetedHistogramFlag);
340
341  const std::string responsive_count_histogram_name =
342      "ThreadWatcher.ResponsiveThreads." + thread_name_;
343  responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
344      responsive_count_histogram_name, 1, 10, 11,
345      base::Histogram::kUmaTargetedHistogramFlag);
346
347  const std::string unresponsive_count_histogram_name =
348      "ThreadWatcher.UnresponsiveThreads." + thread_name_;
349  unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
350      unresponsive_count_histogram_name, 1, 10, 11,
351      base::Histogram::kUmaTargetedHistogramFlag);
352}
353
// static
// The "ping" half of the handshake. Runs on the watched thread |thread_id|
// and answers by posting |callback_task| (the pong, see PostPingMessage())
// to the WatchDogThread.
void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
                                  const base::Closure& callback_task) {
  // This method is called on watched thread.
  DCHECK(BrowserThread::CurrentlyOn(thread_id));
  WatchDogThread::PostTask(FROM_HERE, callback_task);
}
361
362void ThreadWatcher::ResetHangCounters() {
363  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
364  unresponsive_count_ = 0;
365  hung_processing_complete_ = false;
366}
367
368void ThreadWatcher::GotNoResponse() {
369  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
370
371  ++unresponsive_count_;
372  if (!IsVeryUnresponsive())
373    return;
374
375  // Record total unresponsive_time since last pong message.
376  base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
377  unresponsive_time_histogram_->AddTime(unresponse_time);
378
379  // We have already collected stats for the non-responding watched thread.
380  if (hung_processing_complete_)
381    return;
382
383  // Record how other threads are responding.
384  uint32 responding_thread_count = 0;
385  uint32 unresponding_thread_count = 0;
386  ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
387                                        &unresponding_thread_count);
388
389  // Record how many watched threads are responding.
390  responsive_count_histogram_->Add(responding_thread_count);
391
392  // Record how many watched threads are not responding.
393  unresponsive_count_histogram_->Add(unresponding_thread_count);
394
395  // Crash the browser if the watched thread is to be crashed on hang and if the
396  // number of other threads responding is less than or equal to
397  // live_threads_threshold_ and at least one other thread is responding.
398  if (crash_on_hang_ &&
399      responding_thread_count > 0 &&
400      responding_thread_count <= live_threads_threshold_) {
401    static bool crashed_once = false;
402    if (!crashed_once) {
403      crashed_once = true;
404      CrashBecauseThreadWasUnresponsive(thread_id_);
405    }
406  }
407
408  hung_processing_complete_ = true;
409}
410
// Returns true once the number of consecutive missed responses
// (|unresponsive_count_|) has reached |unresponsive_threshold_|.
bool ThreadWatcher::IsVeryUnresponsive() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  return unresponsive_count_ >= unresponsive_threshold_;
}
415
// ThreadWatcherList methods and members.
//
// static
// The single ThreadWatcherList instance; set by the constructor and cleared
// by the destructor.
ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
// static
// Set through SetStopped(); while true, InitializeAndStartWatching() does not
// create the list.
bool ThreadWatcherList::g_stopped_ = false;
// static
// Pause, in seconds, between a pong and the next ping.
const int ThreadWatcherList::kSleepSeconds = 1;
// static
// Seconds to wait for a pong before a ping counts as unanswered.
const int ThreadWatcherList::kUnresponsiveSeconds = 2;
// static
// Default number of unanswered checks after which a thread is treated as
// very unresponsive.
const int ThreadWatcherList::kUnresponsiveCount = 9;
// static
// Default upper bound on the number of still-responding threads for a
// crash-on-hang to be triggered.
const int ThreadWatcherList::kLiveThreadsThreshold = 2;
// static, non-const for tests.
// Delay, in seconds, before StartWatchingAll()'s deferred initialization runs.
int ThreadWatcherList::g_initialize_delay_seconds = 120;
432
// Stores the given per-thread crash thresholds as-is.
ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
    uint32 live_threads_threshold,
    uint32 unresponsive_threshold)
    : live_threads_threshold(live_threads_threshold),
      unresponsive_threshold(unresponsive_threshold) {
}

// Default thresholds: kLiveThreadsThreshold and kUnresponsiveCount.
ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
    : live_threads_threshold(kLiveThreadsThreshold),
      unresponsive_threshold(kUnresponsiveCount) {
}
444
445// static
446void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
447  // TODO(rtenneti): Enable ThreadWatcher.
448  uint32 unresponsive_threshold;
449  CrashOnHangThreadMap crash_on_hang_threads;
450  ParseCommandLine(command_line,
451                   &unresponsive_threshold,
452                   &crash_on_hang_threads);
453
454  ThreadWatcherObserver::SetupNotifications(
455      base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
456
457  WatchDogThread::PostTask(
458      FROM_HERE,
459      base::Bind(&ThreadWatcherList::SetStopped, false));
460
461  WatchDogThread::PostDelayedTask(
462      FROM_HERE,
463      base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
464                 unresponsive_threshold,
465                 crash_on_hang_threads),
466      base::TimeDelta::FromSeconds(g_initialize_delay_seconds));
467}
468
// static
// Counterpart of StartWatchingAll(): removes the activity observer and tears
// down all watchers. DeleteAll() re-posts itself to the WatchDogThread when
// called from another thread.
void ThreadWatcherList::StopWatchingAll() {
  // TODO(rtenneti): Enable ThreadWatcher.
  ThreadWatcherObserver::RemoveNotifications();
  DeleteAll();
}
475
476// static
477void ThreadWatcherList::Register(ThreadWatcher* watcher) {
478  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
479  if (!g_thread_watcher_list_)
480    return;
481  DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
482  g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
483}
484
485// static
486bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
487  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
488  return NULL != ThreadWatcherList::Find(thread_id);
489}
490
491// static
492void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
493                                           uint32* unresponding_thread_count) {
494  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
495  *responding_thread_count = 0;
496  *unresponding_thread_count = 0;
497  if (!g_thread_watcher_list_)
498    return;
499
500  for (RegistrationList::iterator it =
501           g_thread_watcher_list_->registered_.begin();
502       g_thread_watcher_list_->registered_.end() != it;
503       ++it) {
504    if (it->second->IsVeryUnresponsive())
505      ++(*unresponding_thread_count);
506    else
507      ++(*responding_thread_count);
508  }
509}
510
511// static
512void ThreadWatcherList::WakeUpAll() {
513  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
514  if (!g_thread_watcher_list_)
515    return;
516
517  for (RegistrationList::iterator it =
518           g_thread_watcher_list_->registered_.begin();
519       g_thread_watcher_list_->registered_.end() != it;
520       ++it)
521    it->second->WakeUp();
522}
523
// Installs this object as the single global list. Must run on the
// WatchDogThread; CHECK-fails if a list already exists.
ThreadWatcherList::ThreadWatcherList() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  CHECK(!g_thread_watcher_list_);
  g_thread_watcher_list_ = this;
}

// Clears the global pointer. Registered watchers are not deleted here;
// DeleteAll() deletes them before deleting the list.
ThreadWatcherList::~ThreadWatcherList() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  DCHECK(this == g_thread_watcher_list_);
  g_thread_watcher_list_ = NULL;
}
535
536// static
537void ThreadWatcherList::ParseCommandLine(
538    const CommandLine& command_line,
539    uint32* unresponsive_threshold,
540    CrashOnHangThreadMap* crash_on_hang_threads) {
541  // Initialize |unresponsive_threshold| to a default value.
542  *unresponsive_threshold = kUnresponsiveCount;
543
544  // Increase the unresponsive_threshold on the Stable and Beta channels to
545  // reduce the number of crashes due to ThreadWatcher.
546  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
547  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
548    *unresponsive_threshold *= 4;
549  } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
550    *unresponsive_threshold *= 2;
551  }
552
553#if defined(OS_WIN)
554  // For Windows XP (old systems), double the unresponsive_threshold to give
555  // the OS a chance to schedule UI/IO threads a time slice to respond with a
556  // pong message (to get around limitations with the OS).
557  if (base::win::GetVersion() <= base::win::VERSION_XP)
558    *unresponsive_threshold *= 2;
559#endif
560
561  uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
562  std::string crash_on_hang_thread_names;
563  bool has_command_line_overwrite = false;
564  if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
565    crash_on_hang_thread_names =
566        command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
567    has_command_line_overwrite = true;
568  } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
569    // Default to crashing the browser if UI or IO or FILE threads are not
570    // responsive except in stable channel.
571    crash_on_hang_thread_names = base::StringPrintf(
572        "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
573        kLiveThreadsThreshold, crash_seconds,
574        kLiveThreadsThreshold, crash_seconds,
575        kLiveThreadsThreshold, crash_seconds * 5);
576  }
577
578  ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
579                                     kLiveThreadsThreshold,
580                                     crash_seconds,
581                                     crash_on_hang_threads);
582
583  if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
584      has_command_line_overwrite) {
585    return;
586  }
587
588  const char* kFieldTrialName = "ThreadWatcher";
589
590  // Nothing else to be done if the trial has already been set (i.e., when
591  // StartWatchingAll() has been already called once).
592  if (base::FieldTrialList::TrialExists(kFieldTrialName))
593    return;
594
595  // Set up a field trial for 100% of the users to crash if either UI or IO
596  // thread is not responsive for 30 seconds (or 15 pings).
597  scoped_refptr<base::FieldTrial> field_trial(
598      base::FieldTrialList::FactoryGetFieldTrial(
599          kFieldTrialName, 100, "default_hung_threads",
600          2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
601  int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
602  if (field_trial->group() == hung_thread_group) {
603    for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
604         crash_on_hang_threads->end() != it;
605         ++it) {
606      if (it->first == "FILE")
607        continue;
608      it->second.live_threads_threshold = INT_MAX;
609      if (it->first == "UI") {
610        // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
611        // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
612        // it to a more reasonable time ala IO thread.
613        it->second.unresponsive_threshold = 60;
614      } else {
615        it->second.unresponsive_threshold = 15;
616      }
617    }
618  }
619}
620
621// static
622void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
623    const std::string& crash_on_hang_thread_names,
624    uint32 default_live_threads_threshold,
625    uint32 default_crash_seconds,
626    CrashOnHangThreadMap* crash_on_hang_threads) {
627  base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
628  std::vector<std::string> values;
629  while (tokens.GetNext()) {
630    const std::string& token = tokens.token();
631    base::SplitString(token, ':', &values);
632    std::string thread_name = values[0];
633
634    uint32 live_threads_threshold = default_live_threads_threshold;
635    uint32 crash_seconds = default_crash_seconds;
636    if (values.size() >= 2 &&
637        (!base::StringToUint(values[1], &live_threads_threshold))) {
638      continue;
639    }
640    if (values.size() >= 3 &&
641        (!base::StringToUint(values[2], &crash_seconds))) {
642      continue;
643    }
644    uint32 unresponsive_threshold = static_cast<uint32>(
645        ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
646
647    CrashDataThresholds crash_data(live_threads_threshold,
648                                   unresponsive_threshold);
649    // Use the last specifier.
650    (*crash_on_hang_threads)[thread_name] = crash_data;
651  }
652}
653
654// static
655void ThreadWatcherList::InitializeAndStartWatching(
656    uint32 unresponsive_threshold,
657    const CrashOnHangThreadMap& crash_on_hang_threads) {
658  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
659
660  // Disarm the startup timebomb, even if stop has been called.
661  BrowserThread::PostTask(
662      BrowserThread::UI,
663      FROM_HERE,
664      base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
665
666  // This method is deferred in relationship to its StopWatchingAll()
667  // counterpart. If a previous initialization has already happened, or if
668  // stop has been called, there's nothing left to do here.
669  if (g_thread_watcher_list_ || g_stopped_)
670    return;
671
672  ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
673  CHECK(thread_watcher_list);
674
675  const base::TimeDelta kSleepTime =
676      base::TimeDelta::FromSeconds(kSleepSeconds);
677  const base::TimeDelta kUnresponsiveTime =
678      base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
679
680  StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
681                unresponsive_threshold, crash_on_hang_threads);
682  StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
683                unresponsive_threshold, crash_on_hang_threads);
684  StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
685                unresponsive_threshold, crash_on_hang_threads);
686  StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
687                unresponsive_threshold, crash_on_hang_threads);
688  StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
689                unresponsive_threshold, crash_on_hang_threads);
690}
691
692// static
693void ThreadWatcherList::StartWatching(
694    const BrowserThread::ID& thread_id,
695    const std::string& thread_name,
696    const base::TimeDelta& sleep_time,
697    const base::TimeDelta& unresponsive_time,
698    uint32 unresponsive_threshold,
699    const CrashOnHangThreadMap& crash_on_hang_threads) {
700  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
701
702  CrashOnHangThreadMap::const_iterator it =
703      crash_on_hang_threads.find(thread_name);
704  bool crash_on_hang = false;
705  uint32 live_threads_threshold = 0;
706  if (it != crash_on_hang_threads.end()) {
707    crash_on_hang = true;
708    live_threads_threshold = it->second.live_threads_threshold;
709    unresponsive_threshold = it->second.unresponsive_threshold;
710  }
711
712  ThreadWatcher::StartWatching(
713      ThreadWatcher::WatchingParams(thread_id,
714                                    thread_name,
715                                    sleep_time,
716                                    unresponsive_time,
717                                    unresponsive_threshold,
718                                    crash_on_hang,
719                                    live_threads_threshold));
720}
721
722// static
723void ThreadWatcherList::DeleteAll() {
724  if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
725    WatchDogThread::PostTask(
726        FROM_HERE,
727        base::Bind(&ThreadWatcherList::DeleteAll));
728    return;
729  }
730
731  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
732
733  SetStopped(true);
734
735  if (!g_thread_watcher_list_)
736    return;
737
738  // Delete all thread watcher objects.
739  while (!g_thread_watcher_list_->registered_.empty()) {
740    RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
741    delete it->second;
742    g_thread_watcher_list_->registered_.erase(it);
743  }
744
745  delete g_thread_watcher_list_;
746}
747
748// static
749ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
750  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
751  if (!g_thread_watcher_list_)
752    return NULL;
753  RegistrationList::iterator it =
754      g_thread_watcher_list_->registered_.find(thread_id);
755  if (g_thread_watcher_list_->registered_.end() == it)
756    return NULL;
757  return it->second;
758}
759
// static
// Sets the |g_stopped_| flag, which InitializeAndStartWatching() consults
// before creating the list. Must be called on the WatchDogThread.
void ThreadWatcherList::SetStopped(bool stopped) {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  g_stopped_ = stopped;
}
765
// ThreadWatcherObserver methods and members.
//
// static
// The single observer instance; set by the constructor and cleared by the
// destructor.
ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;

// Installs this object as the single global observer (CHECK-fails if one
// already exists). |wakeup_interval| is the minimum time between two wake-ups
// forwarded by Observe().
ThreadWatcherObserver::ThreadWatcherObserver(
    const base::TimeDelta& wakeup_interval)
    : last_wakeup_time_(base::TimeTicks::Now()),
      wakeup_interval_(wakeup_interval) {
  CHECK(!g_thread_watcher_observer_);
  g_thread_watcher_observer_ = this;
}

// Clears the global pointer.
ThreadWatcherObserver::~ThreadWatcherObserver() {
  DCHECK(this == g_thread_watcher_observer_);
  g_thread_watcher_observer_ = NULL;
}
783
784// static
785void ThreadWatcherObserver::SetupNotifications(
786    const base::TimeDelta& wakeup_interval) {
787  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
788  ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
789  observer->registrar_.Add(
790      observer,
791      chrome::NOTIFICATION_BROWSER_OPENED,
792      content::NotificationService::AllBrowserContextsAndSources());
793  observer->registrar_.Add(observer,
794                           chrome::NOTIFICATION_BROWSER_CLOSED,
795                           content::NotificationService::AllSources());
796  observer->registrar_.Add(observer,
797                           chrome::NOTIFICATION_TAB_PARENTED,
798                           content::NotificationService::AllSources());
799  observer->registrar_.Add(observer,
800                           chrome::NOTIFICATION_TAB_CLOSING,
801                           content::NotificationService::AllSources());
802  observer->registrar_.Add(observer,
803                           content::NOTIFICATION_LOAD_START,
804                           content::NotificationService::AllSources());
805  observer->registrar_.Add(observer,
806                           content::NOTIFICATION_LOAD_STOP,
807                           content::NotificationService::AllSources());
808  observer->registrar_.Add(observer,
809                           content::NOTIFICATION_RENDERER_PROCESS_CLOSED,
810                           content::NotificationService::AllSources());
811  observer->registrar_.Add(observer,
812                           content::NOTIFICATION_RENDER_WIDGET_HOST_HANG,
813                           content::NotificationService::AllSources());
814  observer->registrar_.Add(observer,
815                           chrome::NOTIFICATION_OMNIBOX_OPENED_URL,
816                           content::NotificationService::AllSources());
817}
818
819// static
820void ThreadWatcherObserver::RemoveNotifications() {
821  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
822  if (!g_thread_watcher_observer_)
823    return;
824  g_thread_watcher_observer_->registrar_.RemoveAll();
825  delete g_thread_watcher_observer_;
826}
827
828void ThreadWatcherObserver::Observe(
829    int type,
830    const content::NotificationSource& source,
831    const content::NotificationDetails& details) {
832  // There is some user activity, see if thread watchers are to be awakened.
833  base::TimeTicks now = base::TimeTicks::Now();
834  if ((now - last_wakeup_time_) < wakeup_interval_)
835    return;
836  last_wakeup_time_ = now;
837  WatchDogThread::PostTask(
838      FROM_HERE,
839      base::Bind(&ThreadWatcherList::WakeUpAll));
840}
841
842// WatchDogThread methods and members.
843
// This lock protects g_watchdog_thread. Every read and write of that pointer
// in this file happens with the lock held.
static base::LazyInstance<base::Lock>::Leaky
    g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;

// The singleton of this class. Set in WatchDogThread::Init() and cleared in
// WatchDogThread::CleanUp().
static WatchDogThread* g_watchdog_thread = NULL;
850
// Names the underlying thread "BrowserWatchdog". The global pointer is not
// set here but in Init().
WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
}

// Stops the thread before the object goes away.
WatchDogThread::~WatchDogThread() {
  Stop();
}
857
858// static
859bool WatchDogThread::CurrentlyOnWatchDogThread() {
860  base::AutoLock lock(g_watchdog_lock.Get());
861  return g_watchdog_thread &&
862      g_watchdog_thread->message_loop() == base::MessageLoop::current();
863}
864
865// static
866bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
867                              const base::Closure& task) {
868  return PostTaskHelper(from_here, task, base::TimeDelta());
869}
870
// Posts |task| to the watchdog thread with the given |delay|. Forwards to
// PostTaskHelper() and returns its result: true if the task was handed to the
// thread's message loop, false if no watchdog thread (or message loop) is
// currently available.
// static
bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
                                     const base::Closure& task,
                                     base::TimeDelta delay) {
  return PostTaskHelper(from_here, task, delay);
}
877
878// static
879bool WatchDogThread::PostTaskHelper(
880    const tracked_objects::Location& from_here,
881    const base::Closure& task,
882    base::TimeDelta delay) {
883  {
884    base::AutoLock lock(g_watchdog_lock.Get());
885
886    base::MessageLoop* message_loop = g_watchdog_thread ?
887        g_watchdog_thread->message_loop() : NULL;
888    if (message_loop) {
889      message_loop->PostDelayedTask(from_here, task, delay);
890      return true;
891    }
892  }
893
894  return false;
895}
896
// Disallows blocking disk I/O for the calling thread and registers |this| as
// the WatchDogThread singleton. CHECK-fails if a singleton is already
// registered.
// NOTE(review): presumably invoked by the Thread base class on the newly
// started thread -- confirm against base::Thread.
void WatchDogThread::Init() {
  // This thread shouldn't be allowed to perform any blocking disk I/O.
  base::ThreadRestrictions::SetIOAllowed(false);

  // Publish the singleton under the lock so PostTaskHelper() and
  // CurrentlyOnWatchDogThread() observe a consistent pointer.
  base::AutoLock lock(g_watchdog_lock.Get());
  CHECK(!g_watchdog_thread);
  g_watchdog_thread = this;
}
905
// Unregisters the WatchDogThread singleton. After this, PostTaskHelper()
// returns false and CurrentlyOnWatchDogThread() returns false until another
// thread registers itself in Init().
void WatchDogThread::CleanUp() {
  base::AutoLock lock(g_watchdog_lock.Get());
  g_watchdog_thread = NULL;
}
910
911namespace {
912
913// StartupWatchDogThread methods and members.
914//
915// Class for detecting hangs during startup.
916class StartupWatchDogThread : public base::Watchdog {
917 public:
918  // Constructor specifies how long the StartupWatchDogThread will wait before
919  // alarming.
920  explicit StartupWatchDogThread(const base::TimeDelta& duration)
921      : base::Watchdog(duration, "Startup watchdog thread", true) {
922#if defined(OS_ANDROID)
923    // TODO(rtenneti): Delete this code, after getting data.
924    start_time_clock_= base::Time::Now();
925    start_time_monotonic_ = base::TimeTicks::Now();
926    start_time_thread_now_ = base::TimeTicks::IsThreadNowSupported()
927        ? base::TimeTicks::ThreadNow() : base::TimeTicks::Now();
928#endif  // OS_ANDROID
929  }
930
931  // Alarm is called if the time expires after an Arm() without someone calling
932  // Disarm(). When Alarm goes off, in release mode we get the crash dump
933  // without crashing and in debug mode we break into the debugger.
934  virtual void Alarm() OVERRIDE {
935#if !defined(NDEBUG)
936    StartupCrash();
937    return;
938#elif !defined(OS_ANDROID)
939    WatchDogThread::PostTask(FROM_HERE, base::Bind(&StartupCrash));
940    return;
941#else  // Android release: gather stats to figure out when to crash.
942    // TODO(rtenneti): Delete this code, after getting data.
943    UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeDuration",
944                        base::Time::Now() - start_time_clock_);
945    UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeTicksDuration",
946                        base::TimeTicks::Now() - start_time_monotonic_);
947    if (base::TimeTicks::IsThreadNowSupported()) {
948      UMA_HISTOGRAM_TIMES(
949          "StartupTimeBomb.Alarm.ThreadNowDuration",
950          base::TimeTicks::ThreadNow() - start_time_thread_now_);
951    }
952    return;
953#endif  // OS_ANDROID
954  }
955
956 private:
957#if defined(OS_ANDROID)
958  // TODO(rtenneti): Delete this code, after getting data.
959  base::Time start_time_clock_;
960  base::TimeTicks start_time_monotonic_;
961  base::TimeTicks start_time_thread_now_;
962#endif  // OS_ANDROID
963
964  DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
965};
966
967// ShutdownWatchDogThread methods and members.
968//
969// Class for detecting hangs during shutdown.
class ShutdownWatchDogThread : public base::Watchdog {
 public:
  // Constructor specifies how long the ShutdownWatchDogThread will wait before
  // alarming.
  explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
      : base::Watchdog(duration, "Shutdown watchdog thread", true) {
  }

  // Alarm is called if the time expires after an Arm() without someone calling
  // Disarm(). It calls ShutdownCrash(), which goes through NullPointerCrash():
  // debug builds crash by writing through a null pointer, while release builds
  // only capture a dump via base::debug::DumpWithoutCrashing().
  virtual void Alarm() OVERRIDE {
    ShutdownCrash();
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
};
987}  // namespace
988
// StartupTimeBomb methods and members.
//
// Pointer to the single live StartupTimeBomb, or NULL when none exists. Set by
// the constructor and cleared by the destructor.
// static
StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
993
// Creates the time bomb with no watchdog armed, remembers the constructing
// thread's id (later calls DCHECK against it) and registers this object as
// the global instance. CHECK-fails if an instance is already registered.
StartupTimeBomb::StartupTimeBomb()
    : startup_watchdog_(NULL),
      thread_id_(base::PlatformThread::CurrentId()) {
  CHECK(!g_startup_timebomb_);
  g_startup_timebomb_ = this;
}
1000
// Disarms any watchdog that is still present and unregisters this object as
// the global instance. DCHECKs that it runs on the constructing thread.
// NOTE(review): Disarm() ends in DeleteStartupWatchdog(), which re-posts
// itself bound to Unretained(this) when the watchdog is not yet joinable. If
// that path is taken from here, the task would later run on a destroyed
// object -- confirm callers always disarm before destruction.
StartupTimeBomb::~StartupTimeBomb() {
  DCHECK(this == g_startup_timebomb_);
  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
  if (startup_watchdog_)
    Disarm();
  g_startup_timebomb_ = NULL;
}
1008
1009void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
1010  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1011  DCHECK(!startup_watchdog_);
1012  startup_watchdog_ = new StartupWatchDogThread(duration);
1013  startup_watchdog_->Arm();
1014  return;
1015}
1016
1017void StartupTimeBomb::Disarm() {
1018  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1019  if (startup_watchdog_) {
1020    startup_watchdog_->Disarm();
1021    startup_watchdog_->Cleanup();
1022    DeleteStartupWatchdog();
1023  }
1024}
1025
1026void StartupTimeBomb::DeleteStartupWatchdog() {
1027  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1028  if (startup_watchdog_->IsJoinable()) {
1029    // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
1030    // very fast.
1031    base::ThreadRestrictions::SetIOAllowed(true);
1032    delete startup_watchdog_;
1033    startup_watchdog_ = NULL;
1034    return;
1035  }
1036  base::MessageLoop::current()->PostDelayedTask(
1037      FROM_HERE,
1038      base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
1039                 base::Unretained(this)),
1040      base::TimeDelta::FromSeconds(10));
1041}
1042
1043// static
1044void StartupTimeBomb::DisarmStartupTimeBomb() {
1045  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
1046  if (g_startup_timebomb_)
1047    g_startup_timebomb_->Disarm();
1048}
1049
1050// ShutdownWatcherHelper methods and members.
1051//
1052// ShutdownWatcherHelper is a wrapper class for detecting hangs during
1053// shutdown.
// Starts with no watchdog; one is created by Arm(). Records the constructing
// thread's id, which Arm() and the destructor DCHECK against.
ShutdownWatcherHelper::ShutdownWatcherHelper()
    : shutdown_watchdog_(NULL),
      thread_id_(base::PlatformThread::CurrentId()) {
}
1058
1059ShutdownWatcherHelper::~ShutdownWatcherHelper() {
1060  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1061  if (shutdown_watchdog_) {
1062    shutdown_watchdog_->Disarm();
1063    delete shutdown_watchdog_;
1064    shutdown_watchdog_ = NULL;
1065  }
1066}
1067
1068void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
1069  DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1070  DCHECK(!shutdown_watchdog_);
1071  base::TimeDelta actual_duration = duration;
1072
1073  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
1074  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
1075    actual_duration *= 20;
1076  } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
1077             channel == chrome::VersionInfo::CHANNEL_DEV) {
1078    actual_duration *= 10;
1079  }
1080
1081#if defined(OS_WIN)
1082  // On Windows XP, give twice the time for shutdown.
1083  if (base::win::GetVersion() <= base::win::VERSION_XP)
1084    actual_duration *= 2;
1085#endif
1086
1087  shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
1088  shutdown_watchdog_->Arm();
1089}
1090