thread_watcher.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// This file defines a WatchDog thread that monitors the responsiveness of other
6// browser threads like UI, IO, DB, FILE and CACHED threads. It also defines
7// ThreadWatcher class which performs health check on threads that would like to
8// be watched. This file also defines ThreadWatcherList class that has list of
9// all active ThreadWatcher objects.
10//
11// ThreadWatcher class sends ping message to the watched thread and the watched
12// thread responds back with a pong message. It uploads response time
13// (difference between ping and pong times) as a histogram.
14//
15// TODO(raman): ThreadWatcher can detect hung threads. If a hung thread is
16// detected, we should probably just crash, and allow the crash system to gather
17// the stack trace.
18//
19// Example Usage:
20//
21//   The following is an example for watching responsiveness of watched (IO)
22//   thread. |sleep_time| specifies how often ping messages have to be sent to
23//   watched (IO) thread. |unresponsive_time| is the wait time after ping
24//   message is sent, to check if we have received pong message or not.
25//   |unresponsive_threshold| specifies the number of unanswered ping messages
26//   after which watched (IO) thread is considered as not responsive.
27//   |crash_on_hang| specifies if we want to crash the browser when the watched
28//   (IO) thread has become sufficiently unresponsive, while other threads are
29//   sufficiently responsive. |live_threads_threshold| specifies the number of
30//   browser threads that are to be responsive when we want to crash the browser
31//   because of hung watched (IO) thread.
32//
33//   base::TimeDelta sleep_time = base::TimeDelta::FromSeconds(5);
34//   base::TimeDelta unresponsive_time = base::TimeDelta::FromSeconds(10);
35//   uint32 unresponsive_threshold = ThreadWatcherList::kUnresponsiveCount;
36//   bool crash_on_hang = false;
37//   uint32 live_threads_threshold = ThreadWatcherList::kLiveThreadsThreshold;
38//   ThreadWatcher::StartWatching(ThreadWatcher::WatchingParams(
39//       BrowserThread::IO, "IO", sleep_time, unresponsive_time,
40//       unresponsive_threshold, crash_on_hang, live_threads_threshold));
41
42#ifndef CHROME_BROWSER_METRICS_THREAD_WATCHER_H_
43#define CHROME_BROWSER_METRICS_THREAD_WATCHER_H_
44
45#include <map>
46#include <set>
47#include <string>
48#include <vector>
49
50#include "base/basictypes.h"
51#include "base/command_line.h"
52#include "base/gtest_prod_util.h"
53#include "base/memory/ref_counted.h"
54#include "base/memory/weak_ptr.h"
55#include "base/message_loop.h"
56#include "base/metrics/histogram.h"
57#include "base/synchronization/lock.h"
58#include "base/threading/platform_thread.h"
59#include "base/threading/thread.h"
60#include "base/threading/watchdog.h"
61#include "base/time.h"
62#include "content/public/browser/browser_thread.h"
63#include "content/public/browser/notification_observer.h"
64#include "content/public/browser/notification_registrar.h"
65
66class CustomThreadWatcher;
67class StartupTimeBomb;
68class ThreadWatcherList;
69class ThreadWatcherObserver;
70
71// This class performs health check on threads that would like to be watched.
72class ThreadWatcher {
73 public:
74  // base::Bind supports methods with up to 6 parameters, so StartWatching()'s
75  // arguments are bundled. Each is copied so a bound instance can't dangle.
76  struct WatchingParams {
77    const content::BrowserThread::ID thread_id;
78    const std::string thread_name;
79    const base::TimeDelta sleep_time;
80    const base::TimeDelta unresponsive_time;
81    uint32 unresponsive_threshold;
82    bool crash_on_hang;
83    uint32 live_threads_threshold;
84
85    WatchingParams(const content::BrowserThread::ID& thread_id_in,
86                   const std::string& thread_name_in,
87                   const base::TimeDelta& sleep_time_in,
88                   const base::TimeDelta& unresponsive_time_in,
89                   uint32 unresponsive_threshold_in,
90                   bool crash_on_hang_in,
91                   uint32 live_threads_threshold_in)
92        : thread_id(thread_id_in),
93          thread_name(thread_name_in),
94          sleep_time(sleep_time_in),
95          unresponsive_time(unresponsive_time_in),
96          unresponsive_threshold(unresponsive_threshold_in),
97          crash_on_hang(crash_on_hang_in),
98          live_threads_threshold(live_threads_threshold_in) {
99    }
100  };
101
102  // This method starts performing health check on the given |thread_id|. It
103  // will create ThreadWatcher object for the given |thread_id|, |thread_name|.
104  // |sleep_time| is the wait time between ping messages. |unresponsive_time| is
105  // the wait time after ping message is sent, to check if we have received pong
106  // message or not. |unresponsive_threshold| is used to determine if the thread
107  // is responsive or not. The watched thread is considered unresponsive if it
108  // hasn't responded with a pong message for |unresponsive_threshold| number of
109  // ping messages. |crash_on_hang| specifies if browser should be crashed when
110  // the watched thread is unresponsive. |live_threads_threshold| specifies the
111  // number of browser threads that are to be responsive when we want to crash
112  // the browser and watched thread has become sufficiently unresponsive. It
113  // will register that ThreadWatcher object and activate the thread watching of
114  // the given thread_id.
115  static void StartWatching(const WatchingParams& params);
116
117  // Return the |thread_id_| of the thread being watched.
118  content::BrowserThread::ID thread_id() const { return thread_id_; }
119
120  // Return the name of the thread being watched.
121  std::string thread_name() const { return thread_name_; }
122
123  // Return the sleep time between ping messages to be sent to the thread.
124  base::TimeDelta sleep_time() const { return sleep_time_; }
125
126  // Return the wait time to check the responsiveness of the thread.
127  base::TimeDelta unresponsive_time() const { return unresponsive_time_; }
128
129  // Returns true if we are monitoring the thread.
130  bool active() const { return active_; }
131
132  // Returns |ping_time_| (used by unit tests).
133  base::TimeTicks ping_time() const { return ping_time_; }
134
135  // Returns |ping_sequence_number_| (used by unit tests).
136  uint64 ping_sequence_number() const { return ping_sequence_number_; }
137
138 protected:
139  // Construct a ThreadWatcher from |params|. |params.sleep_time| is the wait
140  // time between ping messages. |params.unresponsive_time| is the wait time
141  // after ping message is sent, to check if we have received pong message.
142  explicit ThreadWatcher(const WatchingParams& params);
143
144  virtual ~ThreadWatcher();
145
146  // This method activates the thread watching which starts ping/pong messaging.
147  virtual void ActivateThreadWatching();
148
149  // This method de-activates the thread watching and revokes all tasks.
150  virtual void DeActivateThreadWatching();
151
152  // This will ensure that the watching is actively taking place, and awaken
153  // (i.e., post a PostPingMessage()) if the watcher has stopped pinging due to
154  // lack of user activity. It will also reset |ping_count_| to
155  // |unresponsive_threshold_|.
156  virtual void WakeUp();
157
158  // This method records when ping message was sent and it will Post a task
159  // (OnPingMessage()) to the watched thread that does nothing but respond with
160  // OnPongMessage(). It also posts a task (OnCheckResponsiveness()) to check
161  // responsiveness of monitored thread that would be called after waiting
162  // |unresponsive_time_|.
163  // This method is accessible on WatchDogThread.
164  virtual void PostPingMessage();
165
166  // This method handles a Pong Message from watched thread. It will track the
167  // response time (pong time minus ping time) via histograms. It posts a
168  // PostPingMessage() task that would be called after waiting |sleep_time_|. It
169  // increments |ping_sequence_number_| by 1.
170  // This method is accessible on WatchDogThread.
171  virtual void OnPongMessage(uint64 ping_sequence_number);
172
173  // This method will determine if the watched thread is responsive or not. If
174  // the latest |ping_sequence_number_| is not same as the
175  // |ping_sequence_number| that is passed in, then we can assume that watched
176  // thread has responded with a pong message.
177  // This method is accessible on WatchDogThread.
178  virtual void OnCheckResponsiveness(uint64 ping_sequence_number);
179
180  // Set by OnCheckResponsiveness when it determines if the watched thread is
181  // responsive or not.
182  bool responsive_;
183
184 private:
185  friend class ThreadWatcherList;
186  friend class CustomThreadWatcher;
187
188  // Allow tests to access our innards for testing purposes.
189  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, Registration);
190  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadResponding);
191  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadNotResponding);
192  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, MultipleThreadsResponding);
193  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, MultipleThreadsNotResponding);
194
195  // Post constructor initialization.
196  void Initialize();
197
198  // Watched thread does nothing except post callback_task to the WATCHDOG
199  // Thread. This method is called on watched thread.
200  static void OnPingMessage(const content::BrowserThread::ID& thread_id,
201                            const base::Closure& callback_task);
202
203  // This method resets |unresponsive_count_| to zero because watched thread is
204  // responding to the ping message with a pong message.
205  void ResetHangCounters();
206
207  // This method records watched thread is not responding to the ping message.
208  // It increments |unresponsive_count_| by 1.
209  void GotNoResponse();
210
211  // This method returns true if the watched thread has not responded with a
212  // pong message for |unresponsive_threshold_| number of ping messages.
213  bool IsVeryUnresponsive();
214
215  // The |thread_id_| of the thread being watched. Only one instance can exist
216  // for the given |thread_id_| of the thread being watched.
217  const content::BrowserThread::ID thread_id_;
218
219  // The name of the thread being watched.
220  const std::string thread_name_;
221
222  // Used to post messages to watched thread.
223  scoped_refptr<base::MessageLoopProxy> watched_loop_;
224
225  // It is the sleep time between the receipt of a pong message back, and the
226  // sending of another ping message.
227  const base::TimeDelta sleep_time_;
228
229  // It is the duration from sending a ping message, until we check status to be
230  // sure a pong message has been returned.
231  const base::TimeDelta unresponsive_time_;
232
233  // This is the last time when ping message was sent.
234  base::TimeTicks ping_time_;
235
236  // This is the last time when we got pong message.
237  base::TimeTicks pong_time_;
238
239  // This is the sequence number of the next ping for which there is no pong. If
240  // the instance is sleeping, then it will be the sequence number for the next
241  // ping.
242  uint64 ping_sequence_number_;
243
244  // This is set to true if thread watcher is watching.
245  bool active_;
246
247  // This counter tracks the least number of ping messages that will be sent to
248  // watched thread before the ping-pong mechanism will go into an extended
249  // sleep. If this value is zero, then the mechanism is in an extended sleep,
250  // and awaiting some observed user action before continuing.
251  int ping_count_;
252
253  // Histogram that keeps track of response times for the watched thread.
254  base::Histogram* response_time_histogram_;
255
256  // Histogram that keeps track of unresponsive time since the last pong message
257  // when we got no response (GotNoResponse()) from the watched thread.
258  base::Histogram* unresponsive_time_histogram_;
259
260  // Histogram that keeps track of how many threads are responding when we got
261  // no response (GotNoResponse()) from the watched thread.
262  base::Histogram* responsive_count_histogram_;
263
264  // Histogram that keeps track of how many threads are not responding when we
265  // got no response (GotNoResponse()) from the watched thread. Count includes
266  // the thread that got no response.
267  base::Histogram* unresponsive_count_histogram_;
268
269  // This counter tracks the unresponsiveness of watched thread. If this value
270  // is zero then watched thread has responded with a pong message. This is
271  // incremented by 1 when we got no response (GotNoResponse()) from the watched
272  // thread.
273  uint32 unresponsive_count_;
274
275  // This is set to true when we would have crashed the browser because the
276  // watched thread hasn't responded at least |unresponsive_threshold_| times.
277  // It is reset to false when watched thread responds with a pong message.
278  bool hung_processing_complete_;
279
280  // This is used to determine if the watched thread is responsive or not. If
281  // watched thread's |unresponsive_count_| is greater than or equal to
282  // |unresponsive_threshold_| then we would consider it as unresponsive.
283  uint32 unresponsive_threshold_;
284
285  // This is set to true if we want to crash the browser when the watched thread
286  // has become sufficiently unresponsive, while other threads are sufficiently
287  // responsive.
288  bool crash_on_hang_;
289
290  // This specifies the number of browser threads that are to be responsive when
291  // we want to crash the browser because watched thread has become sufficiently
292  // unresponsive.
293  uint32 live_threads_threshold_;
294
295  // We use this factory to create callback tasks for ThreadWatcher object. We
296  // use this during ping-pong messaging between WatchDog thread and watched
297  // thread.
298  base::WeakPtrFactory<ThreadWatcher> weak_ptr_factory_;
299
300  DISALLOW_COPY_AND_ASSIGN(ThreadWatcher);
301};
302
303// Class with a list of all active thread watchers.  A thread watcher is active
304// if it has been registered, which includes determining the histogram name.
305// This class provides utility functions to start and stop watching all browser
306// threads. Only one instance of this class exists.
307class ThreadWatcherList {
308 public:
309  // A map from BrowserThread to the actual instances.
310  typedef std::map<content::BrowserThread::ID, ThreadWatcher*> RegistrationList;
311
312  // This method posts a task on WatchDogThread to start watching all browser
313  // threads.
314  // This method is accessible on UI thread.
315  static void StartWatchingAll(const CommandLine& command_line);
316
317  // This method posts a task on WatchDogThread to RevokeAll tasks and to
318  // deactivate thread watching of other threads and tell NotificationService to
319  // stop calling Observe.
320  // This method is accessible on UI thread.
321  static void StopWatchingAll();
322
323  // Register() stores a pointer to the given ThreadWatcher in a global map.
324  static void Register(ThreadWatcher* watcher);
325
326  // This method returns true if the ThreadWatcher object is registered.
327  static bool IsRegistered(const content::BrowserThread::ID thread_id);
328
329  // This method reports how many watched threads are responsive/unresponsive.
330  static void GetStatusOfThreads(uint32* responding_thread_count,
331                                 uint32* unresponding_thread_count);
332
333  // This will ensure that the watching is actively taking place, and awaken
334  // all thread watchers that are registered.
335  static void WakeUpAll();
336
337 private:
338  // Allow tests to access our innards for testing purposes.
339  friend class CustomThreadWatcher;
340  friend class ThreadWatcherTest;
341  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, CommandLineArgs);
342
343  // This singleton holds the global list of registered ThreadWatchers.
344  ThreadWatcherList();
345
346  // Destructor deletes all registered ThreadWatcher instances.
347  virtual ~ThreadWatcherList();
348
349  // Parses the command line to get |unresponsive_threshold| from
350  // switches::kCrashOnHangSeconds, |crash_on_hang_thread_names| from
351  // switches::kCrashOnHangThreads and |live_threads_threshold| from
352  // switches::kCrashOnLive. |crash_on_hang_thread_names| is the set of watched
353  // thread's names that are to be crashed if they are not responding.
354  static void ParseCommandLine(
355      const CommandLine& command_line,
356      uint32* unresponsive_threshold,
357      std::set<std::string>* crash_on_hang_thread_names,
358      uint32* live_threads_threshold);
359
360  // This constructs the |ThreadWatcherList| singleton and starts watching
361  // browser threads by calling StartWatching() on each browser thread that is
362  // watched. It disarms StartupTimeBomb.
363  static void InitializeAndStartWatching(
364      uint32 unresponsive_threshold,
365      const std::set<std::string>& crash_on_hang_thread_names,
366      uint32 live_threads_threshold);
367
368  // This method calls ThreadWatcher::StartWatching() to perform health check on
369  // the given |thread_id|.
370  static void StartWatching(
371      const content::BrowserThread::ID& thread_id,
372      const std::string& thread_name,
373      const base::TimeDelta& sleep_time,
374      const base::TimeDelta& unresponsive_time,
375      uint32 unresponsive_threshold,
376      const std::set<std::string>& crash_on_hang_thread_names,
377      uint32 live_threads_threshold);
378
379  // Delete all thread watcher objects and remove them from global map. It also
380  // deletes |g_thread_watcher_list_|.
381  static void DeleteAll();
382
383  // The Find() method can be used to test to see if a given ThreadWatcher was
384  // already registered, or to retrieve a pointer to it from the global map.
385  static ThreadWatcher* Find(const content::BrowserThread::ID& thread_id);
386
387  // The singleton of this class; it is used to keep track of information about
388  // threads that are being watched.
389  static ThreadWatcherList* g_thread_watcher_list_;
390
391  // This is the wait time between ping messages.
392  static const int kSleepSeconds;
393
394  // This is the wait time after ping message is sent, to check if we have
395  // received pong message or not.
396  static const int kUnresponsiveSeconds;
397
398  // Default value for |unresponsive_threshold|.
399  static const int kUnresponsiveCount;
400
401  // Default value for |live_threads_threshold|.
402  static const int kLiveThreadsThreshold;
403
404  // Map of all registered watched threads, from thread_id to ThreadWatcher.
405  RegistrationList registered_;
406
407  DISALLOW_COPY_AND_ASSIGN(ThreadWatcherList);
408};
409
410// This class ensures that the thread watching is actively taking place. Only
411// one instance of this class exists.
412class ThreadWatcherObserver : public content::NotificationObserver {
413 public:
414  // Registers |g_thread_watcher_observer_| as the Notifications observer.
415  // |wakeup_interval| specifies how often to wake up thread watchers. This
416  // method is accessible on UI thread.
417  static void SetupNotifications(const base::TimeDelta& wakeup_interval);
418
419  // Removes all ints from |registrar_| and deletes
420  // |g_thread_watcher_observer_|. This method is accessible on UI thread.
421  static void RemoveNotifications();
422
423 private:
424  // Constructor of |g_thread_watcher_observer_| singleton.
425  explicit ThreadWatcherObserver(const base::TimeDelta& wakeup_interval);
426
427  // Destructor of |g_thread_watcher_observer_| singleton.
428  virtual ~ThreadWatcherObserver();
429
430  // This ensures all thread watchers are active because there is some user
431  // activity. It will wake up all thread watchers every |wakeup_interval_|
432  // seconds. This is the implementation of content::NotificationObserver. When
433  // a matching notification is posted to the notification service, this method
434  // is called.
435  virtual void Observe(int type,
436                       const content::NotificationSource& source,
437                       const content::NotificationDetails& details) OVERRIDE;
438
439  // The singleton of this class.
440  static ThreadWatcherObserver* g_thread_watcher_observer_;
441
442  // The registrar that holds ints to be observed.
443  content::NotificationRegistrar registrar_;
444
445  // This is the last time when we woke all thread watchers up.
446  base::TimeTicks last_wakeup_time_;
447
448  // It is the time interval between wake up calls to thread watchers.
449  const base::TimeDelta wakeup_interval_;
450
451  DISALLOW_COPY_AND_ASSIGN(ThreadWatcherObserver);
452};
453
454// Class for WatchDogThread. In its Init method, we start watching UI, IO,
455// DB, FILE, CACHED threads.
456class WatchDogThread : public base::Thread {
457 public:
458  // Constructor.
459  WatchDogThread();
460
461  // Destroys the thread and stops the thread.
462  virtual ~WatchDogThread();
463
464  // Callable on any thread.  Returns whether you're currently on a
465  // WatchDogThread.
466  static bool CurrentlyOnWatchDogThread();
467
468  // These are the same methods in message_loop.h, but are guaranteed to either
469  // get posted to the MessageLoop if it's still alive, or be deleted otherwise.
470  // They return true iff the watchdog thread existed and the task was posted.
471  // Note that even if the task is posted, there's no guarantee that it will
472  // run, since the target thread may already have a Quit message in its queue.
473  static bool PostTask(const tracked_objects::Location& from_here,
474                       const base::Closure& task);
475  static bool PostDelayedTask(const tracked_objects::Location& from_here,
476                              const base::Closure& task,
477                              base::TimeDelta delay);
478
479 protected:
480  virtual void Init() OVERRIDE;
481  virtual void CleanUp() OVERRIDE;
482
483 private:
484  static bool PostTaskHelper(
485      const tracked_objects::Location& from_here,
486      const base::Closure& task,
487      base::TimeDelta delay);
488
489  DISALLOW_COPY_AND_ASSIGN(WatchDogThread);
490};
491
492// This is a wrapper class for getting the crash dumps of the hangs during
493// startup.
494class StartupTimeBomb {
495 public:
496  // This singleton is instantiated when the browser process is launched.
497  StartupTimeBomb();
498
499  // Destructor disarms |startup_watchdog_| (if it is armed) so that alarm
500  // doesn't go off.
501  ~StartupTimeBomb();
502
503  // Constructs |startup_watchdog_| which spawns a thread and starts timer.
504  // |duration| specifies how long |startup_watchdog_| will wait before it
505  // calls alarm.
506  void Arm(const base::TimeDelta& duration);
507
508  // Disarms |startup_watchdog_| thread and then deletes it which stops the
509  // Watchdog thread.
510  void Disarm();
511
512  // Disarms |g_startup_timebomb_|.
513  static void DisarmStartupTimeBomb();
514
515 private:
516  // Deletes |startup_watchdog_| if it is joinable. If |startup_watchdog_| is
517  // not joinable, then it will post a delayed task to try again.
518  void DeleteStartupWatchdog();
519
520  // The singleton of this class.
521  static StartupTimeBomb* g_startup_timebomb_;
522
523  // Watches for hangs during startup until it is disarmed.
524  base::Watchdog* startup_watchdog_;
525
526  // The |thread_id_| on which this object is constructed.
527  const base::PlatformThreadId thread_id_;
528
529  DISALLOW_COPY_AND_ASSIGN(StartupTimeBomb);
530};
531
532// This is a wrapper class for detecting hangs during shutdown.
533class ShutdownWatcherHelper {
534 public:
535  // Create an empty holder for |shutdown_watchdog_|.
536  ShutdownWatcherHelper();
537
538  // Destructor disarms |shutdown_watchdog_| so that alarm doesn't go off.
539  ~ShutdownWatcherHelper();
540
541  // Constructs ShutdownWatchDogThread which spawns a thread and starts timer.
542  // |duration| specifies how long it will wait before it calls alarm.
543  void Arm(const base::TimeDelta& duration);
544
545 private:
546  // |shutdown_watchdog_| watches for hangs during shutdown.
547  base::Watchdog* shutdown_watchdog_;
548
549  // The |thread_id_| on which this object is constructed.
550  const base::PlatformThreadId thread_id_;
551
552  DISALLOW_COPY_AND_ASSIGN(ShutdownWatcherHelper);
553};
554
555#endif  // CHROME_BROWSER_METRICS_THREAD_WATCHER_H_
556