thread_watcher.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file defines a WatchDog thread that monitors the responsiveness of other
// browser threads like UI, IO, DB, FILE and CACHED threads. It also defines
// ThreadWatcher class which performs health check on threads that would like to
// be watched. This file also defines ThreadWatcherList class that has list of
// all active ThreadWatcher objects.
//
// ThreadWatcher class sends ping message to the watched thread and the watched
// thread responds back with a pong message. It uploads response time
// (difference between ping and pong times) as a histogram.
//
// TODO(raman): ThreadWatcher can detect hung threads. If a hung thread is
// detected, we should probably just crash, and allow the crash system to gather
// the stack trace.
//
// Example Usage:
//
//   The following is an example for watching responsiveness of watched (IO)
//   thread. |sleep_time| specifies how often ping messages have to be sent to
//   watched (IO) thread. |unresponsive_time| is the wait time after ping
//   message is sent, to check if we have received pong message or not.
//   |unresponsive_threshold| specifies the number of unanswered ping messages
//   after which watched (IO) thread is considered as not responsive.
//   |crash_on_hang| specifies if we want to crash the browser when the watched
//   (IO) thread has become sufficiently unresponsive, while other threads are
//   sufficiently responsive. |live_threads_threshold| specifies the number of
//   browser threads that are to be responsive when we want to crash the browser
//   because of hung watched (IO) thread.
32// 33// base::TimeDelta sleep_time = base::TimeDelta::FromSeconds(5); 34// base::TimeDelta unresponsive_time = base::TimeDelta::FromSeconds(10); 35// uint32 unresponsive_threshold = ThreadWatcherList::kUnresponsiveCount; 36// bool crash_on_hang = false; 37// uint32 live_threads_threshold = ThreadWatcherList::kLiveThreadsThreshold; 38// ThreadWatcher::StartWatching( 39// BrowserThread::IO, "IO", sleep_time, unresponsive_time, 40// unresponsive_threshold, crash_on_hang, live_threads_threshold); 41 42#ifndef CHROME_BROWSER_METRICS_THREAD_WATCHER_H_ 43#define CHROME_BROWSER_METRICS_THREAD_WATCHER_H_ 44 45#include <map> 46#include <set> 47#include <string> 48#include <vector> 49 50#include "base/basictypes.h" 51#include "base/command_line.h" 52#include "base/gtest_prod_util.h" 53#include "base/memory/ref_counted.h" 54#include "base/memory/weak_ptr.h" 55#include "base/message_loop.h" 56#include "base/metrics/histogram.h" 57#include "base/synchronization/lock.h" 58#include "base/threading/platform_thread.h" 59#include "base/threading/thread.h" 60#include "base/threading/watchdog.h" 61#include "base/time.h" 62#include "content/public/browser/browser_thread.h" 63#include "content/public/browser/notification_observer.h" 64#include "content/public/browser/notification_registrar.h" 65 66class CustomThreadWatcher; 67class StartupTimeBomb; 68class ThreadWatcherList; 69class ThreadWatcherObserver; 70 71// This class performs health check on threads that would like to be watched. 72class ThreadWatcher { 73 public: 74 // base::Bind supports methods with up to 6 parameters. WatchingParams is used 75 // as a workaround that limitation for invoking ThreadWatcher::StartWatching. 
76 struct WatchingParams { 77 const content::BrowserThread::ID& thread_id; 78 const std::string& thread_name; 79 const base::TimeDelta& sleep_time; 80 const base::TimeDelta& unresponsive_time; 81 uint32 unresponsive_threshold; 82 bool crash_on_hang; 83 uint32 live_threads_threshold; 84 85 WatchingParams(const content::BrowserThread::ID& thread_id_in, 86 const std::string& thread_name_in, 87 const base::TimeDelta& sleep_time_in, 88 const base::TimeDelta& unresponsive_time_in, 89 uint32 unresponsive_threshold_in, 90 bool crash_on_hang_in, 91 uint32 live_threads_threshold_in) 92 : thread_id(thread_id_in), 93 thread_name(thread_name_in), 94 sleep_time(sleep_time_in), 95 unresponsive_time(unresponsive_time_in), 96 unresponsive_threshold(unresponsive_threshold_in), 97 crash_on_hang(crash_on_hang_in), 98 live_threads_threshold(live_threads_threshold_in) { 99 } 100 }; 101 102 // This method starts performing health check on the given |thread_id|. It 103 // will create ThreadWatcher object for the given |thread_id|, |thread_name|. 104 // |sleep_time| is the wait time between ping messages. |unresponsive_time| is 105 // the wait time after ping message is sent, to check if we have received pong 106 // message or not. |unresponsive_threshold| is used to determine if the thread 107 // is responsive or not. The watched thread is considered unresponsive if it 108 // hasn't responded with a pong message for |unresponsive_threshold| number of 109 // ping messages. |crash_on_hang| specifies if browser should be crashed when 110 // the watched thread is unresponsive. |live_threads_threshold| specifies the 111 // number of browser threads that are to be responsive when we want to crash 112 // the browser and watched thread has become sufficiently unresponsive. It 113 // will register that ThreadWatcher object and activate the thread watching of 114 // the given thread_id. 
115 static void StartWatching(const WatchingParams& params); 116 117 // Return the |thread_id_| of the thread being watched. 118 content::BrowserThread::ID thread_id() const { return thread_id_; } 119 120 // Return the name of the thread being watched. 121 std::string thread_name() const { return thread_name_; } 122 123 // Return the sleep time between ping messages to be sent to the thread. 124 base::TimeDelta sleep_time() const { return sleep_time_; } 125 126 // Return the the wait time to check the responsiveness of the thread. 127 base::TimeDelta unresponsive_time() const { return unresponsive_time_; } 128 129 // Returns true if we are montioring the thread. 130 bool active() const { return active_; } 131 132 // Returns |ping_time_| (used by unit tests). 133 base::TimeTicks ping_time() const { return ping_time_; } 134 135 // Returns |ping_sequence_number_| (used by unit tests). 136 uint64 ping_sequence_number() const { return ping_sequence_number_; } 137 138 protected: 139 // Construct a ThreadWatcher for the given |thread_id|. |sleep_time| is the 140 // wait time between ping messages. |unresponsive_time| is the wait time after 141 // ping message is sent, to check if we have received pong message or not. 142 explicit ThreadWatcher(const WatchingParams& params); 143 144 virtual ~ThreadWatcher(); 145 146 // This method activates the thread watching which starts ping/pong messaging. 147 virtual void ActivateThreadWatching(); 148 149 // This method de-activates the thread watching and revokes all tasks. 150 virtual void DeActivateThreadWatching(); 151 152 // This will ensure that the watching is actively taking place, and awaken 153 // (i.e., post a PostPingMessage()) if the watcher has stopped pinging due to 154 // lack of user activity. It will also reset |ping_count_| to 155 // |unresponsive_threshold_|. 
156 virtual void WakeUp(); 157 158 // This method records when ping message was sent and it will Post a task 159 // (OnPingMessage()) to the watched thread that does nothing but respond with 160 // OnPongMessage(). It also posts a task (OnCheckResponsiveness()) to check 161 // responsiveness of monitored thread that would be called after waiting 162 // |unresponsive_time_|. 163 // This method is accessible on WatchDogThread. 164 virtual void PostPingMessage(); 165 166 // This method handles a Pong Message from watched thread. It will track the 167 // response time (pong time minus ping time) via histograms. It posts a 168 // PostPingMessage() task that would be called after waiting |sleep_time_|. It 169 // increments |ping_sequence_number_| by 1. 170 // This method is accessible on WatchDogThread. 171 virtual void OnPongMessage(uint64 ping_sequence_number); 172 173 // This method will determine if the watched thread is responsive or not. If 174 // the latest |ping_sequence_number_| is not same as the 175 // |ping_sequence_number| that is passed in, then we can assume that watched 176 // thread has responded with a pong message. 177 // This method is accessible on WatchDogThread. 178 virtual void OnCheckResponsiveness(uint64 ping_sequence_number); 179 180 // Set by OnCheckResponsiveness when it determines if the watched thread is 181 // responsive or not. 182 bool responsive_; 183 184 private: 185 friend class ThreadWatcherList; 186 friend class CustomThreadWatcher; 187 188 // Allow tests to access our innards for testing purposes. 189 FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, Registration); 190 FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadResponding); 191 FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadNotResponding); 192 FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, MultipleThreadsResponding); 193 FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, MultipleThreadsNotResponding); 194 195 // Post constructor initialization. 
196 void Initialize(); 197 198 // Watched thread does nothing except post callback_task to the WATCHDOG 199 // Thread. This method is called on watched thread. 200 static void OnPingMessage(const content::BrowserThread::ID& thread_id, 201 const base::Closure& callback_task); 202 203 // This method resets |unresponsive_count_| to zero because watched thread is 204 // responding to the ping message with a pong message. 205 void ResetHangCounters(); 206 207 // This method records watched thread is not responding to the ping message. 208 // It increments |unresponsive_count_| by 1. 209 void GotNoResponse(); 210 211 // This method returns true if the watched thread has not responded with a 212 // pong message for |unresponsive_threshold_| number of ping messages. 213 bool IsVeryUnresponsive(); 214 215 // The |thread_id_| of the thread being watched. Only one instance can exist 216 // for the given |thread_id_| of the thread being watched. 217 const content::BrowserThread::ID thread_id_; 218 219 // The name of the thread being watched. 220 const std::string thread_name_; 221 222 // Used to post messages to watched thread. 223 scoped_refptr<base::MessageLoopProxy> watched_loop_; 224 225 // It is the sleep time between the receipt of a pong message back, and the 226 // sending of another ping message. 227 const base::TimeDelta sleep_time_; 228 229 // It is the duration from sending a ping message, until we check status to be 230 // sure a pong message has been returned. 231 const base::TimeDelta unresponsive_time_; 232 233 // This is the last time when ping message was sent. 234 base::TimeTicks ping_time_; 235 236 // This is the last time when we got pong message. 237 base::TimeTicks pong_time_; 238 239 // This is the sequence number of the next ping for which there is no pong. If 240 // the instance is sleeping, then it will be the sequence number for the next 241 // ping. 242 uint64 ping_sequence_number_; 243 244 // This is set to true if thread watcher is watching. 
245 bool active_; 246 247 // The counter tracks least number of ping messages that will be sent to 248 // watched thread before the ping-pong mechanism will go into an extended 249 // sleep. If this value is zero, then the mechanism is in an extended sleep, 250 // and awaiting some observed user action before continuing. 251 int ping_count_; 252 253 // Histogram that keeps track of response times for the watched thread. 254 base::Histogram* response_time_histogram_; 255 256 // Histogram that keeps track of unresponsive time since the last pong message 257 // when we got no response (GotNoResponse()) from the watched thread. 258 base::Histogram* unresponsive_time_histogram_; 259 260 // Histogram that keeps track of how many threads are responding when we got 261 // no response (GotNoResponse()) from the watched thread. 262 base::Histogram* responsive_count_histogram_; 263 264 // Histogram that keeps track of how many threads are not responding when we 265 // got no response (GotNoResponse()) from the watched thread. Count includes 266 // the thread that got no response. 267 base::Histogram* unresponsive_count_histogram_; 268 269 // This counter tracks the unresponsiveness of watched thread. If this value 270 // is zero then watched thread has responded with a pong message. This is 271 // incremented by 1 when we got no response (GotNoResponse()) from the watched 272 // thread. 273 uint32 unresponsive_count_; 274 275 // This is set to true when we would have crashed the browser because the 276 // watched thread hasn't responded at least |unresponsive_threshold_| times. 277 // It is reset to false when watched thread responds with a pong message. 278 bool hung_processing_complete_; 279 280 // This is used to determine if the watched thread is responsive or not. If 281 // watched thread's |unresponsive_count_| is greater than or equal to 282 // |unresponsive_threshold_| then we would consider it as unresponsive. 
283 uint32 unresponsive_threshold_; 284 285 // This is set to true if we want to crash the browser when the watched thread 286 // has become sufficiently unresponsive, while other threads are sufficiently 287 // responsive. 288 bool crash_on_hang_; 289 290 // This specifies the number of browser threads that are to be responsive when 291 // we want to crash the browser because watched thread has become sufficiently 292 // unresponsive. 293 uint32 live_threads_threshold_; 294 295 // We use this factory to create callback tasks for ThreadWatcher object. We 296 // use this during ping-pong messaging between WatchDog thread and watched 297 // thread. 298 base::WeakPtrFactory<ThreadWatcher> weak_ptr_factory_; 299 300 DISALLOW_COPY_AND_ASSIGN(ThreadWatcher); 301}; 302 303// Class with a list of all active thread watchers. A thread watcher is active 304// if it has been registered, which includes determing the histogram name. This 305// class provides utility functions to start and stop watching all browser 306// threads. Only one instance of this class exists. 307class ThreadWatcherList { 308 public: 309 // A map from BrowserThread to the actual instances. 310 typedef std::map<content::BrowserThread::ID, ThreadWatcher*> RegistrationList; 311 312 // This method posts a task on WatchDogThread to start watching all browser 313 // threads. 314 // This method is accessible on UI thread. 315 static void StartWatchingAll(const CommandLine& command_line); 316 317 // This method posts a task on WatchDogThread to RevokeAll tasks and to 318 // deactive thread watching of other threads and tell NotificationService to 319 // stop calling Observe. 320 // This method is accessible on UI thread. 321 static void StopWatchingAll(); 322 323 // Register() stores a pointer to the given ThreadWatcher in a global map. 324 static void Register(ThreadWatcher* watcher); 325 326 // This method returns true if the ThreadWatcher object is registerd. 
327 static bool IsRegistered(const content::BrowserThread::ID thread_id); 328 329 // This method returns number of responsive and unresponsive watched threads. 330 static void GetStatusOfThreads(uint32* responding_thread_count, 331 uint32* unresponding_thread_count); 332 333 // This will ensure that the watching is actively taking place, and awaken 334 // all thread watchers that are registered. 335 static void WakeUpAll(); 336 337 private: 338 // Allow tests to access our innards for testing purposes. 339 friend class CustomThreadWatcher; 340 friend class ThreadWatcherTest; 341 FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, CommandLineArgs); 342 343 // This singleton holds the global list of registered ThreadWatchers. 344 ThreadWatcherList(); 345 346 // Destructor deletes all registered ThreadWatcher instances. 347 virtual ~ThreadWatcherList(); 348 349 // Parses the command line to get |unresponsive_threshold| from 350 // switches::kCrashOnHangSeconds, |crash_on_hang_thread_names| from 351 // switches::kCrashOnHangThreads and |live_threads_threshold| from 352 // switches::kCrashOnLive. |crash_on_hang_thread_names| is the set of watched 353 // thread's names that are to be crashed if they are not responding. 354 static void ParseCommandLine( 355 const CommandLine& command_line, 356 uint32* unresponsive_threshold, 357 std::set<std::string>* crash_on_hang_thread_names, 358 uint32* live_threads_threshold); 359 360 // This constructs the |ThreadWatcherList| singleton and starts watching 361 // browser threads by calling StartWatching() on each browser thread that is 362 // watched. It disarms StartupTimeBomb. 363 static void InitializeAndStartWatching( 364 uint32 unresponsive_threshold, 365 const std::set<std::string>& crash_on_hang_thread_names, 366 uint32 live_threads_threshold); 367 368 // This method calls ThreadWatcher::StartWatching() to perform health check on 369 // the given |thread_id|. 
370 static void StartWatching( 371 const content::BrowserThread::ID& thread_id, 372 const std::string& thread_name, 373 const base::TimeDelta& sleep_time, 374 const base::TimeDelta& unresponsive_time, 375 uint32 unresponsive_threshold, 376 const std::set<std::string>& crash_on_hang_thread_names, 377 uint32 live_threads_threshold); 378 379 // Delete all thread watcher objects and remove them from global map. It also 380 // deletes |g_thread_watcher_list_|. 381 static void DeleteAll(); 382 383 // The Find() method can be used to test to see if a given ThreadWatcher was 384 // already registered, or to retrieve a pointer to it from the global map. 385 static ThreadWatcher* Find(const content::BrowserThread::ID& thread_id); 386 387 // The singleton of this class and is used to keep track of information about 388 // threads that are being watched. 389 static ThreadWatcherList* g_thread_watcher_list_; 390 391 // This is the wait time between ping messages. 392 static const int kSleepSeconds; 393 394 // This is the wait time after ping message is sent, to check if we have 395 // received pong message or not. 396 static const int kUnresponsiveSeconds; 397 398 // Default values for |unresponsive_threshold|. 399 static const int kUnresponsiveCount; 400 401 // Default values for |live_threads_threshold|. 402 static const int kLiveThreadsThreshold; 403 404 // Map of all registered watched threads, from thread_id to ThreadWatcher. 405 RegistrationList registered_; 406 407 DISALLOW_COPY_AND_ASSIGN(ThreadWatcherList); 408}; 409 410// This class ensures that the thread watching is actively taking place. Only 411// one instance of this class exists. 412class ThreadWatcherObserver : public content::NotificationObserver { 413 public: 414 // Registers |g_thread_watcher_observer_| as the Notifications observer. 415 // |wakeup_interval| specifies how often to wake up thread watchers. This 416 // method is accessible on UI thread. 
417 static void SetupNotifications(const base::TimeDelta& wakeup_interval); 418 419 // Removes all ints from |registrar_| and deletes 420 // |g_thread_watcher_observer_|. This method is accessible on UI thread. 421 static void RemoveNotifications(); 422 423 private: 424 // Constructor of |g_thread_watcher_observer_| singleton. 425 explicit ThreadWatcherObserver(const base::TimeDelta& wakeup_interval); 426 427 // Destructor of |g_thread_watcher_observer_| singleton. 428 virtual ~ThreadWatcherObserver(); 429 430 // This ensures all thread watchers are active because there is some user 431 // activity. It will wake up all thread watchers every |wakeup_interval_| 432 // seconds. This is the implementation of content::NotificationObserver. When 433 // a matching notification is posted to the notification service, this method 434 // is called. 435 virtual void Observe(int type, 436 const content::NotificationSource& source, 437 const content::NotificationDetails& details) OVERRIDE; 438 439 // The singleton of this class. 440 static ThreadWatcherObserver* g_thread_watcher_observer_; 441 442 // The registrar that holds ints to be observed. 443 content::NotificationRegistrar registrar_; 444 445 // This is the last time when woke all thread watchers up. 446 base::TimeTicks last_wakeup_time_; 447 448 // It is the time interval between wake up calls to thread watchers. 449 const base::TimeDelta wakeup_interval_; 450 451 DISALLOW_COPY_AND_ASSIGN(ThreadWatcherObserver); 452}; 453 454// Class for WatchDogThread and in its Init method, we start watching UI, IO, 455// DB, FILE, CACHED threads. 456class WatchDogThread : public base::Thread { 457 public: 458 // Constructor. 459 WatchDogThread(); 460 461 // Destroys the thread and stops the thread. 462 virtual ~WatchDogThread(); 463 464 // Callable on any thread. Returns whether you're currently on a 465 // WatchDogThread. 
466 static bool CurrentlyOnWatchDogThread(); 467 468 // These are the same methods in message_loop.h, but are guaranteed to either 469 // get posted to the MessageLoop if it's still alive, or be deleted otherwise. 470 // They return true iff the watchdog thread existed and the task was posted. 471 // Note that even if the task is posted, there's no guarantee that it will 472 // run, since the target thread may already have a Quit message in its queue. 473 static bool PostTask(const tracked_objects::Location& from_here, 474 const base::Closure& task); 475 static bool PostDelayedTask(const tracked_objects::Location& from_here, 476 const base::Closure& task, 477 base::TimeDelta delay); 478 479 protected: 480 virtual void Init() OVERRIDE; 481 virtual void CleanUp() OVERRIDE; 482 483 private: 484 static bool PostTaskHelper( 485 const tracked_objects::Location& from_here, 486 const base::Closure& task, 487 base::TimeDelta delay); 488 489 DISALLOW_COPY_AND_ASSIGN(WatchDogThread); 490}; 491 492// This is a wrapper class for getting the crash dumps of the hangs during 493// startup. 494class StartupTimeBomb { 495 public: 496 // This singleton is instantiated when the browser process is launched. 497 StartupTimeBomb(); 498 499 // Destructor disarm's startup_watchdog_ (if it is arm'ed) so that alarm 500 // doesn't go off. 501 ~StartupTimeBomb(); 502 503 // Constructs |startup_watchdog_| which spawns a thread and starts timer. 504 // |duration| specifies how long |startup_watchdog_| will wait before it 505 // calls alarm. 506 void Arm(const base::TimeDelta& duration); 507 508 // Disarms |startup_watchdog_| thread and then deletes it which stops the 509 // Watchdog thread. 510 void Disarm(); 511 512 // Disarms |g_startup_timebomb_|. 513 static void DisarmStartupTimeBomb(); 514 515 private: 516 // Deletes |startup_watchdog_| if it is joinable. If |startup_watchdog_| is 517 // not joinable, then it will post a delayed task to try again. 
518 void DeleteStartupWatchdog(); 519 520 // The singleton of this class. 521 static StartupTimeBomb* g_startup_timebomb_; 522 523 // Watches for hangs during startup until it is disarm'ed. 524 base::Watchdog* startup_watchdog_; 525 526 // The |thread_id_| on which this object is constructed. 527 const base::PlatformThreadId thread_id_; 528 529 DISALLOW_COPY_AND_ASSIGN(StartupTimeBomb); 530}; 531 532// This is a wrapper class for detecting hangs during shutdown. 533class ShutdownWatcherHelper { 534 public: 535 // Create an empty holder for |shutdown_watchdog_|. 536 ShutdownWatcherHelper(); 537 538 // Destructor disarm's shutdown_watchdog_ so that alarm doesn't go off. 539 ~ShutdownWatcherHelper(); 540 541 // Constructs ShutdownWatchDogThread which spawns a thread and starts timer. 542 // |duration| specifies how long it will wait before it calls alarm. 543 void Arm(const base::TimeDelta& duration); 544 545 private: 546 // shutdown_watchdog_ watches for hangs during shutdown. 547 base::Watchdog* shutdown_watchdog_; 548 549 // The |thread_id_| on which this object is constructed. 550 const base::PlatformThreadId thread_id_; 551 552 DISALLOW_COPY_AND_ASSIGN(ShutdownWatcherHelper); 553}; 554 555#endif // CHROME_BROWSER_METRICS_THREAD_WATCHER_H_ 556