1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#if defined(OS_WIN)
6#include <windows.h>
7#endif
8
9#include "content/gpu/gpu_watchdog_thread.h"
10
11#include "base/bind.h"
12#include "base/bind_helpers.h"
13#include "base/command_line.h"
14#include "base/compiler_specific.h"
15#include "base/files/file_util.h"
16#include "base/power_monitor/power_monitor.h"
17#include "base/process/process.h"
18#include "build/build_config.h"
19#include "content/public/common/content_switches.h"
20#include "content/public/common/result_codes.h"
21
22namespace content {
23namespace {
24const int64 kCheckPeriodMs = 2000;
25#if defined(OS_CHROMEOS)
26const base::FilePath::CharType
27    kTtyFilePath[] = FILE_PATH_LITERAL("/sys/class/tty/tty0/active");
28#endif
29}  // namespace
30
31GpuWatchdogThread::GpuWatchdogThread(int timeout)
32    : base::Thread("Watchdog"),
33      watched_message_loop_(base::MessageLoop::current()),
34      timeout_(base::TimeDelta::FromMilliseconds(timeout)),
35      armed_(false),
36#if defined(OS_WIN)
37      watched_thread_handle_(0),
38      arm_cpu_time_(),
39#endif
40      task_observer_(this),
41      suspended_(false),
42      weak_factory_(this) {
43  DCHECK(timeout >= 0);
44
45#if defined(OS_WIN)
46  // GetCurrentThread returns a pseudo-handle that cannot be used by one thread
47  // to identify another. DuplicateHandle creates a "real" handle that can be
48  // used for this purpose.
49  BOOL result = DuplicateHandle(GetCurrentProcess(),
50                                GetCurrentThread(),
51                                GetCurrentProcess(),
52                                &watched_thread_handle_,
53                                THREAD_QUERY_INFORMATION,
54                                FALSE,
55                                0);
56  DCHECK(result);
57#endif
58
59#if defined(OS_CHROMEOS)
60  tty_file_ = base::OpenFile(base::FilePath(kTtyFilePath), "r");
61#endif
62  watched_message_loop_->AddTaskObserver(&task_observer_);
63}
64
65void GpuWatchdogThread::PostAcknowledge() {
66  // Called on the monitored thread. Responds with OnAcknowledge. Cannot use
67  // the method factory. Rely on reference counting instead.
68  message_loop()->PostTask(
69      FROM_HERE,
70      base::Bind(&GpuWatchdogThread::OnAcknowledge, this));
71}
72
73void GpuWatchdogThread::CheckArmed() {
74  // Acknowledge the watchdog if it has armed itself. The watchdog will not
75  // change its armed state until it is acknowledged.
76  if (armed()) {
77    PostAcknowledge();
78  }
79}
80
81void GpuWatchdogThread::Init() {
82  // Schedule the first check.
83  OnCheck(false);
84}
85
86void GpuWatchdogThread::CleanUp() {
87  weak_factory_.InvalidateWeakPtrs();
88}
89
90GpuWatchdogThread::GpuWatchdogTaskObserver::GpuWatchdogTaskObserver(
91    GpuWatchdogThread* watchdog)
92    : watchdog_(watchdog) {
93}
94
95GpuWatchdogThread::GpuWatchdogTaskObserver::~GpuWatchdogTaskObserver() {
96}
97
98void GpuWatchdogThread::GpuWatchdogTaskObserver::WillProcessTask(
99    const base::PendingTask& pending_task) {
100  watchdog_->CheckArmed();
101}
102
103void GpuWatchdogThread::GpuWatchdogTaskObserver::DidProcessTask(
104    const base::PendingTask& pending_task) {
105  watchdog_->CheckArmed();
106}
107
108GpuWatchdogThread::~GpuWatchdogThread() {
109  // Verify that the thread was explicitly stopped. If the thread is stopped
110  // implicitly by the destructor, CleanUp() will not be called.
111  DCHECK(!weak_factory_.HasWeakPtrs());
112
113#if defined(OS_WIN)
114  CloseHandle(watched_thread_handle_);
115#endif
116
117  base::PowerMonitor* power_monitor = base::PowerMonitor::Get();
118  if (power_monitor)
119    power_monitor->RemoveObserver(this);
120
121#if defined(OS_CHROMEOS)
122  if (tty_file_)
123    fclose(tty_file_);
124#endif
125
126  watched_message_loop_->RemoveTaskObserver(&task_observer_);
127}
128
129void GpuWatchdogThread::OnAcknowledge() {
130  CHECK(base::PlatformThread::CurrentId() == thread_id());
131
132  // The check has already been acknowledged and another has already been
133  // scheduled by a previous call to OnAcknowledge. It is normal for a
134  // watched thread to see armed_ being true multiple times before
135  // the OnAcknowledge task is run on the watchdog thread.
136  if (!armed_)
137    return;
138
139  // Revoke any pending hang termination.
140  weak_factory_.InvalidateWeakPtrs();
141  armed_ = false;
142
143  if (suspended_)
144    return;
145
146  // If it took a long time for the acknowledgement, assume the computer was
147  // recently suspended.
148  bool was_suspended = (base::Time::Now() > suspension_timeout_);
149
150  // The monitored thread has responded. Post a task to check it again.
151  message_loop()->PostDelayedTask(
152      FROM_HERE,
153      base::Bind(&GpuWatchdogThread::OnCheck, weak_factory_.GetWeakPtr(),
154          was_suspended),
155      base::TimeDelta::FromMilliseconds(kCheckPeriodMs));
156}
157
158void GpuWatchdogThread::OnCheck(bool after_suspend) {
159  CHECK(base::PlatformThread::CurrentId() == thread_id());
160
161  // Do not create any new termination tasks if one has already been created
162  // or the system is suspended.
163  if (armed_ || suspended_)
164    return;
165
166  // Must set armed before posting the task. This task might be the only task
167  // that will activate the TaskObserver on the watched thread and it must not
168  // miss the false -> true transition.
169  armed_ = true;
170
171#if defined(OS_WIN)
172  arm_cpu_time_ = GetWatchedThreadTime();
173#endif
174
175  // Immediately after the computer is woken up from being suspended it might
176  // be pretty sluggish, so allow some extra time before the next timeout.
177  base::TimeDelta timeout = timeout_ * (after_suspend ? 3 : 1);
178  suspension_timeout_ = base::Time::Now() + timeout * 2;
179
180  // Post a task to the monitored thread that does nothing but wake up the
181  // TaskObserver. Any other tasks that are pending on the watched thread will
182  // also wake up the observer. This simply ensures there is at least one.
183  watched_message_loop_->PostTask(
184      FROM_HERE,
185      base::Bind(&base::DoNothing));
186
187  // Post a task to the watchdog thread to exit if the monitored thread does
188  // not respond in time.
189  message_loop()->PostDelayedTask(
190      FROM_HERE,
191      base::Bind(
192          &GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang,
193          weak_factory_.GetWeakPtr()),
194      timeout);
195}
196
197// Use the --disable-gpu-watchdog command line switch to disable this.
198void GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang() {
199  // Should not get here while the system is suspended.
200  DCHECK(!suspended_);
201
202#if defined(OS_WIN)
203  // Defer termination until a certain amount of CPU time has elapsed on the
204  // watched thread.
205  base::TimeDelta time_since_arm = GetWatchedThreadTime() - arm_cpu_time_;
206  if (time_since_arm < timeout_) {
207    message_loop()->PostDelayedTask(
208        FROM_HERE,
209        base::Bind(
210            &GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang,
211            weak_factory_.GetWeakPtr()),
212        timeout_ - time_since_arm);
213    return;
214  }
215#endif
216
217  // If the watchdog woke up significantly behind schedule, disarm and reset
218  // the watchdog check. This is to prevent the watchdog thread from terminating
219  // when a machine wakes up from sleep or hibernation, which would otherwise
220  // appear to be a hang.
221  if (base::Time::Now() > suspension_timeout_) {
222    armed_ = false;
223    OnCheck(true);
224    return;
225  }
226
227  // For minimal developer annoyance, don't keep terminating. You need to skip
228  // the call to base::Process::Terminate below in a debugger for this to be
229  // useful.
230  static bool terminated = false;
231  if (terminated)
232    return;
233
234#if defined(OS_WIN)
235  if (IsDebuggerPresent())
236    return;
237#endif
238
239#if defined(OS_CHROMEOS)
240  // Don't crash if we're not on tty1. This avoids noise in the GPU process
241  // crashes caused by people who use VT2 but still enable crash reporting.
242  char tty_string[8] = {0};
243  if (tty_file_ &&
244      !fseek(tty_file_, 0, SEEK_SET) &&
245      fread(tty_string, 1, 7, tty_file_)) {
246    int tty_number = -1;
247    int num_res = sscanf(tty_string, "tty%d", &tty_number);
248    if (num_res == 1 && tty_number != 1)
249      return;
250  }
251#endif
252
253  LOG(ERROR) << "The GPU process hung. Terminating after "
254             << timeout_.InMilliseconds() << " ms.";
255
256  // Deliberately crash the process to create a crash dump.
257  *((volatile int*)0) = 0x1337;
258
259  terminated = true;
260}
261
262void GpuWatchdogThread::AddPowerObserver() {
263  message_loop()->PostTask(
264      FROM_HERE,
265      base::Bind(&GpuWatchdogThread::OnAddPowerObserver, this));
266}
267
268void GpuWatchdogThread::OnAddPowerObserver() {
269  base::PowerMonitor* power_monitor = base::PowerMonitor::Get();
270  DCHECK(power_monitor);
271  power_monitor->AddObserver(this);
272}
273
274void GpuWatchdogThread::OnSuspend() {
275  suspended_ = true;
276
277  // When suspending force an acknowledgement to cancel any pending termination
278  // tasks.
279  OnAcknowledge();
280}
281
282void GpuWatchdogThread::OnResume() {
283  suspended_ = false;
284
285  // After resuming jump-start the watchdog again.
286  armed_ = false;
287  OnCheck(true);
288}
289
290#if defined(OS_WIN)
291base::TimeDelta GpuWatchdogThread::GetWatchedThreadTime() {
292  FILETIME creation_time;
293  FILETIME exit_time;
294  FILETIME user_time;
295  FILETIME kernel_time;
296  BOOL result = GetThreadTimes(watched_thread_handle_,
297                               &creation_time,
298                               &exit_time,
299                               &kernel_time,
300                               &user_time);
301  DCHECK(result);
302
303  ULARGE_INTEGER user_time64;
304  user_time64.HighPart = user_time.dwHighDateTime;
305  user_time64.LowPart = user_time.dwLowDateTime;
306
307  ULARGE_INTEGER kernel_time64;
308  kernel_time64.HighPart = kernel_time.dwHighDateTime;
309  kernel_time64.LowPart = kernel_time.dwLowDateTime;
310
311  // Time is reported in units of 100 nanoseconds. Kernel and user time are
312  // summed to deal with to kinds of hangs. One is where the GPU process is
313  // stuck in user level, never calling into the kernel and kernel time is
314  // not increasing. The other is where either the kernel hangs and never
315  // returns to user level or where user level code
316  // calls into kernel level repeatedly, giving up its quanta before it is
317  // tracked, for example a loop that repeatedly Sleeps.
318  return base::TimeDelta::FromMilliseconds(static_cast<int64>(
319      (user_time64.QuadPart + kernel_time64.QuadPart) / 10000));
320}
321#endif
322
323}  // namespace content
324