Watchdog.java revision d7fdd0228e6abdbc079f9cf08b780e4222dfe7c5
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.server; 18 19import android.app.IActivityController; 20import android.os.Binder; 21import android.os.RemoteException; 22import com.android.server.am.ActivityManagerService; 23 24import android.content.BroadcastReceiver; 25import android.content.ContentResolver; 26import android.content.Context; 27import android.content.Intent; 28import android.content.IntentFilter; 29import android.os.Debug; 30import android.os.Handler; 31import android.os.IPowerManager; 32import android.os.Looper; 33import android.os.Process; 34import android.os.ServiceManager; 35import android.os.SystemClock; 36import android.os.SystemProperties; 37import android.util.EventLog; 38import android.util.Log; 39import android.util.Slog; 40 41import java.io.File; 42import java.io.FileWriter; 43import java.io.IOException; 44import java.util.ArrayList; 45 46/** This class calls its monitor every minute. Killing this process if they don't return **/ 47public class Watchdog extends Thread { 48 static final String TAG = "Watchdog"; 49 50 // Set this to true to use debug default values. 51 static final boolean DB = false; 52 53 // Set this to true to have the watchdog record kernel thread stacks when it fires 54 static final boolean RECORD_KERNEL_THREADS = true; 55 56 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 57 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 58 59 // These are temporally ordered: larger values as lateness increases 60 static final int COMPLETED = 0; 61 static final int WAITING = 1; 62 static final int WAITED_HALF = 2; 63 static final int OVERDUE = 3; 64 65 // Which native processes to dump into dropbox's stack traces 66 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 67 "/system/bin/mediaserver", 68 "/system/bin/sdcard", 69 "/system/bin/surfaceflinger" 70 }; 71 72 static Watchdog sWatchdog; 73 74 /* This handler will be used to post message back onto the main thread */ 75 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>(); 76 final HandlerChecker mMonitorChecker; 77 ContentResolver mResolver; 78 ActivityManagerService mActivity; 79 80 int mPhonePid; 81 IActivityController mController; 82 boolean mAllowRestart = true; 83 84 /** 85 * Used for checking status of handle threads and scheduling monitor callbacks. 86 */ 87 public final class HandlerChecker implements Runnable { 88 private final Handler mHandler; 89 private final String mName; 90 private final long mWaitMax; 91 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 92 private boolean mCompleted; 93 private Monitor mCurrentMonitor; 94 private long mStartTime; 95 96 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 97 mHandler = handler; 98 mName = name; 99 mWaitMax = waitMaxMillis; 100 mCompleted = true; 101 } 102 103 public void addMonitor(Monitor monitor) { 104 mMonitors.add(monitor); 105 } 106 107 public void scheduleCheckLocked() { 108 if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) { 109 // If the target looper has recently been polling, then 110 // there is no reason to enqueue our checker on it since that 111 // is as good as it not being deadlocked. This avoid having 112 // to do a context switch to check the thread. Note that we 113 // only do this if mCheckReboot is false and we have no 114 // monitors, since those would need to be executed at this point. 115 mCompleted = true; 116 return; 117 } 118 119 if (!mCompleted) { 120 // we already have a check in flight, so no need 121 return; 122 } 123 124 mCompleted = false; 125 mCurrentMonitor = null; 126 mStartTime = SystemClock.uptimeMillis(); 127 mHandler.postAtFrontOfQueue(this); 128 } 129 130 public boolean isOverdueLocked() { 131 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 132 } 133 134 public int getCompletionStateLocked() { 135 if (mCompleted) { 136 return COMPLETED; 137 } else { 138 long latency = SystemClock.uptimeMillis() - mStartTime; 139 if (latency < mWaitMax/2) { 140 return WAITING; 141 } else if (latency < mWaitMax) { 142 return WAITED_HALF; 143 } 144 } 145 return OVERDUE; 146 } 147 148 public Thread getThread() { 149 return mHandler.getLooper().getThread(); 150 } 151 152 public String getName() { 153 return mName; 154 } 155 156 public String describeBlockedStateLocked() { 157 if (mCurrentMonitor == null) { 158 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 159 } else { 160 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 161 + " on " + mName + " (" + getThread().getName() + ")"; 162 } 163 } 164 165 @Override 166 public void run() { 167 final int size = mMonitors.size(); 168 for (int i = 0 ; i < size ; i++) { 169 synchronized (Watchdog.this) { 170 mCurrentMonitor = mMonitors.get(i); 171 } 172 mCurrentMonitor.monitor(); 173 } 174 175 synchronized (Watchdog.this) { 176 mCompleted = true; 177 mCurrentMonitor = null; 178 } 179 } 180 } 181 182 final class RebootRequestReceiver extends BroadcastReceiver { 183 @Override 184 public void onReceive(Context c, Intent intent) { 185 if (intent.getIntExtra("nowait", 0) != 0) { 186 rebootSystem("Received ACTION_REBOOT broadcast"); 187 return; 188 } 189 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 190 } 191 } 192 193 public interface Monitor { 194 void monitor(); 195 } 196 197 public static Watchdog getInstance() { 198 if (sWatchdog == null) { 199 sWatchdog = new Watchdog(); 200 } 201 202 return sWatchdog; 203 } 204 205 private Watchdog() { 206 super("watchdog"); 207 // Initialize handler checkers for each common thread we want to check. Note 208 // that we are not currently checking the background thread, since it can 209 // potentially hold longer running operations with no guarantees about the timeliness 210 // of operations there. 211 212 // The shared foreground thread is the main checker. It is where we 213 // will also dispatch monitor checks and do other work. 214 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 215 "foreground thread", DEFAULT_TIMEOUT); 216 mHandlerCheckers.add(mMonitorChecker); 217 // Add checker for main thread. We only do a quick check since there 218 // can be UI running on the thread. 219 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 220 "main thread", DEFAULT_TIMEOUT)); 221 // Add checker for shared UI thread. 222 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 223 "ui thread", DEFAULT_TIMEOUT)); 224 // And also check IO thread. 225 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 226 "i/o thread", DEFAULT_TIMEOUT)); 227 // And the display thread. 228 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), 229 "display thread", DEFAULT_TIMEOUT)); 230 } 231 232 public void init(Context context, ActivityManagerService activity) { 233 mResolver = context.getContentResolver(); 234 mActivity = activity; 235 236 context.registerReceiver(new RebootRequestReceiver(), 237 new IntentFilter(Intent.ACTION_REBOOT), 238 android.Manifest.permission.REBOOT, null); 239 } 240 241 public void processStarted(String name, int pid) { 242 synchronized (this) { 243 if ("com.android.phone".equals(name)) { 244 mPhonePid = pid; 245 } 246 } 247 } 248 249 public void setActivityController(IActivityController controller) { 250 synchronized (this) { 251 mController = controller; 252 } 253 } 254 255 public void setAllowRestart(boolean allowRestart) { 256 synchronized (this) { 257 mAllowRestart = allowRestart; 258 } 259 } 260 261 public void addMonitor(Monitor monitor) { 262 synchronized (this) { 263 if (isAlive()) { 264 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 265 } 266 mMonitorChecker.addMonitor(monitor); 267 } 268 } 269 270 public void addThread(Handler thread) { 271 addThread(thread, DEFAULT_TIMEOUT); 272 } 273 274 public void addThread(Handler thread, long timeoutMillis) { 275 synchronized (this) { 276 if (isAlive()) { 277 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 278 } 279 final String name = thread.getLooper().getThread().getName(); 280 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 281 } 282 } 283 284 /** 285 * Perform a full reboot of the system. 286 */ 287 void rebootSystem(String reason) { 288 Slog.i(TAG, "Rebooting system because: " + reason); 289 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 290 try { 291 pms.reboot(false, reason, false); 292 } catch (RemoteException ex) { 293 } 294 } 295 296 private int evaluateCheckerCompletionLocked() { 297 int state = COMPLETED; 298 for (int i=0; i<mHandlerCheckers.size(); i++) { 299 HandlerChecker hc = mHandlerCheckers.get(i); 300 state = Math.max(state, hc.getCompletionStateLocked()); 301 } 302 return state; 303 } 304 305 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 306 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 307 for (int i=0; i<mHandlerCheckers.size(); i++) { 308 HandlerChecker hc = mHandlerCheckers.get(i); 309 if (hc.isOverdueLocked()) { 310 checkers.add(hc); 311 } 312 } 313 return checkers; 314 } 315 316 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 317 StringBuilder builder = new StringBuilder(128); 318 for (int i=0; i<checkers.size(); i++) { 319 if (builder.length() > 0) { 320 builder.append(", "); 321 } 322 builder.append(checkers.get(i).describeBlockedStateLocked()); 323 } 324 return builder.toString(); 325 } 326 327 @Override 328 public void run() { 329 boolean waitedHalf = false; 330 while (true) { 331 final ArrayList<HandlerChecker> blockedCheckers; 332 final String subject; 333 final boolean allowRestart; 334 int debuggerWasConnected = 0; 335 synchronized (this) { 336 long timeout = CHECK_INTERVAL; 337 // Make sure we (re)spin the checkers that have become idle within 338 // this wait-and-check interval 339 for (int i=0; i<mHandlerCheckers.size(); i++) { 340 HandlerChecker hc = mHandlerCheckers.get(i); 341 hc.scheduleCheckLocked(); 342 } 343 344 if (debuggerWasConnected > 0) { 345 debuggerWasConnected--; 346 } 347 348 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 349 // wait while asleep. If the device is asleep then the thing that we are waiting 350 // to timeout on is asleep as well and won't have a chance to run, causing a false 351 // positive on when to kill things. 352 long start = SystemClock.uptimeMillis(); 353 while (timeout > 0) { 354 if (Debug.isDebuggerConnected()) { 355 debuggerWasConnected = 2; 356 } 357 try { 358 wait(timeout); 359 } catch (InterruptedException e) { 360 Log.wtf(TAG, e); 361 } 362 if (Debug.isDebuggerConnected()) { 363 debuggerWasConnected = 2; 364 } 365 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 366 } 367 368 final int waitState = evaluateCheckerCompletionLocked(); 369 if (waitState == COMPLETED) { 370 // The monitors have returned; reset 371 waitedHalf = false; 372 continue; 373 } else if (waitState == WAITING) { 374 // still waiting but within their configured intervals; back off and recheck 375 continue; 376 } else if (waitState == WAITED_HALF) { 377 if (!waitedHalf) { 378 // We've waited half the deadlock-detection interval. Pull a stack 379 // trace and wait another half. 380 ArrayList<Integer> pids = new ArrayList<Integer>(); 381 pids.add(Process.myPid()); 382 ActivityManagerService.dumpStackTraces(true, pids, null, null, 383 NATIVE_STACKS_OF_INTEREST); 384 waitedHalf = true; 385 } 386 continue; 387 } 388 389 // something is overdue! 390 blockedCheckers = getBlockedCheckersLocked(); 391 subject = describeCheckersLocked(blockedCheckers); 392 allowRestart = mAllowRestart; 393 } 394 395 // If we got here, that means that the system is most likely hung. 396 // First collect stack traces from all threads of the system process. 397 // Then kill this process so that the system will restart. 398 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 399 400 ArrayList<Integer> pids = new ArrayList<Integer>(); 401 pids.add(Process.myPid()); 402 if (mPhonePid > 0) pids.add(mPhonePid); 403 // Pass !waitedHalf so that just in case we somehow wind up here without having 404 // dumped the halfway stacks, we properly re-initialize the trace file. 405 final File stack = ActivityManagerService.dumpStackTraces( 406 !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); 407 408 // Give some extra time to make sure the stack traces get written. 409 // The system's been hanging for a minute, another second or two won't hurt much. 410 SystemClock.sleep(2000); 411 412 // Pull our own kernel thread stacks as well if we're configured for that 413 if (RECORD_KERNEL_THREADS) { 414 dumpKernelStackTraces(); 415 } 416 417 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log 418 doSysRq('w'); 419 doSysRq('l'); 420 421 // Try to add the error to the dropbox, but assuming that the ActivityManager 422 // itself may be deadlocked. (which has happened, causing this statement to 423 // deadlock and the watchdog as a whole to be ineffective) 424 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 425 public void run() { 426 mActivity.addErrorToDropBox( 427 "watchdog", null, "system_server", null, null, 428 subject, null, stack, null); 429 } 430 }; 431 dropboxThread.start(); 432 try { 433 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 434 } catch (InterruptedException ignored) {} 435 436 IActivityController controller; 437 synchronized (this) { 438 controller = mController; 439 } 440 if (controller != null) { 441 Slog.i(TAG, "Reporting stuck state to activity controller"); 442 try { 443 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 444 // 1 = keep waiting, -1 = kill system 445 int res = controller.systemNotResponding(subject); 446 if (res >= 0) { 447 Slog.i(TAG, "Activity controller requested to coninue to wait"); 448 waitedHalf = false; 449 continue; 450 } 451 } catch (RemoteException e) { 452 } 453 } 454 455 // Only kill the process if the debugger is not attached. 456 if (Debug.isDebuggerConnected()) { 457 debuggerWasConnected = 2; 458 } 459 if (debuggerWasConnected >= 2) { 460 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 461 } else if (debuggerWasConnected > 0) { 462 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 463 } else if (!allowRestart) { 464 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 465 } else { 466 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 467 for (int i=0; i<blockedCheckers.size(); i++) { 468 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 469 StackTraceElement[] stackTrace 470 = blockedCheckers.get(i).getThread().getStackTrace(); 471 for (StackTraceElement element: stackTrace) { 472 Slog.w(TAG, " at " + element); 473 } 474 } 475 Slog.w(TAG, "*** GOODBYE!"); 476 Process.killProcess(Process.myPid()); 477 System.exit(10); 478 } 479 480 waitedHalf = false; 481 } 482 } 483 484 private void doSysRq(char c) { 485 try { 486 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 487 sysrq_trigger.write(c); 488 sysrq_trigger.close(); 489 } catch (IOException e) { 490 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 491 } 492 } 493 494 private File dumpKernelStackTraces() { 495 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 496 if (tracesPath == null || tracesPath.length() == 0) { 497 return null; 498 } 499 500 native_dumpKernelStacks(tracesPath); 501 return new File(tracesPath); 502 } 503 504 private native void native_dumpKernelStacks(String tracesPath); 505} 506