// Watchdog.java revision 9158825f9c41869689d6b1786d7c7aa8bdd524ce
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.server; 18 19import android.app.IActivityController; 20import android.os.Binder; 21import android.os.RemoteException; 22import com.android.server.am.ActivityManagerService; 23import com.android.server.power.PowerManagerService; 24 25import android.app.AlarmManager; 26import android.app.PendingIntent; 27import android.content.BroadcastReceiver; 28import android.content.ContentResolver; 29import android.content.Context; 30import android.content.Intent; 31import android.content.IntentFilter; 32import android.os.BatteryManager; 33import android.os.Debug; 34import android.os.Handler; 35import android.os.Looper; 36import android.os.Process; 37import android.os.ServiceManager; 38import android.os.SystemClock; 39import android.os.SystemProperties; 40import android.util.EventLog; 41import android.util.Log; 42import android.util.Slog; 43 44import java.io.File; 45import java.io.FileWriter; 46import java.io.IOException; 47import java.util.ArrayList; 48import java.util.Calendar; 49 50/** This class calls its monitor every minute. Killing this process if they don't return **/ 51public class Watchdog extends Thread { 52 static final String TAG = "Watchdog"; 53 static final boolean localLOGV = false || false; 54 55 // Set this to true to use debug default values. 
56 static final boolean DB = false; 57 58 // Set this to true to have the watchdog record kernel thread stacks when it fires 59 static final boolean RECORD_KERNEL_THREADS = true; 60 61 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 62 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 63 64 // These are temporally ordered: larger values as lateness increases 65 static final int COMPLETED = 0; 66 static final int WAITING = 1; 67 static final int WAITED_HALF = 2; 68 static final int OVERDUE = 3; 69 70 // Which native processes to dump into dropbox's stack traces 71 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 72 "/system/bin/mediaserver", 73 "/system/bin/sdcard", 74 "/system/bin/surfaceflinger" 75 }; 76 77 static Watchdog sWatchdog; 78 79 /* This handler will be used to post message back onto the main thread */ 80 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>(); 81 final HandlerChecker mMonitorChecker; 82 ContentResolver mResolver; 83 ActivityManagerService mActivity; 84 85 int mPhonePid; 86 IActivityController mController; 87 boolean mAllowRestart = true; 88 89 /** 90 * Used for checking status of handle threads and scheduling monitor callbacks. 
91 */ 92 public final class HandlerChecker implements Runnable { 93 private final Handler mHandler; 94 private final String mName; 95 private final long mWaitMax; 96 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 97 private boolean mCompleted; 98 private Monitor mCurrentMonitor; 99 private long mStartTime; 100 101 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 102 mHandler = handler; 103 mName = name; 104 mWaitMax = waitMaxMillis; 105 mCompleted = true; 106 } 107 108 public void addMonitor(Monitor monitor) { 109 mMonitors.add(monitor); 110 } 111 112 public void scheduleCheckLocked() { 113 if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) { 114 // If the target looper is or just recently was idling, then 115 // there is no reason to enqueue our checker on it since that 116 // is as good as it not being deadlocked. This avoid having 117 // to do a context switch to check the thread. Note that we 118 // only do this if mCheckReboot is false and we have no 119 // monitors, since those would need to be executed at this point. 
120 mCompleted = true; 121 return; 122 } 123 124 if (!mCompleted) { 125 // we already have a check in flight, so no need 126 return; 127 } 128 129 mCompleted = false; 130 mCurrentMonitor = null; 131 mStartTime = SystemClock.uptimeMillis(); 132 mHandler.postAtFrontOfQueue(this); 133 } 134 135 public boolean isOverdueLocked() { 136 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 137 } 138 139 public int getCompletionStateLocked() { 140 if (mCompleted) { 141 return COMPLETED; 142 } else { 143 long latency = SystemClock.uptimeMillis() - mStartTime; 144 if (latency < mWaitMax/2) { 145 return WAITING; 146 } else if (latency < mWaitMax) { 147 return WAITED_HALF; 148 } 149 } 150 return OVERDUE; 151 } 152 153 public Thread getThread() { 154 return mHandler.getLooper().getThread(); 155 } 156 157 public String getName() { 158 return mName; 159 } 160 161 public String describeBlockedStateLocked() { 162 if (mCurrentMonitor == null) { 163 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 164 } else { 165 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 166 + " on " + mName + " (" + getThread().getName() + ")"; 167 } 168 } 169 170 @Override 171 public void run() { 172 final int size = mMonitors.size(); 173 for (int i = 0 ; i < size ; i++) { 174 synchronized (Watchdog.this) { 175 mCurrentMonitor = mMonitors.get(i); 176 } 177 mCurrentMonitor.monitor(); 178 } 179 180 synchronized (Watchdog.this) { 181 mCompleted = true; 182 mCurrentMonitor = null; 183 } 184 } 185 } 186 187 final class RebootRequestReceiver extends BroadcastReceiver { 188 @Override 189 public void onReceive(Context c, Intent intent) { 190 if (intent.getIntExtra("nowait", 0) != 0) { 191 rebootSystem("Received ACTION_REBOOT broadcast"); 192 return; 193 } 194 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 195 } 196 } 197 198 public interface Monitor { 199 void monitor(); 200 } 201 202 public static Watchdog getInstance() { 203 
if (sWatchdog == null) { 204 sWatchdog = new Watchdog(); 205 } 206 207 return sWatchdog; 208 } 209 210 private Watchdog() { 211 super("watchdog"); 212 // Initialize handler checkers for each common thread we want to check. Note 213 // that we are not currently checking the background thread, since it can 214 // potentially hold longer running operations with no guarantees about the timeliness 215 // of operations there. 216 217 // The shared foreground thread is the main checker. It is where we 218 // will also dispatch monitor checks and do other work. 219 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 220 "foreground thread", DEFAULT_TIMEOUT); 221 mHandlerCheckers.add(mMonitorChecker); 222 // Add checker for main thread. We only do a quick check since there 223 // can be UI running on the thread. 224 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 225 "main thread", DEFAULT_TIMEOUT)); 226 // Add checker for shared UI thread. 227 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 228 "ui thread", DEFAULT_TIMEOUT)); 229 // And also check IO thread. 
230 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 231 "i/o thread", DEFAULT_TIMEOUT)); 232 } 233 234 public void init(Context context, ActivityManagerService activity) { 235 mResolver = context.getContentResolver(); 236 mActivity = activity; 237 238 context.registerReceiver(new RebootRequestReceiver(), 239 new IntentFilter(Intent.ACTION_REBOOT), 240 android.Manifest.permission.REBOOT, null); 241 } 242 243 public void processStarted(String name, int pid) { 244 synchronized (this) { 245 if ("com.android.phone".equals(name)) { 246 mPhonePid = pid; 247 } 248 } 249 } 250 251 public void setActivityController(IActivityController controller) { 252 synchronized (this) { 253 mController = controller; 254 } 255 } 256 257 public void setAllowRestart(boolean allowRestart) { 258 synchronized (this) { 259 mAllowRestart = allowRestart; 260 } 261 } 262 263 public void addMonitor(Monitor monitor) { 264 synchronized (this) { 265 if (isAlive()) { 266 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 267 } 268 mMonitorChecker.addMonitor(monitor); 269 } 270 } 271 272 public void addThread(Handler thread, String name) { 273 addThread(thread, name, DEFAULT_TIMEOUT); 274 } 275 276 public void addThread(Handler thread, String name, long timeoutMillis) { 277 synchronized (this) { 278 if (isAlive()) { 279 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 280 } 281 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 282 } 283 } 284 285 /** 286 * Perform a full reboot of the system. 
287 */ 288 void rebootSystem(String reason) { 289 Slog.i(TAG, "Rebooting system because: " + reason); 290 PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power"); 291 pms.reboot(false, reason, false); 292 } 293 294 private int evaluateCheckerCompletionLocked() { 295 int state = COMPLETED; 296 for (int i=0; i<mHandlerCheckers.size(); i++) { 297 HandlerChecker hc = mHandlerCheckers.get(i); 298 state = Math.max(state, hc.getCompletionStateLocked()); 299 } 300 return state; 301 } 302 303 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 304 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 305 for (int i=0; i<mHandlerCheckers.size(); i++) { 306 HandlerChecker hc = mHandlerCheckers.get(i); 307 if (hc.isOverdueLocked()) { 308 checkers.add(hc); 309 } 310 } 311 return checkers; 312 } 313 314 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 315 StringBuilder builder = new StringBuilder(128); 316 for (int i=0; i<checkers.size(); i++) { 317 if (builder.length() > 0) { 318 builder.append(", "); 319 } 320 builder.append(checkers.get(i).describeBlockedStateLocked()); 321 } 322 return builder.toString(); 323 } 324 325 @Override 326 public void run() { 327 boolean waitedHalf = false; 328 while (true) { 329 final ArrayList<HandlerChecker> blockedCheckers; 330 final String subject; 331 final boolean allowRestart; 332 synchronized (this) { 333 long timeout = CHECK_INTERVAL; 334 // Make sure we (re)spin the checkers that have become idle within 335 // this wait-and-check interval 336 for (int i=0; i<mHandlerCheckers.size(); i++) { 337 HandlerChecker hc = mHandlerCheckers.get(i); 338 hc.scheduleCheckLocked(); 339 } 340 341 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 342 // wait while asleep. 
If the device is asleep then the thing that we are waiting 343 // to timeout on is asleep as well and won't have a chance to run, causing a false 344 // positive on when to kill things. 345 long start = SystemClock.uptimeMillis(); 346 while (timeout > 0) { 347 try { 348 wait(timeout); 349 } catch (InterruptedException e) { 350 Log.wtf(TAG, e); 351 } 352 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 353 } 354 355 final int waitState = evaluateCheckerCompletionLocked(); 356 if (waitState == COMPLETED) { 357 // The monitors have returned; reset 358 waitedHalf = false; 359 continue; 360 } else if (waitState == WAITING) { 361 // still waiting but within their configured intervals; back off and recheck 362 continue; 363 } else if (waitState == WAITED_HALF) { 364 if (!waitedHalf) { 365 // We've waited half the deadlock-detection interval. Pull a stack 366 // trace and wait another half. 367 ArrayList<Integer> pids = new ArrayList<Integer>(); 368 pids.add(Process.myPid()); 369 ActivityManagerService.dumpStackTraces(true, pids, null, null, 370 NATIVE_STACKS_OF_INTEREST); 371 waitedHalf = true; 372 } 373 continue; 374 } 375 376 // something is overdue! 377 blockedCheckers = getBlockedCheckersLocked(); 378 subject = describeCheckersLocked(blockedCheckers); 379 allowRestart = mAllowRestart; 380 } 381 382 // If we got here, that means that the system is most likely hung. 383 // First collect stack traces from all threads of the system process. 384 // Then kill this process so that the system will restart. 385 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 386 387 ArrayList<Integer> pids = new ArrayList<Integer>(); 388 pids.add(Process.myPid()); 389 if (mPhonePid > 0) pids.add(mPhonePid); 390 // Pass !waitedHalf so that just in case we somehow wind up here without having 391 // dumped the halfway stacks, we properly re-initialize the trace file. 
392 final File stack = ActivityManagerService.dumpStackTraces( 393 !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); 394 395 // Give some extra time to make sure the stack traces get written. 396 // The system's been hanging for a minute, another second or two won't hurt much. 397 SystemClock.sleep(2000); 398 399 // Pull our own kernel thread stacks as well if we're configured for that 400 if (RECORD_KERNEL_THREADS) { 401 dumpKernelStackTraces(); 402 } 403 404 // Trigger the kernel to dump all blocked threads to the kernel log 405 try { 406 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 407 sysrq_trigger.write("w"); 408 sysrq_trigger.close(); 409 } catch (IOException e) { 410 Slog.e(TAG, "Failed to write to /proc/sysrq-trigger"); 411 Slog.e(TAG, e.getMessage()); 412 } 413 414 // Try to add the error to the dropbox, but assuming that the ActivityManager 415 // itself may be deadlocked. (which has happened, causing this statement to 416 // deadlock and the watchdog as a whole to be ineffective) 417 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 418 public void run() { 419 mActivity.addErrorToDropBox( 420 "watchdog", null, "system_server", null, null, 421 subject, null, stack, null); 422 } 423 }; 424 dropboxThread.start(); 425 try { 426 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 
427 } catch (InterruptedException ignored) {} 428 429 IActivityController controller; 430 synchronized (this) { 431 controller = mController; 432 } 433 if (controller != null) { 434 Slog.i(TAG, "Reporting stuck state to activity controller"); 435 try { 436 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 437 // 1 = keep waiting, -1 = kill system 438 int res = controller.systemNotResponding(subject); 439 if (res >= 0) { 440 Slog.i(TAG, "Activity controller requested to coninue to wait"); 441 waitedHalf = false; 442 continue; 443 } 444 } catch (RemoteException e) { 445 } 446 } 447 448 // Only kill the process if the debugger is not attached. 449 if (Debug.isDebuggerConnected()) { 450 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 451 } else if (!allowRestart) { 452 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 453 } else { 454 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 455 for (int i=0; i<blockedCheckers.size(); i++) { 456 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 457 StackTraceElement[] stackTrace 458 = blockedCheckers.get(i).getThread().getStackTrace(); 459 for (StackTraceElement element: stackTrace) { 460 Slog.w(TAG, " at " + element); 461 } 462 } 463 Slog.w(TAG, "*** GOODBYE!"); 464 Process.killProcess(Process.myPid()); 465 System.exit(10); 466 } 467 468 waitedHalf = false; 469 } 470 } 471 472 private File dumpKernelStackTraces() { 473 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 474 if (tracesPath == null || tracesPath.length() == 0) { 475 return null; 476 } 477 478 native_dumpKernelStacks(tracesPath); 479 return new File(tracesPath); 480 } 481 482 private native void native_dumpKernelStacks(String tracesPath); 483} 484