Watchdog.java revision 98eb06a12e41c1dcebf40865be5be9ad6d8e10bc
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.server;

import com.android.server.am.ActivityManagerService;
import com.android.server.power.PowerManagerService;

import android.app.AlarmManager;
import android.app.PendingIntent;
import android.content.BroadcastReceiver;
import android.content.ContentResolver;
import android.content.Context;
import android.content.Intent;
import android.content.IntentFilter;
import android.os.BatteryManager;
import android.os.Debug;
import android.os.Handler;
import android.os.Looper;
import android.os.Message;
import android.os.Process;
import android.os.ServiceManager;
import android.os.SystemClock;
import android.os.SystemProperties;
import android.util.EventLog;
import android.util.Log;
import android.util.Slog;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;

/**
 * Watches a set of handler threads and registered monitors. A check is posted to
 * every watched handler; if the checks have not all completed after
 * {@link #TIME_TO_RESTART} milliseconds of uptime, diagnostics are collected and
 * this process is killed (unless a debugger is attached).
 */
public class Watchdog extends Thread {
    static final String TAG = "Watchdog";
    static final boolean localLOGV = false || false;

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    static final int MONITOR = 2718;

    static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
    static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;

    static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
    static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
    static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes

    static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
    static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
    static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour

    static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";

    static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger"
    };

    static Watchdog sWatchdog;

    /* One checker per watched handler thread; populated in the constructor and by addThread(). */
    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
    /* The foreground-thread checker; monitors registered via addMonitor() run on it. */
    final HandlerChecker mMonitorChecker;
    ContentResolver mResolver;
    BatteryService mBattery;
    PowerManagerService mPower;
    AlarmManagerService mAlarm;
    ActivityManagerService mActivity;

    int mPhonePid;

    final Calendar mCalendar = Calendar.getInstance();
    int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
    int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
    boolean mNeedScheduledCheck;
    PendingIntent mCheckupIntent;
    PendingIntent mRebootIntent;

    long mBootTime;
    int mRebootInterval;

    boolean mReqRebootNoWait;     // should wait for one interval before reboot?
    int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
    int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
    int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
    int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
    int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
    int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested

    /**
     * Used for checking status of handle threads and scheduling monitor callbacks.
     */
    public final class HandlerChecker implements Runnable {
        private final Handler mHandler;
        private final String mName;
        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
        private final boolean mCheckReboot;
        private boolean mCompleted;
        private Monitor mCurrentMonitor;

        /**
         * @param handler handler of the thread to watch; checks are posted to it
         * @param name human-readable thread name used when reporting a blocked checker
         * @param checkReboot whether each check should also re-evaluate the reboot interval
         */
        HandlerChecker(Handler handler, String name, boolean checkReboot) {
            mHandler = handler;
            mName = name;
            mCheckReboot = checkReboot;
        }

        /** Adds a monitor that will be invoked on the watched thread during each check. */
        public void addMonitor(Monitor monitor) {
            mMonitors.add(monitor);
        }

        /**
         * Resets the completion state and posts this checker at the front of the
         * watched handler's queue. Invoked by {@link Watchdog#run()} while holding
         * the Watchdog lock.
         */
        public void scheduleCheckLocked() {
            mCompleted = false;
            mCurrentMonitor = null;
            mHandler.postAtFrontOfQueue(this);
        }

        /** Returns whether the most recently scheduled check has finished running. */
        public boolean isCompletedLocked() {
            return mCompleted;
        }

        /**
         * Returns the class name of the monitor currently being executed, or the
         * thread name if no monitor is in progress.
         */
        public String describeBlockedStateLocked() {
            return mCurrentMonitor == null ? mName : mCurrentMonitor.getClass().getName();
        }

        /**
         * Executed on the watched thread: optionally re-evaluates the reboot
         * schedule, runs every registered monitor in order, then marks the check
         * as completed.
         */
        @Override
        public void run() {
            // See if we should force a reboot.
            if (mCheckReboot) {
                int rebootInterval = mReqRebootInterval >= 0
                        ? mReqRebootInterval : REBOOT_DEFAULT_INTERVAL;
                if (mRebootInterval != rebootInterval) {
                    mRebootInterval = rebootInterval;
                    // We have been running long enough that a reboot can
                    // be considered...
                    checkReboot(false);
                }
            }

            final int size = mMonitors.size();
            for (int i = 0 ; i < size ; i++) {
                // Invoke through a local reference: scheduleCheckLocked() may null out
                // mCurrentMonitor from the watchdog thread once the lock is released,
                // and dereferencing the field here could then throw.
                final Monitor monitor;
                synchronized (Watchdog.this) {
                    monitor = mMonitors.get(i);
                    mCurrentMonitor = monitor;
                }
                monitor.monitor();
            }

            synchronized (Watchdog.this) {
                mCompleted = true;
                mCurrentMonitor = null;
            }
        }
    }

    /** Receives the {@link #REBOOT_ACTION} alarm broadcast and re-evaluates the reboot schedule. */
    final class RebootReceiver extends BroadcastReceiver {
        @Override
        public void onReceive(Context c, Intent intent) {
            if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
            checkReboot(true);
        }
    }

    /**
     * Receives {@link Intent#ACTION_REBOOT} requests, records the requested
     * parameters from the intent extras (-1 / 0 when absent), logs them, and
     * re-evaluates the reboot schedule.
     */
    final class RebootRequestReceiver extends BroadcastReceiver {
        @Override
        public void onReceive(Context c, Intent intent) {
            mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
            mReqRebootInterval = intent.getIntExtra("interval", -1);
            mReqRebootStartTime = intent.getIntExtra("startTime", -1);
            mReqRebootWindow = intent.getIntExtra("window", -1);
            mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
            mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
            mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
            EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
                    mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
                            mReqRecheckInterval, mReqRebootStartTime,
                    mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
            checkReboot(true);
        }
    }

    /** Callback invoked on the foreground thread during each watchdog check. */
    public interface Monitor {
        void monitor();
    }

    /** Returns the shared Watchdog instance, creating it on first use. */
    public static Watchdog getInstance() {
        if (sWatchdog == null) {
            sWatchdog = new Watchdog();
        }

        return sWatchdog;
    }

    private Watchdog() {
        super("watchdog");
        // Initialize handler checkers for each common thread we want to check.  Note
        // that we are not currently checking the background thread, since it can
        // potentially hold longer running operations with no guarantees about the timeliness
        // of operations there.

        // The shared foreground thread is the main checker.  It is where we
        // will also dispatch monitor checks and do other work.
        mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread", true);
        mHandlerCheckers.add(mMonitorChecker);
        // Add checker for main thread.  We only do a quick check since there
        // can be UI running on the thread.
        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
                "main thread", false));
        // Add checker for shared UI thread.
        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread", false));
        // And also check IO thread.
        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread", false));
    }

    /**
     * Stores the service references used by the watchdog, registers the reboot
     * alarm and reboot-request receivers, and records the current wall-clock
     * time as the boot time.
     */
    public void init(Context context, BatteryService battery,
            PowerManagerService power, AlarmManagerService alarm,
            ActivityManagerService activity) {
        mResolver = context.getContentResolver();
        mBattery = battery;
        mPower = power;
        mAlarm = alarm;
        mActivity = activity;

        context.registerReceiver(new RebootReceiver(),
                new IntentFilter(REBOOT_ACTION));
        mRebootIntent = PendingIntent.getBroadcast(context,
                0, new Intent(REBOOT_ACTION), 0);

        context.registerReceiver(new RebootRequestReceiver(),
                new IntentFilter(Intent.ACTION_REBOOT),
                android.Manifest.permission.REBOOT, null);

        mBootTime = System.currentTimeMillis();
    }

    /**
     * Notes the pid of the "com.android.phone" process so that its stacks can be
     * included when the watchdog fires. Other process names are ignored.
     */
    public void processStarted(String name, int pid) {
        synchronized (this) {
            if ("com.android.phone".equals(name)) {
                mPhonePid = pid;
            }
        }
    }

    /**
     * Registers a monitor on the foreground-thread checker.
     *
     * @throws RuntimeException if the watchdog thread is already running
     */
    public void addMonitor(Monitor monitor) {
        synchronized (this) {
            if (isAlive()) {
                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
            }
            mMonitorChecker.addMonitor(monitor);
        }
    }

    /**
     * Adds another handler thread to be watched.
     *
     * @throws RuntimeException if the watchdog thread is already running
     */
    public void addThread(Handler thread, String name) {
        synchronized (this) {
            if (isAlive()) {
                throw new RuntimeException("Threads can't be added once the Watchdog is running");
            }
            mHandlerCheckers.add(new HandlerChecker(thread, name, false));
        }
    }

    /**
     * Evaluates the requested (or default) reboot interval, start time and window.
     * Depending on the current time this either reboots immediately, or removes
     * and re-sets the reboot alarm for the next time a check should happen. When no
     * interval is configured the alarm is removed and nothing else happens.
     *
     * @param fromAlarm true when invoked from one of the broadcast receivers; only
     *        then may a zero-length window trigger an immediate reboot
     */
    void checkReboot(boolean fromAlarm) {
        int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
                : REBOOT_DEFAULT_INTERVAL;
        mRebootInterval = rebootInterval;
        if (rebootInterval <= 0) {
            // No reboot interval requested.
            if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
            mAlarm.remove(mRebootIntent);
            return;
        }

        long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
                : REBOOT_DEFAULT_START_TIME;
        long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
                : REBOOT_DEFAULT_WINDOW) * 1000;
        long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
                : MEMCHECK_DEFAULT_RECHECK_INTERVAL) * 1000;

        retrieveBrutalityAmount();

        long realStartTime;
        long now;

        synchronized (this) {
            now = System.currentTimeMillis();
            realStartTime = computeCalendarTime(mCalendar, now,
                    rebootStartTime);

            // Multiply in long arithmetic: the int product overflows once the
            // interval reaches 25 days.
            long rebootIntervalMillis = rebootInterval * 24L * 60 * 60 * 1000;
            if (DB || mReqRebootNoWait ||
                    (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
                if (fromAlarm && rebootWindowMillis <= 0) {
                    // No reboot window -- just immediately reboot.
                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
                            (int)rebootWindowMillis, "");
                    rebootSystem("Checkin scheduled forced");
                    return;
                }

                // Are we within the reboot window?
                if (now < realStartTime) {
                    // Schedule alarm for next check interval.
                    realStartTime = computeCalendarTime(mCalendar,
                            now, rebootStartTime);
                } else if (now < (realStartTime+rebootWindowMillis)) {
                    String doit = shouldWeBeBrutalLocked(now);
                    // Log the interval in milliseconds, matching the other
                    // WATCHDOG_SCHEDULED_REBOOT event written above.
                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
                            (int)rebootWindowMillis, doit != null ? doit : "");
                    if (doit == null) {
                        rebootSystem("Checked scheduled range");
                        return;
                    }

                    // Schedule next alarm either within the window or in the
                    // next interval.
                    if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
                        realStartTime = computeCalendarTime(mCalendar,
                                now + rebootIntervalMillis, rebootStartTime);
                    } else {
                        realStartTime = now + recheckInterval;
                    }
                } else {
                    // Schedule alarm for next check interval.
                    realStartTime = computeCalendarTime(mCalendar,
                            now + rebootIntervalMillis, rebootStartTime);
                }
            }
        }

        if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
                + ((realStartTime-now)/1000/60) + "m from now");
        mAlarm.remove(mRebootIntent);
        mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
    }

    /**
     * Perform a full reboot of the system.
     *
     * @param reason text logged and passed to the power manager as the reboot reason
     */
    void rebootSystem(String reason) {
        Slog.i(TAG, "Rebooting system because: " + reason);
        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
        pms.reboot(false, reason, false);
    }

    /**
     * Recompute the thresholds consulted by {@link #shouldWeBeBrutalLocked} from
     * the requested values (falling back to the defaults), multiplied by 1000.
     * Must not be called with the lock held.
     */
    void retrieveBrutalityAmount() {
        mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
                : MEMCHECK_DEFAULT_MIN_SCREEN_OFF) * 1000;
        mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
                : MEMCHECK_DEFAULT_MIN_ALARM) * 1000;
    }

    /**
     * Determine whether it is a good time to kill, crash, or otherwise
     * plunder the current situation for the overall long-term benefit of
     * the world.
     *
     * @param curTime The current system time.
     * @return Returns null if this is a good time, else a String with the
     * text of why it is not a good time.
     */
    String shouldWeBeBrutalLocked(long curTime) {
        if (mBattery == null || !mBattery.isPowered(BatteryManager.BATTERY_PLUGGED_ANY)) {
            return "battery";
        }

        if (mMinScreenOff >= 0 && (mPower == null ||
                mPower.timeSinceScreenWasLastOn() < mMinScreenOff)) {
            return "screen";
        }

        if (mMinAlarm >= 0 && (mAlarm == null ||
                mAlarm.timeToNextAlarm() < mMinAlarm)) {
            return "alarm";
        }

        return null;
    }

    /**
     * Computes the time in milliseconds at which the given time of day occurs on
     * the calendar day containing {@code curTime}; if that moment is earlier than
     * {@code curTime}, the result is advanced by one calendar day.
     *
     * @param c calendar used for the computation; its fields are overwritten
     * @param curTime reference time in milliseconds
     * @param secondsSinceMidnight desired time of day, in seconds since midnight
     * @return the computed time in milliseconds
     */
    static long computeCalendarTime(Calendar c, long curTime,
            long secondsSinceMidnight) {

        // start with now
        c.setTimeInMillis(curTime);

        int val = (int)secondsSinceMidnight / (60*60);
        c.set(Calendar.HOUR_OF_DAY, val);
        secondsSinceMidnight -= val * (60*60);
        val = (int)secondsSinceMidnight / 60;
        c.set(Calendar.MINUTE, val);
        c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
        c.set(Calendar.MILLISECOND, 0);

        long newTime = c.getTimeInMillis();
        if (newTime < curTime) {
            // The given time (in seconds since midnight) has already passed for today, so advance
            // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
            c.add(Calendar.DAY_OF_MONTH, 1);
            newTime = c.getTimeInMillis();
        }

        return newTime;
    }

    /** Returns true only if every handler checker reports its check as completed. */
    private boolean haveAllCheckersCompletedLocked() {
        for (int i=0; i<mHandlerCheckers.size(); i++) {
            HandlerChecker hc = mHandlerCheckers.get(i);
            if (!hc.isCompletedLocked()) {
                return false;
            }
        }
        return true;
    }

    /** Returns a comma-separated description of every checker that has not completed. */
    private String describeBlockedCheckersLocked() {
        StringBuilder builder = new StringBuilder(128);
        for (int i=0; i<mHandlerCheckers.size(); i++) {
            HandlerChecker hc = mHandlerCheckers.get(i);
            if (!hc.isCompletedLocked()) {
                if (builder.length() > 0) {
                    builder.append(", ");
                }
                builder.append(hc.describeBlockedStateLocked());
            }
        }
        return builder.toString();
    }

    /**
     * Watchdog loop: schedules a check on every handler, waits
     * {@link #TIME_TO_WAIT} milliseconds of uptime, and repeats if all checks
     * completed. After the first missed half-interval it dumps stack traces and
     * waits once more; after the second it collects diagnostics and kills the
     * process unless a debugger is connected.
     */
    @Override
    public void run() {
        boolean waitedHalf = false;
        while (true) {
            final String name;
            synchronized (this) {
                long timeout = TIME_TO_WAIT;
                if (!waitedHalf) {
                    // If we are not at the half-point of waiting, perform a
                    // new set of checks.  Otherwise we are still waiting for a previous set.
                    for (int i=0; i<mHandlerCheckers.size(); i++) {
                        HandlerChecker hc = mHandlerCheckers.get(i);
                        hc.scheduleCheckLocked();
                    }
                }

                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                // wait while asleep. If the device is asleep then the thing that we are waiting
                // to timeout on is asleep as well and won't have a chance to run, causing a false
                // positive on when to kill things.
                long start = SystemClock.uptimeMillis();
                while (timeout > 0) {
                    try {
                        wait(timeout);
                    } catch (InterruptedException e) {
                        Log.wtf(TAG, e);
                    }
                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
                }

                if (haveAllCheckersCompletedLocked()) {
                    // The monitors have returned.
                    waitedHalf = false;
                    continue;
                }

                if (!waitedHalf) {
                    // We've waited half the deadlock-detection interval.  Pull a stack
                    // trace and wait another half.
                    ArrayList<Integer> pids = new ArrayList<Integer>();
                    pids.add(Process.myPid());
                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
                            NATIVE_STACKS_OF_INTEREST);
                    waitedHalf = true;
                    continue;
                }

                name = describeBlockedCheckersLocked();
            }

            // If we got here, that means that the system is most likely hung.
            // First collect stack traces from all threads of the system process.
            // Then kill this process so that the system will restart.
            EventLog.writeEvent(EventLogTags.WATCHDOG, name);

            ArrayList<Integer> pids = new ArrayList<Integer>();
            pids.add(Process.myPid());
            if (mPhonePid > 0) pids.add(mPhonePid);
            // Pass !waitedHalf so that just in case we somehow wind up here without having
            // dumped the halfway stacks, we properly re-initialize the trace file.
            final File stack = ActivityManagerService.dumpStackTraces(
                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);

            // Give some extra time to make sure the stack traces get written.
            // The system's been hanging for a minute, another second or two won't hurt much.
            SystemClock.sleep(2000);

            // Pull our own kernel thread stacks as well if we're configured for that
            if (RECORD_KERNEL_THREADS) {
                dumpKernelStackTraces();
            }

            // Trigger the kernel to dump all blocked threads to the kernel log
            try {
                FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
                try {
                    sysrq_trigger.write("w");
                } finally {
                    // Always release the file descriptor, even if the write failed.
                    sysrq_trigger.close();
                }
            } catch (IOException e) {
                // Log the exception itself: getMessage() may be null and drops the stack.
                Slog.e(TAG, "Failed to write to /proc/sysrq-trigger", e);
            }

            // Try to add the error to the dropbox, but assuming that the ActivityManager
            // itself may be deadlocked.  (which has happened, causing this statement to
            // deadlock and the watchdog as a whole to be ineffective)
            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
                    @Override
                    public void run() {
                        mActivity.addErrorToDropBox(
                                "watchdog", null, "system_server", null, null,
                                name, null, stack, null);
                    }
                };
            dropboxThread.start();
            try {
                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
            } catch (InterruptedException ignored) {}

            // Only kill the process if the debugger is not attached.
            if (!Debug.isDebuggerConnected()) {
                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
                Slog.w(TAG, "Main thread stack trace:");
                StackTraceElement[] stackTrace = Looper.getMainLooper().getThread().getStackTrace();
                for (StackTraceElement element: stackTrace) {
                    Slog.w(TAG, "\tat " + element);
                }
                Slog.w(TAG, "<End of main thread stack trace>");
                Process.killProcess(Process.myPid());
                System.exit(10);
            } else {
                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
            }

            waitedHalf = false;
        }
    }

    /**
     * Asks the native layer to dump kernel thread stacks into the file named by
     * the "dalvik.vm.stack-trace-file" system property.
     *
     * @return the trace file, or null if the property is unset or empty
     */
    private File dumpKernelStackTraces() {
        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
        if (tracesPath == null || tracesPath.length() == 0) {
            return null;
        }

        native_dumpKernelStacks(tracesPath);
        return new File(tracesPath);
    }

    private native void native_dumpKernelStacks(String tracesPath);
}