Watchdog.java revision 98eb06a12e41c1dcebf40865be5be9ad6d8e10bc
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import com.android.server.am.ActivityManagerService;
20import com.android.server.power.PowerManagerService;
21
22import android.app.AlarmManager;
23import android.app.PendingIntent;
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.os.BatteryManager;
30import android.os.Debug;
31import android.os.Handler;
32import android.os.Looper;
33import android.os.Message;
34import android.os.Process;
35import android.os.ServiceManager;
36import android.os.SystemClock;
37import android.os.SystemProperties;
38import android.util.EventLog;
39import android.util.Log;
40import android.util.Slog;
41
42import java.io.File;
43import java.io.FileWriter;
44import java.io.IOException;
45import java.util.ArrayList;
46import java.util.Calendar;
47
/** This class periodically calls its monitors, killing this process if they don't return in time. **/
49public class Watchdog extends Thread {
50    static final String TAG = "Watchdog";
51    static final boolean localLOGV = false || false;
52
53    // Set this to true to use debug default values.
54    static final boolean DB = false;
55
56    // Set this to true to have the watchdog record kernel thread stacks when it fires
57    static final boolean RECORD_KERNEL_THREADS = true;
58
59    static final int MONITOR = 2718;
60
61    static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
62    static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;
63
64    static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
65    static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
66    static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes
67
68    static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
69    static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
70    static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour
71
72    static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";
73
74    static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
75        "/system/bin/mediaserver",
76        "/system/bin/sdcard",
77        "/system/bin/surfaceflinger"
78    };
79
80    static Watchdog sWatchdog;
81
82    /* This handler will be used to post message back onto the main thread */
83    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
84    final HandlerChecker mMonitorChecker;
85    ContentResolver mResolver;
86    BatteryService mBattery;
87    PowerManagerService mPower;
88    AlarmManagerService mAlarm;
89    ActivityManagerService mActivity;
90
91    int mPhonePid;
92
93    final Calendar mCalendar = Calendar.getInstance();
94    int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
95    int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
96    boolean mNeedScheduledCheck;
97    PendingIntent mCheckupIntent;
98    PendingIntent mRebootIntent;
99
100    long mBootTime;
101    int mRebootInterval;
102
103    boolean mReqRebootNoWait;     // should wait for one interval before reboot?
104    int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
105    int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
106    int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
107    int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
108    int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
109    int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested
110
111    /**
112     * Used for checking status of handle threads and scheduling monitor callbacks.
113     */
114    public final class HandlerChecker implements Runnable {
115        private final Handler mHandler;
116        private final String mName;
117        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
118        private final boolean mCheckReboot;
119        private boolean mCompleted;
120        private Monitor mCurrentMonitor;
121
122        HandlerChecker(Handler handler, String name, boolean checkReboot) {
123            mHandler = handler;
124            mName = name;
125            mCheckReboot = checkReboot;
126        }
127
128        public void addMonitor(Monitor monitor) {
129            mMonitors.add(monitor);
130        }
131
132        public void scheduleCheckLocked() {
133            mCompleted = false;
134            mCurrentMonitor = null;
135            mHandler.postAtFrontOfQueue(this);
136        }
137
138        public boolean isCompletedLocked() {
139            return mCompleted;
140        }
141
142        public String describeBlockedStateLocked() {
143            return mCurrentMonitor == null ? mName : mCurrentMonitor.getClass().getName();
144        }
145
146        @Override
147        public void run() {
148            // See if we should force a reboot.
149            if (mCheckReboot) {
150                int rebootInterval = mReqRebootInterval >= 0
151                        ? mReqRebootInterval : REBOOT_DEFAULT_INTERVAL;
152                if (mRebootInterval != rebootInterval) {
153                    mRebootInterval = rebootInterval;
154                    // We have been running long enough that a reboot can
155                    // be considered...
156                    checkReboot(false);
157                }
158            }
159
160            final int size = mMonitors.size();
161            for (int i = 0 ; i < size ; i++) {
162                synchronized (Watchdog.this) {
163                    mCurrentMonitor = mMonitors.get(i);
164                }
165                mCurrentMonitor.monitor();
166            }
167
168            synchronized (Watchdog.this) {
169                mCompleted = true;
170                mCurrentMonitor = null;
171            }
172        }
173    }
174
175    final class RebootReceiver extends BroadcastReceiver {
176        @Override
177        public void onReceive(Context c, Intent intent) {
178            if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
179            checkReboot(true);
180        }
181    }
182
183    final class RebootRequestReceiver extends BroadcastReceiver {
184        @Override
185        public void onReceive(Context c, Intent intent) {
186            mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
187            mReqRebootInterval = intent.getIntExtra("interval", -1);
188            mReqRebootStartTime = intent.getIntExtra("startTime", -1);
189            mReqRebootWindow = intent.getIntExtra("window", -1);
190            mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
191            mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
192            mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
193            EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
194                    mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
195                            mReqRecheckInterval, mReqRebootStartTime,
196                    mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
197            checkReboot(true);
198        }
199    }
200
201    public interface Monitor {
202        void monitor();
203    }
204
205    public static Watchdog getInstance() {
206        if (sWatchdog == null) {
207            sWatchdog = new Watchdog();
208        }
209
210        return sWatchdog;
211    }
212
213    private Watchdog() {
214        super("watchdog");
215        // Initialize handler checkers for each common thread we want to check.  Note
216        // that we are not currently checking the background thread, since it can
217        // potentially hold longer running operations with no guarantees about the timeliness
218        // of operations there.
219
220        // The shared foreground thread is the main checker.  It is where we
221        // will also dispatch monitor checks and do other work.
222        mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread", true);
223        mHandlerCheckers.add(mMonitorChecker);
224        // Add checker for main thread.  We only do a quick check since there
225        // can be UI running on the thread.
226        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
227                "main thread", false));
228        // Add checker for shared UI thread.
229        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread", false));
230        // And also check IO thread.
231        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread", false));
232    }
233
234    public void init(Context context, BatteryService battery,
235            PowerManagerService power, AlarmManagerService alarm,
236            ActivityManagerService activity) {
237        mResolver = context.getContentResolver();
238        mBattery = battery;
239        mPower = power;
240        mAlarm = alarm;
241        mActivity = activity;
242
243        context.registerReceiver(new RebootReceiver(),
244                new IntentFilter(REBOOT_ACTION));
245        mRebootIntent = PendingIntent.getBroadcast(context,
246                0, new Intent(REBOOT_ACTION), 0);
247
248        context.registerReceiver(new RebootRequestReceiver(),
249                new IntentFilter(Intent.ACTION_REBOOT),
250                android.Manifest.permission.REBOOT, null);
251
252        mBootTime = System.currentTimeMillis();
253    }
254
255    public void processStarted(String name, int pid) {
256        synchronized (this) {
257            if ("com.android.phone".equals(name)) {
258                mPhonePid = pid;
259            }
260        }
261    }
262
263    public void addMonitor(Monitor monitor) {
264        synchronized (this) {
265            if (isAlive()) {
266                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
267            }
268            mMonitorChecker.addMonitor(monitor);
269        }
270    }
271
272    public void addThread(Handler thread, String name) {
273        synchronized (this) {
274            if (isAlive()) {
275                throw new RuntimeException("Threads can't be added once the Watchdog is running");
276            }
277            mHandlerCheckers.add(new HandlerChecker(thread, name, false));
278        }
279    }
280
281    void checkReboot(boolean fromAlarm) {
282        int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
283                : REBOOT_DEFAULT_INTERVAL;
284        mRebootInterval = rebootInterval;
285        if (rebootInterval <= 0) {
286            // No reboot interval requested.
287            if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
288            mAlarm.remove(mRebootIntent);
289            return;
290        }
291
292        long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
293                : REBOOT_DEFAULT_START_TIME;
294        long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
295                : REBOOT_DEFAULT_WINDOW) * 1000;
296        long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
297                : MEMCHECK_DEFAULT_RECHECK_INTERVAL) * 1000;
298
299        retrieveBrutalityAmount();
300
301        long realStartTime;
302        long now;
303
304        synchronized (this) {
305            now = System.currentTimeMillis();
306            realStartTime = computeCalendarTime(mCalendar, now,
307                    rebootStartTime);
308
309            long rebootIntervalMillis = rebootInterval*24*60*60*1000;
310            if (DB || mReqRebootNoWait ||
311                    (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
312                if (fromAlarm && rebootWindowMillis <= 0) {
313                    // No reboot window -- just immediately reboot.
314                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
315                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
316                            (int)rebootWindowMillis, "");
317                    rebootSystem("Checkin scheduled forced");
318                    return;
319                }
320
321                // Are we within the reboot window?
322                if (now < realStartTime) {
323                    // Schedule alarm for next check interval.
324                    realStartTime = computeCalendarTime(mCalendar,
325                            now, rebootStartTime);
326                } else if (now < (realStartTime+rebootWindowMillis)) {
327                    String doit = shouldWeBeBrutalLocked(now);
328                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
329                            (int)rebootInterval, (int)rebootStartTime*1000,
330                            (int)rebootWindowMillis, doit != null ? doit : "");
331                    if (doit == null) {
332                        rebootSystem("Checked scheduled range");
333                        return;
334                    }
335
336                    // Schedule next alarm either within the window or in the
337                    // next interval.
338                    if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
339                        realStartTime = computeCalendarTime(mCalendar,
340                                now + rebootIntervalMillis, rebootStartTime);
341                    } else {
342                        realStartTime = now + recheckInterval;
343                    }
344                } else {
345                    // Schedule alarm for next check interval.
346                    realStartTime = computeCalendarTime(mCalendar,
347                            now + rebootIntervalMillis, rebootStartTime);
348                }
349            }
350        }
351
352        if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
353                + ((realStartTime-now)/1000/60) + "m from now");
354        mAlarm.remove(mRebootIntent);
355        mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
356    }
357
358    /**
359     * Perform a full reboot of the system.
360     */
361    void rebootSystem(String reason) {
362        Slog.i(TAG, "Rebooting system because: " + reason);
363        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
364        pms.reboot(false, reason, false);
365    }
366
367    /**
368     * Load the current Gservices settings for when
369     * {@link #shouldWeBeBrutalLocked} will allow the brutality to happen.
370     * Must not be called with the lock held.
371     */
372    void retrieveBrutalityAmount() {
373        mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
374                : MEMCHECK_DEFAULT_MIN_SCREEN_OFF) * 1000;
375        mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
376                : MEMCHECK_DEFAULT_MIN_ALARM) * 1000;
377    }
378
379    /**
380     * Determine whether it is a good time to kill, crash, or otherwise
381     * plunder the current situation for the overall long-term benefit of
382     * the world.
383     *
384     * @param curTime The current system time.
385     * @return Returns null if this is a good time, else a String with the
386     * text of why it is not a good time.
387     */
388    String shouldWeBeBrutalLocked(long curTime) {
389        if (mBattery == null || !mBattery.isPowered(BatteryManager.BATTERY_PLUGGED_ANY)) {
390            return "battery";
391        }
392
393        if (mMinScreenOff >= 0 && (mPower == null ||
394                mPower.timeSinceScreenWasLastOn() < mMinScreenOff)) {
395            return "screen";
396        }
397
398        if (mMinAlarm >= 0 && (mAlarm == null ||
399                mAlarm.timeToNextAlarm() < mMinAlarm)) {
400            return "alarm";
401        }
402
403        return null;
404    }
405
406    static long computeCalendarTime(Calendar c, long curTime,
407            long secondsSinceMidnight) {
408
409        // start with now
410        c.setTimeInMillis(curTime);
411
412        int val = (int)secondsSinceMidnight / (60*60);
413        c.set(Calendar.HOUR_OF_DAY, val);
414        secondsSinceMidnight -= val * (60*60);
415        val = (int)secondsSinceMidnight / 60;
416        c.set(Calendar.MINUTE, val);
417        c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
418        c.set(Calendar.MILLISECOND, 0);
419
420        long newTime = c.getTimeInMillis();
421        if (newTime < curTime) {
422            // The given time (in seconds since midnight) has already passed for today, so advance
423            // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
424            c.add(Calendar.DAY_OF_MONTH, 1);
425            newTime = c.getTimeInMillis();
426        }
427
428        return newTime;
429    }
430
431    private boolean haveAllCheckersCompletedLocked() {
432        for (int i=0; i<mHandlerCheckers.size(); i++) {
433            HandlerChecker hc = mHandlerCheckers.get(i);
434            if (!hc.isCompletedLocked()) {
435                return false;
436            }
437        }
438        return true;
439    }
440
441    private String describeBlockedCheckersLocked() {
442        StringBuilder builder = new StringBuilder(128);
443        for (int i=0; i<mHandlerCheckers.size(); i++) {
444            HandlerChecker hc = mHandlerCheckers.get(i);
445            if (!hc.isCompletedLocked()) {
446                if (builder.length() > 0) {
447                    builder.append(", ");
448                }
449                builder.append(hc.describeBlockedStateLocked());
450            }
451        }
452        return builder.toString();
453    }
454
455    @Override
456    public void run() {
457        boolean waitedHalf = false;
458        while (true) {
459            final String name;
460            synchronized (this) {
461                long timeout = TIME_TO_WAIT;
462                if (!waitedHalf) {
463                    // If we are not at the half-point of waiting, perform a
464                    // new set of checks.  Otherwise we are still waiting for a previous set.
465                    for (int i=0; i<mHandlerCheckers.size(); i++) {
466                        HandlerChecker hc = mHandlerCheckers.get(i);
467                        hc.scheduleCheckLocked();
468                    }
469                }
470
471                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
472                // wait while asleep. If the device is asleep then the thing that we are waiting
473                // to timeout on is asleep as well and won't have a chance to run, causing a false
474                // positive on when to kill things.
475                long start = SystemClock.uptimeMillis();
476                while (timeout > 0) {
477                    try {
478                        wait(timeout);
479                    } catch (InterruptedException e) {
480                        Log.wtf(TAG, e);
481                    }
482                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
483                }
484
485                if (haveAllCheckersCompletedLocked()) {
486                    // The monitors have returned.
487                    waitedHalf = false;
488                    continue;
489                }
490
491                if (!waitedHalf) {
492                    // We've waited half the deadlock-detection interval.  Pull a stack
493                    // trace and wait another half.
494                    ArrayList<Integer> pids = new ArrayList<Integer>();
495                    pids.add(Process.myPid());
496                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
497                            NATIVE_STACKS_OF_INTEREST);
498                    waitedHalf = true;
499                    continue;
500                }
501
502                name = describeBlockedCheckersLocked();
503            }
504
505            // If we got here, that means that the system is most likely hung.
506            // First collect stack traces from all threads of the system process.
507            // Then kill this process so that the system will restart.
508            EventLog.writeEvent(EventLogTags.WATCHDOG, name);
509
510            ArrayList<Integer> pids = new ArrayList<Integer>();
511            pids.add(Process.myPid());
512            if (mPhonePid > 0) pids.add(mPhonePid);
513            // Pass !waitedHalf so that just in case we somehow wind up here without having
514            // dumped the halfway stacks, we properly re-initialize the trace file.
515            final File stack = ActivityManagerService.dumpStackTraces(
516                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
517
518            // Give some extra time to make sure the stack traces get written.
519            // The system's been hanging for a minute, another second or two won't hurt much.
520            SystemClock.sleep(2000);
521
522            // Pull our own kernel thread stacks as well if we're configured for that
523            if (RECORD_KERNEL_THREADS) {
524                dumpKernelStackTraces();
525            }
526
527            // Trigger the kernel to dump all blocked threads to the kernel log
528            try {
529                FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
530                sysrq_trigger.write("w");
531                sysrq_trigger.close();
532            } catch (IOException e) {
533                Slog.e(TAG, "Failed to write to /proc/sysrq-trigger");
534                Slog.e(TAG, e.getMessage());
535            }
536
537            // Try to add the error to the dropbox, but assuming that the ActivityManager
538            // itself may be deadlocked.  (which has happened, causing this statement to
539            // deadlock and the watchdog as a whole to be ineffective)
540            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
541                    public void run() {
542                        mActivity.addErrorToDropBox(
543                                "watchdog", null, "system_server", null, null,
544                                name, null, stack, null);
545                    }
546                };
547            dropboxThread.start();
548            try {
549                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
550            } catch (InterruptedException ignored) {}
551
552            // Only kill the process if the debugger is not attached.
553            if (!Debug.isDebuggerConnected()) {
554                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
555                Slog.w(TAG, "Main thread stack trace:");
556                StackTraceElement[] stackTrace = Looper.getMainLooper().getThread().getStackTrace();
557                for (StackTraceElement element: stackTrace) {
558                    Slog.w(TAG, "\tat " + element);
559                }
560                Slog.w(TAG, "<End of main thread stack trace>");
561                Process.killProcess(Process.myPid());
562                System.exit(10);
563            } else {
564                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
565            }
566
567            waitedHalf = false;
568        }
569    }
570
571    private File dumpKernelStackTraces() {
572        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
573        if (tracesPath == null || tracesPath.length() == 0) {
574            return null;
575        }
576
577        native_dumpKernelStacks(tracesPath);
578        return new File(tracesPath);
579    }
580
581    private native void native_dumpKernelStacks(String tracesPath);
582}
583