Watchdog.java revision 8bd64df2adb26fe9547ae3961a58631e241b613e
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.RemoteException;
22import com.android.server.am.ActivityManagerService;
23import com.android.server.power.PowerManagerService;
24
25import android.app.AlarmManager;
26import android.app.PendingIntent;
27import android.content.BroadcastReceiver;
28import android.content.ContentResolver;
29import android.content.Context;
30import android.content.Intent;
31import android.content.IntentFilter;
32import android.os.BatteryManager;
33import android.os.Debug;
34import android.os.Handler;
35import android.os.Looper;
36import android.os.Message;
37import android.os.Process;
38import android.os.ServiceManager;
39import android.os.SystemClock;
40import android.os.SystemProperties;
41import android.util.EventLog;
42import android.util.Log;
43import android.util.Slog;
44
45import java.io.File;
46import java.io.FileWriter;
47import java.io.IOException;
48import java.util.ArrayList;
49import java.util.Calendar;
50
51/** This class calls its monitor every minute. Killing this process if they don't return **/
52public class Watchdog extends Thread {
53    static final String TAG = "Watchdog";
54    static final boolean localLOGV = false || false;
55
56    // Set this to true to use debug default values.
57    static final boolean DB = false;
58
59    // Set this to true to have the watchdog record kernel thread stacks when it fires
60    static final boolean RECORD_KERNEL_THREADS = true;
61
62    static final int MONITOR = 2718;
63
64    static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
65    static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;
66
67    static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
68    static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
69    static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes
70
71    static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
72    static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
73    static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour
74
75    static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";
76
77    static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
78        "/system/bin/mediaserver",
79        "/system/bin/sdcard",
80        "/system/bin/surfaceflinger"
81    };
82
83    static Watchdog sWatchdog;
84
85    /* This handler will be used to post message back onto the main thread */
86    final Handler mHandler;
87    final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
88    ContentResolver mResolver;
89    BatteryService mBattery;
90    PowerManagerService mPower;
91    AlarmManagerService mAlarm;
92    ActivityManagerService mActivity;
93    boolean mCompleted;
94    Monitor mCurrentMonitor;
95
96    int mPhonePid;
97    IActivityController mController;
98    boolean mAllowRestart = true;
99
100    final Calendar mCalendar = Calendar.getInstance();
101    int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
102    int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
103    boolean mNeedScheduledCheck;
104    PendingIntent mCheckupIntent;
105    PendingIntent mRebootIntent;
106
107    long mBootTime;
108    int mRebootInterval;
109
110    boolean mReqRebootNoWait;     // should wait for one interval before reboot?
111    int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
112    int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
113    int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
114    int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
115    int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
116    int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested
117
118    /**
119     * Used for scheduling monitor callbacks and checking memory usage.
120     */
121    final class HeartbeatHandler extends Handler {
122        HeartbeatHandler(Looper looper) {
123            super(looper);
124        }
125
126        @Override
127        public void handleMessage(Message msg) {
128            switch (msg.what) {
129                case MONITOR: {
130                    // See if we should force a reboot.
131                    int rebootInterval = mReqRebootInterval >= 0
132                            ? mReqRebootInterval : REBOOT_DEFAULT_INTERVAL;
133                    if (mRebootInterval != rebootInterval) {
134                        mRebootInterval = rebootInterval;
135                        // We have been running long enough that a reboot can
136                        // be considered...
137                        checkReboot(false);
138                    }
139
140                    final int size = mMonitors.size();
141                    for (int i = 0 ; i < size ; i++) {
142                        synchronized (Watchdog.this) {
143                            mCurrentMonitor = mMonitors.get(i);
144                        }
145                        mCurrentMonitor.monitor();
146                    }
147
148                    synchronized (Watchdog.this) {
149                        mCompleted = true;
150                        mCurrentMonitor = null;
151                    }
152                } break;
153            }
154        }
155    }
156
157    final class RebootReceiver extends BroadcastReceiver {
158        @Override
159        public void onReceive(Context c, Intent intent) {
160            if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
161            checkReboot(true);
162        }
163    }
164
165    final class RebootRequestReceiver extends BroadcastReceiver {
166        @Override
167        public void onReceive(Context c, Intent intent) {
168            mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
169            mReqRebootInterval = intent.getIntExtra("interval", -1);
170            mReqRebootStartTime = intent.getIntExtra("startTime", -1);
171            mReqRebootWindow = intent.getIntExtra("window", -1);
172            mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
173            mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
174            mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
175            EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
176                    mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
177                            mReqRecheckInterval, mReqRebootStartTime,
178                    mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
179            checkReboot(true);
180        }
181    }
182
183    public interface Monitor {
184        void monitor();
185    }
186
187    public static Watchdog getInstance() {
188        if (sWatchdog == null) {
189            sWatchdog = new Watchdog();
190        }
191
192        return sWatchdog;
193    }
194
195    private Watchdog() {
196        super("watchdog");
197        // Explicitly bind the HeartbeatHandler to run on the ServerThread, so
198        // that it can't get accidentally bound to another thread.
199        mHandler = new HeartbeatHandler(Looper.getMainLooper());
200    }
201
202    public void init(Context context, BatteryService battery,
203            PowerManagerService power, AlarmManagerService alarm,
204            ActivityManagerService activity) {
205        mResolver = context.getContentResolver();
206        mBattery = battery;
207        mPower = power;
208        mAlarm = alarm;
209        mActivity = activity;
210
211        context.registerReceiver(new RebootReceiver(),
212                new IntentFilter(REBOOT_ACTION));
213        mRebootIntent = PendingIntent.getBroadcast(context,
214                0, new Intent(REBOOT_ACTION), 0);
215
216        context.registerReceiver(new RebootRequestReceiver(),
217                new IntentFilter(Intent.ACTION_REBOOT),
218                android.Manifest.permission.REBOOT, null);
219
220        mBootTime = System.currentTimeMillis();
221    }
222
223    public void processStarted(String name, int pid) {
224        synchronized (this) {
225            if ("com.android.phone".equals(name)) {
226                mPhonePid = pid;
227            }
228        }
229    }
230
231    public void setActivityController(IActivityController controller) {
232        synchronized (this) {
233            mController = controller;
234        }
235    }
236
237    public void setAllowRestart(boolean allowRestart) {
238        synchronized (this) {
239            mAllowRestart = allowRestart;
240        }
241    }
242
243    public void addMonitor(Monitor monitor) {
244        synchronized (this) {
245            if (isAlive()) {
246                throw new RuntimeException("Monitors can't be added while the Watchdog is running");
247            }
248            mMonitors.add(monitor);
249        }
250    }
251
252    void checkReboot(boolean fromAlarm) {
253        int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
254                : REBOOT_DEFAULT_INTERVAL;
255        mRebootInterval = rebootInterval;
256        if (rebootInterval <= 0) {
257            // No reboot interval requested.
258            if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
259            mAlarm.remove(mRebootIntent);
260            return;
261        }
262
263        long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
264                : REBOOT_DEFAULT_START_TIME;
265        long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
266                : REBOOT_DEFAULT_WINDOW) * 1000;
267        long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
268                : MEMCHECK_DEFAULT_RECHECK_INTERVAL) * 1000;
269
270        retrieveBrutalityAmount();
271
272        long realStartTime;
273        long now;
274
275        synchronized (this) {
276            now = System.currentTimeMillis();
277            realStartTime = computeCalendarTime(mCalendar, now,
278                    rebootStartTime);
279
280            long rebootIntervalMillis = rebootInterval*24*60*60*1000;
281            if (DB || mReqRebootNoWait ||
282                    (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
283                if (fromAlarm && rebootWindowMillis <= 0) {
284                    // No reboot window -- just immediately reboot.
285                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
286                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
287                            (int)rebootWindowMillis, "");
288                    rebootSystem("Checkin scheduled forced");
289                    return;
290                }
291
292                // Are we within the reboot window?
293                if (now < realStartTime) {
294                    // Schedule alarm for next check interval.
295                    realStartTime = computeCalendarTime(mCalendar,
296                            now, rebootStartTime);
297                } else if (now < (realStartTime+rebootWindowMillis)) {
298                    String doit = shouldWeBeBrutalLocked(now);
299                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
300                            (int)rebootInterval, (int)rebootStartTime*1000,
301                            (int)rebootWindowMillis, doit != null ? doit : "");
302                    if (doit == null) {
303                        rebootSystem("Checked scheduled range");
304                        return;
305                    }
306
307                    // Schedule next alarm either within the window or in the
308                    // next interval.
309                    if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
310                        realStartTime = computeCalendarTime(mCalendar,
311                                now + rebootIntervalMillis, rebootStartTime);
312                    } else {
313                        realStartTime = now + recheckInterval;
314                    }
315                } else {
316                    // Schedule alarm for next check interval.
317                    realStartTime = computeCalendarTime(mCalendar,
318                            now + rebootIntervalMillis, rebootStartTime);
319                }
320            }
321        }
322
323        if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
324                + ((realStartTime-now)/1000/60) + "m from now");
325        mAlarm.remove(mRebootIntent);
326        mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
327    }
328
329    /**
330     * Perform a full reboot of the system.
331     */
332    void rebootSystem(String reason) {
333        Slog.i(TAG, "Rebooting system because: " + reason);
334        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
335        pms.reboot(false, reason, false);
336    }
337
338    /**
339     * Load the current Gservices settings for when
340     * {@link #shouldWeBeBrutalLocked} will allow the brutality to happen.
341     * Must not be called with the lock held.
342     */
343    void retrieveBrutalityAmount() {
344        mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
345                : MEMCHECK_DEFAULT_MIN_SCREEN_OFF) * 1000;
346        mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
347                : MEMCHECK_DEFAULT_MIN_ALARM) * 1000;
348    }
349
350    /**
351     * Determine whether it is a good time to kill, crash, or otherwise
352     * plunder the current situation for the overall long-term benefit of
353     * the world.
354     *
355     * @param curTime The current system time.
356     * @return Returns null if this is a good time, else a String with the
357     * text of why it is not a good time.
358     */
359    String shouldWeBeBrutalLocked(long curTime) {
360        if (mBattery == null || !mBattery.isPowered(BatteryManager.BATTERY_PLUGGED_ANY)) {
361            return "battery";
362        }
363
364        if (mMinScreenOff >= 0 && (mPower == null ||
365                mPower.timeSinceScreenWasLastOn() < mMinScreenOff)) {
366            return "screen";
367        }
368
369        if (mMinAlarm >= 0 && (mAlarm == null ||
370                mAlarm.timeToNextAlarm() < mMinAlarm)) {
371            return "alarm";
372        }
373
374        return null;
375    }
376
377    static long computeCalendarTime(Calendar c, long curTime,
378            long secondsSinceMidnight) {
379
380        // start with now
381        c.setTimeInMillis(curTime);
382
383        int val = (int)secondsSinceMidnight / (60*60);
384        c.set(Calendar.HOUR_OF_DAY, val);
385        secondsSinceMidnight -= val * (60*60);
386        val = (int)secondsSinceMidnight / 60;
387        c.set(Calendar.MINUTE, val);
388        c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
389        c.set(Calendar.MILLISECOND, 0);
390
391        long newTime = c.getTimeInMillis();
392        if (newTime < curTime) {
393            // The given time (in seconds since midnight) has already passed for today, so advance
394            // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
395            c.add(Calendar.DAY_OF_MONTH, 1);
396            newTime = c.getTimeInMillis();
397        }
398
399        return newTime;
400    }
401
402    @Override
403    public void run() {
404        boolean waitedHalf = false;
405        while (true) {
406            mCompleted = false;
407            mHandler.sendEmptyMessage(MONITOR);
408
409
410            final String name;
411            final boolean allowRestart;
412            synchronized (this) {
413                long timeout = TIME_TO_WAIT;
414
415                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
416                // wait while asleep. If the device is asleep then the thing that we are waiting
417                // to timeout on is asleep as well and won't have a chance to run, causing a false
418                // positive on when to kill things.
419                long start = SystemClock.uptimeMillis();
420                while (timeout > 0) {
421                    try {
422                        wait(timeout);
423                    } catch (InterruptedException e) {
424                        Log.wtf(TAG, e);
425                    }
426                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
427                }
428
429                if (mCompleted) {
430                    // The monitors have returned.
431                    waitedHalf = false;
432                    continue;
433                }
434
435                if (!waitedHalf) {
436                    // We've waited half the deadlock-detection interval.  Pull a stack
437                    // trace and wait another half.
438                    ArrayList<Integer> pids = new ArrayList<Integer>();
439                    pids.add(Process.myPid());
440                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
441                            NATIVE_STACKS_OF_INTEREST);
442                    waitedHalf = true;
443                    continue;
444                }
445
446                name = (mCurrentMonitor != null) ?
447                    mCurrentMonitor.getClass().getName() : "null";
448                allowRestart = mAllowRestart;
449            }
450
451            // If we got here, that means that the system is most likely hung.
452            // First collect stack traces from all threads of the system process.
453            // Then kill this process so that the system will restart.
454            EventLog.writeEvent(EventLogTags.WATCHDOG, name);
455
456            ArrayList<Integer> pids = new ArrayList<Integer>();
457            pids.add(Process.myPid());
458            if (mPhonePid > 0) pids.add(mPhonePid);
459            // Pass !waitedHalf so that just in case we somehow wind up here without having
460            // dumped the halfway stacks, we properly re-initialize the trace file.
461            final File stack = ActivityManagerService.dumpStackTraces(
462                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
463
464            // Give some extra time to make sure the stack traces get written.
465            // The system's been hanging for a minute, another second or two won't hurt much.
466            SystemClock.sleep(2000);
467
468            // Pull our own kernel thread stacks as well if we're configured for that
469            if (RECORD_KERNEL_THREADS) {
470                dumpKernelStackTraces();
471            }
472
473            // Trigger the kernel to dump all blocked threads to the kernel log
474            try {
475                FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
476                sysrq_trigger.write("w");
477                sysrq_trigger.close();
478            } catch (IOException e) {
479                Slog.e(TAG, "Failed to write to /proc/sysrq-trigger");
480                Slog.e(TAG, e.getMessage());
481            }
482
483            // Try to add the error to the dropbox, but assuming that the ActivityManager
484            // itself may be deadlocked.  (which has happened, causing this statement to
485            // deadlock and the watchdog as a whole to be ineffective)
486            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
487                    public void run() {
488                        mActivity.addErrorToDropBox(
489                                "watchdog", null, "system_server", null, null,
490                                name, null, stack, null);
491                    }
492                };
493            dropboxThread.start();
494            try {
495                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
496            } catch (InterruptedException ignored) {}
497
498            IActivityController controller;
499            synchronized (this) {
500                controller = mController;
501            }
502            if (controller != null) {
503                Slog.i(TAG, "Reporting stuck state to activity controller");
504                try {
505                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
506                    // 1 = keep waiting, -1 = kill system
507                    int res = controller.systemNotResponding(name);
508                    if (res >= 0) {
509                        Slog.i(TAG, "Activity controller requested to coninue to wait");
510                        waitedHalf = false;
511                        continue;
512                    }
513                } catch (RemoteException e) {
514                }
515            }
516
517            // Only kill the process if the debugger is not attached.
518            if (Debug.isDebuggerConnected()) {
519                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
520            } else if (!allowRestart) {
521                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
522            } else {
523                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
524                Process.killProcess(Process.myPid());
525                System.exit(10);
526            }
527
528            waitedHalf = false;
529        }
530    }
531
532    private File dumpKernelStackTraces() {
533        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
534        if (tracesPath == null || tracesPath.length() == 0) {
535            return null;
536        }
537
538        native_dumpKernelStacks(tracesPath);
539        return new File(tracesPath);
540    }
541
542    private native void native_dumpKernelStacks(String tracesPath);
543}
544