Watchdog.java revision 4de9936e85696208dfe91d1c40e3e5226e57634a
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import com.android.server.am.ActivityManagerService;
20import com.android.server.power.PowerManagerService;
21
22import android.app.AlarmManager;
23import android.app.PendingIntent;
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.os.Debug;
30import android.os.Handler;
31import android.os.Message;
32import android.os.Process;
33import android.os.ServiceManager;
34import android.os.SystemClock;
35import android.os.SystemProperties;
36import android.util.EventLog;
37import android.util.Log;
38import android.util.Slog;
39
40import java.io.File;
41import java.util.ArrayList;
42import java.util.Calendar;
43
/** This class polls its monitors every TIME_TO_WAIT (30 seconds by default) and kills this process if they have not returned within TIME_TO_RESTART (one minute by default). **/
45public class Watchdog extends Thread {
46    static final String TAG = "Watchdog";
47    static final boolean localLOGV = false || false;
48
49    // Set this to true to use debug default values.
50    static final boolean DB = false;
51
52    // Set this to true to have the watchdog record kernel thread stacks when it fires
53    static final boolean RECORD_KERNEL_THREADS = true;
54
55    static final int MONITOR = 2718;
56
57    static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
58    static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;
59
60    static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
61    static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
62    static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes
63
64    static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
65    static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
66    static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour
67
68    static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";
69
70    static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
71        "/system/bin/mediaserver",
72        "/system/bin/sdcard",
73        "/system/bin/surfaceflinger"
74    };
75
76    static Watchdog sWatchdog;
77
78    /* This handler will be used to post message back onto the main thread */
79    final Handler mHandler;
80    final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
81    ContentResolver mResolver;
82    BatteryService mBattery;
83    PowerManagerService mPower;
84    AlarmManagerService mAlarm;
85    ActivityManagerService mActivity;
86    boolean mCompleted;
87    boolean mForceKillSystem;
88    Monitor mCurrentMonitor;
89
90    int mPhonePid;
91
92    final Calendar mCalendar = Calendar.getInstance();
93    int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
94    int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
95    boolean mNeedScheduledCheck;
96    PendingIntent mCheckupIntent;
97    PendingIntent mRebootIntent;
98
99    long mBootTime;
100    int mRebootInterval;
101
102    boolean mReqRebootNoWait;     // should wait for one interval before reboot?
103    int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
104    int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
105    int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
106    int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
107    int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
108    int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested
109
110    /**
111     * Used for scheduling monitor callbacks and checking memory usage.
112     */
113    final class HeartbeatHandler extends Handler {
114        @Override
115        public void handleMessage(Message msg) {
116            switch (msg.what) {
117                case MONITOR: {
118                    // See if we should force a reboot.
119                    int rebootInterval = mReqRebootInterval >= 0
120                            ? mReqRebootInterval : REBOOT_DEFAULT_INTERVAL;
121                    if (mRebootInterval != rebootInterval) {
122                        mRebootInterval = rebootInterval;
123                        // We have been running long enough that a reboot can
124                        // be considered...
125                        checkReboot(false);
126                    }
127
128                    final int size = mMonitors.size();
129                    for (int i = 0 ; i < size ; i++) {
130                        mCurrentMonitor = mMonitors.get(i);
131                        mCurrentMonitor.monitor();
132                    }
133
134                    synchronized (Watchdog.this) {
135                        mCompleted = true;
136                        mCurrentMonitor = null;
137                    }
138                } break;
139            }
140        }
141    }
142
143    final class RebootReceiver extends BroadcastReceiver {
144        @Override
145        public void onReceive(Context c, Intent intent) {
146            if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
147            checkReboot(true);
148        }
149    }
150
151    final class RebootRequestReceiver extends BroadcastReceiver {
152        @Override
153        public void onReceive(Context c, Intent intent) {
154            mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
155            mReqRebootInterval = intent.getIntExtra("interval", -1);
156            mReqRebootStartTime = intent.getIntExtra("startTime", -1);
157            mReqRebootWindow = intent.getIntExtra("window", -1);
158            mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
159            mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
160            mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
161            EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
162                    mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
163                            mReqRecheckInterval, mReqRebootStartTime,
164                    mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
165            checkReboot(true);
166        }
167    }
168
169    public interface Monitor {
170        void monitor();
171    }
172
173    public static Watchdog getInstance() {
174        if (sWatchdog == null) {
175            sWatchdog = new Watchdog();
176        }
177
178        return sWatchdog;
179    }
180
181    private Watchdog() {
182        super("watchdog");
183        mHandler = new HeartbeatHandler();
184    }
185
186    public void init(Context context, BatteryService battery,
187            PowerManagerService power, AlarmManagerService alarm,
188            ActivityManagerService activity) {
189        mResolver = context.getContentResolver();
190        mBattery = battery;
191        mPower = power;
192        mAlarm = alarm;
193        mActivity = activity;
194
195        context.registerReceiver(new RebootReceiver(),
196                new IntentFilter(REBOOT_ACTION));
197        mRebootIntent = PendingIntent.getBroadcast(context,
198                0, new Intent(REBOOT_ACTION), 0);
199
200        context.registerReceiver(new RebootRequestReceiver(),
201                new IntentFilter(Intent.ACTION_REBOOT),
202                android.Manifest.permission.REBOOT, null);
203
204        mBootTime = System.currentTimeMillis();
205    }
206
207    public void processStarted(String name, int pid) {
208        synchronized (this) {
209            if ("com.android.phone".equals(name)) {
210                mPhonePid = pid;
211            }
212        }
213    }
214
215    public void addMonitor(Monitor monitor) {
216        synchronized (this) {
217            if (isAlive()) {
218                throw new RuntimeException("Monitors can't be added while the Watchdog is running");
219            }
220            mMonitors.add(monitor);
221        }
222    }
223
224    void checkReboot(boolean fromAlarm) {
225        int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
226                : REBOOT_DEFAULT_INTERVAL;
227        mRebootInterval = rebootInterval;
228        if (rebootInterval <= 0) {
229            // No reboot interval requested.
230            if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
231            mAlarm.remove(mRebootIntent);
232            return;
233        }
234
235        long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
236                : REBOOT_DEFAULT_START_TIME;
237        long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
238                : REBOOT_DEFAULT_WINDOW) * 1000;
239        long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
240                : MEMCHECK_DEFAULT_RECHECK_INTERVAL) * 1000;
241
242        retrieveBrutalityAmount();
243
244        long realStartTime;
245        long now;
246
247        synchronized (this) {
248            now = System.currentTimeMillis();
249            realStartTime = computeCalendarTime(mCalendar, now,
250                    rebootStartTime);
251
252            long rebootIntervalMillis = rebootInterval*24*60*60*1000;
253            if (DB || mReqRebootNoWait ||
254                    (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
255                if (fromAlarm && rebootWindowMillis <= 0) {
256                    // No reboot window -- just immediately reboot.
257                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
258                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
259                            (int)rebootWindowMillis, "");
260                    rebootSystem("Checkin scheduled forced");
261                    return;
262                }
263
264                // Are we within the reboot window?
265                if (now < realStartTime) {
266                    // Schedule alarm for next check interval.
267                    realStartTime = computeCalendarTime(mCalendar,
268                            now, rebootStartTime);
269                } else if (now < (realStartTime+rebootWindowMillis)) {
270                    String doit = shouldWeBeBrutalLocked(now);
271                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
272                            (int)rebootInterval, (int)rebootStartTime*1000,
273                            (int)rebootWindowMillis, doit != null ? doit : "");
274                    if (doit == null) {
275                        rebootSystem("Checked scheduled range");
276                        return;
277                    }
278
279                    // Schedule next alarm either within the window or in the
280                    // next interval.
281                    if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
282                        realStartTime = computeCalendarTime(mCalendar,
283                                now + rebootIntervalMillis, rebootStartTime);
284                    } else {
285                        realStartTime = now + recheckInterval;
286                    }
287                } else {
288                    // Schedule alarm for next check interval.
289                    realStartTime = computeCalendarTime(mCalendar,
290                            now + rebootIntervalMillis, rebootStartTime);
291                }
292            }
293        }
294
295        if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
296                + ((realStartTime-now)/1000/60) + "m from now");
297        mAlarm.remove(mRebootIntent);
298        mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
299    }
300
301    /**
302     * Perform a full reboot of the system.
303     */
304    void rebootSystem(String reason) {
305        Slog.i(TAG, "Rebooting system because: " + reason);
306        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
307        pms.reboot(reason);
308    }
309
310    /**
311     * Load the current Gservices settings for when
312     * {@link #shouldWeBeBrutalLocked} will allow the brutality to happen.
313     * Must not be called with the lock held.
314     */
315    void retrieveBrutalityAmount() {
316        mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
317                : MEMCHECK_DEFAULT_MIN_SCREEN_OFF) * 1000;
318        mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
319                : MEMCHECK_DEFAULT_MIN_ALARM) * 1000;
320    }
321
322    /**
323     * Determine whether it is a good time to kill, crash, or otherwise
324     * plunder the current situation for the overall long-term benefit of
325     * the world.
326     *
327     * @param curTime The current system time.
328     * @return Returns null if this is a good time, else a String with the
329     * text of why it is not a good time.
330     */
331    String shouldWeBeBrutalLocked(long curTime) {
332        if (mBattery == null || !mBattery.isPowered()) {
333            return "battery";
334        }
335
336        if (mMinScreenOff >= 0 && (mPower == null ||
337                mPower.timeSinceScreenWasLastOn() < mMinScreenOff)) {
338            return "screen";
339        }
340
341        if (mMinAlarm >= 0 && (mAlarm == null ||
342                mAlarm.timeToNextAlarm() < mMinAlarm)) {
343            return "alarm";
344        }
345
346        return null;
347    }
348
349    static long computeCalendarTime(Calendar c, long curTime,
350            long secondsSinceMidnight) {
351
352        // start with now
353        c.setTimeInMillis(curTime);
354
355        int val = (int)secondsSinceMidnight / (60*60);
356        c.set(Calendar.HOUR_OF_DAY, val);
357        secondsSinceMidnight -= val * (60*60);
358        val = (int)secondsSinceMidnight / 60;
359        c.set(Calendar.MINUTE, val);
360        c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
361        c.set(Calendar.MILLISECOND, 0);
362
363        long newTime = c.getTimeInMillis();
364        if (newTime < curTime) {
365            // The given time (in seconds since midnight) has already passed for today, so advance
366            // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
367            c.add(Calendar.DAY_OF_MONTH, 1);
368            newTime = c.getTimeInMillis();
369        }
370
371        return newTime;
372    }
373
374    @Override
375    public void run() {
376        boolean waitedHalf = false;
377        while (true) {
378            mCompleted = false;
379            mHandler.sendEmptyMessage(MONITOR);
380
381            synchronized (this) {
382                long timeout = TIME_TO_WAIT;
383
384                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
385                // wait while asleep. If the device is asleep then the thing that we are waiting
386                // to timeout on is asleep as well and won't have a chance to run, causing a false
387                // positive on when to kill things.
388                long start = SystemClock.uptimeMillis();
389                while (timeout > 0 && !mForceKillSystem) {
390                    try {
391                        wait(timeout);  // notifyAll() is called when mForceKillSystem is set
392                    } catch (InterruptedException e) {
393                        Log.wtf(TAG, e);
394                    }
395                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
396                }
397
398                if (mCompleted && !mForceKillSystem) {
399                    // The monitors have returned.
400                    waitedHalf = false;
401                    continue;
402                }
403
404                if (!waitedHalf) {
405                    // We've waited half the deadlock-detection interval.  Pull a stack
406                    // trace and wait another half.
407                    ArrayList<Integer> pids = new ArrayList<Integer>();
408                    pids.add(Process.myPid());
409                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
410                            NATIVE_STACKS_OF_INTEREST);
411                    waitedHalf = true;
412                    continue;
413                }
414            }
415
416            // If we got here, that means that the system is most likely hung.
417            // First collect stack traces from all threads of the system process.
418            // Then kill this process so that the system will restart.
419
420            final String name = (mCurrentMonitor != null) ?
421                    mCurrentMonitor.getClass().getName() : "null";
422            EventLog.writeEvent(EventLogTags.WATCHDOG, name);
423
424            ArrayList<Integer> pids = new ArrayList<Integer>();
425            pids.add(Process.myPid());
426            if (mPhonePid > 0) pids.add(mPhonePid);
427            // Pass !waitedHalf so that just in case we somehow wind up here without having
428            // dumped the halfway stacks, we properly re-initialize the trace file.
429            final File stack = ActivityManagerService.dumpStackTraces(
430                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
431
432            // Give some extra time to make sure the stack traces get written.
433            // The system's been hanging for a minute, another second or two won't hurt much.
434            SystemClock.sleep(2000);
435
436            // Pull our own kernel thread stacks as well if we're configured for that
437            if (RECORD_KERNEL_THREADS) {
438                dumpKernelStackTraces();
439            }
440
441            // Try to add the error to the dropbox, but assuming that the ActivityManager
442            // itself may be deadlocked.  (which has happened, causing this statement to
443            // deadlock and the watchdog as a whole to be ineffective)
444            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
445                    public void run() {
446                        mActivity.addErrorToDropBox(
447                                "watchdog", null, "system_server", null, null,
448                                name, null, stack, null);
449                    }
450                };
451            dropboxThread.start();
452            try {
453                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
454            } catch (InterruptedException ignored) {}
455
456            // Only kill the process if the debugger is not attached.
457            if (!Debug.isDebuggerConnected()) {
458                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
459                Process.killProcess(Process.myPid());
460                System.exit(10);
461            } else {
462                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
463            }
464
465            waitedHalf = false;
466        }
467    }
468
469    private File dumpKernelStackTraces() {
470        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
471        if (tracesPath == null || tracesPath.length() == 0) {
472            return null;
473        }
474
475        native_dumpKernelStacks(tracesPath);
476        return new File(tracesPath);
477    }
478
479    private native void native_dumpKernelStacks(String tracesPath);
480}
481