Watchdog.java revision 784827b27cf4cd82bf074b571e63cb5e660c54af
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import com.android.server.am.ActivityManagerService;
20
21import android.app.AlarmManager;
22import android.app.PendingIntent;
23import android.content.BroadcastReceiver;
24import android.content.ContentResolver;
25import android.content.Context;
26import android.content.Intent;
27import android.content.IntentFilter;
28import android.os.Debug;
29import android.os.Handler;
30import android.os.Message;
31import android.os.Process;
32import android.os.ServiceManager;
33import android.os.SystemClock;
34import android.os.SystemProperties;
35import android.provider.Settings;
36import android.util.EventLog;
37import android.util.Log;
38import android.util.Slog;
39
40import java.io.File;
41import java.util.ArrayList;
42import java.util.Calendar;
43
/** This class periodically calls its monitors, killing this process if they don't return in time. **/
45public class Watchdog extends Thread {
    // Tag used for every log line emitted by this class.
    static final String TAG = "Watchdog";
    // Gates the verbose Slog.v() messages in this class.
    static final boolean localLOGV = false || false;

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    // Message code posted to the heartbeat handler to run all monitors.
    static final int MONITOR = 2718;

    // Milliseconds. run() waits TIME_TO_WAIT per pass and only kills the process
    // after two consecutive passes in which the monitors did not complete, i.e.
    // after roughly TIME_TO_RESTART.
    static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
    static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;

    // Defaults in seconds; callers multiply by 1000 before use.
    static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
    static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
    static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes

    // Interval is in days (checkReboot() scales it by 24*60*60*1000); a value
    // <= 0 means no reboot alarm is scheduled. Start time is in seconds since
    // midnight and the window is in seconds.
    static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
    static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
    static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour

    // Broadcast action carried by the reboot alarm's PendingIntent.
    static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";

    // Passed to ActivityManagerService.dumpStackTraces() whenever the watchdog
    // dumps stacks.
    static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger"
    };
75
    // Lazily created singleton returned by getInstance().
    static Watchdog sWatchdog;

    /* This handler will be used to post message back onto the main thread */
    final Handler mHandler;
    // Monitors invoked, in registration order, for every MONITOR message.
    final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    // Collaborators supplied through init().
    ContentResolver mResolver;
    BatteryService mBattery;
    PowerManagerService mPower;
    AlarmManagerService mAlarm;
    ActivityManagerService mActivity;
    // Cleared by run() at the start of each pass; set by the heartbeat handler
    // once every monitor has returned.
    boolean mCompleted;
    // Read by run() to cut the wait short. No writer is visible in this file --
    // NOTE(review): confirm where (or whether) this is still set.
    boolean mForceKillSystem;
    // Monitor currently being invoked by the heartbeat handler, or null.
    Monitor mCurrentMonitor;

    // Pid recorded by processStarted() for "com.android.phone"; added to the
    // final stack dump when > 0.
    int mPhonePid;

    // Scratch calendar handed to computeCalendarTime().
    final Calendar mCalendar = Calendar.getInstance();
    // Thresholds consulted by shouldWeBeBrutalLocked(). retrieveBrutalityAmount()
    // stores them scaled by 1000, whereas these initial values are the unscaled
    // defaults.
    int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
    int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
    // Not referenced anywhere else in the visible part of this file.
    boolean mNeedScheduledCheck;
    PendingIntent mCheckupIntent;
    // Broadcasts REBOOT_ACTION; created in init() and used for the reboot alarm.
    PendingIntent mRebootIntent;

    // System.currentTimeMillis() captured in init().
    long mBootTime;
    // Reboot interval most recently observed by checkReboot() / the handler.
    int mRebootInterval;

    // The values below are filled in by RebootRequestReceiver from intent extras.
    boolean mReqRebootNoWait;     // should wait for one interval before reboot?
    int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
    int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
    int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
    int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
    int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
    int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested
109
110    /**
111     * Used for scheduling monitor callbacks and checking memory usage.
112     */
113    final class HeartbeatHandler extends Handler {
114        @Override
115        public void handleMessage(Message msg) {
116            switch (msg.what) {
117                case MONITOR: {
118                    // See if we should force a reboot.
119                    int rebootInterval = mReqRebootInterval >= 0
120                            ? mReqRebootInterval : Settings.Secure.getInt(
121                            mResolver, Settings.Secure.REBOOT_INTERVAL,
122                            REBOOT_DEFAULT_INTERVAL);
123                    if (mRebootInterval != rebootInterval) {
124                        mRebootInterval = rebootInterval;
125                        // We have been running long enough that a reboot can
126                        // be considered...
127                        checkReboot(false);
128                    }
129
130                    final int size = mMonitors.size();
131                    for (int i = 0 ; i < size ; i++) {
132                        mCurrentMonitor = mMonitors.get(i);
133                        mCurrentMonitor.monitor();
134                    }
135
136                    synchronized (Watchdog.this) {
137                        mCompleted = true;
138                        mCurrentMonitor = null;
139                    }
140                } break;
141            }
142        }
143    }
144
145    final class RebootReceiver extends BroadcastReceiver {
146        @Override
147        public void onReceive(Context c, Intent intent) {
148            if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
149            checkReboot(true);
150        }
151    }
152
153    final class RebootRequestReceiver extends BroadcastReceiver {
154        @Override
155        public void onReceive(Context c, Intent intent) {
156            mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
157            mReqRebootInterval = intent.getIntExtra("interval", -1);
158            mReqRebootStartTime = intent.getIntExtra("startTime", -1);
159            mReqRebootWindow = intent.getIntExtra("window", -1);
160            mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
161            mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
162            mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
163            EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
164                    mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
165                            mReqRecheckInterval, mReqRebootStartTime,
166                    mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
167            checkReboot(true);
168        }
169    }
170
    /**
     * Callback registered with {@code addMonitor()}. The heartbeat handler
     * calls {@code monitor()} on every registered instance for each MONITOR
     * message; if a call has not returned by the time the watchdog gives up,
     * the implementing class name is reported as the culprit.
     */
    public interface Monitor {
        /** Invoked by the heartbeat handler; must return for the pass to complete. */
        void monitor();
    }
174
175    public static Watchdog getInstance() {
176        if (sWatchdog == null) {
177            sWatchdog = new Watchdog();
178        }
179
180        return sWatchdog;
181    }
182
    /**
     * Creates the watchdog thread, named "watchdog", together with the
     * HeartbeatHandler that services its MONITOR messages. Private: instances
     * are obtained through getInstance().
     */
    private Watchdog() {
        super("watchdog");
        mHandler = new HeartbeatHandler();
    }
187
188    public void init(Context context, BatteryService battery,
189            PowerManagerService power, AlarmManagerService alarm,
190            ActivityManagerService activity) {
191        mResolver = context.getContentResolver();
192        mBattery = battery;
193        mPower = power;
194        mAlarm = alarm;
195        mActivity = activity;
196
197        context.registerReceiver(new RebootReceiver(),
198                new IntentFilter(REBOOT_ACTION));
199        mRebootIntent = PendingIntent.getBroadcast(context,
200                0, new Intent(REBOOT_ACTION), 0);
201
202        context.registerReceiver(new RebootRequestReceiver(),
203                new IntentFilter(Intent.ACTION_REBOOT),
204                android.Manifest.permission.REBOOT, null);
205
206        mBootTime = System.currentTimeMillis();
207    }
208
209    public void processStarted(String name, int pid) {
210        synchronized (this) {
211            if ("com.android.phone".equals(name)) {
212                mPhonePid = pid;
213            }
214        }
215    }
216
217    public void addMonitor(Monitor monitor) {
218        synchronized (this) {
219            if (isAlive()) {
220                throw new RuntimeException("Monitors can't be added while the Watchdog is running");
221            }
222            mMonitors.add(monitor);
223        }
224    }
225
226    void checkReboot(boolean fromAlarm) {
227        int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
228                : Settings.Secure.getInt(
229                mResolver, Settings.Secure.REBOOT_INTERVAL,
230                REBOOT_DEFAULT_INTERVAL);
231        mRebootInterval = rebootInterval;
232        if (rebootInterval <= 0) {
233            // No reboot interval requested.
234            if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
235            mAlarm.remove(mRebootIntent);
236            return;
237        }
238
239        long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
240                : Settings.Secure.getLong(
241                mResolver, Settings.Secure.REBOOT_START_TIME,
242                REBOOT_DEFAULT_START_TIME);
243        long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
244                : Settings.Secure.getLong(
245                mResolver, Settings.Secure.REBOOT_WINDOW,
246                REBOOT_DEFAULT_WINDOW)) * 1000;
247        long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
248                : Settings.Secure.getLong(
249                mResolver, Settings.Secure.MEMCHECK_RECHECK_INTERVAL,
250                MEMCHECK_DEFAULT_RECHECK_INTERVAL)) * 1000;
251
252        retrieveBrutalityAmount();
253
254        long realStartTime;
255        long now;
256
257        synchronized (this) {
258            now = System.currentTimeMillis();
259            realStartTime = computeCalendarTime(mCalendar, now,
260                    rebootStartTime);
261
262            long rebootIntervalMillis = rebootInterval*24*60*60*1000;
263            if (DB || mReqRebootNoWait ||
264                    (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
265                if (fromAlarm && rebootWindowMillis <= 0) {
266                    // No reboot window -- just immediately reboot.
267                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
268                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
269                            (int)rebootWindowMillis, "");
270                    rebootSystem("Checkin scheduled forced");
271                    return;
272                }
273
274                // Are we within the reboot window?
275                if (now < realStartTime) {
276                    // Schedule alarm for next check interval.
277                    realStartTime = computeCalendarTime(mCalendar,
278                            now, rebootStartTime);
279                } else if (now < (realStartTime+rebootWindowMillis)) {
280                    String doit = shouldWeBeBrutalLocked(now);
281                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
282                            (int)rebootInterval, (int)rebootStartTime*1000,
283                            (int)rebootWindowMillis, doit != null ? doit : "");
284                    if (doit == null) {
285                        rebootSystem("Checked scheduled range");
286                        return;
287                    }
288
289                    // Schedule next alarm either within the window or in the
290                    // next interval.
291                    if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
292                        realStartTime = computeCalendarTime(mCalendar,
293                                now + rebootIntervalMillis, rebootStartTime);
294                    } else {
295                        realStartTime = now + recheckInterval;
296                    }
297                } else {
298                    // Schedule alarm for next check interval.
299                    realStartTime = computeCalendarTime(mCalendar,
300                            now + rebootIntervalMillis, rebootStartTime);
301                }
302            }
303        }
304
305        if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
306                + ((realStartTime-now)/1000/60) + "m from now");
307        mAlarm.remove(mRebootIntent);
308        mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
309    }
310
311    /**
312     * Perform a full reboot of the system.
313     */
314    void rebootSystem(String reason) {
315        Slog.i(TAG, "Rebooting system because: " + reason);
316        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
317        pms.reboot(reason);
318    }
319
320    /**
321     * Load the current Gservices settings for when
322     * {@link #shouldWeBeBrutalLocked} will allow the brutality to happen.
323     * Must not be called with the lock held.
324     */
325    void retrieveBrutalityAmount() {
326        mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
327                : Settings.Secure.getInt(
328                mResolver, Settings.Secure.MEMCHECK_MIN_SCREEN_OFF,
329                MEMCHECK_DEFAULT_MIN_SCREEN_OFF)) * 1000;
330        mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
331                : Settings.Secure.getInt(
332                mResolver, Settings.Secure.MEMCHECK_MIN_ALARM,
333                MEMCHECK_DEFAULT_MIN_ALARM)) * 1000;
334    }
335
336    /**
337     * Determine whether it is a good time to kill, crash, or otherwise
338     * plunder the current situation for the overall long-term benefit of
339     * the world.
340     *
341     * @param curTime The current system time.
342     * @return Returns null if this is a good time, else a String with the
343     * text of why it is not a good time.
344     */
345    String shouldWeBeBrutalLocked(long curTime) {
346        if (mBattery == null || !mBattery.isPowered()) {
347            return "battery";
348        }
349
350        if (mMinScreenOff >= 0 && (mPower == null ||
351                mPower.timeSinceScreenOn() < mMinScreenOff)) {
352            return "screen";
353        }
354
355        if (mMinAlarm >= 0 && (mAlarm == null ||
356                mAlarm.timeToNextAlarm() < mMinAlarm)) {
357            return "alarm";
358        }
359
360        return null;
361    }
362
363    static long computeCalendarTime(Calendar c, long curTime,
364            long secondsSinceMidnight) {
365
366        // start with now
367        c.setTimeInMillis(curTime);
368
369        int val = (int)secondsSinceMidnight / (60*60);
370        c.set(Calendar.HOUR_OF_DAY, val);
371        secondsSinceMidnight -= val * (60*60);
372        val = (int)secondsSinceMidnight / 60;
373        c.set(Calendar.MINUTE, val);
374        c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
375        c.set(Calendar.MILLISECOND, 0);
376
377        long newTime = c.getTimeInMillis();
378        if (newTime < curTime) {
379            // The given time (in seconds since midnight) has already passed for today, so advance
380            // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
381            c.add(Calendar.DAY_OF_MONTH, 1);
382            newTime = c.getTimeInMillis();
383        }
384
385        return newTime;
386    }
387
388    @Override
389    public void run() {
390        boolean waitedHalf = false;
391        while (true) {
392            mCompleted = false;
393            mHandler.sendEmptyMessage(MONITOR);
394
395            synchronized (this) {
396                long timeout = TIME_TO_WAIT;
397
398                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
399                // wait while asleep. If the device is asleep then the thing that we are waiting
400                // to timeout on is asleep as well and won't have a chance to run, causing a false
401                // positive on when to kill things.
402                long start = SystemClock.uptimeMillis();
403                while (timeout > 0 && !mForceKillSystem) {
404                    try {
405                        wait(timeout);  // notifyAll() is called when mForceKillSystem is set
406                    } catch (InterruptedException e) {
407                        Log.wtf(TAG, e);
408                    }
409                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
410                }
411
412                if (mCompleted && !mForceKillSystem) {
413                    // The monitors have returned.
414                    waitedHalf = false;
415                    continue;
416                }
417
418                if (!waitedHalf) {
419                    // We've waited half the deadlock-detection interval.  Pull a stack
420                    // trace and wait another half.
421                    ArrayList<Integer> pids = new ArrayList<Integer>();
422                    pids.add(Process.myPid());
423                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
424                            NATIVE_STACKS_OF_INTEREST);
425                    waitedHalf = true;
426                    continue;
427                }
428            }
429
430            // If we got here, that means that the system is most likely hung.
431            // First collect stack traces from all threads of the system process.
432            // Then kill this process so that the system will restart.
433
434            final String name = (mCurrentMonitor != null) ?
435                    mCurrentMonitor.getClass().getName() : "null";
436            EventLog.writeEvent(EventLogTags.WATCHDOG, name);
437
438            ArrayList<Integer> pids = new ArrayList<Integer>();
439            pids.add(Process.myPid());
440            if (mPhonePid > 0) pids.add(mPhonePid);
441            // Pass !waitedHalf so that just in case we somehow wind up here without having
442            // dumped the halfway stacks, we properly re-initialize the trace file.
443            final File stack = ActivityManagerService.dumpStackTraces(
444                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
445
446            // Give some extra time to make sure the stack traces get written.
447            // The system's been hanging for a minute, another second or two won't hurt much.
448            SystemClock.sleep(2000);
449
450            // Pull our own kernel thread stacks as well if we're configured for that
451            if (RECORD_KERNEL_THREADS) {
452                dumpKernelStackTraces();
453            }
454
455            // Try to add the error to the dropbox, but assuming that the ActivityManager
456            // itself may be deadlocked.  (which has happened, causing this statement to
457            // deadlock and the watchdog as a whole to be ineffective)
458            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
459                    public void run() {
460                        mActivity.addErrorToDropBox(
461                                "watchdog", null, "system_server", null, null,
462                                name, null, stack, null);
463                    }
464                };
465            dropboxThread.start();
466            try {
467                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
468            } catch (InterruptedException ignored) {}
469
470            // Only kill the process if the debugger is not attached.
471            if (!Debug.isDebuggerConnected()) {
472                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
473                Process.killProcess(Process.myPid());
474                System.exit(10);
475            } else {
476                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
477            }
478
479            waitedHalf = false;
480        }
481    }
482
483    private File dumpKernelStackTraces() {
484        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
485        if (tracesPath == null || tracesPath.length() == 0) {
486            return null;
487        }
488
489        native_dumpKernelStacks(tracesPath);
490        return new File(tracesPath);
491    }
492
    // Implemented in native code. Called by dumpKernelStackTraces() with the
    // stack trace file path -- presumably writes the kernel thread stacks to
    // that file; confirm against the native implementation.
    private native void native_dumpKernelStacks(String tracesPath);
494}
495