1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import com.android.server.am.ActivityManagerService;
20import com.android.server.power.PowerManagerService;
21
22import android.app.AlarmManager;
23import android.app.PendingIntent;
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.os.BatteryManager;
30import android.os.Debug;
31import android.os.Handler;
32import android.os.Message;
33import android.os.Process;
34import android.os.ServiceManager;
35import android.os.SystemClock;
36import android.os.SystemProperties;
37import android.util.EventLog;
38import android.util.Log;
39import android.util.Slog;
40
41import java.io.File;
42import java.util.ArrayList;
43import java.util.Calendar;
44
/**
 * This class asks its monitors to check in every TIME_TO_WAIT milliseconds and
 * kills this process if they do not return within TIME_TO_RESTART.
 */
46public class Watchdog extends Thread {
    // Tag used for all log output from this class.
    static final String TAG = "Watchdog";
    // Gates the verbose Slog.v() diagnostics in this class.
    static final boolean localLOGV = false || false;

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    // Message code posted to the handler to run one pass over the registered monitors.
    static final int MONITOR = 2718;

    // Milliseconds. run() waits TIME_TO_WAIT per cycle and only kills the process
    // after two consecutive cycles without completion, i.e. after TIME_TO_RESTART.
    static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
    static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;

    // Seconds (converted to milliseconds where they are used); 1 minute each when DB is set.
    static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
    static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
    static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes

    // Interval is in days (checkReboot() multiplies it by 24*60*60*1000); 0 disables
    // forced reboots. Start time is seconds since midnight; window is in seconds.
    static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
    static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
    static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour

    // Action of the alarm broadcast that triggers a reboot check (see RebootReceiver).
    static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";

    // Native executables passed to ActivityManagerService.dumpStackTraces() by run().
    static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger"
    };
76
    // Singleton instance, created lazily by getInstance().
    static Watchdog sWatchdog;

    /* This handler will be used to post message back onto the main thread */
    final Handler mHandler;
    // Monitors invoked, in registration order, on every MONITOR message.
    final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    // Collaborators supplied through init().
    ContentResolver mResolver;
    BatteryService mBattery;
    PowerManagerService mPower;
    AlarmManagerService mAlarm;
    ActivityManagerService mActivity;
    // Cleared by run() at the start of each cycle; set by the handler once every
    // monitor has returned.
    boolean mCompleted;
    // Checked by run(): when set, the wait loop exits and the kill path is taken.
    // NOTE(review): no writer of this flag is visible in this file.
    boolean mForceKillSystem;
    // Monitor currently being executed by the handler, or null when none is running;
    // run() uses it to name the suspected culprit.
    Monitor mCurrentMonitor;

    // Pid reported for "com.android.phone" via processStarted(); its stacks are
    // dumped together with our own when the watchdog fires.
    int mPhonePid;

    // Scratch calendar handed to computeCalendarTime().
    final Calendar mCalendar = Calendar.getInstance();
    // NOTE: initialised from defaults expressed in seconds, but
    // retrieveBrutalityAmount() overwrites both with millisecond values.
    int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
    int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
    // NOTE(review): mNeedScheduledCheck and mCheckupIntent are not referenced
    // anywhere in the visible part of this file.
    boolean mNeedScheduledCheck;
    PendingIntent mCheckupIntent;
    // Broadcast PendingIntent for REBOOT_ACTION, created in init() and armed by checkReboot().
    PendingIntent mRebootIntent;

    // Wall-clock time (System.currentTimeMillis()) captured in init().
    long mBootTime;
    // Reboot interval, in days, that checkReboot() last acted upon.
    int mRebootInterval;

    // Values requested through RebootRequestReceiver; -1 means "use the default".
    boolean mReqRebootNoWait;     // true to skip waiting a full interval before considering a reboot
    int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
    int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
    int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
    int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
    int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
    int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested
110
111    /**
112     * Used for scheduling monitor callbacks and checking memory usage.
113     */
114    final class HeartbeatHandler extends Handler {
115        @Override
116        public void handleMessage(Message msg) {
117            switch (msg.what) {
118                case MONITOR: {
119                    // See if we should force a reboot.
120                    int rebootInterval = mReqRebootInterval >= 0
121                            ? mReqRebootInterval : REBOOT_DEFAULT_INTERVAL;
122                    if (mRebootInterval != rebootInterval) {
123                        mRebootInterval = rebootInterval;
124                        // We have been running long enough that a reboot can
125                        // be considered...
126                        checkReboot(false);
127                    }
128
129                    final int size = mMonitors.size();
130                    for (int i = 0 ; i < size ; i++) {
131                        mCurrentMonitor = mMonitors.get(i);
132                        mCurrentMonitor.monitor();
133                    }
134
135                    synchronized (Watchdog.this) {
136                        mCompleted = true;
137                        mCurrentMonitor = null;
138                    }
139                } break;
140            }
141        }
142    }
143
144    final class RebootReceiver extends BroadcastReceiver {
145        @Override
146        public void onReceive(Context c, Intent intent) {
147            if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
148            checkReboot(true);
149        }
150    }
151
152    final class RebootRequestReceiver extends BroadcastReceiver {
153        @Override
154        public void onReceive(Context c, Intent intent) {
155            mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
156            mReqRebootInterval = intent.getIntExtra("interval", -1);
157            mReqRebootStartTime = intent.getIntExtra("startTime", -1);
158            mReqRebootWindow = intent.getIntExtra("window", -1);
159            mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
160            mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
161            mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
162            EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
163                    mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
164                            mReqRecheckInterval, mReqRebootStartTime,
165                    mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
166            checkReboot(true);
167        }
168    }
169
    /**
     * Callback registered through {@link Watchdog#addMonitor}. The heartbeat
     * handler invokes every registered monitor on each MONITOR pass.
     */
    public interface Monitor {
        /**
         * Invoked once per watchdog pass. If this call does not return, the
         * pass is never marked complete and the watchdog thread eventually
         * treats the process as hung.
         */
        void monitor();
    }
173
174    public static Watchdog getInstance() {
175        if (sWatchdog == null) {
176            sWatchdog = new Watchdog();
177        }
178
179        return sWatchdog;
180    }
181
    /**
     * Creates the watchdog thread (named "watchdog") together with the handler
     * on which the monitors will be run. Private: use {@link #getInstance()}.
     */
    private Watchdog() {
        super("watchdog");
        mHandler = new HeartbeatHandler();
    }
186
187    public void init(Context context, BatteryService battery,
188            PowerManagerService power, AlarmManagerService alarm,
189            ActivityManagerService activity) {
190        mResolver = context.getContentResolver();
191        mBattery = battery;
192        mPower = power;
193        mAlarm = alarm;
194        mActivity = activity;
195
196        context.registerReceiver(new RebootReceiver(),
197                new IntentFilter(REBOOT_ACTION));
198        mRebootIntent = PendingIntent.getBroadcast(context,
199                0, new Intent(REBOOT_ACTION), 0);
200
201        context.registerReceiver(new RebootRequestReceiver(),
202                new IntentFilter(Intent.ACTION_REBOOT),
203                android.Manifest.permission.REBOOT, null);
204
205        mBootTime = System.currentTimeMillis();
206    }
207
208    public void processStarted(String name, int pid) {
209        synchronized (this) {
210            if ("com.android.phone".equals(name)) {
211                mPhonePid = pid;
212            }
213        }
214    }
215
216    public void addMonitor(Monitor monitor) {
217        synchronized (this) {
218            if (isAlive()) {
219                throw new RuntimeException("Monitors can't be added while the Watchdog is running");
220            }
221            mMonitors.add(monitor);
222        }
223    }
224
225    void checkReboot(boolean fromAlarm) {
226        int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
227                : REBOOT_DEFAULT_INTERVAL;
228        mRebootInterval = rebootInterval;
229        if (rebootInterval <= 0) {
230            // No reboot interval requested.
231            if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
232            mAlarm.remove(mRebootIntent);
233            return;
234        }
235
236        long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
237                : REBOOT_DEFAULT_START_TIME;
238        long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
239                : REBOOT_DEFAULT_WINDOW) * 1000;
240        long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
241                : MEMCHECK_DEFAULT_RECHECK_INTERVAL) * 1000;
242
243        retrieveBrutalityAmount();
244
245        long realStartTime;
246        long now;
247
248        synchronized (this) {
249            now = System.currentTimeMillis();
250            realStartTime = computeCalendarTime(mCalendar, now,
251                    rebootStartTime);
252
253            long rebootIntervalMillis = rebootInterval*24*60*60*1000;
254            if (DB || mReqRebootNoWait ||
255                    (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
256                if (fromAlarm && rebootWindowMillis <= 0) {
257                    // No reboot window -- just immediately reboot.
258                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
259                            (int)rebootIntervalMillis, (int)rebootStartTime*1000,
260                            (int)rebootWindowMillis, "");
261                    rebootSystem("Checkin scheduled forced");
262                    return;
263                }
264
265                // Are we within the reboot window?
266                if (now < realStartTime) {
267                    // Schedule alarm for next check interval.
268                    realStartTime = computeCalendarTime(mCalendar,
269                            now, rebootStartTime);
270                } else if (now < (realStartTime+rebootWindowMillis)) {
271                    String doit = shouldWeBeBrutalLocked(now);
272                    EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
273                            (int)rebootInterval, (int)rebootStartTime*1000,
274                            (int)rebootWindowMillis, doit != null ? doit : "");
275                    if (doit == null) {
276                        rebootSystem("Checked scheduled range");
277                        return;
278                    }
279
280                    // Schedule next alarm either within the window or in the
281                    // next interval.
282                    if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
283                        realStartTime = computeCalendarTime(mCalendar,
284                                now + rebootIntervalMillis, rebootStartTime);
285                    } else {
286                        realStartTime = now + recheckInterval;
287                    }
288                } else {
289                    // Schedule alarm for next check interval.
290                    realStartTime = computeCalendarTime(mCalendar,
291                            now + rebootIntervalMillis, rebootStartTime);
292                }
293            }
294        }
295
296        if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
297                + ((realStartTime-now)/1000/60) + "m from now");
298        mAlarm.remove(mRebootIntent);
299        mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
300    }
301
302    /**
303     * Perform a full reboot of the system.
304     */
305    void rebootSystem(String reason) {
306        Slog.i(TAG, "Rebooting system because: " + reason);
307        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
308        pms.reboot(false, reason, false);
309    }
310
311    /**
312     * Load the current Gservices settings for when
313     * {@link #shouldWeBeBrutalLocked} will allow the brutality to happen.
314     * Must not be called with the lock held.
315     */
316    void retrieveBrutalityAmount() {
317        mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
318                : MEMCHECK_DEFAULT_MIN_SCREEN_OFF) * 1000;
319        mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
320                : MEMCHECK_DEFAULT_MIN_ALARM) * 1000;
321    }
322
323    /**
324     * Determine whether it is a good time to kill, crash, or otherwise
325     * plunder the current situation for the overall long-term benefit of
326     * the world.
327     *
328     * @param curTime The current system time.
329     * @return Returns null if this is a good time, else a String with the
330     * text of why it is not a good time.
331     */
332    String shouldWeBeBrutalLocked(long curTime) {
333        if (mBattery == null || !mBattery.isPowered(BatteryManager.BATTERY_PLUGGED_ANY)) {
334            return "battery";
335        }
336
337        if (mMinScreenOff >= 0 && (mPower == null ||
338                mPower.timeSinceScreenWasLastOn() < mMinScreenOff)) {
339            return "screen";
340        }
341
342        if (mMinAlarm >= 0 && (mAlarm == null ||
343                mAlarm.timeToNextAlarm() < mMinAlarm)) {
344            return "alarm";
345        }
346
347        return null;
348    }
349
350    static long computeCalendarTime(Calendar c, long curTime,
351            long secondsSinceMidnight) {
352
353        // start with now
354        c.setTimeInMillis(curTime);
355
356        int val = (int)secondsSinceMidnight / (60*60);
357        c.set(Calendar.HOUR_OF_DAY, val);
358        secondsSinceMidnight -= val * (60*60);
359        val = (int)secondsSinceMidnight / 60;
360        c.set(Calendar.MINUTE, val);
361        c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
362        c.set(Calendar.MILLISECOND, 0);
363
364        long newTime = c.getTimeInMillis();
365        if (newTime < curTime) {
366            // The given time (in seconds since midnight) has already passed for today, so advance
367            // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
368            c.add(Calendar.DAY_OF_MONTH, 1);
369            newTime = c.getTimeInMillis();
370        }
371
372        return newTime;
373    }
374
    /**
     * Main loop of the watchdog thread.
     *
     * <p>Each cycle clears the completion flag, posts a MONITOR message to the
     * handler and waits up to TIME_TO_WAIT milliseconds of uptime. If the
     * monitors completed, the next cycle starts. On the first missed cycle a
     * stack trace is dumped and the wait is repeated; on a second consecutive
     * miss the process is treated as hung: stacks are dumped, a dropbox entry
     * is attempted, and the process is killed unless a debugger is attached.
     */
    @Override
    public void run() {
        // True once one full wait has elapsed without the monitors completing.
        boolean waitedHalf = false;
        while (true) {
            mCompleted = false;
            mHandler.sendEmptyMessage(MONITOR);

            synchronized (this) {
                long timeout = TIME_TO_WAIT;

                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                // wait while asleep. If the device is asleep then the thing that we are waiting
                // to timeout on is asleep as well and won't have a chance to run, causing a false
                // positive on when to kill things.
                long start = SystemClock.uptimeMillis();
                while (timeout > 0 && !mForceKillSystem) {
                    try {
                        wait(timeout);  // notifyAll() is called when mForceKillSystem is set
                    } catch (InterruptedException e) {
                        Log.wtf(TAG, e);
                    }
                    // Recompute the remaining time so that early wake-ups do not
                    // shorten the overall wait.
                    timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
                }

                if (mCompleted && !mForceKillSystem) {
                    // The monitors have returned.
                    waitedHalf = false;
                    continue;
                }

                if (!waitedHalf) {
                    // We've waited half the deadlock-detection interval.  Pull a stack
                    // trace and wait another half.
                    ArrayList<Integer> pids = new ArrayList<Integer>();
                    pids.add(Process.myPid());
                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
                            NATIVE_STACKS_OF_INTEREST);
                    waitedHalf = true;
                    continue;
                }
            }

            // If we got here, that means that the system is most likely hung.
            // First collect stack traces from all threads of the system process.
            // Then kill this process so that the system will restart.

            // Name of the monitor that was executing, if any, used as the culprit.
            final String name = (mCurrentMonitor != null) ?
                    mCurrentMonitor.getClass().getName() : "null";
            EventLog.writeEvent(EventLogTags.WATCHDOG, name);

            ArrayList<Integer> pids = new ArrayList<Integer>();
            pids.add(Process.myPid());
            if (mPhonePid > 0) pids.add(mPhonePid);
            // Pass !waitedHalf so that just in case we somehow wind up here without having
            // dumped the halfway stacks, we properly re-initialize the trace file.
            final File stack = ActivityManagerService.dumpStackTraces(
                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);

            // Give some extra time to make sure the stack traces get written.
            // The system's been hanging for a minute, another second or two won't hurt much.
            SystemClock.sleep(2000);

            // Pull our own kernel thread stacks as well if we're configured for that
            if (RECORD_KERNEL_THREADS) {
                dumpKernelStackTraces();
            }

            // Try to add the error to the dropbox, but assuming that the ActivityManager
            // itself may be deadlocked.  (which has happened, causing this statement to
            // deadlock and the watchdog as a whole to be ineffective)
            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
                    public void run() {
                        mActivity.addErrorToDropBox(
                                "watchdog", null, "system_server", null, null,
                                name, null, stack, null);
                    }
                };
            dropboxThread.start();
            try {
                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
            } catch (InterruptedException ignored) {}

            // Only kill the process if the debugger is not attached.
            if (!Debug.isDebuggerConnected()) {
                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
                Process.killProcess(Process.myPid());
                System.exit(10);
            } else {
                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
            }

            // Only reached when a debugger kept us alive: start a fresh detection cycle.
            waitedHalf = false;
        }
    }
469
470    private File dumpKernelStackTraces() {
471        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
472        if (tracesPath == null || tracesPath.length() == 0) {
473            return null;
474        }
475
476        native_dumpKernelStacks(tracesPath);
477        return new File(tracesPath);
478    }
479
    // Native hook called by dumpKernelStackTraces() with the trace file path;
    // implemented outside this file. Presumably it appends kernel thread stacks
    // to that file -- confirm against the JNI implementation.
    private native void native_dumpKernelStacks(String tracesPath);
481}
482