Watchdog.java revision 9158825f9c41869689d6b1786d7c7aa8bdd524ce
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.RemoteException;
22import com.android.server.am.ActivityManagerService;
23import com.android.server.power.PowerManagerService;
24
25import android.app.AlarmManager;
26import android.app.PendingIntent;
27import android.content.BroadcastReceiver;
28import android.content.ContentResolver;
29import android.content.Context;
30import android.content.Intent;
31import android.content.IntentFilter;
32import android.os.BatteryManager;
33import android.os.Debug;
34import android.os.Handler;
35import android.os.Looper;
36import android.os.Process;
37import android.os.ServiceManager;
38import android.os.SystemClock;
39import android.os.SystemProperties;
40import android.util.EventLog;
41import android.util.Log;
42import android.util.Slog;
43
44import java.io.File;
45import java.io.FileWriter;
46import java.io.IOException;
47import java.util.ArrayList;
48import java.util.Calendar;
49
50/** This class calls its monitor every minute. Killing this process if they don't return **/
51public class Watchdog extends Thread {
52    static final String TAG = "Watchdog";
53    static final boolean localLOGV = false || false;
54
55    // Set this to true to use debug default values.
56    static final boolean DB = false;
57
58    // Set this to true to have the watchdog record kernel thread stacks when it fires
59    static final boolean RECORD_KERNEL_THREADS = true;
60
61    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
62    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
63
64    // These are temporally ordered: larger values as lateness increases
65    static final int COMPLETED = 0;
66    static final int WAITING = 1;
67    static final int WAITED_HALF = 2;
68    static final int OVERDUE = 3;
69
70    // Which native processes to dump into dropbox's stack traces
71    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
72        "/system/bin/mediaserver",
73        "/system/bin/sdcard",
74        "/system/bin/surfaceflinger"
75    };
76
77    static Watchdog sWatchdog;
78
79    /* This handler will be used to post message back onto the main thread */
80    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
81    final HandlerChecker mMonitorChecker;
82    ContentResolver mResolver;
83    ActivityManagerService mActivity;
84
85    int mPhonePid;
86    IActivityController mController;
87    boolean mAllowRestart = true;
88
89    /**
90     * Used for checking status of handle threads and scheduling monitor callbacks.
91     */
92    public final class HandlerChecker implements Runnable {
93        private final Handler mHandler;
94        private final String mName;
95        private final long mWaitMax;
96        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
97        private boolean mCompleted;
98        private Monitor mCurrentMonitor;
99        private long mStartTime;
100
101        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
102            mHandler = handler;
103            mName = name;
104            mWaitMax = waitMaxMillis;
105            mCompleted = true;
106        }
107
108        public void addMonitor(Monitor monitor) {
109            mMonitors.add(monitor);
110        }
111
112        public void scheduleCheckLocked() {
113            if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
114                // If the target looper is or just recently was idling, then
115                // there is no reason to enqueue our checker on it since that
116                // is as good as it not being deadlocked.  This avoid having
117                // to do a context switch to check the thread.  Note that we
118                // only do this if mCheckReboot is false and we have no
119                // monitors, since those would need to be executed at this point.
120                mCompleted = true;
121                return;
122            }
123
124            if (!mCompleted) {
125                // we already have a check in flight, so no need
126                return;
127            }
128
129            mCompleted = false;
130            mCurrentMonitor = null;
131            mStartTime = SystemClock.uptimeMillis();
132            mHandler.postAtFrontOfQueue(this);
133        }
134
135        public boolean isOverdueLocked() {
136            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
137        }
138
139        public int getCompletionStateLocked() {
140            if (mCompleted) {
141                return COMPLETED;
142            } else {
143                long latency = SystemClock.uptimeMillis() - mStartTime;
144                if (latency < mWaitMax/2) {
145                    return WAITING;
146                } else if (latency < mWaitMax) {
147                    return WAITED_HALF;
148                }
149            }
150            return OVERDUE;
151        }
152
153        public Thread getThread() {
154            return mHandler.getLooper().getThread();
155        }
156
157        public String getName() {
158            return mName;
159        }
160
161        public String describeBlockedStateLocked() {
162            if (mCurrentMonitor == null) {
163                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
164            } else {
165                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
166                        + " on " + mName + " (" + getThread().getName() + ")";
167            }
168        }
169
170        @Override
171        public void run() {
172            final int size = mMonitors.size();
173            for (int i = 0 ; i < size ; i++) {
174                synchronized (Watchdog.this) {
175                    mCurrentMonitor = mMonitors.get(i);
176                }
177                mCurrentMonitor.monitor();
178            }
179
180            synchronized (Watchdog.this) {
181                mCompleted = true;
182                mCurrentMonitor = null;
183            }
184        }
185    }
186
187    final class RebootRequestReceiver extends BroadcastReceiver {
188        @Override
189        public void onReceive(Context c, Intent intent) {
190            if (intent.getIntExtra("nowait", 0) != 0) {
191                rebootSystem("Received ACTION_REBOOT broadcast");
192                return;
193            }
194            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
195        }
196    }
197
198    public interface Monitor {
199        void monitor();
200    }
201
202    public static Watchdog getInstance() {
203        if (sWatchdog == null) {
204            sWatchdog = new Watchdog();
205        }
206
207        return sWatchdog;
208    }
209
210    private Watchdog() {
211        super("watchdog");
212        // Initialize handler checkers for each common thread we want to check.  Note
213        // that we are not currently checking the background thread, since it can
214        // potentially hold longer running operations with no guarantees about the timeliness
215        // of operations there.
216
217        // The shared foreground thread is the main checker.  It is where we
218        // will also dispatch monitor checks and do other work.
219        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
220                "foreground thread", DEFAULT_TIMEOUT);
221        mHandlerCheckers.add(mMonitorChecker);
222        // Add checker for main thread.  We only do a quick check since there
223        // can be UI running on the thread.
224        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
225                "main thread", DEFAULT_TIMEOUT));
226        // Add checker for shared UI thread.
227        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
228                "ui thread", DEFAULT_TIMEOUT));
229        // And also check IO thread.
230        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
231                "i/o thread", DEFAULT_TIMEOUT));
232    }
233
234    public void init(Context context, ActivityManagerService activity) {
235        mResolver = context.getContentResolver();
236        mActivity = activity;
237
238        context.registerReceiver(new RebootRequestReceiver(),
239                new IntentFilter(Intent.ACTION_REBOOT),
240                android.Manifest.permission.REBOOT, null);
241    }
242
243    public void processStarted(String name, int pid) {
244        synchronized (this) {
245            if ("com.android.phone".equals(name)) {
246                mPhonePid = pid;
247            }
248        }
249    }
250
251    public void setActivityController(IActivityController controller) {
252        synchronized (this) {
253            mController = controller;
254        }
255    }
256
257    public void setAllowRestart(boolean allowRestart) {
258        synchronized (this) {
259            mAllowRestart = allowRestart;
260        }
261    }
262
263    public void addMonitor(Monitor monitor) {
264        synchronized (this) {
265            if (isAlive()) {
266                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
267            }
268            mMonitorChecker.addMonitor(monitor);
269        }
270    }
271
272    public void addThread(Handler thread, String name) {
273        addThread(thread, name, DEFAULT_TIMEOUT);
274    }
275
276    public void addThread(Handler thread, String name, long timeoutMillis) {
277        synchronized (this) {
278            if (isAlive()) {
279                throw new RuntimeException("Threads can't be added once the Watchdog is running");
280            }
281            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
282        }
283    }
284
285    /**
286     * Perform a full reboot of the system.
287     */
288    void rebootSystem(String reason) {
289        Slog.i(TAG, "Rebooting system because: " + reason);
290        PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
291        pms.reboot(false, reason, false);
292    }
293
294    private int evaluateCheckerCompletionLocked() {
295        int state = COMPLETED;
296        for (int i=0; i<mHandlerCheckers.size(); i++) {
297            HandlerChecker hc = mHandlerCheckers.get(i);
298            state = Math.max(state, hc.getCompletionStateLocked());
299        }
300        return state;
301    }
302
303    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
304        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
305        for (int i=0; i<mHandlerCheckers.size(); i++) {
306            HandlerChecker hc = mHandlerCheckers.get(i);
307            if (hc.isOverdueLocked()) {
308                checkers.add(hc);
309            }
310        }
311        return checkers;
312    }
313
314    private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
315        StringBuilder builder = new StringBuilder(128);
316        for (int i=0; i<checkers.size(); i++) {
317            if (builder.length() > 0) {
318                builder.append(", ");
319            }
320            builder.append(checkers.get(i).describeBlockedStateLocked());
321        }
322        return builder.toString();
323    }
324
325    @Override
326    public void run() {
327        boolean waitedHalf = false;
328        while (true) {
329            final ArrayList<HandlerChecker> blockedCheckers;
330            final String subject;
331            final boolean allowRestart;
332            synchronized (this) {
333                long timeout = CHECK_INTERVAL;
334                // Make sure we (re)spin the checkers that have become idle within
335                // this wait-and-check interval
336                for (int i=0; i<mHandlerCheckers.size(); i++) {
337                    HandlerChecker hc = mHandlerCheckers.get(i);
338                    hc.scheduleCheckLocked();
339                }
340
341                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
342                // wait while asleep. If the device is asleep then the thing that we are waiting
343                // to timeout on is asleep as well and won't have a chance to run, causing a false
344                // positive on when to kill things.
345                long start = SystemClock.uptimeMillis();
346                while (timeout > 0) {
347                    try {
348                        wait(timeout);
349                    } catch (InterruptedException e) {
350                        Log.wtf(TAG, e);
351                    }
352                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
353                }
354
355                final int waitState = evaluateCheckerCompletionLocked();
356                if (waitState == COMPLETED) {
357                    // The monitors have returned; reset
358                    waitedHalf = false;
359                    continue;
360                } else if (waitState == WAITING) {
361                    // still waiting but within their configured intervals; back off and recheck
362                    continue;
363                } else if (waitState == WAITED_HALF) {
364                    if (!waitedHalf) {
365                        // We've waited half the deadlock-detection interval.  Pull a stack
366                        // trace and wait another half.
367                        ArrayList<Integer> pids = new ArrayList<Integer>();
368                        pids.add(Process.myPid());
369                        ActivityManagerService.dumpStackTraces(true, pids, null, null,
370                                NATIVE_STACKS_OF_INTEREST);
371                        waitedHalf = true;
372                    }
373                    continue;
374                }
375
376                // something is overdue!
377                blockedCheckers = getBlockedCheckersLocked();
378                subject = describeCheckersLocked(blockedCheckers);
379                allowRestart = mAllowRestart;
380            }
381
382            // If we got here, that means that the system is most likely hung.
383            // First collect stack traces from all threads of the system process.
384            // Then kill this process so that the system will restart.
385            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
386
387            ArrayList<Integer> pids = new ArrayList<Integer>();
388            pids.add(Process.myPid());
389            if (mPhonePid > 0) pids.add(mPhonePid);
390            // Pass !waitedHalf so that just in case we somehow wind up here without having
391            // dumped the halfway stacks, we properly re-initialize the trace file.
392            final File stack = ActivityManagerService.dumpStackTraces(
393                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
394
395            // Give some extra time to make sure the stack traces get written.
396            // The system's been hanging for a minute, another second or two won't hurt much.
397            SystemClock.sleep(2000);
398
399            // Pull our own kernel thread stacks as well if we're configured for that
400            if (RECORD_KERNEL_THREADS) {
401                dumpKernelStackTraces();
402            }
403
404            // Trigger the kernel to dump all blocked threads to the kernel log
405            try {
406                FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
407                sysrq_trigger.write("w");
408                sysrq_trigger.close();
409            } catch (IOException e) {
410                Slog.e(TAG, "Failed to write to /proc/sysrq-trigger");
411                Slog.e(TAG, e.getMessage());
412            }
413
414            // Try to add the error to the dropbox, but assuming that the ActivityManager
415            // itself may be deadlocked.  (which has happened, causing this statement to
416            // deadlock and the watchdog as a whole to be ineffective)
417            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
418                    public void run() {
419                        mActivity.addErrorToDropBox(
420                                "watchdog", null, "system_server", null, null,
421                                subject, null, stack, null);
422                    }
423                };
424            dropboxThread.start();
425            try {
426                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
427            } catch (InterruptedException ignored) {}
428
429            IActivityController controller;
430            synchronized (this) {
431                controller = mController;
432            }
433            if (controller != null) {
434                Slog.i(TAG, "Reporting stuck state to activity controller");
435                try {
436                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
437                    // 1 = keep waiting, -1 = kill system
438                    int res = controller.systemNotResponding(subject);
439                    if (res >= 0) {
440                        Slog.i(TAG, "Activity controller requested to coninue to wait");
441                        waitedHalf = false;
442                        continue;
443                    }
444                } catch (RemoteException e) {
445                }
446            }
447
448            // Only kill the process if the debugger is not attached.
449            if (Debug.isDebuggerConnected()) {
450                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
451            } else if (!allowRestart) {
452                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
453            } else {
454                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
455                for (int i=0; i<blockedCheckers.size(); i++) {
456                    Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
457                    StackTraceElement[] stackTrace
458                            = blockedCheckers.get(i).getThread().getStackTrace();
459                    for (StackTraceElement element: stackTrace) {
460                        Slog.w(TAG, "    at " + element);
461                    }
462                }
463                Slog.w(TAG, "*** GOODBYE!");
464                Process.killProcess(Process.myPid());
465                System.exit(10);
466            }
467
468            waitedHalf = false;
469        }
470    }
471
472    private File dumpKernelStackTraces() {
473        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
474        if (tracesPath == null || tracesPath.length() == 0) {
475            return null;
476        }
477
478        native_dumpKernelStacks(tracesPath);
479        return new File(tracesPath);
480    }
481
482    private native void native_dumpKernelStacks(String tracesPath);
483}
484