1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.RemoteException;
22import com.android.server.am.ActivityManagerService;
23
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.os.Debug;
30import android.os.Handler;
31import android.os.IPowerManager;
32import android.os.Looper;
33import android.os.Process;
34import android.os.ServiceManager;
35import android.os.SystemClock;
36import android.os.SystemProperties;
37import android.util.EventLog;
38import android.util.Log;
39import android.util.Slog;
40
41import java.io.File;
42import java.io.FileWriter;
43import java.io.IOException;
44import java.util.ArrayList;
45
46/** This class calls its monitor every minute. Killing this process if they don't return **/
47public class Watchdog extends Thread {
48    static final String TAG = "Watchdog";
49    static final boolean localLOGV = false || false;
50
51    // Set this to true to use debug default values.
52    static final boolean DB = false;
53
54    // Set this to true to have the watchdog record kernel thread stacks when it fires
55    static final boolean RECORD_KERNEL_THREADS = true;
56
57    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
58    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
59
60    // These are temporally ordered: larger values as lateness increases
61    static final int COMPLETED = 0;
62    static final int WAITING = 1;
63    static final int WAITED_HALF = 2;
64    static final int OVERDUE = 3;
65
66    // Which native processes to dump into dropbox's stack traces
67    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
68        "/system/bin/mediaserver",
69        "/system/bin/sdcard",
70        "/system/bin/surfaceflinger"
71    };
72
73    static Watchdog sWatchdog;
74
75    /* This handler will be used to post message back onto the main thread */
76    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
77    final HandlerChecker mMonitorChecker;
78    ContentResolver mResolver;
79    ActivityManagerService mActivity;
80
81    int mPhonePid;
82    IActivityController mController;
83    boolean mAllowRestart = true;
84
85    /**
86     * Used for checking status of handle threads and scheduling monitor callbacks.
87     */
88    public final class HandlerChecker implements Runnable {
89        private final Handler mHandler;
90        private final String mName;
91        private final long mWaitMax;
92        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
93        private boolean mCompleted;
94        private Monitor mCurrentMonitor;
95        private long mStartTime;
96
97        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
98            mHandler = handler;
99            mName = name;
100            mWaitMax = waitMaxMillis;
101            mCompleted = true;
102        }
103
104        public void addMonitor(Monitor monitor) {
105            mMonitors.add(monitor);
106        }
107
108        public void scheduleCheckLocked() {
109            if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
110                // If the target looper is or just recently was idling, then
111                // there is no reason to enqueue our checker on it since that
112                // is as good as it not being deadlocked.  This avoid having
113                // to do a context switch to check the thread.  Note that we
114                // only do this if mCheckReboot is false and we have no
115                // monitors, since those would need to be executed at this point.
116                mCompleted = true;
117                return;
118            }
119
120            if (!mCompleted) {
121                // we already have a check in flight, so no need
122                return;
123            }
124
125            mCompleted = false;
126            mCurrentMonitor = null;
127            mStartTime = SystemClock.uptimeMillis();
128            mHandler.postAtFrontOfQueue(this);
129        }
130
131        public boolean isOverdueLocked() {
132            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
133        }
134
135        public int getCompletionStateLocked() {
136            if (mCompleted) {
137                return COMPLETED;
138            } else {
139                long latency = SystemClock.uptimeMillis() - mStartTime;
140                if (latency < mWaitMax/2) {
141                    return WAITING;
142                } else if (latency < mWaitMax) {
143                    return WAITED_HALF;
144                }
145            }
146            return OVERDUE;
147        }
148
149        public Thread getThread() {
150            return mHandler.getLooper().getThread();
151        }
152
153        public String getName() {
154            return mName;
155        }
156
157        public String describeBlockedStateLocked() {
158            if (mCurrentMonitor == null) {
159                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
160            } else {
161                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
162                        + " on " + mName + " (" + getThread().getName() + ")";
163            }
164        }
165
166        @Override
167        public void run() {
168            final int size = mMonitors.size();
169            for (int i = 0 ; i < size ; i++) {
170                synchronized (Watchdog.this) {
171                    mCurrentMonitor = mMonitors.get(i);
172                }
173                mCurrentMonitor.monitor();
174            }
175
176            synchronized (Watchdog.this) {
177                mCompleted = true;
178                mCurrentMonitor = null;
179            }
180        }
181    }
182
183    final class RebootRequestReceiver extends BroadcastReceiver {
184        @Override
185        public void onReceive(Context c, Intent intent) {
186            if (intent.getIntExtra("nowait", 0) != 0) {
187                rebootSystem("Received ACTION_REBOOT broadcast");
188                return;
189            }
190            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
191        }
192    }
193
194    public interface Monitor {
195        void monitor();
196    }
197
198    public static Watchdog getInstance() {
199        if (sWatchdog == null) {
200            sWatchdog = new Watchdog();
201        }
202
203        return sWatchdog;
204    }
205
206    private Watchdog() {
207        super("watchdog");
208        // Initialize handler checkers for each common thread we want to check.  Note
209        // that we are not currently checking the background thread, since it can
210        // potentially hold longer running operations with no guarantees about the timeliness
211        // of operations there.
212
213        // The shared foreground thread is the main checker.  It is where we
214        // will also dispatch monitor checks and do other work.
215        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
216                "foreground thread", DEFAULT_TIMEOUT);
217        mHandlerCheckers.add(mMonitorChecker);
218        // Add checker for main thread.  We only do a quick check since there
219        // can be UI running on the thread.
220        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
221                "main thread", DEFAULT_TIMEOUT));
222        // Add checker for shared UI thread.
223        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
224                "ui thread", DEFAULT_TIMEOUT));
225        // And also check IO thread.
226        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
227                "i/o thread", DEFAULT_TIMEOUT));
228        // And the display thread.
229        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
230                "display thread", DEFAULT_TIMEOUT));
231    }
232
233    public void init(Context context, ActivityManagerService activity) {
234        mResolver = context.getContentResolver();
235        mActivity = activity;
236
237        context.registerReceiver(new RebootRequestReceiver(),
238                new IntentFilter(Intent.ACTION_REBOOT),
239                android.Manifest.permission.REBOOT, null);
240    }
241
242    public void processStarted(String name, int pid) {
243        synchronized (this) {
244            if ("com.android.phone".equals(name)) {
245                mPhonePid = pid;
246            }
247        }
248    }
249
250    public void setActivityController(IActivityController controller) {
251        synchronized (this) {
252            mController = controller;
253        }
254    }
255
256    public void setAllowRestart(boolean allowRestart) {
257        synchronized (this) {
258            mAllowRestart = allowRestart;
259        }
260    }
261
262    public void addMonitor(Monitor monitor) {
263        synchronized (this) {
264            if (isAlive()) {
265                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
266            }
267            mMonitorChecker.addMonitor(monitor);
268        }
269    }
270
271    public void addThread(Handler thread) {
272        addThread(thread, DEFAULT_TIMEOUT);
273    }
274
275    public void addThread(Handler thread, long timeoutMillis) {
276        synchronized (this) {
277            if (isAlive()) {
278                throw new RuntimeException("Threads can't be added once the Watchdog is running");
279            }
280            final String name = thread.getLooper().getThread().getName();
281            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
282        }
283    }
284
285    /**
286     * Perform a full reboot of the system.
287     */
288    void rebootSystem(String reason) {
289        Slog.i(TAG, "Rebooting system because: " + reason);
290        IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
291        try {
292            pms.reboot(false, reason, false);
293        } catch (RemoteException ex) {
294        }
295    }
296
297    private int evaluateCheckerCompletionLocked() {
298        int state = COMPLETED;
299        for (int i=0; i<mHandlerCheckers.size(); i++) {
300            HandlerChecker hc = mHandlerCheckers.get(i);
301            state = Math.max(state, hc.getCompletionStateLocked());
302        }
303        return state;
304    }
305
306    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
307        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
308        for (int i=0; i<mHandlerCheckers.size(); i++) {
309            HandlerChecker hc = mHandlerCheckers.get(i);
310            if (hc.isOverdueLocked()) {
311                checkers.add(hc);
312            }
313        }
314        return checkers;
315    }
316
317    private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
318        StringBuilder builder = new StringBuilder(128);
319        for (int i=0; i<checkers.size(); i++) {
320            if (builder.length() > 0) {
321                builder.append(", ");
322            }
323            builder.append(checkers.get(i).describeBlockedStateLocked());
324        }
325        return builder.toString();
326    }
327
328    @Override
329    public void run() {
330        boolean waitedHalf = false;
331        while (true) {
332            final ArrayList<HandlerChecker> blockedCheckers;
333            final String subject;
334            final boolean allowRestart;
335            int debuggerWasConnected = 0;
336            synchronized (this) {
337                long timeout = CHECK_INTERVAL;
338                // Make sure we (re)spin the checkers that have become idle within
339                // this wait-and-check interval
340                for (int i=0; i<mHandlerCheckers.size(); i++) {
341                    HandlerChecker hc = mHandlerCheckers.get(i);
342                    hc.scheduleCheckLocked();
343                }
344
345                if (debuggerWasConnected > 0) {
346                    debuggerWasConnected--;
347                }
348
349                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
350                // wait while asleep. If the device is asleep then the thing that we are waiting
351                // to timeout on is asleep as well and won't have a chance to run, causing a false
352                // positive on when to kill things.
353                long start = SystemClock.uptimeMillis();
354                while (timeout > 0) {
355                    if (Debug.isDebuggerConnected()) {
356                        debuggerWasConnected = 2;
357                    }
358                    try {
359                        wait(timeout);
360                    } catch (InterruptedException e) {
361                        Log.wtf(TAG, e);
362                    }
363                    if (Debug.isDebuggerConnected()) {
364                        debuggerWasConnected = 2;
365                    }
366                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
367                }
368
369                final int waitState = evaluateCheckerCompletionLocked();
370                if (waitState == COMPLETED) {
371                    // The monitors have returned; reset
372                    waitedHalf = false;
373                    continue;
374                } else if (waitState == WAITING) {
375                    // still waiting but within their configured intervals; back off and recheck
376                    continue;
377                } else if (waitState == WAITED_HALF) {
378                    if (!waitedHalf) {
379                        // We've waited half the deadlock-detection interval.  Pull a stack
380                        // trace and wait another half.
381                        ArrayList<Integer> pids = new ArrayList<Integer>();
382                        pids.add(Process.myPid());
383                        ActivityManagerService.dumpStackTraces(true, pids, null, null,
384                                NATIVE_STACKS_OF_INTEREST);
385                        waitedHalf = true;
386                    }
387                    continue;
388                }
389
390                // something is overdue!
391                blockedCheckers = getBlockedCheckersLocked();
392                subject = describeCheckersLocked(blockedCheckers);
393                allowRestart = mAllowRestart;
394            }
395
396            // If we got here, that means that the system is most likely hung.
397            // First collect stack traces from all threads of the system process.
398            // Then kill this process so that the system will restart.
399            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
400
401            ArrayList<Integer> pids = new ArrayList<Integer>();
402            pids.add(Process.myPid());
403            if (mPhonePid > 0) pids.add(mPhonePid);
404            // Pass !waitedHalf so that just in case we somehow wind up here without having
405            // dumped the halfway stacks, we properly re-initialize the trace file.
406            final File stack = ActivityManagerService.dumpStackTraces(
407                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
408
409            // Give some extra time to make sure the stack traces get written.
410            // The system's been hanging for a minute, another second or two won't hurt much.
411            SystemClock.sleep(2000);
412
413            // Pull our own kernel thread stacks as well if we're configured for that
414            if (RECORD_KERNEL_THREADS) {
415                dumpKernelStackTraces();
416            }
417
418            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
419            doSysRq('w');
420            doSysRq('l');
421
422            // Try to add the error to the dropbox, but assuming that the ActivityManager
423            // itself may be deadlocked.  (which has happened, causing this statement to
424            // deadlock and the watchdog as a whole to be ineffective)
425            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
426                    public void run() {
427                        mActivity.addErrorToDropBox(
428                                "watchdog", null, "system_server", null, null,
429                                subject, null, stack, null);
430                    }
431                };
432            dropboxThread.start();
433            try {
434                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
435            } catch (InterruptedException ignored) {}
436
437            IActivityController controller;
438            synchronized (this) {
439                controller = mController;
440            }
441            if (controller != null) {
442                Slog.i(TAG, "Reporting stuck state to activity controller");
443                try {
444                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
445                    // 1 = keep waiting, -1 = kill system
446                    int res = controller.systemNotResponding(subject);
447                    if (res >= 0) {
448                        Slog.i(TAG, "Activity controller requested to coninue to wait");
449                        waitedHalf = false;
450                        continue;
451                    }
452                } catch (RemoteException e) {
453                }
454            }
455
456            // Only kill the process if the debugger is not attached.
457            if (Debug.isDebuggerConnected()) {
458                debuggerWasConnected = 2;
459            }
460            if (debuggerWasConnected >= 2) {
461                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
462            } else if (debuggerWasConnected > 0) {
463                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
464            } else if (!allowRestart) {
465                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
466            } else {
467                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
468                for (int i=0; i<blockedCheckers.size(); i++) {
469                    Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
470                    StackTraceElement[] stackTrace
471                            = blockedCheckers.get(i).getThread().getStackTrace();
472                    for (StackTraceElement element: stackTrace) {
473                        Slog.w(TAG, "    at " + element);
474                    }
475                }
476                Slog.w(TAG, "*** GOODBYE!");
477                Process.killProcess(Process.myPid());
478                System.exit(10);
479            }
480
481            waitedHalf = false;
482        }
483    }
484
485    private void doSysRq(char c) {
486        try {
487            FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
488            sysrq_trigger.write(c);
489            sysrq_trigger.close();
490        } catch (IOException e) {
491            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
492        }
493    }
494
495    private File dumpKernelStackTraces() {
496        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
497        if (tracesPath == null || tracesPath.length() == 0) {
498            return null;
499        }
500
501        native_dumpKernelStacks(tracesPath);
502        return new File(tracesPath);
503    }
504
505    private native void native_dumpKernelStacks(String tracesPath);
506}
507