1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.RemoteException;
22import com.android.server.am.ActivityManagerService;
23
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.os.Debug;
30import android.os.Handler;
31import android.os.IPowerManager;
32import android.os.Looper;
33import android.os.Process;
34import android.os.ServiceManager;
35import android.os.SystemClock;
36import android.os.SystemProperties;
37import android.util.EventLog;
38import android.util.Log;
39import android.util.Slog;
40
41import java.io.File;
42import java.io.FileWriter;
43import java.io.IOException;
44import java.util.ArrayList;
45
/**
 * Watchdog thread that periodically checks a set of handler threads and their monitors,
 * and kills this process if any of them fails to respond within its timeout.
 */
47public class Watchdog extends Thread {
48    static final String TAG = "Watchdog";
49
50    // Set this to true to use debug default values.
51    static final boolean DB = false;
52
53    // Set this to true to have the watchdog record kernel thread stacks when it fires
54    static final boolean RECORD_KERNEL_THREADS = true;
55
56    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
57    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
58
59    // These are temporally ordered: larger values as lateness increases
60    static final int COMPLETED = 0;
61    static final int WAITING = 1;
62    static final int WAITED_HALF = 2;
63    static final int OVERDUE = 3;
64
65    // Which native processes to dump into dropbox's stack traces
66    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
67        "/system/bin/audioserver",
68        "/system/bin/cameraserver",
69        "/system/bin/drmserver",
70        "/system/bin/mediadrmserver",
71        "/system/bin/mediaserver",
72        "/system/bin/sdcard",
73        "/system/bin/surfaceflinger",
74        "media.codec",     // system/bin/mediacodec
75        "media.extractor", // system/bin/mediaextractor
76        "com.android.bluetooth",  // Bluetooth service
77    };
78
79    static Watchdog sWatchdog;
80
    /* Checkers for every handler thread being watched; this list includes mMonitorChecker. */
82    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
83    final HandlerChecker mMonitorChecker;
84    ContentResolver mResolver;
85    ActivityManagerService mActivity;
86
87    int mPhonePid;
88    IActivityController mController;
89    boolean mAllowRestart = true;
90
91    /**
92     * Used for checking status of handle threads and scheduling monitor callbacks.
93     */
94    public final class HandlerChecker implements Runnable {
95        private final Handler mHandler;
96        private final String mName;
97        private final long mWaitMax;
98        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
99        private boolean mCompleted;
100        private Monitor mCurrentMonitor;
101        private long mStartTime;
102
103        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
104            mHandler = handler;
105            mName = name;
106            mWaitMax = waitMaxMillis;
107            mCompleted = true;
108        }
109
110        public void addMonitor(Monitor monitor) {
111            mMonitors.add(monitor);
112        }
113
114        public void scheduleCheckLocked() {
115            if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
116                // If the target looper has recently been polling, then
117                // there is no reason to enqueue our checker on it since that
118                // is as good as it not being deadlocked.  This avoid having
119                // to do a context switch to check the thread.  Note that we
120                // only do this if mCheckReboot is false and we have no
121                // monitors, since those would need to be executed at this point.
122                mCompleted = true;
123                return;
124            }
125
126            if (!mCompleted) {
127                // we already have a check in flight, so no need
128                return;
129            }
130
131            mCompleted = false;
132            mCurrentMonitor = null;
133            mStartTime = SystemClock.uptimeMillis();
134            mHandler.postAtFrontOfQueue(this);
135        }
136
137        public boolean isOverdueLocked() {
138            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
139        }
140
141        public int getCompletionStateLocked() {
142            if (mCompleted) {
143                return COMPLETED;
144            } else {
145                long latency = SystemClock.uptimeMillis() - mStartTime;
146                if (latency < mWaitMax/2) {
147                    return WAITING;
148                } else if (latency < mWaitMax) {
149                    return WAITED_HALF;
150                }
151            }
152            return OVERDUE;
153        }
154
155        public Thread getThread() {
156            return mHandler.getLooper().getThread();
157        }
158
159        public String getName() {
160            return mName;
161        }
162
163        public String describeBlockedStateLocked() {
164            if (mCurrentMonitor == null) {
165                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
166            } else {
167                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
168                        + " on " + mName + " (" + getThread().getName() + ")";
169            }
170        }
171
172        @Override
173        public void run() {
174            final int size = mMonitors.size();
175            for (int i = 0 ; i < size ; i++) {
176                synchronized (Watchdog.this) {
177                    mCurrentMonitor = mMonitors.get(i);
178                }
179                mCurrentMonitor.monitor();
180            }
181
182            synchronized (Watchdog.this) {
183                mCompleted = true;
184                mCurrentMonitor = null;
185            }
186        }
187    }
188
189    final class RebootRequestReceiver extends BroadcastReceiver {
190        @Override
191        public void onReceive(Context c, Intent intent) {
192            if (intent.getIntExtra("nowait", 0) != 0) {
193                rebootSystem("Received ACTION_REBOOT broadcast");
194                return;
195            }
196            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
197        }
198    }
199
200    /** Monitor for checking the availability of binder threads. The monitor will block until
201     * there is a binder thread available to process in coming IPCs to make sure other processes
202     * can still communicate with the service.
203     */
204    private static final class BinderThreadMonitor implements Watchdog.Monitor {
205        @Override
206        public void monitor() {
207            Binder.blockUntilThreadAvailable();
208        }
209    }
210
211    public interface Monitor {
212        void monitor();
213    }
214
215    public static Watchdog getInstance() {
216        if (sWatchdog == null) {
217            sWatchdog = new Watchdog();
218        }
219
220        return sWatchdog;
221    }
222
223    private Watchdog() {
224        super("watchdog");
225        // Initialize handler checkers for each common thread we want to check.  Note
226        // that we are not currently checking the background thread, since it can
227        // potentially hold longer running operations with no guarantees about the timeliness
228        // of operations there.
229
230        // The shared foreground thread is the main checker.  It is where we
231        // will also dispatch monitor checks and do other work.
232        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
233                "foreground thread", DEFAULT_TIMEOUT);
234        mHandlerCheckers.add(mMonitorChecker);
235        // Add checker for main thread.  We only do a quick check since there
236        // can be UI running on the thread.
237        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
238                "main thread", DEFAULT_TIMEOUT));
239        // Add checker for shared UI thread.
240        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
241                "ui thread", DEFAULT_TIMEOUT));
242        // And also check IO thread.
243        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
244                "i/o thread", DEFAULT_TIMEOUT));
245        // And the display thread.
246        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
247                "display thread", DEFAULT_TIMEOUT));
248
249        // Initialize monitor for Binder threads.
250        addMonitor(new BinderThreadMonitor());
251    }
252
253    public void init(Context context, ActivityManagerService activity) {
254        mResolver = context.getContentResolver();
255        mActivity = activity;
256
257        context.registerReceiver(new RebootRequestReceiver(),
258                new IntentFilter(Intent.ACTION_REBOOT),
259                android.Manifest.permission.REBOOT, null);
260    }
261
262    public void processStarted(String name, int pid) {
263        synchronized (this) {
264            if ("com.android.phone".equals(name)) {
265                mPhonePid = pid;
266            }
267        }
268    }
269
270    public void setActivityController(IActivityController controller) {
271        synchronized (this) {
272            mController = controller;
273        }
274    }
275
276    public void setAllowRestart(boolean allowRestart) {
277        synchronized (this) {
278            mAllowRestart = allowRestart;
279        }
280    }
281
282    public void addMonitor(Monitor monitor) {
283        synchronized (this) {
284            if (isAlive()) {
285                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
286            }
287            mMonitorChecker.addMonitor(monitor);
288        }
289    }
290
291    public void addThread(Handler thread) {
292        addThread(thread, DEFAULT_TIMEOUT);
293    }
294
295    public void addThread(Handler thread, long timeoutMillis) {
296        synchronized (this) {
297            if (isAlive()) {
298                throw new RuntimeException("Threads can't be added once the Watchdog is running");
299            }
300            final String name = thread.getLooper().getThread().getName();
301            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
302        }
303    }
304
305    /**
306     * Perform a full reboot of the system.
307     */
308    void rebootSystem(String reason) {
309        Slog.i(TAG, "Rebooting system because: " + reason);
310        IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
311        try {
312            pms.reboot(false, reason, false);
313        } catch (RemoteException ex) {
314        }
315    }
316
317    private int evaluateCheckerCompletionLocked() {
318        int state = COMPLETED;
319        for (int i=0; i<mHandlerCheckers.size(); i++) {
320            HandlerChecker hc = mHandlerCheckers.get(i);
321            state = Math.max(state, hc.getCompletionStateLocked());
322        }
323        return state;
324    }
325
326    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
327        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
328        for (int i=0; i<mHandlerCheckers.size(); i++) {
329            HandlerChecker hc = mHandlerCheckers.get(i);
330            if (hc.isOverdueLocked()) {
331                checkers.add(hc);
332            }
333        }
334        return checkers;
335    }
336
337    private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
338        StringBuilder builder = new StringBuilder(128);
339        for (int i=0; i<checkers.size(); i++) {
340            if (builder.length() > 0) {
341                builder.append(", ");
342            }
343            builder.append(checkers.get(i).describeBlockedStateLocked());
344        }
345        return builder.toString();
346    }
347
348    @Override
349    public void run() {
350        boolean waitedHalf = false;
351        while (true) {
352            final ArrayList<HandlerChecker> blockedCheckers;
353            final String subject;
354            final boolean allowRestart;
355            int debuggerWasConnected = 0;
356            synchronized (this) {
357                long timeout = CHECK_INTERVAL;
358                // Make sure we (re)spin the checkers that have become idle within
359                // this wait-and-check interval
360                for (int i=0; i<mHandlerCheckers.size(); i++) {
361                    HandlerChecker hc = mHandlerCheckers.get(i);
362                    hc.scheduleCheckLocked();
363                }
364
365                if (debuggerWasConnected > 0) {
366                    debuggerWasConnected--;
367                }
368
369                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
370                // wait while asleep. If the device is asleep then the thing that we are waiting
371                // to timeout on is asleep as well and won't have a chance to run, causing a false
372                // positive on when to kill things.
373                long start = SystemClock.uptimeMillis();
374                while (timeout > 0) {
375                    if (Debug.isDebuggerConnected()) {
376                        debuggerWasConnected = 2;
377                    }
378                    try {
379                        wait(timeout);
380                    } catch (InterruptedException e) {
381                        Log.wtf(TAG, e);
382                    }
383                    if (Debug.isDebuggerConnected()) {
384                        debuggerWasConnected = 2;
385                    }
386                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
387                }
388
389                final int waitState = evaluateCheckerCompletionLocked();
390                if (waitState == COMPLETED) {
391                    // The monitors have returned; reset
392                    waitedHalf = false;
393                    continue;
394                } else if (waitState == WAITING) {
395                    // still waiting but within their configured intervals; back off and recheck
396                    continue;
397                } else if (waitState == WAITED_HALF) {
398                    if (!waitedHalf) {
399                        // We've waited half the deadlock-detection interval.  Pull a stack
400                        // trace and wait another half.
401                        ArrayList<Integer> pids = new ArrayList<Integer>();
402                        pids.add(Process.myPid());
403                        ActivityManagerService.dumpStackTraces(true, pids, null, null,
404                                NATIVE_STACKS_OF_INTEREST);
405                        waitedHalf = true;
406                    }
407                    continue;
408                }
409
410                // something is overdue!
411                blockedCheckers = getBlockedCheckersLocked();
412                subject = describeCheckersLocked(blockedCheckers);
413                allowRestart = mAllowRestart;
414            }
415
416            // If we got here, that means that the system is most likely hung.
417            // First collect stack traces from all threads of the system process.
418            // Then kill this process so that the system will restart.
419            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
420
421            ArrayList<Integer> pids = new ArrayList<Integer>();
422            pids.add(Process.myPid());
423            if (mPhonePid > 0) pids.add(mPhonePid);
424            // Pass !waitedHalf so that just in case we somehow wind up here without having
425            // dumped the halfway stacks, we properly re-initialize the trace file.
426            final File stack = ActivityManagerService.dumpStackTraces(
427                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
428
429            // Give some extra time to make sure the stack traces get written.
430            // The system's been hanging for a minute, another second or two won't hurt much.
431            SystemClock.sleep(2000);
432
433            // Pull our own kernel thread stacks as well if we're configured for that
434            if (RECORD_KERNEL_THREADS) {
435                dumpKernelStackTraces();
436            }
437
438            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
439            doSysRq('w');
440            doSysRq('l');
441
442            // Try to add the error to the dropbox, but assuming that the ActivityManager
443            // itself may be deadlocked.  (which has happened, causing this statement to
444            // deadlock and the watchdog as a whole to be ineffective)
445            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
446                    public void run() {
447                        mActivity.addErrorToDropBox(
448                                "watchdog", null, "system_server", null, null,
449                                subject, null, stack, null);
450                    }
451                };
452            dropboxThread.start();
453            try {
454                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
455            } catch (InterruptedException ignored) {}
456
457            IActivityController controller;
458            synchronized (this) {
459                controller = mController;
460            }
461            if (controller != null) {
462                Slog.i(TAG, "Reporting stuck state to activity controller");
463                try {
464                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
465                    // 1 = keep waiting, -1 = kill system
466                    int res = controller.systemNotResponding(subject);
467                    if (res >= 0) {
468                        Slog.i(TAG, "Activity controller requested to coninue to wait");
469                        waitedHalf = false;
470                        continue;
471                    }
472                } catch (RemoteException e) {
473                }
474            }
475
476            // Only kill the process if the debugger is not attached.
477            if (Debug.isDebuggerConnected()) {
478                debuggerWasConnected = 2;
479            }
480            if (debuggerWasConnected >= 2) {
481                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
482            } else if (debuggerWasConnected > 0) {
483                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
484            } else if (!allowRestart) {
485                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
486            } else {
487                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
488                for (int i=0; i<blockedCheckers.size(); i++) {
489                    Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
490                    StackTraceElement[] stackTrace
491                            = blockedCheckers.get(i).getThread().getStackTrace();
492                    for (StackTraceElement element: stackTrace) {
493                        Slog.w(TAG, "    at " + element);
494                    }
495                }
496                Slog.w(TAG, "*** GOODBYE!");
497                Process.killProcess(Process.myPid());
498                System.exit(10);
499            }
500
501            waitedHalf = false;
502        }
503    }
504
505    private void doSysRq(char c) {
506        try {
507            FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
508            sysrq_trigger.write(c);
509            sysrq_trigger.close();
510        } catch (IOException e) {
511            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
512        }
513    }
514
515    private File dumpKernelStackTraces() {
516        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
517        if (tracesPath == null || tracesPath.length() == 0) {
518            return null;
519        }
520
521        native_dumpKernelStacks(tracesPath);
522        return new File(tracesPath);
523    }
524
525    private native void native_dumpKernelStacks(String tracesPath);
526}
527