1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.RemoteException;
22import com.android.server.am.ActivityManagerService;
23
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.os.Debug;
30import android.os.Handler;
31import android.os.IPowerManager;
32import android.os.Looper;
33import android.os.Process;
34import android.os.ServiceManager;
35import android.os.SystemClock;
36import android.os.SystemProperties;
37import android.util.EventLog;
38import android.util.Log;
39import android.util.Slog;
40
41import java.io.File;
42import java.io.FileWriter;
43import java.io.IOException;
44import java.util.ArrayList;
45
46/** This class calls its monitor every minute. Killing this process if they don't return **/
47public class Watchdog extends Thread {
48    static final String TAG = "Watchdog";
49
50    // Set this to true to use debug default values.
51    static final boolean DB = false;
52
53    // Set this to true to have the watchdog record kernel thread stacks when it fires
54    static final boolean RECORD_KERNEL_THREADS = true;
55
56    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
57    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
58
59    // These are temporally ordered: larger values as lateness increases
60    static final int COMPLETED = 0;
61    static final int WAITING = 1;
62    static final int WAITED_HALF = 2;
63    static final int OVERDUE = 3;
64
65    // Which native processes to dump into dropbox's stack traces
66    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
67        "/system/bin/mediaserver",
68        "/system/bin/sdcard",
69        "/system/bin/surfaceflinger"
70    };
71
72    static Watchdog sWatchdog;
73
74    /* This handler will be used to post message back onto the main thread */
75    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
76    final HandlerChecker mMonitorChecker;
77    ContentResolver mResolver;
78    ActivityManagerService mActivity;
79
80    int mPhonePid;
81    IActivityController mController;
82    boolean mAllowRestart = true;
83
84    /**
85     * Used for checking status of handle threads and scheduling monitor callbacks.
86     */
87    public final class HandlerChecker implements Runnable {
88        private final Handler mHandler;
89        private final String mName;
90        private final long mWaitMax;
91        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
92        private boolean mCompleted;
93        private Monitor mCurrentMonitor;
94        private long mStartTime;
95
96        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
97            mHandler = handler;
98            mName = name;
99            mWaitMax = waitMaxMillis;
100            mCompleted = true;
101        }
102
103        public void addMonitor(Monitor monitor) {
104            mMonitors.add(monitor);
105        }
106
107        public void scheduleCheckLocked() {
108            if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
109                // If the target looper has recently been polling, then
110                // there is no reason to enqueue our checker on it since that
111                // is as good as it not being deadlocked.  This avoid having
112                // to do a context switch to check the thread.  Note that we
113                // only do this if mCheckReboot is false and we have no
114                // monitors, since those would need to be executed at this point.
115                mCompleted = true;
116                return;
117            }
118
119            if (!mCompleted) {
120                // we already have a check in flight, so no need
121                return;
122            }
123
124            mCompleted = false;
125            mCurrentMonitor = null;
126            mStartTime = SystemClock.uptimeMillis();
127            mHandler.postAtFrontOfQueue(this);
128        }
129
130        public boolean isOverdueLocked() {
131            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
132        }
133
134        public int getCompletionStateLocked() {
135            if (mCompleted) {
136                return COMPLETED;
137            } else {
138                long latency = SystemClock.uptimeMillis() - mStartTime;
139                if (latency < mWaitMax/2) {
140                    return WAITING;
141                } else if (latency < mWaitMax) {
142                    return WAITED_HALF;
143                }
144            }
145            return OVERDUE;
146        }
147
148        public Thread getThread() {
149            return mHandler.getLooper().getThread();
150        }
151
152        public String getName() {
153            return mName;
154        }
155
156        public String describeBlockedStateLocked() {
157            if (mCurrentMonitor == null) {
158                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
159            } else {
160                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
161                        + " on " + mName + " (" + getThread().getName() + ")";
162            }
163        }
164
165        @Override
166        public void run() {
167            final int size = mMonitors.size();
168            for (int i = 0 ; i < size ; i++) {
169                synchronized (Watchdog.this) {
170                    mCurrentMonitor = mMonitors.get(i);
171                }
172                mCurrentMonitor.monitor();
173            }
174
175            synchronized (Watchdog.this) {
176                mCompleted = true;
177                mCurrentMonitor = null;
178            }
179        }
180    }
181
182    final class RebootRequestReceiver extends BroadcastReceiver {
183        @Override
184        public void onReceive(Context c, Intent intent) {
185            if (intent.getIntExtra("nowait", 0) != 0) {
186                rebootSystem("Received ACTION_REBOOT broadcast");
187                return;
188            }
189            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
190        }
191    }
192
193    /** Monitor for checking the availability of binder threads. The monitor will block until
194     * there is a binder thread available to process in coming IPCs to make sure other processes
195     * can still communicate with the service.
196     */
197    private static final class BinderThreadMonitor implements Watchdog.Monitor {
198        @Override
199        public void monitor() {
200            Binder.blockUntilThreadAvailable();
201        }
202    }
203
204    public interface Monitor {
205        void monitor();
206    }
207
208    public static Watchdog getInstance() {
209        if (sWatchdog == null) {
210            sWatchdog = new Watchdog();
211        }
212
213        return sWatchdog;
214    }
215
216    private Watchdog() {
217        super("watchdog");
218        // Initialize handler checkers for each common thread we want to check.  Note
219        // that we are not currently checking the background thread, since it can
220        // potentially hold longer running operations with no guarantees about the timeliness
221        // of operations there.
222
223        // The shared foreground thread is the main checker.  It is where we
224        // will also dispatch monitor checks and do other work.
225        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
226                "foreground thread", DEFAULT_TIMEOUT);
227        mHandlerCheckers.add(mMonitorChecker);
228        // Add checker for main thread.  We only do a quick check since there
229        // can be UI running on the thread.
230        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
231                "main thread", DEFAULT_TIMEOUT));
232        // Add checker for shared UI thread.
233        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
234                "ui thread", DEFAULT_TIMEOUT));
235        // And also check IO thread.
236        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
237                "i/o thread", DEFAULT_TIMEOUT));
238        // And the display thread.
239        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
240                "display thread", DEFAULT_TIMEOUT));
241
242        // Initialize monitor for Binder threads.
243        addMonitor(new BinderThreadMonitor());
244    }
245
246    public void init(Context context, ActivityManagerService activity) {
247        mResolver = context.getContentResolver();
248        mActivity = activity;
249
250        context.registerReceiver(new RebootRequestReceiver(),
251                new IntentFilter(Intent.ACTION_REBOOT),
252                android.Manifest.permission.REBOOT, null);
253    }
254
255    public void processStarted(String name, int pid) {
256        synchronized (this) {
257            if ("com.android.phone".equals(name)) {
258                mPhonePid = pid;
259            }
260        }
261    }
262
263    public void setActivityController(IActivityController controller) {
264        synchronized (this) {
265            mController = controller;
266        }
267    }
268
269    public void setAllowRestart(boolean allowRestart) {
270        synchronized (this) {
271            mAllowRestart = allowRestart;
272        }
273    }
274
275    public void addMonitor(Monitor monitor) {
276        synchronized (this) {
277            if (isAlive()) {
278                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
279            }
280            mMonitorChecker.addMonitor(monitor);
281        }
282    }
283
284    public void addThread(Handler thread) {
285        addThread(thread, DEFAULT_TIMEOUT);
286    }
287
288    public void addThread(Handler thread, long timeoutMillis) {
289        synchronized (this) {
290            if (isAlive()) {
291                throw new RuntimeException("Threads can't be added once the Watchdog is running");
292            }
293            final String name = thread.getLooper().getThread().getName();
294            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
295        }
296    }
297
298    /**
299     * Perform a full reboot of the system.
300     */
301    void rebootSystem(String reason) {
302        Slog.i(TAG, "Rebooting system because: " + reason);
303        IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
304        try {
305            pms.reboot(false, reason, false);
306        } catch (RemoteException ex) {
307        }
308    }
309
310    private int evaluateCheckerCompletionLocked() {
311        int state = COMPLETED;
312        for (int i=0; i<mHandlerCheckers.size(); i++) {
313            HandlerChecker hc = mHandlerCheckers.get(i);
314            state = Math.max(state, hc.getCompletionStateLocked());
315        }
316        return state;
317    }
318
319    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
320        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
321        for (int i=0; i<mHandlerCheckers.size(); i++) {
322            HandlerChecker hc = mHandlerCheckers.get(i);
323            if (hc.isOverdueLocked()) {
324                checkers.add(hc);
325            }
326        }
327        return checkers;
328    }
329
330    private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
331        StringBuilder builder = new StringBuilder(128);
332        for (int i=0; i<checkers.size(); i++) {
333            if (builder.length() > 0) {
334                builder.append(", ");
335            }
336            builder.append(checkers.get(i).describeBlockedStateLocked());
337        }
338        return builder.toString();
339    }
340
341    @Override
342    public void run() {
343        boolean waitedHalf = false;
344        while (true) {
345            final ArrayList<HandlerChecker> blockedCheckers;
346            final String subject;
347            final boolean allowRestart;
348            int debuggerWasConnected = 0;
349            synchronized (this) {
350                long timeout = CHECK_INTERVAL;
351                // Make sure we (re)spin the checkers that have become idle within
352                // this wait-and-check interval
353                for (int i=0; i<mHandlerCheckers.size(); i++) {
354                    HandlerChecker hc = mHandlerCheckers.get(i);
355                    hc.scheduleCheckLocked();
356                }
357
358                if (debuggerWasConnected > 0) {
359                    debuggerWasConnected--;
360                }
361
362                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
363                // wait while asleep. If the device is asleep then the thing that we are waiting
364                // to timeout on is asleep as well and won't have a chance to run, causing a false
365                // positive on when to kill things.
366                long start = SystemClock.uptimeMillis();
367                while (timeout > 0) {
368                    if (Debug.isDebuggerConnected()) {
369                        debuggerWasConnected = 2;
370                    }
371                    try {
372                        wait(timeout);
373                    } catch (InterruptedException e) {
374                        Log.wtf(TAG, e);
375                    }
376                    if (Debug.isDebuggerConnected()) {
377                        debuggerWasConnected = 2;
378                    }
379                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
380                }
381
382                final int waitState = evaluateCheckerCompletionLocked();
383                if (waitState == COMPLETED) {
384                    // The monitors have returned; reset
385                    waitedHalf = false;
386                    continue;
387                } else if (waitState == WAITING) {
388                    // still waiting but within their configured intervals; back off and recheck
389                    continue;
390                } else if (waitState == WAITED_HALF) {
391                    if (!waitedHalf) {
392                        // We've waited half the deadlock-detection interval.  Pull a stack
393                        // trace and wait another half.
394                        ArrayList<Integer> pids = new ArrayList<Integer>();
395                        pids.add(Process.myPid());
396                        ActivityManagerService.dumpStackTraces(true, pids, null, null,
397                                NATIVE_STACKS_OF_INTEREST);
398                        waitedHalf = true;
399                    }
400                    continue;
401                }
402
403                // something is overdue!
404                blockedCheckers = getBlockedCheckersLocked();
405                subject = describeCheckersLocked(blockedCheckers);
406                allowRestart = mAllowRestart;
407            }
408
409            // If we got here, that means that the system is most likely hung.
410            // First collect stack traces from all threads of the system process.
411            // Then kill this process so that the system will restart.
412            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
413
414            ArrayList<Integer> pids = new ArrayList<Integer>();
415            pids.add(Process.myPid());
416            if (mPhonePid > 0) pids.add(mPhonePid);
417            // Pass !waitedHalf so that just in case we somehow wind up here without having
418            // dumped the halfway stacks, we properly re-initialize the trace file.
419            final File stack = ActivityManagerService.dumpStackTraces(
420                    !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
421
422            // Give some extra time to make sure the stack traces get written.
423            // The system's been hanging for a minute, another second or two won't hurt much.
424            SystemClock.sleep(2000);
425
426            // Pull our own kernel thread stacks as well if we're configured for that
427            if (RECORD_KERNEL_THREADS) {
428                dumpKernelStackTraces();
429            }
430
431            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
432            doSysRq('w');
433            doSysRq('l');
434
435            // Try to add the error to the dropbox, but assuming that the ActivityManager
436            // itself may be deadlocked.  (which has happened, causing this statement to
437            // deadlock and the watchdog as a whole to be ineffective)
438            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
439                    public void run() {
440                        mActivity.addErrorToDropBox(
441                                "watchdog", null, "system_server", null, null,
442                                subject, null, stack, null);
443                    }
444                };
445            dropboxThread.start();
446            try {
447                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
448            } catch (InterruptedException ignored) {}
449
450            IActivityController controller;
451            synchronized (this) {
452                controller = mController;
453            }
454            if (controller != null) {
455                Slog.i(TAG, "Reporting stuck state to activity controller");
456                try {
457                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
458                    // 1 = keep waiting, -1 = kill system
459                    int res = controller.systemNotResponding(subject);
460                    if (res >= 0) {
461                        Slog.i(TAG, "Activity controller requested to coninue to wait");
462                        waitedHalf = false;
463                        continue;
464                    }
465                } catch (RemoteException e) {
466                }
467            }
468
469            // Only kill the process if the debugger is not attached.
470            if (Debug.isDebuggerConnected()) {
471                debuggerWasConnected = 2;
472            }
473            if (debuggerWasConnected >= 2) {
474                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
475            } else if (debuggerWasConnected > 0) {
476                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
477            } else if (!allowRestart) {
478                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
479            } else {
480                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
481                for (int i=0; i<blockedCheckers.size(); i++) {
482                    Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
483                    StackTraceElement[] stackTrace
484                            = blockedCheckers.get(i).getThread().getStackTrace();
485                    for (StackTraceElement element: stackTrace) {
486                        Slog.w(TAG, "    at " + element);
487                    }
488                }
489                Slog.w(TAG, "*** GOODBYE!");
490                Process.killProcess(Process.myPid());
491                System.exit(10);
492            }
493
494            waitedHalf = false;
495        }
496    }
497
498    private void doSysRq(char c) {
499        try {
500            FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
501            sysrq_trigger.write(c);
502            sysrq_trigger.close();
503        } catch (IOException e) {
504            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
505        }
506    }
507
508    private File dumpKernelStackTraces() {
509        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
510        if (tracesPath == null || tracesPath.length() == 0) {
511            return null;
512        }
513
514        native_dumpKernelStacks(tracesPath);
515        return new File(tracesPath);
516    }
517
518    private native void native_dumpKernelStacks(String tracesPath);
519}
520