1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.RemoteException;
22import com.android.server.am.ActivityManagerService;
23
24import android.content.BroadcastReceiver;
25import android.content.ContentResolver;
26import android.content.Context;
27import android.content.Intent;
28import android.content.IntentFilter;
29import android.hidl.manager.V1_0.IServiceManager;
30import android.os.Debug;
31import android.os.Handler;
32import android.os.IPowerManager;
33import android.os.Looper;
34import android.os.Process;
35import android.os.ServiceManager;
36import android.os.SystemClock;
37import android.os.SystemProperties;
38import android.util.EventLog;
39import android.util.Log;
40import android.util.Slog;
41
42import java.io.File;
43import java.io.FileWriter;
44import java.io.IOException;
45import java.util.ArrayList;
46import java.util.Arrays;
47import java.util.HashSet;
48import java.util.List;
49
/**
 * This class periodically checks that its monitors and watched handler threads respond,
 * killing this process if they don't return in time.
 */
51public class Watchdog extends Thread {
    // Tag used for this class's log output.
    static final String TAG = "Watchdog";

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    // Default time a checker may stay blocked before it is overdue, in milliseconds:
    // 10 seconds when DB is set, 60 seconds otherwise.
    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
    // How long the watchdog thread waits between evaluations: half the default timeout.
    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // These are temporally ordered: larger values as lateness increases
    // The checker has no check in flight.
    static final int COMPLETED = 0;
    // A check is in flight and has waited less than half of its maximum.
    static final int WAITING = 1;
    // A check is in flight and has waited at least half, but less than all, of its maximum.
    static final int WAITED_HALF = 2;
    // A check is in flight and has waited its maximum or longer.
    static final int OVERDUE = 3;
68
    // Which native processes to dump into dropbox's stack traces.
    // Within this class the names are resolved to pids via Process.getPidsForCommands().
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "media.extractor", // system/bin/mediaextractor
        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "com.android.bluetooth",  // Bluetooth service
    };

    // HAL interface names whose hosting processes should also have their stacks dumped.
    // Within this class they are matched against the interfaceName of each entry in the
    // HIDL service manager's debug dump.
    public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
        "android.hardware.audio@2.0::IDevicesFactory",
        "android.hardware.bluetooth@1.0::IBluetoothHci",
        "android.hardware.camera.provider@2.4::ICameraProvider",
        "android.hardware.graphics.composer@2.1::IComposer",
        "android.hardware.vr@1.0::IVr",
        "android.hardware.media.omx@1.0::IOmx"
    );
91
    // Singleton instance, created on first use by getInstance().
    static Watchdog sWatchdog;

    /* One checker per watched handler thread; all of them are evaluated on every pass. */
    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    /* The foreground-thread checker; monitors registered through addMonitor() run on it. */
    final HandlerChecker mMonitorChecker;
    // Set in init().
    ContentResolver mResolver;
    // Set in init(); used to add the watchdog report to the dropbox.
    ActivityManagerService mActivity;

    // Pid recorded by processStarted() for the "com.android.phone" process; 0 until then.
    int mPhonePid;
    // Optional controller that is told about a hang and may ask the watchdog to keep waiting.
    IActivityController mController;
    // When false, the watchdog logs the hang but does not kill the process.
    boolean mAllowRestart = true;
103
104    /**
105     * Used for checking status of handle threads and scheduling monitor callbacks.
106     */
107    public final class HandlerChecker implements Runnable {
108        private final Handler mHandler;
109        private final String mName;
110        private final long mWaitMax;
111        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
112        private boolean mCompleted;
113        private Monitor mCurrentMonitor;
114        private long mStartTime;
115
116        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
117            mHandler = handler;
118            mName = name;
119            mWaitMax = waitMaxMillis;
120            mCompleted = true;
121        }
122
123        public void addMonitor(Monitor monitor) {
124            mMonitors.add(monitor);
125        }
126
127        public void scheduleCheckLocked() {
128            if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
129                // If the target looper has recently been polling, then
130                // there is no reason to enqueue our checker on it since that
131                // is as good as it not being deadlocked.  This avoid having
132                // to do a context switch to check the thread.  Note that we
133                // only do this if mCheckReboot is false and we have no
134                // monitors, since those would need to be executed at this point.
135                mCompleted = true;
136                return;
137            }
138
139            if (!mCompleted) {
140                // we already have a check in flight, so no need
141                return;
142            }
143
144            mCompleted = false;
145            mCurrentMonitor = null;
146            mStartTime = SystemClock.uptimeMillis();
147            mHandler.postAtFrontOfQueue(this);
148        }
149
150        public boolean isOverdueLocked() {
151            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
152        }
153
154        public int getCompletionStateLocked() {
155            if (mCompleted) {
156                return COMPLETED;
157            } else {
158                long latency = SystemClock.uptimeMillis() - mStartTime;
159                if (latency < mWaitMax/2) {
160                    return WAITING;
161                } else if (latency < mWaitMax) {
162                    return WAITED_HALF;
163                }
164            }
165            return OVERDUE;
166        }
167
168        public Thread getThread() {
169            return mHandler.getLooper().getThread();
170        }
171
172        public String getName() {
173            return mName;
174        }
175
176        public String describeBlockedStateLocked() {
177            if (mCurrentMonitor == null) {
178                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
179            } else {
180                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
181                        + " on " + mName + " (" + getThread().getName() + ")";
182            }
183        }
184
185        @Override
186        public void run() {
187            final int size = mMonitors.size();
188            for (int i = 0 ; i < size ; i++) {
189                synchronized (Watchdog.this) {
190                    mCurrentMonitor = mMonitors.get(i);
191                }
192                mCurrentMonitor.monitor();
193            }
194
195            synchronized (Watchdog.this) {
196                mCompleted = true;
197                mCurrentMonitor = null;
198            }
199        }
200    }
201
202    final class RebootRequestReceiver extends BroadcastReceiver {
203        @Override
204        public void onReceive(Context c, Intent intent) {
205            if (intent.getIntExtra("nowait", 0) != 0) {
206                rebootSystem("Received ACTION_REBOOT broadcast");
207                return;
208            }
209            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
210        }
211    }
212
    /**
     * Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process incoming IPCs, to make sure other
     * processes can still communicate with the service.
     */
    private static final class BinderThreadMonitor implements Watchdog.Monitor {
        @Override
        public void monitor() {
            Binder.blockUntilThreadAvailable();
        }
    }
223
    /**
     * Callback invoked by a {@link HandlerChecker} on its handler thread during each check.
     * The check is not marked as completed until every monitor's {@link #monitor()} call
     * has returned.
     */
    public interface Monitor {
        void monitor();
    }
227
228    public static Watchdog getInstance() {
229        if (sWatchdog == null) {
230            sWatchdog = new Watchdog();
231        }
232
233        return sWatchdog;
234    }
235
236    private Watchdog() {
237        super("watchdog");
238        // Initialize handler checkers for each common thread we want to check.  Note
239        // that we are not currently checking the background thread, since it can
240        // potentially hold longer running operations with no guarantees about the timeliness
241        // of operations there.
242
243        // The shared foreground thread is the main checker.  It is where we
244        // will also dispatch monitor checks and do other work.
245        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
246                "foreground thread", DEFAULT_TIMEOUT);
247        mHandlerCheckers.add(mMonitorChecker);
248        // Add checker for main thread.  We only do a quick check since there
249        // can be UI running on the thread.
250        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
251                "main thread", DEFAULT_TIMEOUT));
252        // Add checker for shared UI thread.
253        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
254                "ui thread", DEFAULT_TIMEOUT));
255        // And also check IO thread.
256        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
257                "i/o thread", DEFAULT_TIMEOUT));
258        // And the display thread.
259        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
260                "display thread", DEFAULT_TIMEOUT));
261
262        // Initialize monitor for Binder threads.
263        addMonitor(new BinderThreadMonitor());
264    }
265
266    public void init(Context context, ActivityManagerService activity) {
267        mResolver = context.getContentResolver();
268        mActivity = activity;
269
270        context.registerReceiver(new RebootRequestReceiver(),
271                new IntentFilter(Intent.ACTION_REBOOT),
272                android.Manifest.permission.REBOOT, null);
273    }
274
275    public void processStarted(String name, int pid) {
276        synchronized (this) {
277            if ("com.android.phone".equals(name)) {
278                mPhonePid = pid;
279            }
280        }
281    }
282
    /**
     * Sets the controller that is notified when the watchdog detects a hang.
     * The field is written under this object's lock.
     *
     * @param controller the controller to notify; when null, no controller is consulted
     */
    public void setActivityController(IActivityController controller) {
        synchronized (this) {
            mController = controller;
        }
    }
288
    /**
     * Controls whether the watchdog may kill the process when it detects a hang.
     * The field is written under this object's lock.
     *
     * @param allowRestart false to only log the hang instead of killing the process
     */
    public void setAllowRestart(boolean allowRestart) {
        synchronized (this) {
            mAllowRestart = allowRestart;
        }
    }
294
295    public void addMonitor(Monitor monitor) {
296        synchronized (this) {
297            if (isAlive()) {
298                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
299            }
300            mMonitorChecker.addMonitor(monitor);
301        }
302    }
303
    /**
     * Adds a handler thread to be watched, using {@code DEFAULT_TIMEOUT} as its timeout.
     *
     * @param thread handler bound to the thread to watch
     */
    public void addThread(Handler thread) {
        addThread(thread, DEFAULT_TIMEOUT);
    }
307
308    public void addThread(Handler thread, long timeoutMillis) {
309        synchronized (this) {
310            if (isAlive()) {
311                throw new RuntimeException("Threads can't be added once the Watchdog is running");
312            }
313            final String name = thread.getLooper().getThread().getName();
314            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
315        }
316    }
317
318    /**
319     * Perform a full reboot of the system.
320     */
321    void rebootSystem(String reason) {
322        Slog.i(TAG, "Rebooting system because: " + reason);
323        IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
324        try {
325            pms.reboot(false, reason, false);
326        } catch (RemoteException ex) {
327        }
328    }
329
330    private int evaluateCheckerCompletionLocked() {
331        int state = COMPLETED;
332        for (int i=0; i<mHandlerCheckers.size(); i++) {
333            HandlerChecker hc = mHandlerCheckers.get(i);
334            state = Math.max(state, hc.getCompletionStateLocked());
335        }
336        return state;
337    }
338
339    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
340        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
341        for (int i=0; i<mHandlerCheckers.size(); i++) {
342            HandlerChecker hc = mHandlerCheckers.get(i);
343            if (hc.isOverdueLocked()) {
344                checkers.add(hc);
345            }
346        }
347        return checkers;
348    }
349
350    private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
351        StringBuilder builder = new StringBuilder(128);
352        for (int i=0; i<checkers.size(); i++) {
353            if (builder.length() > 0) {
354                builder.append(", ");
355            }
356            builder.append(checkers.get(i).describeBlockedStateLocked());
357        }
358        return builder.toString();
359    }
360
361    private ArrayList<Integer> getInterestingHalPids() {
362        try {
363            IServiceManager serviceManager = IServiceManager.getService();
364            ArrayList<IServiceManager.InstanceDebugInfo> dump =
365                    serviceManager.debugDump();
366            HashSet<Integer> pids = new HashSet<>();
367            for (IServiceManager.InstanceDebugInfo info : dump) {
368                if (info.pid == IServiceManager.PidConstant.NO_PID) {
369                    continue;
370                }
371
372                if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
373                    continue;
374                }
375
376                pids.add(info.pid);
377            }
378            return new ArrayList<Integer>(pids);
379        } catch (RemoteException e) {
380            return new ArrayList<Integer>();
381        }
382    }
383
384    private ArrayList<Integer> getInterestingNativePids() {
385        ArrayList<Integer> pids = getInterestingHalPids();
386
387        int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
388        if (nativePids != null) {
389            pids.ensureCapacity(pids.size() + nativePids.length);
390            for (int i : nativePids) {
391                pids.add(i);
392            }
393        }
394
395        return pids;
396    }
397
398    @Override
399    public void run() {
400        boolean waitedHalf = false;
401        while (true) {
402            final ArrayList<HandlerChecker> blockedCheckers;
403            final String subject;
404            final boolean allowRestart;
405            int debuggerWasConnected = 0;
406            synchronized (this) {
407                long timeout = CHECK_INTERVAL;
408                // Make sure we (re)spin the checkers that have become idle within
409                // this wait-and-check interval
410                for (int i=0; i<mHandlerCheckers.size(); i++) {
411                    HandlerChecker hc = mHandlerCheckers.get(i);
412                    hc.scheduleCheckLocked();
413                }
414
415                if (debuggerWasConnected > 0) {
416                    debuggerWasConnected--;
417                }
418
419                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
420                // wait while asleep. If the device is asleep then the thing that we are waiting
421                // to timeout on is asleep as well and won't have a chance to run, causing a false
422                // positive on when to kill things.
423                long start = SystemClock.uptimeMillis();
424                while (timeout > 0) {
425                    if (Debug.isDebuggerConnected()) {
426                        debuggerWasConnected = 2;
427                    }
428                    try {
429                        wait(timeout);
430                    } catch (InterruptedException e) {
431                        Log.wtf(TAG, e);
432                    }
433                    if (Debug.isDebuggerConnected()) {
434                        debuggerWasConnected = 2;
435                    }
436                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
437                }
438
439                final int waitState = evaluateCheckerCompletionLocked();
440                if (waitState == COMPLETED) {
441                    // The monitors have returned; reset
442                    waitedHalf = false;
443                    continue;
444                } else if (waitState == WAITING) {
445                    // still waiting but within their configured intervals; back off and recheck
446                    continue;
447                } else if (waitState == WAITED_HALF) {
448                    if (!waitedHalf) {
449                        // We've waited half the deadlock-detection interval.  Pull a stack
450                        // trace and wait another half.
451                        ArrayList<Integer> pids = new ArrayList<Integer>();
452                        pids.add(Process.myPid());
453                        ActivityManagerService.dumpStackTraces(true, pids, null, null,
454                            getInterestingNativePids());
455                        waitedHalf = true;
456                    }
457                    continue;
458                }
459
460                // something is overdue!
461                blockedCheckers = getBlockedCheckersLocked();
462                subject = describeCheckersLocked(blockedCheckers);
463                allowRestart = mAllowRestart;
464            }
465
466            // If we got here, that means that the system is most likely hung.
467            // First collect stack traces from all threads of the system process.
468            // Then kill this process so that the system will restart.
469            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
470
471            ArrayList<Integer> pids = new ArrayList<>();
472            pids.add(Process.myPid());
473            if (mPhonePid > 0) pids.add(mPhonePid);
474            // Pass !waitedHalf so that just in case we somehow wind up here without having
475            // dumped the halfway stacks, we properly re-initialize the trace file.
476            final File stack = ActivityManagerService.dumpStackTraces(
477                    !waitedHalf, pids, null, null, getInterestingNativePids());
478
479            // Give some extra time to make sure the stack traces get written.
480            // The system's been hanging for a minute, another second or two won't hurt much.
481            SystemClock.sleep(2000);
482
483            // Pull our own kernel thread stacks as well if we're configured for that
484            if (RECORD_KERNEL_THREADS) {
485                dumpKernelStackTraces();
486            }
487
488            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
489            doSysRq('w');
490            doSysRq('l');
491
492            // Try to add the error to the dropbox, but assuming that the ActivityManager
493            // itself may be deadlocked.  (which has happened, causing this statement to
494            // deadlock and the watchdog as a whole to be ineffective)
495            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
496                    public void run() {
497                        mActivity.addErrorToDropBox(
498                                "watchdog", null, "system_server", null, null,
499                                subject, null, stack, null);
500                    }
501                };
502            dropboxThread.start();
503            try {
504                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
505            } catch (InterruptedException ignored) {}
506
507            IActivityController controller;
508            synchronized (this) {
509                controller = mController;
510            }
511            if (controller != null) {
512                Slog.i(TAG, "Reporting stuck state to activity controller");
513                try {
514                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
515                    // 1 = keep waiting, -1 = kill system
516                    int res = controller.systemNotResponding(subject);
517                    if (res >= 0) {
518                        Slog.i(TAG, "Activity controller requested to coninue to wait");
519                        waitedHalf = false;
520                        continue;
521                    }
522                } catch (RemoteException e) {
523                }
524            }
525
526            // Only kill the process if the debugger is not attached.
527            if (Debug.isDebuggerConnected()) {
528                debuggerWasConnected = 2;
529            }
530            if (debuggerWasConnected >= 2) {
531                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
532            } else if (debuggerWasConnected > 0) {
533                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
534            } else if (!allowRestart) {
535                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
536            } else {
537                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
538                for (int i=0; i<blockedCheckers.size(); i++) {
539                    Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
540                    StackTraceElement[] stackTrace
541                            = blockedCheckers.get(i).getThread().getStackTrace();
542                    for (StackTraceElement element: stackTrace) {
543                        Slog.w(TAG, "    at " + element);
544                    }
545                }
546                Slog.w(TAG, "*** GOODBYE!");
547                Process.killProcess(Process.myPid());
548                System.exit(10);
549            }
550
551            waitedHalf = false;
552        }
553    }
554
555    private void doSysRq(char c) {
556        try {
557            FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
558            sysrq_trigger.write(c);
559            sysrq_trigger.close();
560        } catch (IOException e) {
561            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
562        }
563    }
564
565    private File dumpKernelStackTraces() {
566        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
567        if (tracesPath == null || tracesPath.length() == 0) {
568            return null;
569        }
570
571        native_dumpKernelStacks(tracesPath);
572        return new File(tracesPath);
573    }
574
    // Native counterpart of dumpKernelStackTraces(); receives the traces file path.
    // The implementation lives outside this file.
    private native void native_dumpKernelStacks(String tracesPath);
576}
577