1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.Build;
22import android.os.RemoteException;
23import android.system.ErrnoException;
24import android.system.OsConstants;
25import android.system.StructRlimit;
26import com.android.internal.os.ZygoteConnectionConstants;
27import com.android.server.am.ActivityManagerService;
28
29import android.content.BroadcastReceiver;
30import android.content.ContentResolver;
31import android.content.Context;
32import android.content.Intent;
33import android.content.IntentFilter;
34import android.hidl.manager.V1_0.IServiceManager;
35import android.os.Debug;
36import android.os.Handler;
37import android.os.IPowerManager;
38import android.os.Looper;
39import android.os.Process;
40import android.os.ServiceManager;
41import android.os.SystemClock;
42import android.os.SystemProperties;
43import android.util.EventLog;
44import android.util.Log;
45import android.util.Slog;
46
47import java.io.File;
48import java.io.FileWriter;
49import java.io.IOException;
50import java.util.ArrayList;
51import java.util.Arrays;
52import java.util.Collections;
53import java.util.HashSet;
54import java.util.List;
55
/**
 * This class re-checks its monitors and watched handler threads every CHECK_INTERVAL and
 * normally kills this process if they fail to return within their timeout.
 */
57public class Watchdog extends Thread {
58    static final String TAG = "Watchdog";
59
60    // Set this to true to use debug default values.
61    static final boolean DB = false;
62
63    // Set this to true to have the watchdog record kernel thread stacks when it fires
64    static final boolean RECORD_KERNEL_THREADS = true;
65
66    // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
67    //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
68    //         can trigger the watchdog.
69    // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
70    //         applications may not work with a debug build. CTS will fail.
71    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
72    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
73
74    // These are temporally ordered: larger values as lateness increases
75    static final int COMPLETED = 0;
76    static final int WAITING = 1;
77    static final int WAITED_HALF = 2;
78    static final int OVERDUE = 3;
79
80    // Which native processes to dump into dropbox's stack traces
81    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
82        "/system/bin/audioserver",
83        "/system/bin/cameraserver",
84        "/system/bin/drmserver",
85        "/system/bin/mediadrmserver",
86        "/system/bin/mediaserver",
87        "/system/bin/sdcard",
88        "/system/bin/surfaceflinger",
89        "media.extractor", // system/bin/mediaextractor
90        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
91        "com.android.bluetooth",  // Bluetooth service
92    };
93
94    public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
95        "android.hardware.audio@2.0::IDevicesFactory",
96        "android.hardware.bluetooth@1.0::IBluetoothHci",
97        "android.hardware.camera.provider@2.4::ICameraProvider",
98        "android.hardware.graphics.composer@2.1::IComposer",
99        "android.hardware.media.omx@1.0::IOmx",
100        "android.hardware.sensors@1.0::ISensors",
101        "android.hardware.vr@1.0::IVr"
102    );
103
104    static Watchdog sWatchdog;
105
    /* Checkers for every watched handler thread; addMonitor() registers monitors on mMonitorChecker */
107    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
108    final HandlerChecker mMonitorChecker;
109    ContentResolver mResolver;
110    ActivityManagerService mActivity;
111
112    int mPhonePid;
113    IActivityController mController;
114    boolean mAllowRestart = true;
115    final OpenFdMonitor mOpenFdMonitor;
116
    /**
     * Used for checking the status of handler threads and scheduling monitor callbacks.
     */
120    public final class HandlerChecker implements Runnable {
121        private final Handler mHandler;
122        private final String mName;
123        private final long mWaitMax;
124        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
125        private boolean mCompleted;
126        private Monitor mCurrentMonitor;
127        private long mStartTime;
128
129        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
130            mHandler = handler;
131            mName = name;
132            mWaitMax = waitMaxMillis;
133            mCompleted = true;
134        }
135
136        public void addMonitor(Monitor monitor) {
137            mMonitors.add(monitor);
138        }
139
140        public void scheduleCheckLocked() {
141            if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
142                // If the target looper has recently been polling, then
143                // there is no reason to enqueue our checker on it since that
144                // is as good as it not being deadlocked.  This avoid having
145                // to do a context switch to check the thread.  Note that we
146                // only do this if mCheckReboot is false and we have no
147                // monitors, since those would need to be executed at this point.
148                mCompleted = true;
149                return;
150            }
151
152            if (!mCompleted) {
153                // we already have a check in flight, so no need
154                return;
155            }
156
157            mCompleted = false;
158            mCurrentMonitor = null;
159            mStartTime = SystemClock.uptimeMillis();
160            mHandler.postAtFrontOfQueue(this);
161        }
162
163        public boolean isOverdueLocked() {
164            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
165        }
166
167        public int getCompletionStateLocked() {
168            if (mCompleted) {
169                return COMPLETED;
170            } else {
171                long latency = SystemClock.uptimeMillis() - mStartTime;
172                if (latency < mWaitMax/2) {
173                    return WAITING;
174                } else if (latency < mWaitMax) {
175                    return WAITED_HALF;
176                }
177            }
178            return OVERDUE;
179        }
180
181        public Thread getThread() {
182            return mHandler.getLooper().getThread();
183        }
184
185        public String getName() {
186            return mName;
187        }
188
189        public String describeBlockedStateLocked() {
190            if (mCurrentMonitor == null) {
191                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
192            } else {
193                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
194                        + " on " + mName + " (" + getThread().getName() + ")";
195            }
196        }
197
198        @Override
199        public void run() {
200            final int size = mMonitors.size();
201            for (int i = 0 ; i < size ; i++) {
202                synchronized (Watchdog.this) {
203                    mCurrentMonitor = mMonitors.get(i);
204                }
205                mCurrentMonitor.monitor();
206            }
207
208            synchronized (Watchdog.this) {
209                mCompleted = true;
210                mCurrentMonitor = null;
211            }
212        }
213    }
214
215    final class RebootRequestReceiver extends BroadcastReceiver {
216        @Override
217        public void onReceive(Context c, Intent intent) {
218            if (intent.getIntExtra("nowait", 0) != 0) {
219                rebootSystem("Received ACTION_REBOOT broadcast");
220                return;
221            }
222            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
223        }
224    }
225
    /** Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process incoming IPCs to make sure other processes
     * can still communicate with the service.
     */
230    private static final class BinderThreadMonitor implements Watchdog.Monitor {
231        @Override
232        public void monitor() {
233            Binder.blockUntilThreadAvailable();
234        }
235    }
236
237    public interface Monitor {
238        void monitor();
239    }
240
241    public static Watchdog getInstance() {
242        if (sWatchdog == null) {
243            sWatchdog = new Watchdog();
244        }
245
246        return sWatchdog;
247    }
248
249    private Watchdog() {
250        super("watchdog");
251        // Initialize handler checkers for each common thread we want to check.  Note
252        // that we are not currently checking the background thread, since it can
253        // potentially hold longer running operations with no guarantees about the timeliness
254        // of operations there.
255
256        // The shared foreground thread is the main checker.  It is where we
257        // will also dispatch monitor checks and do other work.
258        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
259                "foreground thread", DEFAULT_TIMEOUT);
260        mHandlerCheckers.add(mMonitorChecker);
261        // Add checker for main thread.  We only do a quick check since there
262        // can be UI running on the thread.
263        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
264                "main thread", DEFAULT_TIMEOUT));
265        // Add checker for shared UI thread.
266        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
267                "ui thread", DEFAULT_TIMEOUT));
268        // And also check IO thread.
269        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
270                "i/o thread", DEFAULT_TIMEOUT));
271        // And the display thread.
272        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
273                "display thread", DEFAULT_TIMEOUT));
274
275        // Initialize monitor for Binder threads.
276        addMonitor(new BinderThreadMonitor());
277
278        mOpenFdMonitor = OpenFdMonitor.create();
279
280        // See the notes on DEFAULT_TIMEOUT.
281        assert DB ||
282                DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
283    }
284
285    public void init(Context context, ActivityManagerService activity) {
286        mResolver = context.getContentResolver();
287        mActivity = activity;
288
289        context.registerReceiver(new RebootRequestReceiver(),
290                new IntentFilter(Intent.ACTION_REBOOT),
291                android.Manifest.permission.REBOOT, null);
292    }
293
294    public void processStarted(String name, int pid) {
295        synchronized (this) {
296            if ("com.android.phone".equals(name)) {
297                mPhonePid = pid;
298            }
299        }
300    }
301
302    public void setActivityController(IActivityController controller) {
303        synchronized (this) {
304            mController = controller;
305        }
306    }
307
308    public void setAllowRestart(boolean allowRestart) {
309        synchronized (this) {
310            mAllowRestart = allowRestart;
311        }
312    }
313
314    public void addMonitor(Monitor monitor) {
315        synchronized (this) {
316            if (isAlive()) {
317                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
318            }
319            mMonitorChecker.addMonitor(monitor);
320        }
321    }
322
323    public void addThread(Handler thread) {
324        addThread(thread, DEFAULT_TIMEOUT);
325    }
326
327    public void addThread(Handler thread, long timeoutMillis) {
328        synchronized (this) {
329            if (isAlive()) {
330                throw new RuntimeException("Threads can't be added once the Watchdog is running");
331            }
332            final String name = thread.getLooper().getThread().getName();
333            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
334        }
335    }
336
337    /**
338     * Perform a full reboot of the system.
339     */
340    void rebootSystem(String reason) {
341        Slog.i(TAG, "Rebooting system because: " + reason);
342        IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
343        try {
344            pms.reboot(false, reason, false);
345        } catch (RemoteException ex) {
346        }
347    }
348
349    private int evaluateCheckerCompletionLocked() {
350        int state = COMPLETED;
351        for (int i=0; i<mHandlerCheckers.size(); i++) {
352            HandlerChecker hc = mHandlerCheckers.get(i);
353            state = Math.max(state, hc.getCompletionStateLocked());
354        }
355        return state;
356    }
357
358    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
359        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
360        for (int i=0; i<mHandlerCheckers.size(); i++) {
361            HandlerChecker hc = mHandlerCheckers.get(i);
362            if (hc.isOverdueLocked()) {
363                checkers.add(hc);
364            }
365        }
366        return checkers;
367    }
368
369    private String describeCheckersLocked(List<HandlerChecker> checkers) {
370        StringBuilder builder = new StringBuilder(128);
371        for (int i=0; i<checkers.size(); i++) {
372            if (builder.length() > 0) {
373                builder.append(", ");
374            }
375            builder.append(checkers.get(i).describeBlockedStateLocked());
376        }
377        return builder.toString();
378    }
379
380    private ArrayList<Integer> getInterestingHalPids() {
381        try {
382            IServiceManager serviceManager = IServiceManager.getService();
383            ArrayList<IServiceManager.InstanceDebugInfo> dump =
384                    serviceManager.debugDump();
385            HashSet<Integer> pids = new HashSet<>();
386            for (IServiceManager.InstanceDebugInfo info : dump) {
387                if (info.pid == IServiceManager.PidConstant.NO_PID) {
388                    continue;
389                }
390
391                if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
392                    continue;
393                }
394
395                pids.add(info.pid);
396            }
397            return new ArrayList<Integer>(pids);
398        } catch (RemoteException e) {
399            return new ArrayList<Integer>();
400        }
401    }
402
403    private ArrayList<Integer> getInterestingNativePids() {
404        ArrayList<Integer> pids = getInterestingHalPids();
405
406        int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
407        if (nativePids != null) {
408            pids.ensureCapacity(pids.size() + nativePids.length);
409            for (int i : nativePids) {
410                pids.add(i);
411            }
412        }
413
414        return pids;
415    }
416
417    @Override
418    public void run() {
419        boolean waitedHalf = false;
420        while (true) {
421            final List<HandlerChecker> blockedCheckers;
422            final String subject;
423            final boolean allowRestart;
424            int debuggerWasConnected = 0;
425            synchronized (this) {
426                long timeout = CHECK_INTERVAL;
427                // Make sure we (re)spin the checkers that have become idle within
428                // this wait-and-check interval
429                for (int i=0; i<mHandlerCheckers.size(); i++) {
430                    HandlerChecker hc = mHandlerCheckers.get(i);
431                    hc.scheduleCheckLocked();
432                }
433
434                if (debuggerWasConnected > 0) {
435                    debuggerWasConnected--;
436                }
437
438                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
439                // wait while asleep. If the device is asleep then the thing that we are waiting
440                // to timeout on is asleep as well and won't have a chance to run, causing a false
441                // positive on when to kill things.
442                long start = SystemClock.uptimeMillis();
443                while (timeout > 0) {
444                    if (Debug.isDebuggerConnected()) {
445                        debuggerWasConnected = 2;
446                    }
447                    try {
448                        wait(timeout);
449                    } catch (InterruptedException e) {
450                        Log.wtf(TAG, e);
451                    }
452                    if (Debug.isDebuggerConnected()) {
453                        debuggerWasConnected = 2;
454                    }
455                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
456                }
457
458                boolean fdLimitTriggered = false;
459                if (mOpenFdMonitor != null) {
460                    fdLimitTriggered = mOpenFdMonitor.monitor();
461                }
462
463                if (!fdLimitTriggered) {
464                    final int waitState = evaluateCheckerCompletionLocked();
465                    if (waitState == COMPLETED) {
466                        // The monitors have returned; reset
467                        waitedHalf = false;
468                        continue;
469                    } else if (waitState == WAITING) {
470                        // still waiting but within their configured intervals; back off and recheck
471                        continue;
472                    } else if (waitState == WAITED_HALF) {
473                        if (!waitedHalf) {
474                            // We've waited half the deadlock-detection interval.  Pull a stack
475                            // trace and wait another half.
476                            ArrayList<Integer> pids = new ArrayList<Integer>();
477                            pids.add(Process.myPid());
478                            ActivityManagerService.dumpStackTraces(true, pids, null, null,
479                                getInterestingNativePids());
480                            waitedHalf = true;
481                        }
482                        continue;
483                    }
484
485                    // something is overdue!
486                    blockedCheckers = getBlockedCheckersLocked();
487                    subject = describeCheckersLocked(blockedCheckers);
488                } else {
489                    blockedCheckers = Collections.emptyList();
490                    subject = "Open FD high water mark reached";
491                }
492                allowRestart = mAllowRestart;
493            }
494
495            // If we got here, that means that the system is most likely hung.
496            // First collect stack traces from all threads of the system process.
497            // Then kill this process so that the system will restart.
498            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
499
500            ArrayList<Integer> pids = new ArrayList<>();
501            pids.add(Process.myPid());
502            if (mPhonePid > 0) pids.add(mPhonePid);
503            // Pass !waitedHalf so that just in case we somehow wind up here without having
504            // dumped the halfway stacks, we properly re-initialize the trace file.
505            final File stack = ActivityManagerService.dumpStackTraces(
506                    !waitedHalf, pids, null, null, getInterestingNativePids());
507
508            // Give some extra time to make sure the stack traces get written.
509            // The system's been hanging for a minute, another second or two won't hurt much.
510            SystemClock.sleep(2000);
511
512            // Pull our own kernel thread stacks as well if we're configured for that
513            if (RECORD_KERNEL_THREADS) {
514                dumpKernelStackTraces();
515            }
516
517            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
518            doSysRq('w');
519            doSysRq('l');
520
521            // Try to add the error to the dropbox, but assuming that the ActivityManager
522            // itself may be deadlocked.  (which has happened, causing this statement to
523            // deadlock and the watchdog as a whole to be ineffective)
524            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
525                    public void run() {
526                        mActivity.addErrorToDropBox(
527                                "watchdog", null, "system_server", null, null,
528                                subject, null, stack, null);
529                    }
530                };
531            dropboxThread.start();
532            try {
533                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
534            } catch (InterruptedException ignored) {}
535
536            IActivityController controller;
537            synchronized (this) {
538                controller = mController;
539            }
540            if (controller != null) {
541                Slog.i(TAG, "Reporting stuck state to activity controller");
542                try {
543                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
544                    // 1 = keep waiting, -1 = kill system
545                    int res = controller.systemNotResponding(subject);
546                    if (res >= 0) {
547                        Slog.i(TAG, "Activity controller requested to coninue to wait");
548                        waitedHalf = false;
549                        continue;
550                    }
551                } catch (RemoteException e) {
552                }
553            }
554
555            // Only kill the process if the debugger is not attached.
556            if (Debug.isDebuggerConnected()) {
557                debuggerWasConnected = 2;
558            }
559            if (debuggerWasConnected >= 2) {
560                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
561            } else if (debuggerWasConnected > 0) {
562                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
563            } else if (!allowRestart) {
564                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
565            } else {
566                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
567                for (int i=0; i<blockedCheckers.size(); i++) {
568                    Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
569                    StackTraceElement[] stackTrace
570                            = blockedCheckers.get(i).getThread().getStackTrace();
571                    for (StackTraceElement element: stackTrace) {
572                        Slog.w(TAG, "    at " + element);
573                    }
574                }
575                Slog.w(TAG, "*** GOODBYE!");
576                Process.killProcess(Process.myPid());
577                System.exit(10);
578            }
579
580            waitedHalf = false;
581        }
582    }
583
584    private void doSysRq(char c) {
585        try {
586            FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
587            sysrq_trigger.write(c);
588            sysrq_trigger.close();
589        } catch (IOException e) {
590            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
591        }
592    }
593
594    private File dumpKernelStackTraces() {
595        String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
596        if (tracesPath == null || tracesPath.length() == 0) {
597            return null;
598        }
599
600        native_dumpKernelStacks(tracesPath);
601        return new File(tracesPath);
602    }
603
604    private native void native_dumpKernelStacks(String tracesPath);
605
606    public static final class OpenFdMonitor {
607        /**
608         * Number of FDs below the soft limit that we trigger a runtime restart at. This was
609         * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number
610         * of FDs in reserve to complete a dump.
611         */
612        private static final int FD_HIGH_WATER_MARK = 12;
613
614        private final File mDumpDir;
615        private final File mFdHighWaterMark;
616
617        public static OpenFdMonitor create() {
618            // Only run the FD monitor on debuggable builds (such as userdebug and eng builds).
619            if (!Build.IS_DEBUGGABLE) {
620                return null;
621            }
622
623            // Don't run the FD monitor on builds that have a global ANR trace file. We're using
624            // the ANR trace directory as a quick hack in order to get these traces in bugreports
625            // and we wouldn't want to overwrite something important.
626            final String dumpDirStr = SystemProperties.get("dalvik.vm.stack-trace-dir", "");
627            if (dumpDirStr.isEmpty()) {
628                return null;
629            }
630
631            final StructRlimit rlimit;
632            try {
633                rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE);
634            } catch (ErrnoException errno) {
635                Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno);
636                return null;
637            }
638
639            // The assumption we're making here is that FD numbers are allocated (more or less)
640            // sequentially, which is currently (and historically) true since open is currently
641            // specified to always return the lowest-numbered non-open file descriptor for the
642            // current process.
643            //
644            // We do this to avoid having to enumerate the contents of /proc/self/fd in order to
645            // count the number of descriptors open in the process.
646            final File fdThreshold = new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK));
647            return new OpenFdMonitor(new File(dumpDirStr), fdThreshold);
648        }
649
650        OpenFdMonitor(File dumpDir, File fdThreshold) {
651            mDumpDir = dumpDir;
652            mFdHighWaterMark = fdThreshold;
653        }
654
655        private void dumpOpenDescriptors() {
656            try {
657                File dumpFile = File.createTempFile("anr_fd_", "", mDumpDir);
658                java.lang.Process proc = new ProcessBuilder()
659                    .command("/system/bin/lsof", "-p", String.valueOf(Process.myPid()))
660                    .redirectErrorStream(true)
661                    .redirectOutput(dumpFile)
662                    .start();
663
664                int returnCode = proc.waitFor();
665                if (returnCode != 0) {
666                    Slog.w(TAG, "Unable to dump open descriptors, lsof return code: "
667                        + returnCode);
668                    dumpFile.delete();
669                }
670            } catch (IOException | InterruptedException ex) {
671                Slog.w(TAG, "Unable to dump open descriptors: " + ex);
672            }
673        }
674
675        /**
676         * @return {@code true} if the high water mark was breached and a dump was written,
677         *     {@code false} otherwise.
678         */
679        public boolean monitor() {
680            if (mFdHighWaterMark.exists()) {
681                dumpOpenDescriptors();
682                return true;
683            }
684
685            return false;
686        }
687    }
688}
689