1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.server;
18
19import android.app.IActivityController;
20import android.os.Binder;
21import android.os.Build;
22import android.os.RemoteException;
23import android.system.ErrnoException;
24import android.system.OsConstants;
25import android.system.StructRlimit;
26import com.android.internal.os.ZygoteConnectionConstants;
27import com.android.server.am.ActivityManagerService;
28
29import android.content.BroadcastReceiver;
30import android.content.ContentResolver;
31import android.content.Context;
32import android.content.Intent;
33import android.content.IntentFilter;
34import android.hidl.manager.V1_0.IServiceManager;
35import android.os.Debug;
36import android.os.Handler;
37import android.os.IPowerManager;
38import android.os.Looper;
39import android.os.Process;
40import android.os.ServiceManager;
41import android.os.SystemClock;
42import android.os.SystemProperties;
43import android.util.EventLog;
44import android.util.Log;
45import android.util.Slog;
46
47import java.io.File;
48import java.io.FileWriter;
49import java.io.IOException;
50import java.util.ArrayList;
51import java.util.Arrays;
52import java.util.Collections;
53import java.util.HashSet;
54import java.util.List;
55
/** Periodically checks its monitors and handler threads, and kills this process if they do not respond in time. */
57public class Watchdog extends Thread {
    // Tag used for every log message emitted by this class.
    static final String TAG = "Watchdog";

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
    //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
    //         can trigger the watchdog.
    // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
    //         applications may not work with a debug build. CTS will fail.
    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
    // Length of one wait-and-check cycle of the watchdog loop: half of DEFAULT_TIMEOUT.
    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // These are temporally ordered: larger values as lateness increases
    // (the overall state is computed as the maximum over all handler checkers).
    static final int COMPLETED = 0;
    static final int WAITING = 1;
    static final int WAITED_HALF = 2;
    static final int OVERDUE = 3;

    // Which native processes to dump into dropbox's stack traces
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "media.extractor", // system/bin/mediaextractor
        "media.metrics", // system/bin/mediametrics
        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "com.android.bluetooth",  // Bluetooth service
        "statsd",  // Stats daemon
    };

    // HAL interfaces whose hosting processes are included in native stack dumps. Entries are
    // compared against the interface names reported by the HIDL service manager's debug dump
    // (see getInterestingHalPids()).
    public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
        "android.hardware.audio@2.0::IDevicesFactory",
        "android.hardware.audio@4.0::IDevicesFactory",
        "android.hardware.bluetooth@1.0::IBluetoothHci",
        "android.hardware.camera.provider@2.4::ICameraProvider",
        "android.hardware.graphics.composer@2.1::IComposer",
        "android.hardware.media.omx@1.0::IOmx",
        "android.hardware.media.omx@1.0::IOmxStore",
        "android.hardware.sensors@1.0::ISensors",
        "android.hardware.vr@1.0::IVr"
    );
104
    // Lazily created singleton; see getInstance().
    static Watchdog sWatchdog;

    /* All handler checkers that the watchdog loop schedules and evaluates on each cycle. */
    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    // Checker for the shared foreground thread; monitors registered through addMonitor()
    // are attached to this checker.
    final HandlerChecker mMonitorChecker;
    // Set in init() from the supplied context.
    ContentResolver mResolver;
    // Set in init(); used to write the watchdog report to the dropbox.
    ActivityManagerService mActivity;

    // Pid recorded by processStarted() for "com.android.phone"; when positive it is added
    // to the set of processes whose stacks are dumped on a watchdog trigger.
    int mPhonePid;
    // Optional controller that is told when the system appears hung and may ask to keep waiting.
    IActivityController mController;
    // When false, the watchdog logs the hang but does not kill the system process.
    boolean mAllowRestart = true;
    // May be null: OpenFdMonitor.create() returns null when FD monitoring is not enabled.
    final OpenFdMonitor mOpenFdMonitor;
117
118    /**
119     * Used for checking status of handle threads and scheduling monitor callbacks.
120     */
121    public final class HandlerChecker implements Runnable {
122        private final Handler mHandler;
123        private final String mName;
124        private final long mWaitMax;
125        private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
126        private boolean mCompleted;
127        private Monitor mCurrentMonitor;
128        private long mStartTime;
129
130        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
131            mHandler = handler;
132            mName = name;
133            mWaitMax = waitMaxMillis;
134            mCompleted = true;
135        }
136
137        public void addMonitor(Monitor monitor) {
138            mMonitors.add(monitor);
139        }
140
141        public void scheduleCheckLocked() {
142            if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
143                // If the target looper has recently been polling, then
144                // there is no reason to enqueue our checker on it since that
145                // is as good as it not being deadlocked.  This avoid having
146                // to do a context switch to check the thread.  Note that we
147                // only do this if mCheckReboot is false and we have no
148                // monitors, since those would need to be executed at this point.
149                mCompleted = true;
150                return;
151            }
152
153            if (!mCompleted) {
154                // we already have a check in flight, so no need
155                return;
156            }
157
158            mCompleted = false;
159            mCurrentMonitor = null;
160            mStartTime = SystemClock.uptimeMillis();
161            mHandler.postAtFrontOfQueue(this);
162        }
163
164        public boolean isOverdueLocked() {
165            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
166        }
167
168        public int getCompletionStateLocked() {
169            if (mCompleted) {
170                return COMPLETED;
171            } else {
172                long latency = SystemClock.uptimeMillis() - mStartTime;
173                if (latency < mWaitMax/2) {
174                    return WAITING;
175                } else if (latency < mWaitMax) {
176                    return WAITED_HALF;
177                }
178            }
179            return OVERDUE;
180        }
181
182        public Thread getThread() {
183            return mHandler.getLooper().getThread();
184        }
185
186        public String getName() {
187            return mName;
188        }
189
190        public String describeBlockedStateLocked() {
191            if (mCurrentMonitor == null) {
192                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
193            } else {
194                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
195                        + " on " + mName + " (" + getThread().getName() + ")";
196            }
197        }
198
199        @Override
200        public void run() {
201            final int size = mMonitors.size();
202            for (int i = 0 ; i < size ; i++) {
203                synchronized (Watchdog.this) {
204                    mCurrentMonitor = mMonitors.get(i);
205                }
206                mCurrentMonitor.monitor();
207            }
208
209            synchronized (Watchdog.this) {
210                mCompleted = true;
211                mCurrentMonitor = null;
212            }
213        }
214    }
215
216    final class RebootRequestReceiver extends BroadcastReceiver {
217        @Override
218        public void onReceive(Context c, Intent intent) {
219            if (intent.getIntExtra("nowait", 0) != 0) {
220                rebootSystem("Received ACTION_REBOOT broadcast");
221                return;
222            }
223            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
224        }
225    }
226
    /**
     * Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process incoming IPCs, to make sure other
     * processes can still communicate with the service.
     */
    private static final class BinderThreadMonitor implements Watchdog.Monitor {
        @Override
        public void monitor() {
            // Delegates entirely to Binder; returns only once that call returns.
            Binder.blockUntilThreadAvailable();
        }
    }
237
    /**
     * Callback run by a {@link HandlerChecker} on its handler thread during each check.
     * The check is not marked complete until every registered monitor has returned.
     */
    public interface Monitor {
        /** Invoked once per check; a call that does not return leaves the check outstanding. */
        void monitor();
    }
241
242    public static Watchdog getInstance() {
243        if (sWatchdog == null) {
244            sWatchdog = new Watchdog();
245        }
246
247        return sWatchdog;
248    }
249
250    private Watchdog() {
251        super("watchdog");
252        // Initialize handler checkers for each common thread we want to check.  Note
253        // that we are not currently checking the background thread, since it can
254        // potentially hold longer running operations with no guarantees about the timeliness
255        // of operations there.
256
257        // The shared foreground thread is the main checker.  It is where we
258        // will also dispatch monitor checks and do other work.
259        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
260                "foreground thread", DEFAULT_TIMEOUT);
261        mHandlerCheckers.add(mMonitorChecker);
262        // Add checker for main thread.  We only do a quick check since there
263        // can be UI running on the thread.
264        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
265                "main thread", DEFAULT_TIMEOUT));
266        // Add checker for shared UI thread.
267        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
268                "ui thread", DEFAULT_TIMEOUT));
269        // And also check IO thread.
270        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
271                "i/o thread", DEFAULT_TIMEOUT));
272        // And the display thread.
273        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
274                "display thread", DEFAULT_TIMEOUT));
275
276        // Initialize monitor for Binder threads.
277        addMonitor(new BinderThreadMonitor());
278
279        mOpenFdMonitor = OpenFdMonitor.create();
280
281        // See the notes on DEFAULT_TIMEOUT.
282        assert DB ||
283                DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
284    }
285
286    public void init(Context context, ActivityManagerService activity) {
287        mResolver = context.getContentResolver();
288        mActivity = activity;
289
290        context.registerReceiver(new RebootRequestReceiver(),
291                new IntentFilter(Intent.ACTION_REBOOT),
292                android.Manifest.permission.REBOOT, null);
293    }
294
295    public void processStarted(String name, int pid) {
296        synchronized (this) {
297            if ("com.android.phone".equals(name)) {
298                mPhonePid = pid;
299            }
300        }
301    }
302
303    public void setActivityController(IActivityController controller) {
304        synchronized (this) {
305            mController = controller;
306        }
307    }
308
309    public void setAllowRestart(boolean allowRestart) {
310        synchronized (this) {
311            mAllowRestart = allowRestart;
312        }
313    }
314
315    public void addMonitor(Monitor monitor) {
316        synchronized (this) {
317            if (isAlive()) {
318                throw new RuntimeException("Monitors can't be added once the Watchdog is running");
319            }
320            mMonitorChecker.addMonitor(monitor);
321        }
322    }
323
324    public void addThread(Handler thread) {
325        addThread(thread, DEFAULT_TIMEOUT);
326    }
327
328    public void addThread(Handler thread, long timeoutMillis) {
329        synchronized (this) {
330            if (isAlive()) {
331                throw new RuntimeException("Threads can't be added once the Watchdog is running");
332            }
333            final String name = thread.getLooper().getThread().getName();
334            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
335        }
336    }
337
338    /**
339     * Perform a full reboot of the system.
340     */
341    void rebootSystem(String reason) {
342        Slog.i(TAG, "Rebooting system because: " + reason);
343        IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
344        try {
345            pms.reboot(false, reason, false);
346        } catch (RemoteException ex) {
347        }
348    }
349
350    private int evaluateCheckerCompletionLocked() {
351        int state = COMPLETED;
352        for (int i=0; i<mHandlerCheckers.size(); i++) {
353            HandlerChecker hc = mHandlerCheckers.get(i);
354            state = Math.max(state, hc.getCompletionStateLocked());
355        }
356        return state;
357    }
358
359    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
360        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
361        for (int i=0; i<mHandlerCheckers.size(); i++) {
362            HandlerChecker hc = mHandlerCheckers.get(i);
363            if (hc.isOverdueLocked()) {
364                checkers.add(hc);
365            }
366        }
367        return checkers;
368    }
369
370    private String describeCheckersLocked(List<HandlerChecker> checkers) {
371        StringBuilder builder = new StringBuilder(128);
372        for (int i=0; i<checkers.size(); i++) {
373            if (builder.length() > 0) {
374                builder.append(", ");
375            }
376            builder.append(checkers.get(i).describeBlockedStateLocked());
377        }
378        return builder.toString();
379    }
380
381    private ArrayList<Integer> getInterestingHalPids() {
382        try {
383            IServiceManager serviceManager = IServiceManager.getService();
384            ArrayList<IServiceManager.InstanceDebugInfo> dump =
385                    serviceManager.debugDump();
386            HashSet<Integer> pids = new HashSet<>();
387            for (IServiceManager.InstanceDebugInfo info : dump) {
388                if (info.pid == IServiceManager.PidConstant.NO_PID) {
389                    continue;
390                }
391
392                if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
393                    continue;
394                }
395
396                pids.add(info.pid);
397            }
398            return new ArrayList<Integer>(pids);
399        } catch (RemoteException e) {
400            return new ArrayList<Integer>();
401        }
402    }
403
404    private ArrayList<Integer> getInterestingNativePids() {
405        ArrayList<Integer> pids = getInterestingHalPids();
406
407        int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
408        if (nativePids != null) {
409            pids.ensureCapacity(pids.size() + nativePids.length);
410            for (int i : nativePids) {
411                pids.add(i);
412            }
413        }
414
415        return pids;
416    }
417
418    @Override
419    public void run() {
420        boolean waitedHalf = false;
421        while (true) {
422            final List<HandlerChecker> blockedCheckers;
423            final String subject;
424            final boolean allowRestart;
425            int debuggerWasConnected = 0;
426            synchronized (this) {
427                long timeout = CHECK_INTERVAL;
428                // Make sure we (re)spin the checkers that have become idle within
429                // this wait-and-check interval
430                for (int i=0; i<mHandlerCheckers.size(); i++) {
431                    HandlerChecker hc = mHandlerCheckers.get(i);
432                    hc.scheduleCheckLocked();
433                }
434
435                if (debuggerWasConnected > 0) {
436                    debuggerWasConnected--;
437                }
438
439                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
440                // wait while asleep. If the device is asleep then the thing that we are waiting
441                // to timeout on is asleep as well and won't have a chance to run, causing a false
442                // positive on when to kill things.
443                long start = SystemClock.uptimeMillis();
444                while (timeout > 0) {
445                    if (Debug.isDebuggerConnected()) {
446                        debuggerWasConnected = 2;
447                    }
448                    try {
449                        wait(timeout);
450                    } catch (InterruptedException e) {
451                        Log.wtf(TAG, e);
452                    }
453                    if (Debug.isDebuggerConnected()) {
454                        debuggerWasConnected = 2;
455                    }
456                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
457                }
458
459                boolean fdLimitTriggered = false;
460                if (mOpenFdMonitor != null) {
461                    fdLimitTriggered = mOpenFdMonitor.monitor();
462                }
463
464                if (!fdLimitTriggered) {
465                    final int waitState = evaluateCheckerCompletionLocked();
466                    if (waitState == COMPLETED) {
467                        // The monitors have returned; reset
468                        waitedHalf = false;
469                        continue;
470                    } else if (waitState == WAITING) {
471                        // still waiting but within their configured intervals; back off and recheck
472                        continue;
473                    } else if (waitState == WAITED_HALF) {
474                        if (!waitedHalf) {
475                            // We've waited half the deadlock-detection interval.  Pull a stack
476                            // trace and wait another half.
477                            ArrayList<Integer> pids = new ArrayList<Integer>();
478                            pids.add(Process.myPid());
479                            ActivityManagerService.dumpStackTraces(true, pids, null, null,
480                                getInterestingNativePids());
481                            waitedHalf = true;
482                        }
483                        continue;
484                    }
485
486                    // something is overdue!
487                    blockedCheckers = getBlockedCheckersLocked();
488                    subject = describeCheckersLocked(blockedCheckers);
489                } else {
490                    blockedCheckers = Collections.emptyList();
491                    subject = "Open FD high water mark reached";
492                }
493                allowRestart = mAllowRestart;
494            }
495
496            // If we got here, that means that the system is most likely hung.
497            // First collect stack traces from all threads of the system process.
498            // Then kill this process so that the system will restart.
499            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
500
501            ArrayList<Integer> pids = new ArrayList<>();
502            pids.add(Process.myPid());
503            if (mPhonePid > 0) pids.add(mPhonePid);
504            // Pass !waitedHalf so that just in case we somehow wind up here without having
505            // dumped the halfway stacks, we properly re-initialize the trace file.
506            final File stack = ActivityManagerService.dumpStackTraces(
507                    !waitedHalf, pids, null, null, getInterestingNativePids());
508
509            // Give some extra time to make sure the stack traces get written.
510            // The system's been hanging for a minute, another second or two won't hurt much.
511            SystemClock.sleep(2000);
512
513            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
514            doSysRq('w');
515            doSysRq('l');
516
517            // Try to add the error to the dropbox, but assuming that the ActivityManager
518            // itself may be deadlocked.  (which has happened, causing this statement to
519            // deadlock and the watchdog as a whole to be ineffective)
520            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
521                    public void run() {
522                        mActivity.addErrorToDropBox(
523                                "watchdog", null, "system_server", null, null,
524                                subject, null, stack, null);
525                    }
526                };
527            dropboxThread.start();
528            try {
529                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
530            } catch (InterruptedException ignored) {}
531
532            IActivityController controller;
533            synchronized (this) {
534                controller = mController;
535            }
536            if (controller != null) {
537                Slog.i(TAG, "Reporting stuck state to activity controller");
538                try {
539                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
540                    // 1 = keep waiting, -1 = kill system
541                    int res = controller.systemNotResponding(subject);
542                    if (res >= 0) {
543                        Slog.i(TAG, "Activity controller requested to coninue to wait");
544                        waitedHalf = false;
545                        continue;
546                    }
547                } catch (RemoteException e) {
548                }
549            }
550
551            // Only kill the process if the debugger is not attached.
552            if (Debug.isDebuggerConnected()) {
553                debuggerWasConnected = 2;
554            }
555            if (debuggerWasConnected >= 2) {
556                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
557            } else if (debuggerWasConnected > 0) {
558                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
559            } else if (!allowRestart) {
560                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
561            } else {
562                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
563                WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
564                Slog.w(TAG, "*** GOODBYE!");
565                Process.killProcess(Process.myPid());
566                System.exit(10);
567            }
568
569            waitedHalf = false;
570        }
571    }
572
573    private void doSysRq(char c) {
574        try {
575            FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
576            sysrq_trigger.write(c);
577            sysrq_trigger.close();
578        } catch (IOException e) {
579            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
580        }
581    }
582
583    public static final class OpenFdMonitor {
584        /**
585         * Number of FDs below the soft limit that we trigger a runtime restart at. This was
586         * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number
587         * of FDs in reserve to complete a dump.
588         */
589        private static final int FD_HIGH_WATER_MARK = 12;
590
591        private final File mDumpDir;
592        private final File mFdHighWaterMark;
593
594        public static OpenFdMonitor create() {
595            // Only run the FD monitor on debuggable builds (such as userdebug and eng builds).
596            if (!Build.IS_DEBUGGABLE) {
597                return null;
598            }
599
600            // Don't run the FD monitor on builds that have a global ANR trace file. We're using
601            // the ANR trace directory as a quick hack in order to get these traces in bugreports
602            // and we wouldn't want to overwrite something important.
603            final String dumpDirStr = SystemProperties.get("dalvik.vm.stack-trace-dir", "");
604            if (dumpDirStr.isEmpty()) {
605                return null;
606            }
607
608            final StructRlimit rlimit;
609            try {
610                rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE);
611            } catch (ErrnoException errno) {
612                Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno);
613                return null;
614            }
615
616            // The assumption we're making here is that FD numbers are allocated (more or less)
617            // sequentially, which is currently (and historically) true since open is currently
618            // specified to always return the lowest-numbered non-open file descriptor for the
619            // current process.
620            //
621            // We do this to avoid having to enumerate the contents of /proc/self/fd in order to
622            // count the number of descriptors open in the process.
623            final File fdThreshold = new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK));
624            return new OpenFdMonitor(new File(dumpDirStr), fdThreshold);
625        }
626
627        OpenFdMonitor(File dumpDir, File fdThreshold) {
628            mDumpDir = dumpDir;
629            mFdHighWaterMark = fdThreshold;
630        }
631
632        private void dumpOpenDescriptors() {
633            try {
634                File dumpFile = File.createTempFile("anr_fd_", "", mDumpDir);
635                java.lang.Process proc = new ProcessBuilder()
636                    .command("/system/bin/lsof", "-p", String.valueOf(Process.myPid()))
637                    .redirectErrorStream(true)
638                    .redirectOutput(dumpFile)
639                    .start();
640
641                int returnCode = proc.waitFor();
642                if (returnCode != 0) {
643                    Slog.w(TAG, "Unable to dump open descriptors, lsof return code: "
644                        + returnCode);
645                    dumpFile.delete();
646                }
647            } catch (IOException | InterruptedException ex) {
648                Slog.w(TAG, "Unable to dump open descriptors: " + ex);
649            }
650        }
651
652        /**
653         * @return {@code true} if the high water mark was breached and a dump was written,
654         *     {@code false} otherwise.
655         */
656        public boolean monitor() {
657            if (mFdHighWaterMark.exists()) {
658                dumpOpenDescriptors();
659                return true;
660            }
661
662            return false;
663        }
664    }
665}
666