// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
47dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
57dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/process/kill.h"
67dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
77dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include <signal.h>
87dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include <sys/event.h>
97dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include <sys/types.h>
107dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include <sys/wait.h>
117dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
126e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)#include "base/files/file_util.h"
13a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)#include "base/files/scoped_file.h"
147dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/logging.h"
157dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/posix/eintr_wrapper.h"
167dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
177dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdochnamespace base {
187dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
197dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdochnamespace {
207dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
// Number of seconds EnsureProcessTerminated() gives a process to exit on its
// own before it is forcibly killed.
const int kWaitBeforeKillSeconds = 2;
227dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
237dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Reap |child| process. This call blocks until completion.
247dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdochvoid BlockingReap(pid_t child) {
257dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  const pid_t result = HANDLE_EINTR(waitpid(child, NULL, 0));
267dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  if (result == -1) {
277dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch    DPLOG(ERROR) << "waitpid(" << child << ", NULL, 0)";
287dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  }
297dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
307dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
// Waits for |timeout| seconds for the given |child| to exit and reap it. If
// the child doesn't exit within the time specified, kills it.
//
// Parameters:
//   child   - pid of the process to wait for; must be > 0 (DCHECKed).
//   timeout - how long to wait, in seconds; must be > 0 (DCHECKed).
//
// This function takes two approaches: first, it tries to use kqueue to
// observe when the process exits. kevent can monitor a kqueue with a
// timeout, so this method is preferred to wait for a specified period of
// time. Once the kqueue indicates the process has exited, waitpid will reap
// the exited child. If the kqueue doesn't provide an exit event notification
// before the timeout expires, or if the kqueue fails or misbehaves, the
// process will be mercilessly killed and reaped.
//
// A child process passed to this function may be in one of several states:
// running, terminated and not yet reaped, and (apparently, and unfortunately)
// terminated and already reaped. Normally, a process will at least have been
// asked to exit before this function is called, but this is not required.
// If a process is terminating and unreaped, there may be a window between the
// time that kqueue will no longer recognize it and when it becomes an actual
// zombie that a non-blocking (WNOHANG) waitpid can reap. This condition is
// detected when kqueue indicates that the process is not running and a
// non-blocking waitpid fails to reap the process but indicates that it is
// still running. In this event, a blocking attempt to reap the process
// collects the known-dying child, preventing zombies from congregating.
//
// In the event that the kqueue misbehaves entirely, as it might under an
// EMFILE condition ("too many open files", or out of file descriptors), this
// function will forcibly kill and reap the child without delay. This
// eliminates another potential zombie vector. (If you're out of file
// descriptors, you're probably deep into something else, but that doesn't
// mean that zombies should be allowed to kick you while you're down.)
//
// The fact that this function seemingly can be called to wait on a child
// that's not only already terminated but already reaped is a bit of a
// problem: a reaped child's pid can be reclaimed and may refer to a distinct
// process in that case. The fact that this function can seemingly be called
// to wait on a process that's not even a child is also a problem: kqueue will
// work in that case, but waitpid won't, and killing a non-child might not be
// the best approach.
void WaitForChildToDie(pid_t child, int timeout) {
  DCHECK(child > 0);
  DCHECK(timeout > 0);

  // DON'T ADD ANY EARLY RETURNS TO THIS FUNCTION without ensuring that
  // |child| has been reaped. Specifically, even if a kqueue, kevent, or other
  // call fails, this function should fall back to the last resort of trying
  // to kill and reap the process. Not observing this rule will resurrect
  // zombies.

  // |kq| is deliberately function-scoped: the descriptor stays open until the
  // function returns, whichever path is taken.
  ScopedFD kq(HANDLE_EINTR(kqueue()));
  if (!kq.is_valid()) {
    DPLOG(ERROR) << "kqueue()";
  } else {
    // Register interest in |child|'s exit.
    struct kevent change = {0};
    EV_SET(&change, child, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL);
    const int setup_result =
        HANDLE_EINTR(kevent(kq.get(), &change, 1, NULL, 0, NULL));

    if (setup_result == -1) {
      if (errno != ESRCH) {
        DPLOG(ERROR) << "kevent (setup " << child << ")";
      } else {
        // At this point, one of the following has occurred:
        // 1. The process has died but has not yet been reaped.
        // 2. The process has died and has already been reaped.
        // 3. The process is in the process of dying. It's no longer
        //    kqueueable, but it may not be waitable yet either. Mark calls
        //    this case the "zombie death race".

        const pid_t reaped = HANDLE_EINTR(waitpid(child, NULL, WNOHANG));

        if (reaped != 0) {
          // A positive result indicates case 1. waitpid succeeded and reaped
          // the child. A result of -1 indicates case 2. The child has already
          // been reaped. In both of these cases, no further action is
          // necessary.
          return;
        }

        // |reaped| is 0, indicating case 3. The process will be waitable in
        // short order. Fall back out of the kqueue code to kill it (for good
        // measure) and reap it.
      }
    } else {
      // Keep track of the elapsed time to be able to restart kevent if it's
      // interrupted.
      TimeDelta remaining_delta = TimeDelta::FromSeconds(timeout);
      const TimeTicks deadline = TimeTicks::Now() + remaining_delta;

      // Stays -1 if the loop body below never executes.
      int wait_result = -1;
      struct kevent event = {0};
      while (remaining_delta.InMilliseconds() > 0) {
        const struct timespec remaining_timespec = remaining_delta.ToTimeSpec();
        // Not wrapped in HANDLE_EINTR: an interrupted wait has to be restarted
        // with a recomputed, shorter timeout rather than the original one.
        wait_result = kevent(kq.get(), NULL, 0, &event, 1, &remaining_timespec);
        if (wait_result == -1 && errno == EINTR) {
          remaining_delta = deadline - TimeTicks::Now();
          // If the deadline has now passed, leave the loop looking like a
          // plain timeout (0 events) rather than an error.
          wait_result = 0;
        } else {
          break;
        }
      }

      if (wait_result == -1) {
        DPLOG(ERROR) << "kevent (wait " << child << ")";
      } else if (wait_result > 1) {
        DLOG(ERROR) << "kevent (wait " << child << "): unexpected result "
                    << wait_result;
      } else if (wait_result == 1) {
        if ((event.fflags & NOTE_EXIT) &&
            (event.ident == static_cast<uintptr_t>(child))) {
          // The process is dead or dying. This won't block for long, if at
          // all.
          BlockingReap(child);
          return;
        } else {
          DLOG(ERROR) << "kevent (wait " << child
                      << "): unexpected event: fflags=" << event.fflags
                      << ", ident=" << event.ident;
        }
      }
      // A |wait_result| of 0 means the timeout expired without an exit event;
      // fall through to the kill below.
    }
  }

  // The child is still alive, or is very freshly dead. Be sure by sending it
  // a signal. This is safe even if it's freshly dead, because it will be a
  // zombie (or on the way to zombiedom) and kill will return 0 even if the
  // signal is not delivered to a live process.
  if (kill(child, SIGKILL) == -1) {
    // No reap is attempted when the signal could not be sent.
    DPLOG(ERROR) << "kill(" << child << ", SIGKILL)";
  } else {
    // The child is definitely on the way out now. BlockingReap won't need to
    // wait for long, if at all.
    BlockingReap(child);
  }
}
1657dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
1667dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}  // namespace
1677dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
1687dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdochvoid EnsureProcessTerminated(ProcessHandle process) {
1697dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  WaitForChildToDie(process, kWaitBeforeKillSeconds);
1707dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
1717dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
1727dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}  // namespace base
173