1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/tools/flip_server/epoll_server.h"
6
7#include <stdlib.h>  // for abort
8#include <errno.h>    // for errno and strerror_r
9#include <algorithm>
10#include <ostream>
11#include <unistd.h>  // For read, pipe, close and write.
12#include <utility>
13#include <vector>
14
15#include "base/logging.h"
16#include "base/timer/timer.h"
17
18// Design notes: An efficient implementation of ready list has the following
19// desirable properties:
20//
21// A. O(1) insertion into/removal from the list in any location.
22// B. Once the callback is found by hash lookup using the fd, the lookup of
23//    corresponding entry in the list is O(1).
24// C. Safe insertion into/removal from the list during list iteration. (The
25//    ready list's purpose is to enable completely event driven I/O model.
26//    Thus, all the interesting bits happen in the callback. It is critical
//    to not place any restriction on the API during list iteration.)
28//
29// The current implementation achieves these goals with the following design:
30//
31// - The ready list is constructed as a doubly linked list to enable O(1)
32//   insertion/removal (see man 3 queue).
33// - The forward and backward links are directly embedded inside the
34//   CBAndEventMask struct. This enables O(1) lookup in the list for a given
//   callback. (Technically, we could've used std::list of hash_set::iterator,
36//   and keep a list::iterator in CBAndEventMask to achieve the same effect.
37//   However, iterators have two problems: no way to portably invalidate them,
38//   and no way to tell whether an iterator is singular or not. The only way to
39//   overcome these issues is to keep bools in both places, but that throws off
40//   memory alignment (up to 7 wasted bytes for each bool). The extra level of
41//   indirection will also likely be less cache friendly. Direct manipulation
42//   of link pointers makes it easier to retrieve the CBAndEventMask from the
//   list, easier to check whether a CBAndEventMask is in the list, uses less
44//   memory (save 32 bytes/fd), and does not affect cache usage (we need to
45//   read in the struct to use the callback anyway).)
46// - Embed the fd directly into CBAndEventMask and switch to using hash_set.
47//   This removes the need to store hash_map::iterator in the list just so that
48//   we can get both the fd and the callback.
49// - The ready list is "one shot": each entry is removed before OnEvent is
50//   called. This removes the mutation-while-iterating problem.
51// - Use two lists to keep track of callbacks. The ready_list_ is the one used
52//   for registration. Before iteration, the ready_list_ is swapped into the
53//   tmp_list_. Once iteration is done, tmp_list_ will be empty, and
54//   ready_list_ will have all the new ready fds.
55
56// The size we use for buffers passed to strerror_r
57static const int kErrorBufferSize = 256;
58
59namespace net {
60
61// Clears the pipe and returns.  Used for waking the epoll server up.
62class ReadPipeCallback : public EpollCallbackInterface {
63 public:
64  virtual void OnEvent(int fd, EpollEvent* event) OVERRIDE {
65    DCHECK(event->in_events == EPOLLIN);
66    int data;
67    int data_read = 1;
68    // Read until the pipe is empty.
69    while (data_read > 0) {
70      data_read = read(fd, &data, sizeof(data));
71    }
72  }
73  virtual void OnShutdown(EpollServer *eps, int fd) OVERRIDE {}
74  virtual void OnRegistration(EpollServer*, int, int) OVERRIDE {}
75  virtual void OnModification(int, int) OVERRIDE {}       // COV_NF_LINE
76  virtual void OnUnregistration(int, bool) OVERRIDE {}    // COV_NF_LINE
77};
78
79////////////////////////////////////////////////////////////////////////////////
80////////////////////////////////////////////////////////////////////////////////
81
// Creates the epoll fd and the internal wake-up pipe, and registers the
// pipe's read end so that Wake() can interrupt a blocked epoll_wait().
EpollServer::EpollServer()
  : epoll_fd_(epoll_create(1024)),
    timeout_in_us_(0),
    recorded_now_in_us_(0),
    ready_list_size_(0),
    wake_cb_(new ReadPipeCallback),
    read_fd_(-1),
    write_fd_(-1),
    in_wait_for_events_and_execute_callbacks_(false),
    in_shutdown_(false) {
  // ensure that the epoll_fd_ is valid.
  CHECK_NE(epoll_fd_, -1);
  LIST_INIT(&ready_list_);
  LIST_INIT(&tmp_list_);

  int pipe_fds[2];
  if (pipe(pipe_fds) < 0) {
    // Unfortunately, it is impossible to test any such initialization in
    // a constructor (as virtual methods do not yet work).
    // This -could- be solved by moving initialization to an outside
    // call...
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Error " << saved_errno
               << " in pipe(): " << strerror_r(saved_errno, buf, sizeof(buf));
  }
  // pipe_fds[0] is the read end, pipe_fds[1] the write end (see pipe(2)).
  read_fd_ = pipe_fds[0];
  write_fd_ = pipe_fds[1];
  RegisterFD(read_fd_, wake_cb_.get(), EPOLLIN);
}
112
// Delivers OnShutdown() to every registered fd callback and empties
// cb_map_.  Used during server teardown.
void EpollServer::CleanupFDToCBMap() {
  FDToCBMap::iterator cb_iter = cb_map_.begin();
  while (cb_iter != cb_map_.end()) {
    int fd = cb_iter->fd;
    CB* cb = cb_iter->cb;

    // Mark the entry in-use so an UnregisterFD() call made from inside
    // OnShutdown() will not erase this entry out from under us (see
    // UnregisterFD's in_use handling).
    cb_iter->in_use = true;
    if (cb) {
      cb->OnShutdown(this, fd);
    }

    // Erase, then restart from begin(): OnShutdown() may have mutated the
    // map, so no previously saved iterator can be trusted.
    cb_map_.erase(cb_iter);
    cb_iter = cb_map_.begin();
  }
}
128
129void EpollServer::CleanupTimeToAlarmCBMap() {
130  TimeToAlarmCBMap::iterator erase_it;
131
132  // Call OnShutdown() on alarms. Note that the structure of the loop
133  // is similar to the structure of loop in the function HandleAlarms()
134  for (TimeToAlarmCBMap::iterator i = alarm_map_.begin();
135       i != alarm_map_.end();
136      ) {
137    // Note that OnShutdown() can call UnregisterAlarm() on
138    // other iterators. OnShutdown() should not call UnregisterAlarm()
139    // on self because by definition the iterator is not valid any more.
140    i->second->OnShutdown(this);
141    erase_it = i;
142    ++i;
143    alarm_map_.erase(erase_it);
144  }
145}
146
// Tears the server down: notifies all fd callbacks and alarms of shutdown,
// then closes the wake-up pipe and the epoll fd.
EpollServer::~EpollServer() {
  DCHECK_EQ(in_shutdown_, false);
  // Lets callbacks observe (via in_shutdown_) that teardown has started.
  in_shutdown_ = true;
#ifdef EPOLL_SERVER_EVENT_TRACING
  LOG(INFO) << "\n" << event_recorder_;
#endif
  VLOG(2) << "Shutting down epoll server ";
  CleanupFDToCBMap();

  // Reset both lists; their nodes lived inside the (now erased) map
  // entries.
  LIST_INIT(&ready_list_);
  LIST_INIT(&tmp_list_);

  CleanupTimeToAlarmCBMap();

  // Close our own fds last, after no callback can touch them anymore.
  close(read_fd_);
  close(write_fd_);
  close(epoll_fd_);
}
165
// Whether a CBAndEventMask is on the ready list is determined by a non-NULL
// le_prev pointer (le_next being NULL indicates end of list).
168inline void EpollServer::AddToReadyList(CBAndEventMask* cb_and_mask) {
169  if (cb_and_mask->entry.le_prev == NULL) {
170    LIST_INSERT_HEAD(&ready_list_, cb_and_mask, entry);
171    ++ready_list_size_;
172  }
173}
174
175inline void EpollServer::RemoveFromReadyList(
176    const CBAndEventMask& cb_and_mask) {
177  if (cb_and_mask.entry.le_prev != NULL) {
178    LIST_REMOVE(&cb_and_mask, entry);
179    // Clean up all the ready list states. Don't bother with the other fields
180    // as they are initialized when the CBAandEventMask is added to the ready
181    // list. This saves a few cycles in the inner loop.
182    cb_and_mask.entry.le_prev = NULL;
183    --ready_list_size_;
184    if (ready_list_size_ == 0) {
185      DCHECK(ready_list_.lh_first == NULL);
186      DCHECK(tmp_list_.lh_first == NULL);
187    }
188  }
189}
190
// Registers |cb| to receive |event_mask| events for |fd|.  If |fd| was
// already registered, the previous callback is unregistered first and its
// map entry is recycled in place.
void EpollServer::RegisterFD(int fd, CB* cb, int event_mask) {
  CHECK(cb);
  VLOG(3) << "RegisterFD fd=" << fd << " event_mask=" << event_mask;
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i) {
    // do we just abort, or do we just unregister the other guy?
    // for now, lets just unregister the other guy.

    // unregister any callback that may already be registered for this FD.
    CB* other_cb = fd_i->cb;
    if (other_cb) {
      // Must remove from the ready list before erasing.
      RemoveFromReadyList(*fd_i);
      other_cb->OnUnregistration(fd, true);
      // The fd is still in the kernel's epoll set, so MOD (not ADD).
      ModFD(fd, event_mask);
    } else {
      // already unregistered, so just recycle the node.
      AddFD(fd, event_mask);
    }
    fd_i->cb = cb;
    fd_i->event_mask = event_mask;
    fd_i->events_to_fake = 0;
  } else {
    AddFD(fd, event_mask);
    cb_map_.insert(CBAndEventMask(cb, event_mask, fd));
  }


  // set the FD to be non-blocking.
  SetNonblocking(fd);

  cb->OnRegistration(this, fd, event_mask);
}
224
225int EpollServer::GetFlags(int fd) {
226  return fcntl(fd, F_GETFL, 0);
227}
228
// Puts |fd| into non-blocking mode (a no-op if O_NONBLOCK is already set).
// Any fcntl failure is treated as fatal.
void EpollServer::SetNonblocking(int fd) {
  int flags = GetFlags(fd);
  if (flags == -1) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Error " << saved_errno
               << " doing fcntl(" << fd << ", F_GETFL, 0): "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
  if (!(flags & O_NONBLOCK)) {
    // Keep the original flags only for the error message below.
    int saved_flags = flags;
    flags = SetFlags(fd, flags | O_NONBLOCK);
    if (flags == -1) {
      // bad.
      int saved_errno = errno;
      char buf[kErrorBufferSize];
      LOG(FATAL) << "Error " << saved_errno
        << " doing fcntl(" << fd << ", F_SETFL, " << saved_flags << "): "
        << strerror_r(saved_errno, buf, sizeof(buf));
    }
  }
}
251
252int EpollServer::epoll_wait_impl(int epfd,
253                                 struct epoll_event* events,
254                                 int max_events,
255                                 int timeout_in_ms) {
256  return epoll_wait(epfd, events, max_events, timeout_in_ms);
257}
258
// Convenience wrapper: registers |fd| for write events only.
void EpollServer::RegisterFDForWrite(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLOUT);
}

// Convenience wrapper: registers |fd| for both read and write events.
void EpollServer::RegisterFDForReadWrite(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLIN | EPOLLOUT);
}

// Convenience wrapper: registers |fd| for read events only.
void EpollServer::RegisterFDForRead(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLIN);
}
270
// Removes |fd| from the server.  Safe to call from inside an OnEvent()
// callchain: in that case the map entry stays alive (with cb set to NULL)
// so the dispatch code can still touch the CBAndEventMask afterwards.
void EpollServer::UnregisterFD(int fd) {
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() == fd_i || fd_i->cb == NULL) {
    // Doesn't exist in server, or has gone through UnregisterFD once and still
    // inside the callchain of OnEvent.
    return;
  }
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordUnregistration(fd);
#endif
  CB* cb = fd_i->cb;
  // Since the links are embedded within the struct, we must remove it from the
  // list before erasing it from the hash_set.
  RemoveFromReadyList(*fd_i);
  DelFD(fd);
  cb->OnUnregistration(fd, false);
  // fd_i->cb is NULL if that fd is unregistered inside the callchain of
  // OnEvent. Since the EpollServer needs a valid CBAndEventMask after OnEvent
  // returns in order to add it to the ready list, we cannot have UnregisterFD
  // erase the entry if it is in use. Thus, a NULL fd_i->cb is used as a
  // condition that tells the EpollServer that this entry is unused at a later
  // point.
  if (!fd_i->in_use) {
    cb_map_.erase(fd_i);
  } else {
    // Remove all trace of the registration, and just keep the node alive long
    // enough so the code that calls OnEvent doesn't have to worry about
    // figuring out whether the CBAndEventMask is valid or not.
    fd_i->cb = NULL;
    fd_i->event_mask = 0;
    fd_i->events_to_fake = 0;
  }
}
304
// Replaces the entire event mask for |fd| (~0 clears every existing bit
// before |event_mask| is applied).
void EpollServer::ModifyCallback(int fd, int event_mask) {
  ModifyFD(fd, ~0, event_mask);
}

// Clears EPOLLIN for |fd|; its callback stops receiving read events.
void EpollServer::StopRead(int fd) {
  ModifyFD(fd, EPOLLIN, 0);
}

// Sets EPOLLIN for |fd|; its callback starts receiving read events.
void EpollServer::StartRead(int fd) {
  ModifyFD(fd, 0, EPOLLIN);
}

// Clears EPOLLOUT for |fd|; its callback stops receiving write events.
void EpollServer::StopWrite(int fd) {
  ModifyFD(fd, EPOLLOUT, 0);
}

// Sets EPOLLOUT for |fd|; its callback starts receiving write events.
void EpollServer::StartWrite(int fd) {
  ModifyFD(fd, 0, EPOLLOUT);
}
324
325void EpollServer::HandleEvent(int fd, int event_mask) {
326#ifdef EPOLL_SERVER_EVENT_TRACING
327  event_recorder_.RecordEpollEvent(fd, event_mask);
328#endif
329  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
330  if (fd_i == cb_map_.end() || fd_i->cb == NULL) {
331    // Ignore the event.
332    // This could occur if epoll() returns a set of events, and
333    // while processing event A (earlier) we removed the callback
334    // for event B (and are now processing event B).
335    return;
336  }
337  fd_i->events_asserted = event_mask;
338  CBAndEventMask* cb_and_mask = const_cast<CBAndEventMask*>(&*fd_i);
339  AddToReadyList(cb_and_mask);
340}
341
342class TrueFalseGuard {
343 public:
344  explicit TrueFalseGuard(bool* guarded_bool) : guarded_bool_(guarded_bool) {
345    DCHECK(guarded_bool_ != NULL);
346    DCHECK(*guarded_bool_ == false);
347    *guarded_bool_ = true;
348  }
349  ~TrueFalseGuard() {
350    *guarded_bool_ = false;
351  }
352 private:
353  bool* guarded_bool_;
354};
355
// One iteration of the event loop: computes how long to sleep (bounded by
// the earliest alarm, if any), waits for and dispatches fd events, then
// fires any due alarms.  Must not be called recursively.
void EpollServer::WaitForEventsAndExecuteCallbacks() {
  if (in_wait_for_events_and_execute_callbacks_) {
    LOG(DFATAL) <<
      "Attempting to call WaitForEventsAndExecuteCallbacks"
      " when an ancestor to the current function is already"
      " WaitForEventsAndExecuteCallbacks!";
    // The line below is actually tested, but in coverage mode,
    // we never see it.
    return;  // COV_NF_LINE
  }
  // Resets in_wait_for_events_and_execute_callbacks_ on every exit path.
  TrueFalseGuard recursion_guard(&in_wait_for_events_and_execute_callbacks_);
  if (alarm_map_.empty()) {
    // no alarms, this is business as usual.
    WaitForEventsAndCallHandleEvents(timeout_in_us_,
                                     events_,
                                     events_size_);
    // Invalidate the cached timestamp until the next wait records one.
    recorded_now_in_us_ = 0;
    return;
  }

  // store the 'now'. If we recomputed 'now' every iteration
  // down below, then we might never exit that loop-- any
  // long-running alarms might install other long-running
  // alarms, etc. By storing it here now, we ensure that
  // a more reasonable amount of work is done here.
  int64 now_in_us  = NowInUsec();

  // Get the first timeout from the alarm_map where it is
  // stored in absolute time.
  int64 next_alarm_time_in_us =  alarm_map_.begin()->first;
  VLOG(4) << "next_alarm_time = " << next_alarm_time_in_us
          << " now             = " << now_in_us
          << " timeout_in_us = " << timeout_in_us_;

  int64 wait_time_in_us;
  int64 alarm_timeout_in_us = next_alarm_time_in_us - now_in_us;

  // If the next alarm is sooner than the default timeout, or if there is no
  // timeout (timeout_in_us_ == -1), wake up when the alarm should fire.
  // Otherwise use the default timeout.
  if (alarm_timeout_in_us < timeout_in_us_ || timeout_in_us_ < 0) {
    // An already-overdue alarm clamps the wait to zero (poll, don't sleep).
    wait_time_in_us = std::max(alarm_timeout_in_us, static_cast<int64>(0));
  } else {
    wait_time_in_us = timeout_in_us_;
  }

  VLOG(4) << "wait_time_in_us = " << wait_time_in_us;

  // wait for events.

  WaitForEventsAndCallHandleEvents(wait_time_in_us,
                                   events_,
                                   events_size_);
  CallAndReregisterAlarmEvents();
  recorded_now_in_us_ = 0;
}
412
// Queues fake events for |fd|: on the next dispatch pass its callback's
// OnEvent() will observe |events_to_fake| even without a kernel event.
void EpollServer::SetFDReady(int fd, int events_to_fake) {
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i && fd_i->cb != NULL) {
    // This const_cast is necessary for LIST_HEAD_INSERT to work. Declaring
    // entry mutable is insufficient because LIST_HEAD_INSERT assigns the
    // forward pointer of the list head to the current cb_and_mask, and the
    // compiler complains that it can't assign a const T* to a T*.
    CBAndEventMask* cb_and_mask = const_cast<CBAndEventMask*>(&*fd_i);
    // Note that there is no clearly correct behavior here when
    // cb_and_mask->events_to_fake != 0 and this function is called.
    // Of the two operations:
    //      cb_and_mask->events_to_fake = events_to_fake
    //      cb_and_mask->events_to_fake |= events_to_fake
    // the first was picked because it discourages users from calling
    // SetFDReady repeatedly to build up the correct event set as it is more
    // efficient to call SetFDReady once with the correct, final mask.
    cb_and_mask->events_to_fake = events_to_fake;
    AddToReadyList(cb_and_mask);
  }
}
433
434void EpollServer::SetFDNotReady(int fd) {
435  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
436  if (cb_map_.end() != fd_i) {
437    RemoveFromReadyList(*fd_i);
438  }
439}
440
441bool EpollServer::IsFDReady(int fd) const {
442  FDToCBMap::const_iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
443  return (cb_map_.end() != fd_i &&
444          fd_i->cb != NULL &&
445          fd_i->entry.le_prev != NULL);
446}
447
448void EpollServer::VerifyReadyList() const {
449  int count = 0;
450  CBAndEventMask* cur = ready_list_.lh_first;
451  for (; cur; cur = cur->entry.le_next) {
452    ++count;
453  }
454  for (cur = tmp_list_.lh_first; cur; cur = cur->entry.le_next) {
455    ++count;
456  }
457  CHECK_EQ(ready_list_size_, count) << "Ready list size does not match count";
458}
459
// Registers |ac| to fire at the absolute time |timeout_time_in_us|.
// Registering the same alarm object twice is fatal.
void EpollServer::RegisterAlarm(int64 timeout_time_in_us, AlarmCB* ac) {
  CHECK(ac);
  if (ContainsAlarm(ac)) {
    LOG(FATAL) << "Alarm already exists " << ac;
  }
  VLOG(4) << "RegisteringAlarm at : " << timeout_time_in_us;

  TimeToAlarmCBMap::iterator alarm_iter =
      alarm_map_.insert(std::make_pair(timeout_time_in_us, ac));

  // all_alarms_ backs the ContainsAlarm() duplicate check above.
  all_alarms_.insert(ac);
  // Pass the iterator to the EpollAlarmCallbackInterface.
  ac->OnRegistration(alarm_iter, this);
}
474
475// Unregister a specific alarm callback: iterator_token must be a
476//  valid iterator. The caller must ensure the validity of the iterator.
477void EpollServer::UnregisterAlarm(const AlarmRegToken& iterator_token) {
478  AlarmCB* cb = iterator_token->second;
479  alarm_map_.erase(iterator_token);
480  all_alarms_.erase(cb);
481  cb->OnUnregistration();
482}
483
484int EpollServer::NumFDsRegistered() const {
485  DCHECK(cb_map_.size() >= 1);
486  // Omit the internal FD (read_fd_)
487  return cb_map_.size() - 1;
488}
489
490void EpollServer::Wake() {
491  char data = 'd';  // 'd' is for data.  It's good enough for me.
492  int rv = write(write_fd_, &data, 1);
493  DCHECK(rv == 1);
494}
495
// Returns the current wall-clock time in base::Time's internal
// (microsecond) units.
int64 EpollServer::NowInUsec() const {
  return base::Time::Now().ToInternalValue();
}
499
500int64 EpollServer::ApproximateNowInUsec() const {
501  if (recorded_now_in_us_ != 0) {
502    return recorded_now_in_us_;
503  }
504  return this->NowInUsec();
505}
506
507std::string EpollServer::EventMaskToString(int event_mask) {
508  std::string s;
509  if (event_mask & EPOLLIN) s += "EPOLLIN ";
510  if (event_mask & EPOLLPRI) s += "EPOLLPRI ";
511  if (event_mask & EPOLLOUT) s += "EPOLLOUT ";
512  if (event_mask & EPOLLRDNORM) s += "EPOLLRDNORM ";
513  if (event_mask & EPOLLRDBAND) s += "EPOLLRDBAND ";
514  if (event_mask & EPOLLWRNORM) s += "EPOLLWRNORM ";
515  if (event_mask & EPOLLWRBAND) s += "EPOLLWRBAND ";
516  if (event_mask & EPOLLMSG) s += "EPOLLMSG ";
517  if (event_mask & EPOLLERR) s += "EPOLLERR ";
518  if (event_mask & EPOLLHUP) s += "EPOLLHUP ";
519  if (event_mask & EPOLLONESHOT) s += "EPOLLONESHOT ";
520  if (event_mask & EPOLLET) s += "EPOLLET ";
521  return s;
522}
523
// Dumps all registered alarms and fd callbacks to the error log, for
// post-mortem debugging.
void EpollServer::LogStateOnCrash() {
  LOG(ERROR) << "----------------------Epoll Server---------------------------";
  LOG(ERROR) << "Epoll server " << this << " polling on fd " << epoll_fd_;
  LOG(ERROR) << "timeout_in_us_: " << timeout_in_us_;

  // Log sessions with alarms.
  LOG(ERROR) << alarm_map_.size() << " alarms registered.";
  for (TimeToAlarmCBMap::iterator it = alarm_map_.begin();
       it != alarm_map_.end();
       ++it) {
    // "skipped" alarms were re-registered during the current alarm pass and
    // will not fire until the next pass (see CallAndReregisterAlarmEvents).
    const bool skipped =
        alarms_reregistered_and_should_be_skipped_.find(it->second)
        != alarms_reregistered_and_should_be_skipped_.end();
    LOG(ERROR) << "Alarm " << it->second << " registered at time " << it->first
               << " and should be skipped = " << skipped;
  }

  LOG(ERROR) << cb_map_.size() << " fd callbacks registered.";
  for (FDToCBMap::iterator it = cb_map_.begin();
       it != cb_map_.end();
       ++it) {
    LOG(ERROR) << "fd: " << it->fd << " with mask " << it->event_mask
               << " registered with cb: " << it->cb;
  }
  LOG(ERROR) << "----------------------/Epoll Server--------------------------";
}
550
551
552
553////////////////////////////////////////////////////////////////////////////////
554////////////////////////////////////////////////////////////////////////////////
555
// Removes |fd| from the kernel epoll set; a failed EPOLL_CTL_DEL is fatal.
void EpollServer::DelFD(int fd) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, 0, "DelFD");
#endif
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Epoll set removal error for fd " << fd << ": "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
}
569
570////////////////////////////////////////
571
// Adds |fd| to the kernel epoll set.  EPOLLERR and EPOLLHUP are always
// requested in addition to the caller's mask; a failed EPOLL_CTL_ADD is
// fatal.
void EpollServer::AddFD(int fd, int event_mask) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
  ee.events = event_mask | EPOLLERR | EPOLLHUP;
  ee.data.fd = fd;
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, ee.events, "AddFD");
#endif
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Epoll set insertion error for fd " << fd << ": "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
}
587
588////////////////////////////////////////
589
// Replaces |fd|'s kernel event mask.  As in AddFD(), EPOLLERR and EPOLLHUP
// are always included; a failed EPOLL_CTL_MOD is fatal.
void EpollServer::ModFD(int fd, int event_mask) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
  ee.events = event_mask | EPOLLERR | EPOLLHUP;
  ee.data.fd = fd;
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, ee.events, "ModFD");
#endif
  VLOG(3) <<  "modifying fd= " << fd << " "
          << EventMaskToString(ee.events);
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_MOD, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Epoll set modification error for fd " << fd << ": "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
}
607
608////////////////////////////////////////
609
610void EpollServer::ModifyFD(int fd, int remove_event, int add_event) {
611  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
612  if (cb_map_.end() == fd_i) {
613    VLOG(2) << "Didn't find the fd " << fd << "in internal structures";
614    return;
615  }
616
617  if (fd_i->cb != NULL) {
618    int & event_mask = fd_i->event_mask;
619    VLOG(3) << "fd= " << fd
620            << " event_mask before: " << EventMaskToString(event_mask);
621    event_mask &= ~remove_event;
622    event_mask |= add_event;
623
624    VLOG(3) << " event_mask after: " << EventMaskToString(event_mask);
625
626    ModFD(fd, event_mask);
627
628    fd_i->cb->OnModification(fd, event_mask);
629  }
630}
631
// Core wait step: translates |timeout_in_us| to epoll's millisecond
// granularity (never sleeping when the ready list already has work),
// queues any kernel events onto the ready list via HandleEvent(), and then
// dispatches the ready list.
void EpollServer::WaitForEventsAndCallHandleEvents(int64 timeout_in_us,
                                                   struct epoll_event events[],
                                                   int events_size) {
  if (timeout_in_us == 0 || ready_list_.lh_first != NULL) {
    // If ready list is not empty, then don't sleep at all.
    timeout_in_us = 0;
  } else if (timeout_in_us < 0) {
    LOG(INFO) << "Negative epoll timeout: " << timeout_in_us
              << "us; epoll will wait forever for events.";
    // If timeout_in_us is < 0 we are supposed to Wait forever.  This means we
    // should set timeout_in_us to -1000 so we will
    // Wait(-1000/1000) == Wait(-1) == Wait forever.
    timeout_in_us = -1000;
  } else {
    // If timeout is specified, and the ready list is empty.
    // Round sub-millisecond timeouts up to 1ms; otherwise the division
    // below would produce a 0ms (busy-poll) epoll_wait.
    if (timeout_in_us < 1000) {
      timeout_in_us = 1000;
    }
  }
  const int timeout_in_ms = timeout_in_us / 1000;
  int nfds = epoll_wait_impl(epoll_fd_,
                             events,
                             events_size,
                             timeout_in_ms);
  VLOG(3) << "nfds=" << nfds;

#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordEpollWaitEvent(timeout_in_ms, nfds);
#endif

  // If you're wondering why the NowInUsec() is recorded here, the answer is
  // simple: If we did it before the epoll_wait_impl, then the max error for
  // the ApproximateNowInUs() call would be as large as the maximum length of
  // epoll_wait, which can be arbitrarily long. Since this would make
  // ApproximateNowInUs() worthless, we instead record the time -after- we've
  // done epoll_wait, which guarantees that the maximum error is the amount of
  // time it takes to process all the events generated by epoll_wait.
  recorded_now_in_us_ = NowInUsec();
  if (nfds > 0) {
    for (int i = 0; i < nfds; ++i) {
      int event_mask = events[i].events;
      int fd = events[i].data.fd;
      HandleEvent(fd, event_mask);
    }
  } else if (nfds < 0) {
    // Catch interrupted syscall and just ignore it and move on.
    if (errno != EINTR && errno != 0) {
      int saved_errno = errno;
      char buf[kErrorBufferSize];
      LOG(FATAL) << "Error " << saved_errno << " in epoll_wait: "
                 << strerror_r(saved_errno, buf, sizeof(buf));
    }
  }

  // Now run through the ready list.
  if (ready_list_.lh_first) {
    CallReadyListCallbacks();
  }
}
691
// Dispatches OnEvent() for every entry currently on the ready list.  The
// list is first swapped into tmp_list_ so callbacks can safely re-arm fds
// (via SetFDReady etc.) onto a fresh ready_list_ during iteration.
void EpollServer::CallReadyListCallbacks() {
  // Check pre-conditions.
  DCHECK(tmp_list_.lh_first == NULL);
  // Swap out the ready_list_ into the tmp_list_ before traversing the list to
  // enable SetFDReady() to just push new items into the ready_list_.
  std::swap(ready_list_.lh_first, tmp_list_.lh_first);
  if (tmp_list_.lh_first) {
    // Fix up the head's back-link, which still points at ready_list_.
    tmp_list_.lh_first->entry.le_prev = &tmp_list_.lh_first;
    EpollEvent event(0, false);
    while (tmp_list_.lh_first != NULL) {
      DCHECK_GT(ready_list_size_, 0);
      CBAndEventMask* cb_and_mask = tmp_list_.lh_first;
      // "One shot" semantics: unlink before dispatch so the callback can
      // freely re-add itself without corrupting the iteration.
      RemoveFromReadyList(*cb_and_mask);

      event.out_ready_mask = 0;
      event.in_events =
        cb_and_mask->events_asserted | cb_and_mask->events_to_fake;
      // TODO(fenix): get rid of the two separate fields in cb_and_mask.
      cb_and_mask->events_asserted = 0;
      cb_and_mask->events_to_fake = 0;
      {
        // OnEvent() may call UnRegister, so we set in_use, here. Any
        // UnRegister call will now simply set the cb to NULL instead of
        // invalidating the cb_and_mask object (by deleting the object in the
        // map to which cb_and_mask refers)
        TrueFalseGuard in_use_guard(&(cb_and_mask->in_use));
        cb_and_mask->cb->OnEvent(cb_and_mask->fd, &event);
      }

      // Since OnEvent may have called UnregisterFD, we must check here that
      // the callback is still valid. If it isn't, then UnregisterFD *was*
      // called, and we should now get rid of the object.
      if (cb_and_mask->cb == NULL) {
        cb_map_.erase(*cb_and_mask);
      } else if (event.out_ready_mask != 0) {
        // The callback asked to be re-armed with fake events.
        cb_and_mask->events_to_fake = event.out_ready_mask;
        AddToReadyList(cb_and_mask);
      }
    }
  }
  DCHECK(tmp_list_.lh_first == NULL);
}
734
// Fires every alarm whose deadline is <= the recorded 'now', re-registering
// any alarm whose OnAlarm() returns a new positive deadline.  Alarms that
// re-register at an already-past time are deferred to the next pass (via
// alarms_reregistered_and_should_be_skipped_) so this loop cannot run
// forever.
void EpollServer::CallAndReregisterAlarmEvents() {
  int64 now_in_us = recorded_now_in_us_;
  // Callers must have run a wait step first, which records the timestamp.
  DCHECK_NE(0, recorded_now_in_us_);

  TimeToAlarmCBMap::iterator erase_it;

  // execute alarms.
  for (TimeToAlarmCBMap::iterator i = alarm_map_.begin();
       i != alarm_map_.end();
      ) {
    // alarm_map_ is sorted by deadline, so the first future alarm ends the
    // pass.
    if (i->first > now_in_us) {
      break;
    }
    AlarmCB* cb = i->second;
    // Execute the OnAlarm() only if we did not register
    // it in this loop itself.
    const bool added_in_this_round =
        alarms_reregistered_and_should_be_skipped_.find(cb)
        != alarms_reregistered_and_should_be_skipped_.end();
    if (added_in_this_round) {
      ++i;
      continue;
    }
    all_alarms_.erase(cb);
    const int64 new_timeout_time_in_us = cb->OnAlarm();

    // Advance before erasing so the loop iterator stays valid.
    erase_it = i;
    ++i;
    alarm_map_.erase(erase_it);

    if (new_timeout_time_in_us > 0) {
      // We add to hash_set only if the new timeout is <= now_in_us.
      // if timeout is > now_in_us then we have no fear that this alarm
      // can be reexecuted in this loop, and hence we do not need to
      // worry about a recursive loop.
      DVLOG(3) << "Reregistering alarm "
               << " " << cb
               << " " << new_timeout_time_in_us
               << " " << now_in_us;
      if (new_timeout_time_in_us <= now_in_us) {
        alarms_reregistered_and_should_be_skipped_.insert(cb);
      }
      RegisterAlarm(new_timeout_time_in_us, cb);
    }
  }
  alarms_reregistered_and_should_be_skipped_.clear();
}
782
// Starts out unregistered and not attached to any server.
EpollAlarm::EpollAlarm() : eps_(NULL), registered_(false) {
}
785
// Ensures a still-registered alarm is removed from its server before the
// object goes away.
EpollAlarm::~EpollAlarm() {
  UnregisterIfRegistered();
}
789
// Marks the alarm as no longer registered.  Returning 0 tells the server
// not to re-register it (see CallAndReregisterAlarmEvents).
int64 EpollAlarm::OnAlarm() {
  registered_ = false;
  return 0;
}
794
// Called by EpollServer::RegisterAlarm(); stores the server pointer and
// the map-iterator token needed for later unregistration.
void EpollAlarm::OnRegistration(const EpollServer::AlarmRegToken& token,
                                EpollServer* eps) {
  DCHECK_EQ(false, registered_);

  token_ = token;
  eps_ = eps;
  registered_ = true;
}
803
// Called by EpollServer::UnregisterAlarm() once the server has dropped us.
void EpollAlarm::OnUnregistration() {
  registered_ = false;
}
807
// The server is shutting down: drop our reference to it so we never try to
// unregister from a dead server.
void EpollAlarm::OnShutdown(EpollServer* eps) {
  registered_ = false;
  eps_ = NULL;
}
812
813// If the alarm was registered, unregister it.
814void EpollAlarm::UnregisterIfRegistered() {
815  if (!registered_) {
816    return;
817  }
818  eps_->UnregisterAlarm(token_);
819}
820
821}  // namespace net
822
823