// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/tools/epoll_server/epoll_server.h"

#include <unistd.h>  // For read, pipe, close and write.
#include <stdlib.h>  // for abort
#include <errno.h>   // for errno and strerror_r
#include <algorithm>
#include <utility>
#include <vector>

#include "base/logging.h"
#include "base/timer/timer.h"

// Design notes: An efficient implementation of the ready list has the
// following desirable properties:
//
// A. O(1) insertion into/removal from the list in any location.
// B. Once the callback is found by hash lookup using the fd, the lookup of
//    the corresponding entry in the list is O(1).
// C. Safe insertion into/removal from the list during list iteration. (The
//    ready list's purpose is to enable a completely event-driven I/O model.
//    Thus, all the interesting bits happen in the callback. It is critical
//    to not place any restriction on the API during list iteration.)
//
// The current implementation achieves these goals with the following design:
//
// - The ready list is constructed as a doubly linked list to enable O(1)
//   insertion/removal (see man 3 queue).
// - The forward and backward links are directly embedded inside the
//   CBAndEventMask struct. This enables O(1) lookup in the list for a given
//   callback. (Technically, we could have used a std::list of
//   hash_set::iterator, and kept a list::iterator in CBAndEventMask to
//   achieve the same effect. However, iterators have two problems: no way to
//   portably invalidate them, and no way to tell whether an iterator is
//   singular or not. The only way to overcome these issues is to keep bools
//   in both places, but that throws off memory alignment (up to 7 wasted
//   bytes for each bool). The extra level of indirection would also likely be
//   less cache friendly. Direct manipulation of link pointers makes it easier
//   to retrieve the CBAndEventMask from the list, easier to check whether a
//   CBAndEventMask is in the list, uses less memory (saving 32 bytes/fd), and
//   does not affect cache usage (we need to read in the struct to use the
//   callback anyway).)
// - Embed the fd directly into CBAndEventMask and switch to using hash_set.
//   This removes the need to store hash_map::iterator in the list just so that
//   we can get both the fd and the callback.
// - The ready list is "one shot": each entry is removed before OnEvent is
//   called. This removes the mutation-while-iterating problem.
// - Use two lists to keep track of callbacks. The ready_list_ is the one used
//   for registration. Before iteration, the ready_list_ is swapped into the
//   tmp_list_. Once iteration is done, tmp_list_ will be empty, and
//   ready_list_ will have all the new ready fds.
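//
// For readers new to this API, here is an illustrative sketch (not part of
// this file) of how a callback is registered and driven by the event loop.
// The EchoCallback class and echo_fd below are hypothetical; only the
// EpollServer / EpollCallbackInterface calls are real.
//
//   class EchoCallback : public EpollCallbackInterface {
//    public:
//     virtual void OnEvent(int fd, EpollEvent* event) OVERRIDE {
//       if (event->in_events & EPOLLIN) {
//         char buf[512];
//         // The fd is non-blocking (RegisterFD calls SetNonblocking), so
//         // read() returning -1 with EAGAIN ends the drain loop.
//         while (read(fd, buf, sizeof(buf)) > 0) {
//         }
//       }
//     }
//     virtual void OnShutdown(EpollServer* eps, int fd) OVERRIDE {}
//     virtual void OnRegistration(EpollServer*, int, int) OVERRIDE {}
//     virtual void OnModification(int, int) OVERRIDE {}
//     virtual void OnUnregistration(int, bool) OVERRIDE {}
//   };
//
//   EpollServer eps;
//   EchoCallback echo_cb;
//   eps.RegisterFDForRead(echo_fd, &echo_cb);
//   for (;;) {
//     eps.WaitForEventsAndExecuteCallbacks();
//   }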

// The size we use for buffers passed to strerror_r
static const int kErrorBufferSize = 256;

namespace net {

// Clears the pipe and returns.  Used for waking the epoll server up.
class ReadPipeCallback : public EpollCallbackInterface {
 public:
  virtual void OnEvent(int fd, EpollEvent* event) OVERRIDE {
    DCHECK(event->in_events == EPOLLIN);
    int data;
    int data_read = 1;
    // Read until the pipe is empty.
    while (data_read > 0) {
      data_read = read(fd, &data, sizeof(data));
    }
  }
  virtual void OnShutdown(EpollServer* eps, int fd) OVERRIDE {}
  virtual void OnRegistration(EpollServer*, int, int) OVERRIDE {}
  virtual void OnModification(int, int) OVERRIDE {}       // COV_NF_LINE
  virtual void OnUnregistration(int, bool) OVERRIDE {}    // COV_NF_LINE
};

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

EpollServer::EpollServer()
  : epoll_fd_(epoll_create(1024)),
    timeout_in_us_(0),
    recorded_now_in_us_(0),
    ready_list_size_(0),
    wake_cb_(new ReadPipeCallback),
    read_fd_(-1),
    write_fd_(-1),
    in_wait_for_events_and_execute_callbacks_(false),
    in_shutdown_(false) {
  // ensure that the epoll_fd_ is valid.
  CHECK_NE(epoll_fd_, -1);
  LIST_INIT(&ready_list_);
  LIST_INIT(&tmp_list_);

  int pipe_fds[2];
  if (pipe(pipe_fds) < 0) {
    // Unfortunately, it is impossible to test any such initialization in
    // a constructor (as virtual methods do not yet work).
    // This -could- be solved by moving initialization to an outside
    // call...
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Error " << saved_errno
               << " in pipe(): " << strerror_r(saved_errno, buf, sizeof(buf));
  }
  read_fd_ = pipe_fds[0];
  write_fd_ = pipe_fds[1];
  RegisterFD(read_fd_, wake_cb_.get(), EPOLLIN);
}

void EpollServer::CleanupFDToCBMap() {
  FDToCBMap::iterator cb_iter = cb_map_.begin();
  while (cb_iter != cb_map_.end()) {
    int fd = cb_iter->fd;
    CB* cb = cb_iter->cb;

    cb_iter->in_use = true;
    if (cb) {
      cb->OnShutdown(this, fd);
    }

    cb_map_.erase(cb_iter);
    cb_iter = cb_map_.begin();
  }
}

void EpollServer::CleanupTimeToAlarmCBMap() {
  TimeToAlarmCBMap::iterator erase_it;

  // Call OnShutdown() on alarms. Note that the structure of this loop
  // is similar to the structure of the loop in CallAndReregisterAlarmEvents().
  for (TimeToAlarmCBMap::iterator i = alarm_map_.begin();
       i != alarm_map_.end();
      ) {
    // Note that OnShutdown() can call UnregisterAlarm() on
    // other iterators. OnShutdown() should not call UnregisterAlarm()
    // on itself because by definition the iterator is no longer valid.
    i->second->OnShutdown(this);
    erase_it = i;
    ++i;
    alarm_map_.erase(erase_it);
  }
}

EpollServer::~EpollServer() {
  DCHECK_EQ(in_shutdown_, false);
  in_shutdown_ = true;
#ifdef EPOLL_SERVER_EVENT_TRACING
  LOG(INFO) << "\n" << event_recorder_;
#endif
  VLOG(2) << "Shutting down epoll server ";
  CleanupFDToCBMap();

  LIST_INIT(&ready_list_);
  LIST_INIT(&tmp_list_);

  CleanupTimeToAlarmCBMap();

  close(read_fd_);
  close(write_fd_);
  close(epoll_fd_);
}

// Whether a CBAndEventMask is on the ready list is determined by a non-NULL
// le_prev pointer (le_next being NULL indicates the end of the list).
inline void EpollServer::AddToReadyList(CBAndEventMask* cb_and_mask) {
  if (cb_and_mask->entry.le_prev == NULL) {
    LIST_INSERT_HEAD(&ready_list_, cb_and_mask, entry);
    ++ready_list_size_;
  }
}

inline void EpollServer::RemoveFromReadyList(
    const CBAndEventMask& cb_and_mask) {
  if (cb_and_mask.entry.le_prev != NULL) {
    LIST_REMOVE(&cb_and_mask, entry);
    // Clean up all the ready list state. Don't bother with the other fields,
    // as they are initialized when the CBAndEventMask is added to the ready
    // list. This saves a few cycles in the inner loop.
    cb_and_mask.entry.le_prev = NULL;
    --ready_list_size_;
    if (ready_list_size_ == 0) {
      DCHECK(ready_list_.lh_first == NULL);
      DCHECK(tmp_list_.lh_first == NULL);
    }
  }
}

void EpollServer::RegisterFD(int fd, CB* cb, int event_mask) {
  CHECK(cb);
  VLOG(3) << "RegisterFD fd=" << fd << " event_mask=" << event_mask;
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i) {
    // Do we just abort, or do we just unregister the other guy?
    // For now, let's just unregister the other guy.

    // Unregister any callback that may already be registered for this FD.
    CB* other_cb = fd_i->cb;
    if (other_cb) {
      // Must remove from the ready list before erasing.
      RemoveFromReadyList(*fd_i);
      other_cb->OnUnregistration(fd, true);
      ModFD(fd, event_mask);
    } else {
      // Already unregistered, so just recycle the node.
      AddFD(fd, event_mask);
    }
    fd_i->cb = cb;
    fd_i->event_mask = event_mask;
    fd_i->events_to_fake = 0;
  } else {
    AddFD(fd, event_mask);
    cb_map_.insert(CBAndEventMask(cb, event_mask, fd));
  }

  // Set the FD to be non-blocking.
  SetNonblocking(fd);

  cb->OnRegistration(this, fd, event_mask);
}

int EpollServer::GetFlags(int fd) {
  return fcntl(fd, F_GETFL, 0);
}

void EpollServer::SetNonblocking(int fd) {
  int flags = GetFlags(fd);
  if (flags == -1) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Error " << saved_errno
               << " doing fcntl(" << fd << ", F_GETFL, 0): "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
  if (!(flags & O_NONBLOCK)) {
    int saved_flags = flags;
    flags = SetFlags(fd, flags | O_NONBLOCK);
    if (flags == -1) {
      // Bad.
      int saved_errno = errno;
      char buf[kErrorBufferSize];
      LOG(FATAL) << "Error " << saved_errno
                 << " doing fcntl(" << fd << ", F_SETFL, " << saved_flags
                 << "): " << strerror_r(saved_errno, buf, sizeof(buf));
    }
  }
}

int EpollServer::epoll_wait_impl(int epfd,
                                 struct epoll_event* events,
                                 int max_events,
                                 int timeout_in_ms) {
  return epoll_wait(epfd, events, max_events, timeout_in_ms);
}

void EpollServer::RegisterFDForWrite(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLOUT);
}

void EpollServer::RegisterFDForReadWrite(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLIN | EPOLLOUT);
}

void EpollServer::RegisterFDForRead(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLIN);
}

void EpollServer::UnregisterFD(int fd) {
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() == fd_i || fd_i->cb == NULL) {
    // Doesn't exist in the server, or has already gone through UnregisterFD
    // once and is still inside the call chain of OnEvent.
    return;
  }
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordUnregistration(fd);
#endif
  CB* cb = fd_i->cb;
  // Since the links are embedded within the struct, we must remove it from the
  // list before erasing it from the hash_set.
  RemoveFromReadyList(*fd_i);
  DelFD(fd);
  cb->OnUnregistration(fd, false);
  // fd_i->cb is NULL if that fd is unregistered inside the call chain of
  // OnEvent. Since the EpollServer needs a valid CBAndEventMask after OnEvent
  // returns in order to add it to the ready list, we cannot have UnregisterFD
  // erase the entry if it is in use. Thus, a NULL fd_i->cb is used as a
  // condition that tells the EpollServer that this entry is unused at a later
  // point.
  if (!fd_i->in_use) {
    cb_map_.erase(fd_i);
  } else {
    // Remove all trace of the registration, and just keep the node alive long
    // enough so the code that calls OnEvent doesn't have to worry about
    // figuring out whether the CBAndEventMask is valid or not.
    fd_i->cb = NULL;
    fd_i->event_mask = 0;
    fd_i->events_to_fake = 0;
  }
}

void EpollServer::ModifyCallback(int fd, int event_mask) {
  ModifyFD(fd, ~0, event_mask);
}

void EpollServer::StopRead(int fd) {
  ModifyFD(fd, EPOLLIN, 0);
}

void EpollServer::StartRead(int fd) {
  ModifyFD(fd, 0, EPOLLIN);
}

void EpollServer::StopWrite(int fd) {
  ModifyFD(fd, EPOLLOUT, 0);
}

void EpollServer::StartWrite(int fd) {
  ModifyFD(fd, 0, EPOLLOUT);
}

void EpollServer::HandleEvent(int fd, int event_mask) {
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordEpollEvent(fd, event_mask);
#endif
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (fd_i == cb_map_.end() || fd_i->cb == NULL) {
    // Ignore the event.
    // This could occur if epoll() returns a set of events, and
    // while processing event A (earlier) we removed the callback
    // for event B (and are now processing event B).
    return;
  }
  fd_i->events_asserted = event_mask;
  CBAndEventMask* cb_and_mask = const_cast<CBAndEventMask*>(&*fd_i);
  AddToReadyList(cb_and_mask);
}

class TrueFalseGuard {
 public:
  explicit TrueFalseGuard(bool* guarded_bool) : guarded_bool_(guarded_bool) {
    DCHECK(guarded_bool_ != NULL);
    DCHECK(*guarded_bool_ == false);
    *guarded_bool_ = true;
  }
  ~TrueFalseGuard() {
    *guarded_bool_ = false;
  }
 private:
  bool* guarded_bool_;
};

void EpollServer::WaitForEventsAndExecuteCallbacks() {
  if (in_wait_for_events_and_execute_callbacks_) {
    LOG(DFATAL) <<
      "Attempting to call WaitForEventsAndExecuteCallbacks"
      " when an ancestor to the current function is already"
      " WaitForEventsAndExecuteCallbacks!";
    // The line below is actually tested, but in coverage mode,
    // we never see it.
    return;  // COV_NF_LINE
  }
  TrueFalseGuard recursion_guard(&in_wait_for_events_and_execute_callbacks_);
  if (alarm_map_.empty()) {
    // No alarms, this is business as usual.
    WaitForEventsAndCallHandleEvents(timeout_in_us_,
                                     events_,
                                     events_size_);
    recorded_now_in_us_ = 0;
    return;
  }

  // Store the 'now'. If we recomputed 'now' every iteration
  // down below, then we might never exit that loop: any
  // long-running alarms might install other long-running
  // alarms, etc. By storing it here now, we ensure that
  // a more reasonable amount of work is done here.
  int64 now_in_us = NowInUsec();

  // Get the first timeout from the alarm_map, where it is
  // stored in absolute time.
  int64 next_alarm_time_in_us = alarm_map_.begin()->first;
  VLOG(4) << "next_alarm_time = " << next_alarm_time_in_us
          << " now             = " << now_in_us
          << " timeout_in_us = " << timeout_in_us_;

  int64 wait_time_in_us;
  int64 alarm_timeout_in_us = next_alarm_time_in_us - now_in_us;

  // If the next alarm is sooner than the default timeout, or if there is no
  // timeout (timeout_in_us_ == -1), wake up when the alarm should fire.
  // Otherwise use the default timeout.
  if (alarm_timeout_in_us < timeout_in_us_ || timeout_in_us_ < 0) {
    wait_time_in_us = std::max(alarm_timeout_in_us, static_cast<int64>(0));
  } else {
    wait_time_in_us = timeout_in_us_;
  }

  VLOG(4) << "wait_time_in_us = " << wait_time_in_us;

  // Wait for events.
  WaitForEventsAndCallHandleEvents(wait_time_in_us,
                                   events_,
                                   events_size_);
  CallAndReregisterAlarmEvents();
  recorded_now_in_us_ = 0;
}
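
// A worked example of the wait-time choice in WaitForEventsAndExecuteCallbacks
// above (illustrative numbers only): with timeout_in_us_ == -1 (wait forever)
// and the earliest alarm due in 5000us, alarm_timeout_in_us == 5000, so
// wait_time_in_us == max(5000, 0) == 5000 and epoll wakes up in time to fire
// the alarm. With timeout_in_us_ == 2000 and the same alarm, the default
// timeout is sooner, so wait_time_in_us == 2000 and the alarm fires on a
// later pass.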

void EpollServer::SetFDReady(int fd, int events_to_fake) {
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i && fd_i->cb != NULL) {
    // This const_cast is necessary for LIST_INSERT_HEAD to work. Declaring
    // entry mutable is insufficient because LIST_INSERT_HEAD assigns the
    // forward pointer of the list head to the current cb_and_mask, and the
    // compiler complains that it can't assign a const T* to a T*.
    CBAndEventMask* cb_and_mask = const_cast<CBAndEventMask*>(&*fd_i);
    // Note that there is no clearly correct behavior here when
    // cb_and_mask->events_to_fake != 0 and this function is called.
    // Of the two operations:
    //      cb_and_mask->events_to_fake = events_to_fake
    //      cb_and_mask->events_to_fake |= events_to_fake
    // the first was picked because it discourages users from calling
    // SetFDReady repeatedly to build up the correct event set, as it is more
    // efficient to call SetFDReady once with the correct, final mask.
    cb_and_mask->events_to_fake = events_to_fake;
    AddToReadyList(cb_and_mask);
  }
}
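
// Illustrative only: because SetFDReady() assigns (rather than ORs in) the
// faked mask, a caller that wants to fake both readability and writability
// should pass the combined mask in a single call. The fd below is
// hypothetical.
//
//   eps->SetFDReady(fd, EPOLLIN | EPOLLOUT);  // One call with the final mask.
//   // By contrast, calling SetFDReady(fd, EPOLLIN) and then
//   // SetFDReady(fd, EPOLLOUT) would leave only EPOLLOUT faked.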

void EpollServer::SetFDNotReady(int fd) {
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i) {
    RemoveFromReadyList(*fd_i);
  }
}

bool EpollServer::IsFDReady(int fd) const {
  FDToCBMap::const_iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  return (cb_map_.end() != fd_i &&
          fd_i->cb != NULL &&
          fd_i->entry.le_prev != NULL);
}

void EpollServer::VerifyReadyList() const {
  int count = 0;
  CBAndEventMask* cur = ready_list_.lh_first;
  for (; cur; cur = cur->entry.le_next) {
    ++count;
  }
  for (cur = tmp_list_.lh_first; cur; cur = cur->entry.le_next) {
    ++count;
  }
  CHECK_EQ(ready_list_size_, count) << "Ready list size does not match count";
}

void EpollServer::RegisterAlarm(int64 timeout_time_in_us, AlarmCB* ac) {
  CHECK(ac);
  if (ContainsAlarm(ac)) {
    LOG(FATAL) << "Alarm already exists " << ac;
  }
  VLOG(4) << "Registering alarm at: " << timeout_time_in_us;

  TimeToAlarmCBMap::iterator alarm_iter =
      alarm_map_.insert(std::make_pair(timeout_time_in_us, ac));

  all_alarms_.insert(ac);
  // Pass the iterator to the EpollAlarmCallbackInterface.
  ac->OnRegistration(alarm_iter, this);
}

// Unregister a specific alarm callback: iterator_token must be a
// valid iterator. The caller must ensure the validity of the iterator.
void EpollServer::UnregisterAlarm(const AlarmRegToken& iterator_token) {
  AlarmCB* cb = iterator_token->second;
  alarm_map_.erase(iterator_token);
  all_alarms_.erase(cb);
  cb->OnUnregistration();
}
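
// Illustrative only: registering a one-shot alarm roughly 10ms in the future.
// RegisterAlarm() takes an absolute time in microseconds, so the caller adds
// the delay to "now". MyAlarm and the 10ms delay are hypothetical; EpollAlarm
// (defined at the end of this file) is a convenient base class whose
// OnAlarm() returns 0, i.e. "do not reschedule".
//
//   class MyAlarm : public EpollAlarm {
//    public:
//     virtual int64 OnAlarm() OVERRIDE {
//       EpollAlarm::OnAlarm();  // Marks the alarm as no longer registered.
//       // ... do work ...
//       return 0;               // Return an absolute time to reschedule.
//     }
//   };
//
//   MyAlarm* alarm = new MyAlarm;
//   eps->RegisterAlarm(eps->ApproximateNowInUsec() + 10 * 1000, alarm);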

int EpollServer::NumFDsRegistered() const {
  DCHECK_GE(cb_map_.size(), 1u);
  // Omit the internal FD (read_fd_).
  return cb_map_.size() - 1;
}

void EpollServer::Wake() {
  char data = 'd';  // 'd' is for data.  It's good enough for me.
  int rv = write(write_fd_, &data, 1);
  DCHECK_EQ(rv, 1);
}

int64 EpollServer::NowInUsec() const {
  return base::Time::Now().ToInternalValue();
}

int64 EpollServer::ApproximateNowInUsec() const {
  if (recorded_now_in_us_ != 0) {
    return recorded_now_in_us_;
  }
  return this->NowInUsec();
}

std::string EpollServer::EventMaskToString(int event_mask) {
  std::string s;
  if (event_mask & EPOLLIN) s += "EPOLLIN ";
  if (event_mask & EPOLLPRI) s += "EPOLLPRI ";
  if (event_mask & EPOLLOUT) s += "EPOLLOUT ";
  if (event_mask & EPOLLRDNORM) s += "EPOLLRDNORM ";
  if (event_mask & EPOLLRDBAND) s += "EPOLLRDBAND ";
  if (event_mask & EPOLLWRNORM) s += "EPOLLWRNORM ";
  if (event_mask & EPOLLWRBAND) s += "EPOLLWRBAND ";
  if (event_mask & EPOLLMSG) s += "EPOLLMSG ";
  if (event_mask & EPOLLERR) s += "EPOLLERR ";
  if (event_mask & EPOLLHUP) s += "EPOLLHUP ";
  if (event_mask & EPOLLONESHOT) s += "EPOLLONESHOT ";
  if (event_mask & EPOLLET) s += "EPOLLET ";
  return s;
}

void EpollServer::LogStateOnCrash() {
  LOG(ERROR) << "----------------------Epoll Server---------------------------";
  LOG(ERROR) << "Epoll server " << this << " polling on fd " << epoll_fd_;
  LOG(ERROR) << "timeout_in_us_: " << timeout_in_us_;

  // Log sessions with alarms.
  LOG(ERROR) << alarm_map_.size() << " alarms registered.";
  for (TimeToAlarmCBMap::iterator it = alarm_map_.begin();
       it != alarm_map_.end();
       ++it) {
    const bool skipped =
        alarms_reregistered_and_should_be_skipped_.find(it->second)
        != alarms_reregistered_and_should_be_skipped_.end();
    LOG(ERROR) << "Alarm " << it->second << " registered at time " << it->first
               << " and should be skipped = " << skipped;
  }

  LOG(ERROR) << cb_map_.size() << " fd callbacks registered.";
  for (FDToCBMap::iterator it = cb_map_.begin();
       it != cb_map_.end();
       ++it) {
    LOG(ERROR) << "fd: " << it->fd << " with mask " << it->event_mask
               << " registered with cb: " << it->cb;
  }
  LOG(ERROR) << "----------------------/Epoll Server--------------------------";
}

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

void EpollServer::DelFD(int fd) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, 0, "DelFD");
#endif
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Epoll set removal error for fd " << fd << ": "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
}

////////////////////////////////////////

void EpollServer::AddFD(int fd, int event_mask) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
  ee.events = event_mask | EPOLLERR | EPOLLHUP;
  ee.data.fd = fd;
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, ee.events, "AddFD");
#endif
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Epoll set insertion error for fd " << fd << ": "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
}

////////////////////////////////////////

void EpollServer::ModFD(int fd, int event_mask) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
  ee.events = event_mask | EPOLLERR | EPOLLHUP;
  ee.data.fd = fd;
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, ee.events, "ModFD");
#endif
  VLOG(3) << "modifying fd= " << fd << " "
          << EventMaskToString(ee.events);
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_MOD, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    LOG(FATAL) << "Epoll set modification error for fd " << fd << ": "
               << strerror_r(saved_errno, buf, sizeof(buf));
  }
}

////////////////////////////////////////

void EpollServer::ModifyFD(int fd, int remove_event, int add_event) {
  FDToCBMap::iterator fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() == fd_i) {
    VLOG(2) << "Didn't find the fd " << fd << " in internal structures";
    return;
  }

  if (fd_i->cb != NULL) {
    int& event_mask = fd_i->event_mask;
    VLOG(3) << "fd= " << fd
            << " event_mask before: " << EventMaskToString(event_mask);
    event_mask &= ~remove_event;
    event_mask |= add_event;

    VLOG(3) << " event_mask after: " << EventMaskToString(event_mask);

    ModFD(fd, event_mask);

    fd_i->cb->OnModification(fd, event_mask);
  }
}

void EpollServer::WaitForEventsAndCallHandleEvents(int64 timeout_in_us,
                                                   struct epoll_event events[],
                                                   int events_size) {
  if (timeout_in_us == 0 || ready_list_.lh_first != NULL) {
    // If the ready list is not empty, then don't sleep at all.
    timeout_in_us = 0;
  } else if (timeout_in_us < 0) {
    LOG(INFO) << "Negative epoll timeout: " << timeout_in_us
              << "us; epoll will wait forever for events.";
    // If timeout_in_us is < 0 we are supposed to Wait forever.  This means we
    // should set timeout_in_us to -1000 so we will
    // Wait(-1000/1000) == Wait(-1) == Wait forever.
    timeout_in_us = -1000;
  } else {
    // A timeout was specified and the ready list is empty. Round any
    // sub-millisecond timeout up to 1ms so the division below does not
    // truncate it to a busy-looping zero.
    if (timeout_in_us < 1000) {
      timeout_in_us = 1000;
    }
  }
  const int timeout_in_ms = timeout_in_us / 1000;
  int nfds = epoll_wait_impl(epoll_fd_,
                             events,
                             events_size,
                             timeout_in_ms);
  VLOG(3) << "nfds=" << nfds;

#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordEpollWaitEvent(timeout_in_ms, nfds);
#endif

  // If you're wondering why NowInUsec() is recorded here, the answer is
  // simple: if we did it before the epoll_wait_impl, then the maximum error
  // for the ApproximateNowInUsec() call would be as large as the maximum
  // length of epoll_wait, which can be arbitrarily long. Since this would make
  // ApproximateNowInUsec() worthless, we instead record the time -after- we've
  // done epoll_wait, which guarantees that the maximum error is the amount of
  // time it takes to process all the events generated by epoll_wait.
  recorded_now_in_us_ = NowInUsec();
  if (nfds > 0) {
    for (int i = 0; i < nfds; ++i) {
      int event_mask = events[i].events;
      int fd = events[i].data.fd;
      HandleEvent(fd, event_mask);
    }
  } else if (nfds < 0) {
    // Catch an interrupted syscall and just ignore it and move on.
    if (errno != EINTR && errno != 0) {
      int saved_errno = errno;
      char buf[kErrorBufferSize];
      LOG(FATAL) << "Error " << saved_errno << " in epoll_wait: "
                 << strerror_r(saved_errno, buf, sizeof(buf));
    }
  }

  // Now run through the ready list.
  if (ready_list_.lh_first) {
    CallReadyListCallbacks();
  }
}

void EpollServer::CallReadyListCallbacks() {
  // Check pre-conditions.
  DCHECK(tmp_list_.lh_first == NULL);
  // Swap out the ready_list_ into the tmp_list_ before traversing the list to
  // enable SetFDReady() to just push new items into the ready_list_.
  std::swap(ready_list_.lh_first, tmp_list_.lh_first);
  if (tmp_list_.lh_first) {
    tmp_list_.lh_first->entry.le_prev = &tmp_list_.lh_first;
    EpollEvent event(0, false);
    while (tmp_list_.lh_first != NULL) {
      DCHECK_GT(ready_list_size_, 0);
      CBAndEventMask* cb_and_mask = tmp_list_.lh_first;
      RemoveFromReadyList(*cb_and_mask);

      event.out_ready_mask = 0;
      event.in_events =
        cb_and_mask->events_asserted | cb_and_mask->events_to_fake;
      // TODO(fenix): get rid of the two separate fields in cb_and_mask.
      cb_and_mask->events_asserted = 0;
      cb_and_mask->events_to_fake = 0;
      {
        // OnEvent() may call UnregisterFD(), so we set in_use here. Any
        // UnregisterFD() call will now simply set the cb to NULL instead of
        // invalidating the cb_and_mask object (by deleting the object in the
        // map to which cb_and_mask refers).
        TrueFalseGuard in_use_guard(&(cb_and_mask->in_use));
        cb_and_mask->cb->OnEvent(cb_and_mask->fd, &event);
      }

      // Since OnEvent may have called UnregisterFD, we must check here that
      // the callback is still valid. If it isn't, then UnregisterFD *was*
      // called, and we should now get rid of the object.
      if (cb_and_mask->cb == NULL) {
        cb_map_.erase(*cb_and_mask);
      } else if (event.out_ready_mask != 0) {
        cb_and_mask->events_to_fake = event.out_ready_mask;
        AddToReadyList(cb_and_mask);
      }
    }
  }
  DCHECK(tmp_list_.lh_first == NULL);
}
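
// Illustrative only: a callback that cannot drain all of its pending work in
// one OnEvent() call may set out_ready_mask before returning. As shown above,
// the EpollServer then re-adds it to the ready list, so it runs again on the
// next pass through the event loop (with a zero epoll timeout, since the
// ready list is non-empty) without waiting for epoll to re-report the fd.
// DrainedEverything() is a hypothetical helper.
//
//   virtual void OnEvent(int fd, EpollEvent* event) OVERRIDE {
//     if ((event->in_events & EPOLLIN) && !DrainedEverything(fd)) {
//       event->out_ready_mask = EPOLLIN;  // Stay on the ready list.
//     }
//   }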

void EpollServer::CallAndReregisterAlarmEvents() {
  int64 now_in_us = recorded_now_in_us_;
  DCHECK_NE(0, recorded_now_in_us_);

  TimeToAlarmCBMap::iterator erase_it;

  // Execute the alarms.
  for (TimeToAlarmCBMap::iterator i = alarm_map_.begin();
       i != alarm_map_.end();
      ) {
    if (i->first > now_in_us) {
      break;
    }
    AlarmCB* cb = i->second;
    // Execute OnAlarm() only if we did not register the alarm
    // in this loop itself.
    const bool added_in_this_round =
        alarms_reregistered_and_should_be_skipped_.find(cb)
        != alarms_reregistered_and_should_be_skipped_.end();
    if (added_in_this_round) {
      ++i;
      continue;
    }
    all_alarms_.erase(cb);
    const int64 new_timeout_time_in_us = cb->OnAlarm();

    erase_it = i;
    ++i;
    alarm_map_.erase(erase_it);

    if (new_timeout_time_in_us > 0) {
      // We add to the hash_set only if the new timeout is <= now_in_us.
      // If the timeout is > now_in_us, then there is no danger that this
      // alarm can be re-executed in this loop, and hence we do not need to
      // worry about a recursive loop.
      DVLOG(3) << "Reregistering alarm "
               << " " << cb
               << " " << new_timeout_time_in_us
               << " " << now_in_us;
      if (new_timeout_time_in_us <= now_in_us) {
        alarms_reregistered_and_should_be_skipped_.insert(cb);
      }
      RegisterAlarm(new_timeout_time_in_us, cb);
    }
  }
  alarms_reregistered_and_should_be_skipped_.clear();
}

EpollAlarm::EpollAlarm() : eps_(NULL), registered_(false) {
}

EpollAlarm::~EpollAlarm() {
  UnregisterIfRegistered();
}

int64 EpollAlarm::OnAlarm() {
  registered_ = false;
  return 0;
}

void EpollAlarm::OnRegistration(const EpollServer::AlarmRegToken& token,
                                EpollServer* eps) {
  DCHECK_EQ(false, registered_);

  token_ = token;
  eps_ = eps;
  registered_ = true;
}

void EpollAlarm::OnUnregistration() {
  registered_ = false;
}

void EpollAlarm::OnShutdown(EpollServer* eps) {
  registered_ = false;
  eps_ = NULL;
}

// If the alarm was registered, unregister it.
void EpollAlarm::UnregisterIfRegistered() {
  if (!registered_) {
    return;
  }
  eps_->UnregisterAlarm(token_);
}

}  // namespace net