epoll_server.h revision ddb351dbec246cf1fab5ec20d2d5520909041de1
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
7#pragma once
9#include <fcntl.h>
10#include <sys/queue.h>
11#include <ext/hash_map>  // it is annoying that gcc does this. oh well.
12#include <ext/hash_set>
13#include <map>
14#include <string>
15#include <utility>
16#include <set>
17#include <vector>
22// causes code to exist which didn't before.
23// This code tracks each event generated by the epollserver,
24// as well as providing a per-fd-registered summary of
25// events. Note that enabling this code vastly slows
26// down operations, and uses substantially more
27// memory. For these reasons, it should only be enabled when doing
28// developer debugging at his/her workstation.
30// A structure called 'EventRecorder' will exist when
31// the macro is defined. See the EventRecorder class interface
32// within the EpollServer class for more details.
34#include <iostream>
35#include "base/logging.h"
38#include "base/basictypes.h"
39#include "base/memory/scoped_ptr.h"
40#include <sys/epoll.h>
42namespace net {
44class EpollServer;
45class EpollAlarmCallbackInterface;
46class ReadPipeCallback;
48struct EpollEvent {
49  EpollEvent(int events, bool is_epoll_wait)
50      : in_events(events),
51        out_ready_mask(0) {
52  }
54  int in_events;            // incoming events
55  int out_ready_mask;       // the new event mask for ready list (0 means don't
56                            // get on the ready list). This field is always
57                            // initialized to 0 when the event is passed to
58                            // OnEvent.
61// Callbacks which go into EpollServers are expected to derive from this class.
62class EpollCallbackInterface {
63 public:
64  // Summary:
65  //   Called when the callback is registered into a EpollServer.
66  // Args:
67  //   eps - the poll server into which this callback was registered
68  //   fd - the file descriptor which was registered
69  //   event_mask - the event mask (composed of EPOLLIN, EPOLLOUT, etc)
70  //                which was registered (and will initially be used
71  //                in the epoll() calls)
72  virtual void OnRegistration(EpollServer* eps, int fd, int event_mask) = 0;
74  // Summary:
75  //   Called when the event_mask is modified (for a file-descriptor)
76  // Args:
77  //   fd - the file descriptor which was registered
78  //   event_mask - the event mask (composed of EPOLLIN, EPOLLOUT, etc)
79  //                which is now current (and will be used
80  //                in subsequent epoll() calls)
81  virtual void OnModification(int fd, int event_mask) = 0;
83  // Summary:
84  //   Called whenever an event occurs on the file-descriptor.
85  //   This is where the bulk of processing is expected to occur.
86  // Args:
87  //   fd - the file descriptor which was registered
88  //   event - a struct that contains the incoming event mask (in_events,
89  //           composed of EPOLLIN, EPOLLOUT, etc) and an output parameter
90  //           (out_ready_mask) through which OnEvent informs the EpollServer
91  //           whether, and with which event mask, to put this fd on the
92  //           ready list.
93  virtual void OnEvent(int fd, EpollEvent* event) = 0;
95  // Summary:
96  //   Called when the file-descriptor is unregistered from the poll-server.
97  // Args:
98  //   fd - the file descriptor which was registered and, as of this call, is
99  //        now unregistered.
100  //   replaced - If true, this callback is being replaced by another, otherwise
101  //              it is simply being removed.
102  virtual void OnUnregistration(int fd, bool replaced) = 0;
104  // Summary:
105  //   Called when the epoll server is shutting down.  This is different from
106  //   OnUnregistration because the subclass may want to clean up memory.
107  //   This is called in lieu of OnUnregistration.
108  // Args:
109  //  fd - the file descriptor which was registered.
110  virtual void OnShutdown(EpollServer* eps, int fd) = 0;
112  virtual ~EpollCallbackInterface() {}
114 protected:
115  EpollCallbackInterface() {}
121class EpollServer {
122 public:
123  typedef EpollAlarmCallbackInterface AlarmCB;
124  typedef EpollCallbackInterface CB;
126  typedef std::multimap<int64, AlarmCB*> TimeToAlarmCBMap;
127  typedef TimeToAlarmCBMap::iterator AlarmRegToken;
129  // Summary:
130  //   Constructor:
131  //    By default, we don't wait any amount of time for events, and
132  //    we suggest to the epoll-system that we're going to use on-the-order
133  //    of 1024 FDs.
134  EpollServer();
136  ////////////////////////////////////////
138  // Destructor
139  virtual ~EpollServer();
141  ////////////////////////////////////////
143  // Summary
144  //   Register a callback to be called whenever an event contained
145  //   in the set of events included in event_mask occurs on the
146  //   file-descriptor 'fd'
147  //
148  //   Note that only one callback is allowed to be registered for
149  //   any specific file-descriptor.
150  //
151  //   If a callback is registered for a file-descriptor which has already
152  //   been registered, then the previous callback is unregistered with
153  //   the 'replaced' flag set to true. I.e. the previous callback's
154  //   OnUnregistration() function is called like so:
155  //      OnUnregistration(fd, true);
156  //
157  //  The epoll server does NOT take on ownership of the callback: the callback
158  //  creator is responsible for managing that memory.
159  //
160  // Args:
161  //   fd - a valid file-descriptor
162  //   cb - an instance of a subclass of EpollCallbackInterface
163  //   event_mask - a combination of (EPOLLOUT, EPOLLIN.. etc) indicating
164  //                the events for which the callback would like to be
165  //                called.
166  virtual void RegisterFD(int fd, CB* cb, int event_mask);
168  ////////////////////////////////////////
170  // Summary:
171  //   A shortcut for RegisterFD which sets things up such that the
172  //   callback is called when 'fd' is available for writing.
173  // Args:
174  //   fd - a valid file-descriptor
175  //   cb - an instance of a subclass of EpollCallbackInterface
176  virtual void RegisterFDForWrite(int fd, CB* cb);
178  ////////////////////////////////////////
180  // Summary:
181  //   A shortcut for RegisterFD which sets things up such that the
182  //   callback is called when 'fd' is available for reading or writing.
183  // Args:
184  //   fd - a valid file-descriptor
185  //   cb - an instance of a subclass of EpollCallbackInterface
186  virtual void RegisterFDForReadWrite(int fd, CB* cb);
188  ////////////////////////////////////////
190  // Summary:
191  //   A shortcut for RegisterFD which sets things up such that the
192  //   callback is called when 'fd' is available for reading.
193  // Args:
194  //   fd - a valid file-descriptor
195  //   cb - an instance of a subclass of EpollCallbackInterface
196  virtual void RegisterFDForRead(int fd, CB* cb);
198  ////////////////////////////////////////
200  // Summary:
201  //   Removes the FD and the associated callback from the pollserver.
202  //   If the callback is registered with other FDs, they will continue
203  //   to be processed using the callback without modification.
204  //   If the file-descriptor specified is not registered in the
205  //   epoll_server, then nothing happens as a result of this call.
206  // Args:
207  //   fd - the file-descriptor which should no-longer be monitored.
208  virtual void UnregisterFD(int fd);
210  ////////////////////////////////////////
212  // Summary:
213  //   Modifies the event mask for the file-descriptor, replacing
214  //   the old event_mask with the new one specified here.
215  //   If the file-descriptor specified is not registered in the
216  //   epoll_server, then nothing happens as a result of this call.
217  // Args:
218  //   fd - the fd whose event mask should be modified.
219  //   event_mask - the new event mask.
220  virtual void ModifyCallback(int fd, int event_mask);
222  ////////////////////////////////////////
224  // Summary:
225  //   Modifies the event mask for the file-descriptor such that we
226  //   no longer request events when 'fd' is readable.
227  //   If the file-descriptor specified is not registered in the
228  //   epoll_server, then nothing happens as a result of this call.
229  // Args:
230  //   fd - the fd whose event mask should be modified.
231  virtual void StopRead(int fd);
233  ////////////////////////////////////////
235  // Summary:
236  //   Modifies the event mask for the file-descriptor such that we
237  //   request events when 'fd' is readable.
238  //   If the file-descriptor specified is not registered in the
239  //   epoll_server, then nothing happens as a result of this call.
240  // Args:
241  //   fd - the fd whose event mask should be modified.
242  virtual void StartRead(int fd);
244  ////////////////////////////////////////
246  // Summary:
247  //   Modifies the event mask for the file-descriptor such that we
248  //   no longer request events when 'fd' is writable.
249  //   If the file-descriptor specified is not registered in the
250  //   epoll_server, then nothing happens as a result of this call.
251  // Args:
252  //   fd - the fd whose event mask should be modified.
253  virtual void StopWrite(int fd);
255  ////////////////////////////////////////
257  // Summary:
258  //   Modifies the event mask for the file-descriptor such that we
259  //   request events when 'fd' is writable.
260  //   If the file-descriptor specified is not registered in the
261  //   epoll_server, then nothing happens as a result of this call.
262  // Args:
263  //   fd - the fd whose event mask should be modified.
264  virtual void StartWrite(int fd);
266  ////////////////////////////////////////
268  // Summary:
269  //   Looks up the callback associated with the file-descriptor 'fd'.
270  //   If a callback is associated with this file-descriptor, then
271  //   its OnEvent() method is called with the file-descriptor 'fd',
272  //   and event_mask 'event_mask'
273  //
274  //   If no callback is registered for this file-descriptor, nothing
275  //   will happen as a result of this call.
276  //
277  //   This function is used internally by the EpollServer, but is
278  //   available publicly so that events might be 'faked'. Calling
279  //   this function with an fd and event_mask is equivalent (as far
280  //   as the callback is concerned) to having a real event generated
281  //   by epoll (except, of course, that read(), etc won't necessarily
282  //   be able to read anything)
283  // Args:
284  //   fd - the file-descriptor on which an event has occurred.
285  //   event_mask - a bitmask representing the events which have occurred
286  //                on/for this fd. This bitmask is composed of
287  //                POLLIN, POLLOUT, etc.
288  //
289  void HandleEvent(int fd, int event_mask);
291  // Summary:
292  //   Call this when you want the pollserver to
293  //   wait for events and execute the callbacks associated with
294  //   the file-descriptors on which those events have occurred.
295  //   Depending on the value of timeout_in_us_, this may or may
296  //   not return immediately. Please reference the set_timeout_in_us()
297  //   function for the specific behaviour.
298  virtual void WaitForEventsAndExecuteCallbacks();
300  // Summary:
301  //   When an fd is registered to use edge trigger notification, the ready
302  //   list can be used to simulate level trigger semantics. Edge trigger
303  //   registration doesn't send an initial event, and only rising edge (going
304  //   from blocked to unblocked) events are sent. A callback can put itself on
305  //   the ready list by calling SetFDReady() after calling RegisterFD(). The
306  //   OnEvent method of all callbacks associated with the fds on the ready
307  //   list will be called immediately after processing the events returned by
308  //   epoll_wait(). The fd is removed from the ready list before the
309  //   callback's OnEvent() method is invoked. To stay on the ready list, the
310  //   OnEvent() (or some function in that call chain) must call SetFDReady
311  //   again. When a fd is unregistered using UnregisterFD(), the fd is
312  //   automatically removed from the ready list.
313  //
314  //   When the callback for a edge triggered fd hits the falling edge (about
315  //   to block, either because of it got an EAGAIN, or had a short read/write
316  //   operation), it should remove itself from the ready list using
317  //   SetFDNotReady() (since OnEvent cannot distinguish between invocation
318  //   from the ready list vs from a normal epoll event). All four ready list
319  //   methods are safe to be called  within the context of the callbacks.
320  //
321  //   Since the ready list invokes EpollCallbackInterface::OnEvent, only fds
322  //   that are registered with the EpollServer will be put on the ready list.
323  //   SetFDReady() and SetFDNotReady() will do nothing if the EpollServer
324  //   doesn't know about the fd passed in.
325  //
326  //   Since the ready list cannot reliably determine proper set of events
327  //   which should be sent to the callback, SetFDReady() requests the caller
328  //   to provide the ready list with the event mask, which will be used later
329  //   when OnEvent() is invoked by the ready list. Hence, the event_mask
330  //   passed to SetFDReady() does not affect the actual epoll registration of
331  //   the fd with the kernel. If a fd is already put on the ready list, and
332  //   SetFDReady() is called again for that fd with a different event_mask,
333  //   the event_mask will be updated.
334  virtual void SetFDReady(int fd, int events_to_fake);
336  virtual void SetFDNotReady(int fd);
338  // Summary:
339  //   IsFDReady(), ReadyListSize(), and VerifyReadyList are intended as
340  //   debugging tools and for writing unit tests.
341  //   IsFDReady() returns whether a fd is in the ready list.
342  //   ReadyListSize() returns the number of fds on the ready list.
343  //   VerifyReadyList() checks the consistency of internal data structure. It
344  //   will CHECK if it finds an error.
345  virtual bool IsFDReady(int fd) const;
347  size_t ReadyListSize() const { return ready_list_size_; }
349  void VerifyReadyList() const;
351  ////////////////////////////////////////
353  // Summary:
354  //   Registers an alarm 'ac' to go off at time 'timeout_time_in_us'.
355  //   If the callback returns a positive number from its OnAlarm() function,
356  //   then the callback will be re-registered at that time, else the alarm
357  //   owner is responsible for freeing up memory.
358  //
359  //   Important: A given AlarmCB* can not be registered again if it is already
360  //    registered. If a user wants to register a callback again it should first
361  //    unregister the previous callback before calling RegisterAlarm again.
362  // Args:
363  //   timeout_time_in_us - the absolute time at which the alarm should go off
364  //   ac - the alarm which will be called.
365  virtual void RegisterAlarm(int64 timeout_time_in_us, AlarmCB* ac);
367  // Summary:
368  //   Registers an alarm 'ac' to go off at time: (ApproximateNowInUs() +
369  //   delta_in_us). While this is somewhat less accurate (see the description
370  //   for ApproximateNowInUs() to see how 'approximate'), the error is never
371  //   worse than the amount of time it takes to process all events in one
372  //   WaitForEvents.  As with 'RegisterAlarm()', if the callback returns a
373  //   positive number from its OnAlarm() function, then the callback will be
374  //   re-registered at that time, else the alarm owner is responsible for
375  //   freeing up memory.
376  //   Note that this function is purely a convienence. The
377  //   same thing may be accomplished by using RegisterAlarm with
378  //   ApproximateNowInUs() directly.
379  //
380  //   Important: A give AlarmCB* can not be registered again if it is already
381  //    registered. If a user wants to register a callback again it should first
382  //    unregister the previous callback before calling RegisterAlarm again.
383  // Args:
384  //   delta_in_us - the delta in microseconds from the ApproximateTimeInUs() at
385  //                 which point the alarm should go off.
386  //   ac - the alarm which will be called.
387  void RegisterAlarmApproximateDelta(int64 delta_in_us, AlarmCB* ac) {
388    RegisterAlarm(ApproximateNowInUsec() + delta_in_us, ac);
389  }
391  ////////////////////////////////////////
393  // Summary:
394  //   Unregister  the alarm referred to by iterator_token; Callers should
395  //   be warned that a token may have become already invalid when OnAlarm()
396  //   is called, was unregistered, or OnShutdown was called on that alarm.
397  // Args:
398  //    iterator_token - iterator to the alarm callback to unregister.
399  virtual void UnregisterAlarm(
400      const EpollServer::AlarmRegToken& iterator_token);
402  ////////////////////////////////////////
404  // Summary:
405  //   returns the number of file-descriptors registered in this EpollServer.
406  // Returns:
407  //   number of FDs registered (discounting the internal pipe used for Wake)
408  virtual int NumFDsRegistered() const;
410  // Summary:
411  //   Force the epoll server to wake up (by writing to an internal pipe).
412  virtual void Wake();
414  // Summary:
415  //   Wrapper around WallTimer's NowInUsec.  We do this so that we can test
416  //   EpollServer without using the system clock (and can avoid the flakiness
417  //   that would ensue)
418  // Returns:
419  //   the current time as number of microseconds since the Unix epoch.
420  virtual int64 NowInUsec() const;
422  // Summary:
423  //   Since calling NowInUsec() many thousands of times per
424  //   WaitForEventsAndExecuteCallbacks function call is, to say the least,
425  //   inefficient, we allow users to use an approximate time instead. The
426  //   time returned from this function is as accurate as NowInUsec() when
427  //   WaitForEventsAndExecuteCallbacks is not an ancestor of the caller's
428  //   callstack.
429  //   However, when WaitForEventsAndExecuteCallbacks -is- an ancestor, then
430  //   this function returns the time at which the
431  //   WaitForEventsAndExecuteCallbacks function started to process events or
432  //   alarms.
433  //
434  //   Essentially, this function makes available a fast and mostly accurate
435  //   mechanism for getting the time for any function handling an event or
436  //   alarm. When functions which are not handling callbacks or alarms call
437  //   this function, they get the slow and "absolutely" accurate time.
438  //
439  //   Users should be encouraged to use this function.
440  // Returns:
441  //   the "approximate" current time as number of microseconds since the Unix
442  //   epoch.
443  virtual int64 ApproximateNowInUsec() const;
445  static std::string EventMaskToString(int event_mask);
447  // Summary:
448  //   Logs the state of the epoll server with LOG(ERROR).
449  void LogStateOnCrash();
451  // Summary:
452  //   Set the timeout to the value specified.
453  //   If the timeout is set to a negative number,
454  //      WaitForEventsAndExecuteCallbacks() will only return when an event has
455  //      occured
456  //   If the timeout is set to zero,
457  //      WaitForEventsAndExecuteCallbacks() will return immediately
458  //   If the timeout is set to a positive number,
459  //      WaitForEventsAndExecuteCallbacks() will return when an event has
460  //      occured, or when timeout_in_us microseconds has elapsed, whichever
461  //      is first.
462  //  Args:
463  //    timeout_in_us - value specified depending on behaviour desired.
464  //                    See above.
465  void set_timeout_in_us(int64 timeout_in_us) {
466    timeout_in_us_ = timeout_in_us;
467  }
469  ////////////////////////////////////////
471  // Summary:
472  //   Accessor for the current value of timeout_in_us.
473  int timeout_in_us() const { return timeout_in_us_; }
475  // Summary:
476  // Returns true when the EpollServer() is being destroyed.
477  bool in_shutdown() const { return in_shutdown_; }
479  bool ContainsAlarm(EpollAlarmCallbackInterface* alarm) const {
480    return all_alarms_.find(alarm) != all_alarms_.end();
481  }
483  // Summary:
484  //   A function for implementing the ready list. It invokes OnEvent for each
485  //   of the fd in the ready list, and takes care of adding them back to the
486  //   ready list if the callback requests it (by checking that out_ready_mask
487  //   is non-zero).
488  void CallReadyListCallbacks();
490  // Granularity at which time moves when considering what alarms are on.
491  // See function: DoRoundingOnNow() for exact usage.
492  static const int kMinimumEffectiveAlarmQuantum;
493 protected:
495  virtual int GetFlags(int fd);
496  inline int SetFlags(int fd, int flags) {
497    return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
498  }
500  virtual void SetNonblocking(int fd);
502  // This exists here so that we can override this function in unittests
503  // in order to make effective mock EpollServer objects.
504  virtual int epoll_wait_impl(int epfd,
505                              struct epoll_event* events,
506                              int max_events,
507                              int timeout_in_ms);
509  // this struct is used internally, and is never used by anything external
510  // to this class. Some of its members are declared mutable to get around the
511  // restriction imposed by hash_set. Since hash_set knows nothing about the
512  // objects it stores, it has to assume that every bit of the object is used
513  // in the hash function and equal_to comparison. Thus hash_set::iterator is a
514  // const iterator. In this case, the only thing that must stay constant is
515  // fd. Everything else are just along for the ride and changing them doesn't
516  // compromise the hash_set integrity.
517  struct CBAndEventMask {
518    CBAndEventMask()
519        : cb(NULL),
520          fd(-1),
521          event_mask(0),
522          events_asserted(0),
523          events_to_fake(0),
524          in_use(false) {
525      entry.le_next = NULL;
526      entry.le_prev = NULL;
527    }
529    CBAndEventMask(EpollCallbackInterface* cb,
530                   int event_mask,
531                   int fd)
532        : cb(cb), fd(fd), event_mask(event_mask), events_asserted(0),
533          events_to_fake(0), in_use(false) {
534      entry.le_next = NULL;
535      entry.le_prev = NULL;
536    }
538    // Required operator for hash_set. Normally operator== should be a free
539    // standing function. However, since CBAndEventMask is a protected type and
540    // it will never be a base class, it makes no difference.
541    bool operator==(const CBAndEventMask& cb_and_mask) const {
542      return fd == cb_and_mask.fd;
543    }
544    // A callback. If the fd is unregistered inside the callchain of OnEvent,
545    // the cb will be set to NULL.
546    mutable EpollCallbackInterface* cb;
548    mutable LIST_ENTRY(CBAndEventMask) entry;
549    // file descriptor registered with the epoll server.
550    int fd;
551    // the current event_mask registered for this callback.
552    mutable int event_mask;
553    // the event_mask that was returned by epoll
554    mutable int events_asserted;
555    // the event_mask for the ready list to use to call OnEvent.
556    mutable int events_to_fake;
557    // toggle around calls to OnEvent to tell UnregisterFD to not erase the
558    // iterator because HandleEvent is using it.
559    mutable bool in_use;
560  };
562  // Custom hash function to be used by hash_set.
563  struct CBAndEventMaskHash {
564    size_t operator()(const CBAndEventMask& cb_and_eventmask) const {
565      return static_cast<size_t>(cb_and_eventmask.fd);
566    }
567  };
569  typedef __gnu_cxx::hash_set<CBAndEventMask, CBAndEventMaskHash> FDToCBMap;
571  // the following four functions are OS-specific, and are likely
572  // to be changed in a subclass if the poll/select method is changed
573  // from epoll.
575  // Summary:
576  //   Deletes a file-descriptor from the set of FDs that should be
577  //   monitored with epoll.
578  //   Note that this only deals with modifying data relating -directly-
579  //   with the epoll call-- it does not modify any data within the
580  //   epoll_server.
581  // Args:
582  //   fd - the file descriptor to-be-removed from the monitoring set
583  virtual void DelFD(int fd) const;
585  ////////////////////////////////////////
587  // Summary:
588  //   Adds a file-descriptor to the set of FDs that should be
589  //   monitored with epoll.
590  //   Note that this only deals with modifying data relating -directly-
591  //   with the epoll call.
592  // Args:
593  //   fd - the file descriptor to-be-added to the monitoring set
594  //   event_mask - the event mask (consisting of EPOLLIN, EPOLLOUT, etc
595  //                 OR'd together) which will be associated with this
596  //                 FD initially.
597  virtual void AddFD(int fd, int event_mask) const;
599  ////////////////////////////////////////
601  // Summary:
602  //   Modifies a file-descriptor in the set of FDs that should be
603  //   monitored with epoll.
604  //   Note that this only deals with modifying data relating -directly-
605  //   with the epoll call.
606  // Args:
607  //   fd - the file descriptor to-be-modified in the monitoring set
608  //   event_mask - the event mask (consisting of EPOLLIN, EPOLLOUT, etc
609  //                 OR'd together) which will be associated with this
610  //                 FD after this call.
611  virtual void ModFD(int fd, int event_mask) const;
613  ////////////////////////////////////////
615  // Summary:
616  //   Modifies the event mask associated with an FD in the set of
617  //   data needed by epoll.
618  //   Events are removed before they are added, thus, if ~0 is put
619  //   in 'remove_event', whatever is put in 'add_event' will be
620  //   the new event mask.
621  //   If the file-descriptor specified is not registered in the
622  //   epoll_server, then nothing happens as a result of this call.
623  // Args:
624  //   fd - the file descriptor whose event mask is to be modified
625  //   remove_event - the events which are to be removed from the current
626  //                  event_mask
627  //   add_event - the events which are to be added to the current event_mask
628  //
629  //
630  virtual void ModifyFD(int fd, int remove_event, int add_event);
632  ////////////////////////////////////////
634  // Summary:
635  //   Waits for events, and calls HandleEvent() for each
636  //   fd, event pair discovered to possibly have an event.
637  //   Note that a callback (B) may get a spurious event if
638  //   another callback (A) has closed a file-descriptor N, and
639  //   the callback (B) has a newly opened file-descriptor, which
640  //   also happens to be N.
641  virtual void WaitForEventsAndCallHandleEvents(int64 timeout_in_us,
642                                                struct epoll_event events[],
643                                                int events_size);
647  // Summary:
648  //   An internal function for implementing the ready list. It adds a fd's
649  //   CBAndEventMask to the ready list. If the fd is already on the ready
650  //   list, it is a no-op.
651  void AddToReadyList(CBAndEventMask* cb_and_mask);
653  // Summary:
654  //   An internal function for implementing the ready list. It removes a fd's
655  //   CBAndEventMask from the ready list. If the fd is not on the ready list,
656  //   it is a no-op.
657  void RemoveFromReadyList(const CBAndEventMask& cb_and_mask);
659  // Summary:
660  // Calls any pending alarms that should go off and reregisters them if they
661  // were recurring.
662  virtual void CallAndReregisterAlarmEvents();
664  // The file-descriptor created for epolling
665  int epoll_fd_;
667  // The mapping of file-descriptor to CBAndEventMasks
668  FDToCBMap cb_map_;
670  // Custom hash function to be used by hash_set.
671  struct AlarmCBHash {
672    size_t operator()(AlarmCB*const& p) const {
673      return reinterpret_cast<size_t>(p);
674    }
675  };
678  // TODO(sushantj): Having this hash_set is avoidable. We currently have it
679  // only so that we can enforce stringent checks that a caller can not register
680  // the same alarm twice. One option is to have an implementation in which
681  // this hash_set is used only in the debug mode.
682  typedef __gnu_cxx::hash_set<AlarmCB*, AlarmCBHash> AlarmCBMap;
683  AlarmCBMap all_alarms_;
685  TimeToAlarmCBMap alarm_map_;
687  // The amount of time in microseconds that we'll wait before returning
688  // from the WaitForEventsAndExecuteCallbacks() function.
689  // If this is positive, wait that many microseconds.
690  // If this is negative, wait forever, or for the first event that occurs
691  // If this is zero, never wait for an event.
692  int64 timeout_in_us_;
694  // This is nonzero only after the invocation of epoll_wait_impl within
695  // WaitForEventsAndCallHandleEvents and before the function
696  // WaitForEventsAndExecuteCallbacks returns.  At all other times, this is
697  // zero. This enables us to have relatively accurate time returned from the
698  // ApproximateNowInUsec() function. See that function for more details.
699  int64 recorded_now_in_us_;
701  // This is used to implement CallAndReregisterAlarmEvents. This stores
702  // all alarms that were reregistered because OnAlarm() returned a
703  // value > 0 and the time at which they should be executed is less than
704  // the current time.  By storing such alarms in this map we ensure
705  // that while calling CallAndReregisterAlarmEvents we do not call
706  // OnAlarm on any alarm in this set. This ensures that we do not
707  // go in an infinite loop.
708  AlarmCBMap alarms_reregistered_and_should_be_skipped_;
710  LIST_HEAD(ReadyList, CBAndEventMask) ready_list_;
711  LIST_HEAD(TmpList, CBAndEventMask) tmp_list_;
712  int ready_list_size_;
713  // TODO(alyssar): make this into something that scales up.
714  static const int events_size_ = 256;
715  struct epoll_event events_[256];
717  // This controls the granularity for alarms
718  // See function CallAndReregisterAlarmEvents()
719  // TODO(sushantj): Add test for this.
720  int64 DoRoundingOnNow(int64 now_in_us) const;
723  struct EventRecorder {
724   public:
725    EventRecorder() : num_records_(0), record_threshold_(10000) {}
727    ~EventRecorder() {
728      Clear();
729    }
731    // When a number of events equals the record threshold,
732    // the collected data summary for all FDs will be written
733    // to LOG(INFO). Note that this does not include the
734    // individual events (if you'reinterested in those, you'll
735    // have to get at them programmatically).
736    // After any such flushing to LOG(INFO) all events will
737    // be cleared.
738    // Note that the definition of an 'event' is a bit 'hazy',
739    // as it includes the 'Unregistration' event, and perhaps
740    // others.
741    void set_record_threshold(int64 new_threshold) {
742      record_threshold_ = new_threshold;
743    }
745    void Clear() {
746      for (int i = 0; i < debug_events_.size(); ++i) {
747        delete debug_events_[i];
748      }
749      debug_events_.clear();
750      unregistered_fds_.clear();
751      event_counts_.clear();
752    }
754    void MaybeRecordAndClear() {
755      ++num_records_;
756      if ((num_records_ > record_threshold_) &&
757          (record_threshold_ > 0)) {
758        LOG(INFO) << "\n" << *this;
759        num_records_ = 0;
760        Clear();
761      }
762    }
764    void RecordFDMaskEvent(int fd, int mask, const char* function) {
765      FDMaskOutput* fdmo = new FDMaskOutput(fd, mask, function);
766      debug_events_.push_back(fdmo);
767      MaybeRecordAndClear();
768    }
770    void RecordEpollWaitEvent(int timeout_in_ms,
771                              int num_events_generated) {
772      EpollWaitOutput* ewo = new EpollWaitOutput(timeout_in_ms,
773                                                  num_events_generated);
774      debug_events_.push_back(ewo);
775      MaybeRecordAndClear();
776    }
778    void RecordEpollEvent(int fd, int event_mask) {
779      Events& events_for_fd = event_counts_[fd];
780      events_for_fd.AssignFromMask(event_mask);
781      MaybeRecordAndClear();
782    }
784    friend ostream& operator<<(ostream& os, const EventRecorder& er) {
785      for (int i = 0; i < er.unregistered_fds_.size(); ++i) {
786        os << "fd: " << er.unregistered_fds_[i] << "\n";
787        os << er.unregistered_fds_[i];
788      }
789      for (EventCountsMap::const_iterator i = er.event_counts_.begin();
790           i != er.event_counts_.end();
791           ++i) {
792        os << "fd: " << i->first << "\n";
793        os << i->second;
794      }
795      for (int i = 0; i < er.debug_events_.size(); ++i) {
796        os << *(er.debug_events_[i]) << "\n";
797      }
798      return os;
799    }
801    void RecordUnregistration(int fd) {
802      EventCountsMap::iterator i = event_counts_.find(fd);
803      if (i != event_counts_.end()) {
804        unregistered_fds_.push_back(i->second);
805        event_counts_.erase(i);
806      }
807      MaybeRecordAndClear();
808    }
810   protected:
811    class DebugOutput {
812     public:
813      friend ostream& operator<<(ostream& os, const DebugOutput& debug_output) {
814        debug_output.OutputToStream(os);
815        return os;
816      }
817      virtual void OutputToStream(ostream* os) const = 0;
818      virtual ~DebugOutput() {}
819    };
821    class FDMaskOutput : public DebugOutput {
822     public:
823      FDMaskOutput(int fd, int mask, const char* function) :
824          fd_(fd), mask_(mask), function_(function) {}
825      virtual void OutputToStream(ostream* os) const {
826        (*os) << "func: " << function_
827              << "\tfd: " << fd_;
828        if (mask_ != 0) {
829           (*os) << "\tmask: " << EventMaskToString(mask_);
830        }
831      }
832      int fd_;
833      int mask_;
834      const char* function_;
835    };
837    class EpollWaitOutput : public DebugOutput {
838     public:
839      EpollWaitOutput(int timeout_in_ms,
840                      int num_events_generated) :
841          timeout_in_ms_(timeout_in_ms),
842          num_events_generated_(num_events_generated) {}
843      virtual void OutputToStream(ostream* os) const {
844        (*os) << "timeout_in_ms: " << timeout_in_ms_
845              << "\tnum_events_generated: " << num_events_generated_;
846      }
847     protected:
848      int timeout_in_ms_;
849      int num_events_generated_;
850    };
852    struct Events {
853      Events() :
854          epoll_in(0),
855          epoll_pri(0),
856          epoll_out(0),
857          epoll_rdnorm(0),
858          epoll_rdband(0),
859          epoll_wrnorm(0),
860          epoll_wrband(0),
861          epoll_msg(0),
862          epoll_err(0),
863          epoll_hup(0),
864          epoll_oneshot(0),
865          epoll_et(0) {}
867      void AssignFromMask(int event_mask) {
868        if (event_mask & EPOLLIN) ++epoll_in;
869        if (event_mask & EPOLLPRI) ++epoll_pri;
870        if (event_mask & EPOLLOUT) ++epoll_out;
871        if (event_mask & EPOLLRDNORM) ++epoll_rdnorm;
872        if (event_mask & EPOLLRDBAND) ++epoll_rdband;
873        if (event_mask & EPOLLWRNORM) ++epoll_wrnorm;
874        if (event_mask & EPOLLWRBAND) ++epoll_wrband;
875        if (event_mask & EPOLLMSG) ++epoll_msg;
876        if (event_mask & EPOLLERR) ++epoll_err;
877        if (event_mask & EPOLLHUP) ++epoll_hup;
878        if (event_mask & EPOLLONESHOT) ++epoll_oneshot;
879        if (event_mask & EPOLLET) ++epoll_et;
880      };
882      friend ostream& operator<<(ostream& os, const Events& ev) {
883        if (ev.epoll_in) {
884          os << "\t      EPOLLIN: " << ev.epoll_in << "\n";
885        }
886        if (ev.epoll_pri) {
887          os << "\t     EPOLLPRI: " << ev.epoll_pri << "\n";
888        }
889        if (ev.epoll_out) {
890          os << "\t     EPOLLOUT: " << ev.epoll_out << "\n";
891        }
892        if (ev.epoll_rdnorm) {
893          os << "\t  EPOLLRDNORM: " << ev.epoll_rdnorm << "\n";
894        }
895        if (ev.epoll_rdband) {
896          os << "\t  EPOLLRDBAND: " << ev.epoll_rdband << "\n";
897        }
898        if (ev.epoll_wrnorm) {
899          os << "\t  EPOLLWRNORM: " << ev.epoll_wrnorm << "\n";
900        }
901        if (ev.epoll_wrband) {
902          os << "\t  EPOLLWRBAND: " << ev.epoll_wrband << "\n";
903        }
904        if (ev.epoll_msg) {
905          os << "\t     EPOLLMSG: " << ev.epoll_msg << "\n";
906        }
907        if (ev.epoll_err) {
908          os << "\t     EPOLLERR: " << ev.epoll_err << "\n";
909        }
910        if (ev.epoll_hup) {
911          os << "\t     EPOLLHUP: " << ev.epoll_hup << "\n";
912        }
913        if (ev.epoll_oneshot) {
914          os << "\t EPOLLONESHOT: " << ev.epoll_oneshot << "\n";
915        }
916        if (ev.epoll_et) {
917          os << "\t      EPOLLET: " << ev.epoll_et << "\n";
918        }
919        return os;
920      }
922      unsigned int epoll_in;
923      unsigned int epoll_pri;
924      unsigned int epoll_out;
925      unsigned int epoll_rdnorm;
926      unsigned int epoll_rdband;
927      unsigned int epoll_wrnorm;
928      unsigned int epoll_wrband;
929      unsigned int epoll_msg;
930      unsigned int epoll_err;
931      unsigned int epoll_hup;
932      unsigned int epoll_oneshot;
933      unsigned int epoll_et;
934    };
936    std::vector<DebugOutput*> debug_events_;
937    std::vector<Events> unregistered_fds_;
938    typedef __gnu_cxx::hash_map<int, Events> EventCountsMap;
939    EventCountsMap event_counts_;
940    int64 num_records_;
941    int64 record_threshold_;
942  };
944  void ClearEventRecords() {
945    event_recorder_.Clear();
946  }
947  void WriteEventRecords(ostream* os) const {
948    (*os) << event_recorder_;
949  }
951  mutable EventRecorder event_recorder_;
955 private:
956  // Helper functions used in the destructor.
957  void CleanupFDToCBMap();
958  void CleanupTimeToAlarmCBMap();
960  // The callback registered to the fds below.  As the purpose of their
961  // registration is to wake the epoll server it just clears the pipe and
962  // returns.
963  scoped_ptr<ReadPipeCallback> wake_cb_;
965  // A pipe owned by the epoll server.  The server will be registered to listen
966  // on read_fd_ and can be woken by Wake() which writes to write_fd_.
967  int read_fd_;
968  int write_fd_;
970  // This boolean is checked to see if it is false at the top of the
971  // WaitForEventsAndExecuteCallbacks function. If not, then it either returns
972  // without doing work, and logs to ERROR, or aborts the program (in
973  // DEBUG mode). If so, then it sets the bool to true, does work, and
974  // sets it back to false when done. This catches unwanted recursion.
975  bool in_wait_for_events_and_execute_callbacks_;
// True when the EpollServer is being destroyed.
978  bool in_shutdown_;
983class EpollAlarmCallbackInterface {
984 public:
985  // Summary:
986  //   Called when an alarm times out. Invalidates an AlarmRegToken.
987  //   WARNING: If a token was saved to refer to an alarm callback, OnAlarm must
988  //   delete it, as the reference is no longer valid.
989  // Returns:
990  //   the unix time (in microseconds) at which this alarm should be signaled
991  //   again, or 0 if the alarm should be removed.
992  virtual int64 OnAlarm() = 0;
994  // Summary:
995  //   Called when the an alarm is registered. Invalidates an AlarmRegToken.
996  // Args:
997  //   token: the iterator to the the alarm registered in the alarm map.
998  //   WARNING: this token becomes invalid when the alarm fires, is
999  //   unregistered, or OnShutdown is called on that alarm.
1000  //   eps: the epoll server the alarm is registered with.
1001  virtual void OnRegistration(const EpollServer::AlarmRegToken& token,
1002                              EpollServer* eps) = 0;
1004  // Summary:
1005  //   Called when the an alarm is unregistered.
1006  //   WARNING: It is not valid to unregister a callback and then use the token
1007  //   that was saved to refer to the callback.
1008  virtual void OnUnregistration() = 0;
1010  // Summary:
1011  //   Called when the epoll server is shutting down.
1012  //   Invalidates the AlarmRegToken that was given when this alarm was
1013  //   registered.
1014  virtual void OnShutdown(EpollServer* eps) = 0;
1016  virtual ~EpollAlarmCallbackInterface() {}
1018 protected:
1019  EpollAlarmCallbackInterface() {}
1022// A simple alarm which unregisters itself on destruction.
1025// Any classes overriding these functions must either call the implementation
1026// of the parent class, or is must otherwise make sure that the 'registered_'
1027// boolean and the token, 'token_', are updated appropriately.
1028class EpollAlarm : public EpollAlarmCallbackInterface {
1029 public:
1030  EpollAlarm();
1032  virtual ~EpollAlarm();
1034  // Marks the alarm as unregistered and returns 0.  The return value may be
1035  // safely ignored by subclasses.
1036  virtual int64 OnAlarm();
1038  // Marks the alarm as registered, and stores the token.
1039  virtual void OnRegistration(const EpollServer::AlarmRegToken& token,
1040                              EpollServer* eps);
1042  // Marks the alarm as unregistered.
1043  virtual void OnUnregistration();
1045  // Marks the alarm as unregistered.
1046  virtual void OnShutdown(EpollServer* eps);
1048  // If the alarm was registered, unregister it.
1049  void UnregisterIfRegistered();
1051  bool registered() const { return registered_; }
1053  const EpollServer* eps() const { return eps_; }
1055 private:
1056  EpollServer::AlarmRegToken token_;
1057  EpollServer* eps_;
1058  bool registered_;
1061}  // namespace net