1/*
2 * Copyright 2000-2003 Niels Provos <provos@citi.umich.edu>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 *    derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27#ifdef HAVE_CONFIG_H
28#include "config.h"
29#endif
30
31#include <stdint.h>
32#include <sys/types.h>
33#include <sys/resource.h>
34#ifdef HAVE_SYS_TIME_H
35#include <sys/time.h>
36#else
37#include <sys/_libevent_time.h>
38#endif
39#include <sys/queue.h>
40#include <sys/epoll.h>
41#include <signal.h>
42#include <stdio.h>
43#include <stdlib.h>
44#include <string.h>
45#include <unistd.h>
46#include <errno.h>
47#ifdef HAVE_FCNTL_H
48#include <fcntl.h>
49#endif
50
51#include "event.h"
52#include "event-internal.h"
53#include "evsignal.h"
54#include "log.h"
55
/*
 * Due to limitations in the epoll interface, we need to keep track of
 * all file descriptors ourselves.  One of these slots exists per file
 * descriptor.
 */
struct evepoll {
	struct event *evread;	/* event registered for EV_READ, or NULL */
	struct event *evwrite;	/* event registered for EV_WRITE, or NULL */
};
63
/* State of the epoll backend; allocated and filled in by epoll_init(). */
struct epollop {
	struct evepoll *fds;		/* slot table, indexed by file descriptor */
	int nfds;			/* number of slots allocated in fds */
	struct epoll_event *events;	/* result buffer passed to epoll_wait() */
	int nevents;			/* number of entries allocated in events */
	int epfd;			/* descriptor returned by epoll_create() */
};
71
/* Forward declarations of the backend entry points listed in epollops. */
static void *epoll_init(struct event_base *base);
static int epoll_add(void *arg, struct event *ev);
static int epoll_del(void *arg, struct event *ev);
static int epoll_dispatch(struct event_base *base, void *arg,
    struct timeval *tv);
static void epoll_dealloc(struct event_base *base, void *arg);
77
78const struct eventop epollops = {
79	"epoll",
80	epoll_init,
81	epoll_add,
82	epoll_del,
83	epoll_dispatch,
84	epoll_dealloc,
85	1 /* need reinit */
86};
87
/*
 * FD_CLOSEONEXEC(x): mark descriptor x close-on-exec, logging a warning
 * through event_warn() if fcntl() fails.  When HAVE_SETFD is not defined
 * the macro is a no-op that does not evaluate its argument.
 *
 * The argument is evaluated exactly once, and the flag is spelled
 * FD_CLOEXEC instead of a bare 1.
 */
#ifdef HAVE_SETFD
#define FD_CLOSEONEXEC(x) do { \
	int fd_closeonexec_ = (x); \
	if (fcntl(fd_closeonexec_, F_SETFD, FD_CLOEXEC) == -1) \
		event_warn("fcntl(%d, F_SETFD)", fd_closeonexec_); \
} while (0)
#else
#define FD_CLOSEONEXEC(x) ((void)0)
#endif
96
/*
 * Linux kernels, at least up to 2.6.24.4, cannot handle epoll timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be as
 * big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the largest
 * number of msec we can support here is 2147482.  That is rounded down
 * by 47 seconds, which gives 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35 * 60 * 1000)
104
/* Sizing constants for the per-fd slot table and the epoll_wait() buffer. */
enum {
	INITIAL_NFILES = 32,	/* slots allocated for fds by epoll_init() */
	INITIAL_NEVENTS = 32,	/* entries allocated for events by epoll_init() */
	MAX_NEVENTS = 4096	/* the events buffer stops doubling at this size */
};
108
109static void *
110epoll_init(struct event_base *base)
111{
112	int epfd;
113	struct epollop *epollop;
114
115	/* Disable epollueue when this environment variable is set */
116	if (evutil_getenv("EVENT_NOEPOLL"))
117		return (NULL);
118
119	/* Initalize the kernel queue */
120	if ((epfd = epoll_create(32000)) == -1) {
121		if (errno != ENOSYS)
122			event_warn("epoll_create");
123		return (NULL);
124	}
125
126	FD_CLOSEONEXEC(epfd);
127
128	if (!(epollop = calloc(1, sizeof(struct epollop))))
129		return (NULL);
130
131	epollop->epfd = epfd;
132
133	/* Initalize fields */
134	epollop->events = malloc(INITIAL_NEVENTS * sizeof(struct epoll_event));
135	if (epollop->events == NULL) {
136		free(epollop);
137		return (NULL);
138	}
139	epollop->nevents = INITIAL_NEVENTS;
140
141	epollop->fds = calloc(INITIAL_NFILES, sizeof(struct evepoll));
142	if (epollop->fds == NULL) {
143		free(epollop->events);
144		free(epollop);
145		return (NULL);
146	}
147	epollop->nfds = INITIAL_NFILES;
148
149	evsignal_init(base);
150
151	return (epollop);
152}
153
154static int
155epoll_recalc(struct event_base *base, void *arg, int max)
156{
157	struct epollop *epollop = arg;
158
159	if (max >= epollop->nfds) {
160		struct evepoll *fds;
161		int nfds;
162
163		nfds = epollop->nfds;
164		while (nfds <= max)
165			nfds <<= 1;
166
167		fds = realloc(epollop->fds, nfds * sizeof(struct evepoll));
168		if (fds == NULL) {
169			event_warn("realloc");
170			return (-1);
171		}
172		epollop->fds = fds;
173		memset(fds + epollop->nfds, 0,
174		    (nfds - epollop->nfds) * sizeof(struct evepoll));
175		epollop->nfds = nfds;
176	}
177
178	return (0);
179}
180
181static int
182epoll_dispatch(struct event_base *base, void *arg, struct timeval *tv)
183{
184	struct epollop *epollop = arg;
185	struct epoll_event *events = epollop->events;
186	struct evepoll *evep;
187	int i, res, timeout = -1;
188
189	if (tv != NULL)
190		timeout = tv->tv_sec * 1000 + (tv->tv_usec + 999) / 1000;
191
192	if (timeout > MAX_EPOLL_TIMEOUT_MSEC) {
193		/* Linux kernels can wait forever if the timeout is too big;
194		 * see comment on MAX_EPOLL_TIMEOUT_MSEC. */
195		timeout = MAX_EPOLL_TIMEOUT_MSEC;
196	}
197
198	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
199
200	if (res == -1) {
201		if (errno != EINTR) {
202			event_warn("epoll_wait");
203			return (-1);
204		}
205
206		evsignal_process(base);
207		return (0);
208	} else if (base->sig.evsignal_caught) {
209		evsignal_process(base);
210	}
211
212	event_debug(("%s: epoll_wait reports %d", __func__, res));
213
214	for (i = 0; i < res; i++) {
215		int what = events[i].events;
216		struct event *evread = NULL, *evwrite = NULL;
217		int fd = events[i].data.fd;
218
219		if (fd < 0 || fd >= epollop->nfds)
220			continue;
221		evep = &epollop->fds[fd];
222
223		if (what & (EPOLLHUP|EPOLLERR)) {
224			evread = evep->evread;
225			evwrite = evep->evwrite;
226		} else {
227			if (what & EPOLLIN) {
228				evread = evep->evread;
229			}
230
231			if (what & EPOLLOUT) {
232				evwrite = evep->evwrite;
233			}
234		}
235
236		if (!(evread||evwrite))
237			continue;
238
239		if (evread != NULL)
240			event_active(evread, EV_READ, 1);
241		if (evwrite != NULL)
242			event_active(evwrite, EV_WRITE, 1);
243	}
244
245	if (res == epollop->nevents && epollop->nevents < MAX_NEVENTS) {
246		/* We used all of the event space this time.  We should
247		   be ready for more events next time. */
248		int new_nevents = epollop->nevents * 2;
249		struct epoll_event *new_events;
250
251		new_events = realloc(epollop->events,
252		    new_nevents * sizeof(struct epoll_event));
253		if (new_events) {
254			epollop->events = new_events;
255			epollop->nevents = new_nevents;
256		}
257	}
258
259	return (0);
260}
261
262
263static int
264epoll_add(void *arg, struct event *ev)
265{
266	struct epollop *epollop = arg;
267	struct epoll_event epev = {0, {0}};
268	struct evepoll *evep;
269	int fd, op, events;
270
271	if (ev->ev_events & EV_SIGNAL)
272		return (evsignal_add(ev));
273
274	fd = ev->ev_fd;
275	if (fd >= epollop->nfds) {
276		/* Extent the file descriptor array as necessary */
277		if (epoll_recalc(ev->ev_base, epollop, fd) == -1)
278			return (-1);
279	}
280	evep = &epollop->fds[fd];
281	op = EPOLL_CTL_ADD;
282	events = 0;
283	if (evep->evread != NULL) {
284		events |= EPOLLIN;
285		op = EPOLL_CTL_MOD;
286	}
287	if (evep->evwrite != NULL) {
288		events |= EPOLLOUT;
289		op = EPOLL_CTL_MOD;
290	}
291
292	if (ev->ev_events & EV_READ)
293		events |= EPOLLIN;
294	if (ev->ev_events & EV_WRITE)
295		events |= EPOLLOUT;
296
297	epev.data.fd = fd;
298	epev.events = events;
299	if (epoll_ctl(epollop->epfd, op, ev->ev_fd, &epev) == -1)
300			return (-1);
301
302	/* Update events responsible */
303	if (ev->ev_events & EV_READ)
304		evep->evread = ev;
305	if (ev->ev_events & EV_WRITE)
306		evep->evwrite = ev;
307
308	return (0);
309}
310
311static int
312epoll_del(void *arg, struct event *ev)
313{
314	struct epollop *epollop = arg;
315	struct epoll_event epev = {0, {0}};
316	struct evepoll *evep;
317	int fd, events, op;
318	int needwritedelete = 1, needreaddelete = 1;
319
320	if (ev->ev_events & EV_SIGNAL)
321		return (evsignal_del(ev));
322
323	fd = ev->ev_fd;
324	if (fd >= epollop->nfds)
325		return (0);
326	evep = &epollop->fds[fd];
327
328	op = EPOLL_CTL_DEL;
329	events = 0;
330
331	if (ev->ev_events & EV_READ)
332		events |= EPOLLIN;
333	if (ev->ev_events & EV_WRITE)
334		events |= EPOLLOUT;
335
336	if ((events & (EPOLLIN|EPOLLOUT)) != (EPOLLIN|EPOLLOUT)) {
337		if ((events & EPOLLIN) && evep->evwrite != NULL) {
338			needwritedelete = 0;
339			events = EPOLLOUT;
340			op = EPOLL_CTL_MOD;
341		} else if ((events & EPOLLOUT) && evep->evread != NULL) {
342			needreaddelete = 0;
343			events = EPOLLIN;
344			op = EPOLL_CTL_MOD;
345		}
346	}
347
348	epev.events = events;
349	epev.data.fd = fd;
350
351	if (needreaddelete)
352		evep->evread = NULL;
353	if (needwritedelete)
354		evep->evwrite = NULL;
355
356	if (epoll_ctl(epollop->epfd, op, fd, &epev) == -1)
357		return (-1);
358
359	return (0);
360}
361
362static void
363epoll_dealloc(struct event_base *base, void *arg)
364{
365	struct epollop *epollop = arg;
366
367	evsignal_dealloc(base);
368	if (epollop->fds)
369		free(epollop->fds);
370	if (epollop->events)
371		free(epollop->events);
372	if (epollop->epfd >= 0)
373		close(epollop->epfd);
374
375	memset(epollop, 0, sizeof(struct epollop));
376	free(epollop);
377}
378