1/*	$OpenBSD: kqueue.c,v 1.5 2002/07/10 14:41:31 art Exp $	*/
2
3/*
4 * Copyright 2000-2002 Niels Provos <provos@citi.umich.edu>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. The name of the author may not be used to endorse or promote products
16 *    derived from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29#ifdef HAVE_CONFIG_H
30#include "config.h"
31#endif
32
33#define _GNU_SOURCE 1
34
35#include <sys/types.h>
36#ifdef HAVE_SYS_TIME_H
37#include <sys/time.h>
38#else
39#include <sys/_libevent_time.h>
40#endif
41#include <sys/queue.h>
42#include <sys/event.h>
43#include <signal.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
48#include <errno.h>
49#include <assert.h>
50#ifdef HAVE_INTTYPES_H
51#include <inttypes.h>
52#endif
53
/* Some platforms apparently define the udata field of struct kevent as
 * intptr_t, whereas others define it as void*.  There doesn't seem to be an
 * easy way to tell them apart via autoconf, so we need to use OS macros.
 *
 * PTR_TO_UDATA(x) yields the value to store in kevent.udata for pointer x:
 * the pointer cast to intptr_t on the first branch, the pointer itself on
 * the second (OpenBSD, FreeBSD, Darwin/Apple, or no <inttypes.h>). */
#if defined(HAVE_INTTYPES_H) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__darwin__) && !defined(__APPLE__)
#define PTR_TO_UDATA(x)	((intptr_t)(x))
#else
#define PTR_TO_UDATA(x)	(x)
#endif
62
63#include "event.h"
64#include "event-internal.h"
65#include "log.h"
66
/* Backend-private bit kept in ev->ev_flags.  kq_add() sets it once the event
 * has been queued for (or registered with) the kernel; kq_del() does nothing
 * for events that do not carry it. */
#define EVLIST_X_KQINKERNEL	0x1000

/* Initial number of entries in the change and result arrays allocated by
 * kq_init(); kq_insert() doubles the arrays when the change list fills up. */
#define NEVENT		64
70
/* Per-base state of the kqueue backend, allocated by kq_init(). */
struct kqop {
	/* Changes queued by kq_insert(), handed to kevent() by kq_dispatch(). */
	struct kevent *changes;
	/* Number of entries of changes[] currently in use. */
	int nchanges;
	/* Result buffer that kevent() fills in kq_dispatch(). */
	struct kevent *events;
	/* One list of registered events per signal, indexed by signal number. */
	struct event_list evsigevents[NSIG];
	/* Allocated size, in entries, of both changes[] and events[]. */
	int nevents;
	/* Descriptor returned by kqueue(). */
	int kq;
	/* Process that created kq; kq_dealloc() only closes kq in that process. */
	pid_t pid;
};
80
/* Forward declarations of the backend entry points referenced by the kqops
 * table below, plus the internal change-list helper kq_insert(). */
static void *kq_init	(struct event_base *);
static int kq_add	(void *, struct event *);
static int kq_del	(void *, struct event *);
static int kq_dispatch	(struct event_base *, void *, struct timeval *);
static int kq_insert	(struct kqop *, struct kevent *);
static void kq_dealloc (struct event_base *, void *);
87
/* Operations table exported for the kqueue backend.  The initializer is
 * positional: the backend name, then the init, add, del, dispatch and
 * dealloc callbacks, then the "need reinit" flag. */
const struct eventop kqops = {
	"kqueue",
	kq_init,
	kq_add,
	kq_del,
	kq_dispatch,
	kq_dealloc,
	1 /* need reinit */
};
97
/*
 * Create and initialize the kqueue backend state.
 *
 * Returns a heap-allocated struct kqop on success.  Returns NULL when the
 * EVENT_NOKQUEUE environment variable is set, when kqueue() or a memory
 * allocation fails, or when the probe below shows that the platform's
 * kqueue implementation is broken.  On every failure path all memory and
 * the kqueue descriptor acquired so far are released.
 *
 * The base argument is not used.
 */
static void *
kq_init(struct event_base *base)
{
	int i, kq;
	struct kqop *kqueueop;

	/* Disable kqueue when this environment variable is set */
	if (evutil_getenv("EVENT_NOKQUEUE"))
		return (NULL);

	if (!(kqueueop = calloc(1, sizeof(struct kqop))))
		return (NULL);

	/* Initialize the kernel queue */
	if ((kq = kqueue()) == -1) {
		event_warn("kqueue");
		free(kqueueop);
		return (NULL);
	}

	kqueueop->kq = kq;
	kqueueop->pid = getpid();

	/*
	 * Initialize fields.  Both arrays are assigned before the check so
	 * that the shared cleanup path can free them unconditionally.
	 */
	kqueueop->changes = malloc(NEVENT * sizeof(struct kevent));
	kqueueop->events = malloc(NEVENT * sizeof(struct kevent));
	if (kqueueop->changes == NULL || kqueueop->events == NULL)
		goto fail;
	kqueueop->nevents = NEVENT;

	/* we need to keep track of multiple events per signal */
	for (i = 0; i < NSIG; ++i) {
		TAILQ_INIT(&kqueueop->evsigevents[i]);
	}

	/*
	 * Check for Mac OS X kqueue bug.  The entry comes from malloc(), so
	 * clear it first: otherwise fflags, data and udata would be passed
	 * to the kernel uninitialized.
	 */
	memset(&kqueueop->changes[0], 0, sizeof(kqueueop->changes[0]));
	kqueueop->changes[0].ident = -1;
	kqueueop->changes[0].filter = EVFILT_READ;
	kqueueop->changes[0].flags = EV_ADD;
	/*
	 * If kqueue works, then kevent will succeed, and it will
	 * stick an error in events[0].  If kqueue is broken, then
	 * kevent will fail.
	 */
	if (kevent(kq,
		kqueueop->changes, 1, kqueueop->events, NEVENT, NULL) != 1 ||
	    kqueueop->events[0].ident != -1 ||
	    kqueueop->events[0].flags != EV_ERROR) {
		event_warn("%s: detected broken kqueue; not using.", __func__);
		goto fail;
	}

	return (kqueueop);

fail:
	/* free(NULL) is a no-op, so a partially set up state is fine here. */
	free(kqueueop->changes);
	free(kqueueop->events);
	free(kqueueop);
	close(kq);
	return (NULL);
}
165
/*
 * Append a copy of *kev to the pending change list of kqop.
 *
 * When the list is full, both the change array and the result array are
 * doubled first.  Returns 0 on success and -1 if either reallocation
 * fails; in that case nothing is appended and the recorded capacity is
 * left unchanged.
 */
static int
kq_insert(struct kqop *kqop, struct kevent *kev)
{
	struct kevent *slot;

	if (kqop->nchanges == kqop->nevents) {
		const int grown = kqop->nevents * 2;
		struct kevent *bigger;

		bigger = realloc(kqop->changes, grown * sizeof(struct kevent));
		if (bigger == NULL) {
			event_warn("%s: malloc", __func__);
			return (-1);
		}
		kqop->changes = bigger;

		/*
		 * Should this second step fail, the already enlarged change
		 * array needs no undoing: the capacity is not updated, so
		 * the next attempt simply reallocates it again.
		 */
		bigger = realloc(kqop->events, grown * sizeof(struct kevent));
		if (bigger == NULL) {
			event_warn("%s: malloc", __func__);
			return (-1);
		}
		kqop->events = bigger;

		kqop->nevents = grown;
	}

	slot = &kqop->changes[kqop->nchanges];
	memcpy(slot, kev, sizeof(struct kevent));
	kqop->nchanges++;

	event_debug(("%s: fd %d %s%s",
		__func__, (int)kev->ident,
		kev->filter == EVFILT_READ ? "EVFILT_READ" : "EVFILT_WRITE",
		kev->flags == EV_DELETE ? " (del)" : ""));

	return (0);
}
210
/*
 * Signal handler that kq_add() installs through _evsignal_set_handler().
 * It deliberately has an empty body; signal events are picked up in
 * kq_dispatch() from EVFILT_SIGNAL results instead.
 */
static void
kq_sighandler(int sig)
{
	(void)sig;	/* intentionally unused */
}
216
/*
 * Submit the pending changes to the kernel, wait for events and activate
 * the matching libevent events.
 *
 *   base  not used here
 *   arg   the struct kqop created by kq_init()
 *   tv    maximum time to wait, or NULL to pass no timeout to kevent()
 *
 * Returns 0 on success or when kevent() was interrupted (EINTR).  Returns
 * -1 when kevent() fails for another reason, or when a result entry
 * carries an error other than EBADF, EINVAL or ENOENT; errno is then set
 * to that error.
 */
static int
kq_dispatch(struct event_base *base, void *arg, struct timeval *tv)
{
	struct kqop *kqop = arg;
	struct kevent *results = kqop->events;
	struct timespec ts;
	struct timespec *timeout = NULL;
	int idx, nready;

	if (tv != NULL) {
		TIMEVAL_TO_TIMESPEC(tv, &ts);
		timeout = &ts;
	}

	nready = kevent(kqop->kq, kqop->changes, kqop->nchanges,
	    results, kqop->nevents, timeout);
	/* The change list counts as consumed whatever kevent() returned. */
	kqop->nchanges = 0;
	if (nready == -1) {
		if (errno == EINTR)
			return (0);

		event_warn("kevent");
		return (-1);
	}

	event_debug(("%s: kevent reports %d", __func__, nready));

	for (idx = 0; idx < nready; idx++) {
		struct kevent *kev = &results[idx];
		struct event *ev;
		int which;

		if (kev->flags & EV_ERROR) {
			/*
			 * Error messages that can happen, when a delete fails.
			 *   EBADF happens when the file descriptor has been
			 *   closed,
			 *   ENOENT when the file descriptor was closed and
			 *   then reopened.
			 *   EINVAL for some reasons not understood; EINVAL
			 *   should not be returned ever; but FreeBSD does :-\
			 * An error is also indicated when a callback deletes
			 * an event we are still processing.  In that case
			 * the data field is set to ENOENT.
			 */
			if (kev->data != EBADF &&
			    kev->data != EINVAL &&
			    kev->data != ENOENT) {
				errno = kev->data;
				return (-1);
			}
			continue;
		}

		switch (kev->filter) {
		case EVFILT_READ:
			which = EV_READ;
			break;
		case EVFILT_WRITE:
			which = EV_WRITE;
			break;
		case EVFILT_SIGNAL:
			which = EV_SIGNAL;
			break;
		default:
			which = 0;
			break;
		}

		/* Results from any other filter are skipped. */
		if (which == 0)
			continue;

		if (kev->filter != EVFILT_SIGNAL) {
			ev = (struct event *)kev->udata;

			/* Non-persistent events were queued as one-shot. */
			if ((ev->ev_events & EV_PERSIST) == 0)
				ev->ev_flags &= ~EVLIST_X_KQINKERNEL;

			event_active(ev, which, 1);
			continue;
		}

		/*
		 * For signals udata is the per-signal list stored by
		 * kq_add(); every event on it is activated, with the
		 * result's data field as the third argument.
		 */
		{
			struct event_list *waiters =
			    (struct event_list *)kev->udata;

			TAILQ_FOREACH(ev, waiters, ev_signal_next) {
				event_active(ev, which, kev->data);
			}
		}
	}

	return (0);
}
299
300
/*
 * Queue an EV_ADD change for one fd filter of ev on the change list.
 *
 * The change is one-shot unless ev is persistent.  On success the event
 * is marked EVLIST_X_KQINKERNEL and 0 is returned; -1 is returned when
 * kq_insert() fails.
 */
static int
kq_queue_fd_add(struct kqop *kqop, struct event *ev, int filter,
    unsigned int fflags)
{
	struct kevent kev;

	memset(&kev, 0, sizeof(kev));
	kev.ident = ev->ev_fd;
	kev.filter = filter;
	kev.fflags = fflags;
	kev.flags = EV_ADD;
	if ((ev->ev_events & EV_PERSIST) == 0)
		kev.flags |= EV_ONESHOT;
	kev.udata = PTR_TO_UDATA(ev);

	if (kq_insert(kqop, &kev) == -1)
		return (-1);

	ev->ev_flags |= EVLIST_X_KQINKERNEL;
	return (0);
}

/*
 * Register ev with the kqueue backend.
 *
 *   arg  the struct kqop created by kq_init()
 *   ev   the event to add
 *
 * Signal events are appended to the per-signal list; the first event for
 * a signal also registers EVFILT_SIGNAL with the kernel right away and
 * installs kq_sighandler().  Read and write events are only queued on the
 * change list.  Returns 0 on success and -1 on failure.
 */
static int
kq_add(void *arg, struct event *ev)
{
	struct kqop *kqop = arg;
#ifdef NOTE_EOF
	/* Make it behave like select() and poll() */
	const unsigned int read_fflags = NOTE_EOF;
#else
	const unsigned int read_fflags = 0;
#endif

	if (ev->ev_events & EV_SIGNAL) {
		int nsignal = EVENT_SIGNAL(ev);
		struct event_list *waiters;

		assert(nsignal >= 0 && nsignal < NSIG);
		waiters = &kqop->evsigevents[nsignal];

		if (TAILQ_EMPTY(waiters)) {
			struct timespec no_wait = { 0, 0 };
			struct kevent sigkev;

			memset(&sigkev, 0, sizeof(sigkev));
			sigkev.ident = nsignal;
			sigkev.filter = EVFILT_SIGNAL;
			sigkev.flags = EV_ADD;
			sigkev.udata = PTR_TO_UDATA(waiters);

			/* Be ready for the signal if it is sent any
			 * time between now and the next call to
			 * kq_dispatch. */
			if (kevent(kqop->kq, &sigkev, 1, NULL, 0,
				&no_wait) == -1)
				return (-1);

			if (_evsignal_set_handler(ev->ev_base, nsignal,
				kq_sighandler) == -1)
				return (-1);
		}

		TAILQ_INSERT_TAIL(waiters, ev, ev_signal_next);
		ev->ev_flags |= EVLIST_X_KQINKERNEL;
		return (0);
	}

	if ((ev->ev_events & EV_READ) &&
	    kq_queue_fd_add(kqop, ev, EVFILT_READ, read_fflags) == -1)
		return (-1);

	if ((ev->ev_events & EV_WRITE) &&
	    kq_queue_fd_add(kqop, ev, EVFILT_WRITE, 0) == -1)
		return (-1);

	return (0);
}
373
/*
 * Remove ev from the kqueue backend.
 *
 *   arg  the struct kqop created by kq_init()
 *   ev   the event to remove
 *
 * Events not marked EVLIST_X_KQINKERNEL are ignored.  A signal event is
 * unlinked from its per-signal list; when that list becomes empty the
 * EVFILT_SIGNAL registration is deleted from the kernel immediately and
 * the previous signal handler is restored.  For read and write events an
 * EV_DELETE change is queued on the change list.
 *
 * Returns 0 on success and -1 on failure.
 */
static int
kq_del(void *arg, struct event *ev)
{
	struct kqop *kqop = arg;
	struct kevent kev;

	if (!(ev->ev_flags & EVLIST_X_KQINKERNEL))
		return (0);

	if (ev->ev_events & EV_SIGNAL) {
		int nsignal = EVENT_SIGNAL(ev);
		struct timespec timeout = { 0, 0 };

		assert(nsignal >= 0 && nsignal < NSIG);
		TAILQ_REMOVE(&kqop->evsigevents[nsignal], ev, ev_signal_next);

		/*
		 * The event is off the list from here on, so drop the marker
		 * now.  Leaving it set on the error returns below would let a
		 * later kq_del() unlink the same element a second time.
		 */
		ev->ev_flags &= ~EVLIST_X_KQINKERNEL;

		if (TAILQ_EMPTY(&kqop->evsigevents[nsignal])) {
			memset(&kev, 0, sizeof(kev));
			kev.ident = nsignal;
			kev.filter = EVFILT_SIGNAL;
			kev.flags = EV_DELETE;

			/* Because we insert signal events
			 * immediately, we need to delete them
			 * immediately, too */
			if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1)
				return (-1);

			if (_evsignal_restore_handler(ev->ev_base,
				nsignal) == -1)
				return (-1);
		}

		return (0);
	}

	if (ev->ev_events & EV_READ) {
		memset(&kev, 0, sizeof(kev));
		kev.ident = ev->ev_fd;
		kev.filter = EVFILT_READ;
		kev.flags = EV_DELETE;

		if (kq_insert(kqop, &kev) == -1)
			return (-1);

		ev->ev_flags &= ~EVLIST_X_KQINKERNEL;
	}

	if (ev->ev_events & EV_WRITE) {
		memset(&kev, 0, sizeof(kev));
		kev.ident = ev->ev_fd;
		kev.filter = EVFILT_WRITE;
		kev.flags = EV_DELETE;

		if (kq_insert(kqop, &kev) == -1)
			return (-1);

		ev->ev_flags &= ~EVLIST_X_KQINKERNEL;
	}

	return (0);
}
436
/*
 * Release everything owned by the backend state created in kq_init().
 *
 *   base  not used here
 *   arg   the struct kqop to destroy
 *
 * The kqueue descriptor is closed only when the calling process is the
 * one recorded in the state at initialization time.
 */
static void
kq_dealloc(struct event_base *base, void *arg)
{
	struct kqop *state = arg;

	/* free(NULL) is a no-op, so no guards are needed. */
	free(state->changes);
	free(state->events);

	if (state->kq >= 0 && state->pid == getpid())
		close(state->kq);

	/* Wipe the structure before handing it back to the allocator. */
	memset(state, 0, sizeof(*state));
	free(state);
}
451