1 /*
2  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3  * Copyright 2007-2012 Niels Provos, Nick Mathewson
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. The name of the author may not be used to endorse or promote products
14  *    derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 #include "event2/event-config.h"
28 
29 #include <stdint.h>
30 #include <sys/types.h>
31 #include <sys/resource.h>
32 #ifdef _EVENT_HAVE_SYS_TIME_H
33 #include <sys/time.h>
34 #endif
35 #include <sys/queue.h>
36 #include <sys/epoll.h>
37 #include <signal.h>
38 #include <limits.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <unistd.h>
43 #include <errno.h>
44 #ifdef _EVENT_HAVE_FCNTL_H
45 #include <fcntl.h>
46 #endif
47 
48 #include "event-internal.h"
49 #include "evsignal-internal.h"
50 #include "event2/thread.h"
51 #include "evthread-internal.h"
52 #include "log-internal.h"
53 #include "evmap-internal.h"
54 #include "changelist-internal.h"
55 
/* Private state that the epoll backend keeps for one event_base. */
struct epollop {
	/* Array handed to epoll_wait() to receive ready events. */
	struct epoll_event *events;
	/* Number of entries currently allocated in 'events'. */
	int nevents;
	/* File descriptor returned by epoll_create(). */
	int epfd;
};
61 
62 static void *epoll_init(struct event_base *);
63 static int epoll_dispatch(struct event_base *, struct timeval *);
64 static void epoll_dealloc(struct event_base *);
65 
66 static const struct eventop epollops_changelist = {
67 	"epoll (with changelist)",
68 	epoll_init,
69 	event_changelist_add,
70 	event_changelist_del,
71 	epoll_dispatch,
72 	epoll_dealloc,
73 	1, /* need reinit */
74 	EV_FEATURE_ET|EV_FEATURE_O1,
75 	EVENT_CHANGELIST_FDINFO_SIZE
76 };
77 
78 
79 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
80     short old, short events, void *p);
81 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
82     short old, short events, void *p);
83 
84 const struct eventop epollops = {
85 	"epoll",
86 	epoll_init,
87 	epoll_nochangelist_add,
88 	epoll_nochangelist_del,
89 	epoll_dispatch,
90 	epoll_dealloc,
91 	1, /* need reinit */
92 	EV_FEATURE_ET|EV_FEATURE_O1,
93 	0
94 };
95 
/* Size of the epoll_event array allocated by epoll_init(). */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the array once it reaches this size. */
#define MAX_NEVENT 4096

/* Linux kernels at least up to 2.6.24.4 mishandle epoll timeouts larger
 * than (LONG_MAX - 999ULL)/HZ.  HZ can be as large as 1000 in the wild
 * and LONG_MAX can be as small as (1<<31)-1, which makes 2147482 msec the
 * biggest timeout that is safe everywhere.  We round that down by about
 * 47 seconds to get an even 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35 * 60 * 1000)
106 
107 static void *
epoll_init(struct event_base * base)108 epoll_init(struct event_base *base)
109 {
110 	int epfd;
111 	struct epollop *epollop;
112 
113 	/* Initialize the kernel queue.  (The size field is ignored since
114 	 * 2.6.8.) */
115 	if ((epfd = epoll_create(32000)) == -1) {
116 		if (errno != ENOSYS)
117 			event_warn("epoll_create");
118 		return (NULL);
119 	}
120 
121 	evutil_make_socket_closeonexec(epfd);
122 
123 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
124 		close(epfd);
125 		return (NULL);
126 	}
127 
128 	epollop->epfd = epfd;
129 
130 	/* Initialize fields */
131 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
132 	if (epollop->events == NULL) {
133 		mm_free(epollop);
134 		close(epfd);
135 		return (NULL);
136 	}
137 	epollop->nevents = INITIAL_NEVENT;
138 
139 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
140 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
141 		evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
142 		base->evsel = &epollops_changelist;
143 
144 	evsig_init(base);
145 
146 	return (epollop);
147 }
148 
149 static const char *
change_to_string(int change)150 change_to_string(int change)
151 {
152 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
153 	if (change == EV_CHANGE_ADD) {
154 		return "add";
155 	} else if (change == EV_CHANGE_DEL) {
156 		return "del";
157 	} else if (change == 0) {
158 		return "none";
159 	} else {
160 		return "???";
161 	}
162 }
163 
/*
 * Return a short name for an epoll_ctl() operation code, for use in log
 * messages.  Unrecognized codes yield "???".
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
172 
173 static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)174 epoll_apply_one_change(struct event_base *base,
175     struct epollop *epollop,
176     const struct event_change *ch)
177 {
178 	struct epoll_event epev;
179 	int op, events = 0;
180 
181 	if (1) {
182 		/* The logic here is a little tricky.  If we had no events set
183 		   on the fd before, we need to set op="ADD" and set
184 		   events=the events we want to add.  If we had any events set
185 		   on the fd before, and we want any events to remain on the
186 		   fd, we need to say op="MOD" and set events=the events we
187 		   want to remain.  But if we want to delete the last event,
188 		   we say op="DEL" and set events=the remaining events.  What
189 		   fun!
190 		*/
191 
192 		/* TODO: Turn this into a switch or a table lookup. */
193 
194 		if ((ch->read_change & EV_CHANGE_ADD) ||
195 		    (ch->write_change & EV_CHANGE_ADD)) {
196 			/* If we are adding anything at all, we'll want to do
197 			 * either an ADD or a MOD. */
198 			events = 0;
199 			op = EPOLL_CTL_ADD;
200 			if (ch->read_change & EV_CHANGE_ADD) {
201 				events |= EPOLLIN;
202 			} else if (ch->read_change & EV_CHANGE_DEL) {
203 				;
204 			} else if (ch->old_events & EV_READ) {
205 				events |= EPOLLIN;
206 			}
207 			if (ch->write_change & EV_CHANGE_ADD) {
208 				events |= EPOLLOUT;
209 			} else if (ch->write_change & EV_CHANGE_DEL) {
210 				;
211 			} else if (ch->old_events & EV_WRITE) {
212 				events |= EPOLLOUT;
213 			}
214 			if ((ch->read_change|ch->write_change) & EV_ET)
215 				events |= EPOLLET;
216 
217 			if (ch->old_events) {
218 				/* If MOD fails, we retry as an ADD, and if
219 				 * ADD fails we will retry as a MOD.  So the
220 				 * only hard part here is to guess which one
221 				 * will work.  As a heuristic, we'll try
222 				 * MOD first if we think there were old
223 				 * events and ADD if we think there were none.
224 				 *
225 				 * We can be wrong about the MOD if the file
226 				 * has in fact been closed and re-opened.
227 				 *
228 				 * We can be wrong about the ADD if the
229 				 * the fd has been re-created with a dup()
230 				 * of the same file that it was before.
231 				 */
232 				op = EPOLL_CTL_MOD;
233 			}
234 		} else if ((ch->read_change & EV_CHANGE_DEL) ||
235 		    (ch->write_change & EV_CHANGE_DEL)) {
236 			/* If we're deleting anything, we'll want to do a MOD
237 			 * or a DEL. */
238 			op = EPOLL_CTL_DEL;
239 
240 			if (ch->read_change & EV_CHANGE_DEL) {
241 				if (ch->write_change & EV_CHANGE_DEL) {
242 					events = EPOLLIN|EPOLLOUT;
243 				} else if (ch->old_events & EV_WRITE) {
244 					events = EPOLLOUT;
245 					op = EPOLL_CTL_MOD;
246 				} else {
247 					events = EPOLLIN;
248 				}
249 			} else if (ch->write_change & EV_CHANGE_DEL) {
250 				if (ch->old_events & EV_READ) {
251 					events = EPOLLIN;
252 					op = EPOLL_CTL_MOD;
253 				} else {
254 					events = EPOLLOUT;
255 				}
256 			}
257 		}
258 
259 		if (!events)
260 			return 0;
261 
262 		memset(&epev, 0, sizeof(epev));
263 		epev.data.fd = ch->fd;
264 		epev.events = events;
265 		if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
266 			if (op == EPOLL_CTL_MOD && errno == ENOENT) {
267 				/* If a MOD operation fails with ENOENT, the
268 				 * fd was probably closed and re-opened.  We
269 				 * should retry the operation as an ADD.
270 				 */
271 				if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
272 					event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
273 					    (int)epev.events, ch->fd);
274 					return -1;
275 				} else {
276 					event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
277 						(int)epev.events,
278 						ch->fd));
279 				}
280 			} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
281 				/* If an ADD operation fails with EEXIST,
282 				 * either the operation was redundant (as with a
283 				 * precautionary add), or we ran into a fun
284 				 * kernel bug where using dup*() to duplicate the
285 				 * same file into the same fd gives you the same epitem
286 				 * rather than a fresh one.  For the second case,
287 				 * we must retry with MOD. */
288 				if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
289 					event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
290 					    (int)epev.events, ch->fd);
291 					return -1;
292 				} else {
293 					event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
294 						(int)epev.events,
295 						ch->fd));
296 				}
297 			} else if (op == EPOLL_CTL_DEL &&
298 			    (errno == ENOENT || errno == EBADF ||
299 				errno == EPERM)) {
300 				/* If a delete fails with one of these errors,
301 				 * that's fine too: we closed the fd before we
302 				 * got around to calling epoll_dispatch. */
303 				event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
304 					(int)epev.events,
305 					ch->fd,
306 					strerror(errno)));
307 			} else {
308 				event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
309 				    epoll_op_to_string(op),
310 				    (int)epev.events,
311 				    ch->fd,
312 				    ch->old_events,
313 				    ch->read_change,
314 				    change_to_string(ch->read_change),
315 				    ch->write_change,
316 				    change_to_string(ch->write_change));
317 				return -1;
318 			}
319 		} else {
320 			event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
321 				epoll_op_to_string(op),
322 				(int)epev.events,
323 				(int)ch->fd,
324 				ch->old_events,
325 				ch->read_change,
326 				ch->write_change));
327 		}
328 	}
329 	return 0;
330 }
331 
332 static int
epoll_apply_changes(struct event_base * base)333 epoll_apply_changes(struct event_base *base)
334 {
335 	struct event_changelist *changelist = &base->changelist;
336 	struct epollop *epollop = base->evbase;
337 	struct event_change *ch;
338 
339 	int r = 0;
340 	int i;
341 
342 	for (i = 0; i < changelist->n_changes; ++i) {
343 		ch = &changelist->changes[i];
344 		if (epoll_apply_one_change(base, epollop, ch) < 0)
345 			r = -1;
346 	}
347 
348 	return (r);
349 }
350 
351 static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)352 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
353     short old, short events, void *p)
354 {
355 	struct event_change ch;
356 	ch.fd = fd;
357 	ch.old_events = old;
358 	ch.read_change = ch.write_change = 0;
359 	if (events & EV_WRITE)
360 		ch.write_change = EV_CHANGE_ADD |
361 		    (events & EV_ET);
362 	if (events & EV_READ)
363 		ch.read_change = EV_CHANGE_ADD |
364 		    (events & EV_ET);
365 
366 	return epoll_apply_one_change(base, base->evbase, &ch);
367 }
368 
369 static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)370 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
371     short old, short events, void *p)
372 {
373 	struct event_change ch;
374 	ch.fd = fd;
375 	ch.old_events = old;
376 	ch.read_change = ch.write_change = 0;
377 	if (events & EV_WRITE)
378 		ch.write_change = EV_CHANGE_DEL;
379 	if (events & EV_READ)
380 		ch.read_change = EV_CHANGE_DEL;
381 
382 	return epoll_apply_one_change(base, base->evbase, &ch);
383 }
384 
385 static int
epoll_dispatch(struct event_base * base,struct timeval * tv)386 epoll_dispatch(struct event_base *base, struct timeval *tv)
387 {
388 	struct epollop *epollop = base->evbase;
389 	struct epoll_event *events = epollop->events;
390 	int i, res;
391 	long timeout = -1;
392 
393 	if (tv != NULL) {
394 		timeout = evutil_tv_to_msec(tv);
395 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
396 			/* Linux kernels can wait forever if the timeout is
397 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
398 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
399 		}
400 	}
401 
402 	epoll_apply_changes(base);
403 	event_changelist_remove_all(&base->changelist, base);
404 
405 	EVBASE_RELEASE_LOCK(base, th_base_lock);
406 
407 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
408 
409 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
410 
411 	if (res == -1) {
412 		if (errno != EINTR) {
413 			event_warn("epoll_wait");
414 			return (-1);
415 		}
416 
417 		return (0);
418 	}
419 
420 	event_debug(("%s: epoll_wait reports %d", __func__, res));
421 	EVUTIL_ASSERT(res <= epollop->nevents);
422 
423 	for (i = 0; i < res; i++) {
424 		int what = events[i].events;
425 		short ev = 0;
426 
427 		if (what & (EPOLLHUP|EPOLLERR)) {
428 			ev = EV_READ | EV_WRITE;
429 		} else {
430 			if (what & EPOLLIN)
431 				ev |= EV_READ;
432 			if (what & EPOLLOUT)
433 				ev |= EV_WRITE;
434 		}
435 
436 		if (!ev)
437 			continue;
438 
439 		evmap_io_active(base, events[i].data.fd, ev | EV_ET);
440 	}
441 
442 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
443 		/* We used all of the event space this time.  We should
444 		   be ready for more events next time. */
445 		int new_nevents = epollop->nevents * 2;
446 		struct epoll_event *new_events;
447 
448 		new_events = mm_realloc(epollop->events,
449 		    new_nevents * sizeof(struct epoll_event));
450 		if (new_events) {
451 			epollop->events = new_events;
452 			epollop->nevents = new_nevents;
453 		}
454 	}
455 
456 	return (0);
457 }
458 
459 
460 static void
epoll_dealloc(struct event_base * base)461 epoll_dealloc(struct event_base *base)
462 {
463 	struct epollop *epollop = base->evbase;
464 
465 	evsig_dealloc(base);
466 	if (epollop->events)
467 		mm_free(epollop->events);
468 	if (epollop->epfd >= 0)
469 		close(epollop->epfd);
470 
471 	memset(epollop, 0, sizeof(struct epollop));
472 	mm_free(epollop);
473 }
474