1 // Copyright 2016 syzkaller project authors. All rights reserved.
2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
3 
4 // This file is shared between executor and csource package.
5 
6 #include <stdlib.h>
7 #include <sys/syscall.h>
8 #include <sys/types.h>
9 #include <unistd.h>
10 
11 #if SYZ_EXECUTOR
// Forward declarations only: cover_t and cover_reset() are defined outside this file.
// cover_reset() is called (with a 0 argument) from syz_read_part_table and syz_mount_image below.
struct cover_t;
static void cover_reset(cover_t* cov);
14 #endif
15 
16 #if SYZ_EXECUTOR || SYZ_THREADED
17 #include <linux/futex.h>
18 #include <pthread.h>
19 
// event_t is a binary event built on a single futex word.
// state is 0 while the event is unset and 1 after event_set().
typedef struct {
	int state;
} event_t;
23 
// Initializes ev to the unset state (state == 0).
static void event_init(event_t* ev)
{
	int* word = &ev->state;
	*word = 0;
}
28 
// Returns ev to the unset state (state == 0) so it can be set again.
static void event_reset(event_t* ev)
{
	int* word = &ev->state;
	*word = 0;
}
33 
// Marks the event as set and wakes threads blocked on it in
// event_wait()/event_timedwait().
// Calls fail() if the event is already set.
static void event_set(event_t* ev)
{
	if (ev->state)
		fail("event already set");
	// Release store pairs with the acquire loads in the wait/isset functions.
	__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
	// FUTEX_WAKE takes the maximum number of waiters to wake as its val argument.
	// Omitting it passes an indeterminate value (possibly 0, waking nobody),
	// so pass a large explicit count to wake every waiter.
	syscall(SYS_futex, &ev->state, FUTEX_WAKE, 1000000);
}
41 
// Blocks until the event is set.
// The futex wait only sleeps while state is still 0, and the state is
// re-checked after every wakeup.
static void event_wait(event_t* ev)
{
	for (;;) {
		if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
			return;
		syscall(SYS_futex, &ev->state, FUTEX_WAIT, 0, 0);
	}
}
47 
// Non-blocking check: returns the current state word (non-zero once the event is set).
static int event_isset(event_t* ev)
{
	int current = __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
	return current;
}
52 
// Waits for the event to become set for at most timeout milliseconds.
// Returns 1 if the event is set, 0 if the timeout expired first.
static int event_timedwait(event_t* ev, uint64 timeout)
{
	uint64 start = current_time_ms();
	uint64 now = start;
	for (;;) {
		// Time left until the deadline; the check at the bottom of the loop
		// guarantees (now - start) <= timeout here, so this cannot underflow.
		uint64 remain = timeout - (now - start);
		struct timespec ts;
		ts.tv_sec = remain / 1000;
		ts.tv_nsec = (remain % 1000) * 1000 * 1000;
		syscall(SYS_futex, &ev->state, FUTEX_WAIT, 0, &ts);
		// Acquire (not relaxed) so that a caller observing "set" also observes
		// everything written before the release store in event_set(),
		// consistent with event_wait() and event_isset().
		if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
			return 1;
		now = current_time_ms();
		if (now - start > timeout)
			return 0;
	}
}
70 #endif
71 
72 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE || SYZ_ENABLE_NETDEV
73 #include <stdarg.h>
74 #include <stdbool.h>
75 #include <string.h>
76 
// vsnprintf wrapper that calls fail() when formatting reports an error or
// when the result does not fit into the size-byte buffer str.
static void vsnprintf_check(char* str, size_t size, const char* format, va_list args)
{
	int written = vsnprintf(str, size, format, args);

	if (written < 0)
		fail("tun: snprintf failed");
	if (size <= (size_t)written)
		fail("tun: string '%s...' doesn't fit into buffer", str);
}
87 
// Maximum length (including the terminating NUL) of the formatted command passed to execute_command.
#define COMMAND_MAX_LEN 128
// Environment assignment prepended to every command line run by execute_command.
#define PATH_PREFIX "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin "
// Length of PATH_PREFIX without its terminating NUL.
#define PATH_PREFIX_LEN (sizeof(PATH_PREFIX) - 1)
91 
execute_command(bool panic,const char * format,...)92 static void execute_command(bool panic, const char* format, ...)
93 {
94 	va_list args;
95 	char command[PATH_PREFIX_LEN + COMMAND_MAX_LEN];
96 	int rv;
97 
98 	va_start(args, format);
99 	// Executor process does not have any env, including PATH.
100 	// On some distributions, system/shell adds a minimal PATH, on some it does not.
101 	// Set own standard PATH to make it work across distributions.
102 	memcpy(command, PATH_PREFIX, PATH_PREFIX_LEN);
103 	vsnprintf_check(command + PATH_PREFIX_LEN, COMMAND_MAX_LEN, format, args);
104 	va_end(args);
105 	rv = system(command);
106 	if (rv) {
107 		if (panic)
108 			fail("command '%s' failed: %d", &command[0], rv);
109 		debug("command '%s': %d\n", &command[0], rv);
110 	}
111 }
112 #endif
113 
114 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE
115 #include <arpa/inet.h>
116 #include <errno.h>
117 #include <fcntl.h>
118 #include <linux/if.h>
119 #include <linux/if_ether.h>
120 #include <linux/if_tun.h>
121 #include <linux/ip.h>
122 #include <linux/tcp.h>
123 #include <net/if_arp.h>
124 #include <stdarg.h>
125 #include <stdbool.h>
126 #include <sys/ioctl.h>
127 #include <sys/stat.h>
128 
// File descriptor of the tun device; stays -1 unless initialize_tun() opens it.
static int tunfd = -1;
// Non-zero when the effective tun flags include IFF_NAPI_FRAGS (set in initialize_tun()).
static int tun_frags_enabled;

// We just need this to be large enough to hold headers that we parse (ethernet/ip/tcp).
// Rest of the packet (if any) will be silently truncated which is fine.
#define SYZ_TUN_MAX_PACKET_SIZE 1000

// Name of the tap interface created by initialize_tun().
#define TUN_IFACE "syz_tun"

// LOCAL_MAC is assigned to the interface; REMOTE_MAC is used for the permanent neighbour entries.
#define LOCAL_MAC "aa:aa:aa:aa:aa:aa"
#define REMOTE_MAC "aa:aa:aa:aa:aa:bb"

// IPv4 address of the interface (added with a /24 prefix) and of the permanent neighbour.
#define LOCAL_IPV4 "172.20.20.170"
#define REMOTE_IPV4 "172.20.20.187"

// IPv6 address of the interface (added with a /120 prefix) and of the permanent neighbour.
#define LOCAL_IPV6 "fe80::aa"
#define REMOTE_IPV6 "fe80::bb"

// Fallback definitions for headers that do not provide these tun flags.
#ifndef IFF_NAPI
#define IFF_NAPI 0x0010
#endif
#ifndef IFF_NAPI_FRAGS
#define IFF_NAPI_FRAGS 0x0020
#endif
153 
// Opens /dev/net/tun, moves the descriptor to a fixed fd number (240), creates the
// TUN_IFACE tap device and configures its MAC/IPv4/IPv6 addresses and permanent
// neighbour entries by running sysctl/ip commands.
// In executor builds this is a no-op unless flag_enable_tun is set.
// On success tunfd holds the remapped descriptor and tun_frags_enabled records
// whether IFF_NAPI_FRAGS is in effect.
static void initialize_tun(void)
{
#if SYZ_EXECUTOR
	if (!flag_enable_tun)
		return;
#endif
	tunfd = open("/dev/net/tun", O_RDWR | O_NONBLOCK);
	if (tunfd == -1) {
#if SYZ_EXECUTOR
		fail("tun: can't open /dev/net/tun\n");
#else
		// Outside the executor a missing tun device is not fatal:
		// warn and leave tunfd at -1 so the tun-based calls bail out early.
		printf("tun: can't open /dev/net/tun: please enable CONFIG_TUN=y\n");
		printf("otherwise fuzzing or reproducing might not work as intended\n");
		return;
#endif
	}
	// Remap tun onto higher fd number to hide it from fuzzer and to keep
	// fd numbers stable regardless of whether tun is opened or not (also see kMaxFd).
	const int kTunFd = 240;
	if (dup2(tunfd, kTunFd) < 0)
		fail("dup2(tunfd, kTunFd) failed");
	close(tunfd);
	tunfd = kTunFd;

	struct ifreq ifr;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, TUN_IFACE, IFNAMSIZ);
	// First attempt: tap device without packet info header, with NAPI and NAPI frags.
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS;
	if (ioctl(tunfd, TUNSETIFF, (void*)&ifr) < 0) {
		// IFF_NAPI_FRAGS requires root, so try without it.
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
		if (ioctl(tunfd, TUNSETIFF, (void*)&ifr) < 0)
			fail("tun: ioctl(TUNSETIFF) failed");
	}
	// If IFF_NAPI_FRAGS is not supported it will be silently dropped,
	// so query the effective flags.
	if (ioctl(tunfd, TUNGETIFF, (void*)&ifr) < 0)
		fail("tun: ioctl(TUNGETIFF) failed");
	tun_frags_enabled = (ifr.ifr_flags & IFF_NAPI_FRAGS) != 0;
	debug("tun_frags_enabled=%d\n", tun_frags_enabled);

	// Disable IPv6 DAD, otherwise the address remains unusable until DAD completes.
	// Don't panic because this is an optional config.
	execute_command(0, "sysctl -w net.ipv6.conf.%s.accept_dad=0", TUN_IFACE);

	// Disable IPv6 router solicitation to prevent IPv6 spam.
	// Don't panic because this is an optional config.
	execute_command(0, "sysctl -w net.ipv6.conf.%s.router_solicitations=0", TUN_IFACE);
	// There seems to be no way to disable IPv6 MTD to prevent more IPv6 spam.

	// The IPv4/link commands below are mandatory (panic=1): any failure calls fail().
	execute_command(1, "ip link set dev %s address %s", TUN_IFACE, LOCAL_MAC);
	execute_command(1, "ip addr add %s/24 dev %s", LOCAL_IPV4, TUN_IFACE);
	execute_command(1, "ip neigh add %s lladdr %s dev %s nud permanent",
			REMOTE_IPV4, REMOTE_MAC, TUN_IFACE);
	// Don't panic because ipv6 may be not enabled in kernel.
	execute_command(0, "ip -6 addr add %s/120 dev %s", LOCAL_IPV6, TUN_IFACE);
	execute_command(0, "ip -6 neigh add %s lladdr %s dev %s nud permanent",
			REMOTE_IPV6, REMOTE_MAC, TUN_IFACE);
	execute_command(1, "ip link set dev %s up", TUN_IFACE);
}
214 #endif
215 
216 #if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
217 #include <arpa/inet.h>
218 #include <errno.h>
219 #include <fcntl.h>
220 #include <linux/if.h>
221 #include <linux/if_ether.h>
222 #include <linux/if_tun.h>
223 #include <linux/ip.h>
224 #include <linux/tcp.h>
225 #include <net/if_arp.h>
226 #include <stdarg.h>
227 #include <stdbool.h>
228 #include <sys/ioctl.h>
229 #include <sys/stat.h>
230 #include <sys/uio.h>
231 
// Addresses are chosen to be in the same subnet as tun addresses.
// Each macro is a printf format taking one integer (initialize_netdevices passes index + 10).
#define DEV_IPV4 "172.20.20.%d"
#define DEV_IPV6 "fe80::%02hx"
#define DEV_MAC "aa:aa:aa:aa:aa:%02hx"
236 
// Variadic front end for vsnprintf_check: formats into str (size bytes) and
// relies on vsnprintf_check to fail() on error or truncation.
static void snprintf_check(char* str, size_t size, const char* format, ...)
{
	va_list ap;
	va_start(ap, format);
	vsnprintf_check(str, size, format, ap);
	va_end(ap);
}
245 
246 // We test in a separate namespace, which does not have any network devices initially (even lo).
247 // Create/up as many as we can.
// Creates a fixed set of virtual network devices with "ip link add", attaches veth peers
// to the bridge/bond/team masters, then assigns IPv4/IPv6/MAC addresses derived from each
// device's index in devnames and brings the device up.
// Every command is run with panic=0, so failures are only logged via debug().
// In executor builds this is a no-op unless flag_enable_net_dev is set.
static void initialize_netdevices(void)
{
#if SYZ_EXECUTOR
	if (!flag_enable_net_dev)
		return;
#endif
	unsigned i;
	// Device types created as "<type>0" by the first loop below.
	const char* devtypes[] = {"ip6gretap", "bridge", "vcan", "bond", "team"};
	// If you extend this array, also update netdev_addr_id in vnet.txt.
	const char* devnames[] = {"lo", "sit0", "bridge0", "vcan0", "tunl0",
				  "gre0", "gretap0", "ip_vti0", "ip6_vti0",
				  "ip6tnl0", "ip6gre0", "ip6gretap0",
				  "erspan0", "bond0", "veth0", "veth1", "team0",
				  "veth0_to_bridge", "veth1_to_bridge",
				  "veth0_to_bond", "veth1_to_bond",
				  "veth0_to_team", "veth1_to_team"};
	// Master device kinds that get two veth-backed slave devices each.
	const char* devmasters[] = {"bridge", "bond", "team"};

	for (i = 0; i < sizeof(devtypes) / (sizeof(devtypes[0])); i++)
		execute_command(0, "ip link add dev %s0 type %s", devtypes[i], devtypes[i]);
	// This adds connected veth0 and veth1 devices.
	execute_command(0, "ip link add type veth");

	// This creates connected bridge/bond/team_slave devices of type veth,
	// and makes them slaves of bridge/bond/team devices, respectively.
	// Note: slave devices don't need MAC/IP addresses, only master devices.
	//       The veth0_to_* devices are not slaves, so they still need IP addresses.
	for (i = 0; i < sizeof(devmasters) / (sizeof(devmasters[0])); i++) {
		execute_command(0, "ip link add name %s_slave_0 type veth peer name veth0_to_%s", devmasters[i], devmasters[i]);
		execute_command(0, "ip link add name %s_slave_1 type veth peer name veth1_to_%s", devmasters[i], devmasters[i]);
		execute_command(0, "ip link set %s_slave_0 master %s0", devmasters[i], devmasters[i]);
		execute_command(0, "ip link set %s_slave_1 master %s0", devmasters[i], devmasters[i]);
		execute_command(0, "ip link set veth0_to_%s up", devmasters[i]);
		execute_command(0, "ip link set veth1_to_%s up", devmasters[i]);
	}
	// bond/team_slave_* are brought up automatically when their master is set,
	// but bridge_slave_* have to be brought up manually.
	execute_command(0, "ip link set bridge_slave_0 up");
	execute_command(0, "ip link set bridge_slave_1 up");

	for (i = 0; i < sizeof(devnames) / (sizeof(devnames[0])); i++) {
		// Scratch buffer reused for the IPv4, IPv6 and MAC address strings in turn.
		char addr[32];
		// Assign some unique address to devices. Some devices won't up without this.
		// Devices that don't need these addresses will simply ignore them.
		// Shift addresses by 10 because 0 subnet address can mean special things.
		snprintf_check(addr, sizeof(addr), DEV_IPV4, i + 10);
		execute_command(0, "ip -4 addr add %s/24 dev %s", addr, devnames[i]);
		snprintf_check(addr, sizeof(addr), DEV_IPV6, i + 10);
		execute_command(0, "ip -6 addr add %s/120 dev %s", addr, devnames[i]);
		snprintf_check(addr, sizeof(addr), DEV_MAC, i + 10);
		execute_command(0, "ip link set dev %s address %s", devnames[i], addr);
		execute_command(0, "ip link set dev %s up", devnames[i]);
	}
}
302 #endif
303 
304 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE && (__NR_syz_extract_tcp_res || SYZ_REPEAT)
305 #include <errno.h>
306 
read_tun(char * data,int size)307 static int read_tun(char* data, int size)
308 {
309 	if (tunfd < 0)
310 		return -1;
311 
312 	int rv = read(tunfd, data, size);
313 	if (rv < 0) {
314 		if (errno == EAGAIN)
315 			return -1;
316 		// Tun sometimes returns this, unclear if it's a kernel bug or not.
317 		if (errno == EBADFD)
318 			return -1;
319 		fail("tun: read failed with %d", rv);
320 	}
321 	return rv;
322 }
323 #endif
324 
325 #if SYZ_EXECUTOR || __NR_syz_emit_ethernet && SYZ_TUN_ENABLE
326 #include <stdbool.h>
327 #include <sys/uio.h>
328 
// Maximum number of fragment sizes carried in vnet_fragmentation.
#define MAX_FRAGS 4
// Fragmentation description passed to syz_emit_ethernet (read from fuzzer-controlled memory).
struct vnet_fragmentation {
	// Non-zero: bytes left after the listed fragments are sent as one extra fragment.
	uint32 full;
	// Number of entries of frags to use (clamped to MAX_FRAGS by syz_emit_ethernet).
	uint32 count;
	// Requested size of each fragment in bytes.
	uint32 frags[MAX_FRAGS];
};
335 
syz_emit_ethernet(long a0,long a1,long a2)336 static long syz_emit_ethernet(long a0, long a1, long a2)
337 {
338 	// syz_emit_ethernet(len len[packet], packet ptr[in, eth_packet], frags ptr[in, vnet_fragmentation, opt])
339 	// vnet_fragmentation {
340 	// 	full	int32[0:1]
341 	// 	count	int32[1:4]
342 	// 	frags	array[int32[0:4096], 4]
343 	// }
344 	if (tunfd < 0)
345 		return (uintptr_t)-1;
346 
347 	uint32 length = a0;
348 	char* data = (char*)a1;
349 	debug_dump_data(data, length);
350 
351 	struct vnet_fragmentation* frags = (struct vnet_fragmentation*)a2;
352 	struct iovec vecs[MAX_FRAGS + 1];
353 	uint32 nfrags = 0;
354 	if (!tun_frags_enabled || frags == NULL) {
355 		vecs[nfrags].iov_base = data;
356 		vecs[nfrags].iov_len = length;
357 		nfrags++;
358 	} else {
359 		bool full = true;
360 		uint32 i, count = 0;
361 		NONFAILING(full = frags->full);
362 		NONFAILING(count = frags->count);
363 		if (count > MAX_FRAGS)
364 			count = MAX_FRAGS;
365 		for (i = 0; i < count && length != 0; i++) {
366 			uint32 size = 0;
367 			NONFAILING(size = frags->frags[i]);
368 			if (size > length)
369 				size = length;
370 			vecs[nfrags].iov_base = data;
371 			vecs[nfrags].iov_len = size;
372 			nfrags++;
373 			data += size;
374 			length -= size;
375 		}
376 		if (length != 0 && (full || nfrags == 0)) {
377 			vecs[nfrags].iov_base = data;
378 			vecs[nfrags].iov_len = length;
379 			nfrags++;
380 		}
381 	}
382 	return writev(tunfd, vecs, nfrags);
383 }
384 #endif
385 
386 #if SYZ_EXECUTOR || SYZ_REPEAT && SYZ_TUN_ENABLE
flush_tun()387 static void flush_tun()
388 {
389 #if SYZ_EXECUTOR
390 	if (!flag_enable_tun)
391 		return;
392 #endif
393 	char data[SYZ_TUN_MAX_PACKET_SIZE];
394 	while (read_tun(&data[0], sizeof(data)) != -1) {
395 	}
396 }
397 #endif
398 
399 #if SYZ_EXECUTOR || __NR_syz_extract_tcp_res && SYZ_TUN_ENABLE
400 #ifndef __ANDROID__
// Can't include <linux/ipv6.h>, since it causes
// conflicts due to some structs redefinition.
// Local definition of the IPv6 header; syz_extract_tcp_res uses its size and nexthdr field.
// NOTE(review): the priority/version bit-field order looks like the little-endian layout — confirm for big-endian targets.
struct ipv6hdr {
	__u8 priority : 4,
	    version : 4;
	__u8 flow_lbl[3];

	__be16 payload_len;
	__u8 nexthdr;
	__u8 hop_limit;

	struct in6_addr saddr;
	struct in6_addr daddr;
};
415 #endif
416 
// Output of syz_extract_tcp_res: sequence and acknowledgement numbers taken from a
// received TCP header with the caller's increments added, stored in network byte order.
struct tcp_resources {
	uint32 seq;
	uint32 ack;
};
421 
syz_extract_tcp_res(long a0,long a1,long a2)422 static long syz_extract_tcp_res(long a0, long a1, long a2)
423 {
424 	// syz_extract_tcp_res(res ptr[out, tcp_resources], seq_inc int32, ack_inc int32)
425 
426 	if (tunfd < 0)
427 		return (uintptr_t)-1;
428 
429 	char data[SYZ_TUN_MAX_PACKET_SIZE];
430 	int rv = read_tun(&data[0], sizeof(data));
431 	if (rv == -1)
432 		return (uintptr_t)-1;
433 	size_t length = rv;
434 	debug_dump_data(data, length);
435 
436 	struct tcphdr* tcphdr;
437 
438 	if (length < sizeof(struct ethhdr))
439 		return (uintptr_t)-1;
440 	struct ethhdr* ethhdr = (struct ethhdr*)&data[0];
441 
442 	if (ethhdr->h_proto == htons(ETH_P_IP)) {
443 		if (length < sizeof(struct ethhdr) + sizeof(struct iphdr))
444 			return (uintptr_t)-1;
445 		struct iphdr* iphdr = (struct iphdr*)&data[sizeof(struct ethhdr)];
446 		if (iphdr->protocol != IPPROTO_TCP)
447 			return (uintptr_t)-1;
448 		if (length < sizeof(struct ethhdr) + iphdr->ihl * 4 + sizeof(struct tcphdr))
449 			return (uintptr_t)-1;
450 		tcphdr = (struct tcphdr*)&data[sizeof(struct ethhdr) + iphdr->ihl * 4];
451 	} else {
452 		if (length < sizeof(struct ethhdr) + sizeof(struct ipv6hdr))
453 			return (uintptr_t)-1;
454 		struct ipv6hdr* ipv6hdr = (struct ipv6hdr*)&data[sizeof(struct ethhdr)];
455 		// TODO: parse and skip extension headers.
456 		if (ipv6hdr->nexthdr != IPPROTO_TCP)
457 			return (uintptr_t)-1;
458 		if (length < sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
459 			return (uintptr_t)-1;
460 		tcphdr = (struct tcphdr*)&data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr)];
461 	}
462 
463 	struct tcp_resources* res = (struct tcp_resources*)a0;
464 	NONFAILING(res->seq = htonl((ntohl(tcphdr->seq) + (uint32)a1)));
465 	NONFAILING(res->ack = htonl((ntohl(tcphdr->ack_seq) + (uint32)a2)));
466 
467 	debug("extracted seq: %08x\n", res->seq);
468 	debug("extracted ack: %08x\n", res->ack);
469 
470 	return 0;
471 }
472 #endif
473 
474 #if SYZ_EXECUTOR || __NR_syz_open_dev
475 #include <fcntl.h>
476 #include <string.h>
477 #include <sys/stat.h>
478 #include <sys/types.h>
479 
syz_open_dev(long a0,long a1,long a2)480 static long syz_open_dev(long a0, long a1, long a2)
481 {
482 	if (a0 == 0xc || a0 == 0xb) {
483 		// syz_open_dev$char(dev const[0xc], major intptr, minor intptr) fd
484 		// syz_open_dev$block(dev const[0xb], major intptr, minor intptr) fd
485 		char buf[128];
486 		sprintf(buf, "/dev/%s/%d:%d", a0 == 0xc ? "char" : "block", (uint8)a1, (uint8)a2);
487 		return open(buf, O_RDWR, 0);
488 	} else {
489 		// syz_open_dev(dev strconst, id intptr, flags flags[open_flags]) fd
490 		char buf[1024];
491 		char* hash;
492 		NONFAILING(strncpy(buf, (char*)a0, sizeof(buf) - 1));
493 		buf[sizeof(buf) - 1] = 0;
494 		while ((hash = strchr(buf, '#'))) {
495 			*hash = '0' + (char)(a1 % 10); // 10 devices should be enough for everyone.
496 			a1 /= 10;
497 		}
498 		return open(buf, a2, 0);
499 	}
500 }
501 #endif
502 
503 #if SYZ_EXECUTOR || __NR_syz_open_procfs
504 #include <fcntl.h>
505 #include <string.h>
506 #include <sys/stat.h>
507 #include <sys/types.h>
508 
// Opens a file under /proc for the current process or one of its threads.
// a0 selects the directory: 0 -> /proc/self, -1 -> /proc/thread-self,
// anything else -> /proc/self/task/<a0>. a1 points to the file name.
// The file is opened read-write, falling back to read-only if that fails.
static long syz_open_procfs(long a0, long a1)
{
	// syz_open_procfs(pid pid, file ptr[in, string[procfs_file]]) fd

	char path[128];
	memset(path, 0, sizeof(path));
	if (a0 == -1) {
		NONFAILING(snprintf(path, sizeof(path), "/proc/thread-self/%s", (char*)a1));
	} else if (a0 == 0) {
		NONFAILING(snprintf(path, sizeof(path), "/proc/self/%s", (char*)a1));
	} else {
		NONFAILING(snprintf(path, sizeof(path), "/proc/self/task/%d/%s", (int)a0, (char*)a1));
	}
	int fd = open(path, O_RDWR);
	if (fd != -1)
		return fd;
	return open(path, O_RDONLY);
}
527 #endif
528 
529 #if SYZ_EXECUTOR || __NR_syz_open_pts
530 #include <fcntl.h>
531 #include <sys/ioctl.h>
532 #include <sys/stat.h>
533 #include <sys/types.h>
534 
// Queries the pty number of descriptor a0 via TIOCGPTN and opens the matching
// /dev/pts/<n> node with open flags a1.
// Returns the new descriptor, or -1 if the ioctl fails.
static long syz_open_pts(long a0, long a1)
{
	// syz_openpts(fd fd[tty], flags flags[open_flags]) fd[tty]
	int index = 0;
	if (ioctl(a0, TIOCGPTN, &index) != 0)
		return -1;

	char path[128];
	sprintf(path, "/dev/pts/%d", index);
	return open(path, a1, 0);
}
545 #endif
546 
547 #if SYZ_EXECUTOR || __NR_syz_init_net_socket
548 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE || SYZ_SANDBOX_SETUID || SYZ_SANDBOX_NAMESPACE
549 #include <fcntl.h>
550 #include <sched.h>
551 #include <sys/stat.h>
552 #include <sys/types.h>
553 #include <unistd.h>
554 
// Descriptor number through which the init net namespace is reached with setns() below.
const int kInitNetNsFd = 239; // see kMaxFd
// syz_init_net_socket opens a socket in init net namespace.
// Used for families that can only be created in init net namespace.
// Temporarily switches to the init net namespace, creates the socket and switches
// back to the original namespace. Returns the socket fd, or -1 with errno set.
static long syz_init_net_socket(long domain, long type, long proto)
{
	// Handle to the current namespace so that we can return to it afterwards.
	int netns = open("/proc/self/ns/net", O_RDONLY);
	if (netns == -1)
		return netns;
	if (setns(kInitNetNsFd, 0)) {
		// Don't leak the namespace handle on this early exit,
		// and keep setns' errno for the caller.
		int setns_err = errno;
		close(netns);
		errno = setns_err;
		return -1;
	}
	int sock = syscall(__NR_socket, domain, type, proto);
	// Preserve the socket() errno across the cleanup calls below.
	int err = errno;
	// Failing to return to the original namespace is unrecoverable.
	if (setns(netns, 0))
		fail("setns(netns) failed");
	close(netns);
	errno = err;
	return sock;
}
573 #else
// Variant used when none of the sandbox modes is selected:
// creates the socket directly with no namespace switching.
static long syz_init_net_socket(long domain, long type, long proto)
{
	long sock = syscall(__NR_socket, domain, type, proto);
	return sock;
}
578 #endif
579 #endif
580 
581 #if SYZ_EXECUTOR || __NR_syz_genetlink_get_family_id
582 #include <errno.h>
583 #include <linux/genetlink.h>
584 #include <linux/netlink.h>
585 #include <sys/socket.h>
586 #include <sys/types.h>
587 
syz_genetlink_get_family_id(long name)588 static long syz_genetlink_get_family_id(long name)
589 {
590 	char buf[512] = {0};
591 	struct nlmsghdr* hdr = (struct nlmsghdr*)buf;
592 	struct genlmsghdr* genlhdr = (struct genlmsghdr*)NLMSG_DATA(hdr);
593 	struct nlattr* attr = (struct nlattr*)(genlhdr + 1);
594 	hdr->nlmsg_len = sizeof(*hdr) + sizeof(*genlhdr) + sizeof(*attr) + GENL_NAMSIZ;
595 	hdr->nlmsg_type = GENL_ID_CTRL;
596 	hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
597 	genlhdr->cmd = CTRL_CMD_GETFAMILY;
598 	attr->nla_type = CTRL_ATTR_FAMILY_NAME;
599 	attr->nla_len = sizeof(*attr) + GENL_NAMSIZ;
600 	NONFAILING(strncpy((char*)(attr + 1), (char*)name, GENL_NAMSIZ));
601 	struct iovec iov = {hdr, hdr->nlmsg_len};
602 	struct sockaddr_nl addr = {0};
603 	addr.nl_family = AF_NETLINK;
604 	debug("syz_genetlink_get_family_id(%s)\n", (char*)(attr + 1));
605 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
606 	if (fd == -1) {
607 		debug("syz_genetlink_get_family_id: socket failed: %d\n", errno);
608 		return -1;
609 	}
610 	struct msghdr msg = {&addr, sizeof(addr), &iov, 1, NULL, 0, 0};
611 	if (sendmsg(fd, &msg, 0) == -1) {
612 		debug("syz_genetlink_get_family_id: sendmsg failed: %d\n", errno);
613 		close(fd);
614 		return -1;
615 	}
616 	ssize_t n = recv(fd, buf, sizeof(buf), 0);
617 	close(fd);
618 	if (n <= 0) {
619 		debug("syz_genetlink_get_family_id: recv failed: %d\n", errno);
620 		return -1;
621 	}
622 	if (hdr->nlmsg_type != GENL_ID_CTRL) {
623 		debug("syz_genetlink_get_family_id: wrong reply type: %d\n", hdr->nlmsg_type);
624 		return -1;
625 	}
626 	for (; (char*)attr < buf + n; attr = (struct nlattr*)((char*)attr + NLMSG_ALIGN(attr->nla_len))) {
627 		if (attr->nla_type == CTRL_ATTR_FAMILY_ID)
628 			return *(uint16*)(attr + 1);
629 	}
630 	debug("syz_genetlink_get_family_id: no CTRL_ATTR_FAMILY_ID attr\n");
631 	return -1;
632 }
633 #endif
634 
635 #if SYZ_EXECUTOR || __NR_syz_mount_image || __NR_syz_read_part_table
636 #include <errno.h>
637 #include <fcntl.h>
638 #include <linux/loop.h>
639 #include <sys/ioctl.h>
640 #include <sys/stat.h>
641 #include <sys/types.h>
642 
// One piece of a disk image: size bytes at data are written at byte offset
// offset of the image (see the pwrite calls in syz_read_part_table/syz_mount_image).
struct fs_image_segment {
	void* data;
	uintptr_t size;
	uintptr_t offset;
};
648 
// Upper bound on the number of segments processed per image.
#define IMAGE_MAX_SEGMENTS 4096
// Upper bound on the image size and on any segment's size/offset, in bytes (129 MiB).
#define IMAGE_MAX_SIZE (129 << 20)

// memfd_create syscall number for each supported GOARCH_* target.
// Left undefined for any other architecture.
#if GOARCH_386
#define SYZ_memfd_create 356
#elif GOARCH_amd64
#define SYZ_memfd_create 319
#elif GOARCH_arm
#define SYZ_memfd_create 385
#elif GOARCH_arm64
#define SYZ_memfd_create 279
#elif GOARCH_ppc64le
#define SYZ_memfd_create 360
#endif
663 #endif
664 
665 #if SYZ_EXECUTOR || __NR_syz_read_part_table
666 // syz_read_part_table(size intptr, nsegs len[segments], segments ptr[in, array[fs_image_segment]])
syz_read_part_table(unsigned long size,unsigned long nsegs,long segments)667 static long syz_read_part_table(unsigned long size, unsigned long nsegs, long segments)
668 {
669 	char loopname[64], linkname[64];
670 	int loopfd, err = 0, res = -1;
671 	unsigned long i, j;
672 	// See the comment in syz_mount_image.
673 	struct fs_image_segment* segs = (struct fs_image_segment*)segments;
674 
675 	if (nsegs > IMAGE_MAX_SEGMENTS)
676 		nsegs = IMAGE_MAX_SEGMENTS;
677 	for (i = 0; i < nsegs; i++) {
678 		if (segs[i].size > IMAGE_MAX_SIZE)
679 			segs[i].size = IMAGE_MAX_SIZE;
680 		segs[i].offset %= IMAGE_MAX_SIZE;
681 		if (segs[i].offset > IMAGE_MAX_SIZE - segs[i].size)
682 			segs[i].offset = IMAGE_MAX_SIZE - segs[i].size;
683 		if (size < segs[i].offset + segs[i].offset)
684 			size = segs[i].offset + segs[i].offset;
685 	}
686 	if (size > IMAGE_MAX_SIZE)
687 		size = IMAGE_MAX_SIZE;
688 	int memfd = syscall(SYZ_memfd_create, "syz_read_part_table", 0);
689 	if (memfd == -1) {
690 		err = errno;
691 		goto error;
692 	}
693 	if (ftruncate(memfd, size)) {
694 		err = errno;
695 		goto error_close_memfd;
696 	}
697 	for (i = 0; i < nsegs; i++) {
698 		if (pwrite(memfd, segs[i].data, segs[i].size, segs[i].offset) < 0) {
699 			debug("syz_read_part_table: pwrite[%u] failed: %d\n", (int)i, errno);
700 		}
701 	}
702 	snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
703 	loopfd = open(loopname, O_RDWR);
704 	if (loopfd == -1) {
705 		err = errno;
706 		goto error_close_memfd;
707 	}
708 	if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
709 		if (errno != EBUSY) {
710 			err = errno;
711 			goto error_close_loop;
712 		}
713 		ioctl(loopfd, LOOP_CLR_FD, 0);
714 		usleep(1000);
715 		if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
716 			err = errno;
717 			goto error_close_loop;
718 		}
719 	}
720 	struct loop_info64 info;
721 	if (ioctl(loopfd, LOOP_GET_STATUS64, &info)) {
722 		err = errno;
723 		goto error_clear_loop;
724 	}
725 #if SYZ_EXECUTOR
726 	cover_reset(0);
727 #endif
728 	info.lo_flags |= LO_FLAGS_PARTSCAN;
729 	if (ioctl(loopfd, LOOP_SET_STATUS64, &info)) {
730 		err = errno;
731 		goto error_clear_loop;
732 	}
733 	res = 0;
734 	// If we managed to parse some partitions, symlink them into our work dir.
735 	for (i = 1, j = 0; i < 8; i++) {
736 		snprintf(loopname, sizeof(loopname), "/dev/loop%llup%d", procid, (int)i);
737 		struct stat statbuf;
738 		if (stat(loopname, &statbuf) == 0) {
739 			snprintf(linkname, sizeof(linkname), "./file%d", (int)j++);
740 			if (symlink(loopname, linkname)) {
741 				debug("syz_read_part_table: symlink(%s, %s) failed: %d\n", loopname, linkname, errno);
742 			}
743 		}
744 	}
745 error_clear_loop:
746 	ioctl(loopfd, LOOP_CLR_FD, 0);
747 error_close_loop:
748 	close(loopfd);
749 error_close_memfd:
750 	close(memfd);
751 error:
752 	errno = err;
753 	return res;
754 }
755 #endif
756 
757 #if SYZ_EXECUTOR || __NR_syz_mount_image
758 #include <string.h>
759 #include <sys/mount.h>
760 
761 //syz_mount_image(fs ptr[in, string[disk_filesystems]], dir ptr[in, filename], size intptr, nsegs len[segments], segments ptr[in, array[fs_image_segment]], flags flags[mount_flags], opts ptr[in, fs_options[vfat_options]])
762 //fs_image_segment {
763 //	data	ptr[in, array[int8]]
764 //	size	len[data, intptr]
765 //	offset	intptr
766 //}
syz_mount_image(long fsarg,long dir,unsigned long size,unsigned long nsegs,long segments,long flags,long optsarg)767 static long syz_mount_image(long fsarg, long dir, unsigned long size, unsigned long nsegs, long segments, long flags, long optsarg)
768 {
769 	char loopname[64], fs[32], opts[256];
770 	int loopfd, err = 0, res = -1;
771 	unsigned long i;
772 	// Strictly saying we ought to do a nonfailing copyout of segments into a local var.
773 	// But some filesystems have large number of segments (2000+),
774 	// we can't allocate that much on stack and allocating elsewhere is problematic,
775 	// so we just use the memory allocated by fuzzer.
776 	struct fs_image_segment* segs = (struct fs_image_segment*)segments;
777 
778 	if (nsegs > IMAGE_MAX_SEGMENTS)
779 		nsegs = IMAGE_MAX_SEGMENTS;
780 	for (i = 0; i < nsegs; i++) {
781 		if (segs[i].size > IMAGE_MAX_SIZE)
782 			segs[i].size = IMAGE_MAX_SIZE;
783 		segs[i].offset %= IMAGE_MAX_SIZE;
784 		if (segs[i].offset > IMAGE_MAX_SIZE - segs[i].size)
785 			segs[i].offset = IMAGE_MAX_SIZE - segs[i].size;
786 		if (size < segs[i].offset + segs[i].offset)
787 			size = segs[i].offset + segs[i].offset;
788 	}
789 	if (size > IMAGE_MAX_SIZE)
790 		size = IMAGE_MAX_SIZE;
791 	int memfd = syscall(SYZ_memfd_create, "syz_mount_image", 0);
792 	if (memfd == -1) {
793 		err = errno;
794 		goto error;
795 	}
796 	if (ftruncate(memfd, size)) {
797 		err = errno;
798 		goto error_close_memfd;
799 	}
800 	for (i = 0; i < nsegs; i++) {
801 		if (pwrite(memfd, segs[i].data, segs[i].size, segs[i].offset) < 0) {
802 			debug("syz_mount_image: pwrite[%u] failed: %d\n", (int)i, errno);
803 		}
804 	}
805 	snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
806 	loopfd = open(loopname, O_RDWR);
807 	if (loopfd == -1) {
808 		err = errno;
809 		goto error_close_memfd;
810 	}
811 	if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
812 		if (errno != EBUSY) {
813 			err = errno;
814 			goto error_close_loop;
815 		}
816 		ioctl(loopfd, LOOP_CLR_FD, 0);
817 		usleep(1000);
818 		if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
819 			err = errno;
820 			goto error_close_loop;
821 		}
822 	}
823 	mkdir((char*)dir, 0777);
824 	memset(fs, 0, sizeof(fs));
825 	NONFAILING(strncpy(fs, (char*)fsarg, sizeof(fs) - 1));
826 	memset(opts, 0, sizeof(opts));
827 	// Leave some space for the additional options we append below.
828 	NONFAILING(strncpy(opts, (char*)optsarg, sizeof(opts) - 32));
829 	if (strcmp(fs, "iso9660") == 0) {
830 		flags |= MS_RDONLY;
831 	} else if (strncmp(fs, "ext", 3) == 0) {
832 		// For ext2/3/4 we have to have errors=continue because the image
833 		// can contain errors=panic flag and can legally crash kernel.
834 		if (strstr(opts, "errors=panic") || strstr(opts, "errors=remount-ro") == 0)
835 			strcat(opts, ",errors=continue");
836 	} else if (strcmp(fs, "xfs") == 0) {
837 		// For xfs we need nouuid because xfs has a global uuids table
838 		// and if two parallel executors mounts fs with the same uuid, second mount fails.
839 		strcat(opts, ",nouuid");
840 	}
841 	debug("syz_mount_image: size=%llu segs=%llu loop='%s' dir='%s' fs='%s' flags=%llu opts='%s'\n", (uint64)size, (uint64)nsegs, loopname, (char*)dir, fs, (uint64)flags, opts);
842 #if SYZ_EXECUTOR
843 	cover_reset(0);
844 #endif
845 	if (mount(loopname, (char*)dir, fs, flags, opts)) {
846 		err = errno;
847 		goto error_clear_loop;
848 	}
849 	res = 0;
850 error_clear_loop:
851 	ioctl(loopfd, LOOP_CLR_FD, 0);
852 error_close_loop:
853 	close(loopfd);
854 error_close_memfd:
855 	close(memfd);
856 error:
857 	errno = err;
858 	return res;
859 }
860 #endif
861 
862 #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
863 #include <errno.h>
864 #include <fcntl.h>
865 #include <linux/kvm.h>
866 #include <stdarg.h>
867 #include <stddef.h>
868 #include <sys/ioctl.h>
869 #include <sys/stat.h>
870 
871 #if defined(__x86_64__)
872 #include "common_kvm_amd64.h"
873 #elif defined(__aarch64__)
874 #include "common_kvm_arm64.h"
875 #else
// Fallback used when neither the x86_64 nor the aarch64 implementation is included:
// ignores all eight arguments and returns 0.
static long syz_kvm_setup_cpu(long a0, long a1, long a2, long a3, long a4, long a5, long a6, long a7)
{
	(void)a0;
	(void)a1;
	(void)a2;
	(void)a3;
	(void)a4;
	(void)a5;
	(void)a6;
	(void)a7;
	const long result = 0;
	return result;
}
880 #endif
881 #endif
882 
883 #if SYZ_EXECUTOR || SYZ_FAULT_INJECTION || SYZ_SANDBOX_NAMESPACE || SYZ_ENABLE_CGROUPS
884 #include <errno.h>
885 #include <fcntl.h>
886 #include <stdarg.h>
887 #include <stdbool.h>
888 #include <string.h>
889 #include <sys/stat.h>
890 #include <sys/types.h>
891 
write_file(const char * file,const char * what,...)892 static bool write_file(const char* file, const char* what, ...)
893 {
894 	char buf[1024];
895 	va_list args;
896 	va_start(args, what);
897 	vsnprintf(buf, sizeof(buf), what, args);
898 	va_end(args);
899 	buf[sizeof(buf) - 1] = 0;
900 	int len = strlen(buf);
901 
902 	int fd = open(file, O_WRONLY | O_CLOEXEC);
903 	if (fd == -1)
904 		return false;
905 	if (write(fd, buf, len) != len) {
906 		int err = errno;
907 		close(fd);
908 		errno = err;
909 		return false;
910 	}
911 	close(fd);
912 	return true;
913 }
914 #endif
915 
916 #if SYZ_EXECUTOR || SYZ_RESET_NET_NAMESPACE
917 #include <errno.h>
918 #include <linux/net.h>
919 #include <netinet/in.h>
920 #include <string.h>
921 #include <sys/socket.h>
922 
923 // checkpoint/reset_net_namespace partially resets net namespace to initial state
924 // after each test. Currently it resets only ipv4 netfilter state.
925 // Ideally, we just create a new net namespace for each test,
926 // however it's too slow (1-1.5 seconds per namespace, not parallelizable).
927 
// Linux headers do not compile for C++, so we have to define the structs manually.
// XT_TABLE_SIZE is the size in bytes of the entry-table buffers in the structs below.
#define XT_TABLE_SIZE 1536
// NOTE(review): presumably the maximum number of table entries handled — confirm at the use sites.
#define XT_MAX_ENTRIES 10
931 
// Manually defined counterpart of the Linux struct of the same name.
// NOTE(review): pcnt/bcnt are presumably packet and byte counters — confirm against the kernel headers.
struct xt_counters {
	uint64 pcnt, bcnt;
};
935 
// Manually defined counterpart of the Linux struct of the same name
// (table name, hook bitmask, per-hook offsets and table dimensions).
// NOTE(review): presumably the argument of the IPT_SO_GET_INFO socket option — confirm at the use sites.
struct ipt_getinfo {
	char name[32];
	unsigned int valid_hooks;
	unsigned int hook_entry[5];
	unsigned int underflow[5];
	unsigned int num_entries;
	unsigned int size;
};
944 
// Manually declared buffer for getsockopt(IPT_SO_GET_ENTRIES).
// The caller sets `name` and `size` (the table size taken from ipt_getinfo);
// the table entries are then read from `entrytable`, which provides
// XT_TABLE_SIZE bytes of storage declared as an array of pointers.
struct ipt_get_entries {
	char name[32];
	unsigned int size;
	void* entrytable[XT_TABLE_SIZE / sizeof(void*)];
};
950 
// Manually declared argument of setsockopt(IPT_SO_SET_REPLACE).
// checkpoint_iptables fills name, valid_hooks, num_entries, size, hook_entry,
// underflow and entrytable from the checkpointed table; reset_iptables sets
// num_counters and counters right before issuing the replace request.
struct ipt_replace {
	char name[32];
	unsigned int valid_hooks;
	unsigned int num_entries;
	unsigned int size;
	unsigned int hook_entry[5];
	unsigned int underflow[5];
	unsigned int num_counters;
	struct xt_counters* counters;
	char entrytable[XT_TABLE_SIZE];
};
962 
// Checkpoint record for one iptables table.
//   name:    table name used in all socket option requests.
//   info:    table header captured by checkpoint_iptables; valid_hooks == 0
//            makes reset_iptables skip the table.
//   replace: ready-to-send replace request holding the checkpointed entries.
struct ipt_table_desc {
	const char* name;
	struct ipt_getinfo info;
	struct ipt_replace replace;
};
968 
// Tables checkpointed/reset for AF_INET (used with SOL_IP).
// Only the names are set here; the rest is filled by checkpoint_iptables.
static struct ipt_table_desc ipv4_tables[] = {
    {.name = "filter"},
    {.name = "nat"},
    {.name = "mangle"},
    {.name = "raw"},
    {.name = "security"},
};
976 
// Tables checkpointed/reset for AF_INET6 (used with SOL_IPV6).
// Only the names are set here; the rest is filled by checkpoint_iptables.
static struct ipt_table_desc ipv6_tables[] = {
    {.name = "filter"},
    {.name = "nat"},
    {.name = "mangle"},
    {.name = "raw"},
    {.name = "security"},
};
984 
// Socket option numbers for ip(6)tables requests.
// IPT_SO_SET_REPLACE and IPT_SO_GET_INFO have the same value: the former is
// passed to setsockopt, the latter to getsockopt.
#define IPT_BASE_CTL 64
#define IPT_SO_SET_REPLACE (IPT_BASE_CTL)
#define IPT_SO_GET_INFO (IPT_BASE_CTL)
#define IPT_SO_GET_ENTRIES (IPT_BASE_CTL + 1)
989 
// Manually declared request/reply buffer for getsockopt(ARPT_SO_GET_INFO).
// Same shape as ipt_getinfo but with 3 hook slots instead of 5.
// The caller fills `name`; the other fields are read back after the call.
struct arpt_getinfo {
	char name[32];
	unsigned int valid_hooks;
	unsigned int hook_entry[3];
	unsigned int underflow[3];
	unsigned int num_entries;
	unsigned int size;
};
998 
// Manually declared buffer for getsockopt(ARPT_SO_GET_ENTRIES).
// The caller sets `name` and `size`; the table entries are then read from
// `entrytable` (XT_TABLE_SIZE bytes of storage).
struct arpt_get_entries {
	char name[32];
	unsigned int size;
	void* entrytable[XT_TABLE_SIZE / sizeof(void*)];
};
1004 
// Manually declared argument of setsockopt(ARPT_SO_SET_REPLACE).
// Same shape as ipt_replace but with 3 hook slots instead of 5.
// checkpoint_arptables fills everything except num_counters/counters,
// which reset_arptables sets right before issuing the replace request.
struct arpt_replace {
	char name[32];
	unsigned int valid_hooks;
	unsigned int num_entries;
	unsigned int size;
	unsigned int hook_entry[3];
	unsigned int underflow[3];
	unsigned int num_counters;
	struct xt_counters* counters;
	char entrytable[XT_TABLE_SIZE];
};
1016 
// Checkpoint record for one arptables table.
//   name:    table name used in all socket option requests.
//   info:    table header captured by checkpoint_arptables; valid_hooks == 0
//            makes reset_arptables skip the table.
//   replace: ready-to-send replace request holding the checkpointed entries.
struct arpt_table_desc {
	const char* name;
	struct arpt_getinfo info;
	struct arpt_replace replace;
};
1022 
// arptables tables that are checkpointed/reset (only "filter").
// Only the name is set here; the rest is filled by checkpoint_arptables.
static struct arpt_table_desc arpt_tables[] = {
    {.name = "filter"},
};
1026 
// Socket option numbers for arptables requests (issued at level SOL_IP).
// ARPT_SO_SET_REPLACE and ARPT_SO_GET_INFO have the same value: the former
// is passed to setsockopt, the latter to getsockopt.
#define ARPT_BASE_CTL 96
#define ARPT_SO_SET_REPLACE (ARPT_BASE_CTL)
#define ARPT_SO_GET_INFO (ARPT_BASE_CTL)
#define ARPT_SO_GET_ENTRIES (ARPT_BASE_CTL + 1)
1031 
checkpoint_iptables(struct ipt_table_desc * tables,int num_tables,int family,int level)1032 static void checkpoint_iptables(struct ipt_table_desc* tables, int num_tables, int family, int level)
1033 {
1034 	struct ipt_get_entries entries;
1035 	socklen_t optlen;
1036 	int fd, i;
1037 
1038 	fd = socket(family, SOCK_STREAM, IPPROTO_TCP);
1039 	if (fd == -1) {
1040 		switch (errno) {
1041 		case EAFNOSUPPORT:
1042 		case ENOPROTOOPT:
1043 			return;
1044 		}
1045 		fail("iptable checkpoint %d: socket failed", family);
1046 	}
1047 	for (i = 0; i < num_tables; i++) {
1048 		struct ipt_table_desc* table = &tables[i];
1049 		strcpy(table->info.name, table->name);
1050 		strcpy(table->replace.name, table->name);
1051 		optlen = sizeof(table->info);
1052 		if (getsockopt(fd, level, IPT_SO_GET_INFO, &table->info, &optlen)) {
1053 			switch (errno) {
1054 			case EPERM:
1055 			case ENOENT:
1056 			case ENOPROTOOPT:
1057 				continue;
1058 			}
1059 			fail("iptable checkpoint %s/%d: getsockopt(IPT_SO_GET_INFO)", table->name, family);
1060 		}
1061 		debug("iptable checkpoint %s/%d: checkpoint entries=%d hooks=%x size=%d\n",
1062 		      table->name, family, table->info.num_entries,
1063 		      table->info.valid_hooks, table->info.size);
1064 		if (table->info.size > sizeof(table->replace.entrytable))
1065 			fail("iptable checkpoint %s/%d: table size is too large: %u",
1066 			     table->name, family, table->info.size);
1067 		if (table->info.num_entries > XT_MAX_ENTRIES)
1068 			fail("iptable checkpoint %s/%d: too many counters: %u",
1069 			     table->name, family, table->info.num_entries);
1070 		memset(&entries, 0, sizeof(entries));
1071 		strcpy(entries.name, table->name);
1072 		entries.size = table->info.size;
1073 		optlen = sizeof(entries) - sizeof(entries.entrytable) + table->info.size;
1074 		if (getsockopt(fd, level, IPT_SO_GET_ENTRIES, &entries, &optlen))
1075 			fail("iptable checkpoint %s/%d: getsockopt(IPT_SO_GET_ENTRIES)",
1076 			     table->name, family);
1077 		table->replace.valid_hooks = table->info.valid_hooks;
1078 		table->replace.num_entries = table->info.num_entries;
1079 		table->replace.size = table->info.size;
1080 		memcpy(table->replace.hook_entry, table->info.hook_entry, sizeof(table->replace.hook_entry));
1081 		memcpy(table->replace.underflow, table->info.underflow, sizeof(table->replace.underflow));
1082 		memcpy(table->replace.entrytable, entries.entrytable, table->info.size);
1083 	}
1084 	close(fd);
1085 }
1086 
reset_iptables(struct ipt_table_desc * tables,int num_tables,int family,int level)1087 static void reset_iptables(struct ipt_table_desc* tables, int num_tables, int family, int level)
1088 {
1089 	struct xt_counters counters[XT_MAX_ENTRIES];
1090 	struct ipt_get_entries entries;
1091 	struct ipt_getinfo info;
1092 	socklen_t optlen;
1093 	int fd, i;
1094 
1095 	fd = socket(family, SOCK_STREAM, IPPROTO_TCP);
1096 	if (fd == -1) {
1097 		switch (errno) {
1098 		case EAFNOSUPPORT:
1099 		case ENOPROTOOPT:
1100 			return;
1101 		}
1102 		fail("iptable %d: socket failed", family);
1103 	}
1104 	for (i = 0; i < num_tables; i++) {
1105 		struct ipt_table_desc* table = &tables[i];
1106 		if (table->info.valid_hooks == 0)
1107 			continue;
1108 		memset(&info, 0, sizeof(info));
1109 		strcpy(info.name, table->name);
1110 		optlen = sizeof(info);
1111 		if (getsockopt(fd, level, IPT_SO_GET_INFO, &info, &optlen))
1112 			fail("iptable %s/%d: getsockopt(IPT_SO_GET_INFO)", table->name, family);
1113 		if (memcmp(&table->info, &info, sizeof(table->info)) == 0) {
1114 			memset(&entries, 0, sizeof(entries));
1115 			strcpy(entries.name, table->name);
1116 			entries.size = table->info.size;
1117 			optlen = sizeof(entries) - sizeof(entries.entrytable) + entries.size;
1118 			if (getsockopt(fd, level, IPT_SO_GET_ENTRIES, &entries, &optlen))
1119 				fail("iptable %s/%d: getsockopt(IPT_SO_GET_ENTRIES)", table->name, family);
1120 			if (memcmp(table->replace.entrytable, entries.entrytable, table->info.size) == 0)
1121 				continue;
1122 		}
1123 		debug("iptable %s/%d: resetting\n", table->name, family);
1124 		table->replace.num_counters = info.num_entries;
1125 		table->replace.counters = counters;
1126 		optlen = sizeof(table->replace) - sizeof(table->replace.entrytable) + table->replace.size;
1127 		if (setsockopt(fd, level, IPT_SO_SET_REPLACE, &table->replace, optlen))
1128 			fail("iptable %s/%d: setsockopt(IPT_SO_SET_REPLACE)", table->name, family);
1129 	}
1130 	close(fd);
1131 }
1132 
checkpoint_arptables(void)1133 static void checkpoint_arptables(void)
1134 {
1135 	struct arpt_get_entries entries;
1136 	socklen_t optlen;
1137 	unsigned i;
1138 	int fd;
1139 
1140 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
1141 	if (fd == -1) {
1142 		switch (errno) {
1143 		case EAFNOSUPPORT:
1144 		case ENOPROTOOPT:
1145 			return;
1146 		}
1147 		fail("arptable checkpoint: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
1148 	}
1149 	for (i = 0; i < sizeof(arpt_tables) / sizeof(arpt_tables[0]); i++) {
1150 		struct arpt_table_desc* table = &arpt_tables[i];
1151 		strcpy(table->info.name, table->name);
1152 		strcpy(table->replace.name, table->name);
1153 		optlen = sizeof(table->info);
1154 		if (getsockopt(fd, SOL_IP, ARPT_SO_GET_INFO, &table->info, &optlen)) {
1155 			switch (errno) {
1156 			case EPERM:
1157 			case ENOENT:
1158 			case ENOPROTOOPT:
1159 				continue;
1160 			}
1161 			fail("arptable checkpoint %s: getsockopt(ARPT_SO_GET_INFO)", table->name);
1162 		}
1163 		debug("arptable checkpoint %s: entries=%d hooks=%x size=%d\n",
1164 		      table->name, table->info.num_entries, table->info.valid_hooks, table->info.size);
1165 		if (table->info.size > sizeof(table->replace.entrytable))
1166 			fail("arptable checkpoint %s: table size is too large: %u",
1167 			     table->name, table->info.size);
1168 		if (table->info.num_entries > XT_MAX_ENTRIES)
1169 			fail("arptable checkpoint %s: too many counters: %u",
1170 			     table->name, table->info.num_entries);
1171 		memset(&entries, 0, sizeof(entries));
1172 		strcpy(entries.name, table->name);
1173 		entries.size = table->info.size;
1174 		optlen = sizeof(entries) - sizeof(entries.entrytable) + table->info.size;
1175 		if (getsockopt(fd, SOL_IP, ARPT_SO_GET_ENTRIES, &entries, &optlen))
1176 			fail("arptable checkpoint %s: getsockopt(ARPT_SO_GET_ENTRIES)", table->name);
1177 		table->replace.valid_hooks = table->info.valid_hooks;
1178 		table->replace.num_entries = table->info.num_entries;
1179 		table->replace.size = table->info.size;
1180 		memcpy(table->replace.hook_entry, table->info.hook_entry, sizeof(table->replace.hook_entry));
1181 		memcpy(table->replace.underflow, table->info.underflow, sizeof(table->replace.underflow));
1182 		memcpy(table->replace.entrytable, entries.entrytable, table->info.size);
1183 	}
1184 	close(fd);
1185 }
1186 
reset_arptables()1187 static void reset_arptables()
1188 {
1189 	struct xt_counters counters[XT_MAX_ENTRIES];
1190 	struct arpt_get_entries entries;
1191 	struct arpt_getinfo info;
1192 	socklen_t optlen;
1193 	unsigned i;
1194 	int fd;
1195 
1196 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
1197 	if (fd == -1) {
1198 		switch (errno) {
1199 		case EAFNOSUPPORT:
1200 		case ENOPROTOOPT:
1201 			return;
1202 		}
1203 		fail("arptable: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
1204 	}
1205 	for (i = 0; i < sizeof(arpt_tables) / sizeof(arpt_tables[0]); i++) {
1206 		struct arpt_table_desc* table = &arpt_tables[i];
1207 		if (table->info.valid_hooks == 0)
1208 			continue;
1209 		memset(&info, 0, sizeof(info));
1210 		strcpy(info.name, table->name);
1211 		optlen = sizeof(info);
1212 		if (getsockopt(fd, SOL_IP, ARPT_SO_GET_INFO, &info, &optlen))
1213 			fail("arptable %s:getsockopt(ARPT_SO_GET_INFO)", table->name);
1214 		if (memcmp(&table->info, &info, sizeof(table->info)) == 0) {
1215 			memset(&entries, 0, sizeof(entries));
1216 			strcpy(entries.name, table->name);
1217 			entries.size = table->info.size;
1218 			optlen = sizeof(entries) - sizeof(entries.entrytable) + entries.size;
1219 			if (getsockopt(fd, SOL_IP, ARPT_SO_GET_ENTRIES, &entries, &optlen))
1220 				fail("arptable %s: getsockopt(ARPT_SO_GET_ENTRIES)", table->name);
1221 			if (memcmp(table->replace.entrytable, entries.entrytable, table->info.size) == 0)
1222 				continue;
1223 			debug("arptable %s: data changed\n", table->name);
1224 		} else {
1225 			debug("arptable %s: header changed\n", table->name);
1226 		}
1227 		debug("arptable %s: resetting\n", table->name);
1228 		table->replace.num_counters = info.num_entries;
1229 		table->replace.counters = counters;
1230 		optlen = sizeof(table->replace) - sizeof(table->replace.entrytable) + table->replace.size;
1231 		if (setsockopt(fd, SOL_IP, ARPT_SO_SET_REPLACE, &table->replace, optlen))
1232 			fail("arptable %s: setsockopt(ARPT_SO_SET_REPLACE)", table->name);
1233 	}
1234 	close(fd);
1235 }
1236 
1237 #include <linux/if.h>
1238 #include <linux/netfilter_bridge/ebtables.h>
1239 
// Checkpoint record for one ebtables table.
//   name:       table name used in all socket option requests.
//   replace:    kernel ebt_replace request (from linux/netfilter_bridge/ebtables.h)
//               captured by checkpoint_ebtables; valid_hooks == 0 makes
//               reset_ebtables skip the table.
//   entrytable: storage that replace.entries is pointed at for the table's entries.
struct ebt_table_desc {
	const char* name;
	struct ebt_replace replace;
	char entrytable[XT_TABLE_SIZE];
};
1245 
// ebtables tables that are checkpointed/reset.
// Only the names are set here; the rest is filled by checkpoint_ebtables.
static struct ebt_table_desc ebt_tables[] = {
    {.name = "filter"},
    {.name = "nat"},
    {.name = "broute"},
};
1251 
checkpoint_ebtables(void)1252 static void checkpoint_ebtables(void)
1253 {
1254 	socklen_t optlen;
1255 	unsigned i;
1256 	int fd;
1257 
1258 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
1259 	if (fd == -1) {
1260 		switch (errno) {
1261 		case EAFNOSUPPORT:
1262 		case ENOPROTOOPT:
1263 			return;
1264 		}
1265 		fail("ebtable checkpoint: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
1266 	}
1267 	for (i = 0; i < sizeof(ebt_tables) / sizeof(ebt_tables[0]); i++) {
1268 		struct ebt_table_desc* table = &ebt_tables[i];
1269 		strcpy(table->replace.name, table->name);
1270 		optlen = sizeof(table->replace);
1271 		if (getsockopt(fd, SOL_IP, EBT_SO_GET_INIT_INFO, &table->replace, &optlen)) {
1272 			switch (errno) {
1273 			case EPERM:
1274 			case ENOENT:
1275 			case ENOPROTOOPT:
1276 				continue;
1277 			}
1278 			fail("ebtable checkpoint %s: getsockopt(EBT_SO_GET_INIT_INFO)", table->name);
1279 		}
1280 		debug("ebtable checkpoint %s: entries=%d hooks=%x size=%d\n",
1281 		      table->name, table->replace.nentries, table->replace.valid_hooks,
1282 		      table->replace.entries_size);
1283 		if (table->replace.entries_size > sizeof(table->entrytable))
1284 			fail("ebtable checkpoint %s: table size is too large: %u",
1285 			     table->name, table->replace.entries_size);
1286 		table->replace.num_counters = 0;
1287 		table->replace.entries = table->entrytable;
1288 		optlen = sizeof(table->replace) + table->replace.entries_size;
1289 		if (getsockopt(fd, SOL_IP, EBT_SO_GET_INIT_ENTRIES, &table->replace, &optlen))
1290 			fail("ebtable checkpoint %s: getsockopt(EBT_SO_GET_INIT_ENTRIES)", table->name);
1291 	}
1292 	close(fd);
1293 }
1294 
reset_ebtables()1295 static void reset_ebtables()
1296 {
1297 	struct ebt_replace replace;
1298 	char entrytable[XT_TABLE_SIZE];
1299 	socklen_t optlen;
1300 	unsigned i, j, h;
1301 	int fd;
1302 
1303 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
1304 	if (fd == -1) {
1305 		switch (errno) {
1306 		case EAFNOSUPPORT:
1307 		case ENOPROTOOPT:
1308 			return;
1309 		}
1310 		fail("ebtable: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
1311 	}
1312 	for (i = 0; i < sizeof(ebt_tables) / sizeof(ebt_tables[0]); i++) {
1313 		struct ebt_table_desc* table = &ebt_tables[i];
1314 		if (table->replace.valid_hooks == 0)
1315 			continue;
1316 		memset(&replace, 0, sizeof(replace));
1317 		strcpy(replace.name, table->name);
1318 		optlen = sizeof(replace);
1319 		if (getsockopt(fd, SOL_IP, EBT_SO_GET_INFO, &replace, &optlen))
1320 			fail("ebtable %s: getsockopt(EBT_SO_GET_INFO)", table->name);
1321 		replace.num_counters = 0;
1322 		table->replace.entries = 0;
1323 		for (h = 0; h < NF_BR_NUMHOOKS; h++)
1324 			table->replace.hook_entry[h] = 0;
1325 		if (memcmp(&table->replace, &replace, sizeof(table->replace)) == 0) {
1326 			memset(&entrytable, 0, sizeof(entrytable));
1327 			replace.entries = entrytable;
1328 			optlen = sizeof(replace) + replace.entries_size;
1329 			if (getsockopt(fd, SOL_IP, EBT_SO_GET_ENTRIES, &replace, &optlen))
1330 				fail("ebtable %s: getsockopt(EBT_SO_GET_ENTRIES)", table->name);
1331 			if (memcmp(table->entrytable, entrytable, replace.entries_size) == 0)
1332 				continue;
1333 		}
1334 		debug("ebtable %s: resetting\n", table->name);
1335 		// Kernel does not seem to return actual entry points (wat?).
1336 		for (j = 0, h = 0; h < NF_BR_NUMHOOKS; h++) {
1337 			if (table->replace.valid_hooks & (1 << h)) {
1338 				table->replace.hook_entry[h] = (struct ebt_entries*)table->entrytable + j;
1339 				j++;
1340 			}
1341 		}
1342 		table->replace.entries = table->entrytable;
1343 		optlen = sizeof(table->replace) + table->replace.entries_size;
1344 		if (setsockopt(fd, SOL_IP, EBT_SO_SET_ENTRIES, &table->replace, optlen))
1345 			fail("ebtable %s: setsockopt(EBT_SO_SET_ENTRIES)", table->name);
1346 	}
1347 	close(fd);
1348 }
1349 
checkpoint_net_namespace(void)1350 static void checkpoint_net_namespace(void)
1351 {
1352 #if SYZ_EXECUTOR
1353 	if (flag_sandbox == sandbox_setuid)
1354 		return;
1355 #endif
1356 	checkpoint_ebtables();
1357 	checkpoint_arptables();
1358 	checkpoint_iptables(ipv4_tables, sizeof(ipv4_tables) / sizeof(ipv4_tables[0]), AF_INET, SOL_IP);
1359 	checkpoint_iptables(ipv6_tables, sizeof(ipv6_tables) / sizeof(ipv6_tables[0]), AF_INET6, SOL_IPV6);
1360 }
1361 
reset_net_namespace(void)1362 static void reset_net_namespace(void)
1363 {
1364 #if SYZ_EXECUTOR
1365 	if (flag_sandbox == sandbox_setuid)
1366 		return;
1367 #endif
1368 	reset_ebtables();
1369 	reset_arptables();
1370 	reset_iptables(ipv4_tables, sizeof(ipv4_tables) / sizeof(ipv4_tables[0]), AF_INET, SOL_IP);
1371 	reset_iptables(ipv6_tables, sizeof(ipv6_tables) / sizeof(ipv6_tables[0]), AF_INET6, SOL_IPV6);
1372 }
1373 #endif
1374 
1375 #if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
1376 #include <fcntl.h>
1377 #include <sys/mount.h>
1378 #include <sys/stat.h>
1379 #include <sys/types.h>
1380 
// Creates and mounts the cgroup hierarchies under /syzcgroup:
//   /syzcgroup/unified - cgroup2, with cpu/memory/io/pids/rdma enabled for subtrees;
//   /syzcgroup/cpu     - cgroup v1 with cpuset,cpuacct,perf_event,hugetlb;
//   /syzcgroup/net     - cgroup v1 with net_cls,net_prio,devices,freezer.
// Every step is best-effort: a failure is only reported through debug().
// The bodies are kept in {} because debug calls are stripped by pkg/csource.
static void setup_cgroups()
{
	if (mkdir("/syzcgroup", 0777) != 0) {
		debug("mkdir(/syzcgroup) failed: %d\n", errno);
	}

	// Unified (cgroup2) hierarchy.
	if (mkdir("/syzcgroup/unified", 0777) != 0) {
		debug("mkdir(/syzcgroup/unified) failed: %d\n", errno);
	}
	if (mount("none", "/syzcgroup/unified", "cgroup2", 0, NULL) != 0) {
		debug("mount(cgroup2) failed: %d\n", errno);
	}
	if (chmod("/syzcgroup/unified", 0777) != 0) {
		debug("chmod(/syzcgroup/unified) failed: %d\n", errno);
	}
	if (!write_file("/syzcgroup/unified/cgroup.subtree_control", "+cpu +memory +io +pids +rdma")) {
		debug("write(cgroup.subtree_control) failed: %d\n", errno);
	}

	// cgroup v1 hierarchy with the cpu-related controllers.
	if (mkdir("/syzcgroup/cpu", 0777) != 0) {
		debug("mkdir(/syzcgroup/cpu) failed: %d\n", errno);
	}
	if (mount("none", "/syzcgroup/cpu", "cgroup", 0, "cpuset,cpuacct,perf_event,hugetlb") != 0) {
		debug("mount(cgroup cpu) failed: %d\n", errno);
	}
	if (!write_file("/syzcgroup/cpu/cgroup.clone_children", "1")) {
		debug("write(/syzcgroup/cpu/cgroup.clone_children) failed: %d\n", errno);
	}
	if (chmod("/syzcgroup/cpu", 0777) != 0) {
		debug("chmod(/syzcgroup/cpu) failed: %d\n", errno);
	}

	// cgroup v1 hierarchy with the net-related controllers.
	if (mkdir("/syzcgroup/net", 0777) != 0) {
		debug("mkdir(/syzcgroup/net) failed: %d\n", errno);
	}
	if (mount("none", "/syzcgroup/net", "cgroup", 0, "net_cls,net_prio,devices,freezer") != 0) {
		debug("mount(cgroup net) failed: %d\n", errno);
	}
	if (chmod("/syzcgroup/net", 0777) != 0) {
		debug("chmod(/syzcgroup/net) failed: %d\n", errno);
	}
}
1420 
// TODO(dvyukov): this should be under a separate define for separate minimization,
// but for now we bundle this with cgroups.
//
// Mounts binfmt_misc at /proc/sys/fs/binfmt_misc and registers two formats,
// syz0 and syz1, both using ./file0 as the interpreter.
// Every step is best-effort: a failure is only reported through debug().
static void setup_binfmt_misc()
{
	if (mount(NULL, "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL) != 0) {
		debug("mount(binfmt_misc) failed: %d\n", errno);
	}
	const char* const reg_file = "/proc/sys/fs/binfmt_misc/register";
	if (!write_file(reg_file, ":syz0:M:0:\x01::./file0:")) {
		debug("write(/proc/sys/fs/binfmt_misc/register, syz0) failed: %d\n", errno);
	}
	if (!write_file(reg_file, ":syz1:M:1:\x02::./file0:POC")) {
		debug("write(/proc/sys/fs/binfmt_misc/register, syz1) failed: %d\n", errno);
	}
}
1435 #endif
1436 
1437 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE || SYZ_SANDBOX_SETUID || SYZ_SANDBOX_NAMESPACE
1438 #include <errno.h>
1439 #include <sys/mount.h>
1440 
// Setup shared by all sandbox modes: mounts fusectl and, when cgroups are
// enabled, sets up the cgroup hierarchies and binfmt_misc.
// A failed fusectl mount is only reported through debug().
static void setup_common()
{
	if (mount(NULL, "/sys/fs/fuse/connections", "fusectl", 0, NULL) != 0) {
		debug("mount(fusectl) failed: %d\n", errno);
	}
#if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
	setup_cgroups();
	setup_binfmt_misc();
#endif
}
1451 #endif
1452 
1453 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE || SYZ_SANDBOX_SETUID || SYZ_SANDBOX_NAMESPACE
1454 #include <sched.h>
1455 #include <sys/prctl.h>
1456 #include <sys/resource.h>
1457 #include <sys/time.h>
1458 #include <sys/wait.h>
1459 
// Forward declaration; each sandbox entry point below calls loop() in the
// sandboxed child once its setup is complete.
static void loop();
1461 
// Process setup shared by all sandbox modes, run in the sandboxed child:
//  - asks for SIGKILL on parent death and starts a new process group/session;
//  - (when syz_init_net_socket is in use) keeps the current net namespace
//    open on descriptor kInitNetNsFd;
//  - applies resource limits;
//  - unshares the mount, IPC, cgroup and UTS namespaces and SysV semaphore
//    undo state.
// Failing to pin the net namespace is fatal (fail); setrlimit results are
// ignored and unshare failures are only reported through debug().
static void sandbox_common()
{
	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	setpgrp();
	setsid();

#if SYZ_EXECUTOR || __NR_syz_init_net_socket
	int init_netns = open("/proc/self/ns/net", O_RDONLY);
	if (init_netns == -1)
		fail("open(/proc/self/ns/net) failed");
	if (dup2(init_netns, kInitNetNsFd) < 0)
		fail("dup2(netns, kInitNetNsFd) failed");
	close(init_netns);
#endif

	// Soft and hard limits are set to the same value, in this order.
	const struct {
		int resource;
		int limit;
	} limits[] = {
	    {RLIMIT_AS, 160 << 20},
	    {RLIMIT_MEMLOCK, 8 << 20},
	    {RLIMIT_FSIZE, 136 << 20},
	    {RLIMIT_STACK, 1 << 20},
	    {RLIMIT_CORE, 0},
	    {RLIMIT_NOFILE, 256}, // see kMaxFd
	};
	unsigned n;
	for (n = 0; n < sizeof(limits) / sizeof(limits[0]); n++) {
		struct rlimit lim;
		lim.rlim_cur = lim.rlim_max = limits[n].limit;
		setrlimit(limits[n].resource, &lim);
	}

	// CLONE_NEWNS/NEWCGROUP cause EINVAL on some systems,
	// so we do them separately of clone in do_sandbox_namespace.
	if (unshare(CLONE_NEWNS) != 0) {
		debug("unshare(CLONE_NEWNS): %d\n", errno);
	}
	if (unshare(CLONE_NEWIPC) != 0) {
		debug("unshare(CLONE_NEWIPC): %d\n", errno);
	}
	// 0x02000000 is CLONE_NEWCGROUP, spelled numerically.
	if (unshare(0x02000000) != 0) {
		debug("unshare(CLONE_NEWCGROUP): %d\n", errno);
	}
	if (unshare(CLONE_NEWUTS) != 0) {
		debug("unshare(CLONE_NEWUTS): %d\n", errno);
	}
	if (unshare(CLONE_SYSVSEM) != 0) {
		debug("unshare(CLONE_SYSVSEM): %d\n", errno);
	}
}
1509 
// Waits for the sandboxed loop process and returns its exit status.
//   pid: result of the fork()/clone() that spawned the loop process;
//        a negative value is fatal (fail).
// Other children reaped in the meantime are ignored; waiting continues until
// waitpid() returns exactly pid.
int wait_for_loop(int pid)
{
	if (pid < 0)
		fail("sandbox fork failed");
	debug("spawned loop pid %d\n", pid);
	int status = 0;
	for (;;) {
		if (waitpid(-1, &status, __WALL) == pid)
			break;
	}
	return WEXITSTATUS(status);
}
1520 #endif
1521 
1522 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE
1523 #include <sched.h>
1524 #include <sys/types.h>
1525 
// sandbox=none entry point: forks the loop process and, in the parent,
// waits for it and returns its exit status. The child performs the common
// setup, moves into a new net namespace (best-effort), initializes
// tun/netdevices when enabled and then runs loop().
static int do_sandbox_none(void)
{
	// CLONE_NEWPID only affects children created afterwards, so it is done
	// before fork to make the loop process "init" of the new pid namespace.
	// A failure here ought to be fatal, but sandbox=none is used in pkg/ipc
	// tests, which usually run as non-root.
	// debug is stripped by pkg/csource, hence the {} around the lone debug
	// call (single statements are otherwise not braced).
	if (unshare(CLONE_NEWPID) != 0) {
		debug("unshare(CLONE_NEWPID): %d\n", errno);
	}
	const int child = fork();
	if (child != 0)
		return wait_for_loop(child);

	// Child from here on.
	setup_common();
	sandbox_common();
	if (unshare(CLONE_NEWNET) != 0) {
		debug("unshare(CLONE_NEWNET): %d\n", errno);
	}
#if SYZ_EXECUTOR || SYZ_TUN_ENABLE
	initialize_tun();
#endif
#if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
	initialize_netdevices();
#endif
	loop();
	doexit(1);
}
1555 #endif
1556 
1557 #if SYZ_EXECUTOR || SYZ_SANDBOX_SETUID
1558 #include <grp.h>
1559 #include <sched.h>
1560 #include <sys/prctl.h>
1561 
// sandbox=setuid entry point: forks the loop process and, in the parent,
// waits for it and returns its exit status. The child performs the common
// setup, moves into a new net namespace (best-effort), initializes
// tun/netdevices when enabled, drops to uid/gid 65534 and then runs loop().
// Failing to drop groups, gid or uid is fatal (fail).
static int do_sandbox_setuid(void)
{
	if (unshare(CLONE_NEWPID) != 0) {
		debug("unshare(CLONE_NEWPID): %d\n", errno);
	}
	const int child = fork();
	if (child != 0)
		return wait_for_loop(child);

	// Child from here on.
	setup_common();
	sandbox_common();
	if (unshare(CLONE_NEWNET) != 0) {
		debug("unshare(CLONE_NEWNET): %d\n", errno);
	}
#if SYZ_EXECUTOR || SYZ_TUN_ENABLE
	initialize_tun();
#endif
#if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
	initialize_netdevices();
#endif

	// Supplementary groups first, then gid, then uid.
	const int unpriv_id = 65534;
	if (setgroups(0, NULL))
		fail("failed to setgroups");
	if (syscall(SYS_setresgid, unpriv_id, unpriv_id, unpriv_id))
		fail("failed to setresgid");
	if (syscall(SYS_setresuid, unpriv_id, unpriv_id, unpriv_id))
		fail("failed to setresuid");

	// Needed to be able to open /proc/self/* files: without it they are
	// owned by root and cannot be opened after setuid.
	// See task_dump_owner function in kernel.
	prctl(PR_SET_DUMPABLE, 1, 0, 0, 0);

	loop();
	doexit(1);
}
1599 #endif
1600 
1601 #if SYZ_EXECUTOR || SYZ_SANDBOX_NAMESPACE
1602 #include <linux/capability.h>
1603 #include <sched.h>
1604 #include <sys/mman.h>
1605 #include <sys/mount.h>
1606 
// uid/gid of the process before it enters the new user namespace; recorded by
// do_sandbox_namespace and written into uid_map/gid_map by namespace_sandbox_proc.
static int real_uid;
static int real_gid;
// 1MB stack (64KB-aligned) for the process cloned in do_sandbox_namespace;
// its first 4096 bytes are made inaccessible there with mprotect.
__attribute__((aligned(64 << 10))) static char sandbox_stack[1 << 20];
1610 
// Body of the process that do_sandbox_namespace creates with
// clone(CLONE_NEWUSER | CLONE_NEWPID). `arg` is unused.
// Steps, in order:
//  1. common sandbox setup (sandbox_common);
//  2. write uid_map/gid_map so that id 0 in the namespace maps to real_uid/real_gid;
//  3. enter a new net namespace and initialize tun/netdevices when enabled;
//  4. build a new root under ./syz-tmp/newroot (tmpfs with /dev, /proc,
//     selinux, /sys and optionally the syzcgroup mounts), then pivot_root
//     (falling back to chdir) and chroot into it;
//  5. drop CAP_SYS_PTRACE;
//  6. run loop().
// Almost every failure is fatal (fail); the exceptions are noted inline.
static int namespace_sandbox_proc(void* arg)
{
	sandbox_common();

	// /proc/self/setgroups is not present on some systems, ignore error.
	write_file("/proc/self/setgroups", "deny");
	if (!write_file("/proc/self/uid_map", "0 %d 1\n", real_uid))
		fail("write of /proc/self/uid_map failed");
	if (!write_file("/proc/self/gid_map", "0 %d 1\n", real_gid))
		fail("write of /proc/self/gid_map failed");

	// CLONE_NEWNET must always happen before tun setup,
	// because we want the tun device in the test namespace.
	if (unshare(CLONE_NEWNET))
		fail("unshare(CLONE_NEWNET)");
#if SYZ_EXECUTOR || SYZ_TUN_ENABLE
	// We setup tun here as it needs to be in the test net namespace,
	// which in turn needs to be in the test user namespace.
	// However, IFF_NAPI_FRAGS will fail as we are not root already.
	// There does not seem to be a call sequence that would satisfy all of that.
	initialize_tun();
#endif
#if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
	initialize_netdevices();
#endif

	// Build the new root filesystem on a fresh tmpfs.
	if (mkdir("./syz-tmp", 0777))
		fail("mkdir(syz-tmp) failed");
	if (mount("", "./syz-tmp", "tmpfs", 0, NULL))
		fail("mount(tmpfs) failed");
	if (mkdir("./syz-tmp/newroot", 0777))
		fail("mkdir failed");
	if (mkdir("./syz-tmp/newroot/dev", 0700))
		fail("mkdir failed");
	// Flags shared by all bind mounts into the new root.
	unsigned bind_mount_flags = MS_BIND | MS_REC | MS_PRIVATE;
	if (mount("/dev", "./syz-tmp/newroot/dev", NULL, bind_mount_flags, NULL))
		fail("mount(dev) failed");
	if (mkdir("./syz-tmp/newroot/proc", 0700))
		fail("mkdir failed");
	if (mount(NULL, "./syz-tmp/newroot/proc", "proc", 0, NULL))
		fail("mount(proc) failed");
	if (mkdir("./syz-tmp/newroot/selinux", 0700))
		fail("mkdir failed");
	// selinux mount used to be at /selinux, but then moved to /sys/fs/selinux.
	// A missing source (ENOENT) is tolerated for both locations.
	const char* selinux_path = "./syz-tmp/newroot/selinux";
	if (mount("/selinux", selinux_path, NULL, bind_mount_flags, NULL)) {
		if (errno != ENOENT)
			fail("mount(/selinux) failed");
		if (mount("/sys/fs/selinux", selinux_path, NULL, bind_mount_flags, NULL) && errno != ENOENT)
			fail("mount(/sys/fs/selinux) failed");
	}
	if (mkdir("./syz-tmp/newroot/sys", 0700))
		fail("mkdir failed");
	if (mount("/sys", "./syz-tmp/newroot/sys", 0, bind_mount_flags, NULL))
		fail("mount(sysfs) failed");
#if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
	// Expose the cgroup hierarchies from /syzcgroup inside the new root.
	// The mount points must be creatable, but the bind mounts themselves
	// are best-effort (failures are only reported through debug).
	if (mkdir("./syz-tmp/newroot/syzcgroup", 0700))
		fail("mkdir failed");
	if (mkdir("./syz-tmp/newroot/syzcgroup/unified", 0700))
		fail("mkdir failed");
	if (mkdir("./syz-tmp/newroot/syzcgroup/cpu", 0700))
		fail("mkdir failed");
	if (mkdir("./syz-tmp/newroot/syzcgroup/net", 0700))
		fail("mkdir failed");
	if (mount("/syzcgroup/unified", "./syz-tmp/newroot/syzcgroup/unified", NULL, bind_mount_flags, NULL)) {
		debug("mount(cgroup2, MS_BIND) failed: %d\n", errno);
	}
	if (mount("/syzcgroup/cpu", "./syz-tmp/newroot/syzcgroup/cpu", NULL, bind_mount_flags, NULL)) {
		debug("mount(cgroup/cpu, MS_BIND) failed: %d\n", errno);
	}
	if (mount("/syzcgroup/net", "./syz-tmp/newroot/syzcgroup/net", NULL, bind_mount_flags, NULL)) {
		debug("mount(cgroup/net, MS_BIND) failed: %d\n", errno);
	}
#endif
	// Switch the root to ./syz-tmp. If pivot_root fails we only chdir there;
	// if it succeeds, the old root (now at ./pivot) is detached.
	// Either way the chroot below then enters ./newroot.
	if (mkdir("./syz-tmp/pivot", 0777))
		fail("mkdir failed");
	if (syscall(SYS_pivot_root, "./syz-tmp", "./syz-tmp/pivot")) {
		debug("pivot_root failed\n");
		if (chdir("./syz-tmp"))
			fail("chdir failed");
	} else {
		debug("pivot_root OK\n");
		if (chdir("/"))
			fail("chdir failed");
		if (umount2("./pivot", MNT_DETACH))
			fail("umount failed");
	}
	if (chroot("./newroot"))
		fail("chroot failed");
	if (chdir("/"))
		fail("chdir failed");

	// Drop CAP_SYS_PTRACE so that test processes can't attach to parent processes.
	// Previously it lead to hangs because the loop process stopped due to SIGSTOP.
	// Note that a process can always ptrace its direct children, which is enough
	// for testing purposes.
	struct __user_cap_header_struct cap_hdr = {};
	struct __user_cap_data_struct cap_data[2] = {};
	cap_hdr.version = _LINUX_CAPABILITY_VERSION_3;
	cap_hdr.pid = getpid();
	if (syscall(SYS_capget, &cap_hdr, &cap_data))
		fail("capget failed");
	// Clear the capability from all three sets (read-modify-write).
	cap_data[0].effective &= ~(1 << CAP_SYS_PTRACE);
	cap_data[0].permitted &= ~(1 << CAP_SYS_PTRACE);
	cap_data[0].inheritable &= ~(1 << CAP_SYS_PTRACE);
	if (syscall(SYS_capset, &cap_hdr, &cap_data))
		fail("capset failed");

	loop();
	doexit(1);
}
1722 
do_sandbox_namespace(void)1723 static int do_sandbox_namespace(void)
1724 {
1725 	int pid;
1726 
1727 	setup_common();
1728 	real_uid = getuid();
1729 	real_gid = getgid();
1730 	mprotect(sandbox_stack, 4096, PROT_NONE); // to catch stack underflows
1731 	pid = clone(namespace_sandbox_proc, &sandbox_stack[sizeof(sandbox_stack) - 64],
1732 		    CLONE_NEWUSER | CLONE_NEWPID, 0);
1733 	return wait_for_loop(pid);
1734 }
1735 #endif
1736 
1737 #if SYZ_EXECUTOR || SYZ_REPEAT && SYZ_USE_TMP_DIR
1738 #include <dirent.h>
1739 #include <errno.h>
1740 #include <string.h>
1741 #include <sys/ioctl.h>
1742 #include <sys/mount.h>
1743 
// ioctl request defined locally; remove_dir issues it with a zero flags word
// to try to clear the immutable flag when unlink/rmdir fails with EPERM.
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
1745 
1746 // One does not simply remove a directory.
1747 // There can be mounts, so we need to try to umount.
1748 // Moreover, a mount can be mounted several times, so we need to try to umount in a loop.
1749 // Moreover, after umount a dir can become non-empty again, so we need another loop.
1750 // Moreover, a mount can be re-mounted as read-only and then we will fail to make a dir empty.
remove_dir(const char * dir)1751 static void remove_dir(const char* dir)
1752 {
1753 	DIR* dp;
1754 	struct dirent* ep;
1755 	int iter = 0;
1756 retry:
1757 	while (umount2(dir, MNT_DETACH) == 0) {
1758 		debug("umount(%s)\n", dir);
1759 	}
1760 	dp = opendir(dir);
1761 	if (dp == NULL) {
1762 		if (errno == EMFILE) {
1763 			// This happens when the test process casts prlimit(NOFILE) on us.
1764 			// Ideally we somehow prevent test processes from messing with parent processes.
1765 			// But full sandboxing is expensive, so let's ignore this error for now.
1766 			exitf("opendir(%s) failed due to NOFILE, exiting", dir);
1767 		}
1768 		exitf("opendir(%s) failed", dir);
1769 	}
1770 	while ((ep = readdir(dp))) {
1771 		if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
1772 			continue;
1773 		char filename[FILENAME_MAX];
1774 		snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
1775 		// If it's 9p mount with broken transport, lstat will fail.
1776 		// So try to umount first.
1777 		while (umount2(filename, MNT_DETACH) == 0) {
1778 			debug("umount(%s)\n", filename);
1779 		}
1780 		struct stat st;
1781 		if (lstat(filename, &st))
1782 			exitf("lstat(%s) failed", filename);
1783 		if (S_ISDIR(st.st_mode)) {
1784 			remove_dir(filename);
1785 			continue;
1786 		}
1787 		int i;
1788 		for (i = 0;; i++) {
1789 			debug("unlink(%s)\n", filename);
1790 			if (unlink(filename) == 0)
1791 				break;
1792 			if (errno == EPERM) {
1793 				// Try to reset FS_XFLAG_IMMUTABLE.
1794 				int fd = open(filename, O_RDONLY);
1795 				if (fd != -1) {
1796 					long flags = 0;
1797 					if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0)
1798 						debug("reset FS_XFLAG_IMMUTABLE\n");
1799 					close(fd);
1800 					continue;
1801 				}
1802 			}
1803 			if (errno == EROFS) {
1804 				debug("ignoring EROFS\n");
1805 				break;
1806 			}
1807 			if (errno != EBUSY || i > 100)
1808 				exitf("unlink(%s) failed", filename);
1809 			debug("umount(%s)\n", filename);
1810 			if (umount2(filename, MNT_DETACH))
1811 				exitf("umount(%s) failed", filename);
1812 		}
1813 	}
1814 	closedir(dp);
1815 	int i;
1816 	for (i = 0;; i++) {
1817 		debug("rmdir(%s)\n", dir);
1818 		if (rmdir(dir) == 0)
1819 			break;
1820 		if (i < 100) {
1821 			if (errno == EPERM) {
1822 				// Try to reset FS_XFLAG_IMMUTABLE.
1823 				int fd = open(dir, O_RDONLY);
1824 				if (fd != -1) {
1825 					long flags = 0;
1826 					if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0)
1827 						debug("reset FS_XFLAG_IMMUTABLE\n");
1828 					close(fd);
1829 					continue;
1830 				}
1831 			}
1832 			if (errno == EROFS) {
1833 				debug("ignoring EROFS\n");
1834 				break;
1835 			}
1836 			if (errno == EBUSY) {
1837 				debug("umount(%s)\n", dir);
1838 				if (umount2(dir, MNT_DETACH))
1839 					exitf("umount(%s) failed", dir);
1840 				continue;
1841 			}
1842 			if (errno == ENOTEMPTY) {
1843 				if (iter < 100) {
1844 					iter++;
1845 					goto retry;
1846 				}
1847 			}
1848 		}
1849 		exitf("rmdir(%s) failed", dir);
1850 	}
1851 }
1852 #endif
1853 
1854 #if SYZ_EXECUTOR || SYZ_FAULT_INJECTION
1855 #include <fcntl.h>
1856 #include <string.h>
1857 #include <sys/stat.h>
1858 #include <sys/types.h>
1859 
// Opens /proc/thread-self/fail-nth for the calling thread and writes the
// decimal value nth + 1 into it.
// Returns the descriptor, which is left open for the caller.
// Failures to open or fully write the file are reported via exitf().
static int inject_fault(int nth)
{
	char text[16];
	int len;
	int fail_fd = open("/proc/thread-self/fail-nth", O_RDWR);

	// Errors here are routed to exitf() as temporary/non-critical:
	// occasional ENOENT/EACCES have been observed, apparently because
	// the fuzzer somehow gets its hands on this file.
	if (fail_fd == -1)
		exitf("failed to open /proc/thread-self/fail-nth");
	// sprintf returns the number of characters produced, i.e. the string length.
	len = sprintf(text, "%d", nth + 1);
	if (write(fail_fd, text, len) != (ssize_t)len)
		exitf("failed to write /proc/thread-self/fail-nth");
	return fail_fd;
}
1876 #endif
1877 
1878 #if SYZ_EXECUTOR
// Reads the current contents of the fail-nth file behind fail_fd, then writes
// a single '0' byte back and closes the descriptor.
// Returns 1 if the data read was exactly "0\n", 0 otherwise.
// A failed/empty read or a failed write is reported via exitf().
static int fault_injected(int fail_fd)
{
	const char zero = '0';
	char buf[16];
	int injected = 0;
	int len = read(fail_fd, buf, sizeof(buf) - 1);

	if (len <= 0)
		exitf("failed to read /proc/thread-self/fail-nth");
	if (len == 2 && buf[0] == '0' && buf[1] == '\n')
		injected = 1;
	if (write(fail_fd, &zero, 1) != 1)
		exitf("failed to write /proc/thread-self/fail-nth");
	close(fail_fd);
	return injected;
}
1892 #endif
1893 
1894 #if SYZ_EXECUTOR || SYZ_REPEAT
1895 #include <dirent.h>
1896 #include <errno.h>
1897 #include <fcntl.h>
1898 #include <signal.h>
1899 #include <string.h>
1900 #include <sys/stat.h>
1901 #include <sys/types.h>
1902 #include <sys/wait.h>
1903 
// Sends SIGKILL to process group pid and to process pid, then waits until
// waitpid() on any child (-1) returns pid; the wait status is stored in *status.
// If pid is not reaped within 100 polls spaced 1 ms apart, all fuse connections
// listed under /sys/fs/fuse/connections are aborted before blocking in waitpid().
static void kill_and_wait(int pid, int* status)
{
	int attempt;
	DIR* conns;
	struct dirent* ent;

	kill(-pid, SIGKILL);
	kill(pid, SIGKILL);
	// First, give it up to 100 ms to surrender.
	for (attempt = 0; attempt < 100; attempt++) {
		if (waitpid(-1, status, WNOHANG | __WALL) == pid)
			return;
		usleep(1000);
	}
	// Now, try to abort fuse connections as they cause deadlocks,
	// see Documentation/filesystems/fuse.txt for details.
	// There is no good way to figure out the right connections
	// provided that the process could use unshare(CLONE_NEWNS),
	// so we abort all.
	debug("kill is not working\n");
	conns = opendir("/sys/fs/fuse/connections");
	if (!conns) {
		debug("failed to open /sys/fs/fuse/connections: %d\n", errno);
	} else {
		while ((ent = readdir(conns))) {
			char abort_path[300];
			int fd;
			if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
				continue;
			snprintf(abort_path, sizeof(abort_path), "/sys/fs/fuse/connections/%s/abort", ent->d_name);
			fd = open(abort_path, O_WRONLY);
			if (fd == -1) {
				debug("failed to open %s: %d\n", abort_path, errno);
				continue;
			}
			debug("aborting fuse conn %s\n", ent->d_name);
			// Any one-byte write is used to trigger the abort;
			// the first byte of the path buffer serves as the payload.
			if (write(fd, abort_path, 1) < 0) {
				debug("failed to abort: %d\n", errno);
			}
			close(fd);
		}
		closedir(conns);
	}
	// Now, just wait, no other options.
	while (waitpid(-1, status, __WALL) != pid) {
	}
}
1950 #endif
1951 
1952 #if SYZ_EXECUTOR || SYZ_REPEAT && (SYZ_ENABLE_CGROUPS || SYZ_RESET_NET_NAMESPACE)
1953 #include <fcntl.h>
1954 #include <sys/ioctl.h>
1955 #include <sys/stat.h>
1956 #include <sys/types.h>
1957 #include <unistd.h>
1958 
#define SYZ_HAVE_SETUP_LOOP 1

#if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
// Creates directory cgroupdir (mode 0777) and writes pid into its cgroup.procs file.
// Failures of either step are only logged via debug().
static void loop_join_cgroup(const char* cgroupdir, int pid)
{
	char procs_file[128];
	if (mkdir(cgroupdir, 0777)) {
		debug("mkdir(%s) failed: %d\n", cgroupdir, errno);
	}
	snprintf(procs_file, sizeof(procs_file), "%s/cgroup.procs", cgroupdir);
	if (!write_file(procs_file, "%d", pid)) {
		debug("write(%s) failed: %d\n", procs_file, errno);
	}
}
#endif

// Depending on build configuration: puts the current process into the
// per-procid "unified", "cpu" and "net" directories under /syzcgroup,
// and calls checkpoint_net_namespace().
static void setup_loop()
{
#if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
	int self = getpid();
	char cgroupdir[64];

	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/unified/syz%llu", procid);
	loop_join_cgroup(cgroupdir, self);

	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/cpu/syz%llu", procid);
	loop_join_cgroup(cgroupdir, self);

	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/net/syz%llu", procid);
	loop_join_cgroup(cgroupdir, self);
#endif
#if SYZ_EXECUTOR || SYZ_RESET_NET_NAMESPACE
	checkpoint_net_namespace();
#endif
}
1995 #endif
1996 
1997 #if SYZ_EXECUTOR || SYZ_REPEAT && (SYZ_RESET_NET_NAMESPACE || __NR_syz_mount_image || __NR_syz_read_part_table)
#define SYZ_HAVE_RESET_LOOP 1
// Depending on build configuration: issues LOOP_CLR_FD on /dev/loop<procid>
// (skipped silently if the device cannot be opened; the ioctl result is ignored)
// and calls reset_net_namespace().
static void reset_loop()
{
#if SYZ_EXECUTOR || __NR_syz_mount_image || __NR_syz_read_part_table
	char loopname[64];
	int loopfd;

	snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
	loopfd = open(loopname, O_RDWR);
	if (loopfd != -1) {
		ioctl(loopfd, LOOP_CLR_FD, 0);
		close(loopfd);
	}
#endif
#if SYZ_EXECUTOR || SYZ_RESET_NET_NAMESPACE
	reset_net_namespace();
#endif
}
2014 #endif
2015 
2016 #if SYZ_EXECUTOR || SYZ_REPEAT
2017 #include <sys/prctl.h>
2018 
#define SYZ_HAVE_SETUP_TEST 1
// Prepares the calling process for running a test: requests SIGKILL on parent
// death, makes the process its own process group leader, and (depending on build
// configuration) creates ./cgroup, ./cgroup.cpu and ./cgroup.net symlinks to the
// per-procid cgroup directories and flushes the tun device.
static void setup_test()
{
	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	setpgrp();
#if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
	// Symlink failures are only logged via debug().
	char target[64];

	snprintf(target, sizeof(target), "/syzcgroup/unified/syz%llu", procid);
	if (symlink(target, "./cgroup")) {
		debug("symlink(%s, ./cgroup) failed: %d\n", target, errno);
	}

	snprintf(target, sizeof(target), "/syzcgroup/cpu/syz%llu", procid);
	if (symlink(target, "./cgroup.cpu")) {
		debug("symlink(%s, ./cgroup.cpu) failed: %d\n", target, errno);
	}

	snprintf(target, sizeof(target), "/syzcgroup/net/syz%llu", procid);
	if (symlink(target, "./cgroup.net")) {
		debug("symlink(%s, ./cgroup.net) failed: %d\n", target, errno);
	}
#endif
#if SYZ_EXECUTOR || SYZ_TUN_ENABLE
	// Read all remaining packets from tun to better
	// isolate consequently executing programs.
	flush_tun();
#endif
}
2045 
#define SYZ_HAVE_RESET_TEST 1
// Closes file descriptors 3 through 29 (inclusive); close() errors are ignored.
static void reset_test()
{
	// Keeping a 9p transport pipe open will hang the process dead,
	// so close all opened file descriptors.
	int fd = 3;
	while (fd < 30) {
		close(fd);
		fd++;
	}
}
2055 #endif
2056