1 /*
2  * eBPF kernel space program part
3  *
4  * Toy eBPF program for demonstration purposes, some parts derived from
5  * kernel tree's samples/bpf/sockex2_kern.c example.
6  *
7  * More background on eBPF, kernel tree: Documentation/networking/filter.txt
8  *
 * Note, this file is rather large; most classifiers and actions are
 * likely smaller, accomplish one specific use-case and are tailored
 * for high performance. For performance reasons, you might also have the
 * classifier and action already merged inside the classifier.
13  *
14  * In order to show various features it serves as a bigger programming
15  * example, which you should feel free to rip apart and experiment with.
16  *
17  * Compilation, configuration example:
18  *
19  *  Note: as long as the BPF backend in LLVM is still experimental,
 *  you need to build LLVM with --enable-experimental-targets=BPF
21  *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
22  *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
23  *
24  *  In case you need to sync kernel headers, go to your kernel source tree:
25  *  # make headers_install INSTALL_HDR_PATH=/usr/
26  *
27  *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
28  *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
29  *  $ objdump -h bpf.o
30  *  [...]
31  *  3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
32  *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
33  *  4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
34  *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
35  *  5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
36  *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
37  *  6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
38  *                  CONTENTS, ALLOC, LOAD, DATA
39  *  7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
40  *                  CONTENTS, ALLOC, LOAD, DATA
41  *  [...]
42  *  # echo 1 > /proc/sys/net/core/bpf_jit_enable
43  *  $ gcc bpf_agent.c -o bpf_agent -Wall -O2
44  *  # ./bpf_agent /tmp/bpf-uds      (e.g. on a different terminal)
45  *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
46  *                             action bpf obj bpf.o sec action-mark            \
47  *                             action bpf obj bpf.o sec action-rand ok
48  *  # tc filter show dev em1
49  *  filter parent 1: protocol all pref 49152 bpf
50  *  filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
51  *    action order 1: bpf bpf.o:[action-mark] default-action pipe
52  *    index 52 ref 1 bind 1
53  *
54  *    action order 2: bpf bpf.o:[action-rand] default-action pipe
55  *    index 53 ref 1 bind 1
56  *
57  *    action order 3: gact action pass
58  *    random type none pass val 0
59  *    index 38 ref 1 bind 1
60  *
61  * The same program can also be installed on ingress side (as opposed to above
62  * egress configuration), e.g.:
63  *
64  * # tc qdisc add dev em1 handle ffff: ingress
65  * # tc filter add dev em1 parent ffff: bpf obj ...
66  *
67  * Notes on BPF agent:
68  *
69  * In the above example, the bpf_agent creates the unix domain socket
 * natively. "tc exec" can also spawn a shell and hold the sockets there:
71  *
72  *  # tc exec bpf imp /tmp/bpf-uds
73  *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
74  *                             action bpf obj bpf.o sec action-mark            \
75  *                             action bpf obj bpf.o sec action-rand ok
76  *  sh-4.2# (shell spawned from tc exec)
77  *  sh-4.2# bpf_agent
78  *  [...]
79  *
80  * This will read out fds over environment and produce the same data dump
81  * as below. This has the advantage that the spawned shell owns the fds
82  * and thus if the agent is restarted, it can reattach to the same fds, also
83  * various programs can easily read/modify the data simultaneously from user
84  * space side.
85  *
86  * If the shell is unnecessary, the agent can also just be spawned directly
87  * via tc exec:
88  *
89  *  # tc exec bpf imp /tmp/bpf-uds run bpf_agent
90  *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
91  *                             action bpf obj bpf.o sec action-mark            \
92  *                             action bpf obj bpf.o sec action-rand ok
93  *
94  * BPF agent example output:
95  *
96  * ver: 1
97  * obj: bpf.o
98  * dev: 64770
99  * ino: 6045133
100  * maps: 3
101  * map0:
102  *  `- fd: 4
103  *   | serial: 1
104  *   | type: 1
105  *   | max elem: 256
106  *   | size key: 1
107  *   ` size val: 16
108  * map1:
109  *  `- fd: 5
110  *   | serial: 2
111  *   | type: 1
112  *   | max elem: 1024
113  *   | size key: 4
114  *   ` size val: 16
115  * map2:
116  *  `- fd: 6
117  *   | serial: 3
118  *   | type: 2
119  *   | max elem: 64
120  *   | size key: 4
121  *   ` size val: 8
122  * data, period: 5sec
123  *  `- number of drops:	cpu0:     0	cpu1:     0	cpu2:     0	cpu3:     0
124  *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 0, mis: 0]	q3:[pkts: 0, mis: 0]
125  *   ` protos:	tcp:[pkts: 0, bytes: 0]	udp:[pkts: 0, bytes: 0]	icmp:[pkts: 0, bytes: 0]
126  * data, period: 5sec
127  *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     0	cpu3:     1
128  *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 24, mis: 14]	q3:[pkts: 0, mis: 0]
129  *   ` protos:	tcp:[pkts: 13, bytes: 1989]	udp:[pkts: 10, bytes: 710]	icmp:[pkts: 0, bytes: 0]
130  * data, period: 5sec
131  *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     3	cpu3:     3
132  *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 39, mis: 21]	q3:[pkts: 0, mis: 0]
133  *   ` protos:	tcp:[pkts: 20, bytes: 3549]	udp:[pkts: 18, bytes: 1278]	icmp:[pkts: 0, bytes: 0]
134  * [...]
135  *
136  * This now means, the below classifier and action pipeline has been loaded
137  * as eBPF bytecode into the kernel, the kernel has verified that the
138  * execution of the bytecode is "safe", and it has JITed the programs
139  * afterwards, so that upon invocation they're running on native speed. tc
140  * has transferred all map file descriptors to the bpf_agent via IPC and
141  * even after tc exits, the agent can read out or modify all map data.
142  *
143  * Note that the export to the uds is done only once in the classifier and
144  * not in the action. It's enough to export the (here) shared descriptors
145  * once.
146  *
 * If you need to disassemble the generated JIT image (echo 2 instead of 1
 * into /proc/sys/net/core/bpf_jit_enable), the kernel tree has a small
 * helper under tools/net/; you can invoke it e.g. as `bpf_jit_disasm -o`.
150  *
151  * Please find in the code below further comments.
152  *
153  *   -- Happy eBPF hacking! ;)
154  */
155 #include <stdint.h>
156 #include <stdbool.h>
157 #include <sys/types.h>
158 #include <sys/socket.h>
159 #include <asm/types.h>
160 #include <linux/in.h>
161 #include <linux/if.h>
162 #include <linux/if_ether.h>
163 #include <linux/ip.h>
164 #include <linux/ipv6.h>
165 #include <linux/if_tunnel.h>
166 #include <linux/filter.h>
167 #include <linux/bpf.h>
168 
/* Common definitions shared with bpf_agent.c. */
170 #include "bpf_shared.h"
171 /* BPF helper functions for our example. */
172 #include "../../include/bpf_api.h"
173 
/* tc action return codes. Could be defined here as well, or included from
 * the header. Of these, the action programs in this file only return
 * TC_ACT_UNSPEC (fall back to the default configured tc opcode) and
 * TC_ACT_SHOT (used on the paths that account a drop).
 */
#define TC_ACT_UNSPEC		(-1)
#define TC_ACT_OK		0
#define TC_ACT_RECLASSIFY	1
#define TC_ACT_SHOT		2
#define TC_ACT_PIPE		3
#define TC_ACT_STOLEN		4
#define TC_ACT_QUEUED		5
#define TC_ACT_REPEAT		6
183 
/* Other, misc stuff: masks applied by flow_is_frag() to the host-order
 * value of the IPv4 header's frag_off field. IP_MF is the "more fragments"
 * flag bit, IP_OFFSET covers the fragment offset bits.
 */
#define IP_MF			0x2000
#define IP_OFFSET		0x1FFF
187 
188 /* eBPF map definitions, all placed in section "maps". */
/* Per IP protocol traffic counters: a hash map keyed by the 8 bit IP
 * protocol number, holding one struct count_tuple (packets, bytes) per
 * protocol. Filled in by cls_update_proto_map().
 */
struct bpf_elf_map __section("maps") map_proto = {
	.type		=	BPF_MAP_TYPE_HASH,
	.id		=	BPF_MAP_ID_PROTO,
	.size_key	=	sizeof(uint8_t),
	.size_value	=	sizeof(struct count_tuple),
	.max_elem	=	256,
};
196 
/* Per queue counters: a hash map keyed by the 32 bit value of
 * skb->queue_mapping, holding one struct count_queue (total, mismatch)
 * per queue. Filled in by cls_update_queue_map().
 */
struct bpf_elf_map __section("maps") map_queue = {
	.type		=	BPF_MAP_TYPE_HASH,
	.id		=	BPF_MAP_ID_QUEUE,
	.size_key	=	sizeof(uint32_t),
	.size_value	=	sizeof(struct count_queue),
	.max_elem	=	1024,
};
204 
/* Drop counters: an array map with 64 slots whose values are sized as a
 * long. act_update_drop_map() indexes it with the id of the executing
 * CPU, so CPUs with an id of 64 or above have no slot.
 */
struct bpf_elf_map __section("maps") map_drops = {
	.type		=	BPF_MAP_TYPE_ARRAY,
	.id		=	BPF_MAP_ID_DROPS,
	.size_key	=	sizeof(uint32_t),
	.size_value	=	sizeof(long),
	.max_elem	=	64,
};
212 
213 /* Helper functions and definitions for the flow dissector used by the
214  * example classifier. This resembles the kernel's flow dissector to
 * some extent and is just used as an example to show what's possible
216  * with eBPF.
217  */
/* Forward declaration only; no member of struct sockaddr is accessed in
 * this file. NOTE(review): presumably here to keep one of the included
 * headers happy - confirm before removing.
 */
struct sockaddr;
219 
/* Layout of one VLAN tag as it follows the Ethernet header. Only used
 * for offsetof()/sizeof() when stepping over a tag in the flow dissector.
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;			/* tag control information */
	__be16 h_vlan_encapsulated_proto;	/* ethertype of the payload */
};
224 
/* Result of flow_dissector(). All multi-byte values are stored as
 * returned by the load_word() helper.
 */
struct flow_keys {
	/* IPv4 source / destination address, or for IPv6 the XOR of the
	 * four 32 bit words of the respective address.
	 */
	__u32 src;
	__u32 dst;
	/* 32 bit word read at th_off; readable as a whole or as two
	 * 16 bit halves.
	 */
	union {
		__u32 ports;
		__u16 port16[2];
	};
	/* Packet offset from which 'ports' was loaded. */
	__s32 th_off;
	/* Innermost IP protocol found; 0 for IPv4 fragments. */
	__u8 ip_proto;
};
235 
/* Return the number of bytes to skip from the start of the transport
 * header before the 32 bit "ports" word of struct flow_keys is read.
 *
 * @ip_proto: IP protocol number of the transport header
 *
 * Only AH needs a non-zero offset (its 32 bit SPI sits 4 bytes into the
 * header). TCP, UDP, DCCP, ESP, SCTP, UDPLITE and every other protocol
 * yield 0.
 */
static inline int flow_ports_offset(__u8 ip_proto)
{
	if (ip_proto == IPPROTO_AH)
		return 4;

	return 0;
}
251 
flow_is_frag(struct __sk_buff * skb,int nh_off)252 static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
253 {
254 	return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
255 		  (IP_MF | IP_OFFSET));
256 }
257 
/* Parse the IPv4 header located at packet offset nh_off.
 *
 * @skb:      packet to inspect
 * @nh_off:   offset of the IPv4 header inside the packet
 * @ip_proto: out; protocol field of the header, or 0 for a fragment
 * @flow:     out; src/dst are filled in unless the protocol is GRE
 *
 * Returns the offset of the first byte behind the IPv4 header, taking
 * the header length field into account.
 */
static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	__u8 ver_ihl;
	int hdr_len;

	/* Fragments are reported with protocol 0. */
	*ip_proto = unlikely(flow_is_frag(skb, nh_off)) ? 0 :
		    load_byte(skb, nh_off + offsetof(struct iphdr, protocol));

	/* For GRE the addresses are left for the encapsulated header. */
	if (*ip_proto != IPPROTO_GRE) {
		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
	}

	/* First header byte: version in the high nibble, header length in
	 * 32 bit words in the low nibble. 0x45 is the common case of an
	 * IPv4 header without options.
	 */
	ver_ihl = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
	hdr_len = likely(ver_ihl == 0x45) ? 20 : (ver_ihl & 0xF) << 2;

	return nh_off + hdr_len;
}
281 
flow_addr_hash_ipv6(struct __sk_buff * skb,int off)282 static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
283 {
284 	__u32 w0 = load_word(skb, off);
285 	__u32 w1 = load_word(skb, off + sizeof(w0));
286 	__u32 w2 = load_word(skb, off + sizeof(w0) * 2);
287 	__u32 w3 = load_word(skb, off + sizeof(w0) * 3);
288 
289 	return w0 ^ w1 ^ w2 ^ w3;
290 }
291 
/* Parse the fixed IPv6 header located at packet offset nh_off.
 *
 * @skb:      packet to inspect
 * @nh_off:   offset of the IPv6 header inside the packet
 * @ip_proto: out; the header's nexthdr field
 * @flow:     out; src/dst receive the 32 bit XOR fold of the addresses
 *
 * Returns the offset of the first byte behind the fixed IPv6 header.
 * Extension headers are not walked.
 */
static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	int saddr_off = nh_off + offsetof(struct ipv6hdr, saddr);
	int daddr_off = nh_off + offsetof(struct ipv6hdr, daddr);

	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

	flow->src = flow_addr_hash_ipv6(skb, saddr_off);
	flow->dst = flow_addr_hash_ipv6(skb, daddr_off);

	return nh_off + sizeof(struct ipv6hdr);
}
302 
flow_dissector(struct __sk_buff * skb,struct flow_keys * flow)303 static inline bool flow_dissector(struct __sk_buff *skb,
304 				  struct flow_keys *flow)
305 {
306 	int poff, nh_off = BPF_LL_OFF + ETH_HLEN;
307 	__be16 proto = skb->protocol;
308 	__u8 ip_proto;
309 
310 	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
311 	if (proto == htons(ETH_P_8021AD)) {
312 		proto = load_half(skb, nh_off +
313 				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
314 		nh_off += sizeof(struct vlan_hdr);
315 	}
316 	if (proto == htons(ETH_P_8021Q)) {
317 		proto = load_half(skb, nh_off +
318 				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
319 		nh_off += sizeof(struct vlan_hdr);
320 	}
321 
322 	if (likely(proto == htons(ETH_P_IP)))
323 		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
324 	else if (proto == htons(ETH_P_IPV6))
325 		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
326 	else
327 		return false;
328 
329 	switch (ip_proto) {
330 	case IPPROTO_GRE: {
331 		struct gre_hdr {
332 			__be16 flags;
333 			__be16 proto;
334 		};
335 
336 		__u16 gre_flags = load_half(skb, nh_off +
337 					    offsetof(struct gre_hdr, flags));
338 		__u16 gre_proto = load_half(skb, nh_off +
339 					    offsetof(struct gre_hdr, proto));
340 
341 		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
342 			break;
343 
344 		nh_off += 4;
345 		if (gre_flags & GRE_CSUM)
346 			nh_off += 4;
347 		if (gre_flags & GRE_KEY)
348 			nh_off += 4;
349 		if (gre_flags & GRE_SEQ)
350 			nh_off += 4;
351 
352 		if (gre_proto == ETH_P_8021Q) {
353 			gre_proto = load_half(skb, nh_off +
354 					      offsetof(struct vlan_hdr,
355 						       h_vlan_encapsulated_proto));
356 			nh_off += sizeof(struct vlan_hdr);
357 		}
358 		if (gre_proto == ETH_P_IP)
359 			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
360 		else if (gre_proto == ETH_P_IPV6)
361 			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
362 		else
363 			return false;
364 		break;
365 	}
366 	case IPPROTO_IPIP:
367 		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
368 		break;
369 	case IPPROTO_IPV6:
370 		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
371 	default:
372 		break;
373 	}
374 
375 	nh_off += flow_ports_offset(ip_proto);
376 
377 	flow->ports = load_word(skb, nh_off);
378 	flow->th_off = nh_off;
379 	flow->ip_proto = ip_proto;
380 
381 	return true;
382 }
383 
cls_update_proto_map(const struct __sk_buff * skb,const struct flow_keys * flow)384 static inline void cls_update_proto_map(const struct __sk_buff *skb,
385 					const struct flow_keys *flow)
386 {
387 	uint8_t proto = flow->ip_proto;
388 	struct count_tuple *ct, _ct;
389 
390 	ct = map_lookup_elem(&map_proto, &proto);
391 	if (likely(ct)) {
392 		lock_xadd(&ct->packets, 1);
393 		lock_xadd(&ct->bytes, skb->len);
394 		return;
395 	}
396 
397 	/* No hit yet, we need to create a new entry. */
398 	_ct.packets = 1;
399 	_ct.bytes = skb->len;
400 
401 	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
402 }
403 
cls_update_queue_map(const struct __sk_buff * skb)404 static inline void cls_update_queue_map(const struct __sk_buff *skb)
405 {
406 	uint32_t queue = skb->queue_mapping;
407 	struct count_queue *cq, _cq;
408 	bool mismatch;
409 
410 	mismatch = skb->queue_mapping != get_smp_processor_id();
411 
412 	cq = map_lookup_elem(&map_queue, &queue);
413 	if (likely(cq)) {
414 		lock_xadd(&cq->total, 1);
415 		if (mismatch)
416 			lock_xadd(&cq->mismatch, 1);
417 		return;
418 	}
419 
420 	/* No hit yet, we need to create a new entry. */
421 	_cq.total = 1;
422 	_cq.mismatch = mismatch ? 1 : 0;
423 
424 	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
425 }
426 
427 /* eBPF program definitions, placed in various sections, which can
428  * have custom section names. If custom names are in use, it's
429  * required to point tc to the correct section, e.g.
430  *
431  *     tc filter add [...] bpf obj cls.o sec cls-tos [...]
432  *
433  * in case the program resides in __section("cls-tos").
434  *
435  * Default section for cls_bpf is: "classifier", for act_bpf is:
436  * "action". Naturally, if for example multiple actions are present
437  * in the same file, they need to have distinct section names.
438  *
439  * It is however not required to have multiple programs sharing
440  * a file.
441  */
442 __section("classifier")
cls_main(struct __sk_buff * skb)443 int cls_main(struct __sk_buff *skb)
444 {
445 	struct flow_keys flow;
446 
447 	if (!flow_dissector(skb, &flow))
448 		return 0; /* No match in cls_bpf. */
449 
450 	cls_update_proto_map(skb, &flow);
451 	cls_update_queue_map(skb);
452 
453 	return flow.ip_proto;
454 }
455 
act_update_drop_map(void)456 static inline void act_update_drop_map(void)
457 {
458 	uint32_t *count, cpu = get_smp_processor_id();
459 
460 	count = map_lookup_elem(&map_drops, &cpu);
461 	if (count)
462 		/* Only this cpu is accessing this element. */
463 		(*count)++;
464 }
465 
466 __section("action-mark")
act_mark_main(struct __sk_buff * skb)467 int act_mark_main(struct __sk_buff *skb)
468 {
469 	/* You could also mangle skb data here with the helper function
470 	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
471 	 * do that already in the classifier itself as a merged combination
472 	 * of classifier'n'action model.
473 	 */
474 
475 	if (skb->mark == 0xcafe) {
476 		act_update_drop_map();
477 		return TC_ACT_SHOT;
478 	}
479 
480 	/* Default configured tc opcode. */
481 	return TC_ACT_UNSPEC;
482 }
483 
484 __section("action-rand")
act_rand_main(struct __sk_buff * skb)485 int act_rand_main(struct __sk_buff *skb)
486 {
487 	/* Sorry, we're near event horizon ... */
488 	if ((get_prandom_u32() & 3) == 0) {
489 		act_update_drop_map();
490 		return TC_ACT_SHOT;
491 	}
492 
493 	return TC_ACT_UNSPEC;
494 }
495 
/* Last but not least, the file contains a license. Some future helper
 * functions may only be available with a GPL license.
 *
 * NOTE(review): BPF_LICENSE() is not defined in this file; presumably it
 * emits the 4 byte "license" section shown in the objdump listing in the
 * header comment - confirm against bpf_api.h.
 */
BPF_LICENSE("GPL");
500