1 /*
2  * Copyright (c) 2015 PLUMgrid, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef _GNU_SOURCE
17 #define _GNU_SOURCE
18 #endif
19 
20 #include <arpa/inet.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <limits.h>
25 #include <linux/bpf.h>
26 #include <linux/bpf_common.h>
27 #include <linux/if_packet.h>
28 #include <linux/perf_event.h>
29 #include <linux/pkt_cls.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sched.h>
32 #include <linux/unistd.h>
33 #include <linux/version.h>
34 #include <net/ethernet.h>
35 #include <net/if.h>
36 #include <sched.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <sys/ioctl.h>
42 #include <sys/resource.h>
43 #include <sys/stat.h>
44 #include <sys/types.h>
45 #include <unistd.h>
46 #include <linux/if_alg.h>
47 
48 #include "libbpf.h"
49 #include "perf_reader.h"
50 
51 // TODO: Remove this when CentOS 6 support is not needed anymore
52 #include "setns.h"
53 
54 // TODO: remove these defines when linux-libc-dev exports them properly
55 
// Fallback bpf(2) syscall number for toolchains whose headers do not define
// __NR_bpf. powerpc64, s390x and aarch64 have explicit entries; every other
// architecture falls through to 321.
// NOTE(review): 321 looks like the x86_64 number -- confirm before building
// for an architecture that is not listed here.
#ifndef __NR_bpf
#if defined(__powerpc64__)
#define __NR_bpf 361
#elif defined(__s390x__)
#define __NR_bpf 351
#elif defined(__aarch64__)
#define __NR_bpf 280
#else
#define __NR_bpf 321
#endif
#endif

// Socket option passed to setsockopt() by bpf_attach_socket().
#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF 50
#endif

// ioctl request used in this file to bind a BPF program fd to a perf event fd.
#ifndef PERF_EVENT_IOC_SET_BPF
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
#endif

// Flag passed as the last argument of every perf_event_open call in this file.
#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

// TODO: Remove this when CentOS 6 support is not needed anymore
#ifndef AF_ALG
#define AF_ALG 38
#endif

// Smaller of x and y. Each argument may be evaluated twice, so do not pass
// expressions with side effects.
#define min(x, y) ((x) < (y) ? (x) : (y))
86 
// Describes one BPF helper function: its name (without the "bpf_" prefix)
// and the first Linux release that provides it.
struct bpf_helper {
  const char *name;
  const char *required_version;
};

// Helper table used to turn a helper id reported by the verifier into a
// readable hint. Entry i describes helper id i + 1, so the order of the
// entries must not change.
static const struct bpf_helper helpers[] = {
  {"map_lookup_elem", "3.19"},
  {"map_update_elem", "3.19"},
  {"map_delete_elem", "3.19"},
  {"probe_read", "4.1"},
  {"ktime_get_ns", "4.1"},
  {"trace_printk", "4.1"},
  {"get_prandom_u32", "4.1"},
  {"get_smp_processor_id", "4.1"},
  {"skb_store_bytes", "4.1"},
  {"l3_csum_replace", "4.1"},
  {"l4_csum_replace", "4.1"},
  {"tail_call", "4.2"},
  {"clone_redirect", "4.2"},
  {"get_current_pid_tgid", "4.2"},
  {"get_current_uid_gid", "4.2"},
  {"get_current_comm", "4.2"},
  {"get_cgroup_classid", "4.3"},
  {"skb_vlan_push", "4.3"},
  {"skb_vlan_pop", "4.3"},
  {"skb_get_tunnel_key", "4.3"},
  {"skb_set_tunnel_key", "4.3"},
  {"perf_event_read", "4.3"},
  {"redirect", "4.4"},
  {"get_route_realm", "4.4"},
  {"perf_event_output", "4.4"},
  {"skb_load_bytes", "4.5"},
  {"get_stackid", "4.6"},
  {"csum_diff", "4.6"},
  {"skb_get_tunnel_opt", "4.6"},
  {"skb_set_tunnel_opt", "4.6"},
  {"skb_change_proto", "4.8"},
  {"skb_change_type", "4.8"},
  {"skb_under_cgroup", "4.8"},
  {"get_hash_recalc", "4.8"},
  {"get_current_task", "4.8"},
  {"probe_write_user", "4.8"},
  {"current_task_under_cgroup", "4.9"},
  {"skb_change_tail", "4.9"},
  {"skb_pull_data", "4.9"},
  {"csum_update", "4.9"},
  {"set_hash_invalid", "4.9"},
  {"get_numa_node_id", "4.10"},
  {"skb_change_head", "4.10"},
  {"xdp_adjust_head", "4.10"},
  {"probe_read_str", "4.11"},
  {"get_socket_cookie", "4.12"},
  {"get_socket_uid", "4.12"},
  {"set_hash", "4.13"},
  {"setsockopt", "4.13"},
  {"skb_adjust_room", "4.13"},
  {"redirect_map", "4.14"},
  {"sk_redirect_map", "4.14"},
  {"sock_map_update", "4.14"},
  {"xdp_adjust_meta", "4.15"},
  {"perf_event_read_value", "4.15"},
  {"perf_prog_read_value", "4.15"},
  {"getsockopt", "4.15"},
  {"override_return", "4.16"},
  {"sock_ops_cb_flags_set", "4.16"},
  {"msg_redirect_map", "4.17"},
  {"msg_apply_bytes", "4.17"},
  {"msg_cork_bytes", "4.17"},
  {"msg_pull_data", "4.17"},
  {"bind", "4.17"},
  {"xdp_adjust_tail", "4.18"},
  {"skb_get_xfrm_state", "4.18"},
  {"get_stack", "4.18"},
  {"skb_load_bytes_relative", "4.18"},
  {"fib_lookup", "4.18"},
  {"sock_hash_update", "4.18"},
  {"msg_redirect_hash", "4.18"},
  {"sk_redirect_hash", "4.18"},
  {"lwt_push_encap", "4.18"},
  {"lwt_seg6_store_bytes", "4.18"},
  {"lwt_seg6_adjust_srh", "4.18"},
  {"lwt_seg6_action", "4.18"},
  {"rc_repeat", "4.18"},
  {"rc_keydown", "4.18"},
  {"skb_cgroup_id", "4.18"},
  {"get_current_cgroup_id", "4.18"},
  {"get_local_storage", "4.19"},
  {"sk_select_reuseport", "4.19"},
  {"skb_ancestor_cgroup_id", "4.19"},
  {"sk_lookup_tcp", "4.20"},
  {"sk_lookup_udp", "4.20"},
  {"sk_release", "4.20"},
  {"map_push_elem", "4.20"},
  {"map_pop_elem", "4.20"},
  // The kernel helper is bpf_map_peek_elem ("peek", not "peak").
  {"map_peek_elem", "4.20"},
  {"msg_push_data", "4.20"},
};
184 
// Convert a pointer into the 64-bit integer representation that this file
// stores in the pointer-carrying fields of union bpf_attr and
// struct perf_event_attr.
static uint64_t ptr_to_u64(void *ptr)
{
  unsigned long addr = (unsigned long) ptr;

  return (uint64_t) addr;
}
189 
/*
 * Create a BPF map with the BPF_MAP_CREATE command.
 *
 * name may be NULL or empty; otherwise at most BPF_OBJ_NAME_LEN - 1 bytes of
 * it are copied into the request. Returns the value of the bpf syscall:
 * a non-negative value on success, negative with errno set on failure.
 */
int bpf_create_map(enum bpf_map_type map_type, const char *name,
                   int key_size, int value_size,
                   int max_entries, int map_flags)
{
  size_t name_len = name ? strlen(name) : 0;
  union bpf_attr attr;
  int ret;

  memset(&attr, 0, sizeof(attr));
  attr.map_type = map_type;
  attr.key_size = key_size;
  attr.value_size = value_size;
  attr.max_entries = max_entries;
  attr.map_flags = map_flags;
  // Passing a NULL source to memcpy() is undefined even for a zero length,
  // so only copy when a name was actually supplied.
  if (name_len)
    memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));

  ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

  // The request may have been rejected because of the name; retry once
  // with the name cleared.
  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
    memset(attr.map_name, 0, BPF_OBJ_NAME_LEN);
    ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
  }

  if (ret < 0 && errno == EPERM) {
    // EPERM can be caused by an insufficient RLIMIT_MEMLOCK. Raise the limit
    // to unlimited and retry once; if that fails too, report the error.
    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }
  }
  return ret;
}
224 
/*
 * Issue BPF_MAP_UPDATE_ELEM for the map behind fd, storing value under key
 * with the given flags. Returns the result of the bpf syscall.
 */
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
  union bpf_attr req;

  memset(&req, 0, sizeof(req));
  req.map_fd = fd;
  req.flags = flags;
  req.value = ptr_to_u64(value);
  req.key = ptr_to_u64(key);

  return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &req, sizeof(req));
}
236 
/*
 * Issue BPF_MAP_LOOKUP_ELEM for the map behind fd; value is the buffer
 * address handed to the kernel for the element stored under key.
 * Returns the result of the bpf syscall.
 */
int bpf_lookup_elem(int fd, void *key, void *value)
{
  union bpf_attr req;

  memset(&req, 0, sizeof(req));
  req.map_fd = fd;
  req.value = ptr_to_u64(value);
  req.key = ptr_to_u64(key);

  return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &req, sizeof(req));
}
247 
/*
 * Issue BPF_MAP_DELETE_ELEM for key in the map behind fd.
 * Returns the result of the bpf syscall.
 */
int bpf_delete_elem(int fd, void *key)
{
  union bpf_attr req;

  memset(&req, 0, sizeof(req));
  req.key = ptr_to_u64(key);
  req.map_fd = fd;

  return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &req, sizeof(req));
}
257 
/*
 * Fetch the first key of the map behind fd into key (key_size bytes).
 *
 * First asks BPF_MAP_GET_NEXT_KEY with a NULL current key. If that fails
 * with EFAULT, falls back to probing for a key that is not in the map and
 * asking for the key that follows it. Returns the syscall result, or -1
 * when no absent key could be found.
 */
int bpf_get_first_key(int fd, void *key, size_t key_size)
{
  static unsigned char fill_bytes[3] = {0, 0xff, 0x55};
  union bpf_attr req;
  int attempt, rc;

  memset(&req, 0, sizeof(req));
  req.map_fd = fd;
  req.key = 0;
  req.next_key = ptr_to_u64(key);

  // Kernels 4.12 and above accept a NULL key here and return the first key
  // of the map. Older kernels fail the call.
  rc = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &req, sizeof(req));
  if (rc >= 0 || errno != EFAULT)
    return rc;

  // Fallback: look for a key pattern that does not exist in the map.
  req.key = ptr_to_u64(key);
  for (attempt = 0; attempt < 3; attempt++) {
    memset(key, fill_bytes[attempt], key_size);
    // The size of the map's value is unknown, so the lookup is given an
    // invalid value pointer and is expected to fail; ENOENT then tells us
    // the key is absent. ~0 is used rather than NULL because NULL might
    // trigger a page fault in the kernel and hurt performance, while ~0
    // fails fast.
    if (bpf_lookup_elem(fd, key, (void *)~0) >= 0)
      return -1;
    // The probed key is not in the map: the key after it is the first key.
    if (errno == ENOENT)
      return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &req, sizeof(req));
  }

  return -1;
}
295 
/*
 * Issue BPF_MAP_GET_NEXT_KEY for the map behind fd: key is the current key,
 * next_key the buffer address handed to the kernel for the following key.
 * Returns the result of the bpf syscall.
 */
int bpf_get_next_key(int fd, void *key, void *next_key)
{
  union bpf_attr req;

  memset(&req, 0, sizeof(req));
  req.map_fd = fd;
  req.next_key = ptr_to_u64(next_key);
  req.key = ptr_to_u64(key);

  return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &req, sizeof(req));
}
306 
bpf_print_hints(int ret,char * log)307 static void bpf_print_hints(int ret, char *log)
308 {
309   if (ret < 0)
310     fprintf(stderr, "bpf: Failed to load program: %s\n", strerror(errno));
311   if (log == NULL)
312     return;
313   else
314     fprintf(stderr, "%s\n", log);
315 
316   if (ret >= 0)
317     return;
318 
319   // The following error strings will need maintenance to match LLVM.
320 
321   // stack busting
322   if (strstr(log, "invalid stack off=-") != NULL) {
323     fprintf(stderr, "HINT: Looks like you exceeded the BPF stack limit. "
324       "This can happen if you allocate too much local variable storage. "
325       "For example, if you allocated a 1 Kbyte struct (maybe for "
326       "BPF_PERF_OUTPUT), busting a max stack of 512 bytes.\n\n");
327   }
328 
329   // didn't check NULL on map lookup
330   if (strstr(log, "invalid mem access 'map_value_or_null'") != NULL) {
331     fprintf(stderr, "HINT: The 'map_value_or_null' error can happen if "
332       "you dereference a pointer value from a map lookup without first "
333       "checking if that pointer is NULL.\n\n");
334   }
335 
336   // lacking a bpf_probe_read
337   if (strstr(log, "invalid mem access 'inv'") != NULL) {
338     fprintf(stderr, "HINT: The invalid mem access 'inv' error can happen "
339       "if you try to dereference memory without first using "
340       "bpf_probe_read() to copy it to the BPF stack. Sometimes the "
341       "bpf_probe_read is automatic by the bcc rewriter, other times "
342       "you'll need to be explicit.\n\n");
343   }
344 
345   // helper function not found in kernel
346   char *helper_str = strstr(log, "invalid func ");
347   if (helper_str != NULL) {
348     helper_str += strlen("invalid func ");
349     char *str = strchr(helper_str, '#');
350     if (str != NULL) {
351       helper_str = str + 1;
352     }
353     unsigned int helper_id = atoi(helper_str);
354     if (helper_id && helper_id < sizeof(helpers) / sizeof(struct bpf_helper)) {
355       struct bpf_helper helper = helpers[helper_id - 1];
356       fprintf(stderr, "HINT: bpf_%s missing (added in Linux %s).\n\n",
357               helper.name, helper.required_version);
358     }
359   }
360 }
// Round x up to a multiple of n. The mask arithmetic only yields a correct
// result when n is a power of two.
#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
362 
/*
 * Issue BPF_OBJ_GET_INFO_BY_FD for prog_map_fd.
 *
 * info is the caller's buffer and *info_len its size on entry. When the
 * syscall returns 0, *info_len is replaced with the length reported back by
 * the kernel. Returns the result of the bpf syscall.
 */
int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len)
{
  union bpf_attr req;
  int rc;

  memset(&req, 0, sizeof(req));
  req.info.info = ptr_to_u64(info);
  req.info.info_len = *info_len;
  req.info.bpf_fd = prog_map_fd;

  rc = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &req, sizeof(req));
  if (rc == 0)
    *info_len = req.info.info_len;

  return rc;
}
379 
/*
 * Compute a tag for a BPF program in user space.
 *
 * The program (prog_len bytes of instructions) is copied, the imm field of
 * both halves of every BPF_LD | BPF_DW | BPF_IMM instruction whose src_reg
 * is BPF_PSEUDO_MAP_FD is zeroed (presumably so the tag does not depend on
 * process-local map fds), and the copy is hashed with SHA-1 through an
 * AF_ALG socket. *ptag receives the first 8 digest bytes, byte-swapped.
 *
 * Returns 0 on success and -1 on any failure (a message is printed to
 * stderr).
 */
int bpf_prog_compute_tag(const struct bpf_insn *insns, int prog_len,
                         unsigned long long *ptag)
{
  struct sockaddr_alg alg = {
    .salg_family    = AF_ALG,
    .salg_type      = "hash",
    .salg_name      = "sha1",
  };
  union {
    unsigned char sha[20];
    unsigned long long tag;
  } u = {};
  struct bpf_insn *prog = NULL;
  bool map_ld_seen = false;
  int shafd = -1, shafd2 = -1;
  int insn_cnt, i, ret;
  int result = -1;

  if (prog_len < 0) {
    fprintf(stderr, "sha1 invalid program length %d\n", prog_len);
    return -1;
  }

  shafd = socket(AF_ALG, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
  if (shafd < 0) {
    fprintf(stderr, "sha1 socket not available %s\n", strerror(errno));
    goto out;
  }
  ret = bind(shafd, (struct sockaddr *)&alg, sizeof(alg));
  if (ret < 0) {
    fprintf(stderr, "sha1 bind fail %s\n", strerror(errno));
    goto out;
  }
  shafd2 = accept4(shafd, NULL, 0, SOCK_CLOEXEC);
  if (shafd2 < 0) {
    fprintf(stderr, "sha1 accept fail %s\n", strerror(errno));
    goto out;
  }

  // Heap-allocate the scratch copy: its size is caller-controlled, so a
  // stack VLA could overflow the stack and is undefined for a zero length.
  // calloc() also keeps any trailing bytes (when prog_len is not a multiple
  // of the instruction size) defined for the write() below.
  insn_cnt = prog_len / (int)sizeof(struct bpf_insn);
  prog = calloc(1, prog_len > 0 ? (size_t)prog_len : 1);
  if (!prog) {
    fprintf(stderr, "sha1 alloc fail %s\n", strerror(errno));
    goto out;
  }

  for (i = 0; i < insn_cnt; i++) {
    prog[i] = insns[i];
    if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM) &&
        insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
        !map_ld_seen) {
      // First half of a 64-bit map-fd load.
      prog[i].imm = 0;
      map_ld_seen = true;
    } else if (insns[i].code == 0 && map_ld_seen) {
      // Second half of the same load.
      prog[i].imm = 0;
      map_ld_seen = false;
    } else {
      map_ld_seen = false;
    }
  }

  ret = write(shafd2, prog, prog_len);
  if (ret != prog_len) {
    fprintf(stderr, "sha1 write fail %s\n", strerror(errno));
    goto out;
  }

  ret = read(shafd2, u.sha, 20);
  if (ret != 20) {
    fprintf(stderr, "sha1 read fail %s\n", strerror(errno));
    goto out;
  }
  *ptag = __builtin_bswap64(u.tag);
  result = 0;

out:
  free(prog);
  if (shafd2 >= 0)
    close(shafd2);
  if (shafd >= 0)
    close(shafd);
  return result;
}
446 
/*
 * Read the tag of the BPF program behind fd from /proc/self/fdinfo/<fd>.
 *
 * The file is scanned for the "prog_tag:" line instead of assuming a fixed
 * line number, so extra fdinfo lines or a descriptor that is not a BPF
 * program cannot cause an unrelated line to be parsed as the tag.
 *
 * Returns 0 and stores the tag in *ptag on success, -1 if the fdinfo file
 * cannot be opened, and -2 if no parsable prog_tag line is found (*ptag is
 * left untouched in both failure cases).
 */
int bpf_prog_get_tag(int fd, unsigned long long *ptag)
{
  char line[64];
  unsigned long long tag = 0;
  int found = 0;
  FILE *f;

  snprintf(line, sizeof(line), "/proc/self/fdinfo/%d", fd);
  f = fopen(line, "re");
  if (!f)
    return -1;

  while (fgets(line, sizeof(line), f)) {
    // %llx skips the whitespace that separates the label from the value.
    if (sscanf(line, "prog_tag:%llx", &tag) == 1) {
      found = 1;
      break;
    }
  }
  fclose(f);

  if (!found)
    return -2;

  *ptag = tag;
  return 0;
}
473 
bpf_prog_load(enum bpf_prog_type prog_type,const char * name,const struct bpf_insn * insns,int prog_len,const char * license,unsigned kern_version,int log_level,char * log_buf,unsigned log_buf_size)474 int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
475                   const struct bpf_insn *insns, int prog_len,
476                   const char *license, unsigned kern_version,
477                   int log_level, char *log_buf, unsigned log_buf_size)
478 {
479   size_t name_len = name ? strlen(name) : 0;
480   union bpf_attr attr;
481   char *tmp_log_buf = NULL;
482   unsigned tmp_log_buf_size = 0;
483   int ret = 0, name_offset = 0;
484 
485   memset(&attr, 0, sizeof(attr));
486 
487   attr.prog_type = prog_type;
488   attr.kern_version = kern_version;
489   attr.license = ptr_to_u64((void *)license);
490 
491   attr.insns = ptr_to_u64((void *)insns);
492   attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
493   if (attr.insn_cnt > BPF_MAXINSNS) {
494     errno = EINVAL;
495     fprintf(stderr,
496             "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
497             strerror(errno), name, attr.insn_cnt, BPF_MAXINSNS);
498     return -1;
499   }
500 
501   attr.log_level = log_level;
502   if (attr.log_level > 0) {
503     if (log_buf_size > 0) {
504       // Use user-provided log buffer if availiable.
505       log_buf[0] = 0;
506       attr.log_buf = ptr_to_u64(log_buf);
507       attr.log_size = log_buf_size;
508     } else {
509       // Create and use temporary log buffer if user didn't provide one.
510       tmp_log_buf_size = LOG_BUF_SIZE;
511       tmp_log_buf = malloc(tmp_log_buf_size);
512       if (!tmp_log_buf) {
513         fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
514                 strerror(errno));
515         attr.log_level = 0;
516       } else {
517         tmp_log_buf[0] = 0;
518         attr.log_buf = ptr_to_u64(tmp_log_buf);
519         attr.log_size = tmp_log_buf_size;
520       }
521     }
522   }
523 
524   if (strncmp(name, "kprobe__", 8) == 0)
525     name_offset = 8;
526   else if (strncmp(name, "tracepoint__", 12) == 0)
527     name_offset = 12;
528   else if (strncmp(name, "raw_tracepoint__", 16) == 0)
529     name_offset = 16;
530   memcpy(attr.prog_name, name + name_offset,
531          min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
532 
533   ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
534   // BPF object name is not supported on older Kernels.
535   // If we failed due to this, clear the name and try again.
536   if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
537     memset(attr.prog_name, 0, BPF_OBJ_NAME_LEN);
538     ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
539   }
540 
541   if (ret < 0 && errno == EPERM) {
542     // When EPERM is returned, two reasons are possible:
543     //  1. user has no permissions for bpf()
544     //  2. user has insufficent rlimit for locked memory
545     // Unfortunately, there is no api to inspect the current usage of locked
546     // mem for the user, so an accurate calculation of how much memory to lock
547     // for this new program is difficult to calculate. As a hack, bump the limit
548     // to unlimited. If program load fails again, return the error.
549     struct rlimit rl = {};
550     if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
551       rl.rlim_max = RLIM_INFINITY;
552       rl.rlim_cur = rl.rlim_max;
553       if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
554         ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
555     }
556   }
557 
558   // The load has failed. Handle log message.
559   if (ret < 0) {
560     // User has provided a log buffer.
561     if (log_buf_size) {
562       // If logging is not already enabled, enable it and do the syscall again.
563       if (attr.log_level == 0) {
564         attr.log_level = 1;
565         attr.log_buf = ptr_to_u64(log_buf);
566         attr.log_size = log_buf_size;
567         ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
568       }
569       // Print the log message and return.
570       bpf_print_hints(ret, log_buf);
571       if (errno == ENOSPC)
572         fprintf(stderr, "bpf: log_buf size may be insufficient\n");
573       goto return_result;
574     }
575 
576     // User did not provide log buffer. We will try to increase size of
577     // our temporary log buffer to get full error message.
578     if (tmp_log_buf)
579       free(tmp_log_buf);
580     tmp_log_buf_size = LOG_BUF_SIZE;
581     if (attr.log_level == 0)
582       attr.log_level = 1;
583     for (;;) {
584       tmp_log_buf = malloc(tmp_log_buf_size);
585       if (!tmp_log_buf) {
586         fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
587                 strerror(errno));
588         goto return_result;
589       }
590       tmp_log_buf[0] = 0;
591       attr.log_buf = ptr_to_u64(tmp_log_buf);
592       attr.log_size = tmp_log_buf_size;
593 
594       ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
595       if (ret < 0 && errno == ENOSPC) {
596         // Temporary buffer size is not enough. Double it and try again.
597         free(tmp_log_buf);
598         tmp_log_buf = NULL;
599         tmp_log_buf_size <<= 1;
600       } else {
601         break;
602       }
603     }
604   }
605 
606   // Check if we should print the log message if log_level is not 0,
607   // either specified by user or set due to error.
608   if (attr.log_level > 0) {
609     // Don't print if user enabled logging and provided log buffer,
610     // but there is no error.
611     if (log_buf && ret < 0)
612       bpf_print_hints(ret, log_buf);
613     else if (tmp_log_buf)
614       bpf_print_hints(ret, tmp_log_buf);
615   }
616 
617 return_result:
618   if (tmp_log_buf)
619     free(tmp_log_buf);
620   return ret;
621 }
622 
/*
 * Open a non-blocking, close-on-exec raw packet socket for all protocols.
 *
 * When name is a non-empty interface name the socket is also bound to that
 * interface. Returns the socket fd, or -1 on failure (a message is printed
 * to stderr and the socket, if any, is closed).
 */
int bpf_open_raw_sock(const char *name)
{
  struct sockaddr_ll addr;
  int fd = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));

  if (fd < 0) {
    fprintf(stderr, "cannot create raw socket\n");
    return -1;
  }

  /* Nothing to bind to when no interface name was given */
  if (name == NULL || name[0] == '\0')
    return fd;

  memset(&addr, 0, sizeof(addr));
  addr.sll_family = AF_PACKET;
  addr.sll_ifindex = if_nametoindex(name);
  if (addr.sll_ifindex == 0) {
    fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
    goto fail;
  }

  addr.sll_protocol = htons(ETH_P_ALL);
  if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
    fprintf(stderr, "bind to %s: %s\n", name, strerror(errno));
    goto fail;
  }

  return fd;

fail:
  close(fd);
  return -1;
}
655 
bpf_attach_socket(int sock,int prog)656 int bpf_attach_socket(int sock, int prog) {
657   return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
658 }
659 
660 #define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
bpf_find_probe_type(const char * event_type)661 static int bpf_find_probe_type(const char *event_type)
662 {
663   int fd;
664   int ret;
665   char buf[PATH_MAX];
666 
667   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
668   if (ret < 0 || ret >= (int)sizeof(buf))
669     return -1;
670 
671   fd = open(buf, O_RDONLY | O_CLOEXEC);
672   if (fd < 0)
673     return -1;
674   ret = read(fd, buf, sizeof(buf));
675   close(fd);
676   if (ret < 0 || ret >= (int)sizeof(buf))
677     return -1;
678   errno = 0;
679   ret = (int)strtol(buf, NULL, 10);
680   return errno ? -1 : ret;
681 }
682 
683 #define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
bpf_get_retprobe_bit(const char * event_type)684 static int bpf_get_retprobe_bit(const char *event_type)
685 {
686   int fd;
687   int ret;
688   char buf[PATH_MAX];
689 
690   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
691   if (ret < 0 || ret >= (int)sizeof(buf))
692     return -1;
693 
694   fd = open(buf, O_RDONLY | O_CLOEXEC);
695   if (fd < 0)
696     return -1;
697   ret = read(fd, buf, sizeof(buf));
698   close(fd);
699   if (ret < 0 || ret >= (int)sizeof(buf))
700     return -1;
701   if (strlen(buf) < strlen("config:"))
702     return -1;
703   errno = 0;
704   ret = (int)strtol(buf + strlen("config:"), NULL, 10);
705   return errno ? -1 : ret;
706 }
707 
/*
 * new kernel API allows creating [k,u]probe with perf_event_open, which
 * makes it easier to clean up the [k,u]probe. This function tries to
 * create pfd with the new API.
 *
 * name is the function name (kprobe) or binary path (uprobe), offs the
 * address/offset, event_type the event source to look up in sysfs and
 * is_return selects a return probe. Returns the perf event fd, or a
 * negative value when the event source is not available or
 * perf_event_open fails.
 */
static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
             int pid, char *event_type, int is_return)
{
  struct perf_event_attr attr = {};
  int type = bpf_find_probe_type(event_type);
  int is_return_bit = bpf_get_retprobe_bit(event_type);
  int cpu = 0;

  if (type < 0 || is_return_bit < 0)
    return -1;
  // attr.config is 64 bits wide; a larger bit number cannot be represented.
  if (is_return_bit >= 64)
    return -1;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  // Shift a 64-bit one: shifting a plain int by 31 or more is undefined.
  if (is_return)
    attr.config |= 1ULL << is_return_bit;

  /*
   * struct perf_event_attr in latest perf_event.h has the following
   * extension to config1 and config2. To keep bcc compatible with
   * older perf_event.h, we use config1 and config2 here instead of
   * kprobe_func, uprobe_path, kprobe_addr, and probe_offset.
   *
   * union {
   *  __u64 bp_addr;
   *  __u64 kprobe_func;
   *  __u64 uprobe_path;
   *  __u64 config1;
   * };
   * union {
   *   __u64 bp_len;
   *   __u64 kprobe_addr;
   *   __u64 probe_offset;
   *   __u64 config2;
   * };
   */
  attr.config2 = offs;  /* config2 here is kprobe_addr or probe_offset */
  attr.size = sizeof(attr);
  attr.type = type;
  /* config1 here is kprobe_func or uprobe_path */
  attr.config1 = ptr_to_u64((void *)name);
  // PID filter is only possible for uprobe events.
  if (pid < 0)
    pid = -1;
  // perf_event_open API doesn't allow both pid and cpu to be -1.
  // So only set it to -1 when PID is not -1.
  // Tracing events do not do CPU filtering in any cases.
  if (pid != -1)
    cpu = -1;
  return syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */,
                 PERF_FLAG_FD_CLOEXEC);
}
763 
764 // When a valid Perf Event FD provided through pfd, it will be used to enable
765 // and attach BPF program to the event, and event_path will be ignored.
766 // Otherwise, event_path is expected to contain the path to the event in debugfs
767 // and it will be used to open the Perf Event FD.
768 // In either case, if the attach partially failed (such as issue with the
769 // ioctl operations), the **caller** need to clean up the Perf Event FD, either
770 // provided by the caller or opened here.
bpf_attach_tracing_event(int progfd,const char * event_path,int pid,int * pfd)771 static int bpf_attach_tracing_event(int progfd, const char *event_path, int pid,
772                                     int *pfd)
773 {
774   int efd, cpu = 0;
775   ssize_t bytes;
776   char buf[PATH_MAX];
777   struct perf_event_attr attr = {};
778   // Caller did not provided a valid Perf Event FD. Create one with the debugfs
779   // event path provided.
780   if (*pfd < 0) {
781     snprintf(buf, sizeof(buf), "%s/id", event_path);
782     efd = open(buf, O_RDONLY | O_CLOEXEC, 0);
783     if (efd < 0) {
784       fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
785       return -1;
786     }
787 
788     bytes = read(efd, buf, sizeof(buf));
789     if (bytes <= 0 || bytes >= (int)sizeof(buf)) {
790       fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
791       close(efd);
792       return -1;
793     }
794     close(efd);
795     buf[bytes] = '\0';
796     attr.config = strtol(buf, NULL, 0);
797     attr.type = PERF_TYPE_TRACEPOINT;
798     attr.sample_period = 1;
799     attr.wakeup_events = 1;
800     // PID filter is only possible for uprobe events.
801     if (pid < 0)
802       pid = -1;
803     // perf_event_open API doesn't allow both pid and cpu to be -1.
804     // So only set it to -1 when PID is not -1.
805     // Tracing events do not do CPU filtering in any cases.
806     if (pid != -1)
807       cpu = -1;
808     *pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
809     if (*pfd < 0) {
810       fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
811       return -1;
812     }
813   }
814 
815   if (ioctl(*pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
816     perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
817     return -1;
818   }
819   if (ioctl(*pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
820     perror("ioctl(PERF_EVENT_IOC_ENABLE)");
821     return -1;
822   }
823 
824   return 0;
825 }
826 
bpf_attach_kprobe(int progfd,enum bpf_probe_attach_type attach_type,const char * ev_name,const char * fn_name,uint64_t fn_offset)827 int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
828                       const char *ev_name, const char *fn_name, uint64_t fn_offset)
829 {
830   int kfd, pfd = -1;
831   char buf[256];
832   char event_alias[128];
833   static char *event_type = "kprobe";
834 
835   // Try create the kprobe Perf Event with perf_event_open API.
836   pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, event_type,
837                                            attach_type != BPF_PROBE_ENTRY);
838   // If failed, most likely Kernel doesn't support the new perf_event_open API
839   // yet. Try create the event using debugfs.
840   if (pfd < 0) {
841     snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
842     kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
843     if (kfd < 0) {
844       fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
845       goto error;
846     }
847 
848     snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
849 
850     if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY)
851       snprintf(buf, sizeof(buf), "p:%ss/%s %s+%"PRIu64,
852                event_type, event_alias, fn_name, fn_offset);
853     else
854       snprintf(buf, sizeof(buf), "%c:%ss/%s %s",
855                attach_type == BPF_PROBE_ENTRY ? 'p' : 'r',
856                event_type, event_alias, fn_name);
857 
858     if (write(kfd, buf, strlen(buf)) < 0) {
859       if (errno == ENOENT)
860          fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n");
861       else
862          fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno));
863       close(kfd);
864       goto error;
865     }
866     close(kfd);
867     snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
868   }
869   // If perf_event_open succeeded, bpf_attach_tracing_event will use the created
870   // Perf Event FD directly and buf would be empty and unused.
871   // Otherwise it will read the event ID from the path in buf, create the
872   // Perf Event event using that ID, and updated value of pfd.
873   if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
874     return pfd;
875 
876 error:
877   bpf_close_perf_event_fd(pfd);
878   return -1;
879 }
880 
/*
 * Move the caller into the mount namespace of process `pid`.
 *
 * On success the descriptor opened on /proc/self/ns/mnt (i.e. the namespace
 * the caller was in before the switch) is returned; hand it to
 * exit_mount_ns() to switch back and release it.
 *
 * Returns -1, with every descriptor opened here closed again, when:
 *   - pid is negative or the /proc path does not fit the local buffer,
 *   - either namespace file cannot be opened or fstat()ed,
 *   - both namespace files report the same inode (no switch is needed),
 *   - setns() fails.
 */
static int enter_mount_ns(int pid) {
  char target_path[64];
  struct stat own_st, other_st;
  int own_ns = -1, other_ns = -1;
  int written;

  if (pid < 0)
    return -1;

  written = snprintf(target_path, sizeof(target_path), "/proc/%d/ns/mnt", pid);
  if ((size_t)written >= sizeof(target_path))
    return -1;

  own_ns = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
  if (own_ns < 0) {
    perror("open(/proc/self/ns/mnt)");
    return -1;
  }

  other_ns = open(target_path, O_RDONLY | O_CLOEXEC);
  if (other_ns < 0) {
    perror("open(/proc/<pid>/ns/mnt)");
    goto bail;
  }

  if (fstat(own_ns, &own_st) != 0) {
    perror("fstat(self_fd)");
    goto bail;
  }

  if (fstat(other_ns, &other_st) != 0) {
    perror("fstat(target_fd)");
    goto bail;
  }

  // Identical inode: we already are in the target namespace, so skip setns()
  // and report "nothing to undo" by releasing both descriptors.
  if (own_st.st_ino == other_st.st_ino)
    goto bail;

  if (setns(other_ns, CLONE_NEWNS) != 0) {
    perror("setns(target)");
    goto bail;
  }

  // Only the descriptor of the previous namespace is needed from here on.
  close(other_ns);
  return own_ns;

bail:
  if (own_ns >= 0)
    close(own_ns);
  if (other_ns >= 0)
    close(other_ns);
  return -1;
}
933 
/*
 * Switch back to the mount namespace referred to by `fd` and close `fd`.
 * A negative `fd` is ignored, so the -1 returned by enter_mount_ns() can be
 * passed straight through. A setns() failure is reported with perror() and
 * the descriptor is closed regardless.
 */
static void exit_mount_ns(int fd) {
  if (fd >= 0) {
    if (setns(fd, CLONE_NEWNS) != 0)
      perror("setns");
    close(fd);
  }
}
942 
bpf_attach_uprobe(int progfd,enum bpf_probe_attach_type attach_type,const char * ev_name,const char * binary_path,uint64_t offset,pid_t pid)943 int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
944                       const char *ev_name, const char *binary_path,
945                       uint64_t offset, pid_t pid)
946 {
947   char buf[PATH_MAX];
948   char event_alias[PATH_MAX];
949   static char *event_type = "uprobe";
950   int res, kfd = -1, pfd = -1, ns_fd = -1;
951   // Try create the uprobe Perf Event with perf_event_open API.
952   pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, event_type,
953                                            attach_type != BPF_PROBE_ENTRY);
954   // If failed, most likely Kernel doesn't support the new perf_event_open API
955   // yet. Try create the event using debugfs.
956   if (pfd < 0) {
957     snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
958     kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
959     if (kfd < 0) {
960       fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
961       goto error;
962     }
963 
964     res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
965     if (res < 0 || res >= (int)sizeof(event_alias)) {
966       fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
967       goto error;
968     }
969     res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
970                    event_type, event_alias, binary_path, (unsigned long)offset);
971     if (res < 0 || res >= (int)sizeof(buf)) {
972       fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
973       goto error;
974     }
975 
976     ns_fd = enter_mount_ns(pid);
977     if (write(kfd, buf, strlen(buf)) < 0) {
978       if (errno == EINVAL)
979         fprintf(stderr, "check dmesg output for possible cause\n");
980       goto error;
981     }
982     close(kfd);
983     kfd = -1;
984     exit_mount_ns(ns_fd);
985     ns_fd = -1;
986 
987     snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
988   }
989   // If perf_event_open succeeded, bpf_attach_tracing_event will use the created
990   // Perf Event FD directly and buf would be empty and unused.
991   // Otherwise it will read the event ID from the path in buf, create the
992   // Perf Event event using that ID, and updated value of pfd.
993   if (bpf_attach_tracing_event(progfd, buf, pid, &pfd) == 0)
994     return pfd;
995 
996 error:
997   if (kfd >= 0)
998     close(kfd);
999   exit_mount_ns(ns_fd);
1000   bpf_close_perf_event_fd(pfd);
1001   return -1;
1002 }
1003 
/*
 * Return 1 if `line` contains `event` (of length `event_len`) followed by
 * whitespace or the end of the string, 0 otherwise. The boundary check keeps
 * "..._bcc_123" from matching a line that defines "..._bcc_1234".
 */
static int probe_events_line_has_event(const char *line, const char *event,
                                       size_t event_len)
{
  const char *hit = line;

  while ((hit = strstr(hit, event)) != NULL) {
    char next = hit[event_len];

    if (next == '\0' || next == ' ' || next == '\t' || next == '\n')
      return 1;
    hit++;  // prefix of a longer name; keep scanning the rest of the line
  }
  return 0;
}

/*
 * Remove the "<ev_name>_bcc_<our pid>" event from the debugfs
 * "<event_type>_events" file (event_type is "kprobe" or "uprobe" for the
 * callers in this file).
 *
 * Returns 0 when the event was removed or was not listed at all, -1 when the
 * events file cannot be opened, a name does not fit the buffer, or the
 * removal write fails.
 */
static int bpf_detach_probe(const char *ev_name, const char *event_type)
{
  int kfd = -1, res;
  char buf[PATH_MAX];
  int found_event = 0;
  size_t bufsize = 0;
  char *cptr = NULL;
  FILE *fp;

  /*
   * For [k,u]probe created with perf_event_open (on newer kernel), it is
   * not necessary to clean it up in [k,u]probe_events. We first look up
   * the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
   * it is safe to skip the cleaning up process (write -:... to the file).
   */
  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  fp = fopen(buf, "re");
  if (!fp) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }

  // res is the length of the "<type>s/<name>_bcc_<pid>" token now held in buf.
  while (getline(&cptr, &bufsize, fp) != -1) {
    if (probe_events_line_has_event(cptr, buf, (size_t)res)) {
      found_event = 1;
      break;
    }
  }
  free(cptr);
  fclose(fp);
  fp = NULL;

  if (!found_event)
    return 0;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  // A "-:<group>/<event>" line asks the kernel to delete that event.
  res = snprintf(buf, sizeof(buf), "-:%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }
  if (write(kfd, buf, strlen(buf)) < 0) {
    fprintf(stderr, "write(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  close(kfd);
  return 0;

error:
  if (kfd >= 0)
    close(kfd);
  if (fp)
    fclose(fp);
  return -1;
}
1071 
/*
 * Detach the kprobe event named `ev_name`: forwards to bpf_detach_probe()
 * with the "kprobe" event type and returns its result unchanged.
 */
int bpf_detach_kprobe(const char *ev_name)
{
  static const char kprobe_type[] = "kprobe";

  return bpf_detach_probe(ev_name, kprobe_type);
}
1076 
/*
 * Detach the uprobe event named `ev_name`: forwards to bpf_detach_probe()
 * with the "uprobe" event type and returns its result unchanged.
 */
int bpf_detach_uprobe(const char *ev_name)
{
  static const char uprobe_type[] = "uprobe";

  return bpf_detach_probe(ev_name, uprobe_type);
}
1081 
1082 
/*
 * Attach the BPF program `progfd` to the tracepoint tp_category/tp_name.
 *
 * The debugfs event directory for the tracepoint is built here and handed to
 * bpf_attach_tracing_event() with no PID filter (-1).
 *
 * Returns the perf event descriptor on success, -1 if the path does not fit
 * the local buffer or the attach fails.
 */
int bpf_attach_tracepoint(int progfd, const char *tp_category,
                          const char *tp_name)
{
  char buf[256];
  int pfd = -1;
  int res;

  res = snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s",
                 tp_category, tp_name);
  // Refuse a truncated path instead of attaching to whatever it names.
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "Tracepoint name (%s/%s) is too long for buffer\n",
            tp_category, tp_name);
    return -1;
  }

  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

  bpf_close_perf_event_fd(pfd);
  return -1;
}
1097 
/*
 * Counterpart of bpf_attach_tracepoint(). There is currently nothing to undo,
 * so both arguments are ignored and 0 is always returned; the function exists
 * so that callers detach everything they attach.
 */
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
  (void)tp_category;
  (void)tp_name;
  return 0;
}
1105 
/*
 * Issue the BPF_RAW_TRACEPOINT_OPEN command for program `progfd` and the raw
 * tracepoint called `tp_name`.
 *
 * Returns the value of the bpf() syscall; a negative result is logged to
 * stderr together with strerror(errno) before being returned.
 */
int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
{
  union bpf_attr attr;
  int rc;

  memset(&attr, 0, sizeof(attr));
  attr.raw_tracepoint.prog_fd = progfd;
  attr.raw_tracepoint.name = ptr_to_u64(tp_name);

  rc = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
  if (rc >= 0)
    return rc;

  fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
  return rc;
}
1120 
/*
 * Create a perf_reader and back it with a software perf event configured for
 * BPF output (config 10), sampling every event as raw data.
 *
 *   raw_cb, lost_cb, cb_cookie, page_cnt  forwarded to perf_reader_new().
 *   pid, cpu                              forwarded to perf_event_open.
 *
 * The event descriptor is handed to the reader, the reader is mmap()ed and
 * the event is enabled. Returns the reader, or NULL on any failure (the
 * reader, if one was created, is released with perf_reader_free()).
 */
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                            perf_reader_lost_cb lost_cb, void *cb_cookie,
                            int pid, int cpu, int page_cnt) {
  struct perf_event_attr attr = {};
  struct perf_reader *reader;
  int pfd;

  reader = perf_reader_new(raw_cb, lost_cb, cb_cookie, page_cnt);
  if (reader == NULL)
    return NULL;

  attr.type = PERF_TYPE_SOFTWARE;
  attr.config = 10;  // PERF_COUNT_SW_BPF_OUTPUT
  attr.sample_type = PERF_SAMPLE_RAW;
  attr.sample_period = 1;
  attr.wakeup_events = 1;

  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    fprintf(stderr, "   (check your kernel for PERF_COUNT_SW_BPF_OUTPUT support, 4.4 or newer)\n");
    goto fail;
  }
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader) < 0)
    goto fail;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto fail;
  }

  return reader;

fail:
  perf_reader_free(reader);
  return NULL;
}
1161 
/*
 * Validate a perf event (type, config) pair before it is opened.
 *
 * Returns 0 when the pair is acceptable and 1 when it is rejected; a rejection
 * prints a type-specific reason followed by a summary line to stderr.
 *
 *   HARDWARE / SOFTWARE   config must be below the respective *_MAX value;
 *                         SOFTWARE config 10 (BPF_OUTPUT) is refused as well.
 *   HW_CACHE              config is split into three fields (bits 0-7,
 *                         bits 8-15 and bits 16 and up), each checked against
 *                         its own maximum.
 *   TRACEPOINT/BREAKPOINT always rejected.
 *   any other type        accepted without inspection.
 */
static int invalid_perf_config(uint32_t type, uint64_t config) {
  const char *reason = NULL;

  switch (type) {
  case PERF_TYPE_HARDWARE:
    if (config >= PERF_COUNT_HW_MAX)
      reason = "HARDWARE perf event config out of range\n";
    break;
  case PERF_TYPE_SOFTWARE:
    if (config >= PERF_COUNT_SW_MAX)
      reason = "SOFTWARE perf event config out of range\n";
    else if (config == 10 /* PERF_COUNT_SW_BPF_OUTPUT */)
      reason = "Unable to open or attach perf event for BPF_OUTPUT\n";
    break;
  case PERF_TYPE_HW_CACHE: {
    const uint64_t cache_id = config & 0xff;
    const uint64_t cache_op = (config >> 8) & 0xff;
    const uint64_t cache_result = config >> 16;

    if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX ||
        cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
        cache_id >= PERF_COUNT_HW_CACHE_MAX)
      reason = "HW_CACHE perf event config out of range\n";
    break;
  }
  case PERF_TYPE_TRACEPOINT:
  case PERF_TYPE_BREAKPOINT:
    reason = "Unable to open or attach TRACEPOINT or BREAKPOINT events\n";
    break;
  default:
    break;
  }

  if (reason == NULL)
    return 0;

  fputs(reason, stderr);
  fprintf(stderr, "Invalid perf event type %" PRIu32 " config %" PRIu64 "\n",
          type, config);
  return 1;
}
1200 
/*
 * Open and enable a perf event of the given `type` / `config` for `pid` on
 * `cpu`, with sample_period set to LONG_MAX.
 *
 * Returns the event descriptor, or -1 when the configuration is rejected by
 * invalid_perf_config(), perf_event_open fails, or the event cannot be
 * enabled (in which case the descriptor is closed again).
 */
int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) {
  struct perf_event_attr attr = {};
  int fd;

  if (invalid_perf_config(type, config))
    return -1;

  attr.type = type;
  attr.config = config;
  attr.sample_period = LONG_MAX;

  fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (fd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) >= 0)
    return fd;

  perror("ioctl(PERF_EVENT_IOC_ENABLE)");
  close(fd);
  return -1;
}
1227 
/*
 * Send an RTM_SETLINK request over a NETLINK_ROUTE socket that carries a
 * nested IFLA_XDP attribute for device `dev_name`.
 *
 *   dev_name  interface name, resolved with if_nametoindex().
 *   progfd    value placed in the IFLA_XDP_FD sub-attribute.
 *   flags     placed in an IFLA_XDP_FLAGS sub-attribute when non-zero;
 *             the sub-attribute is omitted when zero.
 *
 * One reply datagram is read and every message in it is checked against the
 * socket's netlink port id and the request sequence number. Returns 0 if no
 * message reports an error, -1 otherwise (errno is set to EBADMSG for a
 * pid/seq mismatch and to the reported code for an NLMSG_ERROR).
 */
int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) {
    struct sockaddr_nl sa;
    int sock, seq = 0, len, ret = -1;
    char buf[4096];
    struct nlattr *nla, *nla_xdp;
    struct {
        struct nlmsghdr  nh;
        struct ifinfomsg ifinfo;
        char             attrbuf[64];
    } req;
    struct nlmsghdr *nh;
    struct nlmsgerr *err;
    socklen_t addrlen;

    memset(&sa, 0, sizeof(sa));
    sa.nl_family = AF_NETLINK;

    sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
    if (sock < 0) {
        fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno));
        return -1;
    }

    if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    // Read back the bound address: sa.nl_pid is compared with the replies.
    addrlen = sizeof(sa);
    if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
        fprintf(stderr, "bpf: get sock name of netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    if (addrlen != sizeof(sa)) {
        // socklen_t is unsigned, so print it as such.
        fprintf(stderr, "bpf: wrong netlink address length: %u\n",
                (unsigned int)addrlen);
        goto cleanup;
    }

    memset(&req, 0, sizeof(req));
    req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
    req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    req.nh.nlmsg_type = RTM_SETLINK;
    req.nh.nlmsg_pid = 0;
    req.nh.nlmsg_seq = ++seq;
    req.ifinfo.ifi_family = AF_UNSPEC;
    req.ifinfo.ifi_index = if_nametoindex(dev_name);
    if (req.ifinfo.ifi_index == 0) {
        fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
        goto cleanup;
    }

    // The nested attribute starts right after the aligned ifinfomsg payload,
    // i.e. inside req.attrbuf.
    nla = (struct nlattr *)(((char *)&req)
                            + NLMSG_ALIGN(req.nh.nlmsg_len));
    nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;

    nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
    nla->nla_len = NLA_HDRLEN;

    // we specify the FD passed over by the user
    nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
    nla_xdp->nla_len = NLA_HDRLEN + sizeof(progfd);
    memcpy((char *)nla_xdp + NLA_HDRLEN, &progfd, sizeof(progfd));
    nla->nla_len += nla_xdp->nla_len;

    // parse flags as passed by the user
    if (flags) {
        nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
        nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
        nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
        memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
        nla->nla_len += nla_xdp->nla_len;
    }

    req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

    if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
        fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    len = recv(sock, buf, sizeof(buf), 0);
    if (len < 0) {
        fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    // NLMSG_NEXT decrements len as it advances through the datagram.
    for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len);
         nh = NLMSG_NEXT(nh, len)) {
        if (nh->nlmsg_pid != sa.nl_pid) {
            fprintf(stderr, "bpf: Wrong pid %u, expected %u\n",
                   nh->nlmsg_pid, sa.nl_pid);
            errno = EBADMSG;
            goto cleanup;
        }
        if (nh->nlmsg_seq != (unsigned int)seq) {
            // nlmsg_seq is an unsigned 32-bit field; seq is a plain int.
            fprintf(stderr, "bpf: Wrong seq %u, expected %d\n",
                   nh->nlmsg_seq, seq);
            errno = EBADMSG;
            goto cleanup;
        }
        switch (nh->nlmsg_type) {
            case NLMSG_ERROR:
                err = (struct nlmsgerr *)NLMSG_DATA(nh);
                // error == 0 is a positive acknowledgement, not a failure
                if (!err->error)
                    continue;
                fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-err->error));
                errno = -err->error;
                goto cleanup;
            case NLMSG_DONE:
                break;
            default:
                break;
        }
    }

    ret = 0;

cleanup:
    close(sock);
    return ret;
}
1348 
/*
 * Open a perf event from a caller-supplied attribute block, bind the BPF
 * program `progfd` to it and enable it.
 *
 *   perf_event_attr  passed to perf_event_open as the attribute pointer.
 *   pid, cpu, group_fd  passed through to perf_event_open.
 *   extra_flags      OR-ed with PERF_FLAG_FD_CLOEXEC for the open call.
 *
 * Returns the event descriptor, or -1 after a perror() message if opening,
 * PERF_EVENT_IOC_SET_BPF or PERF_EVENT_IOC_ENABLE fails; the descriptor is
 * closed when one of the ioctls fails.
 */
int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid,
                              int cpu, int group_fd, unsigned long extra_flags) {
  const char *failed_step;
  int fd;

  fd = syscall(__NR_perf_event_open, perf_event_attr, pid, cpu, group_fd,
               PERF_FLAG_FD_CLOEXEC | extra_flags);
  if (fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0)
    failed_step = "ioctl(PERF_EVENT_IOC_SET_BPF) failed";
  else if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0)
    failed_step = "ioctl(PERF_EVENT_IOC_ENABLE) failed";
  else
    return fd;

  perror(failed_step);
  close(fd);
  return -1;
}
1370 
/*
 * Build a perf_event_attr from the given type/config and sampling settings,
 * then open, bind and enable it via bpf_attach_perf_event_raw().
 *
 * Exactly one of `sample_period` and `sample_freq` must be non-zero; a
 * non-zero `sample_freq` selects frequency mode. The `inherit` bit is set
 * when `pid` is positive.
 *
 * Returns the event descriptor, or -1 if the configuration is rejected by
 * invalid_perf_config(), the sampling arguments are inconsistent, or the
 * raw attach fails.
 */
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  struct perf_event_attr attr = {};
  const int by_period = sample_period > 0;
  const int by_freq = sample_freq > 0;

  if (invalid_perf_config(ev_type, ev_config))
    return -1;

  // Both set or both unset is an error.
  if (by_period == by_freq) {
    fprintf(stderr, "Exactly one of sample_period / sample_freq should be set\n");
    return -1;
  }

  attr.type = ev_type;
  attr.config = ev_config;
  attr.inherit = (pid > 0) ? 1 : 0;
  if (by_freq) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  return bpf_attach_perf_event_raw(progfd, &attr, pid, cpu, group_fd, 0);
}
1398 
/*
 * Disable the perf event behind `fd` and close the descriptor.
 *
 * A negative `fd` is a no-op that returns 0. Otherwise both steps are always
 * attempted; each failure is reported with perror(). The return value is 0
 * when both succeed, else the result of the first step that failed.
 */
int bpf_close_perf_event_fd(int fd) {
  int first_failure = 0;
  int rc;

  if (fd < 0)
    return 0;

  rc = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
  if (rc != 0) {
    perror("ioctl(PERF_EVENT_IOC_DISABLE) failed");
    first_failure = rc;
  }

  rc = close(fd);
  if (rc != 0) {
    perror("close perf event FD failed");
    if (first_failure == 0)
      first_failure = rc;
  }

  return first_failure;
}
1415 
/*
 * Issue the BPF_OBJ_PIN command with `fd` as the object descriptor and
 * `pathname` as the target path. Returns the bpf() syscall result unchanged.
 */
int bpf_obj_pin(int fd, const char *pathname)
{
  union bpf_attr pin_attr;
  int rc;

  // Zero the whole union first so fields not set below are passed as zero.
  memset(&pin_attr, 0, sizeof(pin_attr));
  pin_attr.bpf_fd = fd;
  pin_attr.pathname = ptr_to_u64((void *)pathname);

  rc = syscall(__NR_bpf, BPF_OBJ_PIN, &pin_attr, sizeof(pin_attr));
  return rc;
}
1426 
/*
 * Issue the BPF_OBJ_GET command for `pathname`. Returns the bpf() syscall
 * result unchanged.
 */
int bpf_obj_get(const char *pathname)
{
  union bpf_attr get_attr;
  int rc;

  // Zero the whole union first so fields not set below are passed as zero.
  memset(&get_attr, 0, sizeof(get_attr));
  get_attr.pathname = ptr_to_u64((void *)pathname);

  rc = syscall(__NR_bpf, BPF_OBJ_GET, &get_attr, sizeof(get_attr));
  return rc;
}
1436 
/*
 * Issue the BPF_PROG_GET_NEXT_ID command starting from `start_id`.
 *
 * When the syscall returns 0, the id reported by the kernel is stored in
 * *next_id. Any other result is returned as-is and *next_id is left
 * untouched.
 */
int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id)
{
  union bpf_attr attr;
  int rc;

  memset(&attr, 0, sizeof(attr));
  attr.start_id = start_id;

  rc = syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
  if (rc != 0)
    return rc;

  *next_id = attr.next_id;
  return 0;
}
1451 
/*
 * Issue the BPF_PROG_GET_FD_BY_ID command for program id `id`. Returns the
 * bpf() syscall result unchanged.
 */
int bpf_prog_get_fd_by_id(uint32_t id)
{
  union bpf_attr lookup;
  int rc;

  // Zero the whole union first so fields not set below are passed as zero.
  memset(&lookup, 0, sizeof(lookup));
  lookup.prog_id = id;

  rc = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &lookup, sizeof(lookup));
  return rc;
}
1461 
/*
 * Issue the BPF_MAP_GET_FD_BY_ID command for map id `id`. Returns the bpf()
 * syscall result unchanged.
 */
int bpf_map_get_fd_by_id(uint32_t id)
{
  union bpf_attr lookup;
  int rc;

  // Zero the whole union first so fields not set below are passed as zero.
  memset(&lookup, 0, sizeof(lookup));
  lookup.map_id = id;

  rc = syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &lookup, sizeof(lookup));
  return rc;
}
1471