1 /*
2  * Copyright (c) 2015 PLUMgrid, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef _GNU_SOURCE
17 #define _GNU_SOURCE
18 #endif
19 
20 #include <arpa/inet.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <limits.h>
25 #include <linux/bpf.h>
26 #include <linux/bpf_common.h>
27 #include <linux/if_packet.h>
28 #include <linux/perf_event.h>
29 #include <linux/pkt_cls.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sched.h>
32 #include <linux/unistd.h>
33 #include <linux/version.h>
34 #include <net/ethernet.h>
35 #include <net/if.h>
36 #include <sched.h>
37 #include <stdbool.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <sys/ioctl.h>
42 #include <sys/resource.h>
43 #include <sys/stat.h>
44 #include <sys/types.h>
45 #include <unistd.h>
46 #include <linux/if_alg.h>
47 
48 #include "libbpf.h"
49 #include "perf_reader.h"
50 
51 // TODO: Remove this when CentOS 6 support is not needed anymore
52 #include "setns.h"
53 
/*
 * Fallback definitions for constants that the installed kernel/libc headers
 * may not export. Every definition is guarded, so a value supplied by the
 * headers always wins.
 * TODO: remove these defines when linux-libc-dev exports them properly
 */

/* Number of the bpf syscall, selected per architecture. */
#if !defined(__NR_bpf)
#  if defined(__powerpc64__)
#    define __NR_bpf 361
#  elif defined(__s390x__)
#    define __NR_bpf 351
#  elif defined(__aarch64__)
#    define __NR_bpf 280
#  else
#    define __NR_bpf 321
#  endif
#endif

/* Socket option used to attach a BPF program to a socket. */
#if !defined(SO_ATTACH_BPF)
#  define SO_ATTACH_BPF 50
#endif

/* ioctl request used to attach a BPF program to a perf event FD. */
#if !defined(PERF_EVENT_IOC_SET_BPF)
#  define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
#endif

/* perf_event_open flag used to open the event FD close-on-exec. */
#if !defined(PERF_FLAG_FD_CLOEXEC)
#  define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

// TODO: Remove this when CentOS 6 support is not needed anymore
#if !defined(AF_ALG)
#  define AF_ALG 38
#endif

/* Smaller of two values. Note: each argument may be evaluated twice. */
#define min(x, y) ((x) < (y) ? (x) : (y))
86 
/*
 * Describes one BPF helper function: its name (without the "bpf_" prefix)
 * and the Linux release that introduced it.
 */
struct bpf_helper {
  const char *name;
  const char *required_version;
};

/*
 * Helper table indexed by helper ID: the entry for helper ID n lives at
 * index n - 1 (bpf_print_hints() looks it up that way), so the order of the
 * entries must not be changed and new helpers must be appended.
 */
static const struct bpf_helper helpers[] = {
  {"map_lookup_elem", "3.19"},
  {"map_update_elem", "3.19"},
  {"map_delete_elem", "3.19"},
  {"probe_read", "4.1"},
  {"ktime_get_ns", "4.1"},
  {"trace_printk", "4.1"},
  {"get_prandom_u32", "4.1"},
  {"get_smp_processor_id", "4.1"},
  {"skb_store_bytes", "4.1"},
  {"l3_csum_replace", "4.1"},
  {"l4_csum_replace", "4.1"},
  {"tail_call", "4.2"},
  {"clone_redirect", "4.2"},
  {"get_current_pid_tgid", "4.2"},
  {"get_current_uid_gid", "4.2"},
  {"get_current_comm", "4.2"},
  {"get_cgroup_classid", "4.3"},
  {"skb_vlan_push", "4.3"},
  {"skb_vlan_pop", "4.3"},
  {"skb_get_tunnel_key", "4.3"},
  {"skb_set_tunnel_key", "4.3"},
  {"perf_event_read", "4.3"},
  {"redirect", "4.4"},
  {"get_route_realm", "4.4"},
  {"perf_event_output", "4.4"},
  {"skb_load_bytes", "4.5"},
  {"get_stackid", "4.6"},
  {"csum_diff", "4.6"},
  {"skb_get_tunnel_opt", "4.6"},
  {"skb_set_tunnel_opt", "4.6"},
  {"skb_change_proto", "4.8"},
  {"skb_change_type", "4.8"},
  {"skb_under_cgroup", "4.8"},
  {"get_hash_recalc", "4.8"},
  {"get_current_task", "4.8"},
  {"probe_write_user", "4.8"},
  {"current_task_under_cgroup", "4.9"},
  {"skb_change_tail", "4.9"},
  {"skb_pull_data", "4.9"},
  {"csum_update", "4.9"},
  {"set_hash_invalid", "4.9"},
  {"get_numa_node_id", "4.10"},
  {"skb_change_head", "4.10"},
  {"xdp_adjust_head", "4.10"},
  {"probe_read_str", "4.11"},
  {"get_socket_cookie", "4.12"},
  {"get_socket_uid", "4.12"},
  {"set_hash", "4.13"},
  {"setsockopt", "4.13"},
  {"skb_adjust_room", "4.13"},
  {"redirect_map", "4.14"},
  {"sk_redirect_map", "4.14"},
  {"sock_map_update", "4.14"},
  {"xdp_adjust_meta", "4.15"},
  {"perf_event_read_value", "4.15"},
  {"perf_prog_read_value", "4.15"},
  {"getsockopt", "4.15"},
  {"override_return", "4.16"},
  {"sock_ops_cb_flags_set", "4.16"},
  {"msg_redirect_map", "4.17"},
  {"msg_apply_bytes", "4.17"},
  {"msg_cork_bytes", "4.17"},
  {"msg_pull_data", "4.17"},
  {"bind", "4.17"},
  {"xdp_adjust_tail", "4.18"},
  {"skb_get_xfrm_state", "4.18"},
  {"get_stack", "4.18"},
  {"skb_load_bytes_relative", "4.18"},
  {"fib_lookup", "4.18"},
  {"sock_hash_update", "4.18"},
  {"msg_redirect_hash", "4.18"},
  {"sk_redirect_hash", "4.18"},
  {"lwt_push_encap", "4.18"},
  {"lwt_seg6_store_bytes", "4.18"},
  {"lwt_seg6_adjust_srh", "4.18"},
  {"lwt_seg6_action", "4.18"},
  {"rc_repeat", "4.18"},
  {"rc_keydown", "4.18"},
  {"skb_cgroup_id", "4.18"},
  {"get_current_cgroup_id", "4.18"},
  {"get_local_storage", "4.19"},
  {"sk_select_reuseport", "4.19"},
  {"skb_ancestor_cgroup_id", "4.19"},
  {"sk_lookup_tcp", "4.20"},
  {"sk_lookup_udp", "4.20"},
  {"sk_release", "4.20"},
  {"map_push_elem", "4.20"},
  {"map_pop_elem", "4.20"},
  // The kernel helper is bpf_map_peek_elem ("peek", not "peak").
  {"map_peek_elem", "4.20"},
  {"msg_push_data", "4.20"},
  // The development cycle after 4.20 was released as Linux 5.0; no 4.21
  // release exists, so report the version users can actually look for.
  {"msg_pop_data", "5.0"},
  {"rc_pointer_rel", "5.0"},
};
186 
/*
 * Convert a pointer into the 64-bit integer representation that the
 * union bpf_attr / perf_event_attr address fields in this file are filled
 * with. The pointer is first converted to unsigned long, then widened.
 */
static uint64_t ptr_to_u64(void *ptr)
{
  unsigned long address = (unsigned long) ptr;

  return (uint64_t) address;
}
191 
/*
 * Create a BPF map with the BPF_MAP_CREATE command.
 *
 * name may be NULL or empty; otherwise at most BPF_OBJ_NAME_LEN - 1 bytes of
 * it are copied into the request. Returns the result of the bpf syscall
 * (negative on failure, errno set by the syscall).
 *
 * Two retries are attempted on failure:
 *  - E2BIG/EINVAL with a name set: retry without the name.
 *  - EPERM: raise RLIMIT_MEMLOCK to unlimited and retry, since EPERM may be
 *    caused by an insufficient locked-memory limit rather than by missing
 *    permissions.
 */
int bpf_create_map(enum bpf_map_type map_type, const char *name,
                   int key_size, int value_size,
                   int max_entries, int map_flags)
{
  size_t name_len = name ? strlen(name) : 0;
  union bpf_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.map_type = map_type;
  attr.key_size = key_size;
  attr.value_size = value_size;
  attr.max_entries = max_entries;
  attr.map_flags = map_flags;
  // memcpy() from a NULL source is undefined even for a zero length, so only
  // copy when a non-empty name was actually supplied. attr is already zeroed,
  // which leaves the name empty and NUL-terminated otherwise.
  if (name_len)
    memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));

  int ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

  // The request failed and a name was set: clear the name and try again.
  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
    memset(attr.map_name, 0, BPF_OBJ_NAME_LEN);
    ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
  }

  if (ret < 0 && errno == EPERM) {
    // EPERM can mean either "no permission for bpf()" or "locked-memory
    // rlimit too low". Bump the limit to unlimited and retry once; if the
    // limit cannot be raised, the original failure is returned.
    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }
  }
  return ret;
}
226 
/*
 * Issue BPF_MAP_UPDATE_ELEM on map `fd`, passing the key and value pointers
 * and the update flags through unchanged. Returns the bpf syscall result.
 */
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
  union bpf_attr request;

  // Zero the whole union first so that every field not set below is 0.
  memset(&request, 0, sizeof(request));
  request.flags = flags;
  request.value = ptr_to_u64(value);
  request.key = ptr_to_u64(key);
  request.map_fd = fd;

  return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &request, sizeof(request));
}
238 
/*
 * Issue BPF_MAP_LOOKUP_ELEM on map `fd` for `key`, passing `value` as the
 * value buffer pointer. Returns the bpf syscall result.
 */
int bpf_lookup_elem(int fd, void *key, void *value)
{
  union bpf_attr request;

  // Zero the whole union first so that every field not set below is 0.
  memset(&request, 0, sizeof(request));
  request.value = ptr_to_u64(value);
  request.key = ptr_to_u64(key);
  request.map_fd = fd;

  return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &request, sizeof(request));
}
249 
/*
 * Issue BPF_MAP_DELETE_ELEM on map `fd` for `key`.
 * Returns the bpf syscall result.
 */
int bpf_delete_elem(int fd, void *key)
{
  union bpf_attr request;

  // Zero the whole union first so that every field not set below is 0.
  memset(&request, 0, sizeof(request));
  request.key = ptr_to_u64(key);
  request.map_fd = fd;

  return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &request, sizeof(request));
}
259 
/*
 * Fetch the first key of map `fd` into `key` (a buffer of key_size bytes).
 *
 * The fast path asks BPF_MAP_GET_NEXT_KEY with a NULL current key, which
 * kernels 4.12 and above answer with the first key. If that fails with
 * EFAULT, a key that is not present in the map is searched for and used as
 * the "current" key instead. Returns the syscall result, or -1 when no
 * absent key could be found.
 */
int bpf_get_first_key(int fd, void *key, size_t key_size)
{
  // Byte patterns used to build candidate keys for the fallback.
  static unsigned char fill_bytes[3] = {0, 0xff, 0x55};
  union bpf_attr request;
  size_t attempt;
  int rc;

  memset(&request, 0, sizeof(request));
  request.map_fd = fd;
  request.key = 0;
  request.next_key = ptr_to_u64(key);

  rc = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &request, sizeof(request));
  // Anything other than an EFAULT failure is the final answer.
  if (rc >= 0 || errno != EFAULT)
    return rc;

  // Fallback for older kernels: look for a key that does not exist.
  request.key = ptr_to_u64(key);
  for (attempt = 0; attempt < sizeof(fill_bytes); attempt++) {
    memset(key, fill_bytes[attempt], key_size);
    // The size of the map's value is unknown here, so the existence check is
    // done with a deliberately invalid value pointer: the lookup is expected
    // to fail, and ENOENT tells us the key is absent. ~0 is used instead of
    // NULL because NULL might trigger a page fault in the kernel and hurt
    // performance, while ~0 fails and returns fast.
    if (bpf_lookup_elem(fd, key, (void *)~0) >= 0)
      return -1;  // must never succeed with an invalid value pointer
    if (errno == ENOENT)
      return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &request,
                     sizeof(request));
  }

  return -1;
}
297 
/*
 * Issue BPF_MAP_GET_NEXT_KEY on map `fd`: `key` is passed as the current key
 * and `next_key` as the buffer for the following one.
 * Returns the bpf syscall result.
 */
int bpf_get_next_key(int fd, void *key, void *next_key)
{
  union bpf_attr request;

  // Zero the whole union first so that every field not set below is 0.
  memset(&request, 0, sizeof(request));
  request.next_key = ptr_to_u64(next_key);
  request.key = ptr_to_u64(key);
  request.map_fd = fd;

  return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &request, sizeof(request));
}
308 
bpf_print_hints(int ret,char * log)309 static void bpf_print_hints(int ret, char *log)
310 {
311   if (ret < 0)
312     fprintf(stderr, "bpf: Failed to load program: %s\n", strerror(errno));
313   if (log == NULL)
314     return;
315   else
316     fprintf(stderr, "%s\n", log);
317 
318   if (ret >= 0)
319     return;
320 
321   // The following error strings will need maintenance to match LLVM.
322 
323   // stack busting
324   if (strstr(log, "invalid stack off=-") != NULL) {
325     fprintf(stderr, "HINT: Looks like you exceeded the BPF stack limit. "
326       "This can happen if you allocate too much local variable storage. "
327       "For example, if you allocated a 1 Kbyte struct (maybe for "
328       "BPF_PERF_OUTPUT), busting a max stack of 512 bytes.\n\n");
329   }
330 
331   // didn't check NULL on map lookup
332   if (strstr(log, "invalid mem access 'map_value_or_null'") != NULL) {
333     fprintf(stderr, "HINT: The 'map_value_or_null' error can happen if "
334       "you dereference a pointer value from a map lookup without first "
335       "checking if that pointer is NULL.\n\n");
336   }
337 
338   // lacking a bpf_probe_read
339   if (strstr(log, "invalid mem access 'inv'") != NULL) {
340     fprintf(stderr, "HINT: The invalid mem access 'inv' error can happen "
341       "if you try to dereference memory without first using "
342       "bpf_probe_read() to copy it to the BPF stack. Sometimes the "
343       "bpf_probe_read is automatic by the bcc rewriter, other times "
344       "you'll need to be explicit.\n\n");
345   }
346 
347   // referencing global/static variables or read only data
348   if (strstr(log, "unknown opcode") != NULL) {
349     fprintf(stderr, "HINT: The 'unknown opcode' can happen if you reference "
350       "a global or static variable, or data in read-only section. For example,"
351       " 'char *p = \"hello\"' will result in p referencing a read-only section,"
352       " and 'char p[] = \"hello\"' will have \"hello\" stored on the stack.\n\n");
353   }
354 
355   // helper function not found in kernel
356   char *helper_str = strstr(log, "invalid func ");
357   if (helper_str != NULL) {
358     helper_str += strlen("invalid func ");
359     char *str = strchr(helper_str, '#');
360     if (str != NULL) {
361       helper_str = str + 1;
362     }
363     unsigned int helper_id = atoi(helper_str);
364     if (helper_id && helper_id < sizeof(helpers) / sizeof(struct bpf_helper)) {
365       struct bpf_helper helper = helpers[helper_id - 1];
366       fprintf(stderr, "HINT: bpf_%s missing (added in Linux %s).\n\n",
367               helper.name, helper.required_version);
368     }
369   }
370 }
371 #define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
372 
/*
 * Issue BPF_OBJ_GET_INFO_BY_FD for the program or map FD `prog_map_fd`.
 *
 * info is the buffer handed to the kernel and *info_len its length on entry.
 * When the syscall returns 0, *info_len is replaced by the length reported
 * back in the request; on failure it is left untouched.
 * Returns the bpf syscall result.
 */
int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len)
{
  union bpf_attr request;
  int rc;

  memset(&request, 0, sizeof(request));
  request.info.info = ptr_to_u64(info);
  request.info.info_len = *info_len;
  request.info.bpf_fd = prog_map_fd;

  rc = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &request, sizeof(request));
  if (rc == 0)
    *info_len = request.info.info_len;

  return rc;
}
389 
/*
 * Compute the tag of a BPF program in userspace.
 *
 * A copy of the instructions is made in which the immediates of map-FD
 * loads (BPF_LD | BPF_DW | BPF_IMM with src_reg == BPF_PSEUDO_MAP_FD, plus
 * the following code-0 instruction) are zeroed; the copy is hashed with
 * SHA-1 through an AF_ALG socket and the first 8 digest bytes, read as a
 * big-endian number, are stored in *ptag.
 *
 * insns/prog_len: the program and its length in bytes. Only whole
 * instructions are hashed. Returns 0 on success and -1 on any failure
 * (a message is printed to stderr).
 */
int bpf_prog_compute_tag(const struct bpf_insn *insns, int prog_len,
                         unsigned long long *ptag)
{
  struct sockaddr_alg alg = {
    .salg_family    = AF_ALG,
    .salg_type      = "hash",
    .salg_name      = "sha1",
  };
  unsigned char sha[20] = {0};
  struct bpf_insn *prog = NULL;
  unsigned long long tag = 0;
  bool map_ld_seen = false;
  size_t insn_cnt, prog_bytes, i;
  ssize_t nbytes;
  int shafd = -1, shafd2 = -1;
  int result = -1;

  if (prog_len < 0)
    return -1;
  insn_cnt = (size_t)prog_len / sizeof(struct bpf_insn);
  prog_bytes = insn_cnt * sizeof(struct bpf_insn);

  shafd = socket(AF_ALG, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
  if (shafd < 0) {
    fprintf(stderr, "sha1 socket not available %s\n", strerror(errno));
    goto out;
  }
  if (bind(shafd, (struct sockaddr *)&alg, sizeof(alg)) < 0) {
    fprintf(stderr, "sha1 bind fail %s\n", strerror(errno));
    goto out;
  }
  shafd2 = accept4(shafd, NULL, 0, SOCK_CLOEXEC);
  if (shafd2 < 0) {
    fprintf(stderr, "sha1 accept fail %s\n", strerror(errno));
    goto out;
  }

  // The scratch copy lives on the heap: its size is caller-controlled, so a
  // variable-length array could overflow the stack (and a zero-length VLA is
  // undefined). Always request at least one byte so an empty program does
  // not depend on what malloc(0) returns.
  prog = malloc(prog_bytes ? prog_bytes : 1);
  if (!prog) {
    fprintf(stderr, "sha1 alloc fail %s\n", strerror(errno));
    goto out;
  }

  for (i = 0; i < insn_cnt; i++) {
    prog[i] = insns[i];
    if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM) &&
        insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
        !map_ld_seen) {
      // First half of a map-FD load: the FD must not influence the hash.
      prog[i].imm = 0;
      map_ld_seen = true;
    } else if (insns[i].code == 0 && map_ld_seen) {
      // Second half of the same 64-bit load.
      prog[i].imm = 0;
      map_ld_seen = false;
    } else {
      map_ld_seen = false;
    }
  }

  // Write exactly the bytes that were copied, never more than the buffer.
  nbytes = write(shafd2, prog, prog_bytes);
  if (nbytes < 0 || (size_t)nbytes != prog_bytes) {
    fprintf(stderr, "sha1 write fail %s\n", strerror(errno));
    goto out;
  }

  nbytes = read(shafd2, sha, sizeof(sha));
  if (nbytes != (ssize_t)sizeof(sha)) {
    fprintf(stderr, "sha1 read fail %s\n", strerror(errno));
    goto out;
  }

  // Assemble the leading 8 digest bytes most-significant first. This yields
  // the same value on little- and big-endian hosts, whereas byte-swapping a
  // native 64-bit load is only right on little-endian machines.
  for (i = 0; i < 8; i++)
    tag = (tag << 8) | sha[i];
  *ptag = tag;
  result = 0;

out:
  free(prog);
  if (shafd2 >= 0)
    close(shafd2);
  if (shafd >= 0)
    close(shafd);
  return result;
}
456 
/*
 * Read the tag of the BPF program behind `fd` from /proc/self/fdinfo/<fd>.
 *
 * The "prog_tag:" line is located by name rather than by position, so extra
 * or missing fdinfo lines cannot cause an unrelated line to be parsed.
 *
 * Returns 0 and stores the tag in *ptag on success, -1 if the fdinfo file
 * cannot be opened, and -2 if it holds no parsable prog_tag line. *ptag is
 * only written on success.
 */
int bpf_prog_get_tag(int fd, unsigned long long *ptag)
{
  static const char tag_key[] = "prog_tag:";
  const size_t key_len = sizeof(tag_key) - 1;
  unsigned long long tag = 0;
  int found = 0;
  char line[64];

  snprintf(line, sizeof(line), "/proc/self/fdinfo/%d", fd);
  FILE *f = fopen(line, "re");
  if (!f)
    return -1;

  while (fgets(line, sizeof(line), f) != NULL) {
    if (strncmp(line, tag_key, key_len) != 0)
      continue;
    // The value is a hexadecimal number; %llx skips the separating blanks.
    if (sscanf(line + key_len, "%llx", &tag) == 1)
      found = 1;
    break;
  }
  fclose(f);

  if (!found)
    return -2;
  *ptag = tag;
  return 0;
}
483 
bpf_prog_load(enum bpf_prog_type prog_type,const char * name,const struct bpf_insn * insns,int prog_len,const char * license,unsigned kern_version,int log_level,char * log_buf,unsigned log_buf_size)484 int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
485                   const struct bpf_insn *insns, int prog_len,
486                   const char *license, unsigned kern_version,
487                   int log_level, char *log_buf, unsigned log_buf_size)
488 {
489   size_t name_len = name ? strlen(name) : 0;
490   union bpf_attr attr;
491   char *tmp_log_buf = NULL;
492   unsigned tmp_log_buf_size = 0;
493   int ret = 0, name_offset = 0;
494 
495   memset(&attr, 0, sizeof(attr));
496 
497   attr.prog_type = prog_type;
498   attr.kern_version = kern_version;
499   attr.license = ptr_to_u64((void *)license);
500 
501   attr.insns = ptr_to_u64((void *)insns);
502   attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
503   if (attr.insn_cnt > BPF_MAXINSNS) {
504     errno = EINVAL;
505     fprintf(stderr,
506             "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
507             strerror(errno), name, attr.insn_cnt, BPF_MAXINSNS);
508     return -1;
509   }
510 
511   attr.log_level = log_level;
512   if (attr.log_level > 0) {
513     if (log_buf_size > 0) {
514       // Use user-provided log buffer if availiable.
515       log_buf[0] = 0;
516       attr.log_buf = ptr_to_u64(log_buf);
517       attr.log_size = log_buf_size;
518     } else {
519       // Create and use temporary log buffer if user didn't provide one.
520       tmp_log_buf_size = LOG_BUF_SIZE;
521       tmp_log_buf = malloc(tmp_log_buf_size);
522       if (!tmp_log_buf) {
523         fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
524                 strerror(errno));
525         attr.log_level = 0;
526       } else {
527         tmp_log_buf[0] = 0;
528         attr.log_buf = ptr_to_u64(tmp_log_buf);
529         attr.log_size = tmp_log_buf_size;
530       }
531     }
532   }
533 
534   if (name_len) {
535     if (strncmp(name, "kprobe__", 8) == 0)
536       name_offset = 8;
537     else if (strncmp(name, "tracepoint__", 12) == 0)
538       name_offset = 12;
539     else if (strncmp(name, "raw_tracepoint__", 16) == 0)
540       name_offset = 16;
541     memcpy(attr.prog_name, name + name_offset,
542            min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
543   }
544 
545   ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
546   // BPF object name is not supported on older Kernels.
547   // If we failed due to this, clear the name and try again.
548   if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
549     memset(attr.prog_name, 0, BPF_OBJ_NAME_LEN);
550     ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
551   }
552 
553   if (ret < 0 && errno == EPERM) {
554     // When EPERM is returned, two reasons are possible:
555     //  1. user has no permissions for bpf()
556     //  2. user has insufficent rlimit for locked memory
557     // Unfortunately, there is no api to inspect the current usage of locked
558     // mem for the user, so an accurate calculation of how much memory to lock
559     // for this new program is difficult to calculate. As a hack, bump the limit
560     // to unlimited. If program load fails again, return the error.
561     struct rlimit rl = {};
562     if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
563       rl.rlim_max = RLIM_INFINITY;
564       rl.rlim_cur = rl.rlim_max;
565       if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
566         ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
567     }
568   }
569 
570   // The load has failed. Handle log message.
571   if (ret < 0) {
572     // User has provided a log buffer.
573     if (log_buf_size) {
574       // If logging is not already enabled, enable it and do the syscall again.
575       if (attr.log_level == 0) {
576         attr.log_level = 1;
577         attr.log_buf = ptr_to_u64(log_buf);
578         attr.log_size = log_buf_size;
579         ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
580       }
581       // Print the log message and return.
582       bpf_print_hints(ret, log_buf);
583       if (errno == ENOSPC)
584         fprintf(stderr, "bpf: log_buf size may be insufficient\n");
585       goto return_result;
586     }
587 
588     // User did not provide log buffer. We will try to increase size of
589     // our temporary log buffer to get full error message.
590     if (tmp_log_buf)
591       free(tmp_log_buf);
592     tmp_log_buf_size = LOG_BUF_SIZE;
593     if (attr.log_level == 0)
594       attr.log_level = 1;
595     for (;;) {
596       tmp_log_buf = malloc(tmp_log_buf_size);
597       if (!tmp_log_buf) {
598         fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
599                 strerror(errno));
600         goto return_result;
601       }
602       tmp_log_buf[0] = 0;
603       attr.log_buf = ptr_to_u64(tmp_log_buf);
604       attr.log_size = tmp_log_buf_size;
605 
606       ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
607       if (ret < 0 && errno == ENOSPC) {
608         // Temporary buffer size is not enough. Double it and try again.
609         free(tmp_log_buf);
610         tmp_log_buf = NULL;
611         tmp_log_buf_size <<= 1;
612       } else {
613         break;
614       }
615     }
616   }
617 
618   // Check if we should print the log message if log_level is not 0,
619   // either specified by user or set due to error.
620   if (attr.log_level > 0) {
621     // Don't print if user enabled logging and provided log buffer,
622     // but there is no error.
623     if (log_buf && ret < 0)
624       bpf_print_hints(ret, log_buf);
625     else if (tmp_log_buf)
626       bpf_print_hints(ret, tmp_log_buf);
627   }
628 
629 return_result:
630   if (tmp_log_buf)
631     free(tmp_log_buf);
632   return ret;
633 }
634 
/*
 * Open a non-blocking, close-on-exec PF_PACKET raw socket for all ethernet
 * protocols and, when `name` is a non-empty interface name, bind it to that
 * interface. Returns the socket FD, or -1 on failure (a message is printed
 * to stderr and the socket is closed).
 */
int bpf_open_raw_sock(const char *name)
{
  struct sockaddr_ll addr;
  int fd;

  fd = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC,
              htons(ETH_P_ALL));
  if (fd < 0) {
    fprintf(stderr, "cannot create raw socket\n");
    return -1;
  }

  /* No interface name (NULL or empty): hand back the unbound socket. */
  if (name == NULL || name[0] == '\0')
    return fd;

  memset(&addr, 0, sizeof(addr));
  addr.sll_family = AF_PACKET;
  addr.sll_protocol = htons(ETH_P_ALL);
  addr.sll_ifindex = if_nametoindex(name);
  if (addr.sll_ifindex == 0) {
    fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
    goto fail;
  }

  if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
    fprintf(stderr, "bind to %s: %s\n", name, strerror(errno));
    goto fail;
  }

  return fd;

fail:
  close(fd);
  return -1;
}
667 
bpf_attach_socket(int sock,int prog)668 int bpf_attach_socket(int sock, int prog) {
669   return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
670 }
671 
672 #define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
bpf_find_probe_type(const char * event_type)673 static int bpf_find_probe_type(const char *event_type)
674 {
675   int fd;
676   int ret;
677   char buf[PATH_MAX];
678 
679   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
680   if (ret < 0 || ret >= (int)sizeof(buf))
681     return -1;
682 
683   fd = open(buf, O_RDONLY | O_CLOEXEC);
684   if (fd < 0)
685     return -1;
686   ret = read(fd, buf, sizeof(buf));
687   close(fd);
688   if (ret < 0 || ret >= (int)sizeof(buf))
689     return -1;
690   errno = 0;
691   ret = (int)strtol(buf, NULL, 10);
692   return errno ? -1 : ret;
693 }
694 
695 #define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
bpf_get_retprobe_bit(const char * event_type)696 static int bpf_get_retprobe_bit(const char *event_type)
697 {
698   int fd;
699   int ret;
700   char buf[PATH_MAX];
701 
702   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
703   if (ret < 0 || ret >= (int)sizeof(buf))
704     return -1;
705 
706   fd = open(buf, O_RDONLY | O_CLOEXEC);
707   if (fd < 0)
708     return -1;
709   ret = read(fd, buf, sizeof(buf));
710   close(fd);
711   if (ret < 0 || ret >= (int)sizeof(buf))
712     return -1;
713   if (strncmp(buf, "config:", strlen("config:")))
714     return -1;
715   errno = 0;
716   ret = (int)strtol(buf + strlen("config:"), NULL, 10);
717   return errno ? -1 : ret;
718 }
719 
/*
 * new kernel API allows creating [k,u]probe with perf_event_open, which
 * makes it easier to clean up the [k,u]probe. This function tries to
 * create pfd with the new API.
 *
 * name is passed as config1 (function name or binary path), offs as config2
 * (address or offset). event_type names the event source whose sysfs
 * entries supply the perf type and the "return probe" config bit.
 * Returns the perf event FD from perf_event_open, or -1 on failure.
 */
static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
             int pid, char *event_type, int is_return)
{
  struct perf_event_attr attr = {};
  int type = bpf_find_probe_type(event_type);
  int is_return_bit = bpf_get_retprobe_bit(event_type);
  int cpu = 0;

  if (type < 0 || is_return_bit < 0)
    return -1;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  if (is_return) {
    // attr.config is 64 bits wide: shift a 64-bit one, and refuse bit
    // positions that do not exist instead of invoking undefined behavior.
    if (is_return_bit >= 64)
      return -1;
    attr.config |= 1ULL << is_return_bit;
  }

  /*
   * struct perf_event_attr in latest perf_event.h has the following
   * extension to config1 and config2. To keep bcc compatible with
   * older perf_event.h, we use config1 and config2 here instead of
   * kprobe_func, uprobe_path, kprobe_addr, and probe_offset.
   *
   * union {
   *  __u64 bp_addr;
   *  __u64 kprobe_func;
   *  __u64 uprobe_path;
   *  __u64 config1;
   * };
   * union {
   *   __u64 bp_len;
   *   __u64 kprobe_addr;
   *   __u64 probe_offset;
   *   __u64 config2;
   * };
   */
  attr.config2 = offs;  /* config2 here is kprobe_addr or probe_offset */
  attr.size = sizeof(attr);
  attr.type = type;
  /* config1 here is kprobe_func or uprobe_path */
  attr.config1 = ptr_to_u64((void *)name);
  // PID filter is only possible for uprobe events.
  if (pid < 0)
    pid = -1;
  // perf_event_open API doesn't allow both pid and cpu to be -1.
  // So only set it to -1 when PID is not -1.
  // Tracing events do not do CPU filtering in any cases.
  if (pid != -1)
    cpu = -1;
  return syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */,
                 PERF_FLAG_FD_CLOEXEC);
}
775 
776 // When a valid Perf Event FD provided through pfd, it will be used to enable
777 // and attach BPF program to the event, and event_path will be ignored.
778 // Otherwise, event_path is expected to contain the path to the event in debugfs
779 // and it will be used to open the Perf Event FD.
780 // In either case, if the attach partially failed (such as issue with the
781 // ioctl operations), the **caller** need to clean up the Perf Event FD, either
782 // provided by the caller or opened here.
bpf_attach_tracing_event(int progfd,const char * event_path,int pid,int * pfd)783 static int bpf_attach_tracing_event(int progfd, const char *event_path, int pid,
784                                     int *pfd)
785 {
786   int efd, cpu = 0;
787   ssize_t bytes;
788   char buf[PATH_MAX];
789   struct perf_event_attr attr = {};
790   // Caller did not provided a valid Perf Event FD. Create one with the debugfs
791   // event path provided.
792   if (*pfd < 0) {
793     snprintf(buf, sizeof(buf), "%s/id", event_path);
794     efd = open(buf, O_RDONLY | O_CLOEXEC, 0);
795     if (efd < 0) {
796       fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
797       return -1;
798     }
799 
800     bytes = read(efd, buf, sizeof(buf));
801     if (bytes <= 0 || bytes >= (int)sizeof(buf)) {
802       fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
803       close(efd);
804       return -1;
805     }
806     close(efd);
807     buf[bytes] = '\0';
808     attr.config = strtol(buf, NULL, 0);
809     attr.type = PERF_TYPE_TRACEPOINT;
810     attr.sample_period = 1;
811     attr.wakeup_events = 1;
812     // PID filter is only possible for uprobe events.
813     if (pid < 0)
814       pid = -1;
815     // perf_event_open API doesn't allow both pid and cpu to be -1.
816     // So only set it to -1 when PID is not -1.
817     // Tracing events do not do CPU filtering in any cases.
818     if (pid != -1)
819       cpu = -1;
820     *pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
821     if (*pfd < 0) {
822       fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
823       return -1;
824     }
825   }
826 
827   if (ioctl(*pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
828     perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
829     return -1;
830   }
831   if (ioctl(*pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
832     perror("ioctl(PERF_EVENT_IOC_ENABLE)");
833     return -1;
834   }
835 
836   return 0;
837 }
838 
bpf_attach_kprobe(int progfd,enum bpf_probe_attach_type attach_type,const char * ev_name,const char * fn_name,uint64_t fn_offset)839 int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
840                       const char *ev_name, const char *fn_name, uint64_t fn_offset)
841 {
842   int kfd, pfd = -1;
843   char buf[256];
844   char event_alias[128];
845   static char *event_type = "kprobe";
846   bool use_debugfs = false;
847 
848   // Try create the kprobe Perf Event with perf_event_open API.
849   pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, event_type,
850                                            attach_type != BPF_PROBE_ENTRY);
851   // If failed, most likely Kernel doesn't support the new perf_event_open API
852   // yet. Try create the event using debugfs.
853   if (pfd < 0) {
854     snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events", event_type);
855     kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
856     if (kfd < 0) {
857       use_debugfs = true;
858       snprintf(buf, sizeof(buf),
859                "/sys/kernel/debug/tracing/%s_events", event_type);
860       kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
861       if (kfd < 0) {
862         fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
863         goto error;
864       }
865     }
866 
867     snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
868 
869     if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY)
870       snprintf(buf, sizeof(buf), "p:%ss/%s %s+%"PRIu64,
871                event_type, event_alias, fn_name, fn_offset);
872     else
873       snprintf(buf, sizeof(buf), "%c:%ss/%s %s",
874                attach_type == BPF_PROBE_ENTRY ? 'p' : 'r',
875                event_type, event_alias, fn_name);
876 
877     if (write(kfd, buf, strlen(buf)) < 0) {
878       if (errno == ENOENT)
879          fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n");
880       else
881          fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno));
882       close(kfd);
883       goto error;
884     }
885     close(kfd);
886     if (use_debugfs) {
887       snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type,
888                event_alias);
889     } else {
890       snprintf(buf, sizeof(buf), "/sys/kernel/tracing/events/%ss/%s", event_type,
891                event_alias);
892     }
893   }
894   // If perf_event_open succeeded, bpf_attach_tracing_event will use the created
895   // Perf Event FD directly and buf would be empty and unused.
896   // Otherwise it will read the event ID from the path in buf, create the
897   // Perf Event event using that ID, and updated value of pfd.
898   if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
899     return pfd;
900 
901 error:
902   bpf_close_perf_event_fd(pfd);
903   return -1;
904 }
905 
/*
 * Move the caller into the mount namespace of process `pid`.
 *
 * Returns an open fd referring to the caller's original mount namespace
 * (/proc/self/ns/mnt) when the switch was performed; pass it to
 * exit_mount_ns() to switch back. Returns -1 when pid is negative, when any
 * step fails, or when both namespace files report the same inode number
 * (no switch is needed, so nothing has to be undone).
 */
static int enter_mount_ns(int pid) {
  char target_path[64];
  struct stat cur_st, tgt_st;
  int cur_fd, tgt_fd;
  int written;

  if (pid < 0)
    return -1;

  written = snprintf(target_path, sizeof(target_path), "/proc/%d/ns/mnt", pid);
  if ((size_t)written >= sizeof(target_path))
    return -1;

  cur_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
  if (cur_fd < 0) {
    perror("open(/proc/self/ns/mnt)");
    return -1;
  }

  tgt_fd = open(target_path, O_RDONLY | O_CLOEXEC);
  if (tgt_fd < 0) {
    perror("open(/proc/<pid>/ns/mnt)");
    close(cur_fd);
    return -1;
  }

  if (fstat(cur_fd, &cur_st) != 0) {
    perror("fstat(self_fd)");
  } else if (fstat(tgt_fd, &tgt_st) != 0) {
    perror("fstat(target_fd)");
  } else if (cur_st.st_ino == tgt_st.st_ino) {
    // Target and current namespace are the same: skip setns and just
    // release both fds below.
  } else if (setns(tgt_fd, CLONE_NEWNS) != 0) {
    perror("setns(target)");
  } else {
    // Switched: keep the fd of the original namespace for the way back.
    close(tgt_fd);
    return cur_fd;
  }

  close(cur_fd);
  close(tgt_fd);
  return -1;
}
958 
/*
 * Switch back to the mount namespace referred to by `fd` (as returned by
 * enter_mount_ns()) and close `fd`. A negative fd is ignored. A setns
 * failure is reported with perror; the fd is closed either way.
 */
static void exit_mount_ns(int fd) {
  if (fd >= 0) {
    if (setns(fd, CLONE_NEWNS) != 0)
      perror("setns");
    close(fd);
  }
}
967 
bpf_attach_uprobe(int progfd,enum bpf_probe_attach_type attach_type,const char * ev_name,const char * binary_path,uint64_t offset,pid_t pid)968 int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
969                       const char *ev_name, const char *binary_path,
970                       uint64_t offset, pid_t pid)
971 {
972   char buf[PATH_MAX];
973   char event_alias[PATH_MAX];
974   static char *event_type = "uprobe";
975   int res, kfd = -1, pfd = -1, ns_fd = -1;
976   bool use_debugfs = false;
977   // Try create the uprobe Perf Event with perf_event_open API.
978   pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, event_type,
979                                            attach_type != BPF_PROBE_ENTRY);
980   // If failed, most likely Kernel doesn't support the new perf_event_open API
981   // yet. Try create the event using debugfs.
982   if (pfd < 0) {
983     snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events", event_type);
984     kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
985     if (kfd < 0) {
986       use_debugfs = true;
987       snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
988       kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
989       if (kfd < 0) {
990         fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
991         goto error;
992       }
993     }
994 
995     res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
996     if (res < 0 || res >= (int)sizeof(event_alias)) {
997       fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
998       goto error;
999     }
1000     res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
1001                    event_type, event_alias, binary_path, (unsigned long)offset);
1002     if (res < 0 || res >= (int)sizeof(buf)) {
1003       fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
1004       goto error;
1005     }
1006 
1007     ns_fd = enter_mount_ns(pid);
1008     if (write(kfd, buf, strlen(buf)) < 0) {
1009       if (errno == EINVAL)
1010         fprintf(stderr, "check dmesg output for possible cause\n");
1011       goto error;
1012     }
1013     close(kfd);
1014     kfd = -1;
1015     exit_mount_ns(ns_fd);
1016     ns_fd = -1;
1017     if (use_debugfs) {
1018       snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type,
1019                event_alias);
1020     } else {
1021       snprintf(buf, sizeof(buf), "/sys/kernel/tracing/events/%ss/%s", event_type, event_alias);
1022     }
1023   }
1024   // If perf_event_open succeeded, bpf_attach_tracing_event will use the created
1025   // Perf Event FD directly and buf would be empty and unused.
1026   // Otherwise it will read the event ID from the path in buf, create the
1027   // Perf Event event using that ID, and updated value of pfd.
1028   if (bpf_attach_tracing_event(progfd, buf, pid, &pfd) == 0)
1029     return pfd;
1030 
1031 error:
1032   if (kfd >= 0)
1033     close(kfd);
1034   exit_mount_ns(ns_fd);
1035   bpf_close_perf_event_fd(pfd);
1036   return -1;
1037 }
1038 
/*
 * Remove the "<event_type>s/<ev_name>_bcc_<pid of this process>" probe from
 * [k,u]probe_events, if it is listed there.
 *
 * event_type is "kprobe" or "uprobe" (see the two public wrappers).
 * The tracefs mount (/sys/kernel/tracing) is tried first, then the debugfs
 * one (/sys/kernel/debug/tracing).
 *
 * Returns 0 when the event is not listed (nothing to clean up) or was
 * removed, and -1 on any failure.
 */
static int bpf_detach_probe(const char *ev_name, const char *event_type)
{
  int kfd = -1, res;
  char buf[PATH_MAX];
  int found_event = 0;
  size_t bufsize = 0;
  char *cptr = NULL;
  FILE *fp;
  bool use_debugfs = false;

  /*
   * For [k,u]probe created with perf_event_open (on newer kernel), it is
   * not necessary to clean it up in [k,u]probe_events. We first look up
   * the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
   * it is safe to skip the cleaning up process (write -:... to the file).
   */
  snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events", event_type);
  fp = fopen(buf, "re");
  if (!fp) {
    use_debugfs = true;
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    fp = fopen(buf, "re");
    if (!fp) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }
  }

  res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }

  while (!found_event && getline(&cptr, &bufsize, fp) != -1) {
    const char *hit = cptr;

    /*
     * A bare substring match is not enough: "..._bcc_123" is a prefix of
     * "..._bcc_1234", an event owned by a different process. Only accept a
     * match that is followed by the end of the event-name token.
     * res is the length of the needle, so hit[res] is at worst the
     * terminating NUL of the line.
     */
    while ((hit = strstr(hit, buf)) != NULL) {
      char next = hit[res];

      if (next == '\0' || next == ' ' || next == '\t' || next == '\n') {
        found_event = 1;
        break;
      }
      hit++;
    }
  }
  free(cptr);
  fclose(fp);
  fp = NULL;

  if (!found_event)
    return 0;
  if (use_debugfs) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  } else {
    snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events", event_type);
  }
  kfd = open(buf, O_WRONLY | O_APPEND | O_CLOEXEC, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  // "-:<group>/<event>" asks the kernel to delete the probe definition.
  res = snprintf(buf, sizeof(buf), "-:%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }
  if (write(kfd, buf, strlen(buf)) < 0) {
    fprintf(stderr, "write(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  close(kfd);
  return 0;

error:
  if (kfd >= 0)
    close(kfd);
  if (fp)
    fclose(fp);
  return -1;
}
1115 
/*
 * Detach the kprobe event named ev_name.
 * Thin wrapper: forwards to bpf_detach_probe() with the "kprobe" event type
 * and returns its result unchanged.
 */
int bpf_detach_kprobe(const char *ev_name)
{
  int rc = bpf_detach_probe(ev_name, "kprobe");

  return rc;
}
1120 
/*
 * Detach the uprobe event named ev_name.
 * Thin wrapper: forwards to bpf_detach_probe() with the "uprobe" event type
 * and returns its result unchanged.
 */
int bpf_detach_uprobe(const char *ev_name)
{
  int rc = bpf_detach_probe(ev_name, "uprobe");

  return rc;
}
1125 
1126 
/*
 * Attach BPF program `progfd` to the tracepoint tp_category/tp_name.
 *
 * The event directory is looked up under /sys/kernel/tracing first and
 * under /sys/kernel/debug/tracing second.
 *
 * Returns the perf event fd filled in by bpf_attach_tracing_event() on
 * success, or -1 when both attempts fail.
 */
int bpf_attach_tracepoint(int progfd, const char *tp_category,
                          const char *tp_name)
{
  char event_path[256];
  int pfd = -1;

  snprintf(event_path, sizeof(event_path), "/sys/kernel/tracing/events/%s/%s",
           tp_category, tp_name);
  if (bpf_attach_tracing_event(progfd, event_path, -1 /* PID */, &pfd) != 0) {
    // tracefs did not work, try debugfs next
    snprintf(event_path, sizeof(event_path),
             "/sys/kernel/debug/tracing/events/%s/%s", tp_category, tp_name);
    if (bpf_attach_tracing_event(progfd, event_path, -1 /* PID */, &pfd) != 0) {
      bpf_close_perf_event_fd(pfd);
      return -1;
    }
  }

  return pfd;
}
1147 
/*
 * Detach counterpart of bpf_attach_tracepoint().
 *
 * Right now there is nothing to do, but it is a good idea to encourage
 * callers to detach anything they attach. Both arguments are ignored.
 * Always returns 0.
 */
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
  // Deliberately unused.
  (void)tp_category;
  (void)tp_name;
  return 0;
}
1155 
/*
 * Issue the BPF_RAW_TRACEPOINT_OPEN command for program `progfd` and the
 * raw tracepoint named `tp_name`.
 *
 * Returns the value of the bpf() syscall; a negative result is also
 * reported on stderr together with strerror(errno).
 */
int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
{
  union bpf_attr attr;
  int fd;

  // The whole union is zeroed before the two used fields are filled in.
  memset(&attr, 0, sizeof(attr));
  attr.raw_tracepoint.prog_fd = progfd;
  attr.raw_tracepoint.name = ptr_to_u64(tp_name);

  fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
  if (fd >= 0)
    return fd;

  fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
  return fd;
}
1170 
/*
 * Create a perf_reader bound to a software perf event with config 10
 * (PERF_COUNT_SW_BPF_OUTPUT) opened for `pid` / `cpu`.
 *
 * raw_cb, lost_cb, cb_cookie and page_cnt are handed to perf_reader_new().
 * The event is opened with PERF_SAMPLE_RAW, sample_period 1 and
 * wakeup_events 1, attached to the reader, mmap'ed and enabled.
 *
 * Returns the reader (as void *) on success. On any failure the reader, if
 * it was created, is released with perf_reader_free() and NULL is returned.
 */
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                            perf_reader_lost_cb lost_cb, void *cb_cookie,
                            int pid, int cpu, int page_cnt) {
  struct perf_event_attr attr = {};
  struct perf_reader *reader;
  int pfd;

  reader = perf_reader_new(raw_cb, lost_cb, cb_cookie, page_cnt);
  if (!reader)
    return NULL;

  attr.type = PERF_TYPE_SOFTWARE;
  attr.config = 10;//PERF_COUNT_SW_BPF_OUTPUT;
  attr.sample_type = PERF_SAMPLE_RAW;
  attr.sample_period = 1;
  attr.wakeup_events = 1;

  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    fprintf(stderr, "   (check your kernel for PERF_COUNT_SW_BPF_OUTPUT support, 4.4 or newer)\n");
  } else {
    perf_reader_set_fd(reader, pfd);

    if (perf_reader_mmap(reader) >= 0) {
      if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) >= 0)
        return reader;
      perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    }
  }

  perf_reader_free(reader);
  return NULL;
}
1211 
/*
 * Validate a perf event (type, config) pair before it is opened.
 *
 * Rejected combinations:
 *  - PERF_TYPE_HARDWARE with config >= PERF_COUNT_HW_MAX
 *  - PERF_TYPE_SOFTWARE with config >= PERF_COUNT_SW_MAX, or config 10
 *    (PERF_COUNT_SW_BPF_OUTPUT)
 *  - PERF_TYPE_HW_CACHE when any of the three packed fields (bits 0-7,
 *    bits 8-15, bits 16 and up) is out of range
 *  - PERF_TYPE_TRACEPOINT and PERF_TYPE_BREAKPOINT, always
 * Any other type is accepted without looking at config.
 *
 * Returns 0 when the pair is acceptable. Returns 1 after printing the
 * specific reason and a summary line to stderr otherwise.
 */
static int invalid_perf_config(uint32_t type, uint64_t config) {
  const char *reason = NULL;

  if (type == PERF_TYPE_HARDWARE) {
    if (config >= PERF_COUNT_HW_MAX)
      reason = "HARDWARE perf event config out of range\n";
  } else if (type == PERF_TYPE_SOFTWARE) {
    if (config >= PERF_COUNT_SW_MAX)
      reason = "SOFTWARE perf event config out of range\n";
    else if (config == 10 /* PERF_COUNT_SW_BPF_OUTPUT */)
      reason = "Unable to open or attach perf event for BPF_OUTPUT\n";
  } else if (type == PERF_TYPE_HW_CACHE) {
    uint64_t result_id = config >> 16;
    uint64_t op_id = (config >> 8) & 0xff;
    uint64_t cache_id = config & 0xff;

    if (result_id >= PERF_COUNT_HW_CACHE_RESULT_MAX ||
        op_id >= PERF_COUNT_HW_CACHE_OP_MAX ||
        cache_id >= PERF_COUNT_HW_CACHE_MAX)
      reason = "HW_CACHE perf event config out of range\n";
  } else if (type == PERF_TYPE_TRACEPOINT || type == PERF_TYPE_BREAKPOINT) {
    reason = "Unable to open or attach TRACEPOINT or BREAKPOINT events\n";
  }

  if (reason == NULL)
    return 0;

  fputs(reason, stderr);
  fprintf(stderr, "Invalid perf event type %" PRIu32 " config %" PRIu64 "\n",
          type, config);
  return 1;
}
1250 
/*
 * Open and enable a perf event of the given type/config for `pid` / `cpu`.
 *
 * The pair is first checked with invalid_perf_config(). The event is opened
 * with sample_period set to LONG_MAX and the PERF_FLAG_FD_CLOEXEC flag.
 *
 * Returns the perf event fd, or -1 when the config is rejected, the open
 * fails, or the event cannot be enabled (the fd is closed in that case).
 */
int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) {
  struct perf_event_attr attr = {};
  int fd;

  if (invalid_perf_config(type, config))
    return -1;

  attr.type = type;
  attr.config = config;
  attr.sample_period = LONG_MAX;

  fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (fd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) >= 0)
    return fd;

  perror("ioctl(PERF_EVENT_IOC_ENABLE)");
  close(fd);
  return -1;
}
1277 
/*
 * Attach the XDP program `progfd` to network device `dev_name` through an
 * RTM_SETLINK rtnetlink request.
 *
 * The request carries a nested IFLA_XDP attribute holding IFLA_XDP_FD
 * (progfd) and, when `flags` is non-zero, IFLA_XDP_FLAGS (flags). One reply
 * datagram is read; every message in it must carry our netlink port id and
 * sequence number, and an NLMSG_ERROR message with a non-zero error code
 * fails the call.
 *
 * Returns 0 on success and -1 on failure (a diagnostic is printed to
 * stderr). The netlink socket is always closed before returning.
 */
int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) {
    struct sockaddr_nl sa;
    int sock, seq = 0, len, ret = -1;
    char buf[4096];
    struct nlattr *nla, *nla_xdp;
    struct {
        struct nlmsghdr  nh;
        struct ifinfomsg ifinfo;
        char             attrbuf[64];
    } req;
    struct nlmsghdr *nh;
    struct nlmsgerr *err;
    socklen_t addrlen;

    memset(&sa, 0, sizeof(sa));
    sa.nl_family = AF_NETLINK;

    sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
    if (sock < 0) {
        fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno));
        return -1;
    }

    if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    // Read back the bound address: sa.nl_pid is compared against the
    // nlmsg_pid of every reply message below.
    addrlen = sizeof(sa);
    if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
        fprintf(stderr, "bpf: get sock name of netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    if (addrlen != sizeof(sa)) {
        // socklen_t is unsigned: print it as such.
        fprintf(stderr, "bpf: wrong netlink address length: %u\n",
                (unsigned int)addrlen);
        goto cleanup;
    }

    memset(&req, 0, sizeof(req));
    req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
    req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    req.nh.nlmsg_type = RTM_SETLINK;
    req.nh.nlmsg_pid = 0;
    req.nh.nlmsg_seq = ++seq;
    req.ifinfo.ifi_family = AF_UNSPEC;
    req.ifinfo.ifi_index = if_nametoindex(dev_name);
    if (req.ifinfo.ifi_index == 0) {
        fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
        goto cleanup;
    }

    // The nested IFLA_XDP attribute starts right after the aligned
    // header + ifinfomsg, i.e. inside req.attrbuf.
    nla = (struct nlattr *)(((char *)&req)
                            + NLMSG_ALIGN(req.nh.nlmsg_len));
    nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;

    nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
    nla->nla_len = NLA_HDRLEN;

    // we specify the FD passed over by the user
    nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
    nla_xdp->nla_len = NLA_HDRLEN + sizeof(progfd);
    memcpy((char *)nla_xdp + NLA_HDRLEN, &progfd, sizeof(progfd));
    nla->nla_len += nla_xdp->nla_len;

    // append the flags passed by the user, if any
    if (flags) {
        nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
        nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
        nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
        memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
        nla->nla_len += nla_xdp->nla_len;
    }

    req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

    if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
        fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    len = recv(sock, buf, sizeof(buf), 0);
    if (len < 0) {
        fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len);
         nh = NLMSG_NEXT(nh, len)) {
        if (nh->nlmsg_pid != sa.nl_pid) {
            fprintf(stderr, "bpf: Wrong pid %u, expected %u\n",
                   nh->nlmsg_pid, sa.nl_pid);
            errno = EBADMSG;
            goto cleanup;
        }
        if (nh->nlmsg_seq != (unsigned int)seq) {
            // Both values are compared as unsigned; print them the same way.
            fprintf(stderr, "bpf: Wrong seq %u, expected %u\n",
                   nh->nlmsg_seq, (unsigned int)seq);
            errno = EBADMSG;
            goto cleanup;
        }
        switch (nh->nlmsg_type) {
            case NLMSG_ERROR:
                err = (struct nlmsgerr *)NLMSG_DATA(nh);
                // error == 0 is the positive acknowledgement
                if (!err->error)
                    continue;
                fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-err->error));
                errno = -err->error;
                goto cleanup;
            case NLMSG_DONE:
                break;
            default:
                // other message types are ignored
                break;
        }
    }

    ret = 0;

cleanup:
    close(sock);
    return ret;
}
1398 
/*
 * Open a perf event described by the caller-supplied perf_event_attr,
 * attach BPF program `progfd` to it and enable it.
 *
 * pid, cpu and group_fd are passed straight to perf_event_open;
 * extra_flags is OR-ed with PERF_FLAG_FD_CLOEXEC.
 *
 * Returns the perf event fd on success. Returns -1 (after perror) when the
 * open or either ioctl fails; an fd that was already opened is closed.
 */
int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid,
                              int cpu, int group_fd, unsigned long extra_flags) {
  unsigned long open_flags = PERF_FLAG_FD_CLOEXEC | extra_flags;
  int fd;

  fd = syscall(__NR_perf_event_open, perf_event_attr, pid, cpu, group_fd,
               open_flags);
  if (fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
  } else if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
  } else {
    return fd;
  }

  close(fd);
  return -1;
}
1420 
/*
 * Attach BPF program `progfd` to a sampling perf event.
 *
 * ev_type / ev_config must pass invalid_perf_config(). Exactly one of
 * sample_period and sample_freq must be non-zero: a non-zero sample_freq
 * selects frequency mode, otherwise sample_period is used. The inherit bit
 * is set when pid > 0. The actual open/attach/enable is delegated to
 * bpf_attach_perf_event_raw() with no extra flags.
 *
 * Returns the perf event fd, or -1 on invalid arguments or failure.
 */
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  struct perf_event_attr attr = {};
  bool by_period = sample_period > 0;
  bool by_freq = sample_freq > 0;

  if (invalid_perf_config(ev_type, ev_config))
    return -1;

  // Both set or both unset is an error.
  if (by_period == by_freq) {
    fprintf(
      stderr, "Exactly one of sample_period / sample_freq should be set\n"
    );
    return -1;
  }

  attr.type = ev_type;
  attr.config = ev_config;
  attr.inherit = (pid > 0) ? 1 : 0;
  if (by_freq) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  return bpf_attach_perf_event_raw(progfd, &attr, pid, cpu, group_fd, 0);
}
1448 
/*
 * Disable and close a perf event fd.
 *
 * A negative fd is ignored and 0 is returned. Otherwise the event is
 * disabled with PERF_EVENT_IOC_DISABLE and the fd is closed; the close is
 * attempted even when the disable failed. Each failure is reported with
 * perror.
 *
 * Returns 0 when both steps succeed, otherwise the return value of the
 * first step that failed.
 */
int bpf_close_perf_event_fd(int fd) {
  int status = 0;
  int rc;

  if (fd < 0)
    return 0;

  rc = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
  if (rc != 0) {
    perror("ioctl(PERF_EVENT_IOC_DISABLE) failed");
    status = rc;
  }

  rc = close(fd);
  if (rc != 0) {
    perror("close perf event FD failed");
    // Keep the earlier failure code if there was one.
    if (status == 0)
      status = rc;
  }

  return status;
}
1465 
/*
 * Issue the BPF_OBJ_PIN command for BPF object `fd` and `pathname`.
 * Returns the result of the bpf() syscall.
 */
int bpf_obj_pin(int fd, const char *pathname)
{
  union bpf_attr pin_attr;
  int rc;

  // Zero the whole union first, then fill in the fields used here.
  memset(&pin_attr, 0, sizeof(pin_attr));
  pin_attr.bpf_fd = fd;
  pin_attr.pathname = ptr_to_u64((void *)pathname);

  rc = syscall(__NR_bpf, BPF_OBJ_PIN, &pin_attr, sizeof(pin_attr));
  return rc;
}
1476 
/*
 * Issue the BPF_OBJ_GET command for `pathname`.
 * Returns the result of the bpf() syscall.
 */
int bpf_obj_get(const char *pathname)
{
  union bpf_attr get_attr;
  int rc;

  // Zero the whole union first, then fill in the field used here.
  memset(&get_attr, 0, sizeof(get_attr));
  get_attr.pathname = ptr_to_u64((void *)pathname);

  rc = syscall(__NR_bpf, BPF_OBJ_GET, &get_attr, sizeof(get_attr));
  return rc;
}
1486 
/*
 * Issue the BPF_PROG_GET_NEXT_ID command starting from `start_id`.
 *
 * When the syscall returns 0, *next_id receives attr.next_id and 0 is
 * returned. Otherwise *next_id is left untouched and the syscall result is
 * returned.
 */
int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id)
{
  union bpf_attr attr;
  int rc;

  memset(&attr, 0, sizeof(attr));
  attr.start_id = start_id;

  rc = syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
  if (rc != 0)
    return rc;

  *next_id = attr.next_id;
  return 0;
}
1501 
/*
 * Issue the BPF_PROG_GET_FD_BY_ID command for program id `id`.
 * Returns the result of the bpf() syscall.
 */
int bpf_prog_get_fd_by_id(uint32_t id)
{
  union bpf_attr attr;
  int fd;

  // Zero the whole union first, then fill in the field used here.
  memset(&attr, 0, sizeof(attr));
  attr.prog_id = id;

  fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
  return fd;
}
1511 
/*
 * Issue the BPF_MAP_GET_FD_BY_ID command for map id `id`.
 * Returns the result of the bpf() syscall.
 */
int bpf_map_get_fd_by_id(uint32_t id)
{
  union bpf_attr attr;
  int fd;

  // Zero the whole union first, then fill in the field used here.
  memset(&attr, 0, sizeof(attr));
  attr.map_id = id;

  fd = syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
  return fd;
}
1521