1#!/usr/bin/env python
2#
3# tcpv4tracer   Trace TCP connections.
4#               For Linux, uses BCC, eBPF. Embedded C.
5#
# USAGE: tcpv4tracer [-h] [-t] [-v] [-p PID] [-N NETNS]
7#
8# You should generally try to avoid writing long scripts that measure multiple
9# functions and walk multiple kernel structures, as they will be a burden to
10# maintain as the kernel changes.
11# The following code should be replaced, and simplified, when static TCP probes
12# exist.
13#
14# Copyright 2017 Kinvolk GmbH
15#
16# Licensed under the Apache License, Version 2.0 (the "License")
17from __future__ import print_function
18from bcc import BPF
19
20import argparse as ap
21import ctypes
22from socket import inet_ntop, AF_INET, AF_INET6
23from struct import pack
24
# Command-line interface.  --ebpf is hidden from the help text; further down
# it makes the script print the generated BPF program and exit.
parser = ap.ArgumentParser(
    description="Trace TCP connections",
    formatter_class=ap.RawDescriptionHelpFormatter
)
parser.add_argument(
    "-t", "--timestamp",
    action="store_true",
    help="include timestamp on output"
)
parser.add_argument(
    "-p", "--pid",
    default=0,
    type=int,
    help="trace this PID only"
)
parser.add_argument(
    "-N", "--netns",
    default=0,
    type=int,
    help="trace this Network Namespace only"
)
parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="include Network Namespace in the output"
)
parser.add_argument(
    "--ebpf",
    action="store_true",
    help=ap.SUPPRESS
)
args = parser.parse_args()
38
39bpf_text = """
40#include <uapi/linux/ptrace.h>
41#pragma clang diagnostic push
42#pragma clang diagnostic ignored "-Wtautological-compare"
43#include <net/sock.h>
44#pragma clang diagnostic pop
45#include <net/inet_sock.h>
46#include <net/net_namespace.h>
47#include <bcc/proto.h>
48
// Event type codes stored in the "type" field of the event structs.
#define TCP_EVENT_TYPE_CONNECT 1
#define TCP_EVENT_TYPE_ACCEPT  2
#define TCP_EVENT_TYPE_CLOSE   3

// Record submitted to user space for every IPv4 connect/accept/close event.
struct tcp_ipv4_event_t {
    u64 ts_ns;                  // bpf_ktime_get_ns() when the event was built
    u32 type;                   // one of TCP_EVENT_TYPE_*
    u32 pid;                    // upper 32 bits of bpf_get_current_pid_tgid()
    char comm[TASK_COMM_LEN];   // command name of the task
    u8 ip;                      // IP version; the probes set this to 4
    u32 saddr;                  // from skc_rcv_saddr, stored as read
    u32 daddr;                  // from skc_daddr, stored as read
    u16 sport;                  // source port, host byte order
    u16 dport;                  // destination port, after ntohs()
    u32 netns;                  // netns inode number; 0 without CONFIG_NET_NS
};
// Perf output channel carrying tcp_ipv4_event_t records.
BPF_PERF_OUTPUT(tcp_ipv4_event);
66
// Record submitted to user space for every IPv6 connect/accept/close event.
// Same layout as the IPv4 record except for the 128-bit addresses.
struct tcp_ipv6_event_t {
    u64 ts_ns;                  // bpf_ktime_get_ns() when the event was built
    u32 type;                   // one of TCP_EVENT_TYPE_*
    u32 pid;                    // upper 32 bits of bpf_get_current_pid_tgid()
    char comm[TASK_COMM_LEN];   // command name of the task
    u8 ip;                      // IP version; the probes set this to 6
    unsigned __int128 saddr;    // copied from skc_v6_rcv_saddr
    unsigned __int128 daddr;    // copied from skc_v6_daddr
    u16 sport;                  // source port, host byte order
    u16 dport;                  // destination port, after ntohs()
    u32 netns;                  // netns inode number; 0 without CONFIG_NET_NS
};
// Perf output channel carrying tcp_ipv6_event_t records.
BPF_PERF_OUTPUT(tcp_ipv6_event);
80
// tcp_set_state doesn't run in the context of the process that initiated the
// connection so we need to store a map TUPLE -> PID to send the right PID on
// the event

// Key identifying an IPv4 connection. Ports are stored exactly as read from
// the socket; callers apply ntohs() when building an event.
struct ipv4_tuple_t {
    u32 saddr;
    u32 daddr;
    u16 sport;
    u16 dport;
    u32 netns;
};

// Key identifying an IPv6 connection; same fields with 128-bit addresses.
struct ipv6_tuple_t {
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u16 sport;
    u16 dport;
    u32 netns;
};

// Value remembered for a connecting task.
struct pid_comm_t {
    u64 pid;                    // full bpf_get_current_pid_tgid() value
    char comm[TASK_COMM_LEN];   // from bpf_get_current_comm()
};

// Connection tuple -> connecting task. Filled by the connect return probes,
// read and removed by trace_tcp_set_state_entry.
BPF_HASH(tuplepid_ipv4, struct ipv4_tuple_t, struct pid_comm_t);
BPF_HASH(tuplepid_ipv6, struct ipv6_tuple_t, struct pid_comm_t);

// pid_tgid -> socket pointer, carried from a connect entry probe to the
// matching return probe.
BPF_HASH(connectsock, u64, struct sock *);
109
// Fill *tuple with the addresses, ports and network namespace of the IPv4
// socket skp.
//
// Returns 1 when the tuple is usable. Returns 0 when the caller should
// ignore the socket: either the optional network-namespace filter spliced in
// below rejected it (tuple left untouched), or an address or port is zero
// (tuple already filled in).
static int read_ipv4_tuple(struct ipv4_tuple_t *tuple, struct sock *skp)
{
  u32 net_ns_inum = 0;
  u32 saddr = skp->__sk_common.skc_rcv_saddr;
  u32 daddr = skp->__sk_common.skc_daddr;
  struct inet_sock *sockp = (struct inet_sock *)skp;
  u16 sport = sockp->inet_sport;
  u16 dport = skp->__sk_common.skc_dport;
  // the namespace id stays 0 on kernels built without CONFIG_NET_NS
#ifdef CONFIG_NET_NS
  net_ns_inum = skp->__sk_common.skc_net.net->ns.inum;
#endif

  ##FILTER_NETNS##

  // ports are stored as read; no byte-order conversion happens here
  tuple->saddr = saddr;
  tuple->daddr = daddr;
  tuple->sport = sport;
  tuple->dport = dport;
  tuple->netns = net_ns_inum;

  // if addresses or ports are 0, ignore
  if (saddr == 0 || daddr == 0 || sport == 0 || dport == 0) {
      return 0;
  }

  return 1;
}
137
// Fill *tuple with the addresses, ports and network namespace of the IPv6
// socket skp.
//
// Returns 1 when the tuple is usable. Returns 0 when the caller should
// ignore the socket: either the optional network-namespace filter spliced in
// below rejected it (tuple left untouched), or an address or port is zero
// (tuple already filled in).
static int read_ipv6_tuple(struct ipv6_tuple_t *tuple, struct sock *skp)
{
  u32 net_ns_inum = 0;
  unsigned __int128 saddr = 0, daddr = 0;
  struct inet_sock *sockp = (struct inet_sock *)skp;
  u16 sport = sockp->inet_sport;
  u16 dport = skp->__sk_common.skc_dport;
  // the namespace id stays 0 on kernels built without CONFIG_NET_NS
#ifdef CONFIG_NET_NS
  net_ns_inum = skp->__sk_common.skc_net.net->ns.inum;
#endif
  // copy each 16-byte address into a 128-bit integer
  bpf_probe_read(&saddr, sizeof(saddr),
                 skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
  bpf_probe_read(&daddr, sizeof(daddr),
                 skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);

  ##FILTER_NETNS##

  // ports are stored as read; no byte-order conversion happens here
  tuple->saddr = saddr;
  tuple->daddr = daddr;
  tuple->sport = sport;
  tuple->dport = dport;
  tuple->netns = net_ns_inum;

  // if addresses or ports are 0, ignore
  if (saddr == 0 || daddr == 0 || sport == 0 || dport == 0) {
      return 0;
  }

  return 1;
}
168
169static bool check_family(struct sock *sk, u16 expected_family) {
170  u64 zero = 0;
171  u16 family = sk->__sk_common.skc_family;
172  return family == expected_family;
173}
174
// Entry probe for tcp_v4_connect: remember the socket being connected so the
// return probe can inspect it. Always returns 0.
int trace_connect_v4_entry(struct pt_regs *ctx, struct sock *sk)
{
  // full pid_tgid value; also the key into connectsock
  u64 pid = bpf_get_current_pid_tgid();

  ##FILTER_PID##

  // stash the sock ptr for lookup on return
  connectsock.update(&pid, &sk);

  return 0;
}
186
// Return probe for tcp_v4_connect: when the call succeeded, record which
// task started the connection, keyed by the connection tuple, so that
// trace_tcp_set_state_entry can report it later. Always returns 0.
int trace_connect_v4_return(struct pt_regs *ctx)
{
  // return value of the probed function
  int ret = PT_REGS_RC(ctx);
  u64 pid = bpf_get_current_pid_tgid();

  struct sock **skpp;
  skpp = connectsock.lookup(&pid);
  if (skpp == 0) {
      return 0;       // missed entry
  }

  // the stash is single-use, drop it whether or not the connect succeeded
  connectsock.delete(&pid);

  if (ret != 0) {
      // failed to send SYN packet, may not have populated
      // socket __sk_common.{skc_rcv_saddr, ...}
      return 0;
  }

  // pull in details
  struct sock *skp = *skpp;
  struct ipv4_tuple_t t = { };
  if (!read_ipv4_tuple(&t, skp)) {
      return 0;
  }

  // remember the connecting task (pid_tgid and command name)
  struct pid_comm_t p = { };
  p.pid = pid;
  bpf_get_current_comm(&p.comm, sizeof(p.comm));

  tuplepid_ipv4.update(&t, &p);

  return 0;
}
221
// Entry probe for tcp_v6_connect: remember the socket being connected so the
// return probe can inspect it. Always returns 0.
int trace_connect_v6_entry(struct pt_regs *ctx, struct sock *sk)
{
  // full pid_tgid value; also the key into connectsock
  u64 pid = bpf_get_current_pid_tgid();

  ##FILTER_PID##

  // stash the sock ptr for lookup on return
  connectsock.update(&pid, &sk);

  return 0;
}
233
// Return probe for tcp_v6_connect: when the call succeeded, record which
// task started the connection, keyed by the connection tuple, so that
// trace_tcp_set_state_entry can report it later. Always returns 0.
int trace_connect_v6_return(struct pt_regs *ctx)
{
  // return value of the probed function
  int ret = PT_REGS_RC(ctx);
  u64 pid = bpf_get_current_pid_tgid();

  struct sock **skpp;
  skpp = connectsock.lookup(&pid);
  if (skpp == 0) {
      return 0;       // missed entry
  }

  // the stash is single-use, drop it whether or not the connect succeeded
  connectsock.delete(&pid);

  if (ret != 0) {
      // failed to send SYN packet, may not have populated
      // socket __sk_common.{skc_v6_rcv_saddr, ...}
      return 0;
  }

  // pull in details
  struct sock *skp = *skpp;
  struct ipv6_tuple_t t = { };
  if (!read_ipv6_tuple(&t, skp)) {
      return 0;
  }

  // remember the connecting task (pid_tgid and command name)
  struct pid_comm_t p = { };
  p.pid = pid;
  bpf_get_current_comm(&p.comm, sizeof(p.comm));

  tuplepid_ipv6.update(&t, &p);

  return 0;
}
268
// Entry probe for tcp_set_state: emits the CONNECT event for a connection
// previously recorded by a connect return probe.
//
// Only transitions to TCP_ESTABLISHED and TCP_CLOSE are handled:
//   - TCP_CLOSE: drop any pending tuple entry, emit nothing.
//   - TCP_ESTABLISHED: if the tuple was recorded at connect time, submit a
//     CONNECT event with the recorded pid and command name, then drop the
//     entry.
// Sockets that are neither AF_INET nor AF_INET6 are ignored.
// Always returns 0.
int trace_tcp_set_state_entry(struct pt_regs *ctx, struct sock *skp, int state)
{
  if (state != TCP_ESTABLISHED && state != TCP_CLOSE) {
      return 0;
  }

  u8 ipver = 0;
  if (check_family(skp, AF_INET)) {
      ipver = 4;
      struct ipv4_tuple_t t = { };
      if (!read_ipv4_tuple(&t, skp)) {
          return 0;
      }

      // the connection is going away: forget the pending entry, if any
      if (state == TCP_CLOSE) {
          tuplepid_ipv4.delete(&t);
          return 0;
      }

      struct pid_comm_t *p;
      p = tuplepid_ipv4.lookup(&t);
      if (p == 0) {
          return 0;       // missed entry
      }

      struct tcp_ipv4_event_t evt4 = { };
      evt4.ts_ns = bpf_ktime_get_ns();
      evt4.type = TCP_EVENT_TYPE_CONNECT;
      // the recorded value is a full pid_tgid; report its upper 32 bits
      evt4.pid = p->pid >> 32;
      evt4.ip = ipver;
      evt4.saddr = t.saddr;
      evt4.daddr = t.daddr;
      evt4.sport = ntohs(t.sport);
      evt4.dport = ntohs(t.dport);
      evt4.netns = t.netns;

      // copy the command name recorded at connect time, byte by byte
      int i;
      for (i = 0; i < TASK_COMM_LEN; i++) {
          evt4.comm[i] = p->comm[i];
      }

      tcp_ipv4_event.perf_submit(ctx, &evt4, sizeof(evt4));
      // reported once; remove the entry
      tuplepid_ipv4.delete(&t);
  } else if (check_family(skp, AF_INET6)) {
      ipver = 6;
      struct ipv6_tuple_t t = { };
      if (!read_ipv6_tuple(&t, skp)) {
          return 0;
      }

      // the connection is going away: forget the pending entry, if any
      if (state == TCP_CLOSE) {
          tuplepid_ipv6.delete(&t);
          return 0;
      }

      struct pid_comm_t *p;
      p = tuplepid_ipv6.lookup(&t);
      if (p == 0) {
          return 0;       // missed entry
      }

      struct tcp_ipv6_event_t evt6 = { };
      evt6.ts_ns = bpf_ktime_get_ns();
      evt6.type = TCP_EVENT_TYPE_CONNECT;
      // the recorded value is a full pid_tgid; report its upper 32 bits
      evt6.pid = p->pid >> 32;
      evt6.ip = ipver;
      evt6.saddr = t.saddr;
      evt6.daddr = t.daddr;
      evt6.sport = ntohs(t.sport);
      evt6.dport = ntohs(t.dport);
      evt6.netns = t.netns;

      // copy the command name recorded at connect time, byte by byte
      int i;
      for (i = 0; i < TASK_COMM_LEN; i++) {
          evt6.comm[i] = p->comm[i];
      }

      tcp_ipv6_event.perf_submit(ctx, &evt6, sizeof(evt6));
      // reported once; remove the entry
      tuplepid_ipv6.delete(&t);
  }
  // else drop

  return 0;
}
353
354int trace_close_entry(struct pt_regs *ctx, struct sock *skp)
355{
356  u64 pid = bpf_get_current_pid_tgid();
357
358  ##FILTER_PID##
359
360  u8 oldstate = skp->sk_state;
361  // Don't generate close events for connections that were never
362  // established in the first place.
363  if (oldstate == TCP_SYN_SENT ||
364      oldstate == TCP_SYN_RECV ||
365      oldstate == TCP_NEW_SYN_RECV)
366      return 0;
367
368  u8 ipver = 0;
369  if (check_family(skp, AF_INET)) {
370      ipver = 4;
371      struct ipv4_tuple_t t = { };
372      if (!read_ipv4_tuple(&t, skp)) {
373          return 0;
374      }
375
376      struct tcp_ipv4_event_t evt4 = { };
377      evt4.ts_ns = bpf_ktime_get_ns();
378      evt4.type = TCP_EVENT_TYPE_CLOSE;
379      evt4.pid = pid >> 32;
380      evt4.ip = ipver;
381      evt4.saddr = t.saddr;
382      evt4.daddr = t.daddr;
383      evt4.sport = ntohs(t.sport);
384      evt4.dport = ntohs(t.dport);
385      evt4.netns = t.netns;
386      bpf_get_current_comm(&evt4.comm, sizeof(evt4.comm));
387
388      tcp_ipv4_event.perf_submit(ctx, &evt4, sizeof(evt4));
389  } else if (check_family(skp, AF_INET6)) {
390      ipver = 6;
391      struct ipv6_tuple_t t = { };
392      if (!read_ipv6_tuple(&t, skp)) {
393          return 0;
394      }
395
396      struct tcp_ipv6_event_t evt6 = { };
397      evt6.ts_ns = bpf_ktime_get_ns();
398      evt6.type = TCP_EVENT_TYPE_CLOSE;
399      evt6.pid = pid >> 32;
400      evt6.ip = ipver;
401      evt6.saddr = t.saddr;
402      evt6.daddr = t.daddr;
403      evt6.sport = ntohs(t.sport);
404      evt6.dport = ntohs(t.dport);
405      evt6.netns = t.netns;
406      bpf_get_current_comm(&evt6.comm, sizeof(evt6.comm));
407
408      tcp_ipv6_event.perf_submit(ctx, &evt6, sizeof(evt6));
409  }
410  // else drop
411
412  return 0;
413};
414
// Return probe for inet_csk_accept: submit an ACCEPT event for the socket
// the probed function returned, attributed to the current task.
//
// Nothing is emitted when the returned socket is NULL, when the optional
// pid or network-namespace filters reject it, when an address or port is
// zero, or when the address family is neither AF_INET nor AF_INET6.
// Always returns 0.
int trace_accept_return(struct pt_regs *ctx)
{
  // the return value is the newly accepted socket
  struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
  u64 pid = bpf_get_current_pid_tgid();

  ##FILTER_PID##

  if (newsk == NULL) {
      return 0;
  }

  // pull in details
  u16 lport = 0, dport = 0;
  u32 net_ns_inum = 0;
  u8 ipver = 0;

  // dport is converted with ntohs() below; lport (skc_num) is used as is
  dport = newsk->__sk_common.skc_dport;
  lport = newsk->__sk_common.skc_num;

  // Get network namespace id, if kernel supports it
#ifdef CONFIG_NET_NS
  net_ns_inum = newsk->__sk_common.skc_net.net->ns.inum;
#endif

  ##FILTER_NETNS##

  if (check_family(newsk, AF_INET)) {
      ipver = 4;

      struct tcp_ipv4_event_t evt4 = { 0 };

      evt4.ts_ns = bpf_ktime_get_ns();
      evt4.type = TCP_EVENT_TYPE_ACCEPT;
      evt4.netns = net_ns_inum;
      // upper 32 bits of pid_tgid
      evt4.pid = pid >> 32;
      evt4.ip = ipver;

      evt4.saddr = newsk->__sk_common.skc_rcv_saddr;
      evt4.daddr = newsk->__sk_common.skc_daddr;

      evt4.sport = lport;
      evt4.dport = ntohs(dport);
      bpf_get_current_comm(&evt4.comm, sizeof(evt4.comm));

      // do not send event if IP address is 0.0.0.0 or port is 0
      if (evt4.saddr != 0 && evt4.daddr != 0 &&
          evt4.sport != 0 && evt4.dport != 0) {
          tcp_ipv4_event.perf_submit(ctx, &evt4, sizeof(evt4));
      }
  } else if (check_family(newsk, AF_INET6)) {
      ipver = 6;

      struct tcp_ipv6_event_t evt6 = { 0 };

      evt6.ts_ns = bpf_ktime_get_ns();
      evt6.type = TCP_EVENT_TYPE_ACCEPT;
      evt6.netns = net_ns_inum;
      // upper 32 bits of pid_tgid
      evt6.pid = pid >> 32;
      evt6.ip = ipver;

      // copy each 16-byte address straight into the event
      bpf_probe_read(&evt6.saddr, sizeof(evt6.saddr),
                     newsk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
      bpf_probe_read(&evt6.daddr, sizeof(evt6.daddr),
                     newsk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);

      evt6.sport = lport;
      evt6.dport = ntohs(dport);
      bpf_get_current_comm(&evt6.comm, sizeof(evt6.comm));

      // do not send event if an IP address is all zeros or a port is 0
      if (evt6.saddr != 0 && evt6.daddr != 0 &&
          evt6.sport != 0 && evt6.dport != 0) {
          tcp_ipv6_event.perf_submit(ctx, &evt6, sizeof(evt6));
      }
  }
  // else drop

  return 0;
}
494"""
495
# Size of a task command name; mirrors TASK_COMM_LEN from linux/sched.h.
TASK_COMM_LEN = 16


class TCPIPV4Evt(ctypes.Structure):
    """User-space mirror of the BPF program's ``struct tcp_ipv4_event_t``.

    Fields are declared in the same order and with matching widths as the C
    struct, so a perf-buffer record can be cast to this type directly.
    """

    _fields_ = [
        ("ts_ns", ctypes.c_ulonglong),
        ("type", ctypes.c_uint),
        ("pid", ctypes.c_uint),
        ("comm", ctypes.c_char * TASK_COMM_LEN),
        ("ip", ctypes.c_ubyte),
        ("saddr", ctypes.c_uint),
        ("daddr", ctypes.c_uint),
        ("sport", ctypes.c_ushort),
        ("dport", ctypes.c_ushort),
        ("netns", ctypes.c_uint),
    ]
512
513
class TCPIPV6Evt(ctypes.Structure):
    """User-space mirror of the BPF program's ``struct tcp_ipv6_event_t``.

    Each address is 128 bits wide in the C struct (``unsigned __int128``).
    It is declared here as two ``c_ulonglong`` values, which are 64 bits
    each on every platform, rather than ``c_ulong``, whose width follows the
    platform's ``unsigned long`` and is only 32 bits on some ABIs.
    """

    # NOTE(review): ctypes aligns the address arrays to 8 bytes. If the BPF
    # compiler aligns unsigned __int128 to 16 bytes, the field offsets would
    # differ from this layout -- confirm against the toolchain in use.
    _fields_ = [
        ("ts_ns", ctypes.c_ulonglong),
        ("type", ctypes.c_uint),
        ("pid", ctypes.c_uint),
        ("comm", ctypes.c_char * TASK_COMM_LEN),
        ("ip", ctypes.c_ubyte),
        ("saddr", (ctypes.c_ulonglong * 2)),
        ("daddr", (ctypes.c_ulonglong * 2)),
        ("sport", ctypes.c_ushort),
        ("dport", ctypes.c_ushort),
        ("netns", ctypes.c_uint)
    ]
527
528
# Long event-type names printed in verbose mode, keyed by the one-letter code
# used in the compact output.
verbose_types = {
    "C": "connect",
    "A": "accept",
    "X": "close",
    "U": "unknown",
}
531
532
def print_ipv4_event(cpu, data, size):
    """Perf-buffer callback: print one IPv4 event as a single output line.

    ``data`` is cast to ``TCPIPV4Evt``; ``cpu`` and ``size`` are not used.
    With timestamps enabled, the first event handled stores its time in the
    global ``start_ts`` and later events are printed relative to it.
    """
    global start_ts
    evt = ctypes.cast(data, ctypes.POINTER(TCPIPV4Evt)).contents

    if args.timestamp:
        if start_ts == 0:
            start_ts = evt.ts_ns
        elapsed_ns = evt.ts_ns - start_ts
        # nanoseconds in verbose mode, fractional seconds otherwise
        if args.verbose:
            print("%-14d" % elapsed_ns, end="")
        else:
            print("%-9.3f" % (elapsed_ns / 1000000000.0), end="")

    # 1, 2 and 3 are the TCP_EVENT_TYPE_* values set by the BPF program
    type_str = {1: "C", 2: "A", 3: "X"}.get(evt.type, "U")
    if args.verbose:
        print("%-12s " % (verbose_types[type_str]), end="")
    else:
        print("%-2s " % (type_str), end="")

    comm = evt.comm.decode('utf-8', 'replace')
    # re-pack in native order to recover the address bytes as they were read
    src = inet_ntop(AF_INET, pack("I", evt.saddr))
    dst = inet_ntop(AF_INET, pack("I", evt.daddr))
    print("%-6d %-16s %-2d %-16s %-16s %-6d %-6d" %
          (evt.pid, comm, evt.ip, src, dst, evt.sport, evt.dport), end="")

    # the namespace column only appears in verbose mode without -N
    if args.verbose and not args.netns:
        print(" %-8d" % evt.netns)
    else:
        print()
569
570
def print_ipv6_event(cpu, data, size):
    """Perf-buffer callback: print one IPv6 event as a single output line.

    ``data`` is cast to ``TCPIPV6Evt``; ``cpu`` and ``size`` are not used.
    With timestamps enabled, the first event handled stores its time in the
    global ``start_ts`` and later events are printed relative to it.
    Addresses are printed inside square brackets.
    """
    global start_ts
    evt = ctypes.cast(data, ctypes.POINTER(TCPIPV6Evt)).contents

    if args.timestamp:
        if start_ts == 0:
            start_ts = evt.ts_ns
        elapsed_ns = evt.ts_ns - start_ts
        # nanoseconds in verbose mode, fractional seconds otherwise
        if args.verbose:
            print("%-14d" % elapsed_ns, end="")
        else:
            print("%-9.3f" % (elapsed_ns / 1000000000.0), end="")

    # 1, 2 and 3 are the TCP_EVENT_TYPE_* values set by the BPF program
    type_str = {1: "C", 2: "A", 3: "X"}.get(evt.type, "U")
    if args.verbose:
        print("%-12s " % (verbose_types[type_str]), end="")
    else:
        print("%-2s " % (type_str), end="")

    comm = evt.comm.decode('utf-8', 'replace')
    # the address fields are passed to inet_ntop as raw 16-byte buffers
    src = "[" + inet_ntop(AF_INET6, evt.saddr) + "]"
    dst = "[" + inet_ntop(AF_INET6, evt.daddr) + "]"
    print("%-6d %-16s %-2d %-16s %-16s %-6d %-6d" %
          (evt.pid, comm, evt.ip, src, dst, evt.sport, evt.dport), end="")

    # the namespace column only appears in verbose mode without -N
    if args.verbose and not args.netns:
        print(" %-8d" % evt.netns)
    else:
        print()
606
607
# Build the optional C filter statements and splice them into the program
# text in place of its placeholders.  An empty string leaves a filter off.
pid_filter = ('if (pid >> 32 != %d) { return 0; }' % args.pid
              if args.pid else "")
netns_filter = ('if (net_ns_inum != %d) { return 0; }' % args.netns
                if args.netns else "")

bpf_text = (bpf_text
            .replace('##FILTER_PID##', pid_filter)
            .replace('##FILTER_NETNS##', netns_filter))

# --ebpf: print the generated program instead of loading it
if args.ebpf:
    print(bpf_text)
    exit()
622
# initialize BPF
# Build the BPF object from the generated program text, then attach each
# handler to its kernel function: kprobes run on function entry, kretprobes
# on function return.
b = BPF(text=bpf_text)
# outgoing connections: entry stashes the socket, return records the task
b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_entry")
b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return")
b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_entry")
b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return")
# state changes produce the connect events; tcp_close produces close events
b.attach_kprobe(event="tcp_set_state", fn_name="trace_tcp_set_state_entry")
b.attach_kprobe(event="tcp_close", fn_name="trace_close_entry")
# incoming connections
b.attach_kretprobe(event="inet_csk_accept", fn_name="trace_accept_return")
632
print("Tracing TCP established connections. Ctrl-C to end.")

# header
if not args.verbose:
    # compact layout: time in seconds, one-letter event type
    if args.timestamp:
        print("%-9s" % ("TIME(s)"), end="")
    print("%-2s %-6s %-16s %-2s %-16s %-16s %-6s %-6s" %
          ("T", "PID", "COMM", "IP", "SADDR", "DADDR", "SPORT", "DPORT"))
else:
    # verbose layout: time in nanoseconds, full event type name, and a
    # namespace column unless a single namespace was selected with -N
    if args.timestamp:
        print("%-14s" % ("TIME(ns)"), end="")
    print("%-12s %-6s %-16s %-2s %-16s %-16s %-6s %-7s" %
          ("TYPE", "PID", "COMM", "IP", "SADDR", "DADDR", "SPORT", "DPORT"),
          end="")
    if not args.netns:
        print("%-8s" % "NETNS", end="")
    print()

# Timestamp of the first event printed with -t; 0 means not set yet.
start_ts = 0
651
def inet_ntoa(addr):
    """Format the low 32 bits of ``addr`` as a dotted quad, lowest byte first.

    For example ``0x0100007f`` becomes ``"127.0.0.1"``.
    """
    octets = [(addr >> shift) & 0xff for shift in (0, 8, 16, 24)]
    return '.'.join(str(octet) for octet in octets)
660
661
# Route each perf output channel to its printer, then poll until Ctrl-C.
b["tcp_ipv4_event"].open_perf_buffer(print_ipv4_event)
b["tcp_ipv6_event"].open_perf_buffer(print_ipv6_event)
try:
    while True:
        b.perf_buffer_poll()
except KeyboardInterrupt:
    exit()
669