1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcplife   Trace the lifespan of TCP sessions and summarize.
5#           For Linux, uses BCC, BPF. Embedded C.
6#
# USAGE: tcplife [-h] [-T] [-t] [-w] [-s] [-p PID] [-L LOCALPORT]
#                [-D REMOTEPORT]
8#
9# This uses the sock:inet_sock_set_state tracepoint if it exists (added to
10# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
11# kernel dynamic tracing of tcp_set_state().
12#
13# While throughput counters are emitted, they are fetched in a low-overhead
14# manner: reading members of the tcp_info struct on TCP close. ie, we do not
15# trace send/receive.
16#
17# Copyright 2016 Netflix, Inc.
18# Licensed under the Apache License, Version 2.0 (the "License")
19#
20# IDEA: Julia Evans
21#
22# 18-Oct-2016   Brendan Gregg   Created this.
23# 29-Dec-2017      "      "     Added tracepoint support.
24
25from __future__ import print_function
26from bcc import BPF
27import argparse
28from socket import inet_ntop, ntohs, AF_INET, AF_INET6
29from struct import pack
30import ctypes as ct
31from time import strftime
32
# arguments
#
# The epilog below is shown verbatim by --help (RawDescriptionHelpFormatter),
# so it must agree with the options defined underneath: -T is the wall-clock
# time column, -t is the relative timestamp column.
examples = """examples:
    ./tcplife           # trace all TCP connect()s
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
"""
parser = argparse.ArgumentParser(
    description="Trace the lifespan of TCP sessions and summarize",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--time", action="store_true",
    help="include time column on output (HH:MM:SS)")
parser.add_argument("-t", "--timestamp", action="store_true",
    help="include timestamp on output (seconds)")
parser.add_argument("-w", "--wide", action="store_true",
    help="wide column output (fits IPv6 addresses)")
parser.add_argument("-s", "--csv", action="store_true",
    help="comma separated values output")
parser.add_argument("-p", "--pid",
    help="trace this PID only")
parser.add_argument("-L", "--localport",
    help="comma-separated list of local ports to trace.")
parser.add_argument("-D", "--remoteport",
    help="comma-separated list of remote ports to trace.")
# --ebpf is hidden from --help; when given, the generated BPF program is
# printed and the tool exits without attaching anything.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
# set to non-zero to print the generated BPF program before it is loaded
debug = 0
66
# define BPF program
#
# Common prologue shared by both probe implementations (one of
# bpf_text_kprobe / bpf_text_tracepoint is appended to it later). It declares:
#   - birth:       hash keyed by struct sock *, holding a u64 timestamp
#   - whoami:      hash keyed by struct sock *, holding a cached pid + comm
#   - ipv4_events / ipv6_events: perf outputs carrying ipv4_data_t /
#                  ipv6_data_t records to user space
# The field order of ipv4_data_t / ipv6_data_t is mirrored by the ctypes
# classes Data_ipv4 / Data_ipv6 further down; keep the two in sync.
# "ports" carries both port numbers in a single u64 (local port in the upper
# 32 bits, remote port in the lower 32 bits).
bpf_text = """
#include <uapi/linux/ptrace.h>
#define KBUILD_MODNAME "foo"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    u64 ts_us;
    u32 pid;
    u32 saddr;
    u32 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u32 pid;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);
"""
110
111#
112# XXX: The following is temporary code for older kernels, Linux 4.14 and
113# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and
114# later, the sock:inet_sock_set_state tracepoint should be used instead, as
115# is done by the code that follows this. In the distant future (2021?), this
116# kprobe code can be removed. This is why there is so much code
117# duplication: to make removal easier.
118#
# The upper-case FILTER_* tokens inside the program are placeholders: the
# "code substitutions" step later replaces each with a C "if" statement or
# with nothing. The program relies on the maps and structs from bpf_text.
bpf_text_kprobe = """
int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    dport = ntohs(dport);
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = dport + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""
241
# Tracepoint flavour of the probe, used when sock:inet_sock_set_state exists.
# The upper-case FILTER_* tokens inside the program are placeholders: the
# "code substitutions" step later replaces each with a C "if" statement or
# with nothing. The program relies on the maps and structs from bpf_text.
bpf_text_tracepoint = """
TRACEPOINT_PROBE(sock, inet_sock_set_state)
{
    if (args->protocol != IPPROTO_TCP)
        return 0;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    // sk is mostly used as a UUID, and for two tcp stats:
    struct sock *sk = (struct sock *)args->skaddr;

    // lport is either used in a filter here, or later
    u16 lport = args->sport;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = args->dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (args->newstate < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (args->newstate != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    if (args->family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = dport + ((0ULL + lport) << 32);
        data4.pid = pid;

        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(args, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(args, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""
365
# Pick the probe implementation: use the sock:inet_sock_set_state tracepoint
# when the running kernel has it, otherwise fall back to the kprobe.
if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
    bpf_text += bpf_text_tracepoint
else:
    bpf_text += bpf_text_kprobe

# code substitutions
#
# Each FILTER_* placeholder in the C text becomes an early-return "if" when
# the matching option was given. All user-supplied values go through int()
# first, so only numbers are ever spliced into the C source; a non-numeric
# value raises ValueError here instead of surfacing as a BPF compile error.
if args.pid:
    bpf_text = bpf_text.replace('FILTER_PID',
        'if (pid != %d) { return 0; }' % int(args.pid))
if args.remoteport:
    dports = [int(dport) for dport in args.remoteport.split(',')]
    # the session is skipped only if its port matches none of the listed ones
    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
    bpf_text = bpf_text.replace('FILTER_DPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
if args.localport:
    lports = [int(lport) for lport in args.localport.split(',')]
    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
    bpf_text = bpf_text.replace('FILTER_LPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
# placeholders whose option was not given are removed (no filtering)
bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('FILTER_DPORT', '')
bpf_text = bpf_text.replace('FILTER_LPORT', '')

# optionally dump the final program; with --ebpf, dump it and stop
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
393
# event data
#
# ctypes views of the records submitted by the BPF program. Field order and
# types follow struct ipv4_data_t / struct ipv6_data_t in bpf_text.
TASK_COMM_LEN = 16      # linux/sched.h

# Fields that follow the address pair; identical in both record layouts.
_EVENT_TAIL_FIELDS = [
    ("ports", ct.c_ulonglong),
    ("rx_b", ct.c_ulonglong),
    ("tx_b", ct.c_ulonglong),
    ("span_us", ct.c_ulonglong),
    ("task", ct.c_char * TASK_COMM_LEN),
]


class Data_ipv4(ct.Structure):
    # IPv4 record: each address is a single 32-bit value.
    _fields_ = [
        ("ts_us", ct.c_ulonglong),
        ("pid", ct.c_uint),
        ("saddr", ct.c_uint),
        ("daddr", ct.c_uint),
    ] + _EVENT_TAIL_FIELDS


class Data_ipv6(ct.Structure):
    # IPv6 record: each 128-bit address is held as two 64-bit words.
    _fields_ = [
        ("ts_us", ct.c_ulonglong),
        ("pid", ct.c_uint),
        ("saddr", ct.c_ulonglong * 2),
        ("daddr", ct.c_ulonglong * 2),
    ] + _EVENT_TAIL_FIELDS
422
423#
424# Setup output formats
425#
426# Don't change the default output (next 2 lines): this fits in 80 chars. I
427# know it doesn't have NS or UIDs etc. I know. If you really, really, really
428# need to add columns, columns that solve real actual problems, I'd start by
# adding an extended mode (-x) to include those columns.
430#
header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
# Alternative layouts. CSV takes precedence over wide when both are requested.
if args.csv:
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
elif args.wide:
    header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
439
440# process event
def print_ipv4_event(cpu, data, size):
    """Perf-buffer callback: print one closed IPv4 session as an output row.

    data is the raw event pointer, viewed here through Data_ipv4. cpu and
    size are part of the callback signature and are not used.
    """
    global start_ts
    ev = ct.cast(data, ct.POINTER(Data_ipv4)).contents

    # optional wall-clock column
    if args.time:
        clock_fmt = "%s," if args.csv else "%-8s "
        print(clock_fmt % strftime("%H:%M:%S"), end="")

    # optional column: seconds since the first event that was printed
    if args.timestamp:
        if start_ts == 0:
            start_ts = ev.ts_us
        elapsed_s = (float(ev.ts_us) - start_ts) / 1000000
        stamp_fmt = "%.6f," if args.csv else "%-9.6f "
        print(stamp_fmt % elapsed_s, end="")

    comm = ev.task.decode('utf-8', 'replace')
    ip_version = "4" if args.wide or args.csv else ""
    laddr = inet_ntop(AF_INET, pack("I", ev.saddr))
    raddr = inet_ntop(AF_INET, pack("I", ev.daddr))
    # "ports" packs the local port in the high half, the remote in the low
    lport = ev.ports >> 32
    rport = ev.ports & 0xffffffff
    tx_kb = ev.tx_b / 1024
    rx_kb = ev.rx_b / 1024
    span_ms = float(ev.span_us) / 1000

    row = (ev.pid, comm, ip_version, laddr, lport, raddr, rport,
           tx_kb, rx_kb, span_ms)
    print(format_string % row)
462
def print_ipv6_event(cpu, data, size):
    """Perf-buffer callback: print one closed IPv6 session as an output row.

    data is the raw event pointer, viewed here through Data_ipv6. cpu and
    size are part of the callback signature and are not used.
    """
    global start_ts
    ev = ct.cast(data, ct.POINTER(Data_ipv6)).contents

    # optional wall-clock column
    if args.time:
        clock_fmt = "%s," if args.csv else "%-8s "
        print(clock_fmt % strftime("%H:%M:%S"), end="")

    # optional column: seconds since the first event that was printed
    if args.timestamp:
        if start_ts == 0:
            start_ts = ev.ts_us
        elapsed_s = (float(ev.ts_us) - start_ts) / 1000000
        stamp_fmt = "%.6f," if args.csv else "%-9.6f "
        print(stamp_fmt % elapsed_s, end="")

    comm = ev.task.decode('utf-8', 'replace')
    ip_version = "6" if args.wide or args.csv else ""
    # the 16-byte address arrays are handed to inet_ntop as they are
    laddr = inet_ntop(AF_INET6, ev.saddr)
    raddr = inet_ntop(AF_INET6, ev.daddr)
    # "ports" packs the local port in the high half, the remote in the low
    lport = ev.ports >> 32
    rport = ev.ports & 0xffffffff
    tx_kb = ev.tx_b / 1024
    rx_kb = ev.rx_b / 1024
    span_ms = float(ev.span_us) / 1000

    row = (ev.pid, comm, ip_version, laddr, lport, raddr, rport,
           tx_kb, rx_kb, span_ms)
    print(format_string % row)
484
# initialize BPF
b = BPF(text=bpf_text)

# header: the optional time columns come first, then the fixed columns
if args.time:
    print(("%s," if args.csv else "%-8s ") % "TIME", end="")
if args.timestamp:
    print(("%s," if args.csv else "%-9s ") % "TIME(s)", end="")
ip_heading = "IP" if args.wide or args.csv else ""
print(header_string % ("PID", "COMM", ip_heading, "LADDR", "LPORT",
                       "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))

# ts_us of the first printed event; the callbacks set it on first use
start_ts = 0

# read events: one perf buffer per address family, polled until Ctrl-C
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()
513