1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcpconnlat    Trace TCP active connection latency (connect).
5#               For Linux, uses BCC, eBPF. Embedded C.
6#
7# USAGE: tcpconnlat [-h] [-t] [-p PID]
8#
9# This uses dynamic tracing of kernel functions, and will need to be updated
10# to match kernel changes.
11#
12# Copyright 2016 Netflix, Inc.
13# Licensed under the Apache License, Version 2.0 (the "License")
14#
15# 19-Feb-2016   Brendan Gregg   Created this.
16
17from __future__ import print_function
18from bcc import BPF
19from socket import inet_ntop, AF_INET, AF_INET6
20from struct import pack
21import argparse
22import ctypes as ct
23
24# arg validation
25def positive_float(val):
26    try:
27        ival = float(val)
28    except ValueError:
29        raise argparse.ArgumentTypeError("must be a float")
30
31    if ival < 0:
32        raise argparse.ArgumentTypeError("must be positive")
33    return ival
34
35# arguments
36examples = """examples:
37    ./tcpconnlat           # trace all TCP connect()s
38    ./tcpconnlat 1         # trace connection latency slower than 1 ms
39    ./tcpconnlat 0.1       # trace connection latency slower than 100 us
40    ./tcpconnlat -t        # include timestamps
41    ./tcpconnlat -p 181    # only trace PID 181
42"""
43parser = argparse.ArgumentParser(
44    description="Trace TCP connects and show connection latency",
45    formatter_class=argparse.RawDescriptionHelpFormatter,
46    epilog=examples)
47parser.add_argument("-t", "--timestamp", action="store_true",
48    help="include timestamp on output")
49parser.add_argument("-p", "--pid",
50    help="trace this PID only")
51parser.add_argument("duration_ms", nargs="?", default=0,
52    type=positive_float,
53    help="minimum duration to trace (ms)")
54parser.add_argument("-v", "--verbose", action="store_true",
55    help="print the BPF program for debugging purposes")
56parser.add_argument("--ebpf", action="store_true",
57    help=argparse.SUPPRESS)
58args = parser.parse_args()
59
60if args.duration_ms:
61    # support fractions but round to nearest microsecond
62    duration_us = int(args.duration_ms * 1000)
63else:
64    duration_us = 0   # default is show all
65
66debug = 0
67
68# define BPF program
69bpf_text = """
70#include <uapi/linux/ptrace.h>
71#include <net/sock.h>
72#include <net/tcp_states.h>
73#include <bcc/proto.h>
74
75struct info_t {
76    u64 ts;
77    u32 pid;
78    char task[TASK_COMM_LEN];
79};
80BPF_HASH(start, struct sock *, struct info_t);
81
82// separate data structs for ipv4 and ipv6
83struct ipv4_data_t {
84    u64 ts_us;
85    u32 pid;
86    u32 saddr;
87    u32 daddr;
88    u64 ip;
89    u16 dport;
90    u64 delta_us;
91    char task[TASK_COMM_LEN];
92};
93BPF_PERF_OUTPUT(ipv4_events);
94
95struct ipv6_data_t {
96    u64 ts_us;
97    u32 pid;
98    unsigned __int128 saddr;
99    unsigned __int128 daddr;
100    u64 ip;
101    u16 dport;
102    u64 delta_us;
103    char task[TASK_COMM_LEN];
104};
105BPF_PERF_OUTPUT(ipv6_events);
106
107int trace_connect(struct pt_regs *ctx, struct sock *sk)
108{
109    u32 pid = bpf_get_current_pid_tgid();
110    FILTER
111    struct info_t info = {.pid = pid};
112    info.ts = bpf_ktime_get_ns();
113    bpf_get_current_comm(&info.task, sizeof(info.task));
114    start.update(&sk, &info);
115    return 0;
116};
117
118// See tcp_v4_do_rcv() and tcp_v6_do_rcv(). So TCP_ESTBALISHED and TCP_LISTEN
119// are fast path and processed elsewhere, and leftovers are processed by
120// tcp_rcv_state_process(). We can trace this for handshake completion.
121// This should all be switched to static tracepoints when available.
122int trace_tcp_rcv_state_process(struct pt_regs *ctx, struct sock *skp)
123{
124    // will be in TCP_SYN_SENT for handshake
125    if (skp->__sk_common.skc_state != TCP_SYN_SENT)
126        return 0;
127
128    // check start and calculate delta
129    struct info_t *infop = start.lookup(&skp);
130    if (infop == 0) {
131        return 0;   // missed entry or filtered
132    }
133
134    u64 ts = infop->ts;
135    u64 now = bpf_ktime_get_ns();
136
137    u64 delta_us = (now - ts) / 1000ul;
138
139#ifdef MIN_LATENCY
140    if ( delta_us < DURATION_US ) {
141        return 0; // connect latency is below latency filter minimum
142    }
143#endif
144
145    // pull in details
146    u16 family = 0, dport = 0;
147    family = skp->__sk_common.skc_family;
148    dport = skp->__sk_common.skc_dport;
149
150    // emit to appropriate data path
151    if (family == AF_INET) {
152        struct ipv4_data_t data4 = {.pid = infop->pid, .ip = 4};
153        data4.ts_us = now / 1000;
154        data4.saddr = skp->__sk_common.skc_rcv_saddr;
155        data4.daddr = skp->__sk_common.skc_daddr;
156        data4.dport = ntohs(dport);
157        data4.delta_us = delta_us;
158        __builtin_memcpy(&data4.task, infop->task, sizeof(data4.task));
159        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
160
161    } else /* AF_INET6 */ {
162        struct ipv6_data_t data6 = {.pid = infop->pid, .ip = 6};
163        data6.ts_us = now / 1000;
164        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
165            skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
166        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
167            skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
168        data6.dport = ntohs(dport);
169        data6.delta_us = delta_us;
170        __builtin_memcpy(&data6.task, infop->task, sizeof(data6.task));
171        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
172    }
173
174    start.delete(&skp);
175
176    return 0;
177}
178"""
179
180if duration_us > 0:
181    bpf_text = "#define MIN_LATENCY\n" + bpf_text
182    bpf_text = bpf_text.replace('DURATION_US', str(duration_us))
183
184# code substitutions
185if args.pid:
186    bpf_text = bpf_text.replace('FILTER',
187        'if (pid != %s) { return 0; }' % args.pid)
188else:
189    bpf_text = bpf_text.replace('FILTER', '')
190if debug or args.verbose or args.ebpf:
191    print(bpf_text)
192    if args.ebpf:
193        exit()
194
195# initialize BPF
196b = BPF(text=bpf_text)
197b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect")
198b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect")
199b.attach_kprobe(event="tcp_rcv_state_process",
200    fn_name="trace_tcp_rcv_state_process")
201
202# event data
203TASK_COMM_LEN = 16      # linux/sched.h
204
205class Data_ipv4(ct.Structure):
206    _fields_ = [
207        ("ts_us", ct.c_ulonglong),
208        ("pid", ct.c_uint),
209        ("saddr", ct.c_uint),
210        ("daddr", ct.c_uint),
211        ("ip", ct.c_ulonglong),
212        ("dport", ct.c_ushort),
213        ("delta_us", ct.c_ulonglong),
214        ("task", ct.c_char * TASK_COMM_LEN)
215    ]
216
217class Data_ipv6(ct.Structure):
218    _fields_ = [
219        ("ts_us", ct.c_ulonglong),
220        ("pid", ct.c_uint),
221        ("saddr", (ct.c_ulonglong * 2)),
222        ("daddr", (ct.c_ulonglong * 2)),
223        ("ip", ct.c_ulonglong),
224        ("dport", ct.c_ushort),
225        ("delta_us", ct.c_ulonglong),
226        ("task", ct.c_char * TASK_COMM_LEN)
227    ]
228
229# process event
230start_ts = 0
231
232def print_ipv4_event(cpu, data, size):
233    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
234    global start_ts
235    if args.timestamp:
236        if start_ts == 0:
237            start_ts = event.ts_us
238        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
239    print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
240        event.task.decode('utf-8', 'replace'), event.ip,
241        inet_ntop(AF_INET, pack("I", event.saddr)),
242        inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
243        float(event.delta_us) / 1000))
244
245def print_ipv6_event(cpu, data, size):
246    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
247    global start_ts
248    if args.timestamp:
249        if start_ts == 0:
250            start_ts = event.ts_us
251        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
252    print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
253        event.task.decode('utf-8', 'replace'), event.ip,
254        inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr),
255        event.dport, float(event.delta_us) / 1000))
256
257# header
258if args.timestamp:
259    print("%-9s" % ("TIME(s)"), end="")
260print("%-6s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP", "SADDR",
261    "DADDR", "DPORT", "LAT(ms)"))
262
263# read events
264b["ipv4_events"].open_perf_buffer(print_ipv4_event)
265b["ipv6_events"].open_perf_buffer(print_ipv6_event)
266while 1:
267    b.perf_buffer_poll()
268