1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcptop    Summarize TCP send/recv throughput by host.
5#           For Linux, uses BCC, eBPF. Embedded C.
6#
7# USAGE: tcptop [-h] [-C] [-S] [-p PID] [interval [count]]
8#
9# This uses dynamic tracing of kernel functions, and will need to be updated
10# to match kernel changes.
11#
12# WARNING: This traces all send/receives at the TCP level, and while it
13# summarizes data in-kernel to reduce overhead, there may still be some
14# overhead at high TCP send/receive rates (eg, ~13% of one CPU at 100k TCP
15# events/sec. This is not the same as packet rate: funccount can be used to
16# count the kprobes below to find out the TCP rate). Test in a lab environment
17# first. If your send/receive rate is low (eg, <1k/sec) then the overhead is
18# expected to be negligible.
19#
20# ToDo: Fit output to screen size (top X only) in default (not -C) mode.
21#
22# Copyright 2016 Netflix, Inc.
23# Licensed under the Apache License, Version 2.0 (the "License")
24#
25# 02-Sep-2016   Brendan Gregg   Created this.
26
27from __future__ import print_function
28from bcc import BPF
29import argparse
30from socket import inet_ntop, AF_INET, AF_INET6
31from struct import pack
32from time import sleep, strftime
33from subprocess import call
34import ctypes as ct
35from collections import namedtuple, defaultdict
36
37# arguments
def range_check(string):
    """argparse ``type=`` callback accepting only strictly positive integers.

    Args:
        string: the raw command-line token.

    Returns:
        The token converted to an int (always >= 1).

    Raises:
        argparse.ArgumentTypeError: if the token is not an integer or is
            smaller than 1.
    """
    try:
        value = int(string)
    except ValueError:
        # Raise ArgumentTypeError so argparse shows this message rather than
        # its generic "invalid range_check value" text.
        raise argparse.ArgumentTypeError(
            "value must be an integer, got %r" % (string,))
    if value < 1:
        msg = "value must be strictly positive, got %d" % (value,)
        raise argparse.ArgumentTypeError(msg)
    return value
44
# Usage examples appended verbatim to the --help output.
examples = """examples:
    ./tcptop           # trace TCP send/recv by host
    ./tcptop -C        # don't clear the screen
    ./tcptop -p 181    # only trace PID 181
"""
parser = argparse.ArgumentParser(
    description="Summarize TCP send/recv throughput by host",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
parser.add_argument("-S", "--nosummary", action="store_true",
    help="skip system summary line")
# The PID ends up inside the generated BPF C source, so insist on an integer
# here instead of accepting arbitrary text.
parser.add_argument("-p", "--pid", type=int,
    help="trace this PID only")
parser.add_argument("interval", nargs="?", default=1, type=range_check,
    help="output interval, in seconds (default 1)")
# The -1 default is not a string, so argparse does not pass it through
# range_check; it serves as the "run until interrupted" sentinel.
parser.add_argument("count", nargs="?", default=-1, type=range_check,
    help="number of outputs")
# Hidden option: print the generated BPF program and exit.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
# Set to a non-zero value to print the generated BPF program before loading.
debug = 0

# linux stats
loadavg = "/proc/loadavg"
71
# define BPF program
#
# Two kprobes account bytes per (pid, local addr/port, remote addr/port):
#   tcp_sendmsg()      -> ipv4_send_bytes / ipv6_send_bytes
#   tcp_cleanup_rbuf() -> ipv4_recv_bytes / ipv6_recv_bytes
# The placeholder token in each probe is replaced from Python with an
# optional PID filter before the program is compiled.
#
# bpf_get_current_pid_tgid() returns (tgid << 32 | tid); the user-visible
# process ID is the tgid, so the upper 32 bits are used. Taking the lower
# half would key and filter on the thread ID instead.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>

struct ipv4_key_t {
    u32 pid;
    u32 saddr;
    u32 daddr;
    u16 lport;
    u16 dport;
};
BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);

struct ipv6_key_t {
    u32 pid;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u16 lport;
    u16 dport;
};
BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);

int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
    struct msghdr *msg, size_t size)
{
    // upper 32 bits hold the tgid, i.e. the user-space process ID
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    FILTER
    u16 dport = 0, family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_key_t ipv4_key = {.pid = pid};
        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
        ipv4_key.daddr = sk->__sk_common.skc_daddr;
        ipv4_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv4_key.dport = ntohs(dport);
        ipv4_send_bytes.increment(ipv4_key, size);

    } else if (family == AF_INET6) {
        struct ipv6_key_t ipv6_key = {.pid = pid};
        __builtin_memcpy(&ipv6_key.saddr,
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32, sizeof(ipv6_key.saddr));
        __builtin_memcpy(&ipv6_key.daddr,
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32, sizeof(ipv6_key.daddr));
        ipv6_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv6_key.dport = ntohs(dport);
        ipv6_send_bytes.increment(ipv6_key, size);
    }
    // else drop

    return 0;
}

/*
 * tcp_recvmsg() would be obvious to trace, but is less suitable because:
 * - we'd need to trace both entry and return, to have both sock and size
 * - misses tcp_read_sock() traffic
 * we'd much prefer tracepoints once they are available.
 */
int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
{
    // upper 32 bits hold the tgid, i.e. the user-space process ID
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    FILTER
    u16 dport = 0, family = sk->__sk_common.skc_family;

    if (copied <= 0)
        return 0;

    if (family == AF_INET) {
        struct ipv4_key_t ipv4_key = {.pid = pid};
        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
        ipv4_key.daddr = sk->__sk_common.skc_daddr;
        ipv4_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv4_key.dport = ntohs(dport);
        ipv4_recv_bytes.increment(ipv4_key, copied);

    } else if (family == AF_INET6) {
        struct ipv6_key_t ipv6_key = {.pid = pid};
        __builtin_memcpy(&ipv6_key.saddr,
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32, sizeof(ipv6_key.saddr));
        __builtin_memcpy(&ipv6_key.daddr,
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32, sizeof(ipv6_key.daddr));
        ipv6_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv6_key.dport = ntohs(dport);
        ipv6_recv_bytes.increment(ipv6_key, copied);
    }
    // else drop

    return 0;
}
"""
171
# code substitutions
#
# Replace the FILTER placeholder in the BPF source with a PID test, or with
# nothing when no PID was requested. The value is spliced into C source, so
# it is converted to an int first; anything non-numeric is rejected here
# instead of being handed to the BPF compiler.
if args.pid is not None:
    try:
        pid_filter = int(args.pid)
    except ValueError:
        raise SystemExit("tcptop: PID must be an integer, got %r" %
                         (args.pid,))
    bpf_text = bpf_text.replace('FILTER',
        'if (pid != %d) { return 0; }' % pid_filter)
else:
    bpf_text = bpf_text.replace('FILTER', '')
# Optionally dump the final program; with --ebpf that is all the tool does.
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
182
# Printable identity of one TCP session as shown in the output table:
# owning pid, local address/port and remote address/port.
TCPSessionKey = namedtuple('TCPSession', 'pid laddr lport daddr dport')
184
def pid_to_comm(pid):
    """Return the command name for *pid*, read from /proc/<pid>/comm.

    Args:
        pid: integer process ID.

    Returns:
        The contents of /proc/<pid>/comm with trailing whitespace removed,
        or ``str(pid)`` when the file cannot be read (for example because
        the process has already exited).
    """
    try:
        # "with" closes the descriptor promptly; this runs once per output row.
        with open("/proc/%d/comm" % pid, "r") as comm_file:
            return comm_file.read().rstrip()
    except IOError:
        return str(pid)
191
def get_ipv4_session_key(k):
    """Turn a raw IPv4 map key into a TCPSessionKey with dotted-quad addresses.

    *k* must expose pid, saddr, daddr, lport and dport attributes; the two
    addresses are 32-bit integers.
    """
    # Re-pack each address as a native-order unsigned int to get the 4 bytes
    # that inet_ntop() expects.
    local_addr = inet_ntop(AF_INET, pack("I", k.saddr))
    remote_addr = inet_ntop(AF_INET, pack("I", k.daddr))
    return TCPSessionKey(k.pid, local_addr, k.lport, remote_addr, k.dport)
198
def get_ipv6_session_key(k):
    """Turn a raw IPv6 map key into a TCPSessionKey with textual addresses.

    *k* must expose pid, saddr, daddr, lport and dport attributes; saddr and
    daddr are handed to inet_ntop() unchanged.
    """
    local_addr = inet_ntop(AF_INET6, k.saddr)
    remote_addr = inet_ntop(AF_INET6, k.daddr)
    return TCPSessionKey(k.pid, local_addr, k.lport, remote_addr, k.dport)
205
# initialize BPF
b = BPF(text=bpf_text)

# Look up the four byte-count tables declared with BPF_HASH in the program.
(ipv4_send_bytes, ipv4_recv_bytes,
 ipv6_send_bytes, ipv6_recv_bytes) = (
    b[table_name] for table_name in ("ipv4_send_bytes", "ipv4_recv_bytes",
                                     "ipv6_send_bytes", "ipv6_recv_bytes"))

banner = 'Tracing... Output every %s secs. Hit Ctrl-C to end'
print(banner % args.interval)
215
# output
def _collect_throughput(send_map, recv_map, to_session_key):
    """Drain one send/recv table pair into {session key: [sent, received]}.

    Each table is read and then cleared, so every interval starts from zero.
    Sessions seen in only one direction keep 0 for the other.
    """
    throughput = defaultdict(lambda: [0, 0])
    for k, v in send_map.items():
        throughput[to_session_key(k)][0] = v.value
    send_map.clear()

    for k, v in recv_map.items():
        throughput[to_session_key(k)][1] = v.value
    recv_map.clear()
    return throughput


def _print_throughput(throughput, header_fmt, row_fmt, laddr_title,
                      raddr_title):
    """Print a header (only if there are rows) and one line per session.

    Rows are ordered by total bytes (sent + received), busiest first, and
    byte counts are shown in whole KB.
    """
    if throughput:
        print(header_fmt % ("PID", "COMM", laddr_title, raddr_title,
                            "RX_KB", "TX_KB"))

    for k, (send_bytes, recv_bytes) in sorted(throughput.items(),
                                              key=lambda kv: sum(kv[1]),
                                              reverse=True):
        print(row_fmt % (k.pid,
            pid_to_comm(k.pid),
            k.laddr + ":" + str(k.lport),
            k.daddr + ":" + str(k.dport),
            recv_bytes // 1024, send_bytes // 1024))


i = 0
exiting = False
# args.count defaults to -1, which i never equals, so the loop then runs
# until interrupted.
while i != args.count and not exiting:
    try:
        sleep(args.interval)
    except KeyboardInterrupt:
        # Ctrl-C: still report the interval collected so far, then stop.
        exiting = True

    # header
    if args.noclear:
        print()
    else:
        call("clear")
    if not args.nosummary:
        with open(loadavg) as stats:
            print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))

    # IPv4 sessions
    _print_throughput(
        _collect_throughput(ipv4_send_bytes, ipv4_recv_bytes,
                            get_ipv4_session_key),
        "%-6s %-12s %-21s %-21s %6s %6s",
        "%-6d %-12.12s %-21s %-21s %6d %6d",
        "LADDR", "RADDR")

    # IPv6 sessions (more than 80 chars per line, sadly); the header starts
    # with a blank line to separate it from the IPv4 table.
    _print_throughput(
        _collect_throughput(ipv6_send_bytes, ipv6_recv_bytes,
                            get_ipv6_session_key),
        "\n%-6s %-12s %-32s %-32s %6s %6s",
        "%-6d %-12.12s %-32s %-32s %6d %6d",
        "LADDR6", "RADDR6")

    i += 1
288