1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# biotop  block device (disk) I/O by process.
5#         For Linux, uses BCC, eBPF.
6#
7# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
8#
9# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
10# request, as well as a starting timestamp for calculating I/O latency.
11#
12# Copyright 2016 Netflix, Inc.
13# Licensed under the Apache License, Version 2.0 (the "License")
14#
15# 06-Feb-2016   Brendan Gregg   Created this.
16
17from __future__ import print_function
18from bcc import BPF
19from time import sleep, strftime
20import argparse
21import signal
22from subprocess import call
23
# arguments
# Command-line interface: an optional refresh interval and output count as
# positionals, plus flags controlling screen clearing and row limit.
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
# type=int lets argparse reject non-numeric values with a usage error rather
# than letting a later int() conversion die with an uncaught ValueError.
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
# hidden flag: dump the generated eBPF C source and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = args.interval    # seconds between summaries
countdown = args.count      # summaries left to print
maxrows = args.maxrows      # row cap per summary
clear = not args.noclear    # clear the screen before each summary
50
# linux stats
# procfs files read by this tool: loadavg is printed in the header of every
# summary, diskstats is parsed once to map "major,minor" to a disk name.
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"
54
# signal handler
def signal_ignore(signal, frame):
    """Handler-shaped callback that swallows the signal.

    Both arguments are accepted only to satisfy the (signal, frame) handler
    signature and are not used; the sole effect is printing an empty line.
    """
    print("")
58
# load BPF program
# C source compiled by BCC. Three entry points are defined:
#   trace_pid_start      - remembers PID and comm for a request (whobyreq)
#   trace_req_start      - records the request's start timestamp (start)
#   trace_req_completion - computes latency and accumulates bytes, time and
#                          I/O count into the "counts" map, keyed by
#                          PID/comm/direction/device
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

BPF_HASH(start, struct request *);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        // the process ID (tgid) is in the upper 32 bits; the lower 32 bits
        // are the thread ID, which is not what the PID column should show
        who.pid = bpf_get_current_pid_tgid() >> 32;
        whobyreq.update(&req, &who);
    }

    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    u64 ts;

    ts = bpf_ktime_get_ns();
    start.update(&req, &ts);

    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    u64 *tsp;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&req);
    if (tsp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = req->rq_disk->major;
    info.minor = req->rq_disk->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    whop = whobyreq.lookup(&req);
    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_init(&info, &zero);
    }

    // save stats
    valp->us += delta_us;
    valp->bytes += req->__data_len;
    valp->io++;

    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
"""
169
# --ebpf: print the C program instead of running it
if args.ebpf:
    print(bpf_text)
    exit()

# compile the program and hook the block layer entry points
b = BPF(text=bpf_text)
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
# blk_start_request() belongs to the legacy (non-multiqueue) request path and
# does not exist on kernels that dropped it; attaching unconditionally makes
# the tool fail at startup there, so only attach when the symbol is probeable.
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
    fn_name="trace_req_completion")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
182
# build the "major,minor" -> disk name table once, from the first three
# whitespace-separated fields of every diskstats line
with open(diskstats) as stats:
    disklookup = {
        fields[0] + "," + fields[1]: fields[2]
        for fields in (entry.split() for entry in stats)
    }
189
# output
# One iteration per interval: wait, print a header, print the busiest
# entries of the "counts" map, then reset the map for the next interval.
exiting = 0
while True:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # still print this interval's summary; the exit happens at the bottom
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output, entries with the most bytes first
    counts = b.get_table("counts")
    by_bytes = sorted(counts.items(), key=lambda item: item[1].bytes)
    for shown, (k, v) in enumerate(reversed(by_bytes), 1):

        # lookup disk, "?" when the device is not in the cached table
        diskname = disklookup.get(str(k.major) + "," + str(k.minor), "?")

        # print line
        avg_ms = (float(v.us) / 1000) / v.io
        comm = k.name.decode('utf-8', 'replace')
        direction = "W" if k.rwflag else "R"
        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            comm, direction, k.major, k.minor, diskname, v.io,
            v.bytes / 1024, avg_ms))

        # stop once the row limit has been reached
        if shown >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()
236