1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# biotop block device (disk) I/O by process. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count] 8# 9# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O 10# request, as well as a starting timestamp for calculating I/O latency. 11# 12# Copyright 2016 Netflix, Inc. 13# Licensed under the Apache License, Version 2.0 (the "License") 14# 15# 06-Feb-2016 Brendan Gregg Created this. 16 17from __future__ import print_function 18from bcc import BPF 19from time import sleep, strftime 20import argparse 21import signal 22from subprocess import call 23 24# arguments 25examples = """examples: 26 ./biotop # block device I/O top, 1 second refresh 27 ./biotop -C # don't clear the screen 28 ./biotop 5 # 5 second summaries 29 ./biotop 5 10 # 5 second summaries, 10 times only 30""" 31parser = argparse.ArgumentParser( 32 description="Block device (disk) I/O by process", 33 formatter_class=argparse.RawDescriptionHelpFormatter, 34 epilog=examples) 35parser.add_argument("-C", "--noclear", action="store_true", 36 help="don't clear the screen") 37parser.add_argument("-r", "--maxrows", default=20, 38 help="maximum rows to print, default 20") 39parser.add_argument("interval", nargs="?", default=1, 40 help="output interval, in seconds") 41parser.add_argument("count", nargs="?", default=99999999, 42 help="number of outputs") 43parser.add_argument("--ebpf", action="store_true", 44 help=argparse.SUPPRESS) 45args = parser.parse_args() 46interval = int(args.interval) 47countdown = int(args.count) 48maxrows = int(args.maxrows) 49clear = not int(args.noclear) 50 51# linux stats 52loadavg = "/proc/loadavg" 53diskstats = "/proc/diskstats" 54 55# signal handler 56def signal_ignore(signal, frame): 57 print() 58 59# load BPF program 60bpf_text = """ 61#include <uapi/linux/ptrace.h> 62#include <linux/blkdev.h> 63 64// for saving process info by request 65struct who_t { 66 u32 pid; 67 char name[TASK_COMM_LEN]; 68}; 69 70// the key for the output summary 71struct info_t { 72 u32 pid; 73 int rwflag; 74 int major; 75 int minor; 76 char name[TASK_COMM_LEN]; 77}; 78 79// the value of the output summary 80struct val_t { 81 u64 bytes; 82 u64 us; 83 u32 io; 84}; 85 86BPF_HASH(start, struct request *); 87BPF_HASH(whobyreq, struct request *, struct who_t); 88BPF_HASH(counts, struct info_t, struct val_t); 89 90// cache PID and comm by-req 91int trace_pid_start(struct pt_regs *ctx, struct request *req) 92{ 93 struct who_t who = {}; 94 95 if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) { 96 who.pid = bpf_get_current_pid_tgid(); 97 whobyreq.update(&req, &who); 98 } 99 100 return 0; 101} 102 103// time block I/O 104int trace_req_start(struct pt_regs *ctx, struct request *req) 105{ 106 u64 ts; 107 108 ts = bpf_ktime_get_ns(); 109 start.update(&req, &ts); 110 111 return 0; 112} 113 114// output 115int trace_req_completion(struct pt_regs *ctx, struct request *req) 116{ 117 u64 *tsp; 118 119 // fetch timestamp and calculate delta 120 tsp = start.lookup(&req); 121 if (tsp == 0) { 122 return 0; // missed tracing issue 123 } 124 125 struct who_t *whop; 126 struct val_t *valp, zero = {}; 127 u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; 128 129 // setup info_t key 130 struct info_t info = {}; 131 info.major = req->rq_disk->major; 132 info.minor = req->rq_disk->first_minor; 133/* 134 * The following deals with a kernel version change (in mainline 4.7, although 135 * it may be backported to earlier kernels) with how block request write flags 136 * are tested. We handle both pre- and post-change versions here. Please avoid 137 * kernel version tests like this as much as possible: they inflate the code, 138 * test, and maintenance burden. 139 */ 140#ifdef REQ_WRITE 141 info.rwflag = !!(req->cmd_flags & REQ_WRITE); 142#elif defined(REQ_OP_SHIFT) 143 info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); 144#else 145 info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); 146#endif 147 148 whop = whobyreq.lookup(&req); 149 if (whop == 0) { 150 // missed pid who, save stats as pid 0 151 valp = counts.lookup_or_init(&info, &zero); 152 } else { 153 info.pid = whop->pid; 154 __builtin_memcpy(&info.name, whop->name, sizeof(info.name)); 155 valp = counts.lookup_or_init(&info, &zero); 156 } 157 158 // save stats 159 valp->us += delta_us; 160 valp->bytes += req->__data_len; 161 valp->io++; 162 163 start.delete(&req); 164 whobyreq.delete(&req); 165 166 return 0; 167} 168""" 169 170if args.ebpf: 171 print(bpf_text) 172 exit() 173 174b = BPF(text=bpf_text) 175b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") 176b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 177b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 178b.attach_kprobe(event="blk_account_io_completion", 179 fn_name="trace_req_completion") 180 181print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval) 182 183# cache disk major,minor -> diskname 184disklookup = {} 185with open(diskstats) as stats: 186 for line in stats: 187 a = line.split() 188 disklookup[a[0] + "," + a[1]] = a[2] 189 190# output 191exiting = 0 192while 1: 193 try: 194 sleep(interval) 195 except KeyboardInterrupt: 196 exiting = 1 197 198 # header 199 if clear: 200 call("clear") 201 else: 202 print() 203 with open(loadavg) as stats: 204 print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read())) 205 print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM", 206 "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms")) 207 208 # by-PID output 209 counts = b.get_table("counts") 210 line = 0 211 for k, v in reversed(sorted(counts.items(), 212 key=lambda counts: counts[1].bytes)): 213 214 # lookup disk 215 disk = str(k.major) + "," + str(k.minor) 216 if disk in disklookup: 217 diskname = disklookup[disk] 218 else: 219 diskname = "?" 220 221 # print line 222 avg_ms = (float(v.us) / 1000) / v.io 223 print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid, 224 k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R", 225 k.major, k.minor, diskname, v.io, v.bytes / 1024, avg_ms)) 226 227 line += 1 228 if line >= maxrows: 229 break 230 counts.clear() 231 232 countdown -= 1 233 if exiting or countdown == 0: 234 print("Detaching...") 235 exit() 236