1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# btrfsslower Trace slow btrfs operations. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: btrfsslower [-h] [-j] [-p PID] [min_ms] 8# 9# This script traces common btrfs file operations: reads, writes, opens, and 10# syncs. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these btrfs operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# By default, a minimum millisecond threshold of 10 is used. 20# 21# Copyright 2016 Netflix, Inc. 22# Licensed under the Apache License, Version 2.0 (the "License") 23# 24# 15-Feb-2016 Brendan Gregg Created this. 25# 16-Oct-2016 Dina Goldshtein -p to filter by process ID. 26 27from __future__ import print_function 28from bcc import BPF 29import argparse 30from time import strftime 31import ctypes as ct 32 33# symbols 34kallsyms = "/proc/kallsyms" 35 36# arguments 37examples = """examples: 38 ./btrfsslower # trace operations slower than 10 ms (default) 39 ./btrfsslower 1 # trace operations slower than 1 ms 40 ./btrfsslower -j 1 # ... 1 ms, parsable output (csv) 41 ./btrfsslower 0 # trace all operations (warning: verbose) 42 ./btrfsslower -p 185 # trace PID 185 only 43""" 44parser = argparse.ArgumentParser( 45 description="Trace common btrfs file operations slower than a threshold", 46 formatter_class=argparse.RawDescriptionHelpFormatter, 47 epilog=examples) 48parser.add_argument("-j", "--csv", action="store_true", 49 help="just print fields: comma-separated values") 50parser.add_argument("-p", "--pid", 51 help="trace this PID only") 52parser.add_argument("min_ms", nargs="?", default='10', 53 help="minimum I/O duration to trace, in ms (default 10)") 54parser.add_argument("--ebpf", action="store_true", 55 help=argparse.SUPPRESS) 56args = parser.parse_args() 57min_ms = int(args.min_ms) 58pid = args.pid 59csv = args.csv 60debug = 0 61 62# define BPF program 63bpf_text = """ 64#include <uapi/linux/ptrace.h> 65#include <linux/fs.h> 66#include <linux/sched.h> 67#include <linux/dcache.h> 68 69// XXX: switch these to char's when supported 70#define TRACE_READ 0 71#define TRACE_WRITE 1 72#define TRACE_OPEN 2 73#define TRACE_FSYNC 3 74 75struct val_t { 76 u64 ts; 77 u64 offset; 78 struct file *fp; 79}; 80 81struct data_t { 82 // XXX: switch some to u32's when supported 83 u64 ts_us; 84 u64 type; 85 u64 size; 86 u64 offset; 87 u64 delta_us; 88 u64 pid; 89 char task[TASK_COMM_LEN]; 90 char file[DNAME_INLINE_LEN]; 91}; 92 93BPF_HASH(entryinfo, u64, struct val_t); 94BPF_PERF_OUTPUT(events); 95 96// 97// Store timestamp and size on entry 98// 99 100// The current btrfs (Linux 4.5) uses generic_file_read_iter() instead of it's 101// own read function. So we need to trace that and then filter on btrfs, which 102// I do by checking file->f_op. 103int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 104{ 105 u64 id = bpf_get_current_pid_tgid(); 106 u32 pid = id >> 32; // PID is higher part 107 108 if (FILTER_PID) 109 return 0; 110 111 // btrfs filter on file->f_op == btrfs_file_operations 112 struct file *fp = iocb->ki_filp; 113 if ((u64)fp->f_op != BTRFS_FILE_OPERATIONS) 114 return 0; 115 116 // store filep and timestamp by pid 117 struct val_t val = {}; 118 val.ts = bpf_ktime_get_ns(); 119 val.fp = fp; 120 val.offset = iocb->ki_pos; 121 if (val.fp) 122 entryinfo.update(&id, &val); 123 124 return 0; 125} 126 127// btrfs_file_write_iter(): 128int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) 129{ 130 u64 id = bpf_get_current_pid_tgid(); 131 u32 pid = id >> 32; // PID is higher part 132 133 if (FILTER_PID) 134 return 0; 135 136 // store filep and timestamp by id 137 struct val_t val = {}; 138 val.ts = bpf_ktime_get_ns(); 139 val.fp = iocb->ki_filp; 140 val.offset = iocb->ki_pos; 141 if (val.fp) 142 entryinfo.update(&id, &val); 143 144 return 0; 145} 146 147// The current btrfs (Linux 4.5) uses generic_file_open(), instead of it's own 148// function. Same as with reads. Trace the generic path and filter: 149int trace_open_entry(struct pt_regs *ctx, struct inode *inode, 150 struct file *file) 151{ 152 u64 id = bpf_get_current_pid_tgid(); 153 u32 pid = id >> 32; // PID is higher part 154 155 if (FILTER_PID) 156 return 0; 157 158 // btrfs filter on file->f_op == btrfs_file_operations 159 if ((u64)file->f_op != BTRFS_FILE_OPERATIONS) 160 return 0; 161 162 // store filep and timestamp by id 163 struct val_t val = {}; 164 val.ts = bpf_ktime_get_ns(); 165 val.fp = file; 166 val.offset = 0; 167 if (val.fp) 168 entryinfo.update(&id, &val); 169 170 return 0; 171} 172 173// btrfs_sync_file(): 174int trace_fsync_entry(struct pt_regs *ctx, struct file *file) 175{ 176 u64 id = bpf_get_current_pid_tgid(); 177 u32 pid = id >> 32; // PID is higher part 178 179 if (FILTER_PID) 180 return 0; 181 182 // store filep and timestamp by id 183 struct val_t val = {}; 184 val.ts = bpf_ktime_get_ns(); 185 val.fp = file; 186 val.offset = 0; 187 if (val.fp) 188 entryinfo.update(&id, &val); 189 190 return 0; 191} 192 193// 194// Output 195// 196 197static int trace_return(struct pt_regs *ctx, int type) 198{ 199 struct val_t *valp; 200 u64 id = bpf_get_current_pid_tgid(); 201 u32 pid = id >> 32; // PID is higher part 202 203 valp = entryinfo.lookup(&id); 204 if (valp == 0) { 205 // missed tracing issue or filtered 206 return 0; 207 } 208 209 // calculate delta 210 u64 ts = bpf_ktime_get_ns(); 211 u64 delta_us = (ts - valp->ts) / 1000; 212 entryinfo.delete(&id); 213 if (FILTER_US) 214 return 0; 215 216 // populate output struct 217 u32 size = PT_REGS_RC(ctx); 218 struct data_t data = {.type = type, .size = size, .delta_us = delta_us, 219 .pid = pid}; 220 data.ts_us = ts / 1000; 221 data.offset = valp->offset; 222 bpf_get_current_comm(&data.task, sizeof(data.task)); 223 224 // workaround (rewriter should handle file to d_name in one step): 225 struct dentry *de = NULL; 226 struct qstr qs = {}; 227 de = valp->fp->f_path.dentry; 228 qs = de->d_name; 229 if (qs.len == 0) 230 return 0; 231 bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name); 232 233 // output 234 events.perf_submit(ctx, &data, sizeof(data)); 235 236 return 0; 237} 238 239int trace_read_return(struct pt_regs *ctx) 240{ 241 return trace_return(ctx, TRACE_READ); 242} 243 244int trace_write_return(struct pt_regs *ctx) 245{ 246 return trace_return(ctx, TRACE_WRITE); 247} 248 249int trace_open_return(struct pt_regs *ctx) 250{ 251 return trace_return(ctx, TRACE_OPEN); 252} 253 254int trace_fsync_return(struct pt_regs *ctx) 255{ 256 return trace_return(ctx, TRACE_FSYNC); 257} 258 259""" 260 261# code replacements 262with open(kallsyms) as syms: 263 ops = '' 264 for line in syms: 265 a = line.rstrip().split() 266 (addr, name) = (a[0], a[2]) 267 name = name.split("\t")[0] 268 if name == "btrfs_file_operations": 269 ops = "0x" + addr 270 break 271 if ops == '': 272 print("ERROR: no btrfs_file_operations in /proc/kallsyms. Exiting.") 273 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 274 exit() 275 bpf_text = bpf_text.replace('BTRFS_FILE_OPERATIONS', ops) 276if min_ms == 0: 277 bpf_text = bpf_text.replace('FILTER_US', '0') 278else: 279 bpf_text = bpf_text.replace('FILTER_US', 280 'delta_us <= %s' % str(min_ms * 1000)) 281if args.pid: 282 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 283else: 284 bpf_text = bpf_text.replace('FILTER_PID', '0') 285if debug or args.ebpf: 286 print(bpf_text) 287 if args.ebpf: 288 exit() 289 290# kernel->user event data: struct data_t 291DNAME_INLINE_LEN = 32 # linux/dcache.h 292TASK_COMM_LEN = 16 # linux/sched.h 293class Data(ct.Structure): 294 _fields_ = [ 295 ("ts_us", ct.c_ulonglong), 296 ("type", ct.c_ulonglong), 297 ("size", ct.c_ulonglong), 298 ("offset", ct.c_ulonglong), 299 ("delta_us", ct.c_ulonglong), 300 ("pid", ct.c_ulonglong), 301 ("task", ct.c_char * TASK_COMM_LEN), 302 ("file", ct.c_char * DNAME_INLINE_LEN) 303 ] 304 305# process event 306def print_event(cpu, data, size): 307 event = ct.cast(data, ct.POINTER(Data)).contents 308 309 type = 'R' 310 if event.type == 1: 311 type = 'W' 312 elif event.type == 2: 313 type = 'O' 314 elif event.type == 3: 315 type = 'S' 316 317 if (csv): 318 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 319 event.ts_us, event.task.decode('utf-8', 'replace'), event.pid, 320 type, event.size, event.offset, event.delta_us, 321 event.file.decode('utf-8', 'replace'))) 322 return 323 print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), 324 event.task.decode('utf-8', 'replace'), event.pid, type, event.size, 325 event.offset / 1024, float(event.delta_us) / 1000, 326 event.file.decode('utf-8', 'replace'))) 327 328# initialize BPF 329b = BPF(text=bpf_text) 330 331# Common file functions. See earlier comment about generic_*(). 332b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") 333b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_write_entry") 334b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry") 335b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_fsync_entry") 336b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") 337b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return") 338b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return") 339b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return") 340 341# header 342if (csv): 343 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 344else: 345 if min_ms == 0: 346 print("Tracing btrfs operations") 347 else: 348 print("Tracing btrfs operations slower than %d ms" % min_ms) 349 print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", 350 "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) 351 352# read events 353b["events"].open_perf_buffer(print_event, page_cnt=64) 354while 1: 355 b.perf_buffer_poll() 356