1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# xfsslower Trace slow XFS operations. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: xfsslower [-h] [-j] [-p PID] [min_ms] 8# 9# This script traces common XFS file operations: reads, writes, opens, and 10# syncs. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these XFS operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# By default, a minimum millisecond threshold of 10 is used. 20# 21# Copyright 2016 Netflix, Inc. 22# Licensed under the Apache License, Version 2.0 (the "License") 23# 24# 11-Feb-2016 Brendan Gregg Created this. 25# 16-Oct-2016 Dina Goldshtein -p to filter by process ID. 26 27from __future__ import print_function 28from bcc import BPF 29import argparse 30from time import strftime 31import ctypes as ct 32 33# arguments 34examples = """examples: 35 ./xfsslower # trace operations slower than 10 ms (default) 36 ./xfsslower 1 # trace operations slower than 1 ms 37 ./xfsslower -j 1 # ... 1 ms, parsable output (csv) 38 ./xfsslower 0 # trace all operations (warning: verbose) 39 ./xfsslower -p 185 # trace PID 185 only 40""" 41parser = argparse.ArgumentParser( 42 description="Trace common XFS file operations slower than a threshold", 43 formatter_class=argparse.RawDescriptionHelpFormatter, 44 epilog=examples) 45parser.add_argument("-j", "--csv", action="store_true", 46 help="just print fields: comma-separated values") 47parser.add_argument("-p", "--pid", 48 help="trace this PID only") 49parser.add_argument("min_ms", nargs="?", default='10', 50 help="minimum I/O duration to trace, in ms (default 10)") 51parser.add_argument("--ebpf", action="store_true", 52 help=argparse.SUPPRESS) 53args = parser.parse_args() 54min_ms = int(args.min_ms) 55pid = args.pid 56csv = args.csv 57debug = 0 58 59# define BPF program 60bpf_text = """ 61#include <uapi/linux/ptrace.h> 62#include <linux/fs.h> 63#include <linux/sched.h> 64#include <linux/dcache.h> 65 66// XXX: switch these to char's when supported 67#define TRACE_READ 0 68#define TRACE_WRITE 1 69#define TRACE_OPEN 2 70#define TRACE_FSYNC 3 71 72struct val_t { 73 u64 ts; 74 u64 offset; 75 struct file *fp; 76}; 77 78struct data_t { 79 // XXX: switch some to u32's when supported 80 u64 ts_us; 81 u64 type; 82 u64 size; 83 u64 offset; 84 u64 delta_us; 85 u64 pid; 86 char task[TASK_COMM_LEN]; 87 char file[DNAME_INLINE_LEN]; 88}; 89 90BPF_HASH(entryinfo, u64, struct val_t); 91BPF_PERF_OUTPUT(events); 92 93// 94// Store timestamp and size on entry 95// 96 97// xfs_file_read_iter(), xfs_file_write_iter(): 98int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb) 99{ 100 u64 id = bpf_get_current_pid_tgid(); 101 u32 pid = id >> 32; // PID is higher part 102 103 if (FILTER_PID) 104 return 0; 105 106 // store filep and timestamp by id 107 struct val_t val = {}; 108 val.ts = bpf_ktime_get_ns(); 109 val.fp = iocb->ki_filp; 110 val.offset = iocb->ki_pos; 111 if (val.fp) 112 entryinfo.update(&id, &val); 113 114 return 0; 115} 116 117// xfs_file_open(): 118int trace_open_entry(struct pt_regs *ctx, struct inode *inode, 119 struct file *file) 120{ 121 u64 id = bpf_get_current_pid_tgid(); 122 u32 pid = id >> 32; // PID is higher part 123 124 if (FILTER_PID) 125 return 0; 126 127 // store filep and timestamp by id 128 struct val_t val = {}; 129 val.ts = bpf_ktime_get_ns(); 130 val.fp = file; 131 val.offset = 0; 132 if (val.fp) 133 entryinfo.update(&id, &val); 134 135 return 0; 136} 137 138// xfs_file_fsync(): 139int trace_fsync_entry(struct pt_regs *ctx, struct file *file) 140{ 141 u64 id = bpf_get_current_pid_tgid(); 142 u32 pid = id >> 32; // PID is higher part 143 144 if (FILTER_PID) 145 return 0; 146 147 // store filep and timestamp by id 148 struct val_t val = {}; 149 val.ts = bpf_ktime_get_ns(); 150 val.fp = file; 151 val.offset = 0; 152 if (val.fp) 153 entryinfo.update(&id, &val); 154 155 return 0; 156} 157 158// 159// Output 160// 161 162static int trace_return(struct pt_regs *ctx, int type) 163{ 164 struct val_t *valp; 165 u64 id = bpf_get_current_pid_tgid(); 166 u32 pid = id >> 32; // PID is higher part 167 168 valp = entryinfo.lookup(&id); 169 if (valp == 0) { 170 // missed tracing issue or filtered 171 return 0; 172 } 173 174 // calculate delta 175 u64 ts = bpf_ktime_get_ns(); 176 u64 delta_us = ts - valp->ts; 177 entryinfo.delete(&id); 178 179 // Skip entries with backwards time: temp workaround for #728 180 if ((s64) delta_us < 0) 181 return 0; 182 183 delta_us /= 1000; 184 185 if (FILTER_US) 186 return 0; 187 188 // populate output struct 189 u32 size = PT_REGS_RC(ctx); 190 struct data_t data = {.type = type, .size = size, .delta_us = delta_us, 191 .pid = pid}; 192 data.ts_us = ts / 1000; 193 data.offset = valp->offset; 194 bpf_get_current_comm(&data.task, sizeof(data.task)); 195 196 // workaround (rewriter should handle file to d_name in one step): 197 struct qstr qs = valp->fp->f_path.dentry->d_name; 198 if (qs.len == 0) 199 return 0; 200 bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name); 201 202 // output 203 events.perf_submit(ctx, &data, sizeof(data)); 204 205 return 0; 206} 207 208int trace_read_return(struct pt_regs *ctx) 209{ 210 return trace_return(ctx, TRACE_READ); 211} 212 213int trace_write_return(struct pt_regs *ctx) 214{ 215 return trace_return(ctx, TRACE_WRITE); 216} 217 218int trace_open_return(struct pt_regs *ctx) 219{ 220 return trace_return(ctx, TRACE_OPEN); 221} 222 223int trace_fsync_return(struct pt_regs *ctx) 224{ 225 return trace_return(ctx, TRACE_FSYNC); 226} 227 228""" 229if min_ms == 0: 230 bpf_text = bpf_text.replace('FILTER_US', '0') 231else: 232 bpf_text = bpf_text.replace('FILTER_US', 233 'delta_us <= %s' % str(min_ms * 1000)) 234if args.pid: 235 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 236else: 237 bpf_text = bpf_text.replace('FILTER_PID', '0') 238if debug or args.ebpf: 239 print(bpf_text) 240 if args.ebpf: 241 exit() 242 243# kernel->user event data: struct data_t 244DNAME_INLINE_LEN = 32 # linux/dcache.h 245TASK_COMM_LEN = 16 # linux/sched.h 246class Data(ct.Structure): 247 _fields_ = [ 248 ("ts_us", ct.c_ulonglong), 249 ("type", ct.c_ulonglong), 250 ("size", ct.c_ulonglong), 251 ("offset", ct.c_ulonglong), 252 ("delta_us", ct.c_ulonglong), 253 ("pid", ct.c_ulonglong), 254 ("task", ct.c_char * TASK_COMM_LEN), 255 ("file", ct.c_char * DNAME_INLINE_LEN) 256 ] 257 258# process event 259def print_event(cpu, data, size): 260 event = ct.cast(data, ct.POINTER(Data)).contents 261 262 type = 'R' 263 if event.type == 1: 264 type = 'W' 265 elif event.type == 2: 266 type = 'O' 267 elif event.type == 3: 268 type = 'S' 269 270 if (csv): 271 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 272 event.ts_us, event.task, event.pid, type, event.size, 273 event.offset, event.delta_us, event.file)) 274 return 275 print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), 276 event.task, event.pid, type, event.size, event.offset / 1024, 277 float(event.delta_us) / 1000, event.file)) 278 279# initialize BPF 280b = BPF(text=bpf_text) 281 282# common file functions 283b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_rw_entry") 284b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_rw_entry") 285b.attach_kprobe(event="xfs_file_open", fn_name="trace_open_entry") 286b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_fsync_entry") 287b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return") 288b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return") 289b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return") 290b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return") 291 292# header 293if (csv): 294 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 295else: 296 if min_ms == 0: 297 print("Tracing XFS operations") 298 else: 299 print("Tracing XFS operations slower than %d ms" % min_ms) 300 print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", 301 "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) 302 303# read events 304b["events"].open_perf_buffer(print_event, page_cnt=64) 305while 1: 306 b.perf_buffer_poll() 307