1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# fileslower Trace slow synchronous file reads and writes. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: fileslower [-h] [-p PID] [-a] [min_ms] 8# 9# This script uses kernel dynamic tracing of synchronous reads and writes 10# at the VFS interface, to identify slow file reads and writes for any file 11# system. 12# 13# This works by tracing __vfs_read() and __vfs_write(), and filtering for 14# synchronous I/O (the path to new_sync_read() and new_sync_write()), and 15# for I/O with filenames. This approach provides a view of just two file 16# system request types. There are typically many others: asynchronous I/O, 17# directory operations, file handle operations, etc, that this tool does not 18# instrument. 19# 20# WARNING: This traces VFS reads and writes, which can be extremely frequent, 21# and so the overhead of this tool can become severe depending on the 22# workload. 23# 24# By default, a minimum millisecond threshold of 10 is used. 25# 26# Copyright 2016 Netflix, Inc. 27# Licensed under the Apache License, Version 2.0 (the "License") 28# 29# 06-Feb-2016 Brendan Gregg Created this. 30 31from __future__ import print_function 32from bcc import BPF 33import argparse 34import ctypes as ct 35import time 36 37# arguments 38examples = """examples: 39 ./fileslower # trace sync file I/O slower than 10 ms (default) 40 ./fileslower 1 # trace sync file I/O slower than 1 ms 41 ./fileslower -p 185 # trace PID 185 only 42""" 43parser = argparse.ArgumentParser( 44 description="Trace slow synchronous file reads and writes", 45 formatter_class=argparse.RawDescriptionHelpFormatter, 46 epilog=examples) 47parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", 48 help="trace this PID only") 49parser.add_argument("-a", "--all-files", action="store_true", 50 help="include non-regular file types (sockets, FIFOs, etc)") 51parser.add_argument("min_ms", nargs="?", default='10', 52 help="minimum I/O duration to trace, in ms (default 10)") 53parser.add_argument("--ebpf", action="store_true", 54 help=argparse.SUPPRESS) 55args = parser.parse_args() 56min_ms = int(args.min_ms) 57tgid = args.tgid 58debug = 0 59 60# define BPF program 61bpf_text = """ 62#include <uapi/linux/ptrace.h> 63#include <linux/fs.h> 64#include <linux/sched.h> 65 66enum trace_mode { 67 MODE_READ, 68 MODE_WRITE 69}; 70 71struct val_t { 72 u32 sz; 73 u64 ts; 74 u32 name_len; 75 // de->d_name.name may point to de->d_iname so limit len accordingly 76 char name[DNAME_INLINE_LEN]; 77 char comm[TASK_COMM_LEN]; 78}; 79 80struct data_t { 81 enum trace_mode mode; 82 u32 pid; 83 u32 sz; 84 u64 delta_us; 85 u32 name_len; 86 char name[DNAME_INLINE_LEN]; 87 char comm[TASK_COMM_LEN]; 88}; 89 90BPF_HASH(entryinfo, pid_t, struct val_t); 91BPF_PERF_OUTPUT(events); 92 93// store timestamp and size on entry 94static int trace_rw_entry(struct pt_regs *ctx, struct file *file, 95 char __user *buf, size_t count) 96{ 97 u32 tgid = bpf_get_current_pid_tgid() >> 32; 98 if (TGID_FILTER) 99 return 0; 100 101 u32 pid = bpf_get_current_pid_tgid(); 102 103 // skip I/O lacking a filename 104 struct dentry *de = file->f_path.dentry; 105 int mode = file->f_inode->i_mode; 106 if (de->d_name.len == 0 || TYPE_FILTER) 107 return 0; 108 109 // store size and timestamp by pid 110 struct val_t val = {}; 111 val.sz = count; 112 val.ts = bpf_ktime_get_ns(); 113 114 struct qstr d_name = de->d_name; 115 val.name_len = d_name.len; 116 bpf_probe_read(&val.name, sizeof(val.name), d_name.name); 117 bpf_get_current_comm(&val.comm, sizeof(val.comm)); 118 entryinfo.update(&pid, &val); 119 120 return 0; 121} 122 123int trace_read_entry(struct pt_regs *ctx, struct file *file, 124 char __user *buf, size_t count) 125{ 126 // skip non-sync I/O; see kernel code for __vfs_read() 127 if (!(file->f_op->read_iter)) 128 return 0; 129 return trace_rw_entry(ctx, file, buf, count); 130} 131 132int trace_write_entry(struct pt_regs *ctx, struct file *file, 133 char __user *buf, size_t count) 134{ 135 // skip non-sync I/O; see kernel code for __vfs_write() 136 if (!(file->f_op->write_iter)) 137 return 0; 138 return trace_rw_entry(ctx, file, buf, count); 139} 140 141// output 142static int trace_rw_return(struct pt_regs *ctx, int type) 143{ 144 struct val_t *valp; 145 u32 pid = bpf_get_current_pid_tgid(); 146 147 valp = entryinfo.lookup(&pid); 148 if (valp == 0) { 149 // missed tracing issue or filtered 150 return 0; 151 } 152 u64 delta_us = (bpf_ktime_get_ns() - valp->ts) / 1000; 153 entryinfo.delete(&pid); 154 if (delta_us < MIN_US) 155 return 0; 156 157 struct data_t data = {}; 158 data.mode = type; 159 data.pid = pid; 160 data.sz = valp->sz; 161 data.delta_us = delta_us; 162 data.name_len = valp->name_len; 163 bpf_probe_read(&data.name, sizeof(data.name), valp->name); 164 bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm); 165 events.perf_submit(ctx, &data, sizeof(data)); 166 167 return 0; 168} 169 170int trace_read_return(struct pt_regs *ctx) 171{ 172 return trace_rw_return(ctx, MODE_READ); 173} 174 175int trace_write_return(struct pt_regs *ctx) 176{ 177 return trace_rw_return(ctx, MODE_WRITE); 178} 179 180""" 181bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000)) 182if args.tgid: 183 bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid) 184else: 185 bpf_text = bpf_text.replace('TGID_FILTER', '0') 186if args.all_files: 187 bpf_text = bpf_text.replace('TYPE_FILTER', '0') 188else: 189 bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') 190 191if debug or args.ebpf: 192 print(bpf_text) 193 if args.ebpf: 194 exit() 195 196# initialize BPF 197b = BPF(text=bpf_text) 198 199# I'd rather trace these via new_sync_read/new_sync_write (which used to be 200# do_sync_read/do_sync_write), but those became static. So trace these from 201# the parent functions, at the cost of more overhead, instead. 202# Ultimately, we should be using [V]FS tracepoints. 203b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") 204b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") 205try: 206 b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") 207 b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") 208except: 209 # older kernels don't have __vfs_write so try vfs_write instead 210 b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") 211 b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") 212 213TASK_COMM_LEN = 16 # linux/sched.h 214DNAME_INLINE_LEN = 32 # linux/dcache.h 215 216class Data(ct.Structure): 217 _fields_ = [ 218 ("mode", ct.c_int), 219 ("pid", ct.c_uint), 220 ("sz", ct.c_uint), 221 ("delta_us", ct.c_ulonglong), 222 ("name_len", ct.c_uint), 223 ("name", ct.c_char * DNAME_INLINE_LEN), 224 ("comm", ct.c_char * TASK_COMM_LEN), 225 ] 226 227mode_s = { 228 0: 'R', 229 1: 'W', 230} 231 232# header 233print("Tracing sync read/writes slower than %d ms" % min_ms) 234print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D", 235 "BYTES", "LAT(ms)", "FILENAME")) 236 237start_ts = time.time() 238 239def print_event(cpu, data, size): 240 event = ct.cast(data, ct.POINTER(Data)).contents 241 242 ms = float(event.delta_us) / 1000 243 name = event.name.decode('utf-8', 'replace') 244 if event.name_len > DNAME_INLINE_LEN: 245 name = name[:-3] + "..." 246 247 print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( 248 time.time() - start_ts, event.comm.decode('utf-8', 'replace'), 249 event.pid, mode_s[event.mode], event.sz, ms, name)) 250 251b["events"].open_perf_buffer(print_event, page_cnt=64) 252while 1: 253 b.perf_buffer_poll() 254