#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# funcslower    Trace slow kernel or user function calls.
#               For Linux, uses BCC, eBPF.
#
# USAGE: funcslower [-h] [-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS]
#                   [-T] [-t] [-v] [-f] [-U] [-K] function [function ...]
#
# WARNING: This tool traces function calls by instrumenting the entry and
# return from each function. For commonly-invoked functions like memory allocs
# or file writes, this can be extremely expensive. Mind the overhead.
#
# NOTE: This tool cannot trace nested functions in the same invocation
# due to instrumentation specifics; only innermost calls will be visible.
#
# By default, a minimum millisecond threshold of 1 is used.
#
# Copyright 2017, Sasha Goldshtein
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 30-Mar-2017   Sasha Goldshtein    Created this.

from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct
import time

examples = """examples:
  ./funcslower vfs_write        # trace vfs_write calls slower than 1ms
  ./funcslower -m 10 vfs_write  # same, but slower than 10ms
  ./funcslower -u 10 c:open     # trace open calls slower than 10us
  ./funcslower -p 135 c:open    # trace pid 135 only
  ./funcslower c:malloc c:free  # trace both malloc and free slower than 1ms
  ./funcslower -a 2 c:open      # show first two arguments to open
  ./funcslower -UK -m 10 c:open # show user and kernel stack traces of open calls slower than 10ms
  ./funcslower -f -UK c:open    # output in folded format for flame graphs
"""
parser = argparse.ArgumentParser(
    description="Trace slow kernel or user function calls.",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
    help="trace this PID only")
parser.add_argument("-m", "--min-ms", type=float, dest="min_ms",
    help="minimum duration to trace (ms)")
parser.add_argument("-u", "--min-us", type=float, dest="min_us",
    help="minimum duration to trace (us)")
parser.add_argument("-a", "--arguments", type=int,
    help="print this many entry arguments, as hex")
parser.add_argument("-T", "--time", action="store_true",
    help="show HH:MM:SS timestamp")
parser.add_argument("-t", "--timestamp", action="store_true",
    help="show timestamp in seconds at us resolution")
parser.add_argument("-v", "--verbose", action="store_true",
    help="print the BPF program for debugging purposes")
parser.add_argument(metavar="function", nargs="+", dest="functions",
    help="function(s) to trace")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
parser.add_argument("-U", "--user-stack",
    action="store_true", help="output user stack trace")
parser.add_argument("-K", "--kernel-stack",
    action="store_true", help="output kernel stack trace")

args = parser.parse_args()
# fractions are allowed, but rounded to an integer nanosecond
if args.min_ms:
    duration_ns = int(args.min_ms * 1000000)
elif args.min_us:
    duration_ns = int(args.min_us * 1000)
else:
    duration_ns = 1000000   # default to 1ms

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>    // for TASK_COMM_LEN

struct entry_t {
    u64 id;
    u64 start_ns;
#ifdef GRAB_ARGS
    u64 args[6];
#endif
};

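// event record emitted to user space via the perf buffer; the optional fields
// are compiled in by the GRAB_ARGS / USER_STACKS / KERNEL_STACKS defines and
// must stay in sync with the ctypes Data class below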
struct data_t {
    u64 id;
    u64 tgid_pid;
    u64 start_ns;
    u64 duration_ns;
    u64 retval;
    char comm[TASK_COMM_LEN];
#ifdef USER_STACKS
    int user_stack_id;
#endif
#ifdef KERNEL_STACKS
    int kernel_stack_id;
    u64 kernel_ip;
#endif
#ifdef GRAB_ARGS
    u64 args[6];
#endif
};

BPF_HASH(entryinfo, u64, struct entry_t);
BPF_PERF_OUTPUT(events);

#if defined(USER_STACKS) || defined(KERNEL_STACKS)
BPF_STACK_TRACE(stacks, 2048);
#endif

static int trace_entry(struct pt_regs *ctx, int id)
{
    u64 tgid_pid = bpf_get_current_pid_tgid();
    u32 tgid = tgid_pid >> 32;
    if (TGID_FILTER)
        return 0;

    u32 pid = tgid_pid;

    struct entry_t entry = {};
    entry.start_ns = bpf_ktime_get_ns();
    entry.id = id;
#ifdef GRAB_ARGS
    entry.args[0] = PT_REGS_PARM1(ctx);
    entry.args[1] = PT_REGS_PARM2(ctx);
    entry.args[2] = PT_REGS_PARM3(ctx);
    entry.args[3] = PT_REGS_PARM4(ctx);
    entry.args[4] = PT_REGS_PARM5(ctx);
    entry.args[5] = PT_REGS_PARM6(ctx);
#endif

    entryinfo.update(&tgid_pid, &entry);

    return 0;
}

int trace_return(struct pt_regs *ctx)
{
    struct entry_t *entryp;
    u64 tgid_pid = bpf_get_current_pid_tgid();

    entryp = entryinfo.lookup(&tgid_pid);
    if (entryp == 0) {
        return 0;
    }

    u64 delta_ns = bpf_ktime_get_ns() - entryp->start_ns;
    entryinfo.delete(&tgid_pid);

    if (delta_ns < DURATION_NS)
        return 0;

    struct data_t data = {};
    data.id = entryp->id;
    data.tgid_pid = tgid_pid;
    data.start_ns = entryp->start_ns;
    data.duration_ns = delta_ns;
    data.retval = PT_REGS_RC(ctx);

#ifdef USER_STACKS
    data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
#endif

#ifdef KERNEL_STACKS
    data.kernel_stack_id = stacks.get_stackid(ctx, 0);

    if (data.kernel_stack_id >= 0) {
        u64 ip = PT_REGS_IP(ctx);
        u64 page_offset;

        // if ip isn't sane, leave key ips as zero for later checking
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
        page_offset = __PAGE_OFFSET_BASE;
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
        // x64, 4.17, and later
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
        page_offset = __PAGE_OFFSET_BASE_L5;
#else
        page_offset = __PAGE_OFFSET_BASE_L4;
#endif
#else
        // earlier x86_64 kernels, e.g., 4.6, comes here
        // arm64, s390, powerpc, x86_32
        page_offset = PAGE_OFFSET;
#endif

        if (ip > page_offset) {
            data.kernel_ip = ip;
        }
    }
#endif

#ifdef GRAB_ARGS
    bpf_probe_read(&data.args[0], sizeof(data.args), entryp->args);
#endif
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""

bpf_text = bpf_text.replace('DURATION_NS', str(duration_ns))
if args.arguments:
    bpf_text = "#define GRAB_ARGS\n" + bpf_text
if args.user_stack:
    bpf_text = "#define USER_STACKS\n" + bpf_text
if args.kernel_stack:
    bpf_text = "#define KERNEL_STACKS\n" + bpf_text
if args.tgid:
    bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
else:
    bpf_text = bpf_text.replace('TGID_FILTER', '0')

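# generate one BPF entry stub per traced function; the numeric id recorded by
# trace_entry() travels with the event so print_event() can recover the
# function name via args.functions[event.id]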
for i in range(len(args.functions)):
    bpf_text += """
int trace_%d(struct pt_regs *ctx) {
    return trace_entry(ctx, %d);
}
""" % (i, i)

if args.verbose or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

b = BPF(text=bpf_text)

for i, function in enumerate(args.functions):
    if ":" in function:
        library, func = function.split(":")
        b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i)
        b.attach_uretprobe(name=library, sym=func, fn_name="trace_return")
    else:
        b.attach_kprobe(event=function, fn_name="trace_%d" % i)
        b.attach_kretprobe(event=function, fn_name="trace_return")

TASK_COMM_LEN = 16  # linux/sched.h

class Data(ct.Structure):
    # field order must match struct data_t in the BPF program above
    _fields_ = [
        ("id", ct.c_ulonglong),
        ("tgid_pid", ct.c_ulonglong),
        ("start_ns", ct.c_ulonglong),
        ("duration_ns", ct.c_ulonglong),
        ("retval", ct.c_ulonglong),
        ("comm", ct.c_char * TASK_COMM_LEN)
    ] + ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \
        ([("kernel_stack_id", ct.c_int), ("kernel_ip", ct.c_ulonglong)]
            if args.kernel_stack else []) + \
        ([("args", ct.c_ulonglong * 6)] if args.arguments else [])

time_designator = "us" if args.min_us else "ms"
time_value = args.min_us or args.min_ms or 1
time_multiplier = 1000 if args.min_us else 1000000
time_col = args.time or args.timestamp

# Do not print header when folded
if not args.folded:
    print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
        (time_value, time_designator))
    print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
        ("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
        "FUNC" + (" ARGS" if args.arguments else "")))

earliest_ts = 0

def time_str(event):
    if args.time:
        return "%-10s " % time.strftime("%H:%M:%S")
    if args.timestamp:
        global earliest_ts
        if earliest_ts == 0:
            earliest_ts = event.start_ns
        return "%-10.6f " % ((event.start_ns - earliest_ts) / 1000000000.0)
    return ""

def args_str(event):
    if not args.arguments:
        return ""
    return str.join(" ", ["0x%x" % arg for arg in event.args[:args.arguments]])

def print_stack(event):
    user_stack = []
    stack_traces = b.get_table("stacks")

    if args.user_stack and event.user_stack_id > 0:
        user_stack = stack_traces.walk(event.user_stack_id)

    kernel_stack = []
    if args.kernel_stack and event.kernel_stack_id > 0:
        kernel_tmp = stack_traces.walk(event.kernel_stack_id)

        # fix kernel stack
        for addr in kernel_tmp:
            kernel_stack.append(addr)

    do_delimiter = user_stack and kernel_stack

    if args.folded:
        # print folded stack output
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [event.comm.decode('utf-8', 'replace')] + \
            [b.sym(addr, event.tgid_pid).decode('utf-8', 'replace')
                for addr in reversed(user_stack)] + \
            (do_delimiter and ["-"] or []) + \
            [b.ksym(addr).decode('utf-8', 'replace')
                for addr in reversed(kernel_stack)]
        print("%s %d" % (";".join(line), 1))
    else:
        # print default multi-line stack output.
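        # kernel frames are printed first, then user frames, each resolved
        # to a symbol name where possible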
        for addr in kernel_stack:
            print("    %s" % b.ksym(addr).decode('utf-8', 'replace'))
        for addr in user_stack:
            print("    %s" % b.sym(addr, event.tgid_pid).decode('utf-8', 'replace'))

def print_event(cpu, data, size):
    event = ct.cast(data, ct.POINTER(Data)).contents
    ts = float(event.duration_ns) / time_multiplier
    if not args.folded:
        print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
            (event.comm.decode('utf-8', 'replace'), event.tgid_pid >> 32,
            ts, event.retval, args.functions[event.id], args_str(event)))
    if args.user_stack or args.kernel_stack:
        print_stack(event)

b["events"].open_perf_buffer(print_event, page_cnt=64)
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()