1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# funcslower  Trace slow kernel or user function calls.
5#             For Linux, uses BCC, eBPF.
6#
7# USAGE: funcslower [-h] [-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS]
8#                   [-T] [-t] [-v] function [function ...]
9#
10# WARNING: This tool traces function calls by instrumenting the entry and
11# return from each function. For commonly-invoked functions like memory allocs
12# or file writes, this can be extremely expensive. Mind the overhead.
13#
14# NOTE: This tool cannot trace nested functions in the same invocation
15# due to instrumentation specifics, only innermost calls will be visible.
16#
17# By default, a minimum millisecond threshold of 1 is used.
18#
19# Copyright 2017, Sasha Goldshtein
20# Licensed under the Apache License, Version 2.0 (the "License")
21#
22# 30-Mar-2017   Sasha Goldshtein    Created this.
23
24from __future__ import print_function
25from bcc import BPF
26import argparse
27import ctypes as ct
28import time
29
30examples = """examples:
31  ./funcslower vfs_write        # trace vfs_write calls slower than 1ms
32  ./funcslower -m 10 vfs_write  # same, but slower than 10ms
33  ./funcslower -u 10 c:open     # trace open calls slower than 10us
34  ./funcslower -p 135 c:open    # trace pid 135 only
35  ./funcslower c:malloc c:free  # trace both malloc and free slower than 1ms
36  ./funcslower -a 2 c:open      # show first two arguments to open
37  ./funcslower -UK -m 10 c:open # Show user and kernel stack frame of open calls slower than 10ms
38  ./funcslower -f -UK c:open    # Output in folded format for flame graphs
39"""
40parser = argparse.ArgumentParser(
41    description="Trace slow kernel or user function calls.",
42    formatter_class=argparse.RawDescriptionHelpFormatter,
43    epilog=examples)
44parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
45    help="trace this PID only")
46parser.add_argument("-m", "--min-ms", type=float, dest="min_ms",
47    help="minimum duration to trace (ms)")
48parser.add_argument("-u", "--min-us", type=float, dest="min_us",
49    help="minimum duration to trace (us)")
50parser.add_argument("-a", "--arguments", type=int,
51    help="print this many entry arguments, as hex")
52parser.add_argument("-T", "--time", action="store_true",
53    help="show HH:MM:SS timestamp")
54parser.add_argument("-t", "--timestamp", action="store_true",
55    help="show timestamp in seconds at us resolution")
56parser.add_argument("-v", "--verbose", action="store_true",
57    help="print the BPF program for debugging purposes")
58parser.add_argument(metavar="function", nargs="+", dest="functions",
59    help="function(s) to trace")
60parser.add_argument("--ebpf", action="store_true",
61    help=argparse.SUPPRESS)
62parser.add_argument("-f", "--folded", action="store_true",
63    help="output folded format, one line per stack (for flame graphs)")
64parser.add_argument("-U", "--user-stack",
65  action="store_true", help="output user stack trace")
66parser.add_argument("-K", "--kernel-stack",
67  action="store_true", help="output kernel stack trace")
68
69args = parser.parse_args()
70# fractions are allowed, but rounded to an integer nanosecond
71if args.min_ms:
72    duration_ns = int(args.min_ms * 1000000)
73elif args.min_us:
74    duration_ns = int(args.min_us * 1000)
75else:
76    duration_ns = 1000000   # default to 1ms
77
78bpf_text = """
79#include <uapi/linux/ptrace.h>
80#include <linux/sched.h>    // for TASK_COMM_LEN
81
82struct entry_t {
83    u64 id;
84    u64 start_ns;
85#ifdef GRAB_ARGS
86    u64 args[6];
87#endif
88};
89
90struct data_t {
91    u64 id;
92    u64 tgid_pid;
93    u64 start_ns;
94    u64 duration_ns;
95    u64 retval;
96    char comm[TASK_COMM_LEN];
97#ifdef USER_STACKS
98    int user_stack_id;
99#endif
100#ifdef KERNEL_STACKS
101    int kernel_stack_id;
102    u64 kernel_ip;
103#endif
104#ifdef GRAB_ARGS
105    u64 args[6];
106#endif
107};
108
109BPF_HASH(entryinfo, u64, struct entry_t);
110BPF_PERF_OUTPUT(events);
111
112#if defined(USER_STACKS) || defined(KERNEL_STACKS)
113BPF_STACK_TRACE(stacks, 2048);
114#endif
115
116static int trace_entry(struct pt_regs *ctx, int id)
117{
118    u64 tgid_pid = bpf_get_current_pid_tgid();
119    u32 tgid = tgid_pid >> 32;
120    if (TGID_FILTER)
121        return 0;
122
123    u32 pid = tgid_pid;
124
125    struct entry_t entry = {};
126    entry.start_ns = bpf_ktime_get_ns();
127    entry.id = id;
128#ifdef GRAB_ARGS
129    entry.args[0] = PT_REGS_PARM1(ctx);
130    entry.args[1] = PT_REGS_PARM2(ctx);
131    entry.args[2] = PT_REGS_PARM3(ctx);
132    entry.args[3] = PT_REGS_PARM4(ctx);
133    entry.args[4] = PT_REGS_PARM5(ctx);
134    entry.args[5] = PT_REGS_PARM6(ctx);
135#endif
136
137    entryinfo.update(&tgid_pid, &entry);
138
139    return 0;
140}
141
142int trace_return(struct pt_regs *ctx)
143{
144    struct entry_t *entryp;
145    u64 tgid_pid = bpf_get_current_pid_tgid();
146
147    entryp = entryinfo.lookup(&tgid_pid);
148    if (entryp == 0) {
149        return 0;
150    }
151
152    u64 delta_ns = bpf_ktime_get_ns() - entryp->start_ns;
153    entryinfo.delete(&tgid_pid);
154
155    if (delta_ns < DURATION_NS)
156        return 0;
157
158    struct data_t data = {};
159    data.id = entryp->id;
160    data.tgid_pid = tgid_pid;
161    data.start_ns = entryp->start_ns;
162    data.duration_ns = delta_ns;
163    data.retval = PT_REGS_RC(ctx);
164
165#ifdef USER_STACKS
166    data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
167#endif
168
169#ifdef KERNEL_STACKS
170    data.kernel_stack_id = stacks.get_stackid(ctx, 0);
171
172    if (data.kernel_stack_id >= 0) {
173        u64 ip = PT_REGS_IP(ctx);
174        u64 page_offset;
175
176        // if ip isn't sane, leave key ips as zero for later checking
177#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
178        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
179        page_offset = __PAGE_OFFSET_BASE;
180#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
181        // x64, 4.17, and later
182#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
183        page_offset = __PAGE_OFFSET_BASE_L5;
184#else
185        page_offset = __PAGE_OFFSET_BASE_L4;
186#endif
187#else
188        // earlier x86_64 kernels, e.g., 4.6, comes here
189        // arm64, s390, powerpc, x86_32
190        page_offset = PAGE_OFFSET;
191#endif
192
193        if (ip > page_offset) {
194            data.kernel_ip = ip;
195        }
196    }
197#endif
198
199#ifdef GRAB_ARGS
200    bpf_probe_read(&data.args[0], sizeof(data.args), entryp->args);
201#endif
202    bpf_get_current_comm(&data.comm, sizeof(data.comm));
203    events.perf_submit(ctx, &data, sizeof(data));
204
205    return 0;
206}
207"""
208
209bpf_text = bpf_text.replace('DURATION_NS', str(duration_ns))
210if args.arguments:
211    bpf_text = "#define GRAB_ARGS\n" + bpf_text
212if args.user_stack:
213    bpf_text = "#define USER_STACKS\n" + bpf_text
214if args.kernel_stack:
215    bpf_text = "#define KERNEL_STACKS\n" + bpf_text
216if args.tgid:
217    bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
218else:
219    bpf_text = bpf_text.replace('TGID_FILTER', '0')
220
221for i in range(len(args.functions)):
222    bpf_text += """
223int trace_%d(struct pt_regs *ctx) {
224    return trace_entry(ctx, %d);
225}
226""" % (i, i)
227
228if args.verbose or args.ebpf:
229    print(bpf_text)
230    if args.ebpf:
231        exit()
232
233b = BPF(text=bpf_text)
234
235for i, function in enumerate(args.functions):
236    if ":" in function:
237        library, func = function.split(":")
238        b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i)
239        b.attach_uretprobe(name=library, sym=func, fn_name="trace_return")
240    else:
241        b.attach_kprobe(event=function, fn_name="trace_%d" % i)
242        b.attach_kretprobe(event=function, fn_name="trace_return")
243
244TASK_COMM_LEN = 16  # linux/sched.h
245
246class Data(ct.Structure):
247    _fields_ = [
248        ("id", ct.c_ulonglong),
249        ("tgid_pid", ct.c_ulonglong),
250        ("start_ns", ct.c_ulonglong),
251        ("duration_ns", ct.c_ulonglong),
252        ("retval", ct.c_ulonglong),
253        ("comm", ct.c_char * TASK_COMM_LEN)
254    ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \
255            ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \
256            ([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else [])
257
258time_designator = "us" if args.min_us else "ms"
259time_value = args.min_us or args.min_ms or 1
260time_multiplier = 1000 if args.min_us else 1000000
261time_col = args.time or args.timestamp
262
263# Do not print header when folded
264if not args.folded:
265    print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
266          (time_value, time_designator))
267    print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
268        ("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
269        "FUNC" + (" ARGS" if args.arguments else "")))
270
271earliest_ts = 0
272
273def time_str(event):
274    if args.time:
275        return "%-10s " % time.strftime("%H:%M:%S")
276    if args.timestamp:
277        global earliest_ts
278        if earliest_ts == 0:
279            earliest_ts = event.start_ns
280        return "%-10.6f " % ((event.start_ns - earliest_ts) / 1000000000.0)
281    return ""
282
283def args_str(event):
284    if not args.arguments:
285        return ""
286    return str.join(" ", ["0x%x" % arg for arg in event.args[:args.arguments]])
287
288def print_stack(event):
289    user_stack = []
290    stack_traces = b.get_table("stacks")
291
292    if args.user_stack and event.user_stack_id > 0:
293        user_stack = stack_traces.walk(event.user_stack_id)
294
295    kernel_stack = []
296    if args.kernel_stack and event.kernel_stack_id > 0:
297        kernel_tmp = stack_traces.walk(event.kernel_stack_id)
298
299        # fix kernel stack
300        for addr in kernel_tmp:
301            kernel_stack.append(addr)
302
303    do_delimiter = user_stack and kernel_stack
304
305    if args.folded:
306        # print folded stack output
307        user_stack = list(user_stack)
308        kernel_stack = list(kernel_stack)
309        line = [event.comm.decode('utf-8', 'replace')] + \
310            [b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
311            (do_delimiter and ["-"] or []) + \
312            [b.ksym(addr) for addr in reversed(kernel_stack)]
313        print("%s %d" % (";".join(line), 1))
314    else:
315        # print default multi-line stack output.
316        for addr in kernel_stack:
317            print("    %s" % b.ksym(addr))
318        for addr in user_stack:
319            print("    %s" % b.sym(addr, event.tgid_pid))
320
321def print_event(cpu, data, size):
322    event = ct.cast(data, ct.POINTER(Data)).contents
323    ts = float(event.duration_ns) / time_multiplier
324    if not args.folded:
325        print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
326            (event.comm.decode('utf-8', 'replace'), event.tgid_pid >> 32,
327             ts, event.retval, args.functions[event.id], args_str(event)))
328    if args.user_stack or args.kernel_stack:
329        print_stack(event)
330
331b["events"].open_perf_buffer(print_event, page_cnt=64)
332while True:
333    b.perf_buffer_poll()
334