• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# profile  Profile CPU usage by sampling stack traces at a timed interval.
5#          For Linux, uses BCC, BPF, perf_events. Embedded C.
6#
7# This is an efficient profiler, as stack traces are frequency counted in
8# kernel context, rather than passing every stack to user space for frequency
9# counting there. Only the unique stacks and counts are passed to user space
10# at the end of the profile, greatly reducing the kernel<->user transfer.
11#
12# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
13# a version of this tool that may work on Linux 4.6 - 4.8.
14#
15# Copyright 2016 Netflix, Inc.
16# Licensed under the Apache License, Version 2.0 (the "License")
17#
18# THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux;
19# Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much
20# of the code here, borrowed from tracepoint.py and offcputime.py; and
21# Teng Qin, who added perf support in bcc.
22#
23# 15-Jul-2016   Brendan Gregg   Created this.
24# 20-Oct-2016      "      "     Switched to use the new 4.9 support.
25
26from __future__ import print_function
27from bcc import BPF, PerfType, PerfSWConfig
28from sys import stderr
29from time import sleep
30import argparse
31import signal
32import os
33import errno
34import multiprocessing
35import ctypes as ct
36
37#
38# Process Arguments
39#
40
41# arg validation
def positive_int(val):
    """argparse type callback: parse val as an integer that is not negative.

    Zero is accepted here; only values below zero are rejected.

    Returns the parsed int. Raises argparse.ArgumentTypeError when val is
    not an integer literal, or when it is negative.
    """
    try:
        number = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if number >= 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
51
def positive_nonzero_int(val):
    """argparse type callback: like positive_int, but zero is rejected too.

    Returns the parsed int. Raises argparse.ArgumentTypeError when val is
    not an integer, is negative (both via positive_int), or is zero.
    """
    number = positive_int(val)
    if not number:
        raise argparse.ArgumentTypeError("must be nonzero")
    return number
57
def stack_id_err(stack_id):
    """Return True if stack_id is a get_stackid error that should be reported.

    Non-negative ids are valid stacks. -EFAULT is not counted as an error: it
    normally means the stack trace is simply not available, such as asking
    for a kernel stack trace while running userspace code.
    """
    if stack_id >= 0:
        return False
    return stack_id != -errno.EFAULT
62
63# arguments
# Usage examples, shown verbatim after the option list in --help (the parser
# uses RawDescriptionHelpFormatter so the layout below is preserved).
examples = """examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile -c 1000000  # profile stack traces every 1 in a million events
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # output in folded format for flame graphs
    ./profile -p 185      # only profile threads for PID 185
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
"""
parser = argparse.ArgumentParser(
    description="Profile CPU stack traces at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
# Thread selection. Only -p exists today; the group leaves room for further
# mutually exclusive thread filters.
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int,
    help="profile this PID only")
# TODO: add options for user/kernel threads only
# Stack selection: -U and -K cannot be combined.
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
# Sampling rate: either a frequency (-F) or an event period (-c), not both.
# positive_int lets 0 through, and the truthiness checks that later pick the
# rate treat 0 the same as "option not given".
sample_group = parser.add_mutually_exclusive_group()
sample_group.add_argument("-F", "--frequency", type=positive_int,
    help="sample frequency, Hertz")
sample_group.add_argument("-c", "--count", type=positive_int,
    help="sample period, number of events")
# Output formatting options.
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-a", "--annotations", action="store_true",
    help="add _[k] annotations to kernel frames")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format, one line per stack (for flame graphs)")
# Substituted into the BPF program as the size of the stack trace table.
parser.add_argument("--stack-storage-size", default=16384,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
        "displayed (default %(default)s)")
# The default of 99999999 seconds doubles as the "run until Ctrl-C" sentinel:
# the header code compares duration against this exact value.
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
# Passed straight through to attach_perf_event(cpu=...); the header only
# mentions the CPU when it is >= 0.
# NOTE(review): -1 presumably means "all CPUs" to bcc -- confirm.
parser.add_argument("-C", "--cpu", type=int, default=-1,
    help="cpu number to run profile on")
# Hidden from --help: prints the generated BPF program and exits.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)

# option logic
args = parser.parse_args()
# -1 stands for "no PID filter".
pid = int(args.pid) if args.pid is not None else -1
duration = int(args.duration)
# Set to a true value by hand to dump the generated BPF program before running.
debug = 0
# A kernel/user delimiter is only meaningful when both stack types are shown.
need_delimiter = args.delimited and not (args.kernel_stacks_only or
    args.user_stacks_only)
117# TODO: add stack depth, and interval
118
119#
120# Setup BPF
121#
122
# define BPF program
#
# Embedded C source, compiled by bcc. It is a template: the tokens
# THREAD_FILTER, STACK_STORAGE_SIZE, USER_STACK_GET and KERNEL_STACK_GET are
# replaced with str.replace() further down, before the text is compiled.
#
# do_perf_event() is the function attached to the perf event. For each sample
# that passes THREAD_FILTER it builds a key_t from the pid (upper 32 bits of
# bpf_get_current_pid_tgid()), the task comm, and the user/kernel stack ids,
# then increments that key's entry in the "counts" hash. The stacks themselves
# live in the "stack_traces" table, sized by STACK_STORAGE_SIZE.
#
# key.kernel_ip is only filled in when a kernel stack id was obtained and the
# sampled instruction pointer is above the kernel page offset; otherwise it
# stays zero so the reporting code can tell it apart. kernel_ret_ip is
# declared in the key but never assigned by this program.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    u64 kernel_ip;
    u64 kernel_ret_ip;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

// This code gets a bit complex. Probably not suitable for casual hacking.

int do_perf_event(struct bpf_perf_event_data *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    if (!(THREAD_FILTER))
        return 0;

    // create map key
    struct key_t key = {.pid = pid};
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // get stacks
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;

    if (key.kernel_stack_id >= 0) {
        // populate extras to fix the kernel stack
        u64 ip = PT_REGS_IP(&ctx->regs);
        u64 page_offset;

        // if ip isn't sane, leave key ips as zero for later checking
#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
        page_offset = __PAGE_OFFSET_BASE;
#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
        // x64, 4.17, and later
#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
        page_offset = __PAGE_OFFSET_BASE_L5;
#else
        page_offset = __PAGE_OFFSET_BASE_L4;
#endif
#else
        // earlier x86_64 kernels, e.g., 4.6, comes here
        // arm64, s390, powerpc, x86_32
        page_offset = PAGE_OFFSET;
#endif

        if (ip > page_offset) {
            key.kernel_ip = ip;
        }
    }

    counts.increment(key);
    return 0;
}
"""
186
# Thread filter: a C boolean expression for the BPF program, a human-readable
# description for the header, and the equivalent perf(1) style filter.
if args.pid is None:
    thread_context = "all threads"
    thread_filter = '1'
    perf_filter = "-a"
else:
    thread_context = "PID %s" % args.pid
    thread_filter = 'pid == %s' % args.pid
    perf_filter = '-p %s' % args.pid

# Stack selection: by default both stacks are fetched; a stack that was
# excluded on the command line is replaced by the constant -1 in the BPF code.
user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
stack_context = "user + kernel"
if args.user_stacks_only:
    stack_context, kernel_stack_get = "user", "-1"
elif args.kernel_stacks_only:
    stack_context, user_stack_get = "kernel", "-1"

# Fill in the BPF program template (thread filter, stack storage size, then
# the two stack getters, in that order).
bpf_text = (bpf_text
            .replace('THREAD_FILTER', thread_filter)
            .replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
            .replace('USER_STACK_GET', user_stack_get)
            .replace('KERNEL_STACK_GET', kernel_stack_get))
216
# Pick the sampling rate. Exactly one of sample_freq / sample_period ends up
# nonzero: an explicit frequency wins, then an explicit event count, and if
# neither was given (or given as 0) we sample at the default 49 Hertz.
if args.frequency:
    sample_freq, sample_period = args.frequency, 0
elif args.count:
    sample_freq, sample_period = 0, args.count
else:
    sample_freq, sample_period = 49, 0

# Human-readable description of the rate, used in the header line.
if sample_freq:
    sample_context = "%d Hertz" % sample_freq
else:
    sample_context = "every %d events" % sample_period
228
# Header line (skipped for folded output, which must stay machine-readable):
# what is sampled, optionally on which CPU, and for how long.
if not args.folded:
    print(("Sampling at %s of %s by %s stack" %
           (sample_context, thread_context, stack_context)) +
          (" on CPU#{}".format(args.cpu) if args.cpu >= 0 else "") +
          (" for %d secs." % duration if duration < 99999999
           else "... Hit Ctrl-C to end."))

# --ebpf dumps the generated BPF program and stops; the debug switch dumps it
# and carries on.
if args.ebpf:
    print(bpf_text)
    exit()
if debug:
    print(bpf_text)

# Compile the BPF program and attach do_perf_event to the software CPU clock
# perf event, using whichever of period/frequency was selected above.
b = BPF(text=bpf_text)
b.attach_perf_event(
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK,
    fn_name="do_perf_event",
    sample_period=sample_period,
    sample_freq=sample_freq,
    cpu=args.cpu)
250
251# signal handler
def signal_ignore(signal, frame):
    """Signal handler that swallows the signal and only emits a blank line.

    Both arguments (signal number and current stack frame, as passed by the
    signal module) are ignored.
    """
    print("")
254
255#
256# Output Report
257#
258
# collect samples
# Nothing is read while profiling: this process just sleeps for the requested
# duration (or until Ctrl-C) and the tables are read afterwards.
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    # from here on a further Ctrl-C only prints a newline (signal_ignore)
    # instead of aborting the report.
    signal.signal(signal.SIGINT, signal_ignore)

# Blank line separating the header (or the echoed ^C) from the stack output;
# omitted for folded output.
if not args.folded:
    print()
268
def aksym(addr):
    """Resolve kernel address addr to a symbol via b.ksym().

    When the -a/--annotations option is set, the suffix "_[k]" is appended
    to the resolved symbol to mark it as a kernel frame.
    """
    symbol = b.ksym(addr)
    if args.annotations:
        symbol = symbol + "_[k]".encode()
    return symbol
274
# output stacks
#
# Walk every (key, count) pair collected by the BPF program, resolve the stack
# ids to symbol names and print them, either one folded line per stack (-f) or
# as a multi-line listing. Symbols and the task name are handled as bytes and
# decoded once, when printed.
missing_stacks = 0
has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
# a delimiter is only meaningful when both stack types are being shown
need_delimiter = args.delimited and not (args.kernel_stacks_only or
                                         args.user_stacks_only)
# ascending by sample count, so the most frequent stacks are printed last
for k, v in sorted(counts.items(), key=lambda kv: kv[1].value):
    # handle get_stackid errors: count each requested stack that could not be
    # captured, and remember whether any failure was due to a full table
    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM

    # Materialize both stacks as lists up front: they get reversed for folded
    # output and the kernel one is modified below (walk() may hand back a
    # one-shot iterator).
    user_stack = [] if k.user_stack_id < 0 else \
        list(stack_traces.walk(k.user_stack_id))
    kernel_stack = [] if k.kernel_stack_id < 0 else \
        list(stack_traces.walk(k.kernel_stack_id))

    # fix kernel stack: the BPF program stores the sampled instruction pointer
    # in kernel_ip only when it passed its sanity check; when present it
    # becomes the innermost kernel frame
    if k.kernel_stack_id >= 0 and k.kernel_ip:
        kernel_stack.insert(0, k.kernel_ip)

    both_stacks = k.user_stack_id >= 0 and k.kernel_stack_id >= 0

    if args.folded:
        # print folded stack output: "comm;outermost;...;innermost count".
        # Everything in `line` must be bytes, since b.sym()/b.ksym() and
        # k.name are bytes and the pieces are joined with b";".
        line = [k.name]
        # if we failed to get the stack id, such as due to no space (-ENOMEM)
        # or hash collision (-EEXIST), we still print a placeholder for
        # consistency
        if not args.kernel_stacks_only:
            if stack_id_err(k.user_stack_id):
                line.append(b"[Missed User Stack]")
            else:
                line.extend([b.sym(addr, k.pid)
                             for addr in reversed(user_stack)])
        if not args.user_stacks_only:
            if need_delimiter and both_stacks:
                line.append(b"-")
            if stack_id_err(k.kernel_stack_id):
                line.append(b"[Missed Kernel Stack]")
            else:
                # aksym() rather than b.ksym() so that -a annotations also
                # apply to folded output
                line.extend([aksym(addr) for addr in reversed(kernel_stack)])
        print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value))
    else:
        # print default multi-line stack output: kernel frames, optional
        # delimiter, user frames, then the task name/pid and the sample count
        if not args.user_stacks_only:
            if stack_id_err(k.kernel_stack_id):
                print("    [Missed Kernel Stack]")
            else:
                for addr in kernel_stack:
                    # decode so Python 3 prints the symbol, not b'...'
                    print("    %s" % aksym(addr).decode('utf-8', 'replace'))
        if not args.kernel_stacks_only:
            if need_delimiter and both_stacks:
                print("    --")
            if stack_id_err(k.user_stack_id):
                print("    [Missed User Stack]")
            else:
                for addr in user_stack:
                    print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
        print("        %d\n" % v.value)
342
# Warn on stderr if any requested stack could not be captured; when at least
# one failure was -ENOMEM, also suggest enlarging the stack table.
if missing_stacks > 0:
    if has_enomem:
        enomem_str = " Consider increasing --stack-storage-size."
    else:
        enomem_str = ""
    print("WARNING: %d stack traces could not be displayed.%s" %
          (missing_stacks, enomem_str), file=stderr)
350