1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# profile Profile CPU usage by sampling stack traces at a timed interval. 5# For Linux, uses BCC, BPF, perf_events. Embedded C. 6# 7# This is an efficient profiler, as stack traces are frequency counted in 8# kernel context, rather than passing every stack to user space for frequency 9# counting there. Only the unique stacks and counts are passed to user space 10# at the end of the profile, greatly reducing the kernel<->user transfer. 11# 12# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is 13# a version of this tool that may work on Linux 4.6 - 4.8. 14# 15# Copyright 2016 Netflix, Inc. 16# Licensed under the Apache License, Version 2.0 (the "License") 17# 18# THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux; 19# Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much 20# of the code here, borrowed from tracepoint.py and offcputime.py; and 21# Teng Qin, who added perf support in bcc. 22# 23# 15-Jul-2016 Brendan Gregg Created this. 24# 20-Oct-2016 " " Switched to use the new 4.9 support. 25 26from __future__ import print_function 27from bcc import BPF, PerfType, PerfSWConfig 28from sys import stderr 29from time import sleep 30import argparse 31import signal 32import os 33import errno 34import multiprocessing 35import ctypes as ct 36 37# 38# Process Arguments 39# 40 41# arg validation 42def positive_int(val): 43 try: 44 ival = int(val) 45 except ValueError: 46 raise argparse.ArgumentTypeError("must be an integer") 47 48 if ival < 0: 49 raise argparse.ArgumentTypeError("must be positive") 50 return ival 51 52def positive_nonzero_int(val): 53 ival = positive_int(val) 54 if ival == 0: 55 raise argparse.ArgumentTypeError("must be nonzero") 56 return ival 57 58def stack_id_err(stack_id): 59 # -EFAULT in get_stackid normally means the stack-trace is not availible, 60 # Such as getting kernel stack trace in userspace code 61 return (stack_id < 0) and (stack_id != -errno.EFAULT) 62 63# arguments 64examples = """examples: 65 ./profile # profile stack traces at 49 Hertz until Ctrl-C 66 ./profile -F 99 # profile stack traces at 99 Hertz 67 ./profile -c 1000000 # profile stack traces every 1 in a million events 68 ./profile 5 # profile at 49 Hertz for 5 seconds only 69 ./profile -f 5 # output in folded format for flame graphs 70 ./profile -p 185 # only profile threads for PID 185 71 ./profile -U # only show user space stacks (no kernel) 72 ./profile -K # only show kernel space stacks (no user) 73""" 74parser = argparse.ArgumentParser( 75 description="Profile CPU stack traces at a timed interval", 76 formatter_class=argparse.RawDescriptionHelpFormatter, 77 epilog=examples) 78thread_group = parser.add_mutually_exclusive_group() 79thread_group.add_argument("-p", "--pid", type=positive_int, 80 help="profile this PID only") 81# TODO: add options for user/kernel threads only 82stack_group = parser.add_mutually_exclusive_group() 83stack_group.add_argument("-U", "--user-stacks-only", action="store_true", 84 help="show stacks from user space only (no kernel space stacks)") 85stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true", 86 help="show stacks from kernel space only (no user space stacks)") 87sample_group = parser.add_mutually_exclusive_group() 88sample_group.add_argument("-F", "--frequency", type=positive_int, 89 help="sample frequency, Hertz") 90sample_group.add_argument("-c", "--count", type=positive_int, 91 help="sample period, number of events") 92parser.add_argument("-d", "--delimited", action="store_true", 93 help="insert delimiter between kernel/user stacks") 94parser.add_argument("-a", "--annotations", action="store_true", 95 help="add _[k] annotations to kernel frames") 96parser.add_argument("-f", "--folded", action="store_true", 97 help="output folded format, one line per stack (for flame graphs)") 98parser.add_argument("--stack-storage-size", default=16384, 99 type=positive_nonzero_int, 100 help="the number of unique stack traces that can be stored and " 101 "displayed (default %(default)s)") 102parser.add_argument("duration", nargs="?", default=99999999, 103 type=positive_nonzero_int, 104 help="duration of trace, in seconds") 105parser.add_argument("-C", "--cpu", type=int, default=-1, 106 help="cpu number to run profile on") 107parser.add_argument("--ebpf", action="store_true", 108 help=argparse.SUPPRESS) 109 110# option logic 111args = parser.parse_args() 112pid = int(args.pid) if args.pid is not None else -1 113duration = int(args.duration) 114debug = 0 115need_delimiter = args.delimited and not (args.kernel_stacks_only or 116 args.user_stacks_only) 117# TODO: add stack depth, and interval 118 119# 120# Setup BPF 121# 122 123# define BPF program 124bpf_text = """ 125#include <uapi/linux/ptrace.h> 126#include <uapi/linux/bpf_perf_event.h> 127#include <linux/sched.h> 128 129struct key_t { 130 u32 pid; 131 u64 kernel_ip; 132 u64 kernel_ret_ip; 133 int user_stack_id; 134 int kernel_stack_id; 135 char name[TASK_COMM_LEN]; 136}; 137BPF_HASH(counts, struct key_t); 138BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); 139 140// This code gets a bit complex. Probably not suitable for casual hacking. 141 142int do_perf_event(struct bpf_perf_event_data *ctx) { 143 u32 pid = bpf_get_current_pid_tgid() >> 32; 144 if (!(THREAD_FILTER)) 145 return 0; 146 147 // create map key 148 struct key_t key = {.pid = pid}; 149 bpf_get_current_comm(&key.name, sizeof(key.name)); 150 151 // get stacks 152 key.user_stack_id = USER_STACK_GET; 153 key.kernel_stack_id = KERNEL_STACK_GET; 154 155 if (key.kernel_stack_id >= 0) { 156 // populate extras to fix the kernel stack 157 u64 ip = PT_REGS_IP(&ctx->regs); 158 u64 page_offset; 159 160 // if ip isn't sane, leave key ips as zero for later checking 161#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) 162 // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it 163 page_offset = __PAGE_OFFSET_BASE; 164#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) 165 // x64, 4.17, and later 166#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) 167 page_offset = __PAGE_OFFSET_BASE_L5; 168#else 169 page_offset = __PAGE_OFFSET_BASE_L4; 170#endif 171#else 172 // earlier x86_64 kernels, e.g., 4.6, comes here 173 // arm64, s390, powerpc, x86_32 174 page_offset = PAGE_OFFSET; 175#endif 176 177 if (ip > page_offset) { 178 key.kernel_ip = ip; 179 } 180 } 181 182 counts.increment(key); 183 return 0; 184} 185""" 186 187# set thread filter 188thread_context = "" 189perf_filter = "-a" 190if args.pid is not None: 191 thread_context = "PID %s" % args.pid 192 thread_filter = 'pid == %s' % args.pid 193 perf_filter = '-p %s' % args.pid 194else: 195 thread_context = "all threads" 196 thread_filter = '1' 197bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) 198 199# set stack storage size 200bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) 201 202# handle stack args 203kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)" 204user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)" 205stack_context = "" 206if args.user_stacks_only: 207 stack_context = "user" 208 kernel_stack_get = "-1" 209elif args.kernel_stacks_only: 210 stack_context = "kernel" 211 user_stack_get = "-1" 212else: 213 stack_context = "user + kernel" 214bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get) 215bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get) 216 217sample_freq = 0 218sample_period = 0 219if args.frequency: 220 sample_freq = args.frequency 221elif args.count: 222 sample_period = args.count 223else: 224 # If user didn't specify anything, use default 49Hz sampling 225 sample_freq = 49 226sample_context = "%s%d %s" % (("", sample_freq, "Hertz") if sample_freq 227 else ("every ", sample_period, "events")) 228 229# header 230if not args.folded: 231 print("Sampling at %s of %s by %s stack" % 232 (sample_context, thread_context, stack_context), end="") 233 if args.cpu >= 0: 234 print(" on CPU#{}".format(args.cpu), end="") 235 if duration < 99999999: 236 print(" for %d secs." % duration) 237 else: 238 print("... Hit Ctrl-C to end.") 239 240if debug or args.ebpf: 241 print(bpf_text) 242 if args.ebpf: 243 exit() 244 245# initialize BPF & perf_events 246b = BPF(text=bpf_text) 247b.attach_perf_event(ev_type=PerfType.SOFTWARE, 248 ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event", 249 sample_period=sample_period, sample_freq=sample_freq, cpu=args.cpu) 250 251# signal handler 252def signal_ignore(signal, frame): 253 print() 254 255# 256# Output Report 257# 258 259# collect samples 260try: 261 sleep(duration) 262except KeyboardInterrupt: 263 # as cleanup can take some time, trap Ctrl-C: 264 signal.signal(signal.SIGINT, signal_ignore) 265 266if not args.folded: 267 print() 268 269def aksym(addr): 270 if args.annotations: 271 return b.ksym(addr) + "_[k]".encode() 272 else: 273 return b.ksym(addr) 274 275# output stacks 276missing_stacks = 0 277has_enomem = False 278counts = b.get_table("counts") 279stack_traces = b.get_table("stack_traces") 280need_delimiter = args.delimited and not (args.kernel_stacks_only or 281 args.user_stacks_only) 282for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): 283 # handle get_stackid errors 284 if not args.user_stacks_only and stack_id_err(k.kernel_stack_id): 285 missing_stacks += 1 286 has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM 287 if not args.kernel_stacks_only and stack_id_err(k.user_stack_id): 288 missing_stacks += 1 289 has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM 290 291 user_stack = [] if k.user_stack_id < 0 else \ 292 stack_traces.walk(k.user_stack_id) 293 kernel_tmp = [] if k.kernel_stack_id < 0 else \ 294 stack_traces.walk(k.kernel_stack_id) 295 296 # fix kernel stack 297 kernel_stack = [] 298 if k.kernel_stack_id >= 0: 299 for addr in kernel_tmp: 300 kernel_stack.append(addr) 301 # the later IP checking 302 if k.kernel_ip: 303 kernel_stack.insert(0, k.kernel_ip) 304 305 if args.folded: 306 # print folded stack output 307 user_stack = list(user_stack) 308 kernel_stack = list(kernel_stack) 309 line = [k.name] 310 # if we failed to get the stack is, such as due to no space (-ENOMEM) or 311 # hash collision (-EEXIST), we still print a placeholder for consistency 312 if not args.kernel_stacks_only: 313 if stack_id_err(k.user_stack_id): 314 line.append("[Missed User Stack]") 315 else: 316 line.extend([b.sym(addr, k.pid) for addr in reversed(user_stack)]) 317 if not args.user_stacks_only: 318 line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) 319 if stack_id_err(k.kernel_stack_id): 320 line.append("[Missed Kernel Stack]") 321 else: 322 line.extend([b.ksym(addr) for addr in reversed(kernel_stack)]) 323 print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value)) 324 else: 325 # print default multi-line stack output 326 if not args.user_stacks_only: 327 if stack_id_err(k.kernel_stack_id): 328 print(" [Missed Kernel Stack]") 329 else: 330 for addr in kernel_stack: 331 print(" %s" % aksym(addr)) 332 if not args.kernel_stacks_only: 333 if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0: 334 print(" --") 335 if stack_id_err(k.user_stack_id): 336 print(" [Missed User Stack]") 337 else: 338 for addr in user_stack: 339 print(" %s" % b.sym(addr, k.pid).decode('utf-8', 'replace')) 340 print(" %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid)) 341 print(" %d\n" % v.value) 342 343# check missing 344if missing_stacks > 0: 345 enomem_str = "" if not has_enomem else \ 346 " Consider increasing --stack-storage-size." 347 print("WARNING: %d stack traces could not be displayed.%s" % 348 (missing_stacks, enomem_str), 349 file=stderr) 350