#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# criticalstat  Trace long critical sections (IRQs or preemption disabled)
#               For Linux, uses BCC, eBPF. Requires kernel built with
#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
#
# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
#
# Copyright (c) 2018, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# By Joel Fernandes <joel@joelfernandes.org>

from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct
import sys
import subprocess
import os.path

examples = ""

parser = argparse.ArgumentParser(
    description="Trace long critical sections",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

parser.add_argument("-p", "--preemptoff", action="store_true",
                    help="Find long sections where preemption was off")

parser.add_argument("-i", "--irqoff", action="store_true",
                    help="Find long sections where IRQs were off")

parser.add_argument("-d", "--duration", default=100, type=int,
                    help="Duration in us (microseconds) below which critical "
                         "sections are filtered out")

args = parser.parse_args()

preemptoff = False
irqoff = False

if args.irqoff:
    preemptoff = False
    irqoff = True
elif args.preemptoff:
    preemptoff = True
    irqoff = False

debugfs_path = subprocess.Popen("grep -w debugfs /proc/mounts" +
                                " | awk '{print $2}'",
                                shell=True,
                                stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]

if debugfs_path == b"":
    print("ERROR: Unable to find debugfs mount point")
    sys.exit(1)

trace_path = debugfs_path + b"/tracing/events/preemptirq/"

if (not os.path.exists(trace_path + b"irq_disable") or
        not os.path.exists(trace_path + b"irq_enable") or
        not os.path.exists(trace_path + b"preempt_disable") or
        not os.path.exists(trace_path + b"preempt_enable")):
    print("ERROR: required tracing events are not available\n" +
          "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT " +
          "and CONFIG_PREEMPTIRQ_EVENTS enabled. Also please disable " +
          "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
    sys.exit(1)

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

enum addr_offs {
    START_CALLER_OFF,
    START_PARENT_OFF,
    END_CALLER_OFF,
    END_PARENT_OFF
};

struct start_data {
    u32 addr_offs[2];   /* START_CALLER_OFF, START_PARENT_OFF */
    u64 ts;
    int idle_skip;
    int active;
};

struct data_t {
    u64 time;
    s64 stack_id;
    u32 cpu;
    u64 id;
    u32 addrs[4];   /* indexed by addr_offs */
    char comm[TASK_COMM_LEN];
};

BPF_STACK_TRACE(stack_traces, 16384);
BPF_PERCPU_ARRAY(sts, struct start_data, 1);
BPF_PERCPU_ARRAY(isidle, u64, 1);
BPF_PERF_OUTPUT(events);

/*
 * In the code below we install tracepoint probes on preempt or
 * IRQ disable/enable critical sections and on idle events; the cases
 * of interest are combinations of 4 different states.
 * The states are defined as:
 * CSenter: A critical section has been entered, either due to
 *          preempt disable or IRQ disable.
 * CSexit:  A critical section has been exited, either due to
 *          preempt enable or IRQ enable.
 * Ienter:  The CPU has entered an idle state.
 * Iexit:   The CPU has exited an idle state.
 *
 * The scenario we are trying to detect is an overlap between a
 * critical section and idle entry/exit. In any such case, we avoid
 * recording the critical section, since it is not worthwhile to
 * record and only adds noise.
 */
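/*
 * Illustrative timelines (not exhaustive) of the overlaps we reject,
 * using the four states above:
 *
 *   CSenter .. Ienter .. Iexit .. CSexit
 *       cpu_idle fires while the section is active, so idle_skip is
 *       set and the section is discarded at CSexit.
 *
 *   Ienter .. CSenter .. CSexit .. Iexit
 *       the disable/enable events fire while isidle is set, so the
 *       per-CPU state is reset and nothing is recorded.
 *
 * Only sections that do not overlap idle at all are timed and, if
 * long enough, reported.
 */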
TRACEPOINT_PROBE(power, cpu_idle)
{
    int idx = 0;
    u64 val;
    struct start_data *stdp, std;

    // Mark active sections so that they will be skipped

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case CSenter, Ienter, Iexit, CSexit
    stdp = sts.lookup(&idx);
    if (stdp && stdp->active) {
        /*
         * Due to verifier issues, we have to copy the contents
         * of stdp onto the stack before the update.
         * Fix this to update directly once kernel patch d71962f
         * becomes more widespread.
         */
        std = *stdp;
        std.idle_skip = 1;
        sts.update(&idx, &std);
    }

    // Mark whether the CPU is actively within idle or not.
    // (state values below 100 are idle entries; idle exit is
    // reported as PWR_EVENT_EXIT, i.e. (u32)-1)
    if (args->state < 100) {
        val = 1;
        isidle.update(&idx, &val);
    } else {
        val = 0;
        isidle.update(&idx, &val);
    }
    return 0;
}

static int in_idle(void)
{
    u64 *idlep;
    int idx = 0;

    // Skip the event if we're in the idle loop
    idlep = isidle.lookup(&idx);
    if (idlep && *idlep)
        return 1;
    return 0;
}

static void reset_state(void)
{
    int idx = 0;
    struct start_data s = {};

    sts.update(&idx, &s);
}

TRACEPOINT_PROBE(preemptirq, TYPE_disable)
{
    int idx = 0;
    struct start_data s;

    // Handle the case Ienter, CSenter, CSexit, Iexit
    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    u64 ts = bpf_ktime_get_ns();

    s.idle_skip = 0;
    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
    s.ts = ts;
    s.active = 1;

    sts.update(&idx, &s);
    return 0;
}

TRACEPOINT_PROBE(preemptirq, TYPE_enable)
{
    int idx = 0;
    u64 start_ts, end_ts, diff;
    struct start_data *stdp;

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case Ienter, CSenter, CSexit, Iexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    stdp = sts.lookup(&idx);
    if (!stdp) {
        reset_state();
        return 0;
    }

    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (!stdp->active) {
        reset_state();
        return 0;
    }

    // Handle the case CSenter, Ienter, Iexit, CSexit
    if (stdp->idle_skip) {
        reset_state();
        return 0;
    }

    end_ts = bpf_ktime_get_ns();
    start_ts = stdp->ts;

    // Discard sections with nonsensical timestamps
    if (start_ts > end_ts) {
        reset_state();
        return 0;
    }

    diff = end_ts - start_ts;

    // Report only sections at least DURATION ns long
    if (diff < DURATION) {
        reset_state();
        return 0;
    }

    u64 id = bpf_get_current_pid_tgid();
    struct data_t data = {};

    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
        data.addrs[END_CALLER_OFF] = args->caller_offs;
        data.addrs[END_PARENT_OFF] = args->parent_offs;

        data.id = id;
        data.stack_id = stack_traces.get_stackid(args, 0);
        data.time = diff;
        data.cpu = bpf_get_smp_processor_id();
        events.perf_submit(args, &data, sizeof(data));
    }

    reset_state();
    return 0;
}
"""
bpf_text = bpf_text.replace('DURATION', '{}'.format(args.duration * 1000))

if preemptoff:
    bpf_text = bpf_text.replace('TYPE', 'preempt')
else:
    bpf_text = bpf_text.replace('TYPE', 'irq')
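# For illustration: with the defaults (IRQ tracing, -d 100), the substitutions
# above install probes on preemptirq:irq_disable/irq_enable and set the
# DURATION threshold to 100000 ns.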
b = BPF(text=bpf_text)

TASK_COMM_LEN = 16    # linux/sched.h

class Data(ct.Structure):
    _fields_ = [
        ("time", ct.c_ulonglong),
        ("stack_id", ct.c_longlong),
        ("cpu", ct.c_int),
        ("id", ct.c_ulonglong),
        ("addrs", ct.c_int * 4),
        ("comm", ct.c_char * TASK_COMM_LEN),
    ]

def get_syms(kstack):
    syms = []

    for addr in kstack:
        s = b.ksym(addr, show_offset=True)
        syms.append(s)

    return syms

# process event
def print_event(cpu, data, size):
    try:
        global b
        event = ct.cast(data, ct.POINTER(Data)).contents
        stack_traces = b['stack_traces']
        stext = b.ksymname('_stext')

        print("===================================")
        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" % (
            event.comm, (event.id >> 32), (event.id & 0xffffffff),
            float(event.time) / 1000), end="")
        # The BPF program records code addresses as offsets from _stext,
        # so rebase them before symbolization.
        print("Section start: {} -> {}".format(b.ksym(stext + event.addrs[0]),
                                               b.ksym(stext + event.addrs[1])))
        print("Section end:   {} -> {}".format(b.ksym(stext + event.addrs[2]),
                                               b.ksym(stext + event.addrs[3])))

        if event.stack_id >= 0:
            kstack = stack_traces.walk(event.stack_id)
            syms = get_syms(kstack)
            if not syms:
                return

            for s in syms:
                print(" ", end="")
                print("%s" % s)
        else:
            print("NO STACK FOUND DUE TO COLLISION")
        print("===================================")
        print("")
    except:
        sys.exit(0)

b["events"].open_perf_buffer(print_event, page_cnt=256)

print("Finding critical sections with {} disabled for > {}us".format(
    'preempt' if preemptoff else 'IRQ', args.duration))

while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        sys.exit(0)
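# Illustrative invocations (duration threshold via -d, in microseconds):
#   ./criticalstat             # trace IRQ-disabled sections longer than 100us
#   ./criticalstat -p          # trace preempt-disabled sections instead
#   ./criticalstat -i -d 500   # report only IRQ-off sections longer than 500us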