#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# runqlen    Summarize scheduler run queue length as a histogram.
#            For Linux, uses BCC, eBPF.
#
# This counts the length of the run queue, excluding the currently running
# thread, and shows it as a histogram.
#
# Also answers run queue occupancy.
#
# USAGE: runqlen [-h] [-T] [-O] [-C] [interval] [count]
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 12-Dec-2016   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
from tempfile import NamedTemporaryFile
from os import open, close, dup, dup2, unlink, O_WRONLY
import argparse

# arguments
examples = """examples:
    ./runqlen            # summarize run queue length as a histogram
    ./runqlen 1 10       # print 1 second summaries, 10 times
    ./runqlen -T 1       # 1s summaries and timestamps
    ./runqlen -O         # report run queue occupancy
    ./runqlen -C         # show each CPU separately
"""
parser = argparse.ArgumentParser(
    description="Summarize scheduler run queue length as a histogram",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("-O", "--runqocc", action="store_true",
    help="report run queue occupancy")
parser.add_argument("-C", "--cpus", action="store_true",
    help="print output for each CPU separately")
# type=int makes argparse reject non-numeric values with a usage error and
# lets the "interval == 0" check further down actually see a falsy 0 (the
# raw command-line string "0" would be truthy).
parser.add_argument("interval", nargs="?", default=99999999, type=int,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999, type=int,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = args.count
debug = 0
frequency = 99


# Linux 4.15 introduced a new field runnable_weight
# in linux_src:kernel/sched/sched.h as
#     struct cfs_rq {
#         struct load_weight load;
#         unsigned long runnable_weight;
#         unsigned int nr_running, h_nr_running;
#         ......
#     }
# and this tool requires to access nr_running to get
# runqueue len information.
#
# The commit which introduces cfs_rq->runnable_weight
# field also introduces the field sched_entity->runnable_weight
# where sched_entity is defined in linux_src:include/linux/sched.h.
#
# To cope with pre-4.15 and 4.15/post-4.15 releases,
# we run a simple BPF program to detect whether
# field sched_entity->runnable_weight exists. The existence of
# this field should infer the existence of cfs_rq->runnable_weight.
#
# This will need maintenance as the relationship between these
# two fields may change in the future.
#
def check_runnable_weight_field():
    """Return True if a probe program reading
    sched_entity->runnable_weight compiles, False otherwise.

    While the probe is compiled, file descriptor 2 (stderr) is pointed at
    a temporary file so compiler diagnostics for the expected failure case
    are not shown to the user. stderr is restored and the temporary file
    removed before returning, including when an unexpected exception (for
    example KeyboardInterrupt) propagates out of the compile.
    """
    # Define the bpf program for checking purpose
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Get a temporary file name; the Python-level handle is closed at once
    # because only the path is needed for the raw os.open() below.
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    try:
        # Keep a duplicate of the real stderr so it can be put back later.
        saved_stderr = dup(2)
        sink_fd = open(tmp_file.name, O_WRONLY)
        try:
            # Point fd 2 at the temporary file explicitly rather than
            # relying on close(2) + "lowest free descriptor" allocation.
            dup2(sink_fd, 2)
            try:
                BPF(text=bpf_check_text)
                success_compile = True
            except Exception:
                # A compile failure is the expected "field is absent"
                # answer. KeyboardInterrupt/SystemExit are deliberately
                # not caught here.
                success_compile = False
        finally:
            # Restore the original stderr and release both descriptors.
            dup2(saved_stderr, 2)
            close(sink_fd)
            close(saved_stderr)
    finally:
        # remove the temporary file
        unlink(tmp_file.name)

    return success_compile


# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    RUNNABLE_WEIGHT_FIELD
    unsigned int nr_running, h_nr_running;
};

typedef struct cpu_key {
    int cpu;
    unsigned int slot;
} cpu_key_t;
STORAGE

int do_perf_event()
{
    unsigned int len = 0;
    pid_t pid = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;

    // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
    // unstable interface and may need maintenance. Perhaps a future version
    // of BPF will support task_rq(p) or something similar as a more reliable
    // interface.
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    // Calculate run queue length by subtracting the currently running task,
    // if present. len 0 == idle, len 1 == one running task.
    if (len > 0)
        len--;

    STORE

    return 0;
}
"""

# code substitutions
if args.cpus:
    # Per-CPU mode keys the histogram by (cpu, slot).
    bpf_text = bpf_text.replace('STORAGE',
        'BPF_HISTOGRAM(dist, cpu_key_t);')
    bpf_text = bpf_text.replace('STORE', 'cpu_key_t key = {.slot = len}; ' +
        'key.cpu = bpf_get_smp_processor_id(); ' +
        'dist.increment(key);')
else:
    # System-wide mode keys the histogram by queue length only.
    bpf_text = bpf_text.replace('STORAGE',
        'BPF_HISTOGRAM(dist, unsigned int);')
    bpf_text = bpf_text.replace('STORE', 'dist.increment(len);')

# Match the layout of cfs_rq_partial to the running kernel (see the comment
# above check_runnable_weight_field).
if check_runnable_weight_field():
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD',
        'unsigned long runnable_weight;')
else:
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events
b = BPF(text=bpf_text)
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)

print("Sampling run queue length... Hit Ctrl-C to end.")


def occupancy_ratio(idle, queued):
    """Return queued / (idle + queued) as a float, or 0 with no samples."""
    samples = idle + queued
    if samples:
        return float(queued) / samples
    return 0


def print_runqocc_per_cpu(items):
    """Print one run queue occupancy line per CPU from 0 to the highest
    CPU id seen.

    items is an iterable of (key, value) pairs where key has .cpu and
    .slot attributes and value has a .value count. Slot 0 counts as idle;
    every other slot counts as queued. CPUs with no samples print 0.00%.
    """
    idle = {}
    queued = {}
    cpumax = 0
    # Single pass over one snapshot, so both tallies and cpumax always
    # describe the same set of samples.
    for k, v in items:
        if k.cpu > cpumax:
            cpumax = k.cpu
        bucket = idle if k.slot == 0 else queued
        bucket[k.cpu] = bucket.get(k.cpu, 0) + v.value
    for c in range(cpumax + 1):
        runqocc = occupancy_ratio(idle.get(c, 0), queued.get(c, 0))
        print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc))


def print_runqocc_system(items):
    """Print a single system-wide run queue occupancy line.

    items is an iterable of (key, value) pairs where key.value is the
    queue length slot and value.value is the sample count for that slot.
    """
    idle = 0
    queued = 0
    for k, v in items:
        if k.value == 0:
            idle += v.value
        else:
            queued += v.value
    print("runqocc: %0.2f%%" % (100 * occupancy_ratio(idle, queued)))


# output
# An interval of 0 means: print one summary immediately, then exit.
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
    try:
        sleep(args.interval)
    except KeyboardInterrupt:
        # Ctrl-C: still print the final summary below before exiting.
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    if args.runqocc:
        # Read the map once per interval and summarize that snapshot.
        snapshot = dist.items()
        if args.cpus:
            # run queue occupancy, per-CPU summary
            print_runqocc_per_cpu(snapshot)
        else:
            # run queue occupancy, system-wide summary
            print_runqocc_system(snapshot)
    else:
        # run queue length histograms
        dist.print_linear_hist("runqlen", "cpu")

    dist.clear()

    countdown -= 1
    if exiting or countdown == 0:
        exit()