• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #!/usr/bin/python
2 #
3 # lockstat Trace and display lock contention stats
4 #
5 # USAGE: lockstat
6 
7 # Licensed under the Apache License, Version 2.0 (the "License")
8 # 28-Jul-2017   Gisle Dankel   Created this.
9 
10 from bcc import BPF
11 from ctypes import c_int
12 from time import sleep
13 from datetime import datetime
14 import argparse
15 import subprocess
16 import os
17 
18 # One Lock object per TGID and uaddr.
19 class Lock(object):
20     def __init__(self):
21         self.contention_count = 0
22         self.elapsed_blocked = 0
23         self.thread_count = 0
24         self.last_stack_syms = []
25 
26     def update(self, count, block_time, last_stack_syms):
27         self.contention_count += count
28         self.elapsed_blocked += block_time
29         self.thread_count += 1
30         self.last_stack_syms = last_stack_syms
31 
32 def run_command_get_pid(command):
33         p = subprocess.Popen(command.split())
34         return p.pid
35 
36 examples = """
37 EXAMPLES:
38 
39 ./lockstat
40         Trace calls to sys_futex and display contented locks every 5 seconds
41         for all processes running on the system
42 ./lockstat -p <pid>
43         Trace only for the specified pid and display contended locks
44         every 5 seconds
45 ./lockstat -p <pid> -t
46         Trace for a specified pid and print a message on each entry and exit to
47         sys_futex until interrupted or killed
48 ./lockstat -p <pid> 10
49         Trace the specified pid and show a message every 10 seconds
50 ./lockstat -c <command> 1 30
51         Run the specified command and display contended locks every 1 second
52         for a total of 30 times
53 """
54 
55 description = """
56 Trace kernel futex events.
57 These often occur because of lock contention, e.g. involving a pthread_mutex.
58 This script resemblers the following SystemTap example:
59 https://sourceware.org/systemtap/SystemTap_Beginners_Guide/futexcontentionsect.html
60 """
61 
62 parser = argparse.ArgumentParser(description=description,
63         formatter_class=argparse.RawDescriptionHelpFormatter,
64         epilog=examples)
65 parser.add_argument("-p", "--pid", type=int, default=-1,
66         help="the PID to trace; if not specified, trace all")
67 parser.add_argument("-t", "--trace", action="store_true",
68         help="print trace messages for each futex enter/exit")
69 parser.add_argument("interval", nargs="?", default=5, type=int,
70         help="interval in seconds to print summary")
71 parser.add_argument("count", nargs="?", type=int,
72         help="number of times to print the report before exiting")
73 parser.add_argument("-c", "--command",
74         help="execute and trace the specified command")
75 
76 args = parser.parse_args()
77 
78 pid = args.pid
79 command = args.command
80 interval = args.interval
81 num_prints = args.count
82 trace_all = args.trace
83 
84 if command is not None:
85         print("Executing '%s' and tracing the resulting process." % command)
86         pid = run_command_get_pid(command)
87 
88 bpf_source = """
89 #include <uapi/linux/futex.h>
90 #include <uapi/linux/ptrace.h>
91 #include <linux/sched.h>
92 #include <linux/time.h>
93 
94 struct comm_t {
95     char name[TASK_COMM_LEN];
96 };
97 
98 struct lock_key_t {
99         u64 uaddr;
100         u32 pid;
101         u32 tgid;
102 };
103 
104 struct lock_info_t {
105         u64 elapsed_blocked;
106         u64 contention_count;
107         u64 sid;
108 };
109 
110 BPF_HASH(pid_lock, u32, u64);
111 BPF_HASH(pid_blocktime, u32, u64);
112 BPF_HASH(tgid_comm, u32, struct comm_t);
113 BPF_HASH(lock_stats, struct lock_key_t, struct lock_info_t, 1000000);
114 BPF_STACK_TRACE(stack_traces, 16384);
115 
116 static inline int update_stats(u32 pid, u32 tgid, u64 uaddr, u64 block_time, u64 sid) {
117         struct lock_key_t key = {};
118         struct lock_info_t zero = {};
119         struct lock_info_t *info;
120 
121         key.pid = pid;
122         key.tgid = tgid;
123         key.uaddr = uaddr;
124         info = lock_stats.lookup_or_init(&key, &zero);
125         info->elapsed_blocked += block_time;
126         info->contention_count++;
127         info->sid = sid;
128 
129         if (0 == tgid_comm.lookup(&tgid)) {
130             struct comm_t comm;
131             bpf_get_current_comm(&comm.name, sizeof(comm.name));
132             tgid_comm.update(&tgid, &comm);
133         }
134         return 0;
135 }
136 
137 // FIXME: Should attach to sys_enter_futex and sys_exit_futex tracepoints here,
138 // but that does not currently work
139 int sys_futex_enter(struct pt_regs *ctx, u32 *uaddr, int op, u32 val,
140                     struct timespec *utime, u32 *uaddr2, u32 val3) {
141         int cmd = op & FUTEX_CMD_MASK;
142         if (cmd != FUTEX_WAIT)
143                 return 0;
144 
145         u64 pid_tgid = bpf_get_current_pid_tgid();
146         u32 pid = pid_tgid;
147         u32 tgid = pid_tgid >> 32;
148 
149         if (!(THREAD_FILTER))
150             return 0;
151 
152         u64 timestamp = bpf_ktime_get_ns();
153         u64 uaddr64 = (u64) uaddr;
154         pid_lock.update(&pid, &uaddr64);
155         pid_blocktime.update(&pid, &timestamp);
156 
157         if (SHOULD_PRINT)
158                 bpf_trace_printk("enter sys_futex, pid = %u, uaddr = %x, "
159                                  "cmd = %u\\n", pid, uaddr64, cmd);
160         return 0;
161 }
162 
163 int sys_futex_exit(struct pt_regs *ctx) {
164         u64 pid_tgid = bpf_get_current_pid_tgid();
165         u32 pid = pid_tgid;
166         u32 tgid = pid_tgid >> 32;
167         if (!(THREAD_FILTER))
168             return 0;
169 
170         u64 *blocktime = pid_blocktime.lookup(&pid);
171         u64 *uaddr = pid_lock.lookup(&pid);
172         u64 timestamp = bpf_ktime_get_ns();
173         u64 elapsed;
174         u64 sid;
175 
176         if (blocktime == 0 || uaddr == 0)
177                 return 0; // not FUTEX_WAIT, or (less likely) missed futex_enter
178 
179         elapsed = timestamp - *blocktime;
180 
181         sid = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
182         update_stats(pid, tgid, *uaddr, elapsed, sid);
183         pid_lock.delete(&pid);
184         pid_blocktime.delete(&pid);
185 
186         if (SHOULD_PRINT) {
187                 bpf_trace_printk("exit sys_futex, uaddr = %x, elapsed = %uns\\n",
188                                  uaddr == 0 ? 0 : *uaddr, elapsed);
189         }
190         return 0;
191 }
192 
193 """
194 
195 bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
196 
197 thread_filter = '1'
198 if pid != -1:
199 	print("Tracing pid %d, Ctrl+C to quit." % pid)
200 	# 'tgid' in kernel space is what people thin of as 'pid' in userspace
201 	thread_filter = "tgid == %d" % pid
202 else:
203 	print("Tracing all processes, Ctrl+C to quit.")
204 
205 bpf_source = bpf_source.replace("THREAD_FILTER", thread_filter)
206 
207 bpf_program = BPF(text=bpf_source)
208 bpf_program.attach_kprobe(event="SyS_futex", fn_name="sys_futex_enter")
209 bpf_program.attach_kretprobe(event="SyS_futex", fn_name="sys_futex_exit")
210 
211 def get_syms(stack, pid):
212     global bpf_program
213     syms = []
214     for addr in stack:
215         s = bpf_program.sym(addr, pid, show_offset=True)
216         syms.append(s)
217     return syms
218 
219 def print_syms(syms):
220     print("=========")
221     for f in syms:
222         print(f)
223     print("=========")
224 
225 def is_android_monitor_lock(syms):
226     for s in syms:
227         if 'art::Monitor::Lock' in s:
228             return True
229     return False
230 
231 def disp_stack(stack, pid):
232     for addr in stack:
233         s = bpf_program.sym(addr, pid, show_offset=True)
234 
235 def create_tgid_stats():
236         global bpf_program
237         stats = bpf_program["lock_stats"]
238         res = {}
239         stack_traces = bpf_program['stack_traces']
240         for key, val in stats.items():
241                 # Only display Android monitor locks
242                 if val.sid >= 0:
243                     ust = stack_traces.walk(val.sid)
244                     syms = get_syms(ust, key.pid)
245                     if not is_android_monitor_lock(syms):
246                         continue
247                 else:
248                     continue
249 
250                 if not key.tgid in res:
251                         res[key.tgid] = {}
252                 if not key.uaddr in res[key.tgid]:
253                         res[key.tgid][key.uaddr] = Lock()
254 
255                 lock = res[key.tgid][key.uaddr]
256                 lock.update(val.contention_count, val.elapsed_blocked, syms)
257         return res
258 
259 def print_comm_stats(stats):
260         if stats == {}:
261             return
262 
263         comms = bpf_program["tgid_comm"]
264         print("\n%s:" % (datetime.now().strftime("%H:%M:%S")))
265         for tgid, locks in stats.items():
266                 comm = comms[c_int(tgid)].name
267                 print("\n  %s (%d):" % (comm, tgid))
268                 sorted_locks = sorted(locks.items(),
269                                       key=lambda x: x[1].elapsed_blocked,
270                                       reverse=True)
271                 for addr, stats in sorted_locks:
272                     print("    %x: %dms (%d contentions involving %d threads, avg %dus)" %
273                           (addr, stats.elapsed_blocked / 1000000,
274                            stats.contention_count, stats.thread_count,
275                            stats.elapsed_blocked / stats.contention_count / 1000))
276 
277                     # No use of displaying lock stacks since we're only
278                     # filtering for Android monitor locks.
279                     #
280                     # print("Last stack for this lock:")
281                     # print_syms(stats.last_stack_syms)
282 
283 count_so_far = 0
284 while True:
285         if trace_all:
286                 print(bpf_program.trace_fields())
287         else:
288                 try:
289                         sleep(interval)
290                 except KeyboardInterrupt:
291                         exit()
292                 print_comm_stats(create_tgid_stats())
293                 count_so_far += 1
294                 bpf_program['tgid_comm'].clear()
295                 bpf_program['lock_stats'].clear()
296                 bpf_program['pid_lock'].clear()
297                 bpf_program['pid_blocktime'].clear()
298 
299                 if num_prints is not None and count_so_far >= num_prints:
300                         exit()
301