#!/usr/bin/env python
#
# memleak   Trace and display outstanding allocations to detect
#           memory leaks in user-mode processes and the kernel.
#
# USAGE: memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND]
#                [--combined-only] [-s SAMPLE_RATE] [-T TOP] [-z MIN_SIZE]
#                [-Z MAX_SIZE] [-O OBJ]
#                [interval] [count]
#
# Licensed under the Apache License, Version 2.0 (the "License")
# Copyright (C) 2016 Sasha Goldshtein.

from bcc import BPF
from time import sleep
from datetime import datetime
import resource
import argparse
import subprocess
import os
import sys

class Allocation(object):
    def __init__(self, stack, size):
        self.stack = stack
        self.count = 1
        self.size = size

    def update(self, size):
        self.count += 1
        self.size += size

def run_command_get_output(command):
    p = subprocess.Popen(command.split(),
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return iter(p.stdout.readline, b'')

def run_command_get_pid(command):
    p = subprocess.Popen(command.split())
    return p.pid

examples = """
EXAMPLES:

./memleak -p $(pidof allocs)
        Trace allocations and display a summary of "leaked" (outstanding)
        allocations every 5 seconds
./memleak -p $(pidof allocs) -t
        Trace allocations and display each individual allocator function call
./memleak -ap $(pidof allocs) 10
        Trace allocations and display allocated addresses, sizes, and stacks
        every 10 seconds for outstanding allocations
./memleak -c "./allocs"
        Run the specified command and trace its allocations
./memleak
        Trace allocations in kernel mode and display a summary of outstanding
        allocations every 5 seconds
./memleak -o 60000
        Trace allocations in kernel mode and display a summary of outstanding
        allocations that are at least one minute (60 seconds) old
./memleak -s 5
        Trace roughly every 5th allocation, to reduce overhead
"""

description = """
Trace outstanding memory allocations that weren't freed.
Supports both user-mode allocations made with libc functions and kernel-mode
allocations made with kmalloc/kmem_cache_alloc/get_free_pages and corresponding
memory release functions.
"""
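
# Mode selection: -p PID or -c COMMAND selects user-mode tracing of a single
# process via uprobes on allocator functions; with neither option given, the
# tool instruments kernel allocators instead (see kernel_trace below).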

parser = argparse.ArgumentParser(description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=examples)
parser.add_argument("-p", "--pid", type=int, default=-1,
        help="the PID to trace; if not specified, trace kernel allocs")
parser.add_argument("-t", "--trace", action="store_true",
        help="print trace messages for each alloc/free call")
parser.add_argument("interval", nargs="?", default=5, type=int,
        help="interval in seconds to print outstanding allocations")
parser.add_argument("count", nargs="?", type=int,
        help="number of times to print the report before exiting")
parser.add_argument("-a", "--show-allocs", default=False, action="store_true",
        help="show allocation addresses and sizes as well as call stacks")
parser.add_argument("-o", "--older", default=500, type=int,
        help="prune allocations younger than this age in milliseconds")
parser.add_argument("-c", "--command",
        help="execute and trace the specified command")
parser.add_argument("--combined-only", default=False, action="store_true",
        help="show combined allocation statistics only")
parser.add_argument("-s", "--sample-rate", default=1, type=int,
        help="sample every N-th allocation to decrease the overhead")
parser.add_argument("-T", "--top", type=int, default=10,
        help="display only this many top allocating stacks (by size)")
parser.add_argument("-z", "--min-size", type=int,
        help="capture only allocations larger than this size")
parser.add_argument("-Z", "--max-size", type=int,
        help="capture only allocations smaller than this size")
parser.add_argument("-O", "--obj", type=str, default="c",
        help="attach to allocator functions in the specified object")
parser.add_argument("--ebpf", action="store_true",
        help=argparse.SUPPRESS)

args = parser.parse_args()

pid = args.pid
command = args.command
kernel_trace = (pid == -1 and command is None)
trace_all = args.trace
interval = args.interval
min_age_ns = 1e6 * args.older
sample_every_n = args.sample_rate
num_prints = args.count
top_stacks = args.top
min_size = args.min_size
max_size = args.max_size
obj = args.obj

if min_size is not None and max_size is not None and min_size > max_size:
    print("min_size (-z) can't be greater than max_size (-Z)")
    exit(1)

if command is not None:
    print("Executing '%s' and tracing the resulting process." % command)
    pid = run_command_get_pid(command)
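
# The BPF program below is kept as a template string: SHOULD_PRINT,
# SAMPLE_EVERY_N, PAGE_SIZE, SIZE_FILTER, and STACK_FLAGS are placeholders
# that are substituted with concrete values (via str.replace) after argument
# parsing, just before the program is compiled.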

bpf_source = """
#include <uapi/linux/ptrace.h>

struct alloc_info_t {
    u64 size;
    u64 timestamp_ns;
    int stack_id;
};

struct combined_alloc_info_t {
    u64 total_size;
    u64 number_of_allocs;
};

BPF_HASH(sizes, u64);
BPF_TABLE("hash", u64, struct alloc_info_t, allocs, 1000000);
BPF_HASH(memptrs, u64, u64);
BPF_STACK_TRACE(stack_traces, 10240);
BPF_TABLE("hash", u64, struct combined_alloc_info_t, combined_allocs, 10240);

static inline void update_statistics_add(u64 stack_id, u64 sz) {
    struct combined_alloc_info_t *existing_cinfo;
    struct combined_alloc_info_t cinfo = {0};

    existing_cinfo = combined_allocs.lookup(&stack_id);
    if (existing_cinfo != 0)
        cinfo = *existing_cinfo;

    cinfo.total_size += sz;
    cinfo.number_of_allocs += 1;

    combined_allocs.update(&stack_id, &cinfo);
}

static inline void update_statistics_del(u64 stack_id, u64 sz) {
    struct combined_alloc_info_t *existing_cinfo;
    struct combined_alloc_info_t cinfo = {0};

    existing_cinfo = combined_allocs.lookup(&stack_id);
    if (existing_cinfo != 0)
        cinfo = *existing_cinfo;

    if (sz >= cinfo.total_size)
        cinfo.total_size = 0;
    else
        cinfo.total_size -= sz;

    if (cinfo.number_of_allocs > 0)
        cinfo.number_of_allocs -= 1;

    combined_allocs.update(&stack_id, &cinfo);
}

static inline int gen_alloc_enter(struct pt_regs *ctx, size_t size) {
    SIZE_FILTER
    if (SAMPLE_EVERY_N > 1) {
        u64 ts = bpf_ktime_get_ns();
        if (ts % SAMPLE_EVERY_N != 0)
            return 0;
    }

    u64 pid = bpf_get_current_pid_tgid();
    u64 size64 = size;
    sizes.update(&pid, &size64);

    if (SHOULD_PRINT)
        bpf_trace_printk("alloc entered, size = %u\\n", size);
    return 0;
}

static inline int gen_alloc_exit2(struct pt_regs *ctx, u64 address) {
    u64 pid = bpf_get_current_pid_tgid();
    u64* size64 = sizes.lookup(&pid);
    struct alloc_info_t info = {0};

    if (size64 == 0)
        return 0; // missed alloc entry

    info.size = *size64;
    sizes.delete(&pid);

    info.timestamp_ns = bpf_ktime_get_ns();
    info.stack_id = stack_traces.get_stackid(ctx, STACK_FLAGS);
    allocs.update(&address, &info);
    update_statistics_add(info.stack_id, info.size);

    if (SHOULD_PRINT) {
        bpf_trace_printk("alloc exited, size = %lu, result = %lx\\n",
                         info.size, address);
    }
    return 0;
}

static inline int gen_alloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
}

static inline int gen_free_enter(struct pt_regs *ctx, void *address) {
    u64 addr = (u64)address;
    struct alloc_info_t *info = allocs.lookup(&addr);
    if (info == 0)
        return 0;

    allocs.delete(&addr);
    update_statistics_del(info->stack_id, info->size);

    if (SHOULD_PRINT) {
        bpf_trace_printk("free entered, address = %lx, size = %lu\\n",
                         address, info->size);
    }
    return 0;
}

int malloc_enter(struct pt_regs *ctx, size_t size) {
    return gen_alloc_enter(ctx, size);
}

int malloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}

int free_enter(struct pt_regs *ctx, void *address) {
    return gen_free_enter(ctx, address);
}

int calloc_enter(struct pt_regs *ctx, size_t nmemb, size_t size) {
    return gen_alloc_enter(ctx, nmemb * size);
}

int calloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}
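
/*
 * realloc below is modeled as a free of the old pointer followed by a
 * fresh allocation of the new size; the address returned at exit is then
 * recorded in the allocs map in place of the freed one.
 */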

int realloc_enter(struct pt_regs *ctx, void *ptr, size_t size) {
    gen_free_enter(ctx, ptr);
    return gen_alloc_enter(ctx, size);
}

int realloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}

int posix_memalign_enter(struct pt_regs *ctx, void **memptr, size_t alignment,
                         size_t size) {
    u64 memptr64 = (u64)(size_t)memptr;
    u64 pid = bpf_get_current_pid_tgid();

    memptrs.update(&pid, &memptr64);
    return gen_alloc_enter(ctx, size);
}

int posix_memalign_exit(struct pt_regs *ctx) {
    u64 pid = bpf_get_current_pid_tgid();
    u64 *memptr64 = memptrs.lookup(&pid);
    void *addr;

    if (memptr64 == 0)
        return 0;

    memptrs.delete(&pid);

    if (bpf_probe_read(&addr, sizeof(void*), (void*)(size_t)*memptr64))
        return 0;

    u64 addr64 = (u64)(size_t)addr;
    return gen_alloc_exit2(ctx, addr64);
}

int aligned_alloc_enter(struct pt_regs *ctx, size_t alignment, size_t size) {
    return gen_alloc_enter(ctx, size);
}

int aligned_alloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}

int valloc_enter(struct pt_regs *ctx, size_t size) {
    return gen_alloc_enter(ctx, size);
}

int valloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}

int memalign_enter(struct pt_regs *ctx, size_t alignment, size_t size) {
    return gen_alloc_enter(ctx, size);
}

int memalign_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}

int pvalloc_enter(struct pt_regs *ctx, size_t size) {
    return gen_alloc_enter(ctx, size);
}

int pvalloc_exit(struct pt_regs *ctx) {
    return gen_alloc_exit(ctx);
}
"""

bpf_source_kernel = """

TRACEPOINT_PROBE(kmem, kmalloc) {
    gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
    return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
}

TRACEPOINT_PROBE(kmem, kmalloc_node) {
    gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
    return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
}

TRACEPOINT_PROBE(kmem, kfree) {
    return gen_free_enter((struct pt_regs *)args, (void *)args->ptr);
}

TRACEPOINT_PROBE(kmem, kmem_cache_alloc) {
    gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
    return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
}

TRACEPOINT_PROBE(kmem, kmem_cache_alloc_node) {
    gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
    return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
}

TRACEPOINT_PROBE(kmem, kmem_cache_free) {
    return gen_free_enter((struct pt_regs *)args, (void *)args->ptr);
}

TRACEPOINT_PROBE(kmem, mm_page_alloc) {
    gen_alloc_enter((struct pt_regs *)args, PAGE_SIZE << args->order);
    return gen_alloc_exit2((struct pt_regs *)args, args->pfn);
}

TRACEPOINT_PROBE(kmem, mm_page_free) {
    return gen_free_enter((struct pt_regs *)args, (void *)args->pfn);
}
"""

if kernel_trace:
    bpf_source += bpf_source_kernel

bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
bpf_source = bpf_source.replace("PAGE_SIZE", str(resource.getpagesize()))

size_filter = ""
if min_size is not None and max_size is not None:
    size_filter = "if (size < %d || size > %d) return 0;" % \
                  (min_size, max_size)
elif min_size is not None:
    size_filter = "if (size < %d) return 0;" % min_size
elif max_size is not None:
    size_filter = "if (size > %d) return 0;" % max_size
bpf_source = bpf_source.replace("SIZE_FILTER", size_filter)
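
# Stack collection flags for get_stackid: BPF_F_REUSE_STACKID lets a stack
# that hashes into an occupied slot replace the old entry instead of failing,
# and BPF_F_USER_STACK (added only for user-mode tracing) walks the user
# stack rather than the kernel stack.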

stack_flags = "BPF_F_REUSE_STACKID"
if not kernel_trace:
    stack_flags += "|BPF_F_USER_STACK"
bpf_source = bpf_source.replace("STACK_FLAGS", stack_flags)

if args.ebpf:
    print(bpf_source)
    exit()

bpf = BPF(text=bpf_source)

if not kernel_trace:
    print("Attaching to pid %d, Ctrl+C to quit." % pid)

    def attach_probes(sym, fn_prefix=None, can_fail=False):
        if fn_prefix is None:
            fn_prefix = sym

        try:
            bpf.attach_uprobe(name=obj, sym=sym,
                              fn_name=fn_prefix + "_enter",
                              pid=pid)
            bpf.attach_uretprobe(name=obj, sym=sym,
                                 fn_name=fn_prefix + "_exit",
                                 pid=pid)
        except Exception:
            if can_fail:
                return
            else:
                raise

    attach_probes("malloc")
    attach_probes("calloc")
    attach_probes("realloc")
    attach_probes("posix_memalign")
    attach_probes("valloc")
    attach_probes("memalign")
    attach_probes("pvalloc")
    attach_probes("aligned_alloc", can_fail=True)  # added in C11
    bpf.attach_uprobe(name=obj, sym="free", fn_name="free_enter",
                      pid=pid)

else:
    print("Attaching to kernel allocators, Ctrl+C to quit.")

    # No probe attaching here. Allocations are counted by attaching to
    # tracepoints.
    #
    # Memory allocations in the Linux kernel are not limited to malloc/free
    # equivalents. It is also common to allocate one or more memory pages
    # directly. The page allocator has two interfaces: one works with page
    # frame numbers (PFNs), the other with page addresses. It is possible to
    # allocate pages with one kind of function and free them with the other.
    # Kernel code can easily convert PFNs to addresses and back, but doing
    # the same in an eBPF kprobe is hard without fragile hacks.
    #
    # Fortunately, Linux exposes tracepoints for memory allocations, which
    # can be instrumented by eBPF programs. The tracepoint for page
    # allocations gives access to PFNs for both allocator interfaces, so
    # there is no need to guess which allocation corresponds to which free.
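
# Illustration only, not part of the original flow (assumes the usual
# tracefs mount point): the kmem tracepoints instrumented above can be
# confirmed on the running kernel, e.g. when debugging missing events, with:
#
#   kmem_dir = "/sys/kernel/debug/tracing/events/kmem"
#   if os.path.isdir(kmem_dir):
#       print(sorted(os.listdir(kmem_dir)))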

def print_outstanding():
    print("[%s] Top %d stacks with outstanding allocations:" %
          (datetime.now().strftime("%H:%M:%S"), top_stacks))
    alloc_info = {}
    allocs = bpf["allocs"]
    stack_traces = bpf["stack_traces"]
    for address, info in sorted(allocs.items(), key=lambda a: a[1].size):
        if BPF.monotonic_time() - min_age_ns < info.timestamp_ns:
            continue
        if info.stack_id < 0:
            continue
        if info.stack_id in alloc_info:
            alloc_info[info.stack_id].update(info.size)
        else:
            stack = list(stack_traces.walk(info.stack_id))
            combined = []
            for addr in stack:
                combined.append(bpf.sym(addr, pid,
                        show_module=True, show_offset=True))
            alloc_info[info.stack_id] = Allocation(combined,
                                                   info.size)
        if args.show_allocs:
            print("\taddr = %x size = %s" %
                  (address.value, info.size))
    to_show = sorted(alloc_info.values(),
                     key=lambda a: a.size)[-top_stacks:]
    for alloc in to_show:
        print("\t%d bytes in %d allocations from stack\n\t\t%s" %
              (alloc.size, alloc.count, b"\n\t\t".join(alloc.stack)))

def print_outstanding_combined():
    stack_traces = bpf["stack_traces"]
    stacks = sorted(bpf["combined_allocs"].items(),
                    key=lambda a: -a[1].total_size)
    cnt = 1
    entries = []
    for stack_id, info in stacks:
        try:
            trace = []
            for addr in stack_traces.walk(stack_id.value):
                sym = bpf.sym(addr, pid,
                              show_module=True,
                              show_offset=True)
                trace.append(sym)
            trace = "\n\t\t".join(trace)
        except KeyError:
            trace = "stack information lost"

        entry = ("\t%d bytes in %d allocations from stack\n\t\t%s" %
                 (info.total_size, info.number_of_allocs, trace))
        entries.append(entry)

        cnt += 1
        if cnt > top_stacks:
            break

    print("[%s] Top %d stacks with outstanding allocations:" %
          (datetime.now().strftime("%H:%M:%S"), top_stacks))

    print('\n'.join(reversed(entries)))

count_so_far = 0
while True:
    if trace_all:
        print(bpf.trace_fields())
    else:
        try:
            sleep(interval)
        except KeyboardInterrupt:
            exit()
        if args.combined_only:
            print_outstanding_combined()
        else:
            print_outstanding()
        sys.stdout.flush()
        count_so_far += 1
        if num_prints is not None and count_so_far >= num_prints:
            exit()