1 /*
2  * deadlock_detector.c  Detects potential deadlocks in a running process.
3  *                      For Linux, uses BCC, eBPF. See .py file.
4  *
5  * Copyright 2017 Facebook, Inc.
6  * Licensed under the Apache License, Version 2.0 (the "License")
7  *
8  * 1-Feb-2016   Kenny Yu   Created this.
9  */
10 
11 #include <linux/sched.h>
12 #include <uapi/linux/ptrace.h>
13 
14 // Maximum number of mutexes a single thread can hold at once.
15 // If the number is too big, the unrolled loops wil cause the stack
16 // to be too big, and the bpf verifier will fail.
17 #define MAX_HELD_MUTEXES 16
18 
19 // Info about held mutexes. `mutex` will be 0 if not held.
20 struct held_mutex_t {
21   u64 mutex;
22   u64 stack_id;
23 };
24 
25 // List of mutexes that a thread is holding. Whenever we loop over this array,
26 // we need to force the compiler to unroll the loop, otherwise the bcc verifier
27 // will fail because the loop will create a backwards edge.
28 struct thread_to_held_mutex_leaf_t {
29   struct held_mutex_t held_mutexes[MAX_HELD_MUTEXES];
30 };
31 
32 // Map of thread ID -> array of (mutex addresses, stack id)
33 BPF_HASH(thread_to_held_mutexes, u32, struct thread_to_held_mutex_leaf_t, 2097152);
34 
35 // Key type for edges. Represents an edge from mutex1 => mutex2.
36 struct edges_key_t {
37   u64 mutex1;
38   u64 mutex2;
39 };
40 
41 // Leaf type for edges. Holds information about where each mutex was acquired.
42 struct edges_leaf_t {
43   u64 mutex1_stack_id;
44   u64 mutex2_stack_id;
45   u32 thread_pid;
46   char comm[TASK_COMM_LEN];
47 };
48 
49 // Represents all edges currently in the mutex wait graph.
50 BPF_HASH(edges, struct edges_key_t, struct edges_leaf_t, 2097152);
51 
52 // Info about parent thread when a child thread is created.
53 struct thread_created_leaf_t {
54   u64 stack_id;
55   u32 parent_pid;
56   char comm[TASK_COMM_LEN];
57 };
58 
59 // Map of child thread pid -> info about parent thread.
60 BPF_HASH(thread_to_parent, u32, struct thread_created_leaf_t);
61 
62 // Stack traces when threads are created and when mutexes are locked/unlocked.
63 BPF_STACK_TRACE(stack_traces, 655360);
64 
65 // The first argument to the user space function we are tracing
66 // is a pointer to the mutex M held by thread T.
67 //
68 // For all mutexes N held by mutexes_held[T]
69 //   add edge N => M (held by T)
70 // mutexes_held[T].add(M)
trace_mutex_acquire(struct pt_regs * ctx,void * mutex_addr)71 int trace_mutex_acquire(struct pt_regs *ctx, void *mutex_addr) {
72   // Higher 32 bits is process ID, Lower 32 bits is thread ID
73   u32 pid = bpf_get_current_pid_tgid();
74   u64 mutex = (u64)mutex_addr;
75 
76   struct thread_to_held_mutex_leaf_t empty_leaf = {};
77   struct thread_to_held_mutex_leaf_t *leaf =
78       thread_to_held_mutexes.lookup_or_init(&pid, &empty_leaf);
79   if (!leaf) {
80     bpf_trace_printk(
81         "could not add thread_to_held_mutex key, thread: %d, mutex: %p\n", pid,
82         mutex);
83     return 1; // Could not insert, no more memory
84   }
85 
86   // Recursive mutexes lock the same mutex multiple times. We cannot tell if
87   // the mutex is recursive after the mutex is already created. To avoid noisy
88   // reports, disallow self edges. Do one pass to check if we are already
89   // holding the mutex, and if we are, do nothing.
90   #pragma unroll
91   for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
92     if (leaf->held_mutexes[i].mutex == mutex) {
93       return 1; // Disallow self edges
94     }
95   }
96 
97   u64 stack_id =
98       stack_traces.get_stackid(ctx, BPF_F_USER_STACK | BPF_F_REUSE_STACKID);
99 
100   int added_mutex = 0;
101   #pragma unroll
102   for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
103     // If this is a free slot, see if we can insert.
104     if (!leaf->held_mutexes[i].mutex) {
105       if (!added_mutex) {
106         leaf->held_mutexes[i].mutex = mutex;
107         leaf->held_mutexes[i].stack_id = stack_id;
108         added_mutex = 1;
109       }
110       continue; // Nothing to do for a free slot
111     }
112 
113     // Add edges from held mutex => current mutex
114     struct edges_key_t edge_key = {};
115     edge_key.mutex1 = leaf->held_mutexes[i].mutex;
116     edge_key.mutex2 = mutex;
117 
118     struct edges_leaf_t edge_leaf = {};
119     edge_leaf.mutex1_stack_id = leaf->held_mutexes[i].stack_id;
120     edge_leaf.mutex2_stack_id = stack_id;
121     edge_leaf.thread_pid = pid;
122     bpf_get_current_comm(&edge_leaf.comm, sizeof(edge_leaf.comm));
123 
124     // Returns non-zero on error
125     int result = edges.update(&edge_key, &edge_leaf);
126     if (result) {
127       bpf_trace_printk("could not add edge key %p, %p, error: %d\n",
128                        edge_key.mutex1, edge_key.mutex2, result);
129       continue; // Could not insert, no more memory
130     }
131   }
132 
133   // There were no free slots for this mutex.
134   if (!added_mutex) {
135     bpf_trace_printk("could not add mutex %p, added_mutex: %d\n", mutex,
136                      added_mutex);
137     return 1;
138   }
139   return 0;
140 }
141 
142 // The first argument to the user space function we are tracing
143 // is a pointer to the mutex M held by thread T.
144 //
145 // mutexes_held[T].remove(M)
trace_mutex_release(struct pt_regs * ctx,void * mutex_addr)146 int trace_mutex_release(struct pt_regs *ctx, void *mutex_addr) {
147   // Higher 32 bits is process ID, Lower 32 bits is thread ID
148   u32 pid = bpf_get_current_pid_tgid();
149   u64 mutex = (u64)mutex_addr;
150 
151   struct thread_to_held_mutex_leaf_t *leaf =
152       thread_to_held_mutexes.lookup(&pid);
153   if (!leaf) {
154     // If the leaf does not exist for the pid, then it means we either missed
155     // the acquire event, or we had no more memory and could not add it.
156     bpf_trace_printk(
157         "could not find thread_to_held_mutex, thread: %d, mutex: %p\n", pid,
158         mutex);
159     return 1;
160   }
161 
162   // For older kernels without "Bpf: allow access into map value arrays"
163   // (https://lkml.org/lkml/2016/8/30/287) the bpf verifier will fail with an
164   // invalid memory access on `leaf->held_mutexes[i]` below. On newer kernels,
165   // we can avoid making this extra copy in `value` and use `leaf` directly.
166   struct thread_to_held_mutex_leaf_t value = {};
167   bpf_probe_read(&value, sizeof(struct thread_to_held_mutex_leaf_t), leaf);
168 
169   #pragma unroll
170   for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
171     // Find the current mutex (if it exists), and clear it.
172     // Note: Can't use `leaf->` in this if condition, see comment above.
173     if (value.held_mutexes[i].mutex == mutex) {
174       leaf->held_mutexes[i].mutex = 0;
175       leaf->held_mutexes[i].stack_id = 0;
176     }
177   }
178 
179   return 0;
180 }
181 
182 // Trace return from clone() syscall in the child thread (return value > 0).
trace_clone(struct pt_regs * ctx,unsigned long flags,void * child_stack,void * ptid,void * ctid,struct pt_regs * regs)183 int trace_clone(struct pt_regs *ctx, unsigned long flags, void *child_stack,
184                 void *ptid, void *ctid, struct pt_regs *regs) {
185   u32 child_pid = PT_REGS_RC(ctx);
186   if (child_pid <= 0) {
187     return 1;
188   }
189 
190   struct thread_created_leaf_t thread_created_leaf = {};
191   thread_created_leaf.parent_pid = bpf_get_current_pid_tgid();
192   thread_created_leaf.stack_id =
193       stack_traces.get_stackid(ctx, BPF_F_USER_STACK | BPF_F_REUSE_STACKID);
194   bpf_get_current_comm(&thread_created_leaf.comm,
195                        sizeof(thread_created_leaf.comm));
196 
197   struct thread_created_leaf_t *insert_result =
198       thread_to_parent.lookup_or_init(&child_pid, &thread_created_leaf);
199   if (!insert_result) {
200     bpf_trace_printk(
201         "could not add thread_created_key, child: %d, parent: %d\n", child_pid,
202         thread_created_leaf.parent_pid);
203     return 1; // Could not insert, no more memory
204   }
205   return 0;
206 }
207