1#!/usr/bin/env python
2#
3# deadlock_detector  Detects potential deadlocks (lock order inversions)
4#                    on a running process. For Linux, uses BCC, eBPF.
5#
6# USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH]
7#                             [--verbose] [--lock-symbols LOCK_SYMBOLS]
8#                             [--unlock-symbols UNLOCK_SYMBOLS]
9#                             pid
10#
11# This traces pthread mutex lock and unlock calls to build a directed graph
12# representing the mutex wait graph:
13#
14# - Nodes in the graph represent mutexes.
15# - Edge (A, B) exists if there exists some thread T where lock(A) was called
16#   and lock(B) was called before unlock(A) was called.
17#
18# If the program finds a potential lock order inversion, the program will dump
19# the cycle of mutexes and the stack traces where each mutex was acquired, and
20# then exit.
21#
22# This program can only find potential deadlocks that occur while the program
23# is tracing the process. It cannot find deadlocks that may have occurred
24# before the program was attached to the process.
25#
26# Since this traces all mutex lock and unlock events and all thread creation
27# events on the traced process, the overhead of this bpf program can be very
28# high if the process has many threads and mutexes. You should only run this on
29# a process where the slowdown is acceptable.
30#
31# Note: This tool does not work for shared mutexes or recursive mutexes.
32#
33# For shared (read-write) mutexes, a deadlock requires a cycle in the wait
34# graph where at least one of the mutexes in the cycle is acquiring exclusive
35# (write) ownership.
36#
37# For recursive mutexes, lock() is called multiple times on the same mutex.
38# However, there is no way to determine if a mutex is a recursive mutex
39# after the mutex has been created. As a result, this tool will not find
40# potential deadlocks that involve only one mutex.
41#
42# Copyright 2017 Facebook, Inc.
43# Licensed under the Apache License, Version 2.0 (the "License")
44#
45# 01-Feb-2017   Kenny Yu   Created this.
46
47from __future__ import (
48    absolute_import, division, unicode_literals, print_function
49)
50from bcc import BPF
51from collections import defaultdict
52import argparse
53import json
54import os
55import subprocess
56import sys
57import time
58
59
60class DiGraph(object):
61    '''
62    Adapted from networkx: http://networkx.github.io/
63    Represents a directed graph. Edges can store (key, value) attributes.
64    '''
65
66    def __init__(self):
67        # Map of node -> set of nodes
68        self.adjacency_map = {}
69        # Map of (node1, node2) -> map string -> arbitrary attribute
70        # This will not be copied in subgraph()
71        self.attributes_map = {}
72
73    def neighbors(self, node):
74        return self.adjacency_map.get(node, set())
75
76    def edges(self):
77        edges = []
78        for node, neighbors in self.adjacency_map.items():
79            for neighbor in neighbors:
80                edges.append((node, neighbor))
81        return edges
82
83    def nodes(self):
84        return self.adjacency_map.keys()
85
86    def attributes(self, node1, node2):
87        return self.attributes_map[(node1, node2)]
88
89    def add_edge(self, node1, node2, **kwargs):
90        if node1 not in self.adjacency_map:
91            self.adjacency_map[node1] = set()
92        if node2 not in self.adjacency_map:
93            self.adjacency_map[node2] = set()
94        self.adjacency_map[node1].add(node2)
95        self.attributes_map[(node1, node2)] = kwargs
96
97    def remove_node(self, node):
98        self.adjacency_map.pop(node, None)
99        for _, neighbors in self.adjacency_map.items():
100            neighbors.discard(node)
101
102    def subgraph(self, nodes):
103        graph = DiGraph()
104        for node in nodes:
105            for neighbor in self.neighbors(node):
106                if neighbor in nodes:
107                    graph.add_edge(node, neighbor)
108        return graph
109
110    def node_link_data(self):
111        '''
112        Returns the graph as a dictionary in a format that can be
113        serialized.
114        '''
115        data = {
116            'directed': True,
117            'multigraph': False,
118            'graph': {},
119            'links': [],
120            'nodes': [],
121        }
122
123        # Do one pass to build a map of node -> position in nodes
124        node_to_number = {}
125        for node in self.adjacency_map.keys():
126            node_to_number[node] = len(data['nodes'])
127            data['nodes'].append({'id': node})
128
129        # Do another pass to build the link information
130        for node, neighbors in self.adjacency_map.items():
131            for neighbor in neighbors:
132                link = self.attributes_map[(node, neighbor)].copy()
133                link['source'] = node_to_number[node]
134                link['target'] = node_to_number[neighbor]
135                data['links'].append(link)
136        return data
137
138
139def strongly_connected_components(G):
140    '''
141    Adapted from networkx: http://networkx.github.io/
142    Parameters
143    ----------
144    G : DiGraph
145    Returns
146    -------
147    comp : generator of sets
148        A generator of sets of nodes, one for each strongly connected
149        component of G.
150    '''
151    preorder = {}
152    lowlink = {}
153    scc_found = {}
154    scc_queue = []
155    i = 0  # Preorder counter
156    for source in G.nodes():
157        if source not in scc_found:
158            queue = [source]
159            while queue:
160                v = queue[-1]
161                if v not in preorder:
162                    i = i + 1
163                    preorder[v] = i
164                done = 1
165                v_nbrs = G.neighbors(v)
166                for w in v_nbrs:
167                    if w not in preorder:
168                        queue.append(w)
169                        done = 0
170                        break
171                if done == 1:
172                    lowlink[v] = preorder[v]
173                    for w in v_nbrs:
174                        if w not in scc_found:
175                            if preorder[w] > preorder[v]:
176                                lowlink[v] = min([lowlink[v], lowlink[w]])
177                            else:
178                                lowlink[v] = min([lowlink[v], preorder[w]])
179                    queue.pop()
180                    if lowlink[v] == preorder[v]:
181                        scc_found[v] = True
182                        scc = {v}
183                        while (
184                            scc_queue and preorder[scc_queue[-1]] > preorder[v]
185                        ):
186                            k = scc_queue.pop()
187                            scc_found[k] = True
188                            scc.add(k)
189                        yield scc
190                    else:
191                        scc_queue.append(v)
192
193
194def simple_cycles(G):
195    '''
196    Adapted from networkx: http://networkx.github.io/
197    Parameters
198    ----------
199    G : DiGraph
200    Returns
201    -------
202    cycle_generator: generator
203       A generator that produces elementary cycles of the graph.
204       Each cycle is represented by a list of nodes along the cycle.
205    '''
206
207    def _unblock(thisnode, blocked, B):
208        stack = set([thisnode])
209        while stack:
210            node = stack.pop()
211            if node in blocked:
212                blocked.remove(node)
213                stack.update(B[node])
214                B[node].clear()
215
216    # Johnson's algorithm requires some ordering of the nodes.
217    # We assign the arbitrary ordering given by the strongly connected comps
218    # There is no need to track the ordering as each node removed as processed.
219    # save the actual graph so we can mutate it here
220    # We only take the edges because we do not want to
221    # copy edge and node attributes here.
222    subG = G.subgraph(G.nodes())
223    sccs = list(strongly_connected_components(subG))
224    while sccs:
225        scc = sccs.pop()
226        # order of scc determines ordering of nodes
227        startnode = scc.pop()
228        # Processing node runs 'circuit' routine from recursive version
229        path = [startnode]
230        blocked = set()  # vertex: blocked from search?
231        closed = set()  # nodes involved in a cycle
232        blocked.add(startnode)
233        B = defaultdict(set)  # graph portions that yield no elementary circuit
234        stack = [(startnode, list(subG.neighbors(startnode)))]
235        while stack:
236            thisnode, nbrs = stack[-1]
237            if nbrs:
238                nextnode = nbrs.pop()
239                if nextnode == startnode:
240                    yield path[:]
241                    closed.update(path)
242                elif nextnode not in blocked:
243                    path.append(nextnode)
244                    stack.append((nextnode, list(subG.neighbors(nextnode))))
245                    closed.discard(nextnode)
246                    blocked.add(nextnode)
247                    continue
248            # done with nextnode... look for more neighbors
249            if not nbrs:  # no more nbrs
250                if thisnode in closed:
251                    _unblock(thisnode, blocked, B)
252                else:
253                    for nbr in subG.neighbors(thisnode):
254                        if thisnode not in B[nbr]:
255                            B[nbr].add(thisnode)
256                stack.pop()
257                path.pop()
258        # done processing this node
259        subG.remove_node(startnode)
260        H = subG.subgraph(scc)  # make smaller to avoid work in SCC routine
261        sccs.extend(list(strongly_connected_components(H)))
262
263
264def find_cycle(graph):
265    '''
266    Looks for a cycle in the graph. If found, returns the first cycle.
267    If nodes a1, a2, ..., an are in a cycle, then this returns:
268        [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
269    Otherwise returns an empty list.
270    '''
271    cycles = list(simple_cycles(graph))
272    if cycles:
273        nodes = cycles[0]
274        nodes.append(nodes[0])
275        edges = []
276        prev = nodes[0]
277        for node in nodes[1:]:
278            edges.append((prev, node))
279            prev = node
280        return edges
281    else:
282        return []
283
284
285def print_cycle(binary, graph, edges, thread_info, print_stack_trace_fn):
286    '''
287    Prints the cycle in the mutex graph in the following format:
288
289    Potential Deadlock Detected!
290
291    Cycle in lock order graph: M0 => M1 => M2 => M0
292
293    for (m, n) in cycle:
294        Mutex n acquired here while holding Mutex m in thread T:
295            [ stack trace ]
296
297        Mutex m previously acquired by thread T here:
298            [ stack trace ]
299
300    for T in all threads:
301        Thread T was created here:
302            [ stack trace ]
303    '''
304
305    # List of mutexes in the cycle, first and last repeated
306    nodes_in_order = []
307    # Map mutex address -> readable alias
308    node_addr_to_name = {}
309    for counter, (m, n) in enumerate(edges):
310        nodes_in_order.append(m)
311        # For global or static variables, try to symbolize the mutex address.
312        symbol = symbolize_with_objdump(binary, m)
313        if symbol:
314            symbol += ' '
315        node_addr_to_name[m] = 'Mutex M%d (%s0x%016x)' % (counter, symbol, m)
316    nodes_in_order.append(nodes_in_order[0])
317
318    print('----------------\nPotential Deadlock Detected!\n')
319    print(
320        'Cycle in lock order graph: %s\n' %
321        (' => '.join([node_addr_to_name[n] for n in nodes_in_order]))
322    )
323
324    # Set of threads involved in the lock inversion
325    thread_pids = set()
326
327    # For each edge in the cycle, print where the two mutexes were held
328    for (m, n) in edges:
329        thread_pid = graph.attributes(m, n)['thread_pid']
330        thread_comm = graph.attributes(m, n)['thread_comm']
331        first_mutex_stack_id = graph.attributes(m, n)['first_mutex_stack_id']
332        second_mutex_stack_id = graph.attributes(m, n)['second_mutex_stack_id']
333        thread_pids.add(thread_pid)
334        print(
335            '%s acquired here while holding %s in Thread %d (%s):' % (
336                node_addr_to_name[n], node_addr_to_name[m], thread_pid,
337                thread_comm
338            )
339        )
340        print_stack_trace_fn(second_mutex_stack_id)
341        print('')
342        print(
343            '%s previously acquired by the same Thread %d (%s) here:' %
344            (node_addr_to_name[m], thread_pid, thread_comm)
345        )
346        print_stack_trace_fn(first_mutex_stack_id)
347        print('')
348
349    # Print where the threads were created, if available
350    for thread_pid in thread_pids:
351        parent_pid, stack_id, parent_comm = thread_info.get(
352            thread_pid, (None, None, None)
353        )
354        if parent_pid:
355            print(
356                'Thread %d created by Thread %d (%s) here: ' %
357                (thread_pid, parent_pid, parent_comm)
358            )
359            print_stack_trace_fn(stack_id)
360        else:
361            print(
362                'Could not find stack trace where Thread %d was created' %
363                thread_pid
364            )
365        print('')
366
367
368def symbolize_with_objdump(binary, addr):
369    '''
370    Searches the binary for the address using objdump. Returns the symbol if
371    it is found, otherwise returns empty string.
372    '''
373    try:
374        command = (
375            'objdump -tT %s | grep %x | awk {\'print $NF\'} | c++filt' %
376            (binary, addr)
377        )
378        output = subprocess.check_output(command, shell=True)
379        return output.decode('utf-8').strip()
380    except subprocess.CalledProcessError:
381        return ''
382
383
384def strlist(s):
385    '''Given a comma-separated string, returns a list of substrings'''
386    return s.strip().split(',')
387
388
389def main():
390    examples = '''Examples:
391    deadlock_detector 181        # Analyze PID 181
392
393    deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
394                                 # Analyze PID 181 and locks from this binary.
395                                 # If tracing a process that is running from
396                                 # a dynamically-linked binary, this argument
397                                 # is required and should be the path to the
398                                 # pthread library.
399
400    deadlock_detector 181 --verbose
401                                 # Analyze PID 181 and print statistics about
402                                 # the mutex wait graph.
403
404    deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\
405        --unlock-symbols my_mutex_unlock1,my_mutex_unlock2
406                                 # Analyze PID 181 and trace custom mutex
407                                 # symbols instead of pthread mutexes.
408
409    deadlock_detector 181 --dump-graph graph.json
410                                 # Analyze PID 181 and dump the mutex wait
411                                 # graph to graph.json.
412    '''
413    parser = argparse.ArgumentParser(
414        description=(
415            'Detect potential deadlocks (lock inversions) in a running binary.'
416            '\nMust be run as root.'
417        ),
418        formatter_class=argparse.RawDescriptionHelpFormatter,
419        epilog=examples,
420    )
421    parser.add_argument('pid', type=int, help='Pid to trace')
422    # Binaries with `:` in the path will fail to attach uprobes on kernels
423    # running without this patch: https://lkml.org/lkml/2017/1/13/585.
424    # Symlinks to the binary without `:` in the path can get around this issue.
425    parser.add_argument(
426        '--binary',
427        type=str,
428        default='',
429        help='If set, trace the mutexes from the binary at this path. '
430        'For statically-linked binaries, this argument is not required. '
431        'For dynamically-linked binaries, this argument is required and '
432        'should be the path of the pthread library the binary is using. '
433        'Example: /lib/x86_64-linux-gnu/libpthread.so.0',
434    )
435    parser.add_argument(
436        '--dump-graph',
437        type=str,
438        default='',
439        help='If set, this will dump the mutex graph to the specified file.',
440    )
441    parser.add_argument(
442        '--verbose',
443        action='store_true',
444        help='Print statistics about the mutex wait graph.',
445    )
446    parser.add_argument(
447        '--lock-symbols',
448        type=strlist,
449        default=['pthread_mutex_lock'],
450        help='Comma-separated list of lock symbols to trace. Default is '
451        'pthread_mutex_lock. These symbols cannot be inlined in the binary.',
452    )
453    parser.add_argument(
454        '--unlock-symbols',
455        type=strlist,
456        default=['pthread_mutex_unlock'],
457        help='Comma-separated list of unlock symbols to trace. Default is '
458        'pthread_mutex_unlock. These symbols cannot be inlined in the binary.',
459    )
460    args = parser.parse_args()
461    if not args.binary:
462        try:
463            args.binary = os.readlink('/proc/%d/exe' % args.pid)
464        except OSError as e:
465            print('%s. Is the process (pid=%d) running?' % (str(e), args.pid))
466            sys.exit(1)
467
468    bpf = BPF(src_file=b'deadlock_detector.c')
469
470    # Trace where threads are created
471    bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone')
472
473    # We must trace unlock first, otherwise in the time we attached the probe
474    # on lock() and have not yet attached the probe on unlock(), a thread can
475    # acquire mutexes and release them, but the release events will not be
476    # traced, resulting in noisy reports.
477    for symbol in args.unlock_symbols:
478        try:
479            bpf.attach_uprobe(
480                name=args.binary,
481                sym=symbol,
482                fn_name='trace_mutex_release',
483                pid=args.pid,
484            )
485        except Exception as e:
486            print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
487            sys.exit(1)
488    for symbol in args.lock_symbols:
489        try:
490            bpf.attach_uprobe(
491                name=args.binary,
492                sym=symbol,
493                fn_name='trace_mutex_acquire',
494                pid=args.pid,
495            )
496        except Exception as e:
497            print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
498            sys.exit(1)
499
500    def print_stack_trace(stack_id):
501        '''Closure that prints the symbolized stack trace.'''
502        for addr in bpf.get_table('stack_traces').walk(stack_id):
503            line = bpf.sym(addr, args.pid)
504            # Try to symbolize with objdump if we cannot with bpf.
505            if line == '[unknown]':
506                symbol = symbolize_with_objdump(args.binary, addr)
507                if symbol:
508                    line = symbol
509            print('@ %016x %s' % (addr, line))
510
511    print('Tracing... Hit Ctrl-C to end.')
512    while True:
513        try:
514            # Map of child thread pid -> parent info
515            thread_info = {
516                child.value: (parent.parent_pid, parent.stack_id, parent.comm)
517                for child, parent in bpf.get_table('thread_to_parent').items()
518            }
519
520            # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists
521            # if there exists some thread T where lock(A) was called and
522            # lock(B) was called before unlock(A) was called.
523            graph = DiGraph()
524            for key, leaf in bpf.get_table('edges').items():
525                graph.add_edge(
526                    key.mutex1,
527                    key.mutex2,
528                    thread_pid=leaf.thread_pid,
529                    thread_comm=leaf.comm.decode('utf-8'),
530                    first_mutex_stack_id=leaf.mutex1_stack_id,
531                    second_mutex_stack_id=leaf.mutex2_stack_id,
532                )
533            if args.verbose:
534                print(
535                    'Mutexes: %d, Edges: %d' %
536                    (len(graph.nodes()), len(graph.edges()))
537                )
538            if args.dump_graph:
539                with open(args.dump_graph, 'w') as f:
540                    data = graph.node_link_data()
541                    f.write(json.dumps(data, indent=2))
542
543            cycle = find_cycle(graph)
544            if cycle:
545                print_cycle(
546                    args.binary, graph, cycle, thread_info, print_stack_trace
547                )
548                sys.exit(1)
549
550            time.sleep(1)
551        except KeyboardInterrupt:
552            break
553
554
555if __name__ == '__main__':
556    main()
557