1#!/usr/bin/env python3
2#
3# Copyright (C) 2022 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""
18Checks dwarf CFI (unwinding) information by comparing it to disassembly.
19It is only a simple heuristic check of stack pointer adjustments.
20Fully inferring CFI from disassembly is not possible in general.
21"""
22
23import os, re, subprocess, collections, pathlib, bisect, collections, sys
24from argparse import ArgumentParser
25from dataclasses import dataclass
26from functools import cache
27from pathlib import Path
28from typing import Any, List, Optional, Set, Tuple, Dict
29
30arch: str = ""
31ARCHES = ["i386", "x86_64", "arm", "aarch64", "riscv64"]
32
33IGNORE : Dict[str, List[str]] = {
34    # Aligns stack.
35    "art_quick_osr_stub": ["i386"],
36    # Intermediate invalid CFI while loading all registers.
37    "art_quick_do_long_jump": ["x86_64"],
38    # Saves/restores SP in other register.
39    "art_quick_generic_jni_trampoline": ["arm", "i386", "x86_64"],
40    # Starts with non-zero offset at the start of the method.
41    "art_quick_throw_null_pointer_exception_from_signal": ARCHES,
42    # Pops stack without static control flow past the opcode.
43    "nterp_op_return": ["arm", "aarch64", "i386", "x86_64", "riscv64"],
44}
45
46SP = {"arm": "SP", "aarch64": "WSP", "i386": "ESP", "x86_64": "RSP", "riscv64": "X2"}
47INITIAL_OFFSET = {"i386": 4, "x86_64": 8}
48
49@cache
50def get_inst_semantics(arch: str) -> List[Any]:
51  """ List of regex expressions for supported instructions and their behaviour """
52
53  rexprs = []
54  def add(rexpr, adjust_offset=lambda m: 0, adjust_pc=None):
55    rexprs.append((re.compile(rexpr), adjust_offset, adjust_pc))
56  if arch in ["i386", "x86_64"]:
57    ptr_size = {"i386": 4, "x86_64": 8}[arch]
58    add(r"push. .*", lambda m: ptr_size)
59    add(r"pop. .*", lambda m: -ptr_size)
60    add(r"sub. \$(\w+), (?:%esp|%rsp)", lambda m: int(m[1], 0))
61    add(r"add. \$(\w+), (?:%esp|%rsp)", lambda m: -int(m[1], 0))
62    add(r"call. (0x\w+) <.*", lambda m: ptr_size, adjust_pc=lambda m: int(m[1], 0))
63    add(r"j[a-z]* (0x\w+) <.*", adjust_pc=lambda m: int(m[1], 0))
64  if arch in ["arm", "aarch64"]:
65    add(r"sub sp,(?: sp,)? #(\w+)", lambda m: int(m[1], 0))
66    add(r"add sp,(?: sp,)? #(\w+)", lambda m: -int(m[1], 0))
67    add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
68    add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
69    add(r"stp \w+, \w+, \[sp, #-(\w+)\]!", lambda m: int(m[1], 0))
70    add(r"ldp \w+, \w+, \[sp\], #(\w+)", lambda m: -int(m[1], 0))
71    add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
72    add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
73    add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
74    add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))
75    add(r"cb\w* \w+, (0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
76    add(r"(?:b|bl|b\w\w) (0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
77  if arch in ["riscv64"]:
78    add(r"addi sp, sp, (-?\w+)", lambda m: -int(m[1], 0))
79    add(r"b\w* (?:\w+, )+(0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
80    add(r"(?:j|jal) (?:\w+, )?(0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
81  return rexprs
82
83@dataclass(frozen=True)
84class Error(Exception):
85  address: int
86  message: str
87
88def get_arch(lib: pathlib.Path) -> str:
89  """ Get architecture of the given library based on the ELF header. """
90
91  proc = subprocess.run([args.objdump, "--file-headers", lib],
92                        encoding='utf-8',
93                        capture_output=True,
94                        check=True)
95
96  m = re.search("^architecture: *(.*)$", proc.stdout, re.MULTILINE)
97  assert m, "Can not find ABI of ELF file " + str(lib)
98  assert m.group(1) in ARCHES, "Unknown arch: " + m.group(1)
99  return m.group(1)
100
101Source = collections.namedtuple("Source", ["pc", "file", "line", "flag"])
102
103def get_src(lib: pathlib.Path) -> List[Source]:
104  """ Get source-file and line-number for all hand-written assembly code. """
105
106  proc = subprocess.run([args.dwarfdump, "--debug-line", lib],
107                        encoding='utf-8',
108                        capture_output=True,
109                        check=True)
110
111  section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
112  filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE)
113  line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)'  # pc, line, column, file
114                       ' +\d+ +\d +(.*)')                     # isa, discriminator, flag
115
116  results = []
117  for section in section_re.split(proc.stdout):
118    files = {m[1]: m[2] for m in filename_re.finditer(section)}
119    if not any(f.endswith(".S") for f in files.values()):
120      continue
121    lines = line_re.findall(section)
122    results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines])
123  return sorted(filter(lambda line: "end_sequence" not in line.flag, results))
124
125Fde = collections.namedtuple("Fde", ["pc", "end", "data"])
126
127def get_fde(lib: pathlib.Path) -> List[Fde]:
128  """ Get all FDE blocks (in dumped text-based format) """
129
130  proc = subprocess.run([args.dwarfdump, "--debug-frame", lib],
131                        encoding='utf-8',
132                        capture_output=True,
133                        check=True)
134
135  section_re = re.compile("\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
136  fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)")
137
138  results = []
139  for section in section_re.split(proc.stdout):
140    m = fda_re.match(section)
141    if m:
142      fde = Fde(int(m[1], 16), int(m[2], 16), section)
143      if fde.pc != 0:
144        results.append(fde)
145  return sorted(results)
146
147Asm = collections.namedtuple("Asm", ["pc", "name", "data"])
148
149def get_asm(lib: pathlib.Path) -> List[Asm]:
150  """ Get all ASM blocks (in dumped text-based format) """
151
152  proc = subprocess.run(
153      [
154          args.objdump,
155          "--disassemble",
156          "--no-show-raw-insn",
157          "--disassemble-zeroes",
158          lib,
159      ],
160      encoding="utf-8",
161      capture_output=True,
162      check=True,
163  )
164
165  section_re = re.compile("\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
166  sym_re = re.compile("([0-9a-f]+) <(.+)>:")
167
168  results = []
169  for section in section_re.split(proc.stdout):
170    sym = sym_re.match(section)
171    if sym:
172      results.append(Asm(int(sym[1], 16), sym[2], section))
173  return sorted(results)
174
175Inst = collections.namedtuple("Inst", ["pc", "inst"])
176
177def get_instructions(asm: Asm) -> List[Inst]:
178  """ Extract individual instructions from disassembled code block """
179
180  data = re.sub(r"[ \t]+", " ", asm.data)
181  inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")
182  return [Inst(int(pc, 16), inst) for pc, inst in inst_re.findall(data)]
183
184# PC -> CFA offset (stack size at given PC; None if it not just trivial SP+<integer>)
185CfaOffsets = Dict[int, Optional[int]]
186
187def get_dwarf_cfa_offsets(insts: List[Inst], fde: Fde) -> CfaOffsets:
188  """ Get CFA offsets for all instructions from DWARF """
189
190  # Parse the CFA offset definitions from the FDE.
191  sp = SP[arch]
192  m = re.compile(r"(0x[0-9a-f]+): CFA=(\w*)([^:\n]*)").findall(fde.data)
193  cfa = collections.deque([(int(a, 0), int(o or "0") if r == sp else None) for a, r, o in m])
194  if all(offset is None for add, offset in cfa):
195    # This would create result that never checks anything.
196    raise Error(insts[0].pc, "No trivial CFA offsets. Add function to IGNORE list?")
197
198  # Create map from instruction PCs to corresponding CFA offsets.
199  offset: Optional[int] = INITIAL_OFFSET.get(arch, 0)
200  result: CfaOffsets = {}
201  for pc, inst in insts:
202    while cfa and cfa[0][0] <= pc:
203      offset = cfa.popleft()[1]
204    result[pc] = offset
205  return result
206
207def get_inferred_cfa_offsets(insts: List[Inst]) -> CfaOffsets:
208  """ Get CFA offsets for all instructions from static analysis """
209
210  rexprs = get_inst_semantics(arch)
211  offset: Optional[int] = INITIAL_OFFSET.get(arch, 0)
212  result: CfaOffsets = {}
213  for pc, inst in insts:
214    # Set current offset for PC, unless branch already set it.
215    offset = result.setdefault(pc, offset)
216
217    # Adjust PC and offset based on the current instruction.
218    for rexpr, adjust_offset, adjust_pc in rexprs:
219      m = rexpr.fullmatch(inst)
220      if m:
221        new_offset = offset + adjust_offset(m)
222        if adjust_pc:
223          new_pc = adjust_pc(m)
224          if insts[0].pc <= new_pc <= insts[-1].pc:
225            if new_pc in result and result[new_pc] != new_offset:
226              raise Error(pc, "Inconsistent branch (old={} new={})"
227                              .format(result[new_pc], new_offset))
228            result[new_pc] = new_offset
229        else:
230          offset = new_offset
231        break  # First matched pattern wins.
232  return result
233
234def check_fde(fde: Fde, insts: List[Inst], srcs) -> None:
235  """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """
236
237  dwarf_cfa_offsets = get_dwarf_cfa_offsets(insts, fde)
238  inferred_cfa_offsets = get_inferred_cfa_offsets(insts)
239
240  for pc, inst in insts:
241    if dwarf_cfa_offsets[pc] is not None and dwarf_cfa_offsets[pc] != inferred_cfa_offsets[pc]:
242      if pc in srcs:  # Only report if it maps to source code (not padding or literals).
243        for inst2 in insts:
244          print("0x{:08x} [{}]: dwarf={} inferred={} {}".format(
245                    inst2.pc, srcs.get(inst2.pc, ""),
246                    str(dwarf_cfa_offsets[inst2.pc]), str(inferred_cfa_offsets[inst2.pc]),
247                    inst2.inst.strip()))
248        raise Error(pc, "DWARF offset does not match inferred offset")
249
250def check_lib(lib: pathlib.Path) -> int:
251  global arch
252  arch = get_arch(lib)
253
254  assert lib.exists()
255  fdes = get_fde(lib)
256  asms = collections.deque(get_asm(lib))
257  srcs = {src.pc: src.file + ":" + src.line for src in get_src(lib)}
258  seen = set()  # Used to verify the we have covered all assembly source lines.
259  fail = 0
260  assert srcs, "No sources found"
261
262  for fde in fdes:
263    if fde.pc not in srcs:
264      continue  # Ignore if it is not hand-written assembly.
265
266    # Assembly instructions (one FDE can cover several assembly chunks).
267    all_insts, name = [], ""
268    while asms and asms[0].pc < fde.end:
269      asm = asms.popleft()
270      if asm.pc < fde.pc:
271        continue
272      insts = get_instructions(asm)
273      if any(asm.name.startswith(n) and arch in a for n, a in IGNORE.items()):
274        seen.update([inst.pc for inst in insts])
275        continue
276      all_insts.extend(insts)
277      name = name or asm.name
278    if not all_insts:
279      continue  # No assembly
280
281    # Compare DWARF data to assembly instructions
282    try:
283      check_fde(fde, all_insts, srcs)
284    except Error as e:
285      print("0x{:08x} [{}]: ERROR in {}: {}\n"
286            .format(e.address, srcs.get(e.address, ""), name, e.message))
287      fail += 1
288    seen.update([inst.pc for inst in all_insts])
289  for pc in sorted(set(srcs.keys()) - seen):
290    print("ERROR: Missing CFI for {:08x}: {}".format(pc, srcs[pc]))
291    fail += 1
292  return fail
293
294
295def main(argv):
296  """ Check libraries provided on the command line, or use the default build output """
297
298  libs = args.srcs
299  if not libs:
300    apex = Path(os.environ["OUT"]) / "symbols/apex/"
301    libs = list(apex.glob("**/libart.so"))
302    assert libs, "Can not find any libart.so in " + str(apex)
303  for lib in libs:
304    fail = check_lib(pathlib.Path(lib))
305    if fail > 0:
306      print(fail, "ERROR(s) in", str(lib))
307      sys.exit(1)
308  if args.out:
309    args.out.write_bytes(b"")
310
311if __name__ == "__main__":
312  parser = ArgumentParser(description=__doc__)
313  parser.add_argument("--out", type=Path, help="Output (will just generate empty file)")
314  parser.add_argument("--dwarfdump", type=Path, default="llvm-dwarfdump")
315  parser.add_argument("--objdump", type=Path, default="llvm-objdump")
316  parser.add_argument("srcs", nargs="*", type=Path)
317  args = parser.parse_args()
318
319  main(sys.argv)
320