1#!/usr/bin/env python3 2# 3# Copyright (C) 2022 The Android Open Source Project 4# 5# Licensed under the Apache License, Version 2.0 (the "License"); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17""" 18Checks dwarf CFI (unwinding) information by comparing it to disassembly. 19It is only a simple heuristic check of stack pointer adjustments. 20Fully inferring CFI from disassembly is not possible in general. 21""" 22 23import os, re, subprocess, collections, pathlib, bisect, collections, sys 24from argparse import ArgumentParser 25from dataclasses import dataclass 26from functools import cache 27from pathlib import Path 28from typing import Any, List, Optional, Set, Tuple, Dict 29 30arch: str = "" 31ARCHES = ["i386", "x86_64", "arm", "aarch64", "riscv64"] 32 33IGNORE : Dict[str, List[str]] = { 34 # Aligns stack. 35 "art_quick_osr_stub": ["i386"], 36 # Intermediate invalid CFI while loading all registers. 37 "art_quick_do_long_jump": ["x86_64"], 38 # Saves/restores SP in other register. 39 "art_quick_generic_jni_trampoline": ["arm", "i386", "x86_64"], 40 # Starts with non-zero offset at the start of the method. 41 "art_quick_throw_null_pointer_exception_from_signal": ARCHES, 42 # Pops stack without static control flow past the opcode. 43 "nterp_op_return": ["arm", "aarch64", "i386", "x86_64", "riscv64"], 44} 45 46SP = {"arm": "SP", "aarch64": "WSP", "i386": "ESP", "x86_64": "RSP", "riscv64": "X2"} 47INITIAL_OFFSET = {"i386": 4, "x86_64": 8} 48 49@cache 50def get_inst_semantics(arch: str) -> List[Any]: 51 """ List of regex expressions for supported instructions and their behaviour """ 52 53 rexprs = [] 54 def add(rexpr, adjust_offset=lambda m: 0, adjust_pc=None): 55 rexprs.append((re.compile(rexpr), adjust_offset, adjust_pc)) 56 if arch in ["i386", "x86_64"]: 57 ptr_size = {"i386": 4, "x86_64": 8}[arch] 58 add(r"push. .*", lambda m: ptr_size) 59 add(r"pop. .*", lambda m: -ptr_size) 60 add(r"sub. \$(\w+), (?:%esp|%rsp)", lambda m: int(m[1], 0)) 61 add(r"add. \$(\w+), (?:%esp|%rsp)", lambda m: -int(m[1], 0)) 62 add(r"call. (0x\w+) <.*", lambda m: ptr_size, adjust_pc=lambda m: int(m[1], 0)) 63 add(r"j[a-z]* (0x\w+) <.*", adjust_pc=lambda m: int(m[1], 0)) 64 if arch in ["arm", "aarch64"]: 65 add(r"sub sp,(?: sp,)? #(\w+)", lambda m: int(m[1], 0)) 66 add(r"add sp,(?: sp,)? #(\w+)", lambda m: -int(m[1], 0)) 67 add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1])) 68 add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1])) 69 add(r"stp \w+, \w+, \[sp, #-(\w+)\]!", lambda m: int(m[1], 0)) 70 add(r"ldp \w+, \w+, \[sp\], #(\w+)", lambda m: -int(m[1], 0)) 71 add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(","))) 72 add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(","))) 73 add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(","))) 74 add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(","))) 75 add(r"cb\w* \w+, (0x\w+).*", adjust_pc=lambda m: int(m[1], 0)) 76 add(r"(?:b|bl|b\w\w) (0x\w+).*", adjust_pc=lambda m: int(m[1], 0)) 77 if arch in ["riscv64"]: 78 add(r"addi sp, sp, (-?\w+)", lambda m: -int(m[1], 0)) 79 add(r"b\w* (?:\w+, )+(0x\w+).*", adjust_pc=lambda m: int(m[1], 0)) 80 add(r"(?:j|jal) (?:\w+, )?(0x\w+).*", adjust_pc=lambda m: int(m[1], 0)) 81 return rexprs 82 83@dataclass(frozen=True) 84class Error(Exception): 85 address: int 86 message: str 87 88def get_arch(lib: pathlib.Path) -> str: 89 """ Get architecture of the given library based on the ELF header. """ 90 91 proc = subprocess.run([args.objdump, "--file-headers", lib], 92 encoding='utf-8', 93 capture_output=True, 94 check=True) 95 96 m = re.search("^architecture: *(.*)$", proc.stdout, re.MULTILINE) 97 assert m, "Can not find ABI of ELF file " + str(lib) 98 assert m.group(1) in ARCHES, "Unknown arch: " + m.group(1) 99 return m.group(1) 100 101Source = collections.namedtuple("Source", ["pc", "file", "line", "flag"]) 102 103def get_src(lib: pathlib.Path) -> List[Source]: 104 """ Get source-file and line-number for all hand-written assembly code. """ 105 106 proc = subprocess.run([args.dwarfdump, "--debug-line", lib], 107 encoding='utf-8', 108 capture_output=True, 109 check=True) 110 111 section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE) 112 filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE) 113 line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)' # pc, line, column, file 114 ' +\d+ +\d +(.*)') # isa, discriminator, flag 115 116 results = [] 117 for section in section_re.split(proc.stdout): 118 files = {m[1]: m[2] for m in filename_re.finditer(section)} 119 if not any(f.endswith(".S") for f in files.values()): 120 continue 121 lines = line_re.findall(section) 122 results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines]) 123 return sorted(filter(lambda line: "end_sequence" not in line.flag, results)) 124 125Fde = collections.namedtuple("Fde", ["pc", "end", "data"]) 126 127def get_fde(lib: pathlib.Path) -> List[Fde]: 128 """ Get all FDE blocks (in dumped text-based format) """ 129 130 proc = subprocess.run([args.dwarfdump, "--debug-frame", lib], 131 encoding='utf-8', 132 capture_output=True, 133 check=True) 134 135 section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent. 136 fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)") 137 138 results = [] 139 for section in section_re.split(proc.stdout): 140 m = fda_re.match(section) 141 if m: 142 fde = Fde(int(m[1], 16), int(m[2], 16), section) 143 if fde.pc != 0: 144 results.append(fde) 145 return sorted(results) 146 147Asm = collections.namedtuple("Asm", ["pc", "name", "data"]) 148 149def get_asm(lib: pathlib.Path) -> List[Asm]: 150 """ Get all ASM blocks (in dumped text-based format) """ 151 152 proc = subprocess.run( 153 [ 154 args.objdump, 155 "--disassemble", 156 "--no-show-raw-insn", 157 "--disassemble-zeroes", 158 lib, 159 ], 160 encoding="utf-8", 161 capture_output=True, 162 check=True, 163 ) 164 165 section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent. 166 sym_re = re.compile("([0-9a-f]+) <(.+)>:") 167 168 results = [] 169 for section in section_re.split(proc.stdout): 170 sym = sym_re.match(section) 171 if sym: 172 results.append(Asm(int(sym[1], 16), sym[2], section)) 173 return sorted(results) 174 175Inst = collections.namedtuple("Inst", ["pc", "inst"]) 176 177def get_instructions(asm: Asm) -> List[Inst]: 178 """ Extract individual instructions from disassembled code block """ 179 180 data = re.sub(r"[ \t]+", " ", asm.data) 181 inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)") 182 return [Inst(int(pc, 16), inst) for pc, inst in inst_re.findall(data)] 183 184# PC -> CFA offset (stack size at given PC; None if it not just trivial SP+<integer>) 185CfaOffsets = Dict[int, Optional[int]] 186 187def get_dwarf_cfa_offsets(insts: List[Inst], fde: Fde) -> CfaOffsets: 188 """ Get CFA offsets for all instructions from DWARF """ 189 190 # Parse the CFA offset definitions from the FDE. 191 sp = SP[arch] 192 m = re.compile(r"(0x[0-9a-f]+): CFA=(\w*)([^:\n]*)").findall(fde.data) 193 cfa = collections.deque([(int(a, 0), int(o or "0") if r == sp else None) for a, r, o in m]) 194 if all(offset is None for add, offset in cfa): 195 # This would create result that never checks anything. 196 raise Error(insts[0].pc, "No trivial CFA offsets. Add function to IGNORE list?") 197 198 # Create map from instruction PCs to corresponding CFA offsets. 199 offset: Optional[int] = INITIAL_OFFSET.get(arch, 0) 200 result: CfaOffsets = {} 201 for pc, inst in insts: 202 while cfa and cfa[0][0] <= pc: 203 offset = cfa.popleft()[1] 204 result[pc] = offset 205 return result 206 207def get_inferred_cfa_offsets(insts: List[Inst]) -> CfaOffsets: 208 """ Get CFA offsets for all instructions from static analysis """ 209 210 rexprs = get_inst_semantics(arch) 211 offset: Optional[int] = INITIAL_OFFSET.get(arch, 0) 212 result: CfaOffsets = {} 213 for pc, inst in insts: 214 # Set current offset for PC, unless branch already set it. 215 offset = result.setdefault(pc, offset) 216 217 # Adjust PC and offset based on the current instruction. 218 for rexpr, adjust_offset, adjust_pc in rexprs: 219 m = rexpr.fullmatch(inst) 220 if m: 221 new_offset = offset + adjust_offset(m) 222 if adjust_pc: 223 new_pc = adjust_pc(m) 224 if insts[0].pc <= new_pc <= insts[-1].pc: 225 if new_pc in result and result[new_pc] != new_offset: 226 raise Error(pc, "Inconsistent branch (old={} new={})" 227 .format(result[new_pc], new_offset)) 228 result[new_pc] = new_offset 229 else: 230 offset = new_offset 231 break # First matched pattern wins. 232 return result 233 234def check_fde(fde: Fde, insts: List[Inst], srcs) -> None: 235 """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """ 236 237 dwarf_cfa_offsets = get_dwarf_cfa_offsets(insts, fde) 238 inferred_cfa_offsets = get_inferred_cfa_offsets(insts) 239 240 for pc, inst in insts: 241 if dwarf_cfa_offsets[pc] is not None and dwarf_cfa_offsets[pc] != inferred_cfa_offsets[pc]: 242 if pc in srcs: # Only report if it maps to source code (not padding or literals). 243 for inst2 in insts: 244 print("0x{:08x} [{}]: dwarf={} inferred={} {}".format( 245 inst2.pc, srcs.get(inst2.pc, ""), 246 str(dwarf_cfa_offsets[inst2.pc]), str(inferred_cfa_offsets[inst2.pc]), 247 inst2.inst.strip())) 248 raise Error(pc, "DWARF offset does not match inferred offset") 249 250def check_lib(lib: pathlib.Path) -> int: 251 global arch 252 arch = get_arch(lib) 253 254 assert lib.exists() 255 fdes = get_fde(lib) 256 asms = collections.deque(get_asm(lib)) 257 srcs = {src.pc: src.file + ":" + src.line for src in get_src(lib)} 258 seen = set() # Used to verify the we have covered all assembly source lines. 259 fail = 0 260 assert srcs, "No sources found" 261 262 for fde in fdes: 263 if fde.pc not in srcs: 264 continue # Ignore if it is not hand-written assembly. 265 266 # Assembly instructions (one FDE can cover several assembly chunks). 267 all_insts, name = [], "" 268 while asms and asms[0].pc < fde.end: 269 asm = asms.popleft() 270 if asm.pc < fde.pc: 271 continue 272 insts = get_instructions(asm) 273 if any(asm.name.startswith(n) and arch in a for n, a in IGNORE.items()): 274 seen.update([inst.pc for inst in insts]) 275 continue 276 all_insts.extend(insts) 277 name = name or asm.name 278 if not all_insts: 279 continue # No assembly 280 281 # Compare DWARF data to assembly instructions 282 try: 283 check_fde(fde, all_insts, srcs) 284 except Error as e: 285 print("0x{:08x} [{}]: ERROR in {}: {}\n" 286 .format(e.address, srcs.get(e.address, ""), name, e.message)) 287 fail += 1 288 seen.update([inst.pc for inst in all_insts]) 289 for pc in sorted(set(srcs.keys()) - seen): 290 print("ERROR: Missing CFI for {:08x}: {}".format(pc, srcs[pc])) 291 fail += 1 292 return fail 293 294 295def main(argv): 296 """ Check libraries provided on the command line, or use the default build output """ 297 298 libs = args.srcs 299 if not libs: 300 apex = Path(os.environ["OUT"]) / "symbols/apex/" 301 libs = list(apex.glob("**/libart.so")) 302 assert libs, "Can not find any libart.so in " + str(apex) 303 for lib in libs: 304 fail = check_lib(pathlib.Path(lib)) 305 if fail > 0: 306 print(fail, "ERROR(s) in", str(lib)) 307 sys.exit(1) 308 if args.out: 309 args.out.write_bytes(b"") 310 311if __name__ == "__main__": 312 parser = ArgumentParser(description=__doc__) 313 parser.add_argument("--out", type=Path, help="Output (will just generate empty file)") 314 parser.add_argument("--dwarfdump", type=Path, default="llvm-dwarfdump") 315 parser.add_argument("--objdump", type=Path, default="llvm-objdump") 316 parser.add_argument("srcs", nargs="*", type=Path) 317 args = parser.parse_args() 318 319 main(sys.argv) 320