#!/usr/bin/env python3

# Copyright 2016, VIXL authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of ARM Limited nor the names of its contributors may be
#     used to endorse or promote products derived from this software without
#     specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Verify generated AArch32 assembler traces against `llvm-mc`.

This script will find all files in `test/aarch32/traces/` with names starting
with `assembler`, and check them against `llvm-mc`. It checks our assembler is
correct by looking up what instruction we meant to assemble, assembling it with
`llvm` and checking the result is bit identical to what our assembler generated.

You may run the script with no arguments from VIXL's top-level directory as long
as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
the `--llvm-mc` option. This script relies on version 3.8 or higher of
LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
but ARMv7 did not.

For example, let's say we have the following assembler trace for CLZ
(the real trace is a lot bigger):

~~~
static const byte kInstruction_Clz_eq_r0_r0[] = {
  0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
};
static const byte kInstruction_Clz_eq_r0_r1[] = {
  0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
};
static const byte kInstruction_Clz_eq_r0_r2[] = {
  0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
};
static const TestResult kReferenceClz[] = {
  {
    ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
    kInstruction_Clz_eq_r0_r0,
  },
  {
    ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
    kInstruction_Clz_eq_r0_r1,
  },
  {
    ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
    kInstruction_Clz_eq_r0_r2,
  },
};
~~~

The traces contain both the list of bytes that were encoded as well as a comment
with a description of the instruction this is. This script searches for these
lines and checks them.

With our example, the script will find the following:

    [
      ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
      ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
      ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
    ]

Then the tricky part is to convert the description of the instruction into the
following valid assembly syntax:

    clzeq r0, r0
    clzeq r0, r1
    clzeq r0, r2

Our example is easy, but it gets more complicated with load and store
instructions for example. We can feed this as input to `llvm-mc`:

    $ echo "
      clzeq r0, r0
      clzeq r0, r1
      clzeq r0, r2
    " | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding

And we will get the following output:

      .text
      clzeq r0, r0      @ encoding: [0x10,0x0f,0x6f,0x01]
      clzeq r0, r1      @ encoding: [0x11,0x0f,0x6f,0x01]
      clzeq r0, r2      @ encoding: [0x12,0x0f,0x6f,0x01]

The script will finally extract the encoding and compare it to what VIXL
generated.
"""

import argparse
import functools
import subprocess
import os
import re
import itertools
import types


def BuildOptions():
    """Parse the command line and return the resulting argparse namespace."""
    result = argparse.ArgumentParser(
        description='Use `llvm-mc` to check the assembler traces are correct.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    result.add_argument('--llvm-mc', default='llvm-mc', help='Path to llvm-mc')
    result.add_argument('--verbose', '-v', action='store_true')
    return result.parse_args()


def ParseLLVMVersion(version_output):
    """
    Extract the LLVM version from the text printed by `llvm-mc -version`.

    Returns a `(major, minor)` tuple of integers. Multi-digit components
    ("14.0.6") and trailing suffixes ("3.9.0svn") are accepted. Raises an
    `Exception` if no "LLVM version X.Y" string can be found.
    """
    m = re.search(r"LLVM version (\d+)\.(\d+)", version_output)
    if m is None:
        raise Exception("Failed to find the LLVM version in:\n" + version_output)
    return int(m.group(1)), int(m.group(2))


def CheckLLVMVersion(llvm_mc):
    """
    Run `llvm_mc -version` and raise an `Exception` if the reported LLVM
    version is older than 3.8.
    """
    version = subprocess.check_output([llvm_mc, '-version'])
    if ParseLLVMVersion(version.decode()) < (3, 8):
        raise Exception("This script requires LLVM version 3.8 or higher.")


# Dictionary of patterns. The key is an identifier used in the converter lists
# below. The value needs to be a capturing regular expression.
_PATTERN_MATCHERS = {
    # Allow an optional underscore in case this an "and" instruction.
    "mnemonic": r"(\w+?)_?",
    "condition":
        "(al|eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)",
    "register":
        "(r0|r1|r2|r3|r4|r5|r6|r7|r8|r9|r10|r11|r12|r13|r14|r15|pc|sp|lr)",
    "immediate": "(0x[0-9a-f]+|[0-9]+)",
    "shift": "(lsl|lsr|asr|ror)",
}

# List of converters. Each of them represents an instruction form and what to
# convert it to. This list needs to be complete; an exception is raised if we
# couldn't find a converter for the instruction.
#
# The first part of each tuple is a pattern to match. It's simply a regular
# expression. Additionally, each identifier in curly braces is replaced by the
# corresponding pattern from `_PATTERN_MATCHERS`.
#
# The second part of the tuple is a string that describes what the result will
# look like. Empty curly braces are replaced by matches, in order. It may also
# be a function taking the tuple of matches and returning the result.
_GENERIC_CONVERTERS = (
    ("it {condition}", "it {}"),
    ("{mnemonic} {condition} {register} {immediate}",
     "{}{} {}, #{}"),
    ("{mnemonic} {condition} {register} {register} {immediate}",
     "{}{} {}, {}, #{}"),
    ("{mnemonic} {condition} {register} {register}",
     "{}{} {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register}",
     "{}{} {}, {}, {}"),
    ("{mnemonic} {register} {register} {register}",
     "{} {}, {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{immediate}",
     "{}{} {}, {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{register}",
     "{}{} {}, {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} {shift} {immediate}",
     "{}{} {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {shift} {register}",
     "{}{} {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} offset",
     "{}{} {}, [{}, #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} offset",
     "{}{} {}, [{}, #-{}]"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
     "{}{} {}, [{}], #{}"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "postindex",
     "{}{} {}, [{}], #-{}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
     "{}{} {}, [{}, #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "preindex",
     "{}{} {}, [{}, #-{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} offset",
     "{}{} {}, [{}, {}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} offset",
     "{}{} {}, [{}, -{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} postindex",
     "{}{} {}, [{}], {}"),
    ("{mnemonic} {condition} {register} {register} minus {register} "
     "postindex",
     "{}{} {}, [{}], -{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} preindex",
     "{}{} {}, [{}, {}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} preindex",
     "{}{} {}, [{}, -{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, {}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, -{}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], -{}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, {}, {} #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, -{}, {} #{}]!"),
)


def _ConvertMovRdImm(matches):
    """
    LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
    immediate fits both into a modified immediate (T2 encoding) and 16
    bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
    be used.
    """
    # The immediate is the second capture in "mov al {register} {immediate}".
    # It is either "0x"-prefixed hexadecimal or plain decimal; it must be read
    # the same way LLVM will read the text we hand over.
    text = matches[1]
    imm = int(text, 16) if text.startswith("0x") else int(text, 10)
    if imm <= 0xffff:
        lsb = imm & -imm
        if (imm >> 8) < lsb:
            return "mov.w {}, #{}".format(*matches)
    # Fall back to a LLVM making the right decision.
    return "mov {}, #{}".format(*matches)


# Work around issues in LLVM 3.8. These converters take precedence over the
# generic ones when assembling for "thumbv8".
_THUMB_WORKAROUND_CONVERTERS = (
    # The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
    # instruction with an immediate, the assembler should use the ADR
    # encoding. LLVM does not know about this subtlety. We get around this
    # by manually translating the instruction to their ADR form.
    ("add al {register} pc {immediate}", "adr {}, #{}"),
    ("sub al {register} pc {immediate}", "adr {}, #-{}"),

    # LLVM is (rightfully) being helpful by swapping register operands so
    # that the 16 bit encoding of the following instructions is used.
    # However, VIXL does not do this. These rules specifically add the `.w`
    # modifier to force LLVM to use the 32 bit encoding if the last register
    # is identical to first one. But at the same time, we should still use
    # the narrow encoding if all registers are the same.
    ("adcs al {register} (\\1) (\\1)", "adcs.n {}, {}, {}"),
    ("adcs al {register} {register} (\\1)", "adcs.w {}, {}, {}"),
    ("orrs al {register} (\\1) (\\1)", "orrs.n {}, {}, {}"),
    ("orrs al {register} {register} (\\1)", "orrs.w {}, {}, {}"),
    ("eors al {register} (\\1) (\\1)", "eors.n {}, {}, {}"),
    ("eors al {register} {register} (\\1)", "eors.w {}, {}, {}"),
    ("ands al {register} (\\1) (\\1)", "ands.n {}, {}, {}"),
    ("ands al {register} {register} (\\1)", "ands.w {}, {}, {}"),
    # Solve the same issue as for the previous rules, however, we need to
    # take into account that ADD instructions with the stack pointer have
    # additional 16 bit forms.
    ("add al {register} (\\1) (\\1)", "add.n {}, {}, {}"),
    ("add al {register} (\\1) r13", "add.w {}, {}, sp"),
    ("add al {register} r13 (\\1)", "add.n {}, sp, {}"),
    ("add al {register} {register} (\\1)", "add.w {}, {}, {}"),
    ("mov al {register} {immediate}", _ConvertMovRdImm),
)


@functools.lru_cache(maxsize=None)
def _CompiledConverters(triple):
    """
    Return the ordered converters for `triple` as a tuple of
    `(compiled_regex, result)` pairs. Built once per triple so that the
    patterns are not expanded again for every instruction.
    """
    converters = _GENERIC_CONVERTERS
    if triple == "thumbv8":
        converters = _THUMB_WORKAROUND_CONVERTERS + converters
    return tuple(
        (re.compile("^" + pattern.format(**_PATTERN_MATCHERS) + "$"), result)
        for pattern, result in converters)


def ConvertToLLVMFormat(vixl_instruction, triple):
    """
    Take a string representing an instruction and convert it to assembly syntax
    for LLVM. VIXL's test generation framework will print instruction
    representations as a space separated list. The first element is the mnemonic
    and the following elements are operands.

    `vixl_instruction` may hold several instructions separated by ';' (an IT
    instruction followed by the instruction under test for example); the
    converted instructions are returned joined with newlines. `triple` is
    either "thumbv8" or "armv8" and selects LLVM specific workarounds.

    Raises an `Exception` if any of the instructions has no converter.
    """
    converters = _CompiledConverters(triple)
    llvm_instruction = []

    # Our test generator framework uses mnemonics starting with a capital
    # letter. We need everything to be lower case for LLVM.
    for instruction in vixl_instruction.lower().split(';'):
        # Strip out extra white spaces.
        instruction = instruction.strip()
        if not instruction:
            continue
        # Try all converters in the list, first match wins.
        for matcher, result in converters:
            match = matcher.match(instruction)
            if match:
                if callable(result):
                    # `result` is a function, call it to produce the
                    # instruction.
                    llvm_instruction.append(result(match.groups()))
                else:
                    # `result` is a string, use it as the format string.
                    llvm_instruction.append(result.format(*match.groups()))
                break
        else:
            # No converter worked for this instruction. Do not silently drop
            # it, even if other instructions of the sequence were converted.
            raise Exception("Unsupported instruction {}.".format(instruction))

    if not llvm_instruction:
        raise Exception(
            "Unsupported instruction {}.".format(vixl_instruction.strip()))
    return "\n".join(llvm_instruction)


# Matches a trace line such as:
#   0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
_TRACE_LINE_RE = re.compile(
    r"^[ \t]*(?P<encoding>(?:0x[0-9a-f]{2}, )+0x[0-9a-f]{2})[ \t]+//[ \t]*"
    r"(?P<instruction>.*)$",
    re.M)


def ReadTrace(trace):
    """
    Receive the content of an assembler trace, extract the relevant information
    and return it as a list of tuples. The first part of each tuple is a string
    representing the instruction. The second part is a list of bytes
    representing the encoding.

    For example:

        [
          ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
          ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
          ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
        ]
    """
    return [
        (m.group('instruction'),
         m.group('encoding').replace(" ", "").split(","))
        for m in _TRACE_LINE_RE.finditer(trace)
    ]


_LLVM_ENCODING_RE = re.compile(r".*@ encoding: \[(?P<encoding>.*)\]")


def ExtractLLVMEncodings(llvm_output):
    """
    Extract the lists of bytes from the output of `llvm-mc -show-encoding`.
    It's in the following form:

        clzeq r0, r0    @ encoding: [0x10,0x0f,0x6f,0x01]
                                     ^^^^ ^^^^ ^^^^ ^^^^

    Returns one list of byte strings per encoded instruction, in order.
    """
    return [
        m.group('encoding').replace(" ", "").split(",")
        for m in _LLVM_ENCODING_RE.finditer(llvm_output)
    ]


def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
    """
    Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
    encoded them the same way as VIXL. `triple` allows us to specify either
    "thumbv8" or "armv8".

    Disagreements are reported on standard output; nothing is returned.
    """
    vixl_reference = ReadTrace(f.read())
    if not vixl_reference:
        # Nothing to check, and `zip(*[])` below could not be unpacked.
        print("Warning: no instructions found in the trace.")
        return
    vixl_instructions, vixl_encodings = zip(*vixl_reference)
    instructions = [
        ConvertToLLVMFormat(instruction, triple)
        for instruction in vixl_instructions
    ]
    llvm_mc_proc = subprocess.Popen(
        [llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
         # LLVM fails to recognize some instructions as valid T32 when we do
         # not set `-mcpu`.
         '-mcpu=cortex-a53', '-show-encoding'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = llvm_mc_proc.communicate("\n".join(instructions).encode())
    # If `llvm-mc` printed something to stderr then stop.
    if err:
        print(err.decode())
        return

    llvm_encodings = ExtractLLVMEncodings(out.decode())

    # If LLVM has generated exactly twice as many instructions, we assume this
    # is due to IT instructions preceding every instruction under test. VIXL's
    # assembly reference files will contain a single array of 4 bytes encoding
    # both the IT and the following instruction. While LLVM will have decoded
    # them into two separate 2 bytes arrays.
    if len(llvm_encodings) == 2 * len(vixl_encodings):
        llvm_encodings = [
            it_encoding + instruction_encoding
            for it_encoding, instruction_encoding
            in zip(llvm_encodings[0::2], llvm_encodings[1::2])
        ]

    # Check the encodings from LLVM are identical to VIXL's.
    if len(llvm_encodings) != len(vixl_encodings):
        print("Error: llvm-mc generated {} instructions than there are in the\n"
              "generated trace.\n".format(
                  "fewer" if len(llvm_encodings) < len(vixl_encodings)
                  else "more"))
        return

    for instruction, vixl, llvm in zip(vixl_instructions, vixl_encodings,
                                       llvm_encodings):
        if llvm != vixl:
            print("Error: llvm-mc disagrees on the encoding of "
                  "\"{instruction}\":\n"
                  "  LLVM-MC: {llvm}\n"
                  "  VIXL:    {vixl}\n".format(
                      instruction=instruction.replace("\n", "; "),
                      llvm=llvm,
                      vixl=vixl))


def Main():
    """Verify every `assembler-*` trace found in `test/aarch32/traces/`."""
    args = BuildOptions()

    CheckLLVMVersion(args.llvm_mc)

    trace_dir = 'test/aarch32/traces/'
    trace_files = sorted(
        trace_file
        for trace_file in os.listdir(trace_dir)
        if trace_file.startswith("assembler-"))
    for trace_file in trace_files:
        if args.verbose:
            print("Verifying \"" + trace_file + "\".")
        with open(os.path.join(trace_dir, trace_file), "r") as f:
            # The ISA is deduced from the name of the trace file.
            if "t32" in trace_file:
                VerifyInstructionsWithLLVMMC(args.llvm_mc, f, "thumbv8")
            elif "a32" in trace_file:
                VerifyInstructionsWithLLVMMC(args.llvm_mc, f, "armv8")
            else:
                raise Exception(
                    "Failed to recognize the ISA in \"" + trace_file + "\".")


if __name__ == "__main__":
    Main()