1#!/usr/bin/env python3
2
3# Copyright 2016, VIXL authors
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are met:
8#
9#   * Redistributions of source code must retain the above copyright notice,
10#     this list of conditions and the following disclaimer.
11#   * Redistributions in binary form must reproduce the above copyright notice,
12#     this list of conditions and the following disclaimer in the documentation
13#     and/or other materials provided with the distribution.
14#   * Neither the name of ARM Limited nor the names of its contributors may be
15#     used to endorse or promote products derived from this software without
16#     specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29"""
30Verify generated AArch32 assembler traces against `llvm-mc`.
31
This script will find all files in `test/aarch32/traces/` with names starting
with `assembler`, and check them against `llvm-mc`. It checks our assembler is
correct by looking up what instruction we meant to assemble, assembling it with
`llvm-mc` and checking that the result is bit identical to what our assembler
generated.
36
37You may run the script with no arguments from VIXL's top-level directory as long
38as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
39the `--llvm-mc` option. This script relies on version 3.8 or higher of
40LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
41but ARMv7 did not.
42
43For example, let's say we have the following assembler trace for CLZ
44(the real trace is a lot bigger):
45
46~~~
47static const byte kInstruction_Clz_eq_r0_r0[] = {
48  0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
49};
50static const byte kInstruction_Clz_eq_r0_r1[] = {
51  0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
52};
53static const byte kInstruction_Clz_eq_r0_r2[] = {
54  0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
55};
56static const TestResult kReferenceClz[] = {
57  {
58    ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
59    kInstruction_Clz_eq_r0_r0,
60  },
61  {
62    ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
63    kInstruction_Clz_eq_r0_r1,
64  },
65  {
66    ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
67    kInstruction_Clz_eq_r0_r2,
68  },
69};
70~~~
71
72The traces contain both the list of bytes that were encoded as well as a comment
73with a description of the instruction this is. This script searches for these
74lines and checks them.
75
76With our example, the script will find the following:
77
78    [
79      ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
80      ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
81      ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
82    ]
83
84Then the tricky part is to convert the description of the instruction into the
85following valid assembly syntax:
86
87    clzeq r0, r0
88    clzeq r0, r1
89    clzeq r0, r2
90
91Our example is easy, but it gets more complicated with load and store
92instructions for example. We can feed this as input to `llvm-mc`:
93
94    $ echo "
95      clzeq r0, r0
96      clzeq r0, r1
97      clzeq r0, r2
98    " | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding
99
100And we will get the following output:
101
102            .text
103            clzeq   r0, r0                  @ encoding: [0x10,0x0f,0x6f,0x01]
104            clzeq   r0, r1                  @ encoding: [0x11,0x0f,0x6f,0x01]
105            clzeq   r0, r2                  @ encoding: [0x12,0x0f,0x6f,0x01]
106
107The script will finally extract the encoding and compare it to what VIXL
108generated.
109"""
110
111import argparse
112import subprocess
113import os
114import re
115import itertools
116import types
117
def BuildOptions():
  """
  Parse the command line and return the resulting `argparse` namespace.

  Two options are recognised: `--llvm-mc`, the path to the `llvm-mc` binary
  (defaults to `llvm-mc`), and the `--verbose`/`-v` flag.
  """
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
      description='Use `llvm-mc` to check the assembler traces are correct.')
  parser.add_argument('--llvm-mc', help='Path to llvm-mc', default='llvm-mc')
  parser.add_argument('--verbose', '-v', action='store_true')
  options = parser.parse_args()
  return options
125
126
def CheckLLVMVersion(llvm_mc):
  """
  Make sure the `llvm-mc` binary at `llvm_mc` is recent enough for this script.

  Runs `<llvm_mc> -version`, looks for "LLVM version <major>.<minor>" in its
  output and raises an exception if that version is lower than 3.8 or if no
  version could be found in the output at all.
  """
  version = subprocess.check_output([llvm_mc, '-version']).decode()
  # Accept multi-digit components (e.g. "14.0.6") and ignore whatever surrounds
  # the "<major>.<minor>" pair: a vendor prefix, the patch level or a suffix
  # such as "git" or "svn".
  m = re.search(r"LLVM version (\d+)\.(\d+)", version)
  if m is None:
    raise Exception(
        "Failed to find the LLVM version in the output of "
        "`{} -version`.".format(llvm_mc))
  major, minor = (int(component) for component in m.groups())
  if (major, minor) < (3, 8):
    raise Exception("This script requires LLVM version 3.8 or higher.")
134
def ConvertToLLVMFormat(vixl_instruction, triple):
  """
  Take a string representing an instruction and convert it to assembly syntax
  for LLVM. VIXL's test generation framework will print instruction
  representations as a space separated list. The first element is the mnemonic
  and the following elements are operands.

  `vixl_instruction` may hold several instructions separated by ';' (an IT
  instruction followed by the instruction under test for example); the result
  then contains one line per instruction. `triple` is the LLVM triple being
  targeted; "thumbv8" enables a set of additional, T32 specific, conversions.

  An exception is raised if any of the instructions cannot be converted.
  """

  def DtUntypedToLLVM(matches):
    # `matches` is (mnemonic, dt_untyped, dregister, dregister, dregister).
    # LLVM spells untyped data types with their size only.
    untyped_sizes = {"untyped8": "8", "untyped16": "16", "untyped32": "32"}
    if matches[1] not in untyped_sizes:
      raise Exception("Unknown untyped data type {}.".format(matches[1]))

    return "{}.{} {}, {}, {}".format(matches[0], untyped_sizes[matches[1]],
                                     matches[2], matches[3], matches[4])

  # Dictionary of patterns. The key is an identifier used in
  # `llvm_mc_instruction_converters` below. The value needs to be a capturing
  # regular expression.
  pattern_matchers = {
      # Allow an optional underscore in case this an "and" instruction.
      "mnemonic": r"(\w+?)_?",
      "condition":
          "(al|eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)",
      "register":
          "(r0|r1|r2|r3|r4|r5|r6|r7|r8|r9|r10|r11|r12|r13|r14|r15|pc|sp|lr)",
      "immediate": "(0x[0-9a-f]+|[0-9]+)",
      "shift": "(lsl|lsr|asr|ror)",
      "dregister": "(d[0-9]|d[12][0-9]|d3[01])",
      "dt": "(s8|s16|s32|s64|u8|u16|u32|u64|f16|f32|f64|i8|i16|i32|i64|p8|p64)",
      "dt_untyped": "(untyped8|untyped16|untyped32)"
  }

  # List of converters. Each of them represents an instruction form and what to
  # convert it to. This list needs to be complete; an exception is raised if we
  # couldn't find a converter for the instruction.
  #
  # The first part of each tuple is a pattern to match. It's simply a regular
  # expression. Additionally, each identifier in curly braces is replaced by the
  # corresponding pattern from `pattern_matchers`.
  #
  # The second part of the tuple is either a string that describes what the
  # result will look like (empty curly braces are replaced by matches, in
  # order), or a function receiving the tuple of matches and returning the
  # converted instruction.
  llvm_mc_instruction_converters = [
      ("it {condition}", "it {}"),
      ("{mnemonic} {condition} {register} {immediate}",
       "{}{} {}, #{}"),
      ("{mnemonic} {condition} {register} {register} {immediate}",
       "{}{} {}, {}, #{}"),
      ("{mnemonic} {condition} {register} {register}",
       "{}{} {}, {}"),
      ("{mnemonic} {condition} {register} {register} {register}",
       "{}{} {}, {}, {}"),
      ("{mnemonic} {register} {register} {register}",
       "{} {}, {}, {}"),
      ("{mnemonic} {condition} {register} {register} {register} {shift} "
           "{immediate}",
       "{}{} {}, {}, {}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} {register} {shift} "
           "{register}",
       "{}{} {}, {}, {}, {} {}"),
      ("{mnemonic} {condition} {register} {register} {shift} {immediate}",
       "{}{} {}, {}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} {shift} {register}",
       "{}{} {}, {}, {} {}"),
      ("{mnemonic} {condition} {register} {register} plus {immediate} offset",
       "{}{} {}, [{}, #{}]"),
      ("{mnemonic} {condition} {register} {register} minus {immediate} offset",
       "{}{} {}, [{}, #-{}]"),
      ("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
       "{}{} {}, [{}], #{}"),
      ("{mnemonic} {condition} {register} {register} minus {immediate} "
           "postindex",
       "{}{} {}, [{}], #-{}"),
      ("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
       "{}{} {}, [{}, #{}]!"),
      ("{mnemonic} {condition} {register} {register} minus {immediate} "
           "preindex",
       "{}{} {}, [{}, #-{}]!"),
      ("{mnemonic} {condition} {register} {register} plus {register} offset",
       "{}{} {}, [{}, {}]"),
      ("{mnemonic} {condition} {register} {register} minus {register} offset",
       "{}{} {}, [{}, -{}]"),
      ("{mnemonic} {condition} {register} {register} plus {register} postindex",
       "{}{} {}, [{}], {}"),
      ("{mnemonic} {condition} {register} {register} minus {register} "
           "postindex",
       "{}{} {}, [{}], -{}"),
      ("{mnemonic} {condition} {register} {register} plus {register} preindex",
       "{}{} {}, [{}, {}]!"),
      ("{mnemonic} {condition} {register} {register} minus {register} preindex",
       "{}{} {}, [{}, -{}]!"),
      ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
           "{immediate} offset",
       "{}{} {}, [{}, {}, {} #{}]"),
      ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
           "{immediate} offset",
       "{}{} {}, [{}, -{}, {} #{}]"),
      ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
           "{immediate} postindex",
       "{}{} {}, [{}], {}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
           "{immediate} postindex",
       "{}{} {}, [{}], -{}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
           "{immediate} preindex",
       "{}{} {}, [{}, {}, {} #{}]!"),
      ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
           "{immediate} preindex",
       "{}{} {}, [{}, -{}, {} #{}]!"),
      ("{mnemonic} {dt} {dregister} {dregister} {dregister}",
       "{}.{} {}, {}, {}"),
      ("{mnemonic} {dt_untyped} {dregister} {dregister} {dregister}",
       DtUntypedToLLVM)
  ]

  # Work around issues in LLVM 3.8.
  if triple == "thumbv8":
    def ConvertMovRdImm(matches):
      """
      LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
      immediate fits both into a modified immediate (T2 encoding) and 16
      bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
      be used.
      """
      # The immediate is the second capture in "mov al {register} {immediate}".
      # The `immediate` pattern accepts both "0x" prefixed hexadecimal and
      # plain decimal, so pick the base accordingly.
      text = matches[1]
      imm = int(text, 16) if text.startswith("0x") else int(text)
      if imm <= 0xffff:
        lsb = imm & -imm
        # True when all set bits fit in an 8 bit window starting at the lowest
        # set bit.
        if (imm >> 8) < lsb:
          return "mov.w {}, #{}".format(*matches)
      # Fall back to a LLVM making the right decision.
      return "mov {}, #{}".format(*matches)
    # These rules take precedence over the generic ones above.
    llvm_mc_instruction_converters[:0] = [
        # The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
        # instruction with an immediate, the assembler should use the ADR
        # encoding. LLVM does not know about this subtlety. We get around this
        # by manually translating the instruction to their ADR form.
        ("add al {register} pc {immediate}", "adr {}, #{}"),
        ("sub al {register} pc {immediate}", "adr {}, #-{}"),

        # LLVM is (rightfully) being helpful by swapping register operands so
        # that the 16 bit encoding of the following instructions is used.
        # However, VIXL does not do this. These rules specifically add the `.w`
        # modifier to force LLVM to use the 32 bit encoding if the last register
        # is identical to first one. But at the same time, we should still use
        # the narrow encoding if all registers are the same.
        ("adcs al {register} (\\1) (\\1)", "adcs.n {}, {}, {}"),
        ("adcs al {register} {register} (\\1)", "adcs.w {}, {}, {}"),
        ("orrs al {register} (\\1) (\\1)", "orrs.n {}, {}, {}"),
        ("orrs al {register} {register} (\\1)", "orrs.w {}, {}, {}"),
        ("eors al {register} (\\1) (\\1)", "eors.n {}, {}, {}"),
        ("eors al {register} {register} (\\1)", "eors.w {}, {}, {}"),
        ("ands al {register} (\\1) (\\1)", "ands.n {}, {}, {}"),
        ("ands al {register} {register} (\\1)", "ands.w {}, {}, {}"),
        # Solve the same issue as for the previous rules, however, we need to
        # take into account that ADD instructions with the stack pointer have
        # additional 16 bit forms.
        ("add al {register} (\\1) (\\1)", "add.n {}, {}, {}"),
        ("add al {register} (\\1) r13", "add.w {}, {}, sp"),
        ("add al {register} r13 (\\1)", "add.n {}, sp, {}"),
        ("add al {register} {register} (\\1)", "add.w {}, {}, {}"),
        ("mov al {register} {immediate}", ConvertMovRdImm)
    ]

  # Our test generator framework uses mnemonics starting with a capital letter.
  # We need everything to be lower case for LLVM.
  vixl_instruction = vixl_instruction.lower()

  llvm_instruction = []

  # VIXL may have generated more than one instruction separated by ';'
  # (an IT instruction for example).
  for instruction in vixl_instruction.split(';'):
    # Strip out extra white spaces.
    instruction = instruction.strip()
    if not instruction:
      # Nothing to convert here (a trailing ';' for example).
      continue
    # Try all converters in the list, the first one to match wins.
    for pattern, result in llvm_mc_instruction_converters:
      # Build the regular expression for this converter.
      instruction_matcher = "^" + pattern.format(**pattern_matchers) + "$"
      match = re.match(instruction_matcher, instruction)
      if match:
        # If we have a match, the object will contain a tuple of substrings.
        if callable(result):
          # `result` is a function, call it to produce the instruction.
          llvm_instruction.append(result(match.groups()))
        else:
          # `result` is a string, use it as the format string.
          llvm_instruction.append(result.format(*match.groups()))
        break
    else:
      # No converter worked for this instruction. Raise rather than silently
      # dropping it, otherwise the encodings checked later would not line up
      # with the instructions they are supposed to describe.
      raise Exception("Unsupported instruction {}.".format(instruction))

  if not llvm_instruction:
    # There was no instruction to convert at all.
    raise Exception(
        "Unsupported instruction {}.".format(vixl_instruction.strip()))

  return "\n".join(llvm_instruction)
338
339
def ReadTrace(trace):
  """
  Receive the content of an assembler trace, extract the relevant information
  and return it as a list of tuples. The first part of each tuple is a string
  representing the instruction. The second part is a list of bytes representing
  the encoding.

  Only lines made of two spaces, at least two comma separated bytes and a
  `// <instruction>` comment are picked up; everything else is ignored.

  For example:

      [
        ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
      ]
  """

  # `(?:...)` is a non-capturing group: one or more "0x??, " followed by the
  # last byte, which has no trailing comma.
  pattern = re.compile(
      r"^  (?P<encoding>(?:0x[0-9a-f]{2}, )+0x[0-9a-f]{2}) // (?P<instruction>.*)$",
      re.M)
  return [
      (m.group('instruction'), m.group('encoding').split(", "))
      for m in pattern.finditer(trace)
  ]
363
364
def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
  """
  Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
  encoded them the same way as VIXL. `triple` allows us to specify either
  "thumbv8" or "armv8".

  `llvm_mc` is the path to the `llvm-mc` binary and `f` is a file object opened
  on an assembler trace. Mismatches are reported on the standard output; if
  `llvm-mc` writes anything to its standard error, that output is printed
  instead and no comparison takes place.
  """

  vixl_reference = ReadTrace(f.read())
  if not vixl_reference:
    # `zip(*[])` yields nothing to unpack; there is nothing to verify anyway.
    print("Warning: no instruction found in the trace, nothing to verify.")
    return
  vixl_instructions, vixl_encodings = zip(*vixl_reference)
  instructions = [
      ConvertToLLVMFormat(instruction, triple)
      for instruction in vixl_instructions
  ]
  llvm_mc_proc = subprocess.Popen(
      [llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
       # LLVM fails to recognize some instructions as valid T32 when we do not
       # set `-mcpu`.
       '-mcpu=cortex-a53', '-show-encoding'],
      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  out, err = llvm_mc_proc.communicate("\n".join(instructions).encode())
  # If `llvm-mc` printed something to stderr then stop.
  if err:
    print(err.decode())
    return

  # Extract list of bytes from `llvm-mc` output. It's in the following form:
  #
  #         clzeq   r0, r0                  @ encoding: [0x10,0x0f,0x6f,0x01]
  #                                                      ^^^^ ^^^^ ^^^^ ^^^^
  llvm_encodings = [
      match_object.group('encoding').replace(" ", "").split(",")
      for match_object in re.finditer(r".*@ encoding: \[(?P<encoding>.*)\]",
                                      out.decode())
  ]

  # If LLVM has generated exactly twice as many instructions, we assume this is
  # due to IT instructions preceding every instruction under test. VIXL's
  # assembly reference files will contain a single array of 4 bytes encoding
  # both the IT and the following instruction. While LLVM will have decoded them
  # into two separate 2 bytes arrays.
  if len(llvm_encodings) == 2 * len(vixl_encodings):
    llvm_encodings = [
        it_encoding + instruction_encoding
        for it_encoding, instruction_encoding in zip(llvm_encodings[0::2],
                                                     llvm_encodings[1::2])
    ]

  # Check the encodings from LLVM are identical to VIXL's.
  if len(llvm_encodings) != len(vixl_encodings):
    print("""Error: llvm-mc generated {} instructions than there are in the
generated trace.
        """.format("fewer" if len(llvm_encodings) < len(vixl_encodings) else "more"))
    return

  for vixl_instruction, vixl_encoding, llvm_encoding in zip(
      vixl_instructions, vixl_encodings, llvm_encodings):
    if llvm_encoding != vixl_encoding:
      print("""Error: llvm-mc disagrees on the encoding of \"{instruction}\":
  LLVM-MC: {llvm}
  VIXL:    {vixl}
            """.format(instruction=vixl_instruction.replace("\n", "; "),
                       llvm=llvm_encoding,
                       vixl=vixl_encoding))
425
426
if __name__ == "__main__":
  # Entry point: check every assembler trace under `test/aarch32/traces/`
  # against `llvm-mc`, in alphabetical order.
  options = BuildOptions()

  CheckLLVMVersion(options.llvm_mc)

  trace_dir = 'test/aarch32/traces/'
  assembler_traces = sorted(
      name for name in os.listdir(trace_dir) if name.startswith("assembler-"))
  for trace_name in assembler_traces:
    if options.verbose:
      print("Verifying \"" + trace_name + "\".")
    with open(os.path.join(trace_dir, trace_name), "r") as trace:
      # The instruction set is deduced from the name of the trace.
      if "t32" in trace_name:
        triple = "thumbv8"
      elif "a32" in trace_name:
        triple = "armv8"
      else:
        raise Exception("Failed to recognize the ISA in \"" + trace_name + "\".")
      VerifyInstructionsWithLLVMMC(options.llvm_mc, trace, triple)
449