1#!/usr/bin/python
2# Test tool to compare Capstone output with llvm-mc. By Nguyen Anh Quynh, 2014
3import array, os.path, sys
4from subprocess import Popen, PIPE, STDOUT
5from capstone import *
6
7
8# convert all hex numbers to decimal numbers in a text
def normalize_hex(a):
    """Return *a* with every ``0x``-prefixed hex literal rewritten in decimal.

    The text is scanned once, left to right, and each literal is converted
    on its own.  This matters when one literal is a textual prefix of
    another: ``0x1`` followed by ``0x10`` must yield ``1`` and ``16``, which
    a global search-and-replace of the first literal would get wrong.

    Only the lower-case ``0x`` prefix is recognised; the digits themselves
    may be upper or lower case.  A ``0x`` that is not followed by at least
    one hex digit is left untouched.

    >>> normalize_hex('add eax, 0x1, 0x10')
    'add eax, 1, 16'
    """
    import re  # local import so the function does not rely on file-level imports

    def to_decimal(match):
        # int() with base 16 accepts the '0x' prefix directly
        return str(int(match.group(0), 16))

    return re.sub(r'0x[0-9a-fA-F]+', to_decimal, a)
23
24
def run_mc(arch, hexcode, option, syntax=None):
    """Disassemble *hexcode* with ``llvm-mc`` and return the normalized text.

    arch    -- Capstone architecture constant (CS_ARCH_*).  It decides
               whether ``-mattr=+msa`` is passed (MIPS) and which comment
               marker is stripped from the result (X86, ARM64).
    hexcode -- text written to llvm-mc's standard input.
    option  -- list of additional llvm-mc command-line arguments.
    syntax  -- optional single extra argument, inserted before *option*.

    The second line of llvm-mc's combined stdout/stderr is taken as the
    disassembly.  The string 'FAILED to disassemble (MC)' is returned when
    the first output line contains 'invalid' or when there is no second
    line at all.
    """
    # llvm-mc appends a trailing comment on some targets; map arch -> marker
    comment_markers = {
        CS_ARCH_X86: '# ',
        CS_ARCH_ARM64: '// ',
    }

    def normalize(text):
        # lower-case and collapse every whitespace run (tabs included)
        text = ' '.join(text.lower().split())
        # drop the trailing comment, if this arch has a known marker
        marker = comment_markers.get(arch)
        if marker:
            cut = text.find(marker)
            if cut != -1:
                text = text[:cut]
        # remove the redundant spaces just inside braces
        text = text.replace('{ ', '{').replace(' }', '}')
        return text.strip()

    # Build the command line once; the argument order matches what the
    # tool has always passed: fixed flags, MSA attribute, syntax, options.
    command = ['llvm-mc', '-disassemble', '-print-imm-hex']
    if arch == CS_ARCH_MIPS:
        command.append('-mattr=+msa')
    if syntax:
        command.append(syntax)
    command.extend(option)

    # universal_newlines=True makes the pipes text-mode on Python 3 so that
    # a str can be sent and received; on Python 2 str is used either way.
    p = Popen(command, stdout=PIPE, stdin=PIPE, stderr=STDOUT,
              universal_newlines=True)
    output = p.communicate(input=hexcode)[0]
    lines = output.split('\n')

    # str.split always yields at least one element, so lines[0] is safe;
    # lines[1] is not, hence the explicit length check.
    if len(lines) < 2 or 'invalid' in lines[0]:
        return 'FAILED to disassemble (MC)'
    return normalize(lines[1])
66
# (alias, numeric name) pairs applied, in this order, to Capstone's MIPS
# output so that register aliases are rewritten to their numeric names.
_MIPS_REG_ALIASES = (
    ('$at', '$1'),
    ('$v0', '$2'), ('$v1', '$3'),
    ('$a0', '$4'), ('$a1', '$5'), ('$a2', '$6'), ('$a3', '$7'),
    ('$t0', '$8'), ('$t1', '$9'), ('$t2', '$10'), ('$t3', '$11'),
    ('$t4', '$12'), ('$t5', '$13'), ('$t6', '$14'), ('$t7', '$15'),
    ('$t8', '$24'), ('$t9', '$25'),
    ('$s0', '$16'), ('$s1', '$17'), ('$s2', '$18'), ('$s3', '$19'),
    ('$s4', '$20'), ('$s5', '$21'), ('$s6', '$22'), ('$s7', '$23'),
    ('$k0', '$26'), ('$k1', '$27'),
)


def _hex_to_bytes(code):
    """Convert text such as '0x90,0x91' into the raw bytes it spells out."""
    import binascii  # local import so no file-level import has to change

    digits = code.replace('0x', '').replace(',', '')
    # tolerate stray whitespace (e.g. a trailing newline) around the digits
    digits = ''.join(digits.split())
    return binascii.unhexlify(digits)


def _normalize_cs_output(arch, text):
    """Reduce Capstone's output to the form used for comparison."""
    text = normalize_hex(text).replace(' ', '')
    if arch == 'CS_ARCH_MIPS':
        for alias, numeric in _MIPS_REG_ALIASES:
            text = text.replace(alias, numeric)
    return text


def _normalize_mc_output(arch, text):
    """Reduce llvm-mc's output to the form used for comparison."""
    text = normalize_hex(text)
    if arch == 'CS_ARCH_MIPS':
        # a zero displacement is dropped: ' 0(' becomes '('
        text = text.replace(' 0(', '(')
    if arch == 'CS_ARCH_PPC':
        # remove '.+' and any remaining '.', then drop zero displacements
        text = text.replace('.+', '')
        text = text.replace('.', '')
        text = text.replace(' 0(', '(')
    text = text.replace(' ', '')
    return text.replace('opaque', '')


def test_file(fname):
    """Compare Capstone with llvm-mc for every test line in file *fname*.

    The first line of the file must look like ``# ARCH, MODE, OPTION``.
    Every later line that does not start with '#' is read as
    ``hexcode = expected asm``.  The bytes are disassembled with Capstone
    and with llvm-mc (via run_mc); both results are normalized and
    compared.  A mismatch is reported on stdout only when Capstone's
    result also differs from the expected asm stored in the file.

    Problems with the header (missing, malformed, or naming an arch/mode
    combination this tool has no table entry for) are reported with an
    ``ERROR:`` line and the file is skipped.  Returns None.
    """
    print("Test %s" %fname)
    with open(fname) as f:
        lines = f.readlines()

    # an empty file has no header line either
    if not lines or not lines[0].startswith('# '):
        print("ERROR: decoding information is missing")
        return

    # skip '# ' at the front, then split the header into its three fields.
    # The third field (option) may be '' or 'None' and is currently unused.
    fields = lines[0][2:].split(', ')
    if len(fields) != 3:
        print("ERROR: decoding information is malformed")
        return
    arch = fields[0]
    mode = fields[1].replace(' ', '')

    archs = {
        "CS_ARCH_ARM": CS_ARCH_ARM,
        "CS_ARCH_ARM64": CS_ARCH_ARM64,
        "CS_ARCH_MIPS": CS_ARCH_MIPS,
        "CS_ARCH_PPC": CS_ARCH_PPC,
        "CS_ARCH_SPARC": CS_ARCH_SPARC,
        "CS_ARCH_SYSZ": CS_ARCH_SYSZ,
        "CS_ARCH_X86": CS_ARCH_X86,
        "CS_ARCH_XCORE": CS_ARCH_XCORE
        # "CS_ARCH_M68K": CS_ARCH_M68K,
    }

    modes = {
        "CS_MODE_16": CS_MODE_16,
        "CS_MODE_32": CS_MODE_32,
        "CS_MODE_64": CS_MODE_64,
        "CS_MODE_MIPS32": CS_MODE_MIPS32,
        "CS_MODE_MIPS64": CS_MODE_MIPS64,
        "0": CS_MODE_ARM,
        "CS_MODE_ARM": CS_MODE_ARM,
        "CS_MODE_THUMB": CS_MODE_THUMB,
        "CS_MODE_ARM+CS_MODE_V8": CS_MODE_ARM+CS_MODE_V8,
        "CS_MODE_THUMB+CS_MODE_V8": CS_MODE_THUMB+CS_MODE_V8,
        "CS_MODE_THUMB+CS_MODE_MCLASS": CS_MODE_THUMB+CS_MODE_MCLASS,
        "CS_MODE_LITTLE_ENDIAN": CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_BIG_ENDIAN": CS_MODE_BIG_ENDIAN,
        "CS_MODE_64+CS_MODE_LITTLE_ENDIAN": CS_MODE_64+CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_64+CS_MODE_BIG_ENDIAN": CS_MODE_64+CS_MODE_BIG_ENDIAN,
        "CS_MODE_MIPS32+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO,
        "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
        "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
        "CS_MODE_BIG_ENDIAN+CS_MODE_V9": CS_MODE_BIG_ENDIAN + CS_MODE_V9,
        "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN,
        "CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN,
        "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN,
    }

    # llvm-mc command-line arguments for each (arch, mode) header pair
    mc_modes = {
        ("CS_ARCH_X86", "CS_MODE_32"): ['-triple=i386'],
        ("CS_ARCH_X86", "CS_MODE_64"): ['-triple=x86_64'],
        ("CS_ARCH_ARM", "CS_MODE_ARM"): ['-triple=armv7'],
        ("CS_ARCH_ARM", "CS_MODE_THUMB"): ['-triple=thumbv7'],
        ("CS_ARCH_ARM", "CS_MODE_ARM+CS_MODE_V8"): ['-triple=armv8'],
        ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_V8"): ['-triple=thumbv8'],
        ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_MCLASS"): ['-triple=thumbv7m'],
        ("CS_ARCH_ARM64", "0"): ['-triple=aarch64'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN"): ['-triple=mips'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO"): ['-triple=mipsel', '-mattr=+micromips'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS64"): ['-triple=mips64el'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS32"): ['-triple=mipsel'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN"): ['-triple=mips64'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN"): ['-triple=mips', '-mattr=+micromips'],
        ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO"): ['-triple=mips', '-mattr=+micromips'],
        ("CS_ARCH_PPC", "CS_MODE_BIG_ENDIAN"): ['-triple=powerpc64'],
        ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN'): ['-triple=sparc'],
        ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN+CS_MODE_V9'): ['-triple=sparcv9'],
        ('CS_ARCH_SYSZ', '0'): ['-triple=s390x', '-mcpu=z196'],
    }

    if arch not in archs or mode not in modes:
        print("ERROR: unsupported arch/mode: %s, %s" % (arch, mode))
        return

    # The llvm-mc arguments do not depend on the individual test line, so
    # pick them once.  Two test files need a fixed override.
    if fname.endswith('thumb-fp-armv8.s.cs'):
        mc_args = ['-triple=thumbv8']
    elif fname.endswith('mips64-alu-instructions.s.cs'):
        mc_args = ['-triple=mips64el', '-mcpu=mips64r2']
    elif (arch, mode) in mc_modes:
        mc_args = mc_modes[(arch, mode)]
    else:
        print("ERROR: no llvm-mc options known for: %s, %s" % (arch, mode))
        return

    md = Cs(archs[arch], modes[mode])

    mc_option = None
    if arch == 'CS_ARCH_X86':
        # tell llvm-mc to use Intel syntax
        mc_option = '-output-asm-variant=1'

    if arch == 'CS_ARCH_ARM' or arch == 'CS_ARCH_PPC':
        md.syntax = CS_OPT_SYNTAX_NOREGNAME

    if fname.endswith('3DNow.s.cs'):
        md.syntax = CS_OPT_SYNTAX_ATT

    for line in lines[1:]:
        # ignore all the input lines having # in front, and blank lines
        if line.startswith('#') or not line.strip():
            continue

        # "hexcode = asm"; any further ' = ' separators are dropped from asm
        parts = line.split(' = ')
        code = parts[0]
        asm = ''.join(parts[1:])
        hex_data = _hex_to_bytes(code)

        insns = list(md.disasm(hex_data, 0))
        if insns:
            # only the first decoded instruction is compared
            first = insns[0]
            if first.op_str != '':
                cs_output = "%s %s" % (first.mnemonic, first.op_str)
            else:
                cs_output = first.mnemonic
        else:
            cs_output = 'FAILED to disassemble'

        cs_output2 = _normalize_cs_output(arch, cs_output)

        mc_output = run_mc(archs[arch], code, mc_args, mc_option)
        mc_output2 = _normalize_mc_output(arch, mc_output)

        if cs_output2 != mc_output2:
            # Capstone may still agree with the expected text in the file
            asm = asm.replace(' ', '').strip().lower()
            if asm != cs_output2:
                print("Mismatch: %s" %line.strip())
                print("\tMC = %s" %mc_output)
                print("\tCS = %s" %cs_output)
252
253
if __name__ == '__main__':
    if len(sys.argv) == 1:
        # No arguments: read test-file paths from stdin, one per line.
        # Blank lines are dropped so that they are not opened as files.
        stripped = [entry.strip() for entry in sys.stdin.readlines()]
        fnames = [entry for entry in stripped if entry]
    else:
        # Usage: ./test_mc.py <input-file.s.cs> [<input-file.s.cs> ...]
        fnames = sys.argv[1:]

    for fname in fnames:
        test_file(fname)
262
263