#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It also supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates based
# on AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;

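# This is a "perlasm" generator script: it is invoked with a flavour
# argument and an output path, and pipes the generated source through
# arm-xlate.pl. The flavour names "linux64" and "linux32" given here
# are illustrative of the usual perlasm convention.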
$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_hw";

$code=<<___;
#include <GFp/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit ones.
# The goal is to maintain both 32- and 64-bit codes within a single
# module and transliterate common code to either flavour with regex
# voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
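# Note the 32-bit mapping skips q4-q7: their d8-d15 halves are
# callee-saved under the AAPCS, and the key-setup code below does not
# save them.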


# On AArch64, put the data in .rodata and use adrp + add for
# compatibility with execute-only memory. On AArch32, put it in .text
# and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
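// The first and third rows are the AES round constants 0x01 and 0x1b
// splatted across four words; the middle row is a byte-table index
// pattern selecting source bytes 13,14,15,12, i.e. RotWord of the top
// word replicated into every lane.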

.text

.globl	GFp_${prefix}_set_encrypt_key
.type	GFp_${prefix}_set_encrypt_key,%function
.align	5
GFp_${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
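# Return value (kept in $ptr, copied to x0 at .Lenc_key_abort): -1 for
# a NULL input or output pointer, -2 for an invalid key length, 0 on
# success.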
$code.=<<___	if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___	if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	// 192-bit key support was removed.
	b	.L256

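	// Each .Loop128 iteration derives the next round key per FIPS-197:
	// the table lookup extracts RotWord of the previous key's top word
	// splatted to all four lanes, aese against an all-zero key applies
	// SubBytes (ShiftRows is a no-op when all four words are equal),
	// the ext/eor ladder forms the running XOR of the four words, and
	// rcon is doubled for the next round.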
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

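	// The eight iterations above consumed rcon values 0x01 through
	// 0x80; reload 0x1b from the last row of .Lrcon for the ninth
	// round key, doubled below to 0x36 for the tenth.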
	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

// 192-bit key support was removed.

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

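	// 256-bit expansion alternates between the two key halves: the
	// first half of each iteration applies the full RotWord+SubBytes+
	// rcon transform, while the second half (after the b.eq exit)
	// splats the top word without rotation and applies SubBytes only,
	// as FIPS-197 specifies for the middle words.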
.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

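	// Store the round count right past the schedule, at byte offset
	// 240, where the block routines read it back.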
.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	GFp_${prefix}_set_encrypt_key,.-GFp_${prefix}_set_encrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

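# The generated function runs two AES rounds per loop iteration (hence
# the rounds-2 bias below), then finishes with one full round plus the
# final round, which omits MixColumns; the last round key is applied by
# the trailing veor.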
$code.=<<___;
.globl	GFp_${prefix}_${dir}crypt
.type	GFp_${prefix}_${dir}crypt,%function
.align	5
GFp_${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	GFp_${prefix}_${dir}crypt,.-GFp_${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

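# The main loop interleaves three blocks to hide aese/aesmc latency:
# q8/q9 hold the two round keys currently in flight and are reloaded
# each iteration, while the last five round keys stay resident in
# q12-q15 and $rndlast.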
$code.=<<___;
.globl	GFp_${prefix}_ctr32_encrypt_blocks
.type	GFp_${prefix}_ctr32_encrypt_blocks,%function
.align	5
GFp_${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo

	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	// affected by silicon errata #1742098 [0] and #1655431 [1],
	// respectively, where the second instruction of an aese/aesmc
	// instruction pair may execute twice if an interrupt is taken right
	// after the first instruction consumes an input register of which a
	// single 32-bit lane has been updated the last time it was modified.
	//
	// This function uses a counter in one 32-bit lane. The vmov.32 lines
	// could write to $dat1 and $dat2 directly, but that trips these bugs.
	// We write to $ivec and copy to the final register as a workaround.
	//
	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
	b		.Loop3x_ctr32

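// In the loop body below, instructions indented with an extra space
// interleave the loads, counter updates and stores for the next three
// blocks with the tail rounds of the current three.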
.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 add		$tctr0,$ctr,#1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 rev		$tctr0,$tctr0
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 // Note the logic to update $dat0, $dat1, and $dat2 is written to
	 // work around a bug in ARM Cortex-A57 and Cortex-A72 cores running
	 // in 32-bit mode. See the comment above.
	 veor		$in2,$in2,$rndlast
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

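	// Encrypt the final one or two counter blocks. $step was cleared
	// above when only one block remains, so the second block's load
	// re-reads the same input and its store is skipped.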
.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	GFp_${prefix}_ctr32_encrypt_blocks,.-GFp_${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
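    # unaes emits raw .inst words as a fallback for assemblers without
    # Crypto-extension support; the substitution that would invoke it is
    # commented out in the loop below.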

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
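
	# For example, "vld1.32 {q8},[x7],#16" has by now become
	# "ld1 {v16.4s},[x7],#16", and "veor q0,q0,q1" has become
	# "eor v0.16b,v0.16b,v1.16b".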

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # Emit raw bytes, since ARMv7 instructions are always encoded
	    # little-endian. The correct solution is to use the .inst
	    # directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
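    # For instance, "aese q0,q15" is emitted as
    # ".byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15".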

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
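    # These helpers rewrite q-register forms that have no direct 32-bit
    # encoding in terms of d-register halves, e.g. "vtbl.8 q3,{q0},q9"
    # expands to "vtbl.8 d6,{q0},d18" plus "vtbl.8 d7,{q0},d19".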

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT";