1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on the mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable mode performance
# seems to be limited by the sheer number of NEON instructions...
29#
30# Performance in cycles per byte processed with 128-bit key:
31#
32#		CBC enc		CBC dec		CTR
33# Apple A7	2.39		1.20		1.20
34# Cortex-A53	1.32		1.29		1.46
35# Cortex-A57(*)	1.95		0.85		0.93
36# Denver	1.96		0.86		0.80
37# Mongoose	1.33		1.20		1.20
38#
39# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
40#	and are still same even for updated module;
41
# Command line: <flavour> [<output>]. Both values are passed on unchanged to
# arm-xlate.pl; $flavour additionally selects 32- vs 64-bit code below.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl, first next to this script and then in the shared
# perlasm directory three levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# Fail loudly if the pipe cannot be set up; an unchecked open would let the
# script run to completion while producing no output at all.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# Symbol prefix shared by every routine emitted by this module.
$prefix="aes_hw";

# Module header: the whole body is guarded by __ARM_MAX_ARCH__>=7 (the
# matching #endif is appended after the last routine).
$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
if ($flavour =~ /64/) {
    # 64-bit flavour: select ARMv8-A with the crypto extension.
    $code.=".arch	armv8-a+crypto\n";
} else {
    # 32-bit flavour: plain ARMv7-A with NEON, ARM (not Thumb-2) encoding.
    $code.=<<___;
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___
}
68
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer - mostly 64. The goal is to
# maintain both 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo.
#
{{{
# Key schedule setup: emits ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.
#
# Operands are written with AArch64 register names; the loop at the end of
# the file transliterates them for the 32-bit flavour.
#   $inp    - x0, pointer to the user key (rejected if zero)
#   $bits   - w1, key length in bits; reused as a loop counter afterwards
#   $out    - x2, pointer to the key schedule being written (rejected if zero)
#   $ptr    - x3, pointer into .Lrcon; also holds the value returned in x0
#   $rounds - w12, round count stored right behind the schedule
#
# Return value of set_encrypt_key: 0 on success, -1 if $inp or $out is zero,
# -2 if $bits is below 128, above 256 or not a multiple of 64.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON working registers. The 32-bit flavour uses q0-q3 and q8-q10 rather
# than q0-q6 (presumably to stay clear of q4-q7, i.e. d8-d15, which the
# other routines in this file save "because the ABI says so" -- TODO confirm).
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# On AArch64, put the data in .rodata and use adrp + add for compatibility with
# execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
# .Lrcon holds three 16-byte rows: the initial round constant, the vtbl mask
# ("rotate-n-splat") and the round constant 0x1b that the 128-bit path
# reloads after its eight-iteration loop. This heredoc also opens
# ${prefix}_set_encrypt_key; .Lenc_key is the local entry point that
# set_decrypt_key branches to.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# AArch64 only: push x29/x30 and set up the frame pointer.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument validation; $ptr is preloaded with the error code that
# .Lenc_key_abort returns.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
# Load the address of .Lrcon: adrp + add on AArch64, adr on AArch32.
$code.=<<___	if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___	if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
# Main body: dispatch on the key length (.Loop128 / .L192 / .L256), expand
# the key, store the round count at .Ldone and return. The same heredoc also
# emits the label that opens ${prefix}_set_decrypt_key.
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# set_decrypt_key calls .Lenc_key with "bl", so the return address has to be
# preserved in both flavours.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Build the encryption schedule, then convert it in place: walk inwards from
# both ends swapping round keys. The first and last keys are only swapped;
# every key in between is additionally passed through aesimc. A non-zero
# result from .Lenc_key is returned unchanged via .Ldec_key_abort.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
# Epilogues: AArch32 returns by popping pc; AArch64 restores x29/x30 and
# issues ret.
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) appends one single-block routine to $code:
# ${prefix}_encrypt when $dir is "en", ${prefix}_decrypt otherwise.
#
# The generated routine takes x0 = input block, x1 = output block and
# x2 = key schedule; the round count is read from offset 240 of the
# schedule. Rounds are processed two at a time with the two round-key
# registers being reloaded alternately.
#
# The sub is declared without a prototype on purpose: it takes one
# argument, so the former empty prototype "()" was wrong and went unnoticed
# only because the callers used the prototype-bypassing "&" call form.
sub gen_block {
my ($dir) = @_;
# Instruction suffixes: aese/aesmc for encryption, aesd/aesimc for decryption.
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
# Exactly three NEON registers are needed: two round keys and the data block.
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt: CBC-mode encryption and decryption.
#
#   $inp (x0) input          $out (x1) output        $len (x2) byte count
#   $key (x3) key schedule   $ivp (x4) IV            $enc (w5) direction
#
# The generated routine returns at once when $len is below 16, rounds $len
# down to a multiple of 16, takes the decryption path (.Lcbc_dec) when $enc
# is zero, and stores the updated IV through $ivp at .Lcbc_done.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
# $rounds reuses the $enc register: the flags from "cmp $enc,#0" are kept
# until the branch to .Lcbc_dec while w5 is reloaded with the round count.
# $step1 is not referenced by the code below.
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Pointers to round keys 4..7 for the 192/256-bit encryption path. Note that
# $key4 is x6 (the register behind $cnt) and $key7 overwrites $key itself.
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
# AArch64 prologue: frame record only.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# AArch32 prologue: save r4-r8/lr and d8-d15, then fetch the fifth and sixth
# arguments from the caller's stack (ip holds the incoming sp).
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Common setup followed by the encryption paths. $rounds is reduced by 8
# before the dispatch, so 2 selects the dedicated 128-bit loop
# (.Loop_cbc_enc128) and 4 makes the generic loop (.Loop_cbc_enc) skip
# straight to .Lcbc_enc192.
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# Decryption path. .Loop3x_cbc_dec processes three blocks per iteration and
# .Lcbc_dec_tail finishes the remaining one or two blocks; three more NEON
# registers are needed for the third block.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues, reached from both .Lcbc_done and .Lcbc_abort.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks: CTR mode with a 32-bit counter.
#
#   $inp (x0) input          $out (x1) output        $len (x2) block count
#   $key (x3) key schedule   $ivp (x4) 16-byte counter block
#
# The counter is the last 32-bit word of the block at $ivp; it is
# byte-reversed on little-endian builds, incremented in an integer register
# and inserted into lane 3 of each data register. The generated code never
# stores anything back through $ivp.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
# $ctr is the running counter; $tctr0-$tctr2 hold the byte-reversed values
# for the three blocks processed per iteration.
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
# AArch64 prologue: frame record only.
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
# AArch32 prologue: save r4-r10/lr and d8-d15, then fetch the fifth argument
# from the caller's stack (ip holds the incoming sp).
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
# Main body: .Loop3x_ctr32 handles three blocks per iteration; .Lctr32_tail
# handles a remainder of one or two blocks (and is entered directly when
# $len is at most 2).
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
# Epilogues.
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the "#if __ARM_MAX_ARCH__>=7" guard opened in the module header.
$code.="#endif\n";
913########################################
914if ($flavour =~ /64/) {			######## 64-bit code
915    my %opcode = (
916	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
917	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
918
919    local *unaes = sub {
920	my ($mnemonic,$arg)=@_;
921
922	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
923	sprintf ".inst\t0x%08x\t//%s %s",
924			$opcode{$mnemonic}|$1|($2<<5),
925			$mnemonic,$arg;
926    };
927
    # Transliterate the mixed-syntax source to AArch64 syntax, one line at
    # a time, and print the result (STDOUT is the pipe to arm-xlate.pl).
    foreach(split("\n",$code)) {
	# Expand `...` snippets embedded in the assembly by evaluating them
	# as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	# q0-q7 become v0-v7, q8 and above become v16 and above; every
	# register gets a .16b arrangement that may be adjusted below.
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	# Mnemonic fix-ups. The substitutions are chained with "or", so only
	# the first one that matches is applied to a given line. The first
	# entry (raw .inst encoding of AES instructions) is disabled.
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	# The element-size suffix is dropped from the mnemonic and, where
	# needed, expressed as the register arrangement instead: a post-index
	# of 8 means a 64-bit (.8b) access, .32 maps to .4s and .64 to .2d.
	# Lane references lose the element count (.4s[n] -> .s[n]).
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
954} else {				######## 32-bit code
955    my %opcode = (
956	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
957	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
958
959    local *unaes = sub {
960	my ($mnemonic,$arg)=@_;
961
962	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
963	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
964					 |(($2&7)<<1) |(($2&8)<<2);
965	    # since ARMv7 instructions are always encoded little-endian.
966	    # correct solution is to use .inst directive, but older
967	    # assemblers don't implement it:-(
968	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
969			$word&0xff,($word>>8)&0xff,
970			($word>>16)&0xff,($word>>24)&0xff,
971			$mnemonic,$arg;
972	}
973    };
974
975    sub unvtbl {
976	my $arg=shift;
977
978	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
979	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
980		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
981    }
982
983    sub unvdup32 {
984	my $arg=shift;
985
986	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
987	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
988    }
989
990    sub unvmov32 {
991	my $arg=shift;
992
993	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
994	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
995    }
996
    # Transliterate the mixed-syntax source to AArch32 syntax, one line at
    # a time, and print the result (STDOUT is the pipe to arm-xlate.pl).
    foreach(split("\n",$code)) {
	# Expand `...` snippets embedded in the assembly by evaluating them
	# as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	# Note: only single-digit vN registers are matched here.
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	# A post-index of 8 on a {qN} access becomes a write-back access to
	# the low half d(2N); any other immediate post-index becomes "!".
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	# Mnemonic fix-ups. The substitutions are chained with "or", so only
	# the first one that matches is applied to a given line: AES
	# instructions become raw bytes, cclr becomes a conditional
	# "mov reg,#0", quad-register vtbl/vdup/vmov forms are rewritten in
	# terms of d registers, "b.cc"/"mov.cc" lose the dot, and "ret"
	# becomes "bx lr".
	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
1019}
1020
# STDOUT is the pipe into arm-xlate.pl. Closing it flushes the generated
# code and waits for the translator, so a failure here means the output is
# incomplete or the translator reported an error; do not exit successfully.
# For a piped handle $! is 0 when the only problem is a non-zero exit status,
# which is then available in $?.
close STDOUT
    or die $! ? "error closing STDOUT: $!"
	      : "arm-xlate.pl exited with wait status $?";
1022