#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases, and it supports both 32- and 64-bit modes of
# operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on the mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_hw";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is written mostly in 32-bit mnemonics, integer code mostly in
# 64-bit ones. The goal is to maintain both 32- and 64-bit code within a
# single module and transliterate the common code to either flavour with
# regex voodoo, as sketched in the comment below.
#
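# A rough, illustration-only sketch of that transliteration (this comment
# is not emitted into the output): a common-code line such as
#
#	vld1.32	{q8},[x7],#16
#
# comes out of the 64-bit post-processing pass at the bottom of this file
# approximately as
#
#	ld1	{v16.4s},[x7],#16
#
# (q8-q15 are remapped to v16-v23, so the 64-bit flavour can leave the
# callee-saved v8-v15/d8-d15 range untouched), while the 32-bit pass keeps
# the legacy mnemonic and only rewrites the operands:
#
#	vld1.32	{q8},[r7]!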
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# On AArch64, put the data in .rodata and use adrp + add for compatibility
# with execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
$code.=<<___	if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___	if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo

	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	// affected by silicon errata #1742098 [0] and #1655431 [1],
	// respectively, where the second instruction of an aese/aesmc
	// instruction pair may execute twice if an interrupt is taken right
	// after the first instruction consumes an input register of which a
	// single 32-bit lane has been updated the last time it was modified.
	//
	// This function uses a counter in one 32-bit lane. The vmov.32 lines
	// could write to $dat1 and $dat2 directly, but that trips these bugs.
	// We write to $ivec and copy to the final register as a workaround.
	//
	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 add		$tctr0,$ctr,#1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 rev		$tctr0,$tctr0
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 // Note the logic to update $dat0, $dat1, and $dat2 is written to work
	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	 // 32-bit mode. See the comment above.
	 veor		$in2,$in2,$rndlast
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so emit
	    # the raw bytes in that order. The correct solution would be the
	    # .inst directive, but older assemblers don't implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT";