#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer number of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;

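# The script expects a perlasm "flavour" and an output file name on
# the command line; any flavour matching /64/ (e.g. linux64) selects
# the 64-bit code paths below, anything else the 32-bit ones.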
$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__)
.arch  armv8-a+crypto
#endif
___
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on the latest binutils.
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both 32- and 64-bit code within a single module and to
# transliterate common code to either flavour with regex voodoo.
#
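# For instance, a common-code line such as "vld1.32 {q8},[x7],#16" is
# emitted essentially as-is for the 32-bit flavour, while the 64-bit
# post-processing pass at the end of this file rewrites it to
# "ld1 {v16.4s},[x7],#16": q8 and above are renamed to v16 and above,
# the leading v is stripped, and the .32 element suffix becomes .4s.
# (The concrete line is a made-up illustration of that regex pass.)
#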
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
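# (The 32-bit flavour avoids q4-q7, presumably because those alias
# d8-d15, which the AAPCS makes callee-saved, and set_encrypt_key
# does not save them.)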


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
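// First row: round-constant seed, doubled each iteration below.
// Second row: vtbl index pattern that rotates the last key word by
// one byte and splats it to every lane. Third row: rcon value 0x1b
// for the final iterations, once doubling overflows past 0x80.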

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

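// One round key per .Loop128 iteration: vtbl applies RotWord, aese
// with a zero-valued round key reduces to SubBytes (ShiftRows is a
// no-op because all four lanes are identical), and the vext/veor
// ladder xors in the previous round-key words and the round constant.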
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

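# Each generated block function processes two rounds per loop
# iteration; the last two rounds are peeled off because the final
# round omits MixColumns (plain aes$e followed by veor with the last
# round key instead of aes$mc).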
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way
					// that at exit from the loop
					// $dat1-$dat2 are loaded with the
					// last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
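
    # A worked example of the encoding above (note the unaes
    # substitution itself is currently commented out in the loop
    # below): "aese v0.16b,v1.16b" gives 0x4e284800|0|(1<<5) and
    # would be emitted as ".inst 0x4e284820 //aese v0.16b,v1.16b".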

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit byte order below. The correct solution is the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
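
    # For example, "aese q0,q1" encodes to 0xf3b00302 and is emitted
    # little-endian as ".byte 0x02,0x03,0xb0,0xf3 @ aese q0,q1".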

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
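
    # e.g. "q3,{q0},q7" becomes the d-register pair
    # "vtbl.8 d6,{q0},d14" / "vtbl.8 d7,{q0},d15".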

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
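
    # e.g. "q4,q0[3]" becomes "vdup.32 q4,d1[1]" (lane 3 of q0 is
    # lane 1 of its odd d register).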

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
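
    # e.g. "q1[3],r9" becomes "vmov.32 d3[1],r9" (the general
    # register was already renamed by the pass below).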

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;