#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9    3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%       3.33            1.70
# Cortex-A53		8.40/+80%       4.72		4.72(*)
# Cortex-A57		8.06/+43%       4.90            4.43(**)
# Denver		4.50/+82%       2.63		2.67(*)
# X-Gene		9.50/+46%       8.82		8.89(*)
# Mongoose		8.00/+44%	3.64		3.25
#
# (*)	it's expected that doubling the interleave factor doesn't help
#	all processors, only those with higher NEON latency and
#	higher instruction issue rate;
# (**)	the expected improvement was actually higher;

$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
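# The method name becomes the mnemonic with its first '_' turned into '.',
# and a numeric last argument picks up a '#' prefix, e.g.
# &ror_32("w5","w5",16) appends "\tror.32\tw5,w5,#16" to $code.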

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
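# @x holds the 16 32-bit words of the working state in general-purpose
# registers (x18 is skipped as the platform register), while @d keeps the
# input block packed two words per 64-bit register for fast accumulation.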

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
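# Each map rotates the indices within their aligned group of four, so the
# quadruple passed in, e.g. (0,4,8,12), yields (1,5,9,13), (2,6,10,14) and
# (3,7,11,15): four quarter-rounds emitted in interleaved fashion.  ChaCha's
# left rotations by 16/12/8/7 appear below as right rotations by 16/20/24/25.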

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}

$code.=<<___;
#include <openssl/arm_arch.h>

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
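	// 192 bytes or more: probe OPENSSL_armcap_P and take the NEON
	// path if the processor advertises it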
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
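	// 20 rounds, i.e. 10 iterations of one column round plus one
	// diagonal round; flags from the subs above steer .Ltail below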
.Loop:
	sub	$ctr,$ctr,#1
___
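# column round followed by diagonal round; evaluating the generated
# strings appends the interleaved scalar instructions to $code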
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len
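	// the final partial block of keystream is spilled to the stack and
	// XORed in byte by byte; out was pre-decremented to offset the
	// pre-increment of the index in .Loop_tail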

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
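# Three 4x32 NEON "threads" (the A/B/C/D rows of three parallel blocks)
# run alongside one scalar block; @K caches the expanded key block and
# $ONE holds the counter increment.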

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;
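# Vector quarter-round: the rotate by 16 is a single rev32, the other
# rotates are ushr/sli pairs; the trailing ext instructions rotate lanes
# so that even calls diagonalize the state and odd calls undo it.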

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}

$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4
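	// the scalar registers process the block at counter+0 while the
	// NEON threads carry counters +1/+2/+3, i.e. four 64-byte blocks
	// (256 bytes) per outer iteration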

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	 mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	 mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	 mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	 mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	 mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	 mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	 mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	 mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);
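	# interleave three NEON quarter-round streams with the scalar round
	# so the SIMD and integer pipelines stay busy simultaneously
	# (the 3xNEON+1xIALU column in the table above)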

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	 add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	 add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	 add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	 add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	 add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	 add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	 add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	 add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	 add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	 add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	 add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo
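	// the bias above makes the in-loop subs/b.hs pair test whether a
	// further full 512-byte chunk remains after the current one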

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;
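	# six NEON quarter-round streams interleaved with two scalar rounds
	# (the 6xNEON+2xIALU column in the table above); eight 64-byte
	# blocks, 512 bytes in total, are produced per outer iteration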

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

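# Post-process the generated code: the pseudo-suffixes used above are
# rewritten into proper AArch64 syntax -- ".32" ops get w registers,
# vector eor/ext/mov and ld1/st1.8 get the .16b arrangement, ldr/ldp/str/stp
# of vectors get q registers, and rev32.16 becomes rev32 on .8h lanes.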
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# flush
