#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte, out of a large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;
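#
# For reference, the ChaCha20 quarter-round that both code paths below
# implement is (all operations on 32-bit words):
#
#	a += b; d ^= a; d = rotl(d,16);
#	c += d; b ^= c; b = rotl(b,12);
#	a += b; d ^= a; d = rotl(d,8);
#	c += d; b ^= c; b = rotl(b,7);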

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
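
# For example, &add("r0","r0","r1") appends "\tadd\tr0,r0,r1\n" to
# $code; a numeric last argument gets an immediate marker, so
# &mov("r0","r0",16) yields "\tmov\tr0,r0,#16"; and the underscore in
# an opcode name becomes a dot, so &vadd_i32() emits "vadd.i32".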

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
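# The three maps above step each index within its row of four, i.e.
# $i -> ($i & ~3) + (($i+1) & 3), so (0,4,8,12) becomes (1,5,9,13),
# then (2,6,10,14), then (3,7,11,15); likewise (0,5,10,15) becomes
# (1,6,11,12), (2,7,8,13) and (3,4,9,14), matching the table below.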
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their indices:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory.
	# If you observe the 'c' column, you'll notice that each pair
	# of 'c's is invariant between rounds. This means that we have
	# to reload them only once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end. If you observe the 'd'
	# column, you'll notice that 15 and 13 are reused in the next
	# pair of rounds, which is why these two are chosen for
	# offloading to memory, so that the loads count for more.
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}
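
# Note that ROUND() returns a list of perlasm strings rather than
# emitting code directly; callers eval them one at a time, which is
# what lets the NEON code path below interleave this scalar round
# with vector instructions.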

$code.=<<___;
#include <openssl/arm_arch.h>

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif
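@ (in unified syntax, as used by Thumb-2 and clang's integrated
@ assembler, the condition code follows the size suffix in a
@ conditional byte load, hence the ldrbhs spelling; divided ARM
@ syntax spells it ldrhsb)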

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have the first half of the 512-bit result in
	@ @x[0-7] and the second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
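
# NEON has no rotate instruction, so the 12-, 8- and 7-bit rotations
# above are done as a shift-right of the xor result into the
# destination followed by a shift-left-and-insert: e.g. vshr.u32 by 20
# plus vsli.32 by 12 amounts to a rotate left by 12 (the 16-bit case
# is cheaper as vrev32.16). The trailing vext.8 lines rotate the 'b',
# 'c' and 'd' lanes so that the same quarter-round code covers columns
# on even rounds and diagonals on odd ones.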

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon
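@ ChaCha20_neon processes four 64-byte blocks per iteration of the
@ outer loop: three in NEON registers (block counters +0, +1, +2)
@ interleaved with one in the integer registers (counter+3), which is
@ the "3xNEON+1xIALU" approach from the table at the top of the file.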

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have the first half of the 512-bit result in
	@ @x[0-7] and the second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from the NEON frame, but save to the integer-only one; the
	@ distance between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

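# Final output pass: expand `...` constructs and rewrite the q<N>#lo /
# q<N>#hi notation used above into the underlying 64-bit d registers,
# e.g. q8#lo becomes d16 and q8#hi becomes d17.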
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;
