1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if !defined(OPENSSL_NO_ASM)
11#if defined(BORINGSSL_PREFIX)
12#include <boringssl_prefix_symbols_asm.h>
13#endif
14#include <openssl/arm_arch.h>
15
16
17
// Read-only constants used by the ChaCha20 routines in this file.
18.section	__TEXT,__const
19
20.align	5
// The bytes of these two quads, taken in little-endian order, are the ASCII
// text "expand 32-byte k". They are loaded as the first four state words.
21Lsigma:
22.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}. It sits directly after Lsigma, so the NEON paths reach it
// by post-incrementing the Lsigma pointer by 16, and use it to step the first
// word of the counter row.
23Lone:
24.long	1,0,0,0
// Offset from this location to _OPENSSL_armcap_P (32 bits wide under
// __ILP32__, 64 bits otherwise).
// NOTE(review): no code visible in this part of the file references this
// label; the entry point addresses _OPENSSL_armcap_P via adrp/@PAGE instead.
25LOPENSSL_armcap_P:
26#ifdef	__ILP32__
27.long	_OPENSSL_armcap_P-.
28#else
29.quad	_OPENSSL_armcap_P-.
30#endif
// NUL-terminated ASCII attribution string; it begins with
// "ChaCha20 for ARMv8, CRYPTOGAMS by".
31.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
32.align	2
33
34.text
35
// _ChaCha20_ctr32: XOR a byte stream with the ChaCha20 keystream.
//
// Register arguments, as used by the code below:
//   x0  output pointer (written with stp/strb)
//   x1  input pointer (read with ldp/ldrb)
//   x2  length in bytes; zero returns immediately
//   x3  pointer to 32 bytes of key material
//   x4  pointer to the 16-byte counter block (x4 is then reused as the
//       round counter)
// NOTE(review): presumably this matches the C prototype declared in the
// project headers - confirm there.
//
// Dispatch: inputs of 192 bytes or more branch to ChaCha20_neon when the
// ARMV7_NEON bit is set in _OPENSSL_armcap_P. Everything else runs the scalar
// code at Lshort, which produces one 64-byte block per Loop_outer pass.
//
// Scalar register allocation:
//   x22,x23          sigma, two 32-bit words per register
//   x24-x27          key, two words per register
//   x28,x30          counter block, two words per register
//   w5-w17,w19-w21   the sixteen 32-bit working state words (x18 is unused)
// x19-x28, x29 and x30 are saved in a 96-byte frame and restored on exit.
// A further 64 bytes below the frame hold the keystream of a partial last
// block and are zeroed again before returning.
36.globl	_ChaCha20_ctr32
37.private_extern	_ChaCha20_ctr32
38
39.align	5
40_ChaCha20_ctr32:
41	cbz	x2,Labort		// zero length: return at once
42	adrp	x5,_OPENSSL_armcap_P@PAGE
43	cmp	x2,#192
44	b.lo	Lshort			// below 192 bytes: always scalar
45	add	x5,x5,_OPENSSL_armcap_P@PAGEOFF
46	ldr	w17,[x5]		// w17 = capability word
47	tst	w17,#ARMV7_NEON
48	b.ne	ChaCha20_neon		// NEON bit set: use the vector path
49
50Lshort:
51	stp	x29,x30,[sp,#-96]!	// 96-byte frame: x29,x30 and x19-x28
52	add	x29,sp,#0		// x29 = frame base, used by the epilogues
53
54	adrp	x5,Lsigma@PAGE
55	add	x5,x5,Lsigma@PAGEOFF	// x5 = address of Lsigma
56	stp	x19,x20,[sp,#16]
57	stp	x21,x22,[sp,#32]
58	stp	x23,x24,[sp,#48]
59	stp	x25,x26,[sp,#64]
60	stp	x27,x28,[sp,#80]
61	sub	sp,sp,#64		// 64-byte scratch for tail keystream
62
63	ldp	x22,x23,[x5]		// load sigma
64	ldp	x24,x25,[x3]		// load key
65	ldp	x26,x27,[x3,#16]
66	ldp	x28,x30,[x4]		// load counter
67#ifdef	__ARMEB__
// Big-endian build: swap the two 32-bit halves of every packed pair.
68	ror	x24,x24,#32
69	ror	x25,x25,#32
70	ror	x26,x26,#32
71	ror	x27,x27,#32
72	ror	x28,x28,#32
73	ror	x30,x30,#32
74#endif
75
// One pass per 64-byte block. The packed input state is split into the
// sixteen 32-bit working registers (low half via mov w, high half via lsr).
76Loop_outer:
77	mov	w5,w22			// unpack key block
78	lsr	x6,x22,#32
79	mov	w7,w23
80	lsr	x8,x23,#32
81	mov	w9,w24
82	lsr	x10,x24,#32
83	mov	w11,w25
84	lsr	x12,x25,#32
85	mov	w13,w26
86	lsr	x14,x26,#32
87	mov	w15,w27
88	lsr	x16,x27,#32
89	mov	w17,w28
90	lsr	x19,x28,#32
91	mov	w20,w30
92	lsr	x21,x30,#32
93
94	mov	x4,#10			// 10 passes of Loop, two rounds each
95	subs	x2,x2,#64		// flags are consumed by b.lo/b.hi below
// Each pass applies the quarter-round to the four columns
// (w5,w9,w13,w17 and so on) and then to the four diagonals
// (w5,w10,w15,w21 and so on). A right-rotate by 16/20/24/25 is a left-rotate
// by 16/12/8/7. No instruction in this loop writes the condition flags, so
// the result of the subs above stays live.
96Loop:
97	sub	x4,x4,#1
98	add	w5,w5,w9
99	add	w6,w6,w10
100	add	w7,w7,w11
101	add	w8,w8,w12
102	eor	w17,w17,w5
103	eor	w19,w19,w6
104	eor	w20,w20,w7
105	eor	w21,w21,w8
106	ror	w17,w17,#16
107	ror	w19,w19,#16
108	ror	w20,w20,#16
109	ror	w21,w21,#16
110	add	w13,w13,w17
111	add	w14,w14,w19
112	add	w15,w15,w20
113	add	w16,w16,w21
114	eor	w9,w9,w13
115	eor	w10,w10,w14
116	eor	w11,w11,w15
117	eor	w12,w12,w16
118	ror	w9,w9,#20
119	ror	w10,w10,#20
120	ror	w11,w11,#20
121	ror	w12,w12,#20
122	add	w5,w5,w9
123	add	w6,w6,w10
124	add	w7,w7,w11
125	add	w8,w8,w12
126	eor	w17,w17,w5
127	eor	w19,w19,w6
128	eor	w20,w20,w7
129	eor	w21,w21,w8
130	ror	w17,w17,#24
131	ror	w19,w19,#24
132	ror	w20,w20,#24
133	ror	w21,w21,#24
134	add	w13,w13,w17
135	add	w14,w14,w19
136	add	w15,w15,w20
137	add	w16,w16,w21
138	eor	w9,w9,w13
139	eor	w10,w10,w14
140	eor	w11,w11,w15
141	eor	w12,w12,w16
142	ror	w9,w9,#25
143	ror	w10,w10,#25
144	ror	w11,w11,#25
145	ror	w12,w12,#25
// Diagonal round: same quarter-round, operands shifted by one position per row.
146	add	w5,w5,w10
147	add	w6,w6,w11
148	add	w7,w7,w12
149	add	w8,w8,w9
150	eor	w21,w21,w5
151	eor	w17,w17,w6
152	eor	w19,w19,w7
153	eor	w20,w20,w8
154	ror	w21,w21,#16
155	ror	w17,w17,#16
156	ror	w19,w19,#16
157	ror	w20,w20,#16
158	add	w15,w15,w21
159	add	w16,w16,w17
160	add	w13,w13,w19
161	add	w14,w14,w20
162	eor	w10,w10,w15
163	eor	w11,w11,w16
164	eor	w12,w12,w13
165	eor	w9,w9,w14
166	ror	w10,w10,#20
167	ror	w11,w11,#20
168	ror	w12,w12,#20
169	ror	w9,w9,#20
170	add	w5,w5,w10
171	add	w6,w6,w11
172	add	w7,w7,w12
173	add	w8,w8,w9
174	eor	w21,w21,w5
175	eor	w17,w17,w6
176	eor	w19,w19,w7
177	eor	w20,w20,w8
178	ror	w21,w21,#24
179	ror	w17,w17,#24
180	ror	w19,w19,#24
181	ror	w20,w20,#24
182	add	w15,w15,w21
183	add	w16,w16,w17
184	add	w13,w13,w19
185	add	w14,w14,w20
186	eor	w10,w10,w15
187	eor	w11,w11,w16
188	eor	w12,w12,w13
189	eor	w9,w9,w14
190	ror	w10,w10,#25
191	ror	w11,w11,#25
192	ror	w12,w12,#25
193	ror	w9,w9,#25
194	cbnz	x4,Loop
195
// Add the input state back in. The odd-numbered words use a 64-bit add with
// the high half of the packed register; any bit above bit 31 of the sum is
// shifted out by the lsl #32 when the pairs are packed below.
196	add	w5,w5,w22		// accumulate key block
197	add	x6,x6,x22,lsr#32
198	add	w7,w7,w23
199	add	x8,x8,x23,lsr#32
200	add	w9,w9,w24
201	add	x10,x10,x24,lsr#32
202	add	w11,w11,w25
203	add	x12,x12,x25,lsr#32
204	add	w13,w13,w26
205	add	x14,x14,x26,lsr#32
206	add	w15,w15,w27
207	add	x16,x16,x27,lsr#32
208	add	w17,w17,w28
209	add	x19,x19,x28,lsr#32
210	add	w20,w20,w30
211	add	x21,x21,x30,lsr#32
212
213	b.lo	Ltail			// fewer than 64 bytes were left (flags from subs)
214
// Full block: pack word pairs into 64-bit registers, XOR with 64 input bytes
// and store 64 output bytes.
215	add	x5,x5,x6,lsl#32	// pack
216	add	x7,x7,x8,lsl#32
217	ldp	x6,x8,[x1,#0]		// load input
218	add	x9,x9,x10,lsl#32
219	add	x11,x11,x12,lsl#32
220	ldp	x10,x12,[x1,#16]
221	add	x13,x13,x14,lsl#32
222	add	x15,x15,x16,lsl#32
223	ldp	x14,x16,[x1,#32]
224	add	x17,x17,x19,lsl#32
225	add	x20,x20,x21,lsl#32
226	ldp	x19,x21,[x1,#48]
227	add	x1,x1,#64
228#ifdef	__ARMEB__
229	rev	x5,x5
230	rev	x7,x7
231	rev	x9,x9
232	rev	x11,x11
233	rev	x13,x13
234	rev	x15,x15
235	rev	x17,x17
236	rev	x20,x20
237#endif
238	eor	x5,x5,x6
239	eor	x7,x7,x8
240	eor	x9,x9,x10
241	eor	x11,x11,x12
242	eor	x13,x13,x14
243	eor	x15,x15,x16
244	eor	x17,x17,x19
245	eor	x20,x20,x21
246
// NOTE(review): the counter increment below is a 64-bit add on the packed
// pair, so a wrap of the low 32-bit word would carry into the upper word.
// Presumably callers never let the counter word wrap - confirm.
247	stp	x5,x7,[x0,#0]		// store output
248	add	x28,x28,#1			// increment counter
249	stp	x9,x11,[x0,#16]
250	stp	x13,x15,[x0,#32]
251	stp	x17,x20,[x0,#48]
252	add	x0,x0,#64
253
254	b.hi	Loop_outer		// input remains (flags still from subs)
255
// Exact multiple of 64 bytes: free the scratch area and restore the saved
// registers through the frame pointer.
256	ldp	x19,x20,[x29,#16]
257	add	sp,sp,#64
258	ldp	x21,x22,[x29,#32]
259	ldp	x23,x24,[x29,#48]
260	ldp	x25,x26,[x29,#64]
261	ldp	x27,x28,[x29,#80]
262	ldp	x29,x30,[sp],#96
263Labort:
264	ret
265
266.align	4
// Partial last block. x2 went negative in the subs; adding 64 back gives the
// number of bytes still to process.
267Ltail:
268	add	x2,x2,#64
// Also the target of a branch in Ltail_neon, with x2 = remaining bytes (< 64).
// Pointers are advanced to the end of the data and x2 is negated, so the byte
// loop indexes with a negative offset that counts up to zero. x0 is biased by
// -1 because the store uses the index after it has been incremented.
269Less_than_64:
270	sub	x0,x0,#1
271	add	x1,x1,x2
272	add	x0,x0,x2
273	add	x4,sp,x2		// x4 = end of the keystream bytes in the scratch area
274	neg	x2,x2
275
276	add	x5,x5,x6,lsl#32	// pack
277	add	x7,x7,x8,lsl#32
278	add	x9,x9,x10,lsl#32
279	add	x11,x11,x12,lsl#32
280	add	x13,x13,x14,lsl#32
281	add	x15,x15,x16,lsl#32
282	add	x17,x17,x19,lsl#32
283	add	x20,x20,x21,lsl#32
284#ifdef	__ARMEB__
285	rev	x5,x5
286	rev	x7,x7
287	rev	x9,x9
288	rev	x11,x11
289	rev	x13,x13
290	rev	x15,x15
291	rev	x17,x17
292	rev	x20,x20
293#endif
294	stp	x5,x7,[sp,#0]		// keystream block to the stack scratch area
295	stp	x9,x11,[sp,#16]
296	stp	x13,x15,[sp,#32]
297	stp	x17,x20,[sp,#48]
298
// Byte-wise XOR of the remaining input with the keystream on the stack.
299Loop_tail:
300	ldrb	w10,[x1,x2]
301	ldrb	w11,[x4,x2]
302	add	x2,x2,#1
303	eor	w10,w10,w11
304	strb	w10,[x0,x2]
305	cbnz	x2,Loop_tail
306
// Zero the 64 bytes of keystream left in the scratch area.
307	stp	xzr,xzr,[sp,#0]
308	stp	xzr,xzr,[sp,#16]
309	stp	xzr,xzr,[sp,#32]
310	stp	xzr,xzr,[sp,#48]
311
312	ldp	x19,x20,[x29,#16]
313	add	sp,sp,#64
314	ldp	x21,x22,[x29,#32]
315	ldp	x23,x24,[x29,#48]
316	ldp	x25,x26,[x29,#64]
317	ldp	x27,x28,[x29,#80]
318	ldp	x29,x30,[sp],#96
319	ret
320
321
322
// ChaCha20_neon: combined scalar/NEON path. It is reached by a branch from
// _ChaCha20_ctr32 (length of at least 192 bytes and ARMV7_NEON set), takes
// the same x0-x4 arguments and has no .globl directive of its own.
//
// Lengths of 512 bytes or more are handed to L512_or_more_neon once the
// frame has been set up. Otherwise each Loop_outer_neon pass produces 256
// bytes: one block in the general-purpose registers (same allocation as the
// scalar code in _ChaCha20_ctr32) followed by three blocks in NEON registers.
//   v0-v3, v4-v7, v16-v19   the four state rows of the three NEON blocks
//   v20-v23                 scratch for the rotations and for input data
//   v24                     sigma
//   v25,v26                 key
//   v27,v28,v29             counter rows (scalar counter +1, +2, +3)
//   v31                     counter step: {1,0,0,0}, then {4,0,0,0}
// Below 512 bytes no register in the range v8-v15 is written.
323.align	5
324ChaCha20_neon:
325	stp	x29,x30,[sp,#-96]!	// same 96-byte frame as the scalar path
326	add	x29,sp,#0
327
328	adrp	x5,Lsigma@PAGE
329	add	x5,x5,Lsigma@PAGEOFF	// x5 = address of Lsigma
330	stp	x19,x20,[sp,#16]
331	stp	x21,x22,[sp,#32]
332	stp	x23,x24,[sp,#48]
333	stp	x25,x26,[sp,#64]
334	stp	x27,x28,[sp,#80]
335	cmp	x2,#512
336	b.hs	L512_or_more_neon	// 512 bytes or more: wider code path
337
338	sub	sp,sp,#64		// 64-byte scratch for tail keystream
339
// Every input is loaded twice: packed into general-purpose registers for the
// scalar block and as a vector for the NEON blocks. The post-increment on the
// sigma load leaves x5 pointing at Lone for the v31 load.
340	ldp	x22,x23,[x5]		// load sigma
341	ld1	{v24.4s},[x5],#16
342	ldp	x24,x25,[x3]		// load key
343	ldp	x26,x27,[x3,#16]
344	ld1	{v25.4s,v26.4s},[x3]
345	ldp	x28,x30,[x4]		// load counter
346	ld1	{v27.4s},[x4]
347	ld1	{v31.4s},[x5]
348#ifdef	__ARMEB__
349	rev64	v24.4s,v24.4s
350	ror	x24,x24,#32
351	ror	x25,x25,#32
352	ror	x26,x26,#32
353	ror	x27,x27,#32
354	ror	x28,x28,#32
355	ror	x30,x30,#32
356#endif
// The scalar block keeps the caller's counter value; the NEON blocks use the
// next three values.
357	add	v27.4s,v27.4s,v31.4s		// += 1
358	add	v28.4s,v27.4s,v31.4s
359	add	v29.4s,v28.4s,v31.4s
360	shl	v31.4s,v31.4s,#2			// 1 -> 4
361
// One pass per 256 bytes: initialise the scalar working registers and the
// three NEON working states from the saved input state.
362Loop_outer_neon:
363	mov	w5,w22			// unpack key block
364	lsr	x6,x22,#32
365	mov	v0.16b,v24.16b
366	mov	w7,w23
367	lsr	x8,x23,#32
368	mov	v4.16b,v24.16b
369	mov	w9,w24
370	lsr	x10,x24,#32
371	mov	v16.16b,v24.16b
372	mov	w11,w25
373	mov	v1.16b,v25.16b
374	lsr	x12,x25,#32
375	mov	v5.16b,v25.16b
376	mov	w13,w26
377	mov	v17.16b,v25.16b
378	lsr	x14,x26,#32
379	mov	v3.16b,v27.16b
380	mov	w15,w27
381	mov	v7.16b,v28.16b
382	lsr	x16,x27,#32
383	mov	v19.16b,v29.16b
384	mov	w17,w28
385	mov	v2.16b,v26.16b
386	lsr	x19,x28,#32
387	mov	v6.16b,v26.16b
388	mov	w20,w30
389	mov	v18.16b,v26.16b
390	lsr	x21,x30,#32
391
392	mov	x4,#10			// 10 passes of Loop_neon, two rounds each
393	subs	x2,x2,#256		// flags are consumed by b.lo/b.hi below
// The scalar and NEON instruction streams are independent and interleaved.
// NEON rotations: rev32 on .8h lanes rotates every 32-bit word by 16; each
// ushr/sli pair builds a rotate (ushr #20 with sli #12 is a left-rotate by
// 12, #24 with #8 by 8, #25 with #7 by 7). The ext instructions rotate three
// of the rows by whole words to move between column and diagonal form.
// No instruction in the loop writes the condition flags.
394Loop_neon:
395	sub	x4,x4,#1
396	add	v0.4s,v0.4s,v1.4s
397	add	w5,w5,w9
398	add	v4.4s,v4.4s,v5.4s
399	add	w6,w6,w10
400	add	v16.4s,v16.4s,v17.4s
401	add	w7,w7,w11
402	eor	v3.16b,v3.16b,v0.16b
403	add	w8,w8,w12
404	eor	v7.16b,v7.16b,v4.16b
405	eor	w17,w17,w5
406	eor	v19.16b,v19.16b,v16.16b
407	eor	w19,w19,w6
408	rev32	v3.8h,v3.8h
409	eor	w20,w20,w7
410	rev32	v7.8h,v7.8h
411	eor	w21,w21,w8
412	rev32	v19.8h,v19.8h
413	ror	w17,w17,#16
414	add	v2.4s,v2.4s,v3.4s
415	ror	w19,w19,#16
416	add	v6.4s,v6.4s,v7.4s
417	ror	w20,w20,#16
418	add	v18.4s,v18.4s,v19.4s
419	ror	w21,w21,#16
420	eor	v20.16b,v1.16b,v2.16b
421	add	w13,w13,w17
422	eor	v21.16b,v5.16b,v6.16b
423	add	w14,w14,w19
424	eor	v22.16b,v17.16b,v18.16b
425	add	w15,w15,w20
426	ushr	v1.4s,v20.4s,#20
427	add	w16,w16,w21
428	ushr	v5.4s,v21.4s,#20
429	eor	w9,w9,w13
430	ushr	v17.4s,v22.4s,#20
431	eor	w10,w10,w14
432	sli	v1.4s,v20.4s,#12
433	eor	w11,w11,w15
434	sli	v5.4s,v21.4s,#12
435	eor	w12,w12,w16
436	sli	v17.4s,v22.4s,#12
437	ror	w9,w9,#20
438	add	v0.4s,v0.4s,v1.4s
439	ror	w10,w10,#20
440	add	v4.4s,v4.4s,v5.4s
441	ror	w11,w11,#20
442	add	v16.4s,v16.4s,v17.4s
443	ror	w12,w12,#20
444	eor	v20.16b,v3.16b,v0.16b
445	add	w5,w5,w9
446	eor	v21.16b,v7.16b,v4.16b
447	add	w6,w6,w10
448	eor	v22.16b,v19.16b,v16.16b
449	add	w7,w7,w11
450	ushr	v3.4s,v20.4s,#24
451	add	w8,w8,w12
452	ushr	v7.4s,v21.4s,#24
453	eor	w17,w17,w5
454	ushr	v19.4s,v22.4s,#24
455	eor	w19,w19,w6
456	sli	v3.4s,v20.4s,#8
457	eor	w20,w20,w7
458	sli	v7.4s,v21.4s,#8
459	eor	w21,w21,w8
460	sli	v19.4s,v22.4s,#8
461	ror	w17,w17,#24
462	add	v2.4s,v2.4s,v3.4s
463	ror	w19,w19,#24
464	add	v6.4s,v6.4s,v7.4s
465	ror	w20,w20,#24
466	add	v18.4s,v18.4s,v19.4s
467	ror	w21,w21,#24
468	eor	v20.16b,v1.16b,v2.16b
469	add	w13,w13,w17
470	eor	v21.16b,v5.16b,v6.16b
471	add	w14,w14,w19
472	eor	v22.16b,v17.16b,v18.16b
473	add	w15,w15,w20
474	ushr	v1.4s,v20.4s,#25
475	add	w16,w16,w21
476	ushr	v5.4s,v21.4s,#25
477	eor	w9,w9,w13
478	ushr	v17.4s,v22.4s,#25
479	eor	w10,w10,w14
480	sli	v1.4s,v20.4s,#7
481	eor	w11,w11,w15
482	sli	v5.4s,v21.4s,#7
483	eor	w12,w12,w16
484	sli	v17.4s,v22.4s,#7
485	ror	w9,w9,#25
// Rotate the NEON rows into diagonal form (by 8, 12 and 4 bytes).
486	ext	v2.16b,v2.16b,v2.16b,#8
487	ror	w10,w10,#25
488	ext	v6.16b,v6.16b,v6.16b,#8
489	ror	w11,w11,#25
490	ext	v18.16b,v18.16b,v18.16b,#8
491	ror	w12,w12,#25
492	ext	v3.16b,v3.16b,v3.16b,#12
493	ext	v7.16b,v7.16b,v7.16b,#12
494	ext	v19.16b,v19.16b,v19.16b,#12
495	ext	v1.16b,v1.16b,v1.16b,#4
496	ext	v5.16b,v5.16b,v5.16b,#4
497	ext	v17.16b,v17.16b,v17.16b,#4
// Second (diagonal) round of the pass for both instruction streams.
498	add	v0.4s,v0.4s,v1.4s
499	add	w5,w5,w10
500	add	v4.4s,v4.4s,v5.4s
501	add	w6,w6,w11
502	add	v16.4s,v16.4s,v17.4s
503	add	w7,w7,w12
504	eor	v3.16b,v3.16b,v0.16b
505	add	w8,w8,w9
506	eor	v7.16b,v7.16b,v4.16b
507	eor	w21,w21,w5
508	eor	v19.16b,v19.16b,v16.16b
509	eor	w17,w17,w6
510	rev32	v3.8h,v3.8h
511	eor	w19,w19,w7
512	rev32	v7.8h,v7.8h
513	eor	w20,w20,w8
514	rev32	v19.8h,v19.8h
515	ror	w21,w21,#16
516	add	v2.4s,v2.4s,v3.4s
517	ror	w17,w17,#16
518	add	v6.4s,v6.4s,v7.4s
519	ror	w19,w19,#16
520	add	v18.4s,v18.4s,v19.4s
521	ror	w20,w20,#16
522	eor	v20.16b,v1.16b,v2.16b
523	add	w15,w15,w21
524	eor	v21.16b,v5.16b,v6.16b
525	add	w16,w16,w17
526	eor	v22.16b,v17.16b,v18.16b
527	add	w13,w13,w19
528	ushr	v1.4s,v20.4s,#20
529	add	w14,w14,w20
530	ushr	v5.4s,v21.4s,#20
531	eor	w10,w10,w15
532	ushr	v17.4s,v22.4s,#20
533	eor	w11,w11,w16
534	sli	v1.4s,v20.4s,#12
535	eor	w12,w12,w13
536	sli	v5.4s,v21.4s,#12
537	eor	w9,w9,w14
538	sli	v17.4s,v22.4s,#12
539	ror	w10,w10,#20
540	add	v0.4s,v0.4s,v1.4s
541	ror	w11,w11,#20
542	add	v4.4s,v4.4s,v5.4s
543	ror	w12,w12,#20
544	add	v16.4s,v16.4s,v17.4s
545	ror	w9,w9,#20
546	eor	v20.16b,v3.16b,v0.16b
547	add	w5,w5,w10
548	eor	v21.16b,v7.16b,v4.16b
549	add	w6,w6,w11
550	eor	v22.16b,v19.16b,v16.16b
551	add	w7,w7,w12
552	ushr	v3.4s,v20.4s,#24
553	add	w8,w8,w9
554	ushr	v7.4s,v21.4s,#24
555	eor	w21,w21,w5
556	ushr	v19.4s,v22.4s,#24
557	eor	w17,w17,w6
558	sli	v3.4s,v20.4s,#8
559	eor	w19,w19,w7
560	sli	v7.4s,v21.4s,#8
561	eor	w20,w20,w8
562	sli	v19.4s,v22.4s,#8
563	ror	w21,w21,#24
564	add	v2.4s,v2.4s,v3.4s
565	ror	w17,w17,#24
566	add	v6.4s,v6.4s,v7.4s
567	ror	w19,w19,#24
568	add	v18.4s,v18.4s,v19.4s
569	ror	w20,w20,#24
570	eor	v20.16b,v1.16b,v2.16b
571	add	w15,w15,w21
572	eor	v21.16b,v5.16b,v6.16b
573	add	w16,w16,w17
574	eor	v22.16b,v17.16b,v18.16b
575	add	w13,w13,w19
576	ushr	v1.4s,v20.4s,#25
577	add	w14,w14,w20
578	ushr	v5.4s,v21.4s,#25
579	eor	w10,w10,w15
580	ushr	v17.4s,v22.4s,#25
581	eor	w11,w11,w16
582	sli	v1.4s,v20.4s,#7
583	eor	w12,w12,w13
584	sli	v5.4s,v21.4s,#7
585	eor	w9,w9,w14
586	sli	v17.4s,v22.4s,#7
587	ror	w10,w10,#25
// Rotate the NEON rows back into column form (by 8, 4 and 12 bytes).
588	ext	v2.16b,v2.16b,v2.16b,#8
589	ror	w11,w11,#25
590	ext	v6.16b,v6.16b,v6.16b,#8
591	ror	w12,w12,#25
592	ext	v18.16b,v18.16b,v18.16b,#8
593	ror	w9,w9,#25
594	ext	v3.16b,v3.16b,v3.16b,#4
595	ext	v7.16b,v7.16b,v7.16b,#4
596	ext	v19.16b,v19.16b,v19.16b,#4
597	ext	v1.16b,v1.16b,v1.16b,#12
598	ext	v5.16b,v5.16b,v5.16b,#12
599	ext	v17.16b,v17.16b,v17.16b,#12
600	cbnz	x4,Loop_neon
601
// Add the input state back into all four blocks. Each NEON block adds its
// own counter row (v27, v28 or v29).
602	add	w5,w5,w22		// accumulate key block
603	add	v0.4s,v0.4s,v24.4s
604	add	x6,x6,x22,lsr#32
605	add	v4.4s,v4.4s,v24.4s
606	add	w7,w7,w23
607	add	v16.4s,v16.4s,v24.4s
608	add	x8,x8,x23,lsr#32
609	add	v2.4s,v2.4s,v26.4s
610	add	w9,w9,w24
611	add	v6.4s,v6.4s,v26.4s
612	add	x10,x10,x24,lsr#32
613	add	v18.4s,v18.4s,v26.4s
614	add	w11,w11,w25
615	add	v3.4s,v3.4s,v27.4s
616	add	x12,x12,x25,lsr#32
617	add	w13,w13,w26
618	add	v7.4s,v7.4s,v28.4s
619	add	x14,x14,x26,lsr#32
620	add	w15,w15,w27
621	add	v19.4s,v19.4s,v29.4s
622	add	x16,x16,x27,lsr#32
623	add	w17,w17,w28
624	add	v1.4s,v1.4s,v25.4s
625	add	x19,x19,x28,lsr#32
626	add	w20,w20,w30
627	add	v5.4s,v5.4s,v25.4s
628	add	x21,x21,x30,lsr#32
629	add	v17.4s,v17.4s,v25.4s
630
631	b.lo	Ltail_neon		// fewer than 256 bytes were left (flags from subs)
632
// Full 256 bytes: the scalar block is written first, then the three NEON
// blocks in counter order.
633	add	x5,x5,x6,lsl#32	// pack
634	add	x7,x7,x8,lsl#32
635	ldp	x6,x8,[x1,#0]		// load input
636	add	x9,x9,x10,lsl#32
637	add	x11,x11,x12,lsl#32
638	ldp	x10,x12,[x1,#16]
639	add	x13,x13,x14,lsl#32
640	add	x15,x15,x16,lsl#32
641	ldp	x14,x16,[x1,#32]
642	add	x17,x17,x19,lsl#32
643	add	x20,x20,x21,lsl#32
644	ldp	x19,x21,[x1,#48]
645	add	x1,x1,#64
646#ifdef	__ARMEB__
647	rev	x5,x5
648	rev	x7,x7
649	rev	x9,x9
650	rev	x11,x11
651	rev	x13,x13
652	rev	x15,x15
653	rev	x17,x17
654	rev	x20,x20
655#endif
656	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
657	eor	x5,x5,x6
658	eor	x7,x7,x8
659	eor	x9,x9,x10
660	eor	x11,x11,x12
661	eor	x13,x13,x14
662	eor	v0.16b,v0.16b,v20.16b
663	eor	x15,x15,x16
664	eor	v1.16b,v1.16b,v21.16b
665	eor	x17,x17,x19
666	eor	v2.16b,v2.16b,v22.16b
667	eor	x20,x20,x21
668	eor	v3.16b,v3.16b,v23.16b
669	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
670
// All four counters advance by 4 for the next pass.
671	stp	x5,x7,[x0,#0]		// store output
672	add	x28,x28,#4			// increment counter
673	stp	x9,x11,[x0,#16]
674	add	v27.4s,v27.4s,v31.4s		// += 4
675	stp	x13,x15,[x0,#32]
676	add	v28.4s,v28.4s,v31.4s
677	stp	x17,x20,[x0,#48]
678	add	v29.4s,v29.4s,v31.4s
679	add	x0,x0,#64
680
// v0-v3 are free once stored, so they are reused to hold the last 64 input
// bytes of this pass.
681	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
682	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
683
684	eor	v4.16b,v4.16b,v20.16b
685	eor	v5.16b,v5.16b,v21.16b
686	eor	v6.16b,v6.16b,v22.16b
687	eor	v7.16b,v7.16b,v23.16b
688	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
689
690	eor	v16.16b,v16.16b,v0.16b
691	eor	v17.16b,v17.16b,v1.16b
692	eor	v18.16b,v18.16b,v2.16b
693	eor	v19.16b,v19.16b,v3.16b
694	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
695
696	b.hi	Loop_outer_neon		// input remains (flags still from subs)
697
// Exact multiple of 256 bytes: nothing was written to the scratch area.
698	ldp	x19,x20,[x29,#16]
699	add	sp,sp,#64
700	ldp	x21,x22,[x29,#32]
701	ldp	x23,x24,[x29,#48]
702	ldp	x25,x26,[x29,#64]
703	ldp	x27,x28,[x29,#80]
704	ldp	x29,x30,[sp],#96
705	ret
706
// Fewer than 256 bytes remain. x2 is restored to the remaining byte count and
// the four computed keystream blocks are consumed in order, as far as needed.
707Ltail_neon:
708	add	x2,x2,#256
709	cmp	x2,#64
710	b.lo	Less_than_64		// under one block: shared scalar tail code
711
712	add	x5,x5,x6,lsl#32	// pack
713	add	x7,x7,x8,lsl#32
714	ldp	x6,x8,[x1,#0]		// load input
715	add	x9,x9,x10,lsl#32
716	add	x11,x11,x12,lsl#32
717	ldp	x10,x12,[x1,#16]
718	add	x13,x13,x14,lsl#32
719	add	x15,x15,x16,lsl#32
720	ldp	x14,x16,[x1,#32]
721	add	x17,x17,x19,lsl#32
722	add	x20,x20,x21,lsl#32
723	ldp	x19,x21,[x1,#48]
724	add	x1,x1,#64
725#ifdef	__ARMEB__
726	rev	x5,x5
727	rev	x7,x7
728	rev	x9,x9
729	rev	x11,x11
730	rev	x13,x13
731	rev	x15,x15
732	rev	x17,x17
733	rev	x20,x20
734#endif
735	eor	x5,x5,x6
736	eor	x7,x7,x8
737	eor	x9,x9,x10
738	eor	x11,x11,x12
739	eor	x13,x13,x14
740	eor	x15,x15,x16
741	eor	x17,x17,x19
742	eor	x20,x20,x21
743
744	stp	x5,x7,[x0,#0]		// store output
745	add	x28,x28,#4			// increment counter
746	stp	x9,x11,[x0,#16]
747	stp	x13,x15,[x0,#32]
748	stp	x17,x20,[x0,#48]
749	add	x0,x0,#64
750	b.eq	Ldone_neon		// exactly 64 bytes remained (flags from cmp above)
751	sub	x2,x2,#64
752	cmp	x2,#64
753	b.lo	Less_than_128
754
755	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
756	eor	v0.16b,v0.16b,v20.16b
757	eor	v1.16b,v1.16b,v21.16b
758	eor	v2.16b,v2.16b,v22.16b
759	eor	v3.16b,v3.16b,v23.16b
760	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
761	b.eq	Ldone_neon		// flags from the preceding cmp
762	sub	x2,x2,#64
763	cmp	x2,#64
764	b.lo	Less_than_192
765
766	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
767	eor	v4.16b,v4.16b,v20.16b
768	eor	v5.16b,v5.16b,v21.16b
769	eor	v6.16b,v6.16b,v22.16b
770	eor	v7.16b,v7.16b,v23.16b
771	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
772	b.eq	Ldone_neon		// flags from the preceding cmp
773	sub	x2,x2,#64
774
// The next unused keystream block is spilled to the stack scratch area so
// that the partial block can be XORed byte by byte.
775	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
776	b	Last_neon
777
778Less_than_128:
779	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
780	b	Last_neon
781Less_than_192:
782	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
783	b	Last_neon
784
785.align	4
// Same pointer biasing as Less_than_64: pointers move to the end of the data
// and x2 becomes a negative index that counts up to zero.
786Last_neon:
787	sub	x0,x0,#1
788	add	x1,x1,x2
789	add	x0,x0,x2
790	add	x4,sp,x2
791	neg	x2,x2
792
793Loop_tail_neon:
794	ldrb	w10,[x1,x2]
795	ldrb	w11,[x4,x2]
796	add	x2,x2,#1
797	eor	w10,w10,w11
798	strb	w10,[x0,x2]
799	cbnz	x2,Loop_tail_neon
800
// Zero the 64 bytes of keystream left in the scratch area.
801	stp	xzr,xzr,[sp,#0]
802	stp	xzr,xzr,[sp,#16]
803	stp	xzr,xzr,[sp,#32]
804	stp	xzr,xzr,[sp,#48]
805
// Common exit: free the scratch area and restore the saved registers.
806Ldone_neon:
807	ldp	x19,x20,[x29,#16]
808	add	sp,sp,#64
809	ldp	x21,x22,[x29,#32]
810	ldp	x23,x24,[x29,#48]
811	ldp	x25,x26,[x29,#64]
812	ldp	x27,x28,[x29,#80]
813	ldp	x29,x30,[sp],#96
814	ret
815
816
817.align	5
818ChaCha20_512_neon:
819	stp	x29,x30,[sp,#-96]!
820	add	x29,sp,#0
821
822	adrp	x5,Lsigma@PAGE
823	add	x5,x5,Lsigma@PAGEOFF
824	stp	x19,x20,[sp,#16]
825	stp	x21,x22,[sp,#32]
826	stp	x23,x24,[sp,#48]
827	stp	x25,x26,[sp,#64]
828	stp	x27,x28,[sp,#80]
829
830L512_or_more_neon:
831	sub	sp,sp,#128+64
832
833	ldp	x22,x23,[x5]		// load sigma
834	ld1	{v24.4s},[x5],#16
835	ldp	x24,x25,[x3]		// load key
836	ldp	x26,x27,[x3,#16]
837	ld1	{v25.4s,v26.4s},[x3]
838	ldp	x28,x30,[x4]		// load counter
839	ld1	{v27.4s},[x4]
840	ld1	{v31.4s},[x5]
841#ifdef	__ARMEB__
842	rev64	v24.4s,v24.4s
843	ror	x24,x24,#32
844	ror	x25,x25,#32
845	ror	x26,x26,#32
846	ror	x27,x27,#32
847	ror	x28,x28,#32
848	ror	x30,x30,#32
849#endif
850	add	v27.4s,v27.4s,v31.4s		// += 1
851	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
852	add	v27.4s,v27.4s,v31.4s		// not typo
853	str	q26,[sp,#32]
854	add	v28.4s,v27.4s,v31.4s
855	add	v29.4s,v28.4s,v31.4s
856	add	v30.4s,v29.4s,v31.4s
857	shl	v31.4s,v31.4s,#2			// 1 -> 4
858
859	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
860	stp	d10,d11,[sp,#128+16]
861	stp	d12,d13,[sp,#128+32]
862	stp	d14,d15,[sp,#128+48]
863
864	sub	x2,x2,#512			// not typo
865
866Loop_outer_512_neon:
867	mov	v0.16b,v24.16b
868	mov	v4.16b,v24.16b
869	mov	v8.16b,v24.16b
870	mov	v12.16b,v24.16b
871	mov	v16.16b,v24.16b
872	mov	v20.16b,v24.16b
873	mov	v1.16b,v25.16b
874	mov	w5,w22			// unpack key block
875	mov	v5.16b,v25.16b
876	lsr	x6,x22,#32
877	mov	v9.16b,v25.16b
878	mov	w7,w23
879	mov	v13.16b,v25.16b
880	lsr	x8,x23,#32
881	mov	v17.16b,v25.16b
882	mov	w9,w24
883	mov	v21.16b,v25.16b
884	lsr	x10,x24,#32
885	mov	v3.16b,v27.16b
886	mov	w11,w25
887	mov	v7.16b,v28.16b
888	lsr	x12,x25,#32
889	mov	v11.16b,v29.16b
890	mov	w13,w26
891	mov	v15.16b,v30.16b
892	lsr	x14,x26,#32
893	mov	v2.16b,v26.16b
894	mov	w15,w27
895	mov	v6.16b,v26.16b
896	lsr	x16,x27,#32
897	add	v19.4s,v3.4s,v31.4s			// +4
898	mov	w17,w28
899	add	v23.4s,v7.4s,v31.4s			// +4
900	lsr	x19,x28,#32
901	mov	v10.16b,v26.16b
902	mov	w20,w30
903	mov	v14.16b,v26.16b
904	lsr	x21,x30,#32
905	mov	v18.16b,v26.16b
906	stp	q27,q28,[sp,#48]		// off-load key block, variable part
907	mov	v22.16b,v26.16b
908	str	q29,[sp,#80]
909
910	mov	x4,#5
911	subs	x2,x2,#512
912Loop_upper_neon:
913	sub	x4,x4,#1
914	add	v0.4s,v0.4s,v1.4s
915	add	w5,w5,w9
916	add	v4.4s,v4.4s,v5.4s
917	add	w6,w6,w10
918	add	v8.4s,v8.4s,v9.4s
919	add	w7,w7,w11
920	add	v12.4s,v12.4s,v13.4s
921	add	w8,w8,w12
922	add	v16.4s,v16.4s,v17.4s
923	eor	w17,w17,w5
924	add	v20.4s,v20.4s,v21.4s
925	eor	w19,w19,w6
926	eor	v3.16b,v3.16b,v0.16b
927	eor	w20,w20,w7
928	eor	v7.16b,v7.16b,v4.16b
929	eor	w21,w21,w8
930	eor	v11.16b,v11.16b,v8.16b
931	ror	w17,w17,#16
932	eor	v15.16b,v15.16b,v12.16b
933	ror	w19,w19,#16
934	eor	v19.16b,v19.16b,v16.16b
935	ror	w20,w20,#16
936	eor	v23.16b,v23.16b,v20.16b
937	ror	w21,w21,#16
938	rev32	v3.8h,v3.8h
939	add	w13,w13,w17
940	rev32	v7.8h,v7.8h
941	add	w14,w14,w19
942	rev32	v11.8h,v11.8h
943	add	w15,w15,w20
944	rev32	v15.8h,v15.8h
945	add	w16,w16,w21
946	rev32	v19.8h,v19.8h
947	eor	w9,w9,w13
948	rev32	v23.8h,v23.8h
949	eor	w10,w10,w14
950	add	v2.4s,v2.4s,v3.4s
951	eor	w11,w11,w15
952	add	v6.4s,v6.4s,v7.4s
953	eor	w12,w12,w16
954	add	v10.4s,v10.4s,v11.4s
955	ror	w9,w9,#20
956	add	v14.4s,v14.4s,v15.4s
957	ror	w10,w10,#20
958	add	v18.4s,v18.4s,v19.4s
959	ror	w11,w11,#20
960	add	v22.4s,v22.4s,v23.4s
961	ror	w12,w12,#20
962	eor	v24.16b,v1.16b,v2.16b
963	add	w5,w5,w9
964	eor	v25.16b,v5.16b,v6.16b
965	add	w6,w6,w10
966	eor	v26.16b,v9.16b,v10.16b
967	add	w7,w7,w11
968	eor	v27.16b,v13.16b,v14.16b
969	add	w8,w8,w12
970	eor	v28.16b,v17.16b,v18.16b
971	eor	w17,w17,w5
972	eor	v29.16b,v21.16b,v22.16b
973	eor	w19,w19,w6
974	ushr	v1.4s,v24.4s,#20
975	eor	w20,w20,w7
976	ushr	v5.4s,v25.4s,#20
977	eor	w21,w21,w8
978	ushr	v9.4s,v26.4s,#20
979	ror	w17,w17,#24
980	ushr	v13.4s,v27.4s,#20
981	ror	w19,w19,#24
982	ushr	v17.4s,v28.4s,#20
983	ror	w20,w20,#24
984	ushr	v21.4s,v29.4s,#20
985	ror	w21,w21,#24
986	sli	v1.4s,v24.4s,#12
987	add	w13,w13,w17
988	sli	v5.4s,v25.4s,#12
989	add	w14,w14,w19
990	sli	v9.4s,v26.4s,#12
991	add	w15,w15,w20
992	sli	v13.4s,v27.4s,#12
993	add	w16,w16,w21
994	sli	v17.4s,v28.4s,#12
995	eor	w9,w9,w13
996	sli	v21.4s,v29.4s,#12
997	eor	w10,w10,w14
998	add	v0.4s,v0.4s,v1.4s
999	eor	w11,w11,w15
1000	add	v4.4s,v4.4s,v5.4s
1001	eor	w12,w12,w16
1002	add	v8.4s,v8.4s,v9.4s
1003	ror	w9,w9,#25
1004	add	v12.4s,v12.4s,v13.4s
1005	ror	w10,w10,#25
1006	add	v16.4s,v16.4s,v17.4s
1007	ror	w11,w11,#25
1008	add	v20.4s,v20.4s,v21.4s
1009	ror	w12,w12,#25
1010	eor	v24.16b,v3.16b,v0.16b
1011	add	w5,w5,w10
1012	eor	v25.16b,v7.16b,v4.16b
1013	add	w6,w6,w11
1014	eor	v26.16b,v11.16b,v8.16b
1015	add	w7,w7,w12
1016	eor	v27.16b,v15.16b,v12.16b
1017	add	w8,w8,w9
1018	eor	v28.16b,v19.16b,v16.16b
1019	eor	w21,w21,w5
1020	eor	v29.16b,v23.16b,v20.16b
1021	eor	w17,w17,w6
1022	ushr	v3.4s,v24.4s,#24
1023	eor	w19,w19,w7
1024	ushr	v7.4s,v25.4s,#24
1025	eor	w20,w20,w8
1026	ushr	v11.4s,v26.4s,#24
1027	ror	w21,w21,#16
1028	ushr	v15.4s,v27.4s,#24
1029	ror	w17,w17,#16
1030	ushr	v19.4s,v28.4s,#24
1031	ror	w19,w19,#16
1032	ushr	v23.4s,v29.4s,#24
1033	ror	w20,w20,#16
1034	sli	v3.4s,v24.4s,#8
1035	add	w15,w15,w21
1036	sli	v7.4s,v25.4s,#8
1037	add	w16,w16,w17
1038	sli	v11.4s,v26.4s,#8
1039	add	w13,w13,w19
1040	sli	v15.4s,v27.4s,#8
1041	add	w14,w14,w20
1042	sli	v19.4s,v28.4s,#8
1043	eor	w10,w10,w15
1044	sli	v23.4s,v29.4s,#8
1045	eor	w11,w11,w16
1046	add	v2.4s,v2.4s,v3.4s
1047	eor	w12,w12,w13
1048	add	v6.4s,v6.4s,v7.4s
1049	eor	w9,w9,w14
1050	add	v10.4s,v10.4s,v11.4s
1051	ror	w10,w10,#20
1052	add	v14.4s,v14.4s,v15.4s
1053	ror	w11,w11,#20
1054	add	v18.4s,v18.4s,v19.4s
1055	ror	w12,w12,#20
1056	add	v22.4s,v22.4s,v23.4s
1057	ror	w9,w9,#20
1058	eor	v24.16b,v1.16b,v2.16b
1059	add	w5,w5,w10
1060	eor	v25.16b,v5.16b,v6.16b
1061	add	w6,w6,w11
1062	eor	v26.16b,v9.16b,v10.16b
1063	add	w7,w7,w12
1064	eor	v27.16b,v13.16b,v14.16b
1065	add	w8,w8,w9
1066	eor	v28.16b,v17.16b,v18.16b
1067	eor	w21,w21,w5
1068	eor	v29.16b,v21.16b,v22.16b
1069	eor	w17,w17,w6
1070	ushr	v1.4s,v24.4s,#25
1071	eor	w19,w19,w7
1072	ushr	v5.4s,v25.4s,#25
1073	eor	w20,w20,w8
1074	ushr	v9.4s,v26.4s,#25
1075	ror	w21,w21,#24
1076	ushr	v13.4s,v27.4s,#25
1077	ror	w17,w17,#24
1078	ushr	v17.4s,v28.4s,#25
1079	ror	w19,w19,#24
1080	ushr	v21.4s,v29.4s,#25
1081	ror	w20,w20,#24
1082	sli	v1.4s,v24.4s,#7
1083	add	w15,w15,w21
1084	sli	v5.4s,v25.4s,#7
1085	add	w16,w16,w17
1086	sli	v9.4s,v26.4s,#7
1087	add	w13,w13,w19
1088	sli	v13.4s,v27.4s,#7
1089	add	w14,w14,w20
1090	sli	v17.4s,v28.4s,#7
1091	eor	w10,w10,w15
1092	sli	v21.4s,v29.4s,#7
1093	eor	w11,w11,w16
1094	ext	v2.16b,v2.16b,v2.16b,#8
1095	eor	w12,w12,w13
1096	ext	v6.16b,v6.16b,v6.16b,#8
1097	eor	w9,w9,w14
1098	ext	v10.16b,v10.16b,v10.16b,#8
1099	ror	w10,w10,#25
1100	ext	v14.16b,v14.16b,v14.16b,#8
1101	ror	w11,w11,#25
1102	ext	v18.16b,v18.16b,v18.16b,#8
1103	ror	w12,w12,#25
1104	ext	v22.16b,v22.16b,v22.16b,#8
1105	ror	w9,w9,#25
1106	ext	v3.16b,v3.16b,v3.16b,#12
1107	ext	v7.16b,v7.16b,v7.16b,#12
1108	ext	v11.16b,v11.16b,v11.16b,#12
1109	ext	v15.16b,v15.16b,v15.16b,#12
1110	ext	v19.16b,v19.16b,v19.16b,#12
1111	ext	v23.16b,v23.16b,v23.16b,#12
1112	ext	v1.16b,v1.16b,v1.16b,#4
1113	ext	v5.16b,v5.16b,v5.16b,#4
1114	ext	v9.16b,v9.16b,v9.16b,#4
1115	ext	v13.16b,v13.16b,v13.16b,#4
1116	ext	v17.16b,v17.16b,v17.16b,#4
1117	ext	v21.16b,v21.16b,v21.16b,#4
1118	add	v0.4s,v0.4s,v1.4s
1119	add	w5,w5,w9
1120	add	v4.4s,v4.4s,v5.4s
1121	add	w6,w6,w10
1122	add	v8.4s,v8.4s,v9.4s
1123	add	w7,w7,w11
1124	add	v12.4s,v12.4s,v13.4s
1125	add	w8,w8,w12
1126	add	v16.4s,v16.4s,v17.4s
1127	eor	w17,w17,w5
1128	add	v20.4s,v20.4s,v21.4s
1129	eor	w19,w19,w6
1130	eor	v3.16b,v3.16b,v0.16b
1131	eor	w20,w20,w7
1132	eor	v7.16b,v7.16b,v4.16b
1133	eor	w21,w21,w8
1134	eor	v11.16b,v11.16b,v8.16b
1135	ror	w17,w17,#16
1136	eor	v15.16b,v15.16b,v12.16b
1137	ror	w19,w19,#16
1138	eor	v19.16b,v19.16b,v16.16b
1139	ror	w20,w20,#16
1140	eor	v23.16b,v23.16b,v20.16b
1141	ror	w21,w21,#16
1142	rev32	v3.8h,v3.8h
1143	add	w13,w13,w17
1144	rev32	v7.8h,v7.8h
1145	add	w14,w14,w19
1146	rev32	v11.8h,v11.8h
1147	add	w15,w15,w20
1148	rev32	v15.8h,v15.8h
1149	add	w16,w16,w21
1150	rev32	v19.8h,v19.8h
1151	eor	w9,w9,w13
1152	rev32	v23.8h,v23.8h
1153	eor	w10,w10,w14
1154	add	v2.4s,v2.4s,v3.4s
1155	eor	w11,w11,w15
1156	add	v6.4s,v6.4s,v7.4s
1157	eor	w12,w12,w16
1158	add	v10.4s,v10.4s,v11.4s
1159	ror	w9,w9,#20
1160	add	v14.4s,v14.4s,v15.4s
1161	ror	w10,w10,#20
1162	add	v18.4s,v18.4s,v19.4s
1163	ror	w11,w11,#20
1164	add	v22.4s,v22.4s,v23.4s
1165	ror	w12,w12,#20
1166	eor	v24.16b,v1.16b,v2.16b
1167	add	w5,w5,w9
1168	eor	v25.16b,v5.16b,v6.16b
1169	add	w6,w6,w10
1170	eor	v26.16b,v9.16b,v10.16b
1171	add	w7,w7,w11
1172	eor	v27.16b,v13.16b,v14.16b
1173	add	w8,w8,w12
1174	eor	v28.16b,v17.16b,v18.16b
1175	eor	w17,w17,w5
1176	eor	v29.16b,v21.16b,v22.16b
1177	eor	w19,w19,w6
1178	ushr	v1.4s,v24.4s,#20
1179	eor	w20,w20,w7
1180	ushr	v5.4s,v25.4s,#20
1181	eor	w21,w21,w8
1182	ushr	v9.4s,v26.4s,#20
1183	ror	w17,w17,#24
1184	ushr	v13.4s,v27.4s,#20
1185	ror	w19,w19,#24
1186	ushr	v17.4s,v28.4s,#20
1187	ror	w20,w20,#24
1188	ushr	v21.4s,v29.4s,#20
1189	ror	w21,w21,#24
1190	sli	v1.4s,v24.4s,#12
1191	add	w13,w13,w17
1192	sli	v5.4s,v25.4s,#12
1193	add	w14,w14,w19
1194	sli	v9.4s,v26.4s,#12
1195	add	w15,w15,w20
1196	sli	v13.4s,v27.4s,#12
1197	add	w16,w16,w21
1198	sli	v17.4s,v28.4s,#12
1199	eor	w9,w9,w13
1200	sli	v21.4s,v29.4s,#12
1201	eor	w10,w10,w14
1202	add	v0.4s,v0.4s,v1.4s
1203	eor	w11,w11,w15
1204	add	v4.4s,v4.4s,v5.4s
1205	eor	w12,w12,w16
1206	add	v8.4s,v8.4s,v9.4s
1207	ror	w9,w9,#25
1208	add	v12.4s,v12.4s,v13.4s
1209	ror	w10,w10,#25
1210	add	v16.4s,v16.4s,v17.4s
1211	ror	w11,w11,#25
1212	add	v20.4s,v20.4s,v21.4s
1213	ror	w12,w12,#25
1214	eor	v24.16b,v3.16b,v0.16b
1215	add	w5,w5,w10
1216	eor	v25.16b,v7.16b,v4.16b
1217	add	w6,w6,w11
1218	eor	v26.16b,v11.16b,v8.16b
1219	add	w7,w7,w12
1220	eor	v27.16b,v15.16b,v12.16b
1221	add	w8,w8,w9
1222	eor	v28.16b,v19.16b,v16.16b
1223	eor	w21,w21,w5
1224	eor	v29.16b,v23.16b,v20.16b
1225	eor	w17,w17,w6
1226	ushr	v3.4s,v24.4s,#24
1227	eor	w19,w19,w7
1228	ushr	v7.4s,v25.4s,#24
1229	eor	w20,w20,w8
1230	ushr	v11.4s,v26.4s,#24
1231	ror	w21,w21,#16
1232	ushr	v15.4s,v27.4s,#24
1233	ror	w17,w17,#16
1234	ushr	v19.4s,v28.4s,#24
1235	ror	w19,w19,#16
1236	ushr	v23.4s,v29.4s,#24
1237	ror	w20,w20,#16
1238	sli	v3.4s,v24.4s,#8
1239	add	w15,w15,w21
1240	sli	v7.4s,v25.4s,#8
1241	add	w16,w16,w17
1242	sli	v11.4s,v26.4s,#8
1243	add	w13,w13,w19
1244	sli	v15.4s,v27.4s,#8
1245	add	w14,w14,w20
1246	sli	v19.4s,v28.4s,#8
1247	eor	w10,w10,w15
1248	sli	v23.4s,v29.4s,#8
1249	eor	w11,w11,w16
1250	add	v2.4s,v2.4s,v3.4s
1251	eor	w12,w12,w13
1252	add	v6.4s,v6.4s,v7.4s
1253	eor	w9,w9,w14
1254	add	v10.4s,v10.4s,v11.4s
1255	ror	w10,w10,#20
1256	add	v14.4s,v14.4s,v15.4s
1257	ror	w11,w11,#20
1258	add	v18.4s,v18.4s,v19.4s
1259	ror	w12,w12,#20
1260	add	v22.4s,v22.4s,v23.4s
1261	ror	w9,w9,#20
1262	eor	v24.16b,v1.16b,v2.16b
1263	add	w5,w5,w10
1264	eor	v25.16b,v5.16b,v6.16b
1265	add	w6,w6,w11
1266	eor	v26.16b,v9.16b,v10.16b
1267	add	w7,w7,w12
1268	eor	v27.16b,v13.16b,v14.16b
1269	add	w8,w8,w9
1270	eor	v28.16b,v17.16b,v18.16b
1271	eor	w21,w21,w5
1272	eor	v29.16b,v21.16b,v22.16b
1273	eor	w17,w17,w6
1274	ushr	v1.4s,v24.4s,#25
1275	eor	w19,w19,w7
1276	ushr	v5.4s,v25.4s,#25
1277	eor	w20,w20,w8
1278	ushr	v9.4s,v26.4s,#25
1279	ror	w21,w21,#24
1280	ushr	v13.4s,v27.4s,#25
1281	ror	w17,w17,#24
1282	ushr	v17.4s,v28.4s,#25
1283	ror	w19,w19,#24
1284	ushr	v21.4s,v29.4s,#25
1285	ror	w20,w20,#24
1286	sli	v1.4s,v24.4s,#7
1287	add	w15,w15,w21
1288	sli	v5.4s,v25.4s,#7
1289	add	w16,w16,w17
1290	sli	v9.4s,v26.4s,#7
1291	add	w13,w13,w19
1292	sli	v13.4s,v27.4s,#7
1293	add	w14,w14,w20
1294	sli	v17.4s,v28.4s,#7
1295	eor	w10,w10,w15
1296	sli	v21.4s,v29.4s,#7
1297	eor	w11,w11,w16
1298	ext	v2.16b,v2.16b,v2.16b,#8
1299	eor	w12,w12,w13
1300	ext	v6.16b,v6.16b,v6.16b,#8
1301	eor	w9,w9,w14
1302	ext	v10.16b,v10.16b,v10.16b,#8
1303	ror	w10,w10,#25
1304	ext	v14.16b,v14.16b,v14.16b,#8
1305	ror	w11,w11,#25
1306	ext	v18.16b,v18.16b,v18.16b,#8
1307	ror	w12,w12,#25
1308	ext	v22.16b,v22.16b,v22.16b,#8
1309	ror	w9,w9,#25
1310	ext	v3.16b,v3.16b,v3.16b,#4
1311	ext	v7.16b,v7.16b,v7.16b,#4
1312	ext	v11.16b,v11.16b,v11.16b,#4
1313	ext	v15.16b,v15.16b,v15.16b,#4
1314	ext	v19.16b,v19.16b,v19.16b,#4
1315	ext	v23.16b,v23.16b,v23.16b,#4
1316	ext	v1.16b,v1.16b,v1.16b,#12
1317	ext	v5.16b,v5.16b,v5.16b,#12
1318	ext	v9.16b,v9.16b,v9.16b,#12
1319	ext	v13.16b,v13.16b,v13.16b,#12
1320	ext	v17.16b,v17.16b,v17.16b,#12
1321	ext	v21.16b,v21.16b,v21.16b,#12
1322	cbnz	x4,Loop_upper_neon
1323
1324	add	w5,w5,w22		// accumulate key block
1325	add	x6,x6,x22,lsr#32
1326	add	w7,w7,w23
1327	add	x8,x8,x23,lsr#32
1328	add	w9,w9,w24
1329	add	x10,x10,x24,lsr#32
1330	add	w11,w11,w25
1331	add	x12,x12,x25,lsr#32
1332	add	w13,w13,w26
1333	add	x14,x14,x26,lsr#32
1334	add	w15,w15,w27
1335	add	x16,x16,x27,lsr#32
1336	add	w17,w17,w28
1337	add	x19,x19,x28,lsr#32
1338	add	w20,w20,w30
1339	add	x21,x21,x30,lsr#32
1340
1341	add	x5,x5,x6,lsl#32	// pack
1342	add	x7,x7,x8,lsl#32
1343	ldp	x6,x8,[x1,#0]		// load input
1344	add	x9,x9,x10,lsl#32
1345	add	x11,x11,x12,lsl#32
1346	ldp	x10,x12,[x1,#16]
1347	add	x13,x13,x14,lsl#32
1348	add	x15,x15,x16,lsl#32
1349	ldp	x14,x16,[x1,#32]
1350	add	x17,x17,x19,lsl#32
1351	add	x20,x20,x21,lsl#32
1352	ldp	x19,x21,[x1,#48]
1353	add	x1,x1,#64
1354#ifdef	__ARMEB__
1355	rev	x5,x5
1356	rev	x7,x7
1357	rev	x9,x9
1358	rev	x11,x11
1359	rev	x13,x13
1360	rev	x15,x15
1361	rev	x17,x17
1362	rev	x20,x20
1363#endif
1364	eor	x5,x5,x6
1365	eor	x7,x7,x8
1366	eor	x9,x9,x10
1367	eor	x11,x11,x12
1368	eor	x13,x13,x14
1369	eor	x15,x15,x16
1370	eor	x17,x17,x19
1371	eor	x20,x20,x21
1372
1373	stp	x5,x7,[x0,#0]		// store output
1374	add	x28,x28,#1			// increment counter
1375	mov	w5,w22			// unpack key block
1376	lsr	x6,x22,#32
1377	stp	x9,x11,[x0,#16]
1378	mov	w7,w23
1379	lsr	x8,x23,#32
1380	stp	x13,x15,[x0,#32]
1381	mov	w9,w24
1382	lsr	x10,x24,#32
1383	stp	x17,x20,[x0,#48]
1384	add	x0,x0,#64
1385	mov	w11,w25
1386	lsr	x12,x25,#32
1387	mov	w13,w26
1388	lsr	x14,x26,#32
1389	mov	w15,w27
1390	lsr	x16,x27,#32
1391	mov	w17,w28
1392	lsr	x19,x28,#32
1393	mov	w20,w30
1394	lsr	x21,x30,#32
1395
1396	mov	x4,#5
// Loop_lower_neon: runs while x4 counts down to zero (x4 is set to 5 just
// before the loop).  Every iteration interleaves two independent streams of
// work, one instruction from each in turn:
//   * NEON: one column quarter-round followed by one diagonal quarter-round
//     on six blocks at once, held as (v0-v3), (v4-v7), (v8-v11), (v12-v15),
//     (v16-v19) and (v20-v23); v24-v29 are temporaries for the rotations.
//   * Scalar: two column+diagonal double rounds on one block held as sixteen
//     32-bit words in w5-w16 (words 0-11) and w17,w19,w20,w21 (words 12-15).
// Rotations: rev32 on .8h lanes / ror #16 rotate left by 16; ushr #20 + sli
// #12 / ror #20 rotate left by 12; ushr #24 + sli #8 / ror #24 rotate left by
// 8; ushr #25 + sli #7 / ror #25 rotate left by 7.
// No instruction in the loop writes NZCV (the counter uses sub + cbnz), which
// keeps the flags consumed by the later b.hs intact.
Loop_lower_neon:
	sub	x4,x4,#1
	// First half.  NEON: column quarter-round (a=v0.., b=v1.., c=v2..,
	// d=v3..).  Scalar: column quarter-rounds on words (0,4,8,12),
	// (1,5,9,13), (2,6,10,14), (3,7,11,15).
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	// Scalar stream switches to the diagonal quarter-rounds: words
	// (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).  The NEON stream is
	// still finishing its column quarter-round.
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	// NEON: rotate the c, d and b vectors of every block by 8, 12 and 4
	// bytes so that the next quarter-round operates on the diagonals.
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v11.16b,v11.16b,v11.16b,#12
	ext	v15.16b,v15.16b,v15.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v23.16b,v23.16b,v23.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v9.16b,v9.16b,v9.16b,#4
	ext	v13.16b,v13.16b,v13.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	ext	v21.16b,v21.16b,v21.16b,#4
	// Second half.  NEON: diagonal quarter-round on the rotated vectors.
	// Scalar: a second column+diagonal double round, same register pairing
	// as in the first half.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v8.4s,v8.4s,v9.4s
	add	w7,w7,w11
	add	v12.4s,v12.4s,v13.4s
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	eor	v3.16b,v3.16b,v0.16b
	eor	w20,w20,w7
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w8
	eor	v11.16b,v11.16b,v8.16b
	ror	w17,w17,#16
	eor	v15.16b,v15.16b,v12.16b
	ror	w19,w19,#16
	eor	v19.16b,v19.16b,v16.16b
	ror	w20,w20,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w21,w21,#16
	rev32	v3.8h,v3.8h
	add	w13,w13,w17
	rev32	v7.8h,v7.8h
	add	w14,w14,w19
	rev32	v11.8h,v11.8h
	add	w15,w15,w20
	rev32	v15.8h,v15.8h
	add	w16,w16,w21
	rev32	v19.8h,v19.8h
	eor	w9,w9,w13
	rev32	v23.8h,v23.8h
	eor	w10,w10,w14
	add	v2.4s,v2.4s,v3.4s
	eor	w11,w11,w15
	add	v6.4s,v6.4s,v7.4s
	eor	w12,w12,w16
	add	v10.4s,v10.4s,v11.4s
	ror	w9,w9,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w10,w10,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w11,w11,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w12,w12,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w9
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w10
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w11
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w12
	eor	v28.16b,v17.16b,v18.16b
	eor	w17,w17,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w19,w19,w6
	ushr	v1.4s,v24.4s,#20
	eor	w20,w20,w7
	ushr	v5.4s,v25.4s,#20
	eor	w21,w21,w8
	ushr	v9.4s,v26.4s,#20
	ror	w17,w17,#24
	ushr	v13.4s,v27.4s,#20
	ror	w19,w19,#24
	ushr	v17.4s,v28.4s,#20
	ror	w20,w20,#24
	ushr	v21.4s,v29.4s,#20
	ror	w21,w21,#24
	sli	v1.4s,v24.4s,#12
	add	w13,w13,w17
	sli	v5.4s,v25.4s,#12
	add	w14,w14,w19
	sli	v9.4s,v26.4s,#12
	add	w15,w15,w20
	sli	v13.4s,v27.4s,#12
	add	w16,w16,w21
	sli	v17.4s,v28.4s,#12
	eor	w9,w9,w13
	sli	v21.4s,v29.4s,#12
	eor	w10,w10,w14
	add	v0.4s,v0.4s,v1.4s
	eor	w11,w11,w15
	add	v4.4s,v4.4s,v5.4s
	eor	w12,w12,w16
	add	v8.4s,v8.4s,v9.4s
	ror	w9,w9,#25
	add	v12.4s,v12.4s,v13.4s
	ror	w10,w10,#25
	add	v16.4s,v16.4s,v17.4s
	ror	w11,w11,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w12,w12,#25
	eor	v24.16b,v3.16b,v0.16b
	// Scalar stream: diagonal quarter-rounds of the second double round.
	add	w5,w5,w10
	eor	v25.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v26.16b,v11.16b,v8.16b
	add	w7,w7,w12
	eor	v27.16b,v15.16b,v12.16b
	add	w8,w8,w9
	eor	v28.16b,v19.16b,v16.16b
	eor	w21,w21,w5
	eor	v29.16b,v23.16b,v20.16b
	eor	w17,w17,w6
	ushr	v3.4s,v24.4s,#24
	eor	w19,w19,w7
	ushr	v7.4s,v25.4s,#24
	eor	w20,w20,w8
	ushr	v11.4s,v26.4s,#24
	ror	w21,w21,#16
	ushr	v15.4s,v27.4s,#24
	ror	w17,w17,#16
	ushr	v19.4s,v28.4s,#24
	ror	w19,w19,#16
	ushr	v23.4s,v29.4s,#24
	ror	w20,w20,#16
	sli	v3.4s,v24.4s,#8
	add	w15,w15,w21
	sli	v7.4s,v25.4s,#8
	add	w16,w16,w17
	sli	v11.4s,v26.4s,#8
	add	w13,w13,w19
	sli	v15.4s,v27.4s,#8
	add	w14,w14,w20
	sli	v19.4s,v28.4s,#8
	eor	w10,w10,w15
	sli	v23.4s,v29.4s,#8
	eor	w11,w11,w16
	add	v2.4s,v2.4s,v3.4s
	eor	w12,w12,w13
	add	v6.4s,v6.4s,v7.4s
	eor	w9,w9,w14
	add	v10.4s,v10.4s,v11.4s
	ror	w10,w10,#20
	add	v14.4s,v14.4s,v15.4s
	ror	w11,w11,#20
	add	v18.4s,v18.4s,v19.4s
	ror	w12,w12,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w9,w9,#20
	eor	v24.16b,v1.16b,v2.16b
	add	w5,w5,w10
	eor	v25.16b,v5.16b,v6.16b
	add	w6,w6,w11
	eor	v26.16b,v9.16b,v10.16b
	add	w7,w7,w12
	eor	v27.16b,v13.16b,v14.16b
	add	w8,w8,w9
	eor	v28.16b,v17.16b,v18.16b
	eor	w21,w21,w5
	eor	v29.16b,v21.16b,v22.16b
	eor	w17,w17,w6
	ushr	v1.4s,v24.4s,#25
	eor	w19,w19,w7
	ushr	v5.4s,v25.4s,#25
	eor	w20,w20,w8
	ushr	v9.4s,v26.4s,#25
	ror	w21,w21,#24
	ushr	v13.4s,v27.4s,#25
	ror	w17,w17,#24
	ushr	v17.4s,v28.4s,#25
	ror	w19,w19,#24
	ushr	v21.4s,v29.4s,#25
	ror	w20,w20,#24
	sli	v1.4s,v24.4s,#7
	add	w15,w15,w21
	sli	v5.4s,v25.4s,#7
	add	w16,w16,w17
	sli	v9.4s,v26.4s,#7
	add	w13,w13,w19
	sli	v13.4s,v27.4s,#7
	add	w14,w14,w20
	sli	v17.4s,v28.4s,#7
	eor	w10,w10,w15
	sli	v21.4s,v29.4s,#7
	eor	w11,w11,w16
	// NEON: undo the diagonal rotation (c by 8, d by 4, b by 12 bytes) so
	// the vectors are back in column order for the next iteration.
	ext	v2.16b,v2.16b,v2.16b,#8
	eor	w12,w12,w13
	ext	v6.16b,v6.16b,v6.16b,#8
	eor	w9,w9,w14
	ext	v10.16b,v10.16b,v10.16b,#8
	ror	w10,w10,#25
	ext	v14.16b,v14.16b,v14.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v22.16b,v22.16b,v22.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v11.16b,v11.16b,v11.16b,#4
	ext	v15.16b,v15.16b,v15.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v23.16b,v23.16b,v23.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v9.16b,v9.16b,v9.16b,#12
	ext	v13.16b,v13.16b,v13.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	ext	v21.16b,v21.16b,v21.16b,#12
	cbnz	x4,Loop_lower_neon	// repeat until the iteration counter hits zero
1808
	// Finalise one 512-byte pass: the second scalar block (w5-w21) and the
	// six NEON blocks (v0-v23) get the saved input state added back, are
	// XORed with input read from x1 and written to x0 (2*64 + 6*64 = 512
	// bytes).  Afterwards the counters are advanced by 8 blocks and control
	// either loops back, falls through to a shorter code path for the
	// remaining bytes, or leaves via Ldone_512_neon.
	// NOTE: nothing before the b.hs below writes NZCV; that branch consumes
	// flags set before this code (presumably by a subs on x2 at the top of
	// Loop_outer_512_neon, which is outside this chunk - TODO confirm).
	add	w5,w5,w22		// accumulate key block
	ldp	q24,q25,[sp,#0]		// reload saved state vectors from the off-load area
	add	x6,x6,x22,lsr#32
	ldp	q26,q27,[sp,#32]
	add	w7,w7,w23
	ldp	q28,q29,[sp,#64]
	add	x8,x8,x23,lsr#32
	// v24, v25 and v26 are added to the first, second and third vector of
	// every NEON block; the fourth vector of each block gets its own addend
	// (v27-v30, with v31 added on top for the last two blocks).
	add	v0.4s,v0.4s,v24.4s
	add	w9,w9,w24
	add	v4.4s,v4.4s,v24.4s
	add	x10,x10,x24,lsr#32
	add	v8.4s,v8.4s,v24.4s
	add	w11,w11,w25
	add	v12.4s,v12.4s,v24.4s
	add	x12,x12,x25,lsr#32
	add	v16.4s,v16.4s,v24.4s
	add	w13,w13,w26
	add	v20.4s,v20.4s,v24.4s
	add	x14,x14,x26,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w15,w15,w27
	add	v6.4s,v6.4s,v26.4s
	add	x16,x16,x27,lsr#32
	add	v10.4s,v10.4s,v26.4s
	add	w17,w17,w28
	add	v14.4s,v14.4s,v26.4s
	add	x19,x19,x28,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w20,w20,w30
	add	v22.4s,v22.4s,v26.4s
	add	x21,x21,x30,lsr#32
	add	v19.4s,v19.4s,v31.4s			// +4
	add	x5,x5,x6,lsl#32	// pack
	add	v23.4s,v23.4s,v31.4s			// +4
	add	x7,x7,x8,lsl#32
	add	v3.4s,v3.4s,v27.4s
	ldp	x6,x8,[x1,#0]		// load input
	add	v7.4s,v7.4s,v28.4s
	add	x9,x9,x10,lsl#32
	add	v11.4s,v11.4s,v29.4s
	add	x11,x11,x12,lsl#32
	add	v15.4s,v15.4s,v30.4s
	ldp	x10,x12,[x1,#16]
	add	v19.4s,v19.4s,v27.4s
	add	x13,x13,x14,lsl#32
	add	v23.4s,v23.4s,v28.4s
	add	x15,x15,x16,lsl#32
	add	v1.4s,v1.4s,v25.4s
	ldp	x14,x16,[x1,#32]
	add	v5.4s,v5.4s,v25.4s
	add	x17,x17,x19,lsl#32
	add	v9.4s,v9.4s,v25.4s
	add	x20,x20,x21,lsl#32
	add	v13.4s,v13.4s,v25.4s
	ldp	x19,x21,[x1,#48]
	add	v17.4s,v17.4s,v25.4s
	add	x1,x1,#64		// input pointer past the scalar block
	add	v21.4s,v21.4s,v25.4s

#ifdef	__ARMEB__
	// Big-endian builds: byte-reverse the packed scalar key stream.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// From here on v24-v27 and the already-stored block registers double as
	// input buffers: each ld1 fetches the next 64 input bytes (x1 is
	// post-incremented) ahead of the XOR/st1 that consumes them.
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v24.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v25.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v26.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v27.16b
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	// +7 here plus the +1 applied after the first scalar block advances the
	// scalar copy of the counter by the 8 blocks handled per pass.
	add	x28,x28,#7			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64

	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
	eor	v4.16b,v4.16b,v24.16b
	eor	v5.16b,v5.16b,v25.16b
	eor	v6.16b,v6.16b,v26.16b
	eor	v7.16b,v7.16b,v27.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v8.16b,v8.16b,v0.16b
	ldp	q24,q25,[sp,#0]		// v24-v27 were clobbered as input buffers: reload them
	eor	v9.16b,v9.16b,v1.16b
	ldp	q26,q27,[sp,#32]
	eor	v10.16b,v10.16b,v2.16b
	eor	v11.16b,v11.16b,v3.16b
	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64

	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
	eor	v12.16b,v12.16b,v4.16b
	eor	v13.16b,v13.16b,v5.16b
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v7.16b
	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64

	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
	eor	v16.16b,v16.16b,v8.16b
	eor	v17.16b,v17.16b,v9.16b
	eor	v18.16b,v18.16b,v10.16b
	eor	v19.16b,v19.16b,v11.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	shl	v0.4s,v31.4s,#1			// 4 -> 8
	eor	v20.16b,v20.16b,v12.16b
	eor	v21.16b,v21.16b,v13.16b
	eor	v22.16b,v22.16b,v14.16b
	eor	v23.16b,v23.16b,v15.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64

	// Advance the per-block fourth-vector addends by the 8 blocks just
	// produced (v0 = v31 shifted left by one).
	add	v27.4s,v27.4s,v0.4s			// += 8
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v0.4s
	add	v30.4s,v30.4s,v0.4s

	b.hs	Loop_outer_512_neon	// flags from before this chunk; see NOTE at the top

	// Tail: add 512 back to the length in x2.  adds sets Z when nothing is
	// left; the b.eq below relies on no flag writes between here and there.
	adds	x2,x2,#512
	ushr	v0.4s,v31.4s,#2			// 4 -> 1

	// d8-d15 are callee-saved under AAPCS64 and were kept just above the
	// 128-byte off-load area.
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	// Overwrite the 96 bytes that held the saved state vectors with the
	// current contents of v24 and v31.
	stp	q24,q31,[sp,#0]		// wipe off-load area
	stp	q24,q31,[sp,#32]
	stp	q24,q31,[sp,#64]

	b.eq	Ldone_512_neon		// length exhausted

	// 192 bytes or more remain: continue in Loop_outer_neon after lowering
	// the v27-v29 addends by one and releasing the off-load area.
	cmp	x2,#192
	sub	v27.4s,v27.4s,v0.4s			// -= 1
	sub	v28.4s,v28.4s,v0.4s
	sub	v29.4s,v29.4s,v0.4s
	add	sp,sp,#128
	b.hs	Loop_outer_neon

	// Fewer than 192 bytes remain: zero v25-v30 (they held saved state and
	// per-block addends) and finish in the scalar Loop_outer.
	eor	v25.16b,v25.16b,v25.16b
	eor	v26.16b,v26.16b,v26.16b
	eor	v27.16b,v27.16b,v27.16b
	eor	v28.16b,v28.16b,v28.16b
	eor	v29.16b,v29.16b,v29.16b
	eor	v30.16b,v30.16b,v30.16b
	b	Loop_outer
1973
Ldone_512_neon:
	// Exit taken when the remaining length reached zero: reload the
	// callee-saved general registers from the frame addressed by x29,
	// release the scratch space below the frame record and return.
	ldp	x27,x28,[x29,#80]
	ldp	x25,x26,[x29,#64]
	ldp	x23,x24,[x29,#48]
	ldp	x21,x22,[x29,#32]
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#192		// 128-byte off-load area + 64 bytes below the frame
	ldp	x29,x30,[sp],#96		// restore fp/lr and pop the 96-byte frame
	ret
1983
1984#endif  // !OPENSSL_NO_ASM
1985