1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15#include <openssl/arm_arch.h>
16
17
18.private_extern	_OPENSSL_armcap_P
19
20.section	__TEXT,__const
21
22.align	5
23Lsigma:
24.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
25Lone:
26.long	1,0,0,0
27.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
28.align	2
29
30.text
31
32.globl	_ChaCha20_ctr32
33.private_extern	_ChaCha20_ctr32
34
35.align	5
36_ChaCha20_ctr32:
37	AARCH64_VALID_CALL_TARGET
38	cbz	x2,Labort
39#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
40	adrp	x5,:pg_hi21_nc:_OPENSSL_armcap_P
41#else
42	adrp	x5,_OPENSSL_armcap_P@PAGE
43#endif
44	cmp	x2,#192
45	b.lo	Lshort
46	ldr	w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
47	tst	w17,#ARMV7_NEON
48	b.ne	ChaCha20_neon
49
50Lshort:
51	AARCH64_SIGN_LINK_REGISTER
52	stp	x29,x30,[sp,#-96]!
53	add	x29,sp,#0
54
55	adrp	x5,Lsigma@PAGE
56	add	x5,x5,Lsigma@PAGEOFF
57	stp	x19,x20,[sp,#16]
58	stp	x21,x22,[sp,#32]
59	stp	x23,x24,[sp,#48]
60	stp	x25,x26,[sp,#64]
61	stp	x27,x28,[sp,#80]
62	sub	sp,sp,#64
63
64	ldp	x22,x23,[x5]		// load sigma
65	ldp	x24,x25,[x3]		// load key
66	ldp	x26,x27,[x3,#16]
67	ldp	x28,x30,[x4]		// load counter
68#ifdef	__ARMEB__
69	ror	x24,x24,#32
70	ror	x25,x25,#32
71	ror	x26,x26,#32
72	ror	x27,x27,#32
73	ror	x28,x28,#32
74	ror	x30,x30,#32
75#endif
76
77Loop_outer:
78	mov	w5,w22			// unpack key block
79	lsr	x6,x22,#32
80	mov	w7,w23
81	lsr	x8,x23,#32
82	mov	w9,w24
83	lsr	x10,x24,#32
84	mov	w11,w25
85	lsr	x12,x25,#32
86	mov	w13,w26
87	lsr	x14,x26,#32
88	mov	w15,w27
89	lsr	x16,x27,#32
90	mov	w17,w28
91	lsr	x19,x28,#32
92	mov	w20,w30
93	lsr	x21,x30,#32
94
95	mov	x4,#10
96	subs	x2,x2,#64
97Loop:
98	sub	x4,x4,#1
99	add	w5,w5,w9
100	add	w6,w6,w10
101	add	w7,w7,w11
102	add	w8,w8,w12
103	eor	w17,w17,w5
104	eor	w19,w19,w6
105	eor	w20,w20,w7
106	eor	w21,w21,w8
107	ror	w17,w17,#16
108	ror	w19,w19,#16
109	ror	w20,w20,#16
110	ror	w21,w21,#16
111	add	w13,w13,w17
112	add	w14,w14,w19
113	add	w15,w15,w20
114	add	w16,w16,w21
115	eor	w9,w9,w13
116	eor	w10,w10,w14
117	eor	w11,w11,w15
118	eor	w12,w12,w16
119	ror	w9,w9,#20
120	ror	w10,w10,#20
121	ror	w11,w11,#20
122	ror	w12,w12,#20
123	add	w5,w5,w9
124	add	w6,w6,w10
125	add	w7,w7,w11
126	add	w8,w8,w12
127	eor	w17,w17,w5
128	eor	w19,w19,w6
129	eor	w20,w20,w7
130	eor	w21,w21,w8
131	ror	w17,w17,#24
132	ror	w19,w19,#24
133	ror	w20,w20,#24
134	ror	w21,w21,#24
135	add	w13,w13,w17
136	add	w14,w14,w19
137	add	w15,w15,w20
138	add	w16,w16,w21
139	eor	w9,w9,w13
140	eor	w10,w10,w14
141	eor	w11,w11,w15
142	eor	w12,w12,w16
143	ror	w9,w9,#25
144	ror	w10,w10,#25
145	ror	w11,w11,#25
146	ror	w12,w12,#25
147	add	w5,w5,w10
148	add	w6,w6,w11
149	add	w7,w7,w12
150	add	w8,w8,w9
151	eor	w21,w21,w5
152	eor	w17,w17,w6
153	eor	w19,w19,w7
154	eor	w20,w20,w8
155	ror	w21,w21,#16
156	ror	w17,w17,#16
157	ror	w19,w19,#16
158	ror	w20,w20,#16
159	add	w15,w15,w21
160	add	w16,w16,w17
161	add	w13,w13,w19
162	add	w14,w14,w20
163	eor	w10,w10,w15
164	eor	w11,w11,w16
165	eor	w12,w12,w13
166	eor	w9,w9,w14
167	ror	w10,w10,#20
168	ror	w11,w11,#20
169	ror	w12,w12,#20
170	ror	w9,w9,#20
171	add	w5,w5,w10
172	add	w6,w6,w11
173	add	w7,w7,w12
174	add	w8,w8,w9
175	eor	w21,w21,w5
176	eor	w17,w17,w6
177	eor	w19,w19,w7
178	eor	w20,w20,w8
179	ror	w21,w21,#24
180	ror	w17,w17,#24
181	ror	w19,w19,#24
182	ror	w20,w20,#24
183	add	w15,w15,w21
184	add	w16,w16,w17
185	add	w13,w13,w19
186	add	w14,w14,w20
187	eor	w10,w10,w15
188	eor	w11,w11,w16
189	eor	w12,w12,w13
190	eor	w9,w9,w14
191	ror	w10,w10,#25
192	ror	w11,w11,#25
193	ror	w12,w12,#25
194	ror	w9,w9,#25
195	cbnz	x4,Loop
196
197	add	w5,w5,w22		// accumulate key block
198	add	x6,x6,x22,lsr#32
199	add	w7,w7,w23
200	add	x8,x8,x23,lsr#32
201	add	w9,w9,w24
202	add	x10,x10,x24,lsr#32
203	add	w11,w11,w25
204	add	x12,x12,x25,lsr#32
205	add	w13,w13,w26
206	add	x14,x14,x26,lsr#32
207	add	w15,w15,w27
208	add	x16,x16,x27,lsr#32
209	add	w17,w17,w28
210	add	x19,x19,x28,lsr#32
211	add	w20,w20,w30
212	add	x21,x21,x30,lsr#32
213
214	b.lo	Ltail
215
216	add	x5,x5,x6,lsl#32	// pack
217	add	x7,x7,x8,lsl#32
218	ldp	x6,x8,[x1,#0]		// load input
219	add	x9,x9,x10,lsl#32
220	add	x11,x11,x12,lsl#32
221	ldp	x10,x12,[x1,#16]
222	add	x13,x13,x14,lsl#32
223	add	x15,x15,x16,lsl#32
224	ldp	x14,x16,[x1,#32]
225	add	x17,x17,x19,lsl#32
226	add	x20,x20,x21,lsl#32
227	ldp	x19,x21,[x1,#48]
228	add	x1,x1,#64
229#ifdef	__ARMEB__
230	rev	x5,x5
231	rev	x7,x7
232	rev	x9,x9
233	rev	x11,x11
234	rev	x13,x13
235	rev	x15,x15
236	rev	x17,x17
237	rev	x20,x20
238#endif
239	eor	x5,x5,x6
240	eor	x7,x7,x8
241	eor	x9,x9,x10
242	eor	x11,x11,x12
243	eor	x13,x13,x14
244	eor	x15,x15,x16
245	eor	x17,x17,x19
246	eor	x20,x20,x21
247
248	stp	x5,x7,[x0,#0]		// store output
249	add	x28,x28,#1			// increment counter
250	stp	x9,x11,[x0,#16]
251	stp	x13,x15,[x0,#32]
252	stp	x17,x20,[x0,#48]
253	add	x0,x0,#64
254
255	b.hi	Loop_outer
256
257	ldp	x19,x20,[x29,#16]
258	add	sp,sp,#64
259	ldp	x21,x22,[x29,#32]
260	ldp	x23,x24,[x29,#48]
261	ldp	x25,x26,[x29,#64]
262	ldp	x27,x28,[x29,#80]
263	ldp	x29,x30,[sp],#96
264	AARCH64_VALIDATE_LINK_REGISTER
265Labort:
266	ret
267
268.align	4
269Ltail:
270	add	x2,x2,#64
271Less_than_64:
272	sub	x0,x0,#1
273	add	x1,x1,x2
274	add	x0,x0,x2
275	add	x4,sp,x2
276	neg	x2,x2
277
278	add	x5,x5,x6,lsl#32	// pack
279	add	x7,x7,x8,lsl#32
280	add	x9,x9,x10,lsl#32
281	add	x11,x11,x12,lsl#32
282	add	x13,x13,x14,lsl#32
283	add	x15,x15,x16,lsl#32
284	add	x17,x17,x19,lsl#32
285	add	x20,x20,x21,lsl#32
286#ifdef	__ARMEB__
287	rev	x5,x5
288	rev	x7,x7
289	rev	x9,x9
290	rev	x11,x11
291	rev	x13,x13
292	rev	x15,x15
293	rev	x17,x17
294	rev	x20,x20
295#endif
296	stp	x5,x7,[sp,#0]
297	stp	x9,x11,[sp,#16]
298	stp	x13,x15,[sp,#32]
299	stp	x17,x20,[sp,#48]
300
301Loop_tail:
302	ldrb	w10,[x1,x2]
303	ldrb	w11,[x4,x2]
304	add	x2,x2,#1
305	eor	w10,w10,w11
306	strb	w10,[x0,x2]
307	cbnz	x2,Loop_tail
308
309	stp	xzr,xzr,[sp,#0]
310	stp	xzr,xzr,[sp,#16]
311	stp	xzr,xzr,[sp,#32]
312	stp	xzr,xzr,[sp,#48]
313
314	ldp	x19,x20,[x29,#16]
315	add	sp,sp,#64
316	ldp	x21,x22,[x29,#32]
317	ldp	x23,x24,[x29,#48]
318	ldp	x25,x26,[x29,#64]
319	ldp	x27,x28,[x29,#80]
320	ldp	x29,x30,[sp],#96
321	AARCH64_VALIDATE_LINK_REGISTER
322	ret
323
324
325
326.align	5
327ChaCha20_neon:
328	AARCH64_SIGN_LINK_REGISTER
329	stp	x29,x30,[sp,#-96]!
330	add	x29,sp,#0
331
332	adrp	x5,Lsigma@PAGE
333	add	x5,x5,Lsigma@PAGEOFF
334	stp	x19,x20,[sp,#16]
335	stp	x21,x22,[sp,#32]
336	stp	x23,x24,[sp,#48]
337	stp	x25,x26,[sp,#64]
338	stp	x27,x28,[sp,#80]
339	cmp	x2,#512
340	b.hs	L512_or_more_neon
341
342	sub	sp,sp,#64
343
344	ldp	x22,x23,[x5]		// load sigma
345	ld1	{v24.4s},[x5],#16
346	ldp	x24,x25,[x3]		// load key
347	ldp	x26,x27,[x3,#16]
348	ld1	{v25.4s,v26.4s},[x3]
349	ldp	x28,x30,[x4]		// load counter
350	ld1	{v27.4s},[x4]
351	ld1	{v31.4s},[x5]
352#ifdef	__ARMEB__
353	rev64	v24.4s,v24.4s
354	ror	x24,x24,#32
355	ror	x25,x25,#32
356	ror	x26,x26,#32
357	ror	x27,x27,#32
358	ror	x28,x28,#32
359	ror	x30,x30,#32
360#endif
361	add	v27.4s,v27.4s,v31.4s		// += 1
362	add	v28.4s,v27.4s,v31.4s
363	add	v29.4s,v28.4s,v31.4s
364	shl	v31.4s,v31.4s,#2			// 1 -> 4
365
366Loop_outer_neon:
367	mov	w5,w22			// unpack key block
368	lsr	x6,x22,#32
369	mov	v0.16b,v24.16b
370	mov	w7,w23
371	lsr	x8,x23,#32
372	mov	v4.16b,v24.16b
373	mov	w9,w24
374	lsr	x10,x24,#32
375	mov	v16.16b,v24.16b
376	mov	w11,w25
377	mov	v1.16b,v25.16b
378	lsr	x12,x25,#32
379	mov	v5.16b,v25.16b
380	mov	w13,w26
381	mov	v17.16b,v25.16b
382	lsr	x14,x26,#32
383	mov	v3.16b,v27.16b
384	mov	w15,w27
385	mov	v7.16b,v28.16b
386	lsr	x16,x27,#32
387	mov	v19.16b,v29.16b
388	mov	w17,w28
389	mov	v2.16b,v26.16b
390	lsr	x19,x28,#32
391	mov	v6.16b,v26.16b
392	mov	w20,w30
393	mov	v18.16b,v26.16b
394	lsr	x21,x30,#32
395
396	mov	x4,#10
397	subs	x2,x2,#256
398Loop_neon:
399	sub	x4,x4,#1
400	add	v0.4s,v0.4s,v1.4s
401	add	w5,w5,w9
402	add	v4.4s,v4.4s,v5.4s
403	add	w6,w6,w10
404	add	v16.4s,v16.4s,v17.4s
405	add	w7,w7,w11
406	eor	v3.16b,v3.16b,v0.16b
407	add	w8,w8,w12
408	eor	v7.16b,v7.16b,v4.16b
409	eor	w17,w17,w5
410	eor	v19.16b,v19.16b,v16.16b
411	eor	w19,w19,w6
412	rev32	v3.8h,v3.8h
413	eor	w20,w20,w7
414	rev32	v7.8h,v7.8h
415	eor	w21,w21,w8
416	rev32	v19.8h,v19.8h
417	ror	w17,w17,#16
418	add	v2.4s,v2.4s,v3.4s
419	ror	w19,w19,#16
420	add	v6.4s,v6.4s,v7.4s
421	ror	w20,w20,#16
422	add	v18.4s,v18.4s,v19.4s
423	ror	w21,w21,#16
424	eor	v20.16b,v1.16b,v2.16b
425	add	w13,w13,w17
426	eor	v21.16b,v5.16b,v6.16b
427	add	w14,w14,w19
428	eor	v22.16b,v17.16b,v18.16b
429	add	w15,w15,w20
430	ushr	v1.4s,v20.4s,#20
431	add	w16,w16,w21
432	ushr	v5.4s,v21.4s,#20
433	eor	w9,w9,w13
434	ushr	v17.4s,v22.4s,#20
435	eor	w10,w10,w14
436	sli	v1.4s,v20.4s,#12
437	eor	w11,w11,w15
438	sli	v5.4s,v21.4s,#12
439	eor	w12,w12,w16
440	sli	v17.4s,v22.4s,#12
441	ror	w9,w9,#20
442	add	v0.4s,v0.4s,v1.4s
443	ror	w10,w10,#20
444	add	v4.4s,v4.4s,v5.4s
445	ror	w11,w11,#20
446	add	v16.4s,v16.4s,v17.4s
447	ror	w12,w12,#20
448	eor	v20.16b,v3.16b,v0.16b
449	add	w5,w5,w9
450	eor	v21.16b,v7.16b,v4.16b
451	add	w6,w6,w10
452	eor	v22.16b,v19.16b,v16.16b
453	add	w7,w7,w11
454	ushr	v3.4s,v20.4s,#24
455	add	w8,w8,w12
456	ushr	v7.4s,v21.4s,#24
457	eor	w17,w17,w5
458	ushr	v19.4s,v22.4s,#24
459	eor	w19,w19,w6
460	sli	v3.4s,v20.4s,#8
461	eor	w20,w20,w7
462	sli	v7.4s,v21.4s,#8
463	eor	w21,w21,w8
464	sli	v19.4s,v22.4s,#8
465	ror	w17,w17,#24
466	add	v2.4s,v2.4s,v3.4s
467	ror	w19,w19,#24
468	add	v6.4s,v6.4s,v7.4s
469	ror	w20,w20,#24
470	add	v18.4s,v18.4s,v19.4s
471	ror	w21,w21,#24
472	eor	v20.16b,v1.16b,v2.16b
473	add	w13,w13,w17
474	eor	v21.16b,v5.16b,v6.16b
475	add	w14,w14,w19
476	eor	v22.16b,v17.16b,v18.16b
477	add	w15,w15,w20
478	ushr	v1.4s,v20.4s,#25
479	add	w16,w16,w21
480	ushr	v5.4s,v21.4s,#25
481	eor	w9,w9,w13
482	ushr	v17.4s,v22.4s,#25
483	eor	w10,w10,w14
484	sli	v1.4s,v20.4s,#7
485	eor	w11,w11,w15
486	sli	v5.4s,v21.4s,#7
487	eor	w12,w12,w16
488	sli	v17.4s,v22.4s,#7
489	ror	w9,w9,#25
490	ext	v2.16b,v2.16b,v2.16b,#8
491	ror	w10,w10,#25
492	ext	v6.16b,v6.16b,v6.16b,#8
493	ror	w11,w11,#25
494	ext	v18.16b,v18.16b,v18.16b,#8
495	ror	w12,w12,#25
496	ext	v3.16b,v3.16b,v3.16b,#12
497	ext	v7.16b,v7.16b,v7.16b,#12
498	ext	v19.16b,v19.16b,v19.16b,#12
499	ext	v1.16b,v1.16b,v1.16b,#4
500	ext	v5.16b,v5.16b,v5.16b,#4
501	ext	v17.16b,v17.16b,v17.16b,#4
502	add	v0.4s,v0.4s,v1.4s
503	add	w5,w5,w10
504	add	v4.4s,v4.4s,v5.4s
505	add	w6,w6,w11
506	add	v16.4s,v16.4s,v17.4s
507	add	w7,w7,w12
508	eor	v3.16b,v3.16b,v0.16b
509	add	w8,w8,w9
510	eor	v7.16b,v7.16b,v4.16b
511	eor	w21,w21,w5
512	eor	v19.16b,v19.16b,v16.16b
513	eor	w17,w17,w6
514	rev32	v3.8h,v3.8h
515	eor	w19,w19,w7
516	rev32	v7.8h,v7.8h
517	eor	w20,w20,w8
518	rev32	v19.8h,v19.8h
519	ror	w21,w21,#16
520	add	v2.4s,v2.4s,v3.4s
521	ror	w17,w17,#16
522	add	v6.4s,v6.4s,v7.4s
523	ror	w19,w19,#16
524	add	v18.4s,v18.4s,v19.4s
525	ror	w20,w20,#16
526	eor	v20.16b,v1.16b,v2.16b
527	add	w15,w15,w21
528	eor	v21.16b,v5.16b,v6.16b
529	add	w16,w16,w17
530	eor	v22.16b,v17.16b,v18.16b
531	add	w13,w13,w19
532	ushr	v1.4s,v20.4s,#20
533	add	w14,w14,w20
534	ushr	v5.4s,v21.4s,#20
535	eor	w10,w10,w15
536	ushr	v17.4s,v22.4s,#20
537	eor	w11,w11,w16
538	sli	v1.4s,v20.4s,#12
539	eor	w12,w12,w13
540	sli	v5.4s,v21.4s,#12
541	eor	w9,w9,w14
542	sli	v17.4s,v22.4s,#12
543	ror	w10,w10,#20
544	add	v0.4s,v0.4s,v1.4s
545	ror	w11,w11,#20
546	add	v4.4s,v4.4s,v5.4s
547	ror	w12,w12,#20
548	add	v16.4s,v16.4s,v17.4s
549	ror	w9,w9,#20
550	eor	v20.16b,v3.16b,v0.16b
551	add	w5,w5,w10
552	eor	v21.16b,v7.16b,v4.16b
553	add	w6,w6,w11
554	eor	v22.16b,v19.16b,v16.16b
555	add	w7,w7,w12
556	ushr	v3.4s,v20.4s,#24
557	add	w8,w8,w9
558	ushr	v7.4s,v21.4s,#24
559	eor	w21,w21,w5
560	ushr	v19.4s,v22.4s,#24
561	eor	w17,w17,w6
562	sli	v3.4s,v20.4s,#8
563	eor	w19,w19,w7
564	sli	v7.4s,v21.4s,#8
565	eor	w20,w20,w8
566	sli	v19.4s,v22.4s,#8
567	ror	w21,w21,#24
568	add	v2.4s,v2.4s,v3.4s
569	ror	w17,w17,#24
570	add	v6.4s,v6.4s,v7.4s
571	ror	w19,w19,#24
572	add	v18.4s,v18.4s,v19.4s
573	ror	w20,w20,#24
574	eor	v20.16b,v1.16b,v2.16b
575	add	w15,w15,w21
576	eor	v21.16b,v5.16b,v6.16b
577	add	w16,w16,w17
578	eor	v22.16b,v17.16b,v18.16b
579	add	w13,w13,w19
580	ushr	v1.4s,v20.4s,#25
581	add	w14,w14,w20
582	ushr	v5.4s,v21.4s,#25
583	eor	w10,w10,w15
584	ushr	v17.4s,v22.4s,#25
585	eor	w11,w11,w16
586	sli	v1.4s,v20.4s,#7
587	eor	w12,w12,w13
588	sli	v5.4s,v21.4s,#7
589	eor	w9,w9,w14
590	sli	v17.4s,v22.4s,#7
591	ror	w10,w10,#25
592	ext	v2.16b,v2.16b,v2.16b,#8
593	ror	w11,w11,#25
594	ext	v6.16b,v6.16b,v6.16b,#8
595	ror	w12,w12,#25
596	ext	v18.16b,v18.16b,v18.16b,#8
597	ror	w9,w9,#25
598	ext	v3.16b,v3.16b,v3.16b,#4
599	ext	v7.16b,v7.16b,v7.16b,#4
600	ext	v19.16b,v19.16b,v19.16b,#4
601	ext	v1.16b,v1.16b,v1.16b,#12
602	ext	v5.16b,v5.16b,v5.16b,#12
603	ext	v17.16b,v17.16b,v17.16b,#12
604	cbnz	x4,Loop_neon
605
606	add	w5,w5,w22		// accumulate key block
607	add	v0.4s,v0.4s,v24.4s
608	add	x6,x6,x22,lsr#32
609	add	v4.4s,v4.4s,v24.4s
610	add	w7,w7,w23
611	add	v16.4s,v16.4s,v24.4s
612	add	x8,x8,x23,lsr#32
613	add	v2.4s,v2.4s,v26.4s
614	add	w9,w9,w24
615	add	v6.4s,v6.4s,v26.4s
616	add	x10,x10,x24,lsr#32
617	add	v18.4s,v18.4s,v26.4s
618	add	w11,w11,w25
619	add	v3.4s,v3.4s,v27.4s
620	add	x12,x12,x25,lsr#32
621	add	w13,w13,w26
622	add	v7.4s,v7.4s,v28.4s
623	add	x14,x14,x26,lsr#32
624	add	w15,w15,w27
625	add	v19.4s,v19.4s,v29.4s
626	add	x16,x16,x27,lsr#32
627	add	w17,w17,w28
628	add	v1.4s,v1.4s,v25.4s
629	add	x19,x19,x28,lsr#32
630	add	w20,w20,w30
631	add	v5.4s,v5.4s,v25.4s
632	add	x21,x21,x30,lsr#32
633	add	v17.4s,v17.4s,v25.4s
634
635	b.lo	Ltail_neon
636
637	add	x5,x5,x6,lsl#32	// pack
638	add	x7,x7,x8,lsl#32
639	ldp	x6,x8,[x1,#0]		// load input
640	add	x9,x9,x10,lsl#32
641	add	x11,x11,x12,lsl#32
642	ldp	x10,x12,[x1,#16]
643	add	x13,x13,x14,lsl#32
644	add	x15,x15,x16,lsl#32
645	ldp	x14,x16,[x1,#32]
646	add	x17,x17,x19,lsl#32
647	add	x20,x20,x21,lsl#32
648	ldp	x19,x21,[x1,#48]
649	add	x1,x1,#64
650#ifdef	__ARMEB__
651	rev	x5,x5
652	rev	x7,x7
653	rev	x9,x9
654	rev	x11,x11
655	rev	x13,x13
656	rev	x15,x15
657	rev	x17,x17
658	rev	x20,x20
659#endif
660	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
661	eor	x5,x5,x6
662	eor	x7,x7,x8
663	eor	x9,x9,x10
664	eor	x11,x11,x12
665	eor	x13,x13,x14
666	eor	v0.16b,v0.16b,v20.16b
667	eor	x15,x15,x16
668	eor	v1.16b,v1.16b,v21.16b
669	eor	x17,x17,x19
670	eor	v2.16b,v2.16b,v22.16b
671	eor	x20,x20,x21
672	eor	v3.16b,v3.16b,v23.16b
673	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
674
675	stp	x5,x7,[x0,#0]		// store output
676	add	x28,x28,#4			// increment counter
677	stp	x9,x11,[x0,#16]
678	add	v27.4s,v27.4s,v31.4s		// += 4
679	stp	x13,x15,[x0,#32]
680	add	v28.4s,v28.4s,v31.4s
681	stp	x17,x20,[x0,#48]
682	add	v29.4s,v29.4s,v31.4s
683	add	x0,x0,#64
684
685	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
686	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
687
688	eor	v4.16b,v4.16b,v20.16b
689	eor	v5.16b,v5.16b,v21.16b
690	eor	v6.16b,v6.16b,v22.16b
691	eor	v7.16b,v7.16b,v23.16b
692	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
693
694	eor	v16.16b,v16.16b,v0.16b
695	eor	v17.16b,v17.16b,v1.16b
696	eor	v18.16b,v18.16b,v2.16b
697	eor	v19.16b,v19.16b,v3.16b
698	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
699
700	b.hi	Loop_outer_neon
701
702	ldp	x19,x20,[x29,#16]
703	add	sp,sp,#64
704	ldp	x21,x22,[x29,#32]
705	ldp	x23,x24,[x29,#48]
706	ldp	x25,x26,[x29,#64]
707	ldp	x27,x28,[x29,#80]
708	ldp	x29,x30,[sp],#96
709	AARCH64_VALIDATE_LINK_REGISTER
710	ret
711
712Ltail_neon:
713	add	x2,x2,#256
714	cmp	x2,#64
715	b.lo	Less_than_64
716
717	add	x5,x5,x6,lsl#32	// pack
718	add	x7,x7,x8,lsl#32
719	ldp	x6,x8,[x1,#0]		// load input
720	add	x9,x9,x10,lsl#32
721	add	x11,x11,x12,lsl#32
722	ldp	x10,x12,[x1,#16]
723	add	x13,x13,x14,lsl#32
724	add	x15,x15,x16,lsl#32
725	ldp	x14,x16,[x1,#32]
726	add	x17,x17,x19,lsl#32
727	add	x20,x20,x21,lsl#32
728	ldp	x19,x21,[x1,#48]
729	add	x1,x1,#64
730#ifdef	__ARMEB__
731	rev	x5,x5
732	rev	x7,x7
733	rev	x9,x9
734	rev	x11,x11
735	rev	x13,x13
736	rev	x15,x15
737	rev	x17,x17
738	rev	x20,x20
739#endif
740	eor	x5,x5,x6
741	eor	x7,x7,x8
742	eor	x9,x9,x10
743	eor	x11,x11,x12
744	eor	x13,x13,x14
745	eor	x15,x15,x16
746	eor	x17,x17,x19
747	eor	x20,x20,x21
748
749	stp	x5,x7,[x0,#0]		// store output
750	add	x28,x28,#4			// increment counter
751	stp	x9,x11,[x0,#16]
752	stp	x13,x15,[x0,#32]
753	stp	x17,x20,[x0,#48]
754	add	x0,x0,#64
755	b.eq	Ldone_neon
756	sub	x2,x2,#64
757	cmp	x2,#64
758	b.lo	Less_than_128
759
760	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
761	eor	v0.16b,v0.16b,v20.16b
762	eor	v1.16b,v1.16b,v21.16b
763	eor	v2.16b,v2.16b,v22.16b
764	eor	v3.16b,v3.16b,v23.16b
765	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
766	b.eq	Ldone_neon
767	sub	x2,x2,#64
768	cmp	x2,#64
769	b.lo	Less_than_192
770
771	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
772	eor	v4.16b,v4.16b,v20.16b
773	eor	v5.16b,v5.16b,v21.16b
774	eor	v6.16b,v6.16b,v22.16b
775	eor	v7.16b,v7.16b,v23.16b
776	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
777	b.eq	Ldone_neon
778	sub	x2,x2,#64
779
780	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
781	b	Last_neon
782
783Less_than_128:
784	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
785	b	Last_neon
786Less_than_192:
787	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
788	b	Last_neon
789
790.align	4
791Last_neon:
792	sub	x0,x0,#1
793	add	x1,x1,x2
794	add	x0,x0,x2
795	add	x4,sp,x2
796	neg	x2,x2
797
798Loop_tail_neon:
799	ldrb	w10,[x1,x2]
800	ldrb	w11,[x4,x2]
801	add	x2,x2,#1
802	eor	w10,w10,w11
803	strb	w10,[x0,x2]
804	cbnz	x2,Loop_tail_neon
805
806	stp	xzr,xzr,[sp,#0]
807	stp	xzr,xzr,[sp,#16]
808	stp	xzr,xzr,[sp,#32]
809	stp	xzr,xzr,[sp,#48]
810
811Ldone_neon:
812	ldp	x19,x20,[x29,#16]
813	add	sp,sp,#64
814	ldp	x21,x22,[x29,#32]
815	ldp	x23,x24,[x29,#48]
816	ldp	x25,x26,[x29,#64]
817	ldp	x27,x28,[x29,#80]
818	ldp	x29,x30,[sp],#96
819	AARCH64_VALIDATE_LINK_REGISTER
820	ret
821
822
823.align	5
824ChaCha20_512_neon:
825	AARCH64_SIGN_LINK_REGISTER
826	stp	x29,x30,[sp,#-96]!
827	add	x29,sp,#0
828
829	adrp	x5,Lsigma@PAGE
830	add	x5,x5,Lsigma@PAGEOFF
831	stp	x19,x20,[sp,#16]
832	stp	x21,x22,[sp,#32]
833	stp	x23,x24,[sp,#48]
834	stp	x25,x26,[sp,#64]
835	stp	x27,x28,[sp,#80]
836
837L512_or_more_neon:
838	sub	sp,sp,#128+64
839
840	ldp	x22,x23,[x5]		// load sigma
841	ld1	{v24.4s},[x5],#16
842	ldp	x24,x25,[x3]		// load key
843	ldp	x26,x27,[x3,#16]
844	ld1	{v25.4s,v26.4s},[x3]
845	ldp	x28,x30,[x4]		// load counter
846	ld1	{v27.4s},[x4]
847	ld1	{v31.4s},[x5]
848#ifdef	__ARMEB__
849	rev64	v24.4s,v24.4s
850	ror	x24,x24,#32
851	ror	x25,x25,#32
852	ror	x26,x26,#32
853	ror	x27,x27,#32
854	ror	x28,x28,#32
855	ror	x30,x30,#32
856#endif
857	add	v27.4s,v27.4s,v31.4s		// += 1
858	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
859	add	v27.4s,v27.4s,v31.4s		// not typo
860	str	q26,[sp,#32]
861	add	v28.4s,v27.4s,v31.4s
862	add	v29.4s,v28.4s,v31.4s
863	add	v30.4s,v29.4s,v31.4s
864	shl	v31.4s,v31.4s,#2			// 1 -> 4
865
866	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
867	stp	d10,d11,[sp,#128+16]
868	stp	d12,d13,[sp,#128+32]
869	stp	d14,d15,[sp,#128+48]
870
871	sub	x2,x2,#512			// not typo
872
873Loop_outer_512_neon:
874	mov	v0.16b,v24.16b
875	mov	v4.16b,v24.16b
876	mov	v8.16b,v24.16b
877	mov	v12.16b,v24.16b
878	mov	v16.16b,v24.16b
879	mov	v20.16b,v24.16b
880	mov	v1.16b,v25.16b
881	mov	w5,w22			// unpack key block
882	mov	v5.16b,v25.16b
883	lsr	x6,x22,#32
884	mov	v9.16b,v25.16b
885	mov	w7,w23
886	mov	v13.16b,v25.16b
887	lsr	x8,x23,#32
888	mov	v17.16b,v25.16b
889	mov	w9,w24
890	mov	v21.16b,v25.16b
891	lsr	x10,x24,#32
892	mov	v3.16b,v27.16b
893	mov	w11,w25
894	mov	v7.16b,v28.16b
895	lsr	x12,x25,#32
896	mov	v11.16b,v29.16b
897	mov	w13,w26
898	mov	v15.16b,v30.16b
899	lsr	x14,x26,#32
900	mov	v2.16b,v26.16b
901	mov	w15,w27
902	mov	v6.16b,v26.16b
903	lsr	x16,x27,#32
904	add	v19.4s,v3.4s,v31.4s			// +4
905	mov	w17,w28
906	add	v23.4s,v7.4s,v31.4s			// +4
907	lsr	x19,x28,#32
908	mov	v10.16b,v26.16b
909	mov	w20,w30
910	mov	v14.16b,v26.16b
911	lsr	x21,x30,#32
912	mov	v18.16b,v26.16b
913	stp	q27,q28,[sp,#48]		// off-load key block, variable part
914	mov	v22.16b,v26.16b
915	str	q29,[sp,#80]
916
917	mov	x4,#5
918	subs	x2,x2,#512
919Loop_upper_neon:
920	sub	x4,x4,#1
921	add	v0.4s,v0.4s,v1.4s
922	add	w5,w5,w9
923	add	v4.4s,v4.4s,v5.4s
924	add	w6,w6,w10
925	add	v8.4s,v8.4s,v9.4s
926	add	w7,w7,w11
927	add	v12.4s,v12.4s,v13.4s
928	add	w8,w8,w12
929	add	v16.4s,v16.4s,v17.4s
930	eor	w17,w17,w5
931	add	v20.4s,v20.4s,v21.4s
932	eor	w19,w19,w6
933	eor	v3.16b,v3.16b,v0.16b
934	eor	w20,w20,w7
935	eor	v7.16b,v7.16b,v4.16b
936	eor	w21,w21,w8
937	eor	v11.16b,v11.16b,v8.16b
938	ror	w17,w17,#16
939	eor	v15.16b,v15.16b,v12.16b
940	ror	w19,w19,#16
941	eor	v19.16b,v19.16b,v16.16b
942	ror	w20,w20,#16
943	eor	v23.16b,v23.16b,v20.16b
944	ror	w21,w21,#16
945	rev32	v3.8h,v3.8h
946	add	w13,w13,w17
947	rev32	v7.8h,v7.8h
948	add	w14,w14,w19
949	rev32	v11.8h,v11.8h
950	add	w15,w15,w20
951	rev32	v15.8h,v15.8h
952	add	w16,w16,w21
953	rev32	v19.8h,v19.8h
954	eor	w9,w9,w13
955	rev32	v23.8h,v23.8h
956	eor	w10,w10,w14
957	add	v2.4s,v2.4s,v3.4s
958	eor	w11,w11,w15
959	add	v6.4s,v6.4s,v7.4s
960	eor	w12,w12,w16
961	add	v10.4s,v10.4s,v11.4s
962	ror	w9,w9,#20
963	add	v14.4s,v14.4s,v15.4s
964	ror	w10,w10,#20
965	add	v18.4s,v18.4s,v19.4s
966	ror	w11,w11,#20
967	add	v22.4s,v22.4s,v23.4s
968	ror	w12,w12,#20
969	eor	v24.16b,v1.16b,v2.16b
970	add	w5,w5,w9
971	eor	v25.16b,v5.16b,v6.16b
972	add	w6,w6,w10
973	eor	v26.16b,v9.16b,v10.16b
974	add	w7,w7,w11
975	eor	v27.16b,v13.16b,v14.16b
976	add	w8,w8,w12
977	eor	v28.16b,v17.16b,v18.16b
978	eor	w17,w17,w5
979	eor	v29.16b,v21.16b,v22.16b
980	eor	w19,w19,w6
981	ushr	v1.4s,v24.4s,#20
982	eor	w20,w20,w7
983	ushr	v5.4s,v25.4s,#20
984	eor	w21,w21,w8
985	ushr	v9.4s,v26.4s,#20
986	ror	w17,w17,#24
987	ushr	v13.4s,v27.4s,#20
988	ror	w19,w19,#24
989	ushr	v17.4s,v28.4s,#20
990	ror	w20,w20,#24
991	ushr	v21.4s,v29.4s,#20
992	ror	w21,w21,#24
993	sli	v1.4s,v24.4s,#12
994	add	w13,w13,w17
995	sli	v5.4s,v25.4s,#12
996	add	w14,w14,w19
997	sli	v9.4s,v26.4s,#12
998	add	w15,w15,w20
999	sli	v13.4s,v27.4s,#12
1000	add	w16,w16,w21
1001	sli	v17.4s,v28.4s,#12
1002	eor	w9,w9,w13
1003	sli	v21.4s,v29.4s,#12
1004	eor	w10,w10,w14
1005	add	v0.4s,v0.4s,v1.4s
1006	eor	w11,w11,w15
1007	add	v4.4s,v4.4s,v5.4s
1008	eor	w12,w12,w16
1009	add	v8.4s,v8.4s,v9.4s
1010	ror	w9,w9,#25
1011	add	v12.4s,v12.4s,v13.4s
1012	ror	w10,w10,#25
1013	add	v16.4s,v16.4s,v17.4s
1014	ror	w11,w11,#25
1015	add	v20.4s,v20.4s,v21.4s
1016	ror	w12,w12,#25
1017	eor	v24.16b,v3.16b,v0.16b
1018	add	w5,w5,w10
1019	eor	v25.16b,v7.16b,v4.16b
1020	add	w6,w6,w11
1021	eor	v26.16b,v11.16b,v8.16b
1022	add	w7,w7,w12
1023	eor	v27.16b,v15.16b,v12.16b
1024	add	w8,w8,w9
1025	eor	v28.16b,v19.16b,v16.16b
1026	eor	w21,w21,w5
1027	eor	v29.16b,v23.16b,v20.16b
1028	eor	w17,w17,w6
1029	ushr	v3.4s,v24.4s,#24
1030	eor	w19,w19,w7
1031	ushr	v7.4s,v25.4s,#24
1032	eor	w20,w20,w8
1033	ushr	v11.4s,v26.4s,#24
1034	ror	w21,w21,#16
1035	ushr	v15.4s,v27.4s,#24
1036	ror	w17,w17,#16
1037	ushr	v19.4s,v28.4s,#24
1038	ror	w19,w19,#16
1039	ushr	v23.4s,v29.4s,#24
1040	ror	w20,w20,#16
1041	sli	v3.4s,v24.4s,#8
1042	add	w15,w15,w21
1043	sli	v7.4s,v25.4s,#8
1044	add	w16,w16,w17
1045	sli	v11.4s,v26.4s,#8
1046	add	w13,w13,w19
1047	sli	v15.4s,v27.4s,#8
1048	add	w14,w14,w20
1049	sli	v19.4s,v28.4s,#8
1050	eor	w10,w10,w15
1051	sli	v23.4s,v29.4s,#8
1052	eor	w11,w11,w16
1053	add	v2.4s,v2.4s,v3.4s
1054	eor	w12,w12,w13
1055	add	v6.4s,v6.4s,v7.4s
1056	eor	w9,w9,w14
1057	add	v10.4s,v10.4s,v11.4s
1058	ror	w10,w10,#20
1059	add	v14.4s,v14.4s,v15.4s
1060	ror	w11,w11,#20
1061	add	v18.4s,v18.4s,v19.4s
1062	ror	w12,w12,#20
1063	add	v22.4s,v22.4s,v23.4s
1064	ror	w9,w9,#20
1065	eor	v24.16b,v1.16b,v2.16b
1066	add	w5,w5,w10
1067	eor	v25.16b,v5.16b,v6.16b
1068	add	w6,w6,w11
1069	eor	v26.16b,v9.16b,v10.16b
1070	add	w7,w7,w12
1071	eor	v27.16b,v13.16b,v14.16b
1072	add	w8,w8,w9
1073	eor	v28.16b,v17.16b,v18.16b
1074	eor	w21,w21,w5
1075	eor	v29.16b,v21.16b,v22.16b
1076	eor	w17,w17,w6
1077	ushr	v1.4s,v24.4s,#25
1078	eor	w19,w19,w7
1079	ushr	v5.4s,v25.4s,#25
1080	eor	w20,w20,w8
1081	ushr	v9.4s,v26.4s,#25
1082	ror	w21,w21,#24
1083	ushr	v13.4s,v27.4s,#25
1084	ror	w17,w17,#24
1085	ushr	v17.4s,v28.4s,#25
1086	ror	w19,w19,#24
1087	ushr	v21.4s,v29.4s,#25
1088	ror	w20,w20,#24
1089	sli	v1.4s,v24.4s,#7
1090	add	w15,w15,w21
1091	sli	v5.4s,v25.4s,#7
1092	add	w16,w16,w17
1093	sli	v9.4s,v26.4s,#7
1094	add	w13,w13,w19
1095	sli	v13.4s,v27.4s,#7
1096	add	w14,w14,w20
1097	sli	v17.4s,v28.4s,#7
1098	eor	w10,w10,w15
1099	sli	v21.4s,v29.4s,#7
1100	eor	w11,w11,w16
1101	ext	v2.16b,v2.16b,v2.16b,#8
1102	eor	w12,w12,w13
1103	ext	v6.16b,v6.16b,v6.16b,#8
1104	eor	w9,w9,w14
1105	ext	v10.16b,v10.16b,v10.16b,#8
1106	ror	w10,w10,#25
1107	ext	v14.16b,v14.16b,v14.16b,#8
1108	ror	w11,w11,#25
1109	ext	v18.16b,v18.16b,v18.16b,#8
1110	ror	w12,w12,#25
1111	ext	v22.16b,v22.16b,v22.16b,#8
1112	ror	w9,w9,#25
1113	ext	v3.16b,v3.16b,v3.16b,#12
1114	ext	v7.16b,v7.16b,v7.16b,#12
1115	ext	v11.16b,v11.16b,v11.16b,#12
1116	ext	v15.16b,v15.16b,v15.16b,#12
1117	ext	v19.16b,v19.16b,v19.16b,#12
1118	ext	v23.16b,v23.16b,v23.16b,#12
1119	ext	v1.16b,v1.16b,v1.16b,#4
1120	ext	v5.16b,v5.16b,v5.16b,#4
1121	ext	v9.16b,v9.16b,v9.16b,#4
1122	ext	v13.16b,v13.16b,v13.16b,#4
1123	ext	v17.16b,v17.16b,v17.16b,#4
1124	ext	v21.16b,v21.16b,v21.16b,#4
1125	add	v0.4s,v0.4s,v1.4s
1126	add	w5,w5,w9
1127	add	v4.4s,v4.4s,v5.4s
1128	add	w6,w6,w10
1129	add	v8.4s,v8.4s,v9.4s
1130	add	w7,w7,w11
1131	add	v12.4s,v12.4s,v13.4s
1132	add	w8,w8,w12
1133	add	v16.4s,v16.4s,v17.4s
1134	eor	w17,w17,w5
1135	add	v20.4s,v20.4s,v21.4s
1136	eor	w19,w19,w6
1137	eor	v3.16b,v3.16b,v0.16b
1138	eor	w20,w20,w7
1139	eor	v7.16b,v7.16b,v4.16b
1140	eor	w21,w21,w8
1141	eor	v11.16b,v11.16b,v8.16b
1142	ror	w17,w17,#16
1143	eor	v15.16b,v15.16b,v12.16b
1144	ror	w19,w19,#16
1145	eor	v19.16b,v19.16b,v16.16b
1146	ror	w20,w20,#16
1147	eor	v23.16b,v23.16b,v20.16b
1148	ror	w21,w21,#16
1149	rev32	v3.8h,v3.8h
1150	add	w13,w13,w17
1151	rev32	v7.8h,v7.8h
1152	add	w14,w14,w19
1153	rev32	v11.8h,v11.8h
1154	add	w15,w15,w20
1155	rev32	v15.8h,v15.8h
1156	add	w16,w16,w21
1157	rev32	v19.8h,v19.8h
1158	eor	w9,w9,w13
1159	rev32	v23.8h,v23.8h
1160	eor	w10,w10,w14
1161	add	v2.4s,v2.4s,v3.4s
1162	eor	w11,w11,w15
1163	add	v6.4s,v6.4s,v7.4s
1164	eor	w12,w12,w16
1165	add	v10.4s,v10.4s,v11.4s
1166	ror	w9,w9,#20
1167	add	v14.4s,v14.4s,v15.4s
1168	ror	w10,w10,#20
1169	add	v18.4s,v18.4s,v19.4s
1170	ror	w11,w11,#20
1171	add	v22.4s,v22.4s,v23.4s
1172	ror	w12,w12,#20
1173	eor	v24.16b,v1.16b,v2.16b
1174	add	w5,w5,w9
1175	eor	v25.16b,v5.16b,v6.16b
1176	add	w6,w6,w10
1177	eor	v26.16b,v9.16b,v10.16b
1178	add	w7,w7,w11
1179	eor	v27.16b,v13.16b,v14.16b
1180	add	w8,w8,w12
1181	eor	v28.16b,v17.16b,v18.16b
1182	eor	w17,w17,w5
1183	eor	v29.16b,v21.16b,v22.16b
1184	eor	w19,w19,w6
1185	ushr	v1.4s,v24.4s,#20
1186	eor	w20,w20,w7
1187	ushr	v5.4s,v25.4s,#20
1188	eor	w21,w21,w8
1189	ushr	v9.4s,v26.4s,#20
1190	ror	w17,w17,#24
1191	ushr	v13.4s,v27.4s,#20
1192	ror	w19,w19,#24
1193	ushr	v17.4s,v28.4s,#20
1194	ror	w20,w20,#24
1195	ushr	v21.4s,v29.4s,#20
1196	ror	w21,w21,#24
1197	sli	v1.4s,v24.4s,#12
1198	add	w13,w13,w17
1199	sli	v5.4s,v25.4s,#12
1200	add	w14,w14,w19
1201	sli	v9.4s,v26.4s,#12
1202	add	w15,w15,w20
1203	sli	v13.4s,v27.4s,#12
1204	add	w16,w16,w21
1205	sli	v17.4s,v28.4s,#12
1206	eor	w9,w9,w13
1207	sli	v21.4s,v29.4s,#12
1208	eor	w10,w10,w14
1209	add	v0.4s,v0.4s,v1.4s
1210	eor	w11,w11,w15
1211	add	v4.4s,v4.4s,v5.4s
1212	eor	w12,w12,w16
1213	add	v8.4s,v8.4s,v9.4s
1214	ror	w9,w9,#25
1215	add	v12.4s,v12.4s,v13.4s
1216	ror	w10,w10,#25
1217	add	v16.4s,v16.4s,v17.4s
1218	ror	w11,w11,#25
1219	add	v20.4s,v20.4s,v21.4s
1220	ror	w12,w12,#25
1221	eor	v24.16b,v3.16b,v0.16b
1222	add	w5,w5,w10
1223	eor	v25.16b,v7.16b,v4.16b
1224	add	w6,w6,w11
1225	eor	v26.16b,v11.16b,v8.16b
1226	add	w7,w7,w12
1227	eor	v27.16b,v15.16b,v12.16b
1228	add	w8,w8,w9
1229	eor	v28.16b,v19.16b,v16.16b
1230	eor	w21,w21,w5
1231	eor	v29.16b,v23.16b,v20.16b
1232	eor	w17,w17,w6
1233	ushr	v3.4s,v24.4s,#24
1234	eor	w19,w19,w7
1235	ushr	v7.4s,v25.4s,#24
1236	eor	w20,w20,w8
1237	ushr	v11.4s,v26.4s,#24
1238	ror	w21,w21,#16
1239	ushr	v15.4s,v27.4s,#24
1240	ror	w17,w17,#16
1241	ushr	v19.4s,v28.4s,#24
1242	ror	w19,w19,#16
1243	ushr	v23.4s,v29.4s,#24
1244	ror	w20,w20,#16
1245	sli	v3.4s,v24.4s,#8
1246	add	w15,w15,w21
1247	sli	v7.4s,v25.4s,#8
1248	add	w16,w16,w17
1249	sli	v11.4s,v26.4s,#8
1250	add	w13,w13,w19
1251	sli	v15.4s,v27.4s,#8
1252	add	w14,w14,w20
1253	sli	v19.4s,v28.4s,#8
1254	eor	w10,w10,w15
1255	sli	v23.4s,v29.4s,#8
1256	eor	w11,w11,w16
1257	add	v2.4s,v2.4s,v3.4s
1258	eor	w12,w12,w13
1259	add	v6.4s,v6.4s,v7.4s
1260	eor	w9,w9,w14
1261	add	v10.4s,v10.4s,v11.4s
1262	ror	w10,w10,#20
1263	add	v14.4s,v14.4s,v15.4s
1264	ror	w11,w11,#20
1265	add	v18.4s,v18.4s,v19.4s
1266	ror	w12,w12,#20
1267	add	v22.4s,v22.4s,v23.4s
1268	ror	w9,w9,#20
1269	eor	v24.16b,v1.16b,v2.16b
1270	add	w5,w5,w10
1271	eor	v25.16b,v5.16b,v6.16b
1272	add	w6,w6,w11
1273	eor	v26.16b,v9.16b,v10.16b
1274	add	w7,w7,w12
1275	eor	v27.16b,v13.16b,v14.16b
1276	add	w8,w8,w9
1277	eor	v28.16b,v17.16b,v18.16b
1278	eor	w21,w21,w5
1279	eor	v29.16b,v21.16b,v22.16b
1280	eor	w17,w17,w6
1281	ushr	v1.4s,v24.4s,#25
1282	eor	w19,w19,w7
1283	ushr	v5.4s,v25.4s,#25
1284	eor	w20,w20,w8
1285	ushr	v9.4s,v26.4s,#25
1286	ror	w21,w21,#24
1287	ushr	v13.4s,v27.4s,#25
1288	ror	w17,w17,#24
1289	ushr	v17.4s,v28.4s,#25
1290	ror	w19,w19,#24
1291	ushr	v21.4s,v29.4s,#25
1292	ror	w20,w20,#24
1293	sli	v1.4s,v24.4s,#7
1294	add	w15,w15,w21
1295	sli	v5.4s,v25.4s,#7
1296	add	w16,w16,w17
1297	sli	v9.4s,v26.4s,#7
1298	add	w13,w13,w19
1299	sli	v13.4s,v27.4s,#7
1300	add	w14,w14,w20
1301	sli	v17.4s,v28.4s,#7
1302	eor	w10,w10,w15
1303	sli	v21.4s,v29.4s,#7
1304	eor	w11,w11,w16
1305	ext	v2.16b,v2.16b,v2.16b,#8
1306	eor	w12,w12,w13
1307	ext	v6.16b,v6.16b,v6.16b,#8
1308	eor	w9,w9,w14
1309	ext	v10.16b,v10.16b,v10.16b,#8
1310	ror	w10,w10,#25
1311	ext	v14.16b,v14.16b,v14.16b,#8
1312	ror	w11,w11,#25
1313	ext	v18.16b,v18.16b,v18.16b,#8
1314	ror	w12,w12,#25
1315	ext	v22.16b,v22.16b,v22.16b,#8
1316	ror	w9,w9,#25
1317	ext	v3.16b,v3.16b,v3.16b,#4
1318	ext	v7.16b,v7.16b,v7.16b,#4
1319	ext	v11.16b,v11.16b,v11.16b,#4
1320	ext	v15.16b,v15.16b,v15.16b,#4
1321	ext	v19.16b,v19.16b,v19.16b,#4
1322	ext	v23.16b,v23.16b,v23.16b,#4
1323	ext	v1.16b,v1.16b,v1.16b,#12
1324	ext	v5.16b,v5.16b,v5.16b,#12
1325	ext	v9.16b,v9.16b,v9.16b,#12
1326	ext	v13.16b,v13.16b,v13.16b,#12
1327	ext	v17.16b,v17.16b,v17.16b,#12
1328	ext	v21.16b,v21.16b,v21.16b,#12
1329	cbnz	x4,Loop_upper_neon
1330
1331	add	w5,w5,w22		// accumulate key block
1332	add	x6,x6,x22,lsr#32
1333	add	w7,w7,w23
1334	add	x8,x8,x23,lsr#32
1335	add	w9,w9,w24
1336	add	x10,x10,x24,lsr#32
1337	add	w11,w11,w25
1338	add	x12,x12,x25,lsr#32
1339	add	w13,w13,w26
1340	add	x14,x14,x26,lsr#32
1341	add	w15,w15,w27
1342	add	x16,x16,x27,lsr#32
1343	add	w17,w17,w28
1344	add	x19,x19,x28,lsr#32
1345	add	w20,w20,w30
1346	add	x21,x21,x30,lsr#32
1347
1348	add	x5,x5,x6,lsl#32	// pack
1349	add	x7,x7,x8,lsl#32
1350	ldp	x6,x8,[x1,#0]		// load input
1351	add	x9,x9,x10,lsl#32
1352	add	x11,x11,x12,lsl#32
1353	ldp	x10,x12,[x1,#16]
1354	add	x13,x13,x14,lsl#32
1355	add	x15,x15,x16,lsl#32
1356	ldp	x14,x16,[x1,#32]
1357	add	x17,x17,x19,lsl#32
1358	add	x20,x20,x21,lsl#32
1359	ldp	x19,x21,[x1,#48]
1360	add	x1,x1,#64
1361#ifdef	__ARMEB__
1362	rev	x5,x5
1363	rev	x7,x7
1364	rev	x9,x9
1365	rev	x11,x11
1366	rev	x13,x13
1367	rev	x15,x15
1368	rev	x17,x17
1369	rev	x20,x20
1370#endif
1371	eor	x5,x5,x6
1372	eor	x7,x7,x8
1373	eor	x9,x9,x10
1374	eor	x11,x11,x12
1375	eor	x13,x13,x14
1376	eor	x15,x15,x16
1377	eor	x17,x17,x19
1378	eor	x20,x20,x21
1379
1380	stp	x5,x7,[x0,#0]		// store output
1381	add	x28,x28,#1			// increment counter
1382	mov	w5,w22			// unpack key block
1383	lsr	x6,x22,#32
1384	stp	x9,x11,[x0,#16]
1385	mov	w7,w23
1386	lsr	x8,x23,#32
1387	stp	x13,x15,[x0,#32]
1388	mov	w9,w24
1389	lsr	x10,x24,#32
1390	stp	x17,x20,[x0,#48]
1391	add	x0,x0,#64
1392	mov	w11,w25
1393	lsr	x12,x25,#32
1394	mov	w13,w26
1395	lsr	x14,x26,#32
1396	mov	w15,w27
1397	lsr	x16,x27,#32
1398	mov	w17,w28
1399	lsr	x19,x28,#32
1400	mov	w20,w30
1401	lsr	x21,x30,#32
1402
1403	mov	x4,#5
1404Loop_lower_neon:
1405	sub	x4,x4,#1
1406	add	v0.4s,v0.4s,v1.4s
1407	add	w5,w5,w9
1408	add	v4.4s,v4.4s,v5.4s
1409	add	w6,w6,w10
1410	add	v8.4s,v8.4s,v9.4s
1411	add	w7,w7,w11
1412	add	v12.4s,v12.4s,v13.4s
1413	add	w8,w8,w12
1414	add	v16.4s,v16.4s,v17.4s
1415	eor	w17,w17,w5
1416	add	v20.4s,v20.4s,v21.4s
1417	eor	w19,w19,w6
1418	eor	v3.16b,v3.16b,v0.16b
1419	eor	w20,w20,w7
1420	eor	v7.16b,v7.16b,v4.16b
1421	eor	w21,w21,w8
1422	eor	v11.16b,v11.16b,v8.16b
1423	ror	w17,w17,#16
1424	eor	v15.16b,v15.16b,v12.16b
1425	ror	w19,w19,#16
1426	eor	v19.16b,v19.16b,v16.16b
1427	ror	w20,w20,#16
1428	eor	v23.16b,v23.16b,v20.16b
1429	ror	w21,w21,#16
1430	rev32	v3.8h,v3.8h
1431	add	w13,w13,w17
1432	rev32	v7.8h,v7.8h
1433	add	w14,w14,w19
1434	rev32	v11.8h,v11.8h
1435	add	w15,w15,w20
1436	rev32	v15.8h,v15.8h
1437	add	w16,w16,w21
1438	rev32	v19.8h,v19.8h
1439	eor	w9,w9,w13
1440	rev32	v23.8h,v23.8h
1441	eor	w10,w10,w14
1442	add	v2.4s,v2.4s,v3.4s
1443	eor	w11,w11,w15
1444	add	v6.4s,v6.4s,v7.4s
1445	eor	w12,w12,w16
1446	add	v10.4s,v10.4s,v11.4s
1447	ror	w9,w9,#20
1448	add	v14.4s,v14.4s,v15.4s
1449	ror	w10,w10,#20
1450	add	v18.4s,v18.4s,v19.4s
1451	ror	w11,w11,#20
1452	add	v22.4s,v22.4s,v23.4s
1453	ror	w12,w12,#20
1454	eor	v24.16b,v1.16b,v2.16b
1455	add	w5,w5,w9
1456	eor	v25.16b,v5.16b,v6.16b
1457	add	w6,w6,w10
1458	eor	v26.16b,v9.16b,v10.16b
1459	add	w7,w7,w11
1460	eor	v27.16b,v13.16b,v14.16b
1461	add	w8,w8,w12
1462	eor	v28.16b,v17.16b,v18.16b
1463	eor	w17,w17,w5
1464	eor	v29.16b,v21.16b,v22.16b
1465	eor	w19,w19,w6
1466	ushr	v1.4s,v24.4s,#20
1467	eor	w20,w20,w7
1468	ushr	v5.4s,v25.4s,#20
1469	eor	w21,w21,w8
1470	ushr	v9.4s,v26.4s,#20
1471	ror	w17,w17,#24
1472	ushr	v13.4s,v27.4s,#20
1473	ror	w19,w19,#24
1474	ushr	v17.4s,v28.4s,#20
1475	ror	w20,w20,#24
1476	ushr	v21.4s,v29.4s,#20
1477	ror	w21,w21,#24
1478	sli	v1.4s,v24.4s,#12
1479	add	w13,w13,w17
1480	sli	v5.4s,v25.4s,#12
1481	add	w14,w14,w19
1482	sli	v9.4s,v26.4s,#12
1483	add	w15,w15,w20
1484	sli	v13.4s,v27.4s,#12
1485	add	w16,w16,w21
1486	sli	v17.4s,v28.4s,#12
1487	eor	w9,w9,w13
1488	sli	v21.4s,v29.4s,#12
1489	eor	w10,w10,w14
1490	add	v0.4s,v0.4s,v1.4s
1491	eor	w11,w11,w15
1492	add	v4.4s,v4.4s,v5.4s
1493	eor	w12,w12,w16
1494	add	v8.4s,v8.4s,v9.4s
1495	ror	w9,w9,#25
1496	add	v12.4s,v12.4s,v13.4s
1497	ror	w10,w10,#25
1498	add	v16.4s,v16.4s,v17.4s
1499	ror	w11,w11,#25
1500	add	v20.4s,v20.4s,v21.4s
1501	ror	w12,w12,#25
1502	eor	v24.16b,v3.16b,v0.16b
1503	add	w5,w5,w10
1504	eor	v25.16b,v7.16b,v4.16b
1505	add	w6,w6,w11
1506	eor	v26.16b,v11.16b,v8.16b
1507	add	w7,w7,w12
1508	eor	v27.16b,v15.16b,v12.16b
1509	add	w8,w8,w9
1510	eor	v28.16b,v19.16b,v16.16b
1511	eor	w21,w21,w5
1512	eor	v29.16b,v23.16b,v20.16b
1513	eor	w17,w17,w6
1514	ushr	v3.4s,v24.4s,#24
1515	eor	w19,w19,w7
1516	ushr	v7.4s,v25.4s,#24
1517	eor	w20,w20,w8
1518	ushr	v11.4s,v26.4s,#24
1519	ror	w21,w21,#16
1520	ushr	v15.4s,v27.4s,#24
1521	ror	w17,w17,#16
1522	ushr	v19.4s,v28.4s,#24
1523	ror	w19,w19,#16
1524	ushr	v23.4s,v29.4s,#24
1525	ror	w20,w20,#16
1526	sli	v3.4s,v24.4s,#8
1527	add	w15,w15,w21
1528	sli	v7.4s,v25.4s,#8
1529	add	w16,w16,w17
1530	sli	v11.4s,v26.4s,#8
1531	add	w13,w13,w19
1532	sli	v15.4s,v27.4s,#8
1533	add	w14,w14,w20
1534	sli	v19.4s,v28.4s,#8
1535	eor	w10,w10,w15
1536	sli	v23.4s,v29.4s,#8
1537	eor	w11,w11,w16
1538	add	v2.4s,v2.4s,v3.4s
1539	eor	w12,w12,w13
1540	add	v6.4s,v6.4s,v7.4s
1541	eor	w9,w9,w14
1542	add	v10.4s,v10.4s,v11.4s
1543	ror	w10,w10,#20
1544	add	v14.4s,v14.4s,v15.4s
1545	ror	w11,w11,#20
1546	add	v18.4s,v18.4s,v19.4s
1547	ror	w12,w12,#20
1548	add	v22.4s,v22.4s,v23.4s
1549	ror	w9,w9,#20
1550	eor	v24.16b,v1.16b,v2.16b
1551	add	w5,w5,w10
1552	eor	v25.16b,v5.16b,v6.16b
1553	add	w6,w6,w11
1554	eor	v26.16b,v9.16b,v10.16b
1555	add	w7,w7,w12
1556	eor	v27.16b,v13.16b,v14.16b
1557	add	w8,w8,w9
1558	eor	v28.16b,v17.16b,v18.16b
1559	eor	w21,w21,w5
1560	eor	v29.16b,v21.16b,v22.16b
1561	eor	w17,w17,w6
1562	ushr	v1.4s,v24.4s,#25
1563	eor	w19,w19,w7
1564	ushr	v5.4s,v25.4s,#25
1565	eor	w20,w20,w8
1566	ushr	v9.4s,v26.4s,#25
1567	ror	w21,w21,#24
1568	ushr	v13.4s,v27.4s,#25
1569	ror	w17,w17,#24
1570	ushr	v17.4s,v28.4s,#25
1571	ror	w19,w19,#24
1572	ushr	v21.4s,v29.4s,#25
1573	ror	w20,w20,#24
1574	sli	v1.4s,v24.4s,#7
1575	add	w15,w15,w21
1576	sli	v5.4s,v25.4s,#7
1577	add	w16,w16,w17
1578	sli	v9.4s,v26.4s,#7
1579	add	w13,w13,w19
1580	sli	v13.4s,v27.4s,#7
1581	add	w14,w14,w20
1582	sli	v17.4s,v28.4s,#7
1583	eor	w10,w10,w15
1584	sli	v21.4s,v29.4s,#7
1585	eor	w11,w11,w16
1586	ext	v2.16b,v2.16b,v2.16b,#8
1587	eor	w12,w12,w13
1588	ext	v6.16b,v6.16b,v6.16b,#8
1589	eor	w9,w9,w14
1590	ext	v10.16b,v10.16b,v10.16b,#8
1591	ror	w10,w10,#25
1592	ext	v14.16b,v14.16b,v14.16b,#8
1593	ror	w11,w11,#25
1594	ext	v18.16b,v18.16b,v18.16b,#8
1595	ror	w12,w12,#25
1596	ext	v22.16b,v22.16b,v22.16b,#8
1597	ror	w9,w9,#25
1598	ext	v3.16b,v3.16b,v3.16b,#12
1599	ext	v7.16b,v7.16b,v7.16b,#12
1600	ext	v11.16b,v11.16b,v11.16b,#12
1601	ext	v15.16b,v15.16b,v15.16b,#12
1602	ext	v19.16b,v19.16b,v19.16b,#12
1603	ext	v23.16b,v23.16b,v23.16b,#12
1604	ext	v1.16b,v1.16b,v1.16b,#4
1605	ext	v5.16b,v5.16b,v5.16b,#4
1606	ext	v9.16b,v9.16b,v9.16b,#4
1607	ext	v13.16b,v13.16b,v13.16b,#4
1608	ext	v17.16b,v17.16b,v17.16b,#4
1609	ext	v21.16b,v21.16b,v21.16b,#4
1610	add	v0.4s,v0.4s,v1.4s
1611	add	w5,w5,w9
1612	add	v4.4s,v4.4s,v5.4s
1613	add	w6,w6,w10
1614	add	v8.4s,v8.4s,v9.4s
1615	add	w7,w7,w11
1616	add	v12.4s,v12.4s,v13.4s
1617	add	w8,w8,w12
1618	add	v16.4s,v16.4s,v17.4s
1619	eor	w17,w17,w5
1620	add	v20.4s,v20.4s,v21.4s
1621	eor	w19,w19,w6
1622	eor	v3.16b,v3.16b,v0.16b
1623	eor	w20,w20,w7
1624	eor	v7.16b,v7.16b,v4.16b
1625	eor	w21,w21,w8
1626	eor	v11.16b,v11.16b,v8.16b
1627	ror	w17,w17,#16
1628	eor	v15.16b,v15.16b,v12.16b
1629	ror	w19,w19,#16
1630	eor	v19.16b,v19.16b,v16.16b
1631	ror	w20,w20,#16
1632	eor	v23.16b,v23.16b,v20.16b
1633	ror	w21,w21,#16
1634	rev32	v3.8h,v3.8h
1635	add	w13,w13,w17
1636	rev32	v7.8h,v7.8h
1637	add	w14,w14,w19
1638	rev32	v11.8h,v11.8h
1639	add	w15,w15,w20
1640	rev32	v15.8h,v15.8h
1641	add	w16,w16,w21
1642	rev32	v19.8h,v19.8h
1643	eor	w9,w9,w13
1644	rev32	v23.8h,v23.8h
1645	eor	w10,w10,w14
1646	add	v2.4s,v2.4s,v3.4s
1647	eor	w11,w11,w15
1648	add	v6.4s,v6.4s,v7.4s
1649	eor	w12,w12,w16
1650	add	v10.4s,v10.4s,v11.4s
1651	ror	w9,w9,#20
1652	add	v14.4s,v14.4s,v15.4s
1653	ror	w10,w10,#20
1654	add	v18.4s,v18.4s,v19.4s
1655	ror	w11,w11,#20
1656	add	v22.4s,v22.4s,v23.4s
1657	ror	w12,w12,#20
1658	eor	v24.16b,v1.16b,v2.16b
1659	add	w5,w5,w9
1660	eor	v25.16b,v5.16b,v6.16b
1661	add	w6,w6,w10
1662	eor	v26.16b,v9.16b,v10.16b
1663	add	w7,w7,w11
1664	eor	v27.16b,v13.16b,v14.16b
1665	add	w8,w8,w12
1666	eor	v28.16b,v17.16b,v18.16b
1667	eor	w17,w17,w5
1668	eor	v29.16b,v21.16b,v22.16b
1669	eor	w19,w19,w6
1670	ushr	v1.4s,v24.4s,#20
1671	eor	w20,w20,w7
1672	ushr	v5.4s,v25.4s,#20
1673	eor	w21,w21,w8
1674	ushr	v9.4s,v26.4s,#20
1675	ror	w17,w17,#24
1676	ushr	v13.4s,v27.4s,#20
1677	ror	w19,w19,#24
1678	ushr	v17.4s,v28.4s,#20
1679	ror	w20,w20,#24
1680	ushr	v21.4s,v29.4s,#20
1681	ror	w21,w21,#24
1682	sli	v1.4s,v24.4s,#12
1683	add	w13,w13,w17
1684	sli	v5.4s,v25.4s,#12
1685	add	w14,w14,w19
1686	sli	v9.4s,v26.4s,#12
1687	add	w15,w15,w20
1688	sli	v13.4s,v27.4s,#12
1689	add	w16,w16,w21
1690	sli	v17.4s,v28.4s,#12
1691	eor	w9,w9,w13
1692	sli	v21.4s,v29.4s,#12
1693	eor	w10,w10,w14
1694	add	v0.4s,v0.4s,v1.4s
1695	eor	w11,w11,w15
1696	add	v4.4s,v4.4s,v5.4s
1697	eor	w12,w12,w16
1698	add	v8.4s,v8.4s,v9.4s
1699	ror	w9,w9,#25
1700	add	v12.4s,v12.4s,v13.4s
1701	ror	w10,w10,#25
1702	add	v16.4s,v16.4s,v17.4s
1703	ror	w11,w11,#25
1704	add	v20.4s,v20.4s,v21.4s
1705	ror	w12,w12,#25
1706	eor	v24.16b,v3.16b,v0.16b
1707	add	w5,w5,w10
1708	eor	v25.16b,v7.16b,v4.16b
1709	add	w6,w6,w11
1710	eor	v26.16b,v11.16b,v8.16b
1711	add	w7,w7,w12
1712	eor	v27.16b,v15.16b,v12.16b
1713	add	w8,w8,w9
1714	eor	v28.16b,v19.16b,v16.16b
1715	eor	w21,w21,w5
1716	eor	v29.16b,v23.16b,v20.16b
1717	eor	w17,w17,w6
1718	ushr	v3.4s,v24.4s,#24
1719	eor	w19,w19,w7
1720	ushr	v7.4s,v25.4s,#24
1721	eor	w20,w20,w8
1722	ushr	v11.4s,v26.4s,#24
1723	ror	w21,w21,#16
1724	ushr	v15.4s,v27.4s,#24
1725	ror	w17,w17,#16
1726	ushr	v19.4s,v28.4s,#24
1727	ror	w19,w19,#16
1728	ushr	v23.4s,v29.4s,#24
1729	ror	w20,w20,#16
1730	sli	v3.4s,v24.4s,#8
1731	add	w15,w15,w21
1732	sli	v7.4s,v25.4s,#8
1733	add	w16,w16,w17
1734	sli	v11.4s,v26.4s,#8
1735	add	w13,w13,w19
1736	sli	v15.4s,v27.4s,#8
1737	add	w14,w14,w20
1738	sli	v19.4s,v28.4s,#8
1739	eor	w10,w10,w15
1740	sli	v23.4s,v29.4s,#8
1741	eor	w11,w11,w16
1742	add	v2.4s,v2.4s,v3.4s
1743	eor	w12,w12,w13
1744	add	v6.4s,v6.4s,v7.4s
1745	eor	w9,w9,w14
1746	add	v10.4s,v10.4s,v11.4s
1747	ror	w10,w10,#20
1748	add	v14.4s,v14.4s,v15.4s
1749	ror	w11,w11,#20
1750	add	v18.4s,v18.4s,v19.4s
1751	ror	w12,w12,#20
1752	add	v22.4s,v22.4s,v23.4s
1753	ror	w9,w9,#20
1754	eor	v24.16b,v1.16b,v2.16b
1755	add	w5,w5,w10
1756	eor	v25.16b,v5.16b,v6.16b
1757	add	w6,w6,w11
1758	eor	v26.16b,v9.16b,v10.16b
1759	add	w7,w7,w12
1760	eor	v27.16b,v13.16b,v14.16b
1761	add	w8,w8,w9
1762	eor	v28.16b,v17.16b,v18.16b
1763	eor	w21,w21,w5
1764	eor	v29.16b,v21.16b,v22.16b
1765	eor	w17,w17,w6
1766	ushr	v1.4s,v24.4s,#25
1767	eor	w19,w19,w7
1768	ushr	v5.4s,v25.4s,#25
1769	eor	w20,w20,w8
1770	ushr	v9.4s,v26.4s,#25
1771	ror	w21,w21,#24
1772	ushr	v13.4s,v27.4s,#25
1773	ror	w17,w17,#24
1774	ushr	v17.4s,v28.4s,#25
1775	ror	w19,w19,#24
1776	ushr	v21.4s,v29.4s,#25
1777	ror	w20,w20,#24
1778	sli	v1.4s,v24.4s,#7
1779	add	w15,w15,w21
1780	sli	v5.4s,v25.4s,#7
1781	add	w16,w16,w17
1782	sli	v9.4s,v26.4s,#7
1783	add	w13,w13,w19
1784	sli	v13.4s,v27.4s,#7
1785	add	w14,w14,w20
1786	sli	v17.4s,v28.4s,#7
1787	eor	w10,w10,w15
1788	sli	v21.4s,v29.4s,#7
1789	eor	w11,w11,w16
1790	ext	v2.16b,v2.16b,v2.16b,#8
1791	eor	w12,w12,w13
1792	ext	v6.16b,v6.16b,v6.16b,#8
1793	eor	w9,w9,w14
1794	ext	v10.16b,v10.16b,v10.16b,#8
1795	ror	w10,w10,#25
1796	ext	v14.16b,v14.16b,v14.16b,#8
1797	ror	w11,w11,#25
1798	ext	v18.16b,v18.16b,v18.16b,#8
1799	ror	w12,w12,#25
1800	ext	v22.16b,v22.16b,v22.16b,#8
1801	ror	w9,w9,#25
1802	ext	v3.16b,v3.16b,v3.16b,#4
1803	ext	v7.16b,v7.16b,v7.16b,#4
1804	ext	v11.16b,v11.16b,v11.16b,#4
1805	ext	v15.16b,v15.16b,v15.16b,#4
1806	ext	v19.16b,v19.16b,v19.16b,#4
1807	ext	v23.16b,v23.16b,v23.16b,#4
1808	ext	v1.16b,v1.16b,v1.16b,#12
1809	ext	v5.16b,v5.16b,v5.16b,#12
1810	ext	v9.16b,v9.16b,v9.16b,#12
1811	ext	v13.16b,v13.16b,v13.16b,#12
1812	ext	v17.16b,v17.16b,v17.16b,#12
1813	ext	v21.16b,v21.16b,v21.16b,#12
1814	cbnz	x4,Loop_lower_neon
1815
1816	add	w5,w5,w22		// accumulate key block
1817	ldp	q24,q25,[sp,#0]
1818	add	x6,x6,x22,lsr#32
1819	ldp	q26,q27,[sp,#32]
1820	add	w7,w7,w23
1821	ldp	q28,q29,[sp,#64]
1822	add	x8,x8,x23,lsr#32
1823	add	v0.4s,v0.4s,v24.4s
1824	add	w9,w9,w24
1825	add	v4.4s,v4.4s,v24.4s
1826	add	x10,x10,x24,lsr#32
1827	add	v8.4s,v8.4s,v24.4s
1828	add	w11,w11,w25
1829	add	v12.4s,v12.4s,v24.4s
1830	add	x12,x12,x25,lsr#32
1831	add	v16.4s,v16.4s,v24.4s
1832	add	w13,w13,w26
1833	add	v20.4s,v20.4s,v24.4s
1834	add	x14,x14,x26,lsr#32
1835	add	v2.4s,v2.4s,v26.4s
1836	add	w15,w15,w27
1837	add	v6.4s,v6.4s,v26.4s
1838	add	x16,x16,x27,lsr#32
1839	add	v10.4s,v10.4s,v26.4s
1840	add	w17,w17,w28
1841	add	v14.4s,v14.4s,v26.4s
1842	add	x19,x19,x28,lsr#32
1843	add	v18.4s,v18.4s,v26.4s
1844	add	w20,w20,w30
1845	add	v22.4s,v22.4s,v26.4s
1846	add	x21,x21,x30,lsr#32
1847	add	v19.4s,v19.4s,v31.4s			// +4
1848	add	x5,x5,x6,lsl#32	// pack
1849	add	v23.4s,v23.4s,v31.4s			// +4
1850	add	x7,x7,x8,lsl#32
1851	add	v3.4s,v3.4s,v27.4s
1852	ldp	x6,x8,[x1,#0]		// load input
1853	add	v7.4s,v7.4s,v28.4s
1854	add	x9,x9,x10,lsl#32
1855	add	v11.4s,v11.4s,v29.4s
1856	add	x11,x11,x12,lsl#32
1857	add	v15.4s,v15.4s,v30.4s
1858	ldp	x10,x12,[x1,#16]
1859	add	v19.4s,v19.4s,v27.4s
1860	add	x13,x13,x14,lsl#32
1861	add	v23.4s,v23.4s,v28.4s
1862	add	x15,x15,x16,lsl#32
1863	add	v1.4s,v1.4s,v25.4s
1864	ldp	x14,x16,[x1,#32]
1865	add	v5.4s,v5.4s,v25.4s
1866	add	x17,x17,x19,lsl#32
1867	add	v9.4s,v9.4s,v25.4s
1868	add	x20,x20,x21,lsl#32
1869	add	v13.4s,v13.4s,v25.4s
1870	ldp	x19,x21,[x1,#48]
1871	add	v17.4s,v17.4s,v25.4s
1872	add	x1,x1,#64
1873	add	v21.4s,v21.4s,v25.4s
1874
1875#ifdef	__ARMEB__
1876	rev	x5,x5
1877	rev	x7,x7
1878	rev	x9,x9
1879	rev	x11,x11
1880	rev	x13,x13
1881	rev	x15,x15
1882	rev	x17,x17
1883	rev	x20,x20
1884#endif
1885	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1886	eor	x5,x5,x6
1887	eor	x7,x7,x8
1888	eor	x9,x9,x10
1889	eor	x11,x11,x12
1890	eor	x13,x13,x14
1891	eor	v0.16b,v0.16b,v24.16b
1892	eor	x15,x15,x16
1893	eor	v1.16b,v1.16b,v25.16b
1894	eor	x17,x17,x19
1895	eor	v2.16b,v2.16b,v26.16b
1896	eor	x20,x20,x21
1897	eor	v3.16b,v3.16b,v27.16b
1898	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1899
1900	stp	x5,x7,[x0,#0]		// store output
1901	add	x28,x28,#7			// increment counter
1902	stp	x9,x11,[x0,#16]
1903	stp	x13,x15,[x0,#32]
1904	stp	x17,x20,[x0,#48]
1905	add	x0,x0,#64
1906	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1907
1908	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1909	eor	v4.16b,v4.16b,v24.16b
1910	eor	v5.16b,v5.16b,v25.16b
1911	eor	v6.16b,v6.16b,v26.16b
1912	eor	v7.16b,v7.16b,v27.16b
1913	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1914
1915	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1916	eor	v8.16b,v8.16b,v0.16b
1917	ldp	q24,q25,[sp,#0]
1918	eor	v9.16b,v9.16b,v1.16b
1919	ldp	q26,q27,[sp,#32]
1920	eor	v10.16b,v10.16b,v2.16b
1921	eor	v11.16b,v11.16b,v3.16b
1922	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1923
1924	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1925	eor	v12.16b,v12.16b,v4.16b
1926	eor	v13.16b,v13.16b,v5.16b
1927	eor	v14.16b,v14.16b,v6.16b
1928	eor	v15.16b,v15.16b,v7.16b
1929	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1930
1931	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1932	eor	v16.16b,v16.16b,v8.16b
1933	eor	v17.16b,v17.16b,v9.16b
1934	eor	v18.16b,v18.16b,v10.16b
1935	eor	v19.16b,v19.16b,v11.16b
1936	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1937
1938	shl	v0.4s,v31.4s,#1			// 4 -> 8
1939	eor	v20.16b,v20.16b,v12.16b
1940	eor	v21.16b,v21.16b,v13.16b
1941	eor	v22.16b,v22.16b,v14.16b
1942	eor	v23.16b,v23.16b,v15.16b
1943	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1944
1945	add	v27.4s,v27.4s,v0.4s			// += 8
1946	add	v28.4s,v28.4s,v0.4s
1947	add	v29.4s,v29.4s,v0.4s
1948	add	v30.4s,v30.4s,v0.4s
1949
1950	b.hs	Loop_outer_512_neon
1951
1952	adds	x2,x2,#512
1953	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1954
1955	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1956	ldp	d10,d11,[sp,#128+16]
1957	ldp	d12,d13,[sp,#128+32]
1958	ldp	d14,d15,[sp,#128+48]
1959
1960	stp	q24,q31,[sp,#0]		// wipe off-load area
1961	stp	q24,q31,[sp,#32]
1962	stp	q24,q31,[sp,#64]
1963
1964	b.eq	Ldone_512_neon
1965
1966	cmp	x2,#192
1967	sub	v27.4s,v27.4s,v0.4s			// -= 1
1968	sub	v28.4s,v28.4s,v0.4s
1969	sub	v29.4s,v29.4s,v0.4s
1970	add	sp,sp,#128
1971	b.hs	Loop_outer_neon
1972
1973	eor	v25.16b,v25.16b,v25.16b
1974	eor	v26.16b,v26.16b,v26.16b
1975	eor	v27.16b,v27.16b,v27.16b
1976	eor	v28.16b,v28.16b,v28.16b
1977	eor	v29.16b,v29.16b,v29.16b
1978	eor	v30.16b,v30.16b,v30.16b
1979	b	Loop_outer
1980
1981Ldone_512_neon:
1982	ldp	x19,x20,[x29,#16]
1983	add	sp,sp,#128+64
1984	ldp	x21,x22,[x29,#32]
1985	ldp	x23,x24,[x29,#48]
1986	ldp	x25,x26,[x29,#64]
1987	ldp	x27,x28,[x29,#80]
1988	ldp	x29,x30,[sp],#96
1989	AARCH64_VALIDATE_LINK_REGISTER
1990	ret
1991
1992#endif  // !OPENSSL_NO_ASM
1993