1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#include <GFp/arm_arch.h>
13
14
15.private_extern	_GFp_armcap_P
16
// Read-only constants shared by the ChaCha20 code paths in this file.
17.section	__TEXT,__const
18
19.align	5
// First 16 bytes of the ChaCha20 state. Read as little-endian 64-bit values,
// the bytes of these two quads spell the ASCII string "expand 32-byte k".
20Lsigma:
21.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Four 32-bit words {1,0,0,0}. The NEON code loads this as a vector and adds
// it to the counter row, so only the first 32-bit lane is stepped.
22Lone:
23.long	1,0,0,0
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
24.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
25.align	2
26
27.text
28
//-----------------------------------------------------------------------
// _GFp_ChaCha20_ctr32 -- XOR an input buffer with ChaCha20 keystream.
//
// Register use on entry, as read by the code below:
//   x0 = output pointer (stored through and advanced)
//   x1 = input pointer (loaded from and advanced)
//   x2 = byte count; a count of zero returns immediately
//   x3 = pointer to 32 bytes of key material
//   x4 = pointer to 16 bytes loaded as the last four state words ("counter")
// NOTE(review): presumably matches a C prototype of the form
//   (out, in, len, key, counter) -- confirm against the C declaration.
//
// Requests of 192 bytes or more branch to ChaCha20_neon when the ARMV7_NEON
// bit is set in _GFp_armcap_P. Everything else runs the integer-only code at
// Lshort, producing one 64-byte block per Loop_outer iteration.
//
// Saves and restores x19-x28, x29 and x30 in a 96-byte frame and uses a
// further 64 bytes of stack as scratch for a trailing partial block.
// x18 is never touched.
//-----------------------------------------------------------------------
29.globl	_GFp_ChaCha20_ctr32
30.private_extern	_GFp_ChaCha20_ctr32
31
32.align	5
33_GFp_ChaCha20_ctr32:
34	AARCH64_VALID_CALL_TARGET
// Zero-length request: return without touching the stack.
35	cbz	x2,Labort
// x5 = page address of the capability word (the relocation spelling differs
// under hwaddress_sanitizer with clang >= 10).
36#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
37	adrp	x5,:pg_hi21_nc:_GFp_armcap_P
38#else
39	adrp	x5,_GFp_armcap_P@PAGE
40#endif
// Inputs shorter than 192 bytes always stay on the scalar path; longer ones
// use the NEON path only if the capability bit is set.
41	cmp	x2,#192
42	b.lo	Lshort
43	ldr	w17,[x5,_GFp_armcap_P@PAGEOFF]
44	tst	w17,#ARMV7_NEON
45	b.ne	ChaCha20_neon
46
// Scalar path. The AARCH64_* macros come from <GFp/arm_arch.h>; presumably
// they expand to pointer-authentication / BTI hints -- confirm in that header.
47Lshort:
48	AARCH64_SIGN_LINK_REGISTER
49	stp	x29,x30,[sp,#-96]!
50	add	x29,sp,#0
51
// x5 = address of Lsigma; spill the callee-saved registers used below.
52	adrp	x5,Lsigma@PAGE
53	add	x5,x5,Lsigma@PAGEOFF
54	stp	x19,x20,[sp,#16]
55	stp	x21,x22,[sp,#32]
56	stp	x23,x24,[sp,#48]
57	stp	x25,x26,[sp,#64]
58	stp	x27,x28,[sp,#80]
// 64 bytes of scratch at [sp]; only written on the partial-block path.
59	sub	sp,sp,#64
60
// The 16-word input state is kept packed, two 32-bit words per register:
//   x22,x23 = sigma   x24-x27 = key   x28,x30 = counter words
// x30 (the link register) doubles as a data register here; the saved copy
// in the frame is reloaded before returning.
61	ldp	x22,x23,[x5]		// load sigma
62	ldp	x24,x25,[x3]		// load key
63	ldp	x26,x27,[x3,#16]
64	ldp	x28,x30,[x4]		// load counter
65#ifdef	__ARMEB__
66	ror	x24,x24,#32
67	ror	x25,x25,#32
68	ror	x26,x26,#32
69	ror	x27,x27,#32
70	ror	x28,x28,#32
71	ror	x30,x30,#32
72#endif
73
// One iteration per 64-byte block. Split each packed register into two
// 32-bit working words: low half via a w-register move, high half via lsr.
// Working words live in w5-w17 and w19-w21, in state order.
74Loop_outer:
75	mov	w5,w22			// unpack key block
76	lsr	x6,x22,#32
77	mov	w7,w23
78	lsr	x8,x23,#32
79	mov	w9,w24
80	lsr	x10,x24,#32
81	mov	w11,w25
82	lsr	x12,x25,#32
83	mov	w13,w26
84	lsr	x14,x26,#32
85	mov	w15,w27
86	lsr	x16,x27,#32
87	mov	w17,w28
88	lsr	x19,x28,#32
89	mov	w20,w30
90	lsr	x21,x30,#32
91
// x4 = number of double rounds. The flags set by this subs are consumed much
// later by "b.lo Ltail" and "b.hi Loop_outer"; no instruction in between
// writes the condition flags.
92	mov	x4,#10
93	subs	x2,x2,#64
// One double round per iteration. With rows a = w5-w8, b = w9-w12,
// c = w13-w16 and d = w17,w19,w20,w21, each quarter round is
//   a += b; d ^= a; d ror 16;   c += d; b ^= c; b ror 20;
//   a += b; d ^= a; d ror 24;   c += d; b ^= c; b ror 25;
// (rotate right by 16/20/24/25 equals rotate left by 16/12/8/7 on 32 bits).
// The four quarter rounds of each half are issued interleaved.
94Loop:
95	sub	x4,x4,#1
// First half: the four columns.
96	add	w5,w5,w9
97	add	w6,w6,w10
98	add	w7,w7,w11
99	add	w8,w8,w12
100	eor	w17,w17,w5
101	eor	w19,w19,w6
102	eor	w20,w20,w7
103	eor	w21,w21,w8
104	ror	w17,w17,#16
105	ror	w19,w19,#16
106	ror	w20,w20,#16
107	ror	w21,w21,#16
108	add	w13,w13,w17
109	add	w14,w14,w19
110	add	w15,w15,w20
111	add	w16,w16,w21
112	eor	w9,w9,w13
113	eor	w10,w10,w14
114	eor	w11,w11,w15
115	eor	w12,w12,w16
116	ror	w9,w9,#20
117	ror	w10,w10,#20
118	ror	w11,w11,#20
119	ror	w12,w12,#20
120	add	w5,w5,w9
121	add	w6,w6,w10
122	add	w7,w7,w11
123	add	w8,w8,w12
124	eor	w17,w17,w5
125	eor	w19,w19,w6
126	eor	w20,w20,w7
127	eor	w21,w21,w8
128	ror	w17,w17,#24
129	ror	w19,w19,#24
130	ror	w20,w20,#24
131	ror	w21,w21,#24
132	add	w13,w13,w17
133	add	w14,w14,w19
134	add	w15,w15,w20
135	add	w16,w16,w21
136	eor	w9,w9,w13
137	eor	w10,w10,w14
138	eor	w11,w11,w15
139	eor	w12,w12,w16
140	ror	w9,w9,#25
141	ror	w10,w10,#25
142	ror	w11,w11,#25
143	ror	w12,w12,#25
// Second half: the four diagonals, selected by pairing different registers
// rather than by moving data.
144	add	w5,w5,w10
145	add	w6,w6,w11
146	add	w7,w7,w12
147	add	w8,w8,w9
148	eor	w21,w21,w5
149	eor	w17,w17,w6
150	eor	w19,w19,w7
151	eor	w20,w20,w8
152	ror	w21,w21,#16
153	ror	w17,w17,#16
154	ror	w19,w19,#16
155	ror	w20,w20,#16
156	add	w15,w15,w21
157	add	w16,w16,w17
158	add	w13,w13,w19
159	add	w14,w14,w20
160	eor	w10,w10,w15
161	eor	w11,w11,w16
162	eor	w12,w12,w13
163	eor	w9,w9,w14
164	ror	w10,w10,#20
165	ror	w11,w11,#20
166	ror	w12,w12,#20
167	ror	w9,w9,#20
168	add	w5,w5,w10
169	add	w6,w6,w11
170	add	w7,w7,w12
171	add	w8,w8,w9
172	eor	w21,w21,w5
173	eor	w17,w17,w6
174	eor	w19,w19,w7
175	eor	w20,w20,w8
176	ror	w21,w21,#24
177	ror	w17,w17,#24
178	ror	w19,w19,#24
179	ror	w20,w20,#24
180	add	w15,w15,w21
181	add	w16,w16,w17
182	add	w13,w13,w19
183	add	w14,w14,w20
184	eor	w10,w10,w15
185	eor	w11,w11,w16
186	eor	w12,w12,w13
187	eor	w9,w9,w14
188	ror	w10,w10,#25
189	ror	w11,w11,#25
190	ror	w12,w12,#25
191	ror	w9,w9,#25
192	cbnz	x4,Loop
193
// Add the original input state back in. Odd-numbered words use a 64-bit add
// of the packed register's high half; any carry out of bit 31 lands in
// bit 32 and is shifted out by the "lsl #32" in the pack step.
194	add	w5,w5,w22		// accumulate key block
195	add	x6,x6,x22,lsr#32
196	add	w7,w7,w23
197	add	x8,x8,x23,lsr#32
198	add	w9,w9,w24
199	add	x10,x10,x24,lsr#32
200	add	w11,w11,w25
201	add	x12,x12,x25,lsr#32
202	add	w13,w13,w26
203	add	x14,x14,x26,lsr#32
204	add	w15,w15,w27
205	add	x16,x16,x27,lsr#32
206	add	w17,w17,w28
207	add	x19,x19,x28,lsr#32
208	add	w20,w20,w30
209	add	x21,x21,x30,lsr#32
210
// Flags are still those of "subs x2,x2,#64": fewer than 64 bytes were left,
// so this block is only partially consumed.
211	b.lo	Ltail
212
// Full block: re-pack word pairs into 64-bit registers (keystream ends up in
// x5,x7,x9,x11,x13,x15,x17,x20) while loading 64 input bytes into the
// registers that the packing has just freed.
213	add	x5,x5,x6,lsl#32	// pack
214	add	x7,x7,x8,lsl#32
215	ldp	x6,x8,[x1,#0]		// load input
216	add	x9,x9,x10,lsl#32
217	add	x11,x11,x12,lsl#32
218	ldp	x10,x12,[x1,#16]
219	add	x13,x13,x14,lsl#32
220	add	x15,x15,x16,lsl#32
221	ldp	x14,x16,[x1,#32]
222	add	x17,x17,x19,lsl#32
223	add	x20,x20,x21,lsl#32
224	ldp	x19,x21,[x1,#48]
225	add	x1,x1,#64
226#ifdef	__ARMEB__
227	rev	x5,x5
228	rev	x7,x7
229	rev	x9,x9
230	rev	x11,x11
231	rev	x13,x13
232	rev	x15,x15
233	rev	x17,x17
234	rev	x20,x20
235#endif
// output = keystream XOR input
236	eor	x5,x5,x6
237	eor	x7,x7,x8
238	eor	x9,x9,x10
239	eor	x11,x11,x12
240	eor	x13,x13,x14
241	eor	x15,x15,x16
242	eor	x17,x17,x19
243	eor	x20,x20,x21
244
// NOTE(review): the counter step below is a 64-bit add on x28, so a wrap of
// its low 32-bit word would carry into the high half (the next state word).
// Presumably callers never let the 32-bit block counter wrap -- confirm.
245	stp	x5,x7,[x0,#0]		// store output
246	add	x28,x28,#1			// increment counter
247	stp	x9,x11,[x0,#16]
248	stp	x13,x15,[x0,#32]
249	stp	x17,x20,[x0,#48]
250	add	x0,x0,#64
251
// Still the flags of "subs x2,x2,#64": loop while bytes remain.
252	b.hi	Loop_outer
253
// Epilogue: drop the scratch area, reload callee-saved registers through the
// frame pointer, pop the frame (which also restores the real x30).
254	ldp	x19,x20,[x29,#16]
255	add	sp,sp,#64
256	ldp	x21,x22,[x29,#32]
257	ldp	x23,x24,[x29,#48]
258	ldp	x25,x26,[x29,#64]
259	ldp	x27,x28,[x29,#80]
260	ldp	x29,x30,[sp],#96
261	AARCH64_VALIDATE_LINK_REGISTER
262Labort:
263	ret
264
// Partial final block. x2 went negative in the subs; adding 64 back gives the
// number of bytes still to process (1..63).
265.align	4
266Ltail:
267	add	x2,x2,#64
// Also entered from Ltail_neon with x2 < 64 and the scalar block still
// unpacked in w5-w21. Points x1, x0-1 and x4 (scratch) at the END of their
// buffers and negates x2, so the byte loop can index with x2 counting up to
// zero; the store uses x0-1 because x2 is incremented before the strb.
268Less_than_64:
269	sub	x0,x0,#1
270	add	x1,x1,x2
271	add	x0,x0,x2
272	add	x4,sp,x2
273	neg	x2,x2
274
275	add	x5,x5,x6,lsl#32	// pack
276	add	x7,x7,x8,lsl#32
277	add	x9,x9,x10,lsl#32
278	add	x11,x11,x12,lsl#32
279	add	x13,x13,x14,lsl#32
280	add	x15,x15,x16,lsl#32
281	add	x17,x17,x19,lsl#32
282	add	x20,x20,x21,lsl#32
283#ifdef	__ARMEB__
284	rev	x5,x5
285	rev	x7,x7
286	rev	x9,x9
287	rev	x11,x11
288	rev	x13,x13
289	rev	x15,x15
290	rev	x17,x17
291	rev	x20,x20
292#endif
// Spill the whole 64-byte keystream block to the stack scratch area so it can
// be read back one byte at a time.
293	stp	x5,x7,[sp,#0]
294	stp	x9,x11,[sp,#16]
295	stp	x13,x15,[sp,#32]
296	stp	x17,x20,[sp,#48]
297
// out[i] = in[i] ^ keystream[i] for the remaining bytes.
298Loop_tail:
299	ldrb	w10,[x1,x2]
300	ldrb	w11,[x4,x2]
301	add	x2,x2,#1
302	eor	w10,w10,w11
303	strb	w10,[x0,x2]
304	cbnz	x2,Loop_tail
305
// Overwrite the spilled keystream with zeros before releasing the stack.
306	stp	xzr,xzr,[sp,#0]
307	stp	xzr,xzr,[sp,#16]
308	stp	xzr,xzr,[sp,#32]
309	stp	xzr,xzr,[sp,#48]
310
// Same epilogue as the full-block exit above.
311	ldp	x19,x20,[x29,#16]
312	add	sp,sp,#64
313	ldp	x21,x22,[x29,#32]
314	ldp	x23,x24,[x29,#48]
315	ldp	x25,x26,[x29,#64]
316	ldp	x27,x28,[x29,#80]
317	ldp	x29,x30,[sp],#96
318	AARCH64_VALIDATE_LINK_REGISTER
319	ret
320
321
322
323.align	5
//-----------------------------------------------------------------------
// ChaCha20_neon -- NEON-assisted path, branched to from _GFp_ChaCha20_ctr32
// with x0-x4 unchanged (x0 = out, x1 = in, x2 = byte count, x3 = key pointer,
// x4 = counter pointer) and x2 >= 192.
//
// Counts of 512 or more are forwarded to L512_or_more_neon. Otherwise each
// Loop_outer_neon iteration produces 256 bytes of keystream: one block in
// the general-purpose registers (same layout as the scalar path) interleaved
// instruction-by-instruction with three blocks held in NEON registers:
//   block 1: v0-v3    block 2: v4-v7    block 3: v16-v19
// v20-v23 are temporaries; v24-v26 hold sigma and key; v27-v29 hold the
// counter rows for the three NEON blocks; v31 holds the counter step.
//
// Uses the same 96-byte frame plus 64 bytes of stack scratch as the scalar
// path, which lets the tail code branch into Less_than_64.
//-----------------------------------------------------------------------
324ChaCha20_neon:
325	AARCH64_SIGN_LINK_REGISTER
326	stp	x29,x30,[sp,#-96]!
327	add	x29,sp,#0
328
329	adrp	x5,Lsigma@PAGE
330	add	x5,x5,Lsigma@PAGEOFF
331	stp	x19,x20,[sp,#16]
332	stp	x21,x22,[sp,#32]
333	stp	x23,x24,[sp,#48]
334	stp	x25,x26,[sp,#64]
335	stp	x27,x28,[sp,#80]
// Large requests continue in the 512-byte code, which shares the frame that
// was just built.
336	cmp	x2,#512
337	b.hs	L512_or_more_neon
338
339	sub	sp,sp,#64
340
// Load the input state twice: packed in x22-x28,x30 for the integer block,
// and as vectors v24 (sigma), v25/v26 (key), v27 (counter row). The
// post-indexed ld1 advances x5 by 16, so the last ld1 reads Lone into v31.
341	ldp	x22,x23,[x5]		// load sigma
342	ld1	{v24.4s},[x5],#16
343	ldp	x24,x25,[x3]		// load key
344	ldp	x26,x27,[x3,#16]
345	ld1	{v25.4s,v26.4s},[x3]
346	ldp	x28,x30,[x4]		// load counter
347	ld1	{v27.4s},[x4]
348	ld1	{v31.4s},[x5]
349#ifdef	__ARMEB__
350	rev64	v24.4s,v24.4s
351	ror	x24,x24,#32
352	ror	x25,x25,#32
353	ror	x26,x26,#32
354	ror	x27,x27,#32
355	ror	x28,x28,#32
356	ror	x30,x30,#32
357#endif
// The integer block uses the counter as loaded; the NEON blocks use
// counter+1, +2 and +3. v31 then becomes {4,0,0,0}, the per-iteration step.
358	add	v27.4s,v27.4s,v31.4s		// += 1
359	add	v28.4s,v27.4s,v31.4s
360	add	v29.4s,v28.4s,v31.4s
361	shl	v31.4s,v31.4s,#2			// 1 -> 4
362
// Per 256-byte iteration: unpack the integer block into w5-w21 and copy the
// state rows into the three NEON working sets.
363Loop_outer_neon:
364	mov	w5,w22			// unpack key block
365	lsr	x6,x22,#32
366	mov	v0.16b,v24.16b
367	mov	w7,w23
368	lsr	x8,x23,#32
369	mov	v4.16b,v24.16b
370	mov	w9,w24
371	lsr	x10,x24,#32
372	mov	v16.16b,v24.16b
373	mov	w11,w25
374	mov	v1.16b,v25.16b
375	lsr	x12,x25,#32
376	mov	v5.16b,v25.16b
377	mov	w13,w26
378	mov	v17.16b,v25.16b
379	lsr	x14,x26,#32
380	mov	v3.16b,v27.16b
381	mov	w15,w27
382	mov	v7.16b,v28.16b
383	lsr	x16,x27,#32
384	mov	v19.16b,v29.16b
385	mov	w17,w28
386	mov	v2.16b,v26.16b
387	lsr	x19,x28,#32
388	mov	v6.16b,v26.16b
389	mov	w20,w30
390	mov	v18.16b,v26.16b
391	lsr	x21,x30,#32
392
// x4 = number of double rounds. The flags from this subs are consumed after
// the rounds by "b.lo Ltail_neon" and "b.hi Loop_outer_neon"; nothing in
// between writes the condition flags.
393	mov	x4,#10
394	subs	x2,x2,#256
// One double round per iteration, integer and NEON streams interleaved.
// Integer side: identical to the scalar Loop (ror by 16/20/24/25).
// NEON side, per block with rows a,b,c,d = v0,v1,v2,v3 (and v4-v7, v16-v19):
//   rev32 on .8h lanes      = rotate each 32-bit lane by 16
//   ushr #20 + sli #12      = rotate left by 12
//   ushr #24 + sli #8       = rotate left by 8
//   ushr #25 + sli #7       = rotate left by 7
//   ext #4/#8/#12           = rotate rows b,c,d by whole lanes to line up the
//                             diagonals, undone at the end of the iteration
395Loop_neon:
396	sub	x4,x4,#1
// First half: column round.
397	add	v0.4s,v0.4s,v1.4s
398	add	w5,w5,w9
399	add	v4.4s,v4.4s,v5.4s
400	add	w6,w6,w10
401	add	v16.4s,v16.4s,v17.4s
402	add	w7,w7,w11
403	eor	v3.16b,v3.16b,v0.16b
404	add	w8,w8,w12
405	eor	v7.16b,v7.16b,v4.16b
406	eor	w17,w17,w5
407	eor	v19.16b,v19.16b,v16.16b
408	eor	w19,w19,w6
409	rev32	v3.8h,v3.8h
410	eor	w20,w20,w7
411	rev32	v7.8h,v7.8h
412	eor	w21,w21,w8
413	rev32	v19.8h,v19.8h
414	ror	w17,w17,#16
415	add	v2.4s,v2.4s,v3.4s
416	ror	w19,w19,#16
417	add	v6.4s,v6.4s,v7.4s
418	ror	w20,w20,#16
419	add	v18.4s,v18.4s,v19.4s
420	ror	w21,w21,#16
421	eor	v20.16b,v1.16b,v2.16b
422	add	w13,w13,w17
423	eor	v21.16b,v5.16b,v6.16b
424	add	w14,w14,w19
425	eor	v22.16b,v17.16b,v18.16b
426	add	w15,w15,w20
427	ushr	v1.4s,v20.4s,#20
428	add	w16,w16,w21
429	ushr	v5.4s,v21.4s,#20
430	eor	w9,w9,w13
431	ushr	v17.4s,v22.4s,#20
432	eor	w10,w10,w14
433	sli	v1.4s,v20.4s,#12
434	eor	w11,w11,w15
435	sli	v5.4s,v21.4s,#12
436	eor	w12,w12,w16
437	sli	v17.4s,v22.4s,#12
438	ror	w9,w9,#20
439	add	v0.4s,v0.4s,v1.4s
440	ror	w10,w10,#20
441	add	v4.4s,v4.4s,v5.4s
442	ror	w11,w11,#20
443	add	v16.4s,v16.4s,v17.4s
444	ror	w12,w12,#20
445	eor	v20.16b,v3.16b,v0.16b
446	add	w5,w5,w9
447	eor	v21.16b,v7.16b,v4.16b
448	add	w6,w6,w10
449	eor	v22.16b,v19.16b,v16.16b
450	add	w7,w7,w11
451	ushr	v3.4s,v20.4s,#24
452	add	w8,w8,w12
453	ushr	v7.4s,v21.4s,#24
454	eor	w17,w17,w5
455	ushr	v19.4s,v22.4s,#24
456	eor	w19,w19,w6
457	sli	v3.4s,v20.4s,#8
458	eor	w20,w20,w7
459	sli	v7.4s,v21.4s,#8
460	eor	w21,w21,w8
461	sli	v19.4s,v22.4s,#8
462	ror	w17,w17,#24
463	add	v2.4s,v2.4s,v3.4s
464	ror	w19,w19,#24
465	add	v6.4s,v6.4s,v7.4s
466	ror	w20,w20,#24
467	add	v18.4s,v18.4s,v19.4s
468	ror	w21,w21,#24
469	eor	v20.16b,v1.16b,v2.16b
470	add	w13,w13,w17
471	eor	v21.16b,v5.16b,v6.16b
472	add	w14,w14,w19
473	eor	v22.16b,v17.16b,v18.16b
474	add	w15,w15,w20
475	ushr	v1.4s,v20.4s,#25
476	add	w16,w16,w21
477	ushr	v5.4s,v21.4s,#25
478	eor	w9,w9,w13
479	ushr	v17.4s,v22.4s,#25
480	eor	w10,w10,w14
481	sli	v1.4s,v20.4s,#7
482	eor	w11,w11,w15
483	sli	v5.4s,v21.4s,#7
484	eor	w12,w12,w16
485	sli	v17.4s,v22.4s,#7
486	ror	w9,w9,#25
// Rotate NEON rows c, d, b by 8, 12, 4 bytes so the diagonals line up as
// columns for the second half.
487	ext	v2.16b,v2.16b,v2.16b,#8
488	ror	w10,w10,#25
489	ext	v6.16b,v6.16b,v6.16b,#8
490	ror	w11,w11,#25
491	ext	v18.16b,v18.16b,v18.16b,#8
492	ror	w12,w12,#25
493	ext	v3.16b,v3.16b,v3.16b,#12
494	ext	v7.16b,v7.16b,v7.16b,#12
495	ext	v19.16b,v19.16b,v19.16b,#12
496	ext	v1.16b,v1.16b,v1.16b,#4
497	ext	v5.16b,v5.16b,v5.16b,#4
498	ext	v17.16b,v17.16b,v17.16b,#4
// Second half: diagonal round.
499	add	v0.4s,v0.4s,v1.4s
500	add	w5,w5,w10
501	add	v4.4s,v4.4s,v5.4s
502	add	w6,w6,w11
503	add	v16.4s,v16.4s,v17.4s
504	add	w7,w7,w12
505	eor	v3.16b,v3.16b,v0.16b
506	add	w8,w8,w9
507	eor	v7.16b,v7.16b,v4.16b
508	eor	w21,w21,w5
509	eor	v19.16b,v19.16b,v16.16b
510	eor	w17,w17,w6
511	rev32	v3.8h,v3.8h
512	eor	w19,w19,w7
513	rev32	v7.8h,v7.8h
514	eor	w20,w20,w8
515	rev32	v19.8h,v19.8h
516	ror	w21,w21,#16
517	add	v2.4s,v2.4s,v3.4s
518	ror	w17,w17,#16
519	add	v6.4s,v6.4s,v7.4s
520	ror	w19,w19,#16
521	add	v18.4s,v18.4s,v19.4s
522	ror	w20,w20,#16
523	eor	v20.16b,v1.16b,v2.16b
524	add	w15,w15,w21
525	eor	v21.16b,v5.16b,v6.16b
526	add	w16,w16,w17
527	eor	v22.16b,v17.16b,v18.16b
528	add	w13,w13,w19
529	ushr	v1.4s,v20.4s,#20
530	add	w14,w14,w20
531	ushr	v5.4s,v21.4s,#20
532	eor	w10,w10,w15
533	ushr	v17.4s,v22.4s,#20
534	eor	w11,w11,w16
535	sli	v1.4s,v20.4s,#12
536	eor	w12,w12,w13
537	sli	v5.4s,v21.4s,#12
538	eor	w9,w9,w14
539	sli	v17.4s,v22.4s,#12
540	ror	w10,w10,#20
541	add	v0.4s,v0.4s,v1.4s
542	ror	w11,w11,#20
543	add	v4.4s,v4.4s,v5.4s
544	ror	w12,w12,#20
545	add	v16.4s,v16.4s,v17.4s
546	ror	w9,w9,#20
547	eor	v20.16b,v3.16b,v0.16b
548	add	w5,w5,w10
549	eor	v21.16b,v7.16b,v4.16b
550	add	w6,w6,w11
551	eor	v22.16b,v19.16b,v16.16b
552	add	w7,w7,w12
553	ushr	v3.4s,v20.4s,#24
554	add	w8,w8,w9
555	ushr	v7.4s,v21.4s,#24
556	eor	w21,w21,w5
557	ushr	v19.4s,v22.4s,#24
558	eor	w17,w17,w6
559	sli	v3.4s,v20.4s,#8
560	eor	w19,w19,w7
561	sli	v7.4s,v21.4s,#8
562	eor	w20,w20,w8
563	sli	v19.4s,v22.4s,#8
564	ror	w21,w21,#24
565	add	v2.4s,v2.4s,v3.4s
566	ror	w17,w17,#24
567	add	v6.4s,v6.4s,v7.4s
568	ror	w19,w19,#24
569	add	v18.4s,v18.4s,v19.4s
570	ror	w20,w20,#24
571	eor	v20.16b,v1.16b,v2.16b
572	add	w15,w15,w21
573	eor	v21.16b,v5.16b,v6.16b
574	add	w16,w16,w17
575	eor	v22.16b,v17.16b,v18.16b
576	add	w13,w13,w19
577	ushr	v1.4s,v20.4s,#25
578	add	w14,w14,w20
579	ushr	v5.4s,v21.4s,#25
580	eor	w10,w10,w15
581	ushr	v17.4s,v22.4s,#25
582	eor	w11,w11,w16
583	sli	v1.4s,v20.4s,#7
584	eor	w12,w12,w13
585	sli	v5.4s,v21.4s,#7
586	eor	w9,w9,w14
587	sli	v17.4s,v22.4s,#7
588	ror	w10,w10,#25
// Undo the lane rotation (c by 8, d by 4, b by 12 bytes) so the rows are
// back in column order for the next iteration.
589	ext	v2.16b,v2.16b,v2.16b,#8
590	ror	w11,w11,#25
591	ext	v6.16b,v6.16b,v6.16b,#8
592	ror	w12,w12,#25
593	ext	v18.16b,v18.16b,v18.16b,#8
594	ror	w9,w9,#25
595	ext	v3.16b,v3.16b,v3.16b,#4
596	ext	v7.16b,v7.16b,v7.16b,#4
597	ext	v19.16b,v19.16b,v19.16b,#4
598	ext	v1.16b,v1.16b,v1.16b,#12
599	ext	v5.16b,v5.16b,v5.16b,#12
600	ext	v17.16b,v17.16b,v17.16b,#12
601	cbnz	x4,Loop_neon
602
// Add the input state back into all four blocks. Each NEON block adds its
// own counter row (v27, v28, v29); the integer block adds x28/x30.
603	add	w5,w5,w22		// accumulate key block
604	add	v0.4s,v0.4s,v24.4s
605	add	x6,x6,x22,lsr#32
606	add	v4.4s,v4.4s,v24.4s
607	add	w7,w7,w23
608	add	v16.4s,v16.4s,v24.4s
609	add	x8,x8,x23,lsr#32
610	add	v2.4s,v2.4s,v26.4s
611	add	w9,w9,w24
612	add	v6.4s,v6.4s,v26.4s
613	add	x10,x10,x24,lsr#32
614	add	v18.4s,v18.4s,v26.4s
615	add	w11,w11,w25
616	add	v3.4s,v3.4s,v27.4s
617	add	x12,x12,x25,lsr#32
618	add	w13,w13,w26
619	add	v7.4s,v7.4s,v28.4s
620	add	x14,x14,x26,lsr#32
621	add	w15,w15,w27
622	add	v19.4s,v19.4s,v29.4s
623	add	x16,x16,x27,lsr#32
624	add	w17,w17,w28
625	add	v1.4s,v1.4s,v25.4s
626	add	x19,x19,x28,lsr#32
627	add	w20,w20,w30
628	add	v5.4s,v5.4s,v25.4s
629	add	x21,x21,x30,lsr#32
630	add	v17.4s,v17.4s,v25.4s
631
// Flags from "subs x2,x2,#256": fewer than 256 bytes were left.
632	b.lo	Ltail_neon
633
// Full 256 bytes. Integer block first: pack, load 64 input bytes, XOR.
634	add	x5,x5,x6,lsl#32	// pack
635	add	x7,x7,x8,lsl#32
636	ldp	x6,x8,[x1,#0]		// load input
637	add	x9,x9,x10,lsl#32
638	add	x11,x11,x12,lsl#32
639	ldp	x10,x12,[x1,#16]
640	add	x13,x13,x14,lsl#32
641	add	x15,x15,x16,lsl#32
642	ldp	x14,x16,[x1,#32]
643	add	x17,x17,x19,lsl#32
644	add	x20,x20,x21,lsl#32
645	ldp	x19,x21,[x1,#48]
646	add	x1,x1,#64
647#ifdef	__ARMEB__
648	rev	x5,x5
649	rev	x7,x7
650	rev	x9,x9
651	rev	x11,x11
652	rev	x13,x13
653	rev	x15,x15
654	rev	x17,x17
655	rev	x20,x20
656#endif
// Input for NEON block 1 goes through v20-v23, which are reloaded with the
// input for block 2 as soon as block 1 has been XORed.
657	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
658	eor	x5,x5,x6
659	eor	x7,x7,x8
660	eor	x9,x9,x10
661	eor	x11,x11,x12
662	eor	x13,x13,x14
663	eor	v0.16b,v0.16b,v20.16b
664	eor	x15,x15,x16
665	eor	v1.16b,v1.16b,v21.16b
666	eor	x17,x17,x19
667	eor	v2.16b,v2.16b,v22.16b
668	eor	x20,x20,x21
669	eor	v3.16b,v3.16b,v23.16b
670	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
671
// Store the integer block and step every counter by 4 for the next pass.
672	stp	x5,x7,[x0,#0]		// store output
673	add	x28,x28,#4			// increment counter
674	stp	x9,x11,[x0,#16]
675	add	v27.4s,v27.4s,v31.4s		// += 4
676	stp	x13,x15,[x0,#32]
677	add	v28.4s,v28.4s,v31.4s
678	stp	x17,x20,[x0,#48]
679	add	v29.4s,v29.4s,v31.4s
680	add	x0,x0,#64
681
// Store block 1; v0-v3 are then free and reused to load input for block 3.
682	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
683	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
684
685	eor	v4.16b,v4.16b,v20.16b
686	eor	v5.16b,v5.16b,v21.16b
687	eor	v6.16b,v6.16b,v22.16b
688	eor	v7.16b,v7.16b,v23.16b
689	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
690
691	eor	v16.16b,v16.16b,v0.16b
692	eor	v17.16b,v17.16b,v1.16b
693	eor	v18.16b,v18.16b,v2.16b
694	eor	v19.16b,v19.16b,v3.16b
695	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
696
// Still the flags of "subs x2,x2,#256": loop while bytes remain.
697	b.hi	Loop_outer_neon
698
// Epilogue for an input that was an exact multiple of 256 bytes.
699	ldp	x19,x20,[x29,#16]
700	add	sp,sp,#64
701	ldp	x21,x22,[x29,#32]
702	ldp	x23,x24,[x29,#48]
703	ldp	x25,x26,[x29,#64]
704	ldp	x27,x28,[x29,#80]
705	ldp	x29,x30,[sp],#96
706	AARCH64_VALIDATE_LINK_REGISTER
707	ret
708
// Fewer than 256 bytes remain, but all four keystream blocks are already
// computed. Restore x2 to the remaining count, then use the blocks in order
// (integer, v0-v3, v4-v7, v16-v19), 64 bytes at a time. The first block that
// is only partially needed is spilled to the stack for the byte loop.
709Ltail_neon:
710	add	x2,x2,#256
711	cmp	x2,#64
// Under 64 bytes: the scalar tail in _GFp_ChaCha20_ctr32 finishes the job
// from the still-unpacked integer block.
712	b.lo	Less_than_64
713
714	add	x5,x5,x6,lsl#32	// pack
715	add	x7,x7,x8,lsl#32
716	ldp	x6,x8,[x1,#0]		// load input
717	add	x9,x9,x10,lsl#32
718	add	x11,x11,x12,lsl#32
719	ldp	x10,x12,[x1,#16]
720	add	x13,x13,x14,lsl#32
721	add	x15,x15,x16,lsl#32
722	ldp	x14,x16,[x1,#32]
723	add	x17,x17,x19,lsl#32
724	add	x20,x20,x21,lsl#32
725	ldp	x19,x21,[x1,#48]
726	add	x1,x1,#64
727#ifdef	__ARMEB__
728	rev	x5,x5
729	rev	x7,x7
730	rev	x9,x9
731	rev	x11,x11
732	rev	x13,x13
733	rev	x15,x15
734	rev	x17,x17
735	rev	x20,x20
736#endif
737	eor	x5,x5,x6
738	eor	x7,x7,x8
739	eor	x9,x9,x10
740	eor	x11,x11,x12
741	eor	x13,x13,x14
742	eor	x15,x15,x16
743	eor	x17,x17,x19
744	eor	x20,x20,x21
745
746	stp	x5,x7,[x0,#0]		// store output
747	add	x28,x28,#4			// increment counter
748	stp	x9,x11,[x0,#16]
749	stp	x13,x15,[x0,#32]
750	stp	x17,x20,[x0,#48]
751	add	x0,x0,#64
// Flags are from the most recent "cmp x2,#64": equal means exactly one block
// was left and it has just been written.
752	b.eq	Ldone_neon
753	sub	x2,x2,#64
754	cmp	x2,#64
755	b.lo	Less_than_128
756
// Second whole block: NEON block 1.
757	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
758	eor	v0.16b,v0.16b,v20.16b
759	eor	v1.16b,v1.16b,v21.16b
760	eor	v2.16b,v2.16b,v22.16b
761	eor	v3.16b,v3.16b,v23.16b
762	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
763	b.eq	Ldone_neon
764	sub	x2,x2,#64
765	cmp	x2,#64
766	b.lo	Less_than_192
767
// Third whole block: NEON block 2.
768	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
769	eor	v4.16b,v4.16b,v20.16b
770	eor	v5.16b,v5.16b,v21.16b
771	eor	v6.16b,v6.16b,v22.16b
772	eor	v7.16b,v7.16b,v23.16b
773	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
774	b.eq	Ldone_neon
775	sub	x2,x2,#64
776
// 1..63 bytes left: they come from NEON block 3.
777	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
778	b	Last_neon
779
// 1..63 bytes left after the integer block: spill NEON block 1.
780Less_than_128:
781	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
782	b	Last_neon
// 1..63 bytes left after NEON block 1: spill NEON block 2.
783Less_than_192:
784	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
785	b	Last_neon
786
// Byte-wise tail, same addressing trick as Less_than_64: x1, x0-1 and x4
// (stack scratch) point at the end of their buffers and x2 is negated so it
// counts up to zero.
787.align	4
788Last_neon:
789	sub	x0,x0,#1
790	add	x1,x1,x2
791	add	x0,x0,x2
792	add	x4,sp,x2
793	neg	x2,x2
794
// out[i] = in[i] ^ keystream[i] for the remaining bytes.
795Loop_tail_neon:
796	ldrb	w10,[x1,x2]
797	ldrb	w11,[x4,x2]
798	add	x2,x2,#1
799	eor	w10,w10,w11
800	strb	w10,[x0,x2]
801	cbnz	x2,Loop_tail_neon
802
// Overwrite the spilled keystream with zeros before releasing the stack.
803	stp	xzr,xzr,[sp,#0]
804	stp	xzr,xzr,[sp,#16]
805	stp	xzr,xzr,[sp,#32]
806	stp	xzr,xzr,[sp,#48]
807
// Common exit for the tail paths: release scratch, restore callee-saved
// registers and the real x29/x30, return.
808Ldone_neon:
809	ldp	x19,x20,[x29,#16]
810	add	sp,sp,#64
811	ldp	x21,x22,[x29,#32]
812	ldp	x23,x24,[x29,#48]
813	ldp	x25,x26,[x29,#64]
814	ldp	x27,x28,[x29,#80]
815	ldp	x29,x30,[sp],#96
816	AARCH64_VALIDATE_LINK_REGISTER
817	ret
818
819
820.align	5
821ChaCha20_512_neon:
822	AARCH64_SIGN_LINK_REGISTER
823	stp	x29,x30,[sp,#-96]!
824	add	x29,sp,#0
825
826	adrp	x5,Lsigma@PAGE
827	add	x5,x5,Lsigma@PAGEOFF
828	stp	x19,x20,[sp,#16]
829	stp	x21,x22,[sp,#32]
830	stp	x23,x24,[sp,#48]
831	stp	x25,x26,[sp,#64]
832	stp	x27,x28,[sp,#80]
833
834L512_or_more_neon:
835	sub	sp,sp,#128+64
836
837	ldp	x22,x23,[x5]		// load sigma
838	ld1	{v24.4s},[x5],#16
839	ldp	x24,x25,[x3]		// load key
840	ldp	x26,x27,[x3,#16]
841	ld1	{v25.4s,v26.4s},[x3]
842	ldp	x28,x30,[x4]		// load counter
843	ld1	{v27.4s},[x4]
844	ld1	{v31.4s},[x5]
845#ifdef	__ARMEB__
846	rev64	v24.4s,v24.4s
847	ror	x24,x24,#32
848	ror	x25,x25,#32
849	ror	x26,x26,#32
850	ror	x27,x27,#32
851	ror	x28,x28,#32
852	ror	x30,x30,#32
853#endif
854	add	v27.4s,v27.4s,v31.4s		// += 1
855	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
856	add	v27.4s,v27.4s,v31.4s		// not typo
857	str	q26,[sp,#32]
858	add	v28.4s,v27.4s,v31.4s
859	add	v29.4s,v28.4s,v31.4s
860	add	v30.4s,v29.4s,v31.4s
861	shl	v31.4s,v31.4s,#2			// 1 -> 4
862
863	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
864	stp	d10,d11,[sp,#128+16]
865	stp	d12,d13,[sp,#128+32]
866	stp	d14,d15,[sp,#128+48]
867
868	sub	x2,x2,#512			// not typo
869
870Loop_outer_512_neon:
871	mov	v0.16b,v24.16b
872	mov	v4.16b,v24.16b
873	mov	v8.16b,v24.16b
874	mov	v12.16b,v24.16b
875	mov	v16.16b,v24.16b
876	mov	v20.16b,v24.16b
877	mov	v1.16b,v25.16b
878	mov	w5,w22			// unpack key block
879	mov	v5.16b,v25.16b
880	lsr	x6,x22,#32
881	mov	v9.16b,v25.16b
882	mov	w7,w23
883	mov	v13.16b,v25.16b
884	lsr	x8,x23,#32
885	mov	v17.16b,v25.16b
886	mov	w9,w24
887	mov	v21.16b,v25.16b
888	lsr	x10,x24,#32
889	mov	v3.16b,v27.16b
890	mov	w11,w25
891	mov	v7.16b,v28.16b
892	lsr	x12,x25,#32
893	mov	v11.16b,v29.16b
894	mov	w13,w26
895	mov	v15.16b,v30.16b
896	lsr	x14,x26,#32
897	mov	v2.16b,v26.16b
898	mov	w15,w27
899	mov	v6.16b,v26.16b
900	lsr	x16,x27,#32
901	add	v19.4s,v3.4s,v31.4s			// +4
902	mov	w17,w28
903	add	v23.4s,v7.4s,v31.4s			// +4
904	lsr	x19,x28,#32
905	mov	v10.16b,v26.16b
906	mov	w20,w30
907	mov	v14.16b,v26.16b
908	lsr	x21,x30,#32
909	mov	v18.16b,v26.16b
910	stp	q27,q28,[sp,#48]		// off-load key block, variable part
911	mov	v22.16b,v26.16b
912	str	q29,[sp,#80]
913
914	mov	x4,#5
915	subs	x2,x2,#512
916Loop_upper_neon:
917	sub	x4,x4,#1
918	add	v0.4s,v0.4s,v1.4s
919	add	w5,w5,w9
920	add	v4.4s,v4.4s,v5.4s
921	add	w6,w6,w10
922	add	v8.4s,v8.4s,v9.4s
923	add	w7,w7,w11
924	add	v12.4s,v12.4s,v13.4s
925	add	w8,w8,w12
926	add	v16.4s,v16.4s,v17.4s
927	eor	w17,w17,w5
928	add	v20.4s,v20.4s,v21.4s
929	eor	w19,w19,w6
930	eor	v3.16b,v3.16b,v0.16b
931	eor	w20,w20,w7
932	eor	v7.16b,v7.16b,v4.16b
933	eor	w21,w21,w8
934	eor	v11.16b,v11.16b,v8.16b
935	ror	w17,w17,#16
936	eor	v15.16b,v15.16b,v12.16b
937	ror	w19,w19,#16
938	eor	v19.16b,v19.16b,v16.16b
939	ror	w20,w20,#16
940	eor	v23.16b,v23.16b,v20.16b
941	ror	w21,w21,#16
942	rev32	v3.8h,v3.8h
943	add	w13,w13,w17
944	rev32	v7.8h,v7.8h
945	add	w14,w14,w19
946	rev32	v11.8h,v11.8h
947	add	w15,w15,w20
948	rev32	v15.8h,v15.8h
949	add	w16,w16,w21
950	rev32	v19.8h,v19.8h
951	eor	w9,w9,w13
952	rev32	v23.8h,v23.8h
953	eor	w10,w10,w14
954	add	v2.4s,v2.4s,v3.4s
955	eor	w11,w11,w15
956	add	v6.4s,v6.4s,v7.4s
957	eor	w12,w12,w16
958	add	v10.4s,v10.4s,v11.4s
959	ror	w9,w9,#20
960	add	v14.4s,v14.4s,v15.4s
961	ror	w10,w10,#20
962	add	v18.4s,v18.4s,v19.4s
963	ror	w11,w11,#20
964	add	v22.4s,v22.4s,v23.4s
965	ror	w12,w12,#20
966	eor	v24.16b,v1.16b,v2.16b
967	add	w5,w5,w9
968	eor	v25.16b,v5.16b,v6.16b
969	add	w6,w6,w10
970	eor	v26.16b,v9.16b,v10.16b
971	add	w7,w7,w11
972	eor	v27.16b,v13.16b,v14.16b
973	add	w8,w8,w12
974	eor	v28.16b,v17.16b,v18.16b
975	eor	w17,w17,w5
976	eor	v29.16b,v21.16b,v22.16b
977	eor	w19,w19,w6
978	ushr	v1.4s,v24.4s,#20
979	eor	w20,w20,w7
980	ushr	v5.4s,v25.4s,#20
981	eor	w21,w21,w8
982	ushr	v9.4s,v26.4s,#20
983	ror	w17,w17,#24
984	ushr	v13.4s,v27.4s,#20
985	ror	w19,w19,#24
986	ushr	v17.4s,v28.4s,#20
987	ror	w20,w20,#24
988	ushr	v21.4s,v29.4s,#20
989	ror	w21,w21,#24
990	sli	v1.4s,v24.4s,#12
991	add	w13,w13,w17
992	sli	v5.4s,v25.4s,#12
993	add	w14,w14,w19
994	sli	v9.4s,v26.4s,#12
995	add	w15,w15,w20
996	sli	v13.4s,v27.4s,#12
997	add	w16,w16,w21
998	sli	v17.4s,v28.4s,#12
999	eor	w9,w9,w13
1000	sli	v21.4s,v29.4s,#12
1001	eor	w10,w10,w14
1002	add	v0.4s,v0.4s,v1.4s
1003	eor	w11,w11,w15
1004	add	v4.4s,v4.4s,v5.4s
1005	eor	w12,w12,w16
1006	add	v8.4s,v8.4s,v9.4s
1007	ror	w9,w9,#25
1008	add	v12.4s,v12.4s,v13.4s
1009	ror	w10,w10,#25
1010	add	v16.4s,v16.4s,v17.4s
1011	ror	w11,w11,#25
1012	add	v20.4s,v20.4s,v21.4s
1013	ror	w12,w12,#25
1014	eor	v24.16b,v3.16b,v0.16b
1015	add	w5,w5,w10
1016	eor	v25.16b,v7.16b,v4.16b
1017	add	w6,w6,w11
1018	eor	v26.16b,v11.16b,v8.16b
1019	add	w7,w7,w12
1020	eor	v27.16b,v15.16b,v12.16b
1021	add	w8,w8,w9
1022	eor	v28.16b,v19.16b,v16.16b
1023	eor	w21,w21,w5
1024	eor	v29.16b,v23.16b,v20.16b
1025	eor	w17,w17,w6
1026	ushr	v3.4s,v24.4s,#24
1027	eor	w19,w19,w7
1028	ushr	v7.4s,v25.4s,#24
1029	eor	w20,w20,w8
1030	ushr	v11.4s,v26.4s,#24
1031	ror	w21,w21,#16
1032	ushr	v15.4s,v27.4s,#24
1033	ror	w17,w17,#16
1034	ushr	v19.4s,v28.4s,#24
1035	ror	w19,w19,#16
1036	ushr	v23.4s,v29.4s,#24
1037	ror	w20,w20,#16
1038	sli	v3.4s,v24.4s,#8
1039	add	w15,w15,w21
1040	sli	v7.4s,v25.4s,#8
1041	add	w16,w16,w17
1042	sli	v11.4s,v26.4s,#8
1043	add	w13,w13,w19
1044	sli	v15.4s,v27.4s,#8
1045	add	w14,w14,w20
1046	sli	v19.4s,v28.4s,#8
1047	eor	w10,w10,w15
1048	sli	v23.4s,v29.4s,#8
1049	eor	w11,w11,w16
1050	add	v2.4s,v2.4s,v3.4s
1051	eor	w12,w12,w13
1052	add	v6.4s,v6.4s,v7.4s
1053	eor	w9,w9,w14
1054	add	v10.4s,v10.4s,v11.4s
1055	ror	w10,w10,#20
1056	add	v14.4s,v14.4s,v15.4s
1057	ror	w11,w11,#20
1058	add	v18.4s,v18.4s,v19.4s
1059	ror	w12,w12,#20
1060	add	v22.4s,v22.4s,v23.4s
1061	ror	w9,w9,#20
1062	eor	v24.16b,v1.16b,v2.16b
1063	add	w5,w5,w10
1064	eor	v25.16b,v5.16b,v6.16b
1065	add	w6,w6,w11
1066	eor	v26.16b,v9.16b,v10.16b
1067	add	w7,w7,w12
1068	eor	v27.16b,v13.16b,v14.16b
1069	add	w8,w8,w9
1070	eor	v28.16b,v17.16b,v18.16b
1071	eor	w21,w21,w5
1072	eor	v29.16b,v21.16b,v22.16b
1073	eor	w17,w17,w6
1074	ushr	v1.4s,v24.4s,#25
1075	eor	w19,w19,w7
1076	ushr	v5.4s,v25.4s,#25
1077	eor	w20,w20,w8
1078	ushr	v9.4s,v26.4s,#25
1079	ror	w21,w21,#24
1080	ushr	v13.4s,v27.4s,#25
1081	ror	w17,w17,#24
1082	ushr	v17.4s,v28.4s,#25
1083	ror	w19,w19,#24
1084	ushr	v21.4s,v29.4s,#25
1085	ror	w20,w20,#24
1086	sli	v1.4s,v24.4s,#7
1087	add	w15,w15,w21
1088	sli	v5.4s,v25.4s,#7
1089	add	w16,w16,w17
1090	sli	v9.4s,v26.4s,#7
1091	add	w13,w13,w19
1092	sli	v13.4s,v27.4s,#7
1093	add	w14,w14,w20
1094	sli	v17.4s,v28.4s,#7
1095	eor	w10,w10,w15
1096	sli	v21.4s,v29.4s,#7
1097	eor	w11,w11,w16
1098	ext	v2.16b,v2.16b,v2.16b,#8
1099	eor	w12,w12,w13
1100	ext	v6.16b,v6.16b,v6.16b,#8
1101	eor	w9,w9,w14
1102	ext	v10.16b,v10.16b,v10.16b,#8
1103	ror	w10,w10,#25
1104	ext	v14.16b,v14.16b,v14.16b,#8
1105	ror	w11,w11,#25
1106	ext	v18.16b,v18.16b,v18.16b,#8
1107	ror	w12,w12,#25
1108	ext	v22.16b,v22.16b,v22.16b,#8
1109	ror	w9,w9,#25
1110	ext	v3.16b,v3.16b,v3.16b,#12
1111	ext	v7.16b,v7.16b,v7.16b,#12
1112	ext	v11.16b,v11.16b,v11.16b,#12
1113	ext	v15.16b,v15.16b,v15.16b,#12
1114	ext	v19.16b,v19.16b,v19.16b,#12
1115	ext	v23.16b,v23.16b,v23.16b,#12
1116	ext	v1.16b,v1.16b,v1.16b,#4
1117	ext	v5.16b,v5.16b,v5.16b,#4
1118	ext	v9.16b,v9.16b,v9.16b,#4
1119	ext	v13.16b,v13.16b,v13.16b,#4
1120	ext	v17.16b,v17.16b,v17.16b,#4
1121	ext	v21.16b,v21.16b,v21.16b,#4
1122	add	v0.4s,v0.4s,v1.4s
1123	add	w5,w5,w9
1124	add	v4.4s,v4.4s,v5.4s
1125	add	w6,w6,w10
1126	add	v8.4s,v8.4s,v9.4s
1127	add	w7,w7,w11
1128	add	v12.4s,v12.4s,v13.4s
1129	add	w8,w8,w12
1130	add	v16.4s,v16.4s,v17.4s
1131	eor	w17,w17,w5
1132	add	v20.4s,v20.4s,v21.4s
1133	eor	w19,w19,w6
1134	eor	v3.16b,v3.16b,v0.16b
1135	eor	w20,w20,w7
1136	eor	v7.16b,v7.16b,v4.16b
1137	eor	w21,w21,w8
1138	eor	v11.16b,v11.16b,v8.16b
1139	ror	w17,w17,#16
1140	eor	v15.16b,v15.16b,v12.16b
1141	ror	w19,w19,#16
1142	eor	v19.16b,v19.16b,v16.16b
1143	ror	w20,w20,#16
1144	eor	v23.16b,v23.16b,v20.16b
1145	ror	w21,w21,#16
1146	rev32	v3.8h,v3.8h
1147	add	w13,w13,w17
1148	rev32	v7.8h,v7.8h
1149	add	w14,w14,w19
1150	rev32	v11.8h,v11.8h
1151	add	w15,w15,w20
1152	rev32	v15.8h,v15.8h
1153	add	w16,w16,w21
1154	rev32	v19.8h,v19.8h
1155	eor	w9,w9,w13
1156	rev32	v23.8h,v23.8h
1157	eor	w10,w10,w14
1158	add	v2.4s,v2.4s,v3.4s
1159	eor	w11,w11,w15
1160	add	v6.4s,v6.4s,v7.4s
1161	eor	w12,w12,w16
1162	add	v10.4s,v10.4s,v11.4s
1163	ror	w9,w9,#20
1164	add	v14.4s,v14.4s,v15.4s
1165	ror	w10,w10,#20
1166	add	v18.4s,v18.4s,v19.4s
1167	ror	w11,w11,#20
1168	add	v22.4s,v22.4s,v23.4s
1169	ror	w12,w12,#20
1170	eor	v24.16b,v1.16b,v2.16b
1171	add	w5,w5,w9
1172	eor	v25.16b,v5.16b,v6.16b
1173	add	w6,w6,w10
1174	eor	v26.16b,v9.16b,v10.16b
1175	add	w7,w7,w11
1176	eor	v27.16b,v13.16b,v14.16b
1177	add	w8,w8,w12
1178	eor	v28.16b,v17.16b,v18.16b
1179	eor	w17,w17,w5
1180	eor	v29.16b,v21.16b,v22.16b
1181	eor	w19,w19,w6
1182	ushr	v1.4s,v24.4s,#20
1183	eor	w20,w20,w7
1184	ushr	v5.4s,v25.4s,#20
1185	eor	w21,w21,w8
1186	ushr	v9.4s,v26.4s,#20
1187	ror	w17,w17,#24
1188	ushr	v13.4s,v27.4s,#20
1189	ror	w19,w19,#24
1190	ushr	v17.4s,v28.4s,#20
1191	ror	w20,w20,#24
1192	ushr	v21.4s,v29.4s,#20
1193	ror	w21,w21,#24
1194	sli	v1.4s,v24.4s,#12
1195	add	w13,w13,w17
1196	sli	v5.4s,v25.4s,#12
1197	add	w14,w14,w19
1198	sli	v9.4s,v26.4s,#12
1199	add	w15,w15,w20
1200	sli	v13.4s,v27.4s,#12
1201	add	w16,w16,w21
1202	sli	v17.4s,v28.4s,#12
1203	eor	w9,w9,w13
1204	sli	v21.4s,v29.4s,#12
1205	eor	w10,w10,w14
1206	add	v0.4s,v0.4s,v1.4s
1207	eor	w11,w11,w15
1208	add	v4.4s,v4.4s,v5.4s
1209	eor	w12,w12,w16
1210	add	v8.4s,v8.4s,v9.4s
1211	ror	w9,w9,#25
1212	add	v12.4s,v12.4s,v13.4s
1213	ror	w10,w10,#25
1214	add	v16.4s,v16.4s,v17.4s
1215	ror	w11,w11,#25
1216	add	v20.4s,v20.4s,v21.4s
1217	ror	w12,w12,#25
1218	eor	v24.16b,v3.16b,v0.16b
1219	add	w5,w5,w10
1220	eor	v25.16b,v7.16b,v4.16b
1221	add	w6,w6,w11
1222	eor	v26.16b,v11.16b,v8.16b
1223	add	w7,w7,w12
1224	eor	v27.16b,v15.16b,v12.16b
1225	add	w8,w8,w9
1226	eor	v28.16b,v19.16b,v16.16b
1227	eor	w21,w21,w5
1228	eor	v29.16b,v23.16b,v20.16b
1229	eor	w17,w17,w6
1230	ushr	v3.4s,v24.4s,#24
1231	eor	w19,w19,w7
1232	ushr	v7.4s,v25.4s,#24
1233	eor	w20,w20,w8
1234	ushr	v11.4s,v26.4s,#24
1235	ror	w21,w21,#16
1236	ushr	v15.4s,v27.4s,#24
1237	ror	w17,w17,#16
1238	ushr	v19.4s,v28.4s,#24
1239	ror	w19,w19,#16
1240	ushr	v23.4s,v29.4s,#24
1241	ror	w20,w20,#16
1242	sli	v3.4s,v24.4s,#8
1243	add	w15,w15,w21
1244	sli	v7.4s,v25.4s,#8
1245	add	w16,w16,w17
1246	sli	v11.4s,v26.4s,#8
1247	add	w13,w13,w19
1248	sli	v15.4s,v27.4s,#8
1249	add	w14,w14,w20
1250	sli	v19.4s,v28.4s,#8
1251	eor	w10,w10,w15
1252	sli	v23.4s,v29.4s,#8
1253	eor	w11,w11,w16
1254	add	v2.4s,v2.4s,v3.4s
1255	eor	w12,w12,w13
1256	add	v6.4s,v6.4s,v7.4s
1257	eor	w9,w9,w14
1258	add	v10.4s,v10.4s,v11.4s
1259	ror	w10,w10,#20
1260	add	v14.4s,v14.4s,v15.4s
1261	ror	w11,w11,#20
1262	add	v18.4s,v18.4s,v19.4s
1263	ror	w12,w12,#20
1264	add	v22.4s,v22.4s,v23.4s
1265	ror	w9,w9,#20
1266	eor	v24.16b,v1.16b,v2.16b
1267	add	w5,w5,w10
1268	eor	v25.16b,v5.16b,v6.16b
1269	add	w6,w6,w11
1270	eor	v26.16b,v9.16b,v10.16b
1271	add	w7,w7,w12
1272	eor	v27.16b,v13.16b,v14.16b
1273	add	w8,w8,w9
1274	eor	v28.16b,v17.16b,v18.16b
1275	eor	w21,w21,w5
1276	eor	v29.16b,v21.16b,v22.16b
1277	eor	w17,w17,w6
1278	ushr	v1.4s,v24.4s,#25
1279	eor	w19,w19,w7
1280	ushr	v5.4s,v25.4s,#25
1281	eor	w20,w20,w8
1282	ushr	v9.4s,v26.4s,#25
1283	ror	w21,w21,#24
1284	ushr	v13.4s,v27.4s,#25
1285	ror	w17,w17,#24
1286	ushr	v17.4s,v28.4s,#25
1287	ror	w19,w19,#24
1288	ushr	v21.4s,v29.4s,#25
1289	ror	w20,w20,#24
1290	sli	v1.4s,v24.4s,#7
1291	add	w15,w15,w21
1292	sli	v5.4s,v25.4s,#7
1293	add	w16,w16,w17
1294	sli	v9.4s,v26.4s,#7
1295	add	w13,w13,w19
1296	sli	v13.4s,v27.4s,#7
1297	add	w14,w14,w20
1298	sli	v17.4s,v28.4s,#7
1299	eor	w10,w10,w15
1300	sli	v21.4s,v29.4s,#7
1301	eor	w11,w11,w16
1302	ext	v2.16b,v2.16b,v2.16b,#8
1303	eor	w12,w12,w13
1304	ext	v6.16b,v6.16b,v6.16b,#8
1305	eor	w9,w9,w14
1306	ext	v10.16b,v10.16b,v10.16b,#8
1307	ror	w10,w10,#25
1308	ext	v14.16b,v14.16b,v14.16b,#8
1309	ror	w11,w11,#25
1310	ext	v18.16b,v18.16b,v18.16b,#8
1311	ror	w12,w12,#25
1312	ext	v22.16b,v22.16b,v22.16b,#8
1313	ror	w9,w9,#25
1314	ext	v3.16b,v3.16b,v3.16b,#4
1315	ext	v7.16b,v7.16b,v7.16b,#4
1316	ext	v11.16b,v11.16b,v11.16b,#4
1317	ext	v15.16b,v15.16b,v15.16b,#4
1318	ext	v19.16b,v19.16b,v19.16b,#4
1319	ext	v23.16b,v23.16b,v23.16b,#4
1320	ext	v1.16b,v1.16b,v1.16b,#12
1321	ext	v5.16b,v5.16b,v5.16b,#12
1322	ext	v9.16b,v9.16b,v9.16b,#12
1323	ext	v13.16b,v13.16b,v13.16b,#12
1324	ext	v17.16b,v17.16b,v17.16b,#12
1325	ext	v21.16b,v21.16b,v21.16b,#12
1326	cbnz	x4,Loop_upper_neon
1327
// Finish the scalar block that was computed alongside Loop_upper_neon.
// Working state: sixteen 32-bit words in w5-w17,w19-w21.  Input state:
// eight registers x22-x28,x30, each holding two words (low/high half).
// Even words get a 32-bit add of the low half; odd words get a 64-bit
// add of the high half (lsr#32) - a carry out of bit 31 is harmless
// because the pack step below shifts it out with lsl#32.
1328	add	w5,w5,w22		// accumulate key block
1329	add	x6,x6,x22,lsr#32
1330	add	w7,w7,w23
1331	add	x8,x8,x23,lsr#32
1332	add	w9,w9,w24
1333	add	x10,x10,x24,lsr#32
1334	add	w11,w11,w25
1335	add	x12,x12,x25,lsr#32
1336	add	w13,w13,w26
1337	add	x14,x14,x26,lsr#32
1338	add	w15,w15,w27
1339	add	x16,x16,x27,lsr#32
1340	add	w17,w17,w28
1341	add	x19,x19,x28,lsr#32
1342	add	w20,w20,w30
1343	add	x21,x21,x30,lsr#32
1344
// Merge each even/odd word pair into one 64-bit register (x5,x7,...,x20).
// As soon as a pair is merged, its odd register is reused to receive
// input bytes from [x1]; 64 bytes are read and x1 advances by 64.
1345	add	x5,x5,x6,lsl#32	// pack
1346	add	x7,x7,x8,lsl#32
1347	ldp	x6,x8,[x1,#0]		// load input
1348	add	x9,x9,x10,lsl#32
1349	add	x11,x11,x12,lsl#32
1350	ldp	x10,x12,[x1,#16]
1351	add	x13,x13,x14,lsl#32
1352	add	x15,x15,x16,lsl#32
1353	ldp	x14,x16,[x1,#32]
1354	add	x17,x17,x19,lsl#32
1355	add	x20,x20,x21,lsl#32
1356	ldp	x19,x21,[x1,#48]
1357	add	x1,x1,#64
// Only assembled for big-endian builds: byte-reverse every packed pair
// before it is XORed with the data loaded by the ldp instructions above.
1358#ifdef	__ARMEB__
1359	rev	x5,x5
1360	rev	x7,x7
1361	rev	x9,x9
1362	rev	x11,x11
1363	rev	x13,x13
1364	rev	x15,x15
1365	rev	x17,x17
1366	rev	x20,x20
1367#endif
// output = packed state XOR input
1368	eor	x5,x5,x6
1369	eor	x7,x7,x8
1370	eor	x9,x9,x10
1371	eor	x11,x11,x12
1372	eor	x13,x13,x14
1373	eor	x15,x15,x16
1374	eor	x17,x17,x19
1375	eor	x20,x20,x21
1376
// Write the 64 result bytes to [x0] (x0 advances by 64).  The stores are
// interleaved with re-seeding the working words from x22-x28,x30 for the
// next scalar block; x28 is bumped first, so w17 below picks up the
// incremented low word.
1377	stp	x5,x7,[x0,#0]		// store output
1378	add	x28,x28,#1			// increment counter
1379	mov	w5,w22			// unpack key block
1380	lsr	x6,x22,#32
1381	stp	x9,x11,[x0,#16]
1382	mov	w7,w23
1383	lsr	x8,x23,#32
1384	stp	x13,x15,[x0,#32]
1385	mov	w9,w24
1386	lsr	x10,x24,#32
1387	stp	x17,x20,[x0,#48]
1388	add	x0,x0,#64
1389	mov	w11,w25
1390	lsr	x12,x25,#32
1391	mov	w13,w26
1392	lsr	x14,x26,#32
1393	mov	w15,w27
1394	lsr	x16,x27,#32
1395	mov	w17,w28
1396	lsr	x19,x28,#32
1397	mov	w20,w30
1398	lsr	x21,x30,#32
1399
// Loop_lower_neon - runs 5 times: x4 is set to 5 here, decremented at the
// top of the body and tested by the cbnz at the bottom.
//
// Two independent instruction streams are interleaved line by line:
//   NEON   - six 4x4-word states held row-wise in v0-v3, v4-v7, v8-v11,
//            v12-v15, v16-v19 and v20-v23; v24-v29 are rotate temporaries.
//   scalar - one state in w5-w17,w19-w21 (re-seeded just before this loop).
//
// Rotations: the scalar stream uses ror #16/#20/#24/#25, i.e. rotate left
// by 16/12/8/7.  The NEON stream uses rev32 on .8h (swap the halfwords of
// each word = rotate by 16) and ushr #(32-n) followed by sli #n for
// rotate left by n = 12, 8 and 7.
//
// Per iteration the NEON stream performs two rounds (one on columns, one
// on diagonals via the ext lane rotations) while the scalar stream
// performs four (column, diagonal, column, diagonal).
1400	mov	x4,#5
1401Loop_lower_neon:
1402	sub	x4,x4,#1
// ---- half 1: NEON quarter-round on rows; scalar column round on
// (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21) ----
1403	add	v0.4s,v0.4s,v1.4s
1404	add	w5,w5,w9
1405	add	v4.4s,v4.4s,v5.4s
1406	add	w6,w6,w10
1407	add	v8.4s,v8.4s,v9.4s
1408	add	w7,w7,w11
1409	add	v12.4s,v12.4s,v13.4s
1410	add	w8,w8,w12
1411	add	v16.4s,v16.4s,v17.4s
1412	eor	w17,w17,w5
1413	add	v20.4s,v20.4s,v21.4s
1414	eor	w19,w19,w6
1415	eor	v3.16b,v3.16b,v0.16b
1416	eor	w20,w20,w7
1417	eor	v7.16b,v7.16b,v4.16b
1418	eor	w21,w21,w8
1419	eor	v11.16b,v11.16b,v8.16b
1420	ror	w17,w17,#16
1421	eor	v15.16b,v15.16b,v12.16b
1422	ror	w19,w19,#16
1423	eor	v19.16b,v19.16b,v16.16b
1424	ror	w20,w20,#16
1425	eor	v23.16b,v23.16b,v20.16b
1426	ror	w21,w21,#16
1427	rev32	v3.8h,v3.8h
1428	add	w13,w13,w17
1429	rev32	v7.8h,v7.8h
1430	add	w14,w14,w19
1431	rev32	v11.8h,v11.8h
1432	add	w15,w15,w20
1433	rev32	v15.8h,v15.8h
1434	add	w16,w16,w21
1435	rev32	v19.8h,v19.8h
1436	eor	w9,w9,w13
1437	rev32	v23.8h,v23.8h
1438	eor	w10,w10,w14
1439	add	v2.4s,v2.4s,v3.4s
1440	eor	w11,w11,w15
1441	add	v6.4s,v6.4s,v7.4s
1442	eor	w12,w12,w16
1443	add	v10.4s,v10.4s,v11.4s
1444	ror	w9,w9,#20
1445	add	v14.4s,v14.4s,v15.4s
1446	ror	w10,w10,#20
1447	add	v18.4s,v18.4s,v19.4s
1448	ror	w11,w11,#20
1449	add	v22.4s,v22.4s,v23.4s
1450	ror	w12,w12,#20
1451	eor	v24.16b,v1.16b,v2.16b
1452	add	w5,w5,w9
1453	eor	v25.16b,v5.16b,v6.16b
1454	add	w6,w6,w10
1455	eor	v26.16b,v9.16b,v10.16b
1456	add	w7,w7,w11
1457	eor	v27.16b,v13.16b,v14.16b
1458	add	w8,w8,w12
1459	eor	v28.16b,v17.16b,v18.16b
1460	eor	w17,w17,w5
1461	eor	v29.16b,v21.16b,v22.16b
1462	eor	w19,w19,w6
1463	ushr	v1.4s,v24.4s,#20
1464	eor	w20,w20,w7
1465	ushr	v5.4s,v25.4s,#20
1466	eor	w21,w21,w8
1467	ushr	v9.4s,v26.4s,#20
1468	ror	w17,w17,#24
1469	ushr	v13.4s,v27.4s,#20
1470	ror	w19,w19,#24
1471	ushr	v17.4s,v28.4s,#20
1472	ror	w20,w20,#24
1473	ushr	v21.4s,v29.4s,#20
1474	ror	w21,w21,#24
1475	sli	v1.4s,v24.4s,#12
1476	add	w13,w13,w17
1477	sli	v5.4s,v25.4s,#12
1478	add	w14,w14,w19
1479	sli	v9.4s,v26.4s,#12
1480	add	w15,w15,w20
1481	sli	v13.4s,v27.4s,#12
1482	add	w16,w16,w21
1483	sli	v17.4s,v28.4s,#12
1484	eor	w9,w9,w13
1485	sli	v21.4s,v29.4s,#12
1486	eor	w10,w10,w14
1487	add	v0.4s,v0.4s,v1.4s
1488	eor	w11,w11,w15
1489	add	v4.4s,v4.4s,v5.4s
1490	eor	w12,w12,w16
1491	add	v8.4s,v8.4s,v9.4s
1492	ror	w9,w9,#25
1493	add	v12.4s,v12.4s,v13.4s
1494	ror	w10,w10,#25
1495	add	v16.4s,v16.4s,v17.4s
1496	ror	w11,w11,#25
1497	add	v20.4s,v20.4s,v21.4s
1498	ror	w12,w12,#25
1499	eor	v24.16b,v3.16b,v0.16b
// scalar stream switches to the diagonal round:
// (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20)
1500	add	w5,w5,w10
1501	eor	v25.16b,v7.16b,v4.16b
1502	add	w6,w6,w11
1503	eor	v26.16b,v11.16b,v8.16b
1504	add	w7,w7,w12
1505	eor	v27.16b,v15.16b,v12.16b
1506	add	w8,w8,w9
1507	eor	v28.16b,v19.16b,v16.16b
1508	eor	w21,w21,w5
1509	eor	v29.16b,v23.16b,v20.16b
1510	eor	w17,w17,w6
1511	ushr	v3.4s,v24.4s,#24
1512	eor	w19,w19,w7
1513	ushr	v7.4s,v25.4s,#24
1514	eor	w20,w20,w8
1515	ushr	v11.4s,v26.4s,#24
1516	ror	w21,w21,#16
1517	ushr	v15.4s,v27.4s,#24
1518	ror	w17,w17,#16
1519	ushr	v19.4s,v28.4s,#24
1520	ror	w19,w19,#16
1521	ushr	v23.4s,v29.4s,#24
1522	ror	w20,w20,#16
1523	sli	v3.4s,v24.4s,#8
1524	add	w15,w15,w21
1525	sli	v7.4s,v25.4s,#8
1526	add	w16,w16,w17
1527	sli	v11.4s,v26.4s,#8
1528	add	w13,w13,w19
1529	sli	v15.4s,v27.4s,#8
1530	add	w14,w14,w20
1531	sli	v19.4s,v28.4s,#8
1532	eor	w10,w10,w15
1533	sli	v23.4s,v29.4s,#8
1534	eor	w11,w11,w16
1535	add	v2.4s,v2.4s,v3.4s
1536	eor	w12,w12,w13
1537	add	v6.4s,v6.4s,v7.4s
1538	eor	w9,w9,w14
1539	add	v10.4s,v10.4s,v11.4s
1540	ror	w10,w10,#20
1541	add	v14.4s,v14.4s,v15.4s
1542	ror	w11,w11,#20
1543	add	v18.4s,v18.4s,v19.4s
1544	ror	w12,w12,#20
1545	add	v22.4s,v22.4s,v23.4s
1546	ror	w9,w9,#20
1547	eor	v24.16b,v1.16b,v2.16b
1548	add	w5,w5,w10
1549	eor	v25.16b,v5.16b,v6.16b
1550	add	w6,w6,w11
1551	eor	v26.16b,v9.16b,v10.16b
1552	add	w7,w7,w12
1553	eor	v27.16b,v13.16b,v14.16b
1554	add	w8,w8,w9
1555	eor	v28.16b,v17.16b,v18.16b
1556	eor	w21,w21,w5
1557	eor	v29.16b,v21.16b,v22.16b
1558	eor	w17,w17,w6
1559	ushr	v1.4s,v24.4s,#25
1560	eor	w19,w19,w7
1561	ushr	v5.4s,v25.4s,#25
1562	eor	w20,w20,w8
1563	ushr	v9.4s,v26.4s,#25
1564	ror	w21,w21,#24
1565	ushr	v13.4s,v27.4s,#25
1566	ror	w17,w17,#24
1567	ushr	v17.4s,v28.4s,#25
1568	ror	w19,w19,#24
1569	ushr	v21.4s,v29.4s,#25
1570	ror	w20,w20,#24
1571	sli	v1.4s,v24.4s,#7
1572	add	w15,w15,w21
1573	sli	v5.4s,v25.4s,#7
1574	add	w16,w16,w17
1575	sli	v9.4s,v26.4s,#7
1576	add	w13,w13,w19
1577	sli	v13.4s,v27.4s,#7
1578	add	w14,w14,w20
1579	sli	v17.4s,v28.4s,#7
1580	eor	w10,w10,w15
1581	sli	v21.4s,v29.4s,#7
1582	eor	w11,w11,w16
1583	ext	v2.16b,v2.16b,v2.16b,#8
1584	eor	w12,w12,w13
1585	ext	v6.16b,v6.16b,v6.16b,#8
1586	eor	w9,w9,w14
1587	ext	v10.16b,v10.16b,v10.16b,#8
1588	ror	w10,w10,#25
1589	ext	v14.16b,v14.16b,v14.16b,#8
1590	ror	w11,w11,#25
1591	ext	v18.16b,v18.16b,v18.16b,#8
1592	ror	w12,w12,#25
1593	ext	v22.16b,v22.16b,v22.16b,#8
1594	ror	w9,w9,#25
// Rotate the fourth rows by 12 bytes and the second rows by 4 bytes (the
// third rows were rotated by 8 bytes above) so that the identical
// row-wise sequence in half 2 operates on the diagonals.
1595	ext	v3.16b,v3.16b,v3.16b,#12
1596	ext	v7.16b,v7.16b,v7.16b,#12
1597	ext	v11.16b,v11.16b,v11.16b,#12
1598	ext	v15.16b,v15.16b,v15.16b,#12
1599	ext	v19.16b,v19.16b,v19.16b,#12
1600	ext	v23.16b,v23.16b,v23.16b,#12
1601	ext	v1.16b,v1.16b,v1.16b,#4
1602	ext	v5.16b,v5.16b,v5.16b,#4
1603	ext	v9.16b,v9.16b,v9.16b,#4
1604	ext	v13.16b,v13.16b,v13.16b,#4
1605	ext	v17.16b,v17.16b,v17.16b,#4
1606	ext	v21.16b,v21.16b,v21.16b,#4
// ---- half 2: same NEON sequence on the lane-rotated rows; the scalar
// stream starts another column round ----
1607	add	v0.4s,v0.4s,v1.4s
1608	add	w5,w5,w9
1609	add	v4.4s,v4.4s,v5.4s
1610	add	w6,w6,w10
1611	add	v8.4s,v8.4s,v9.4s
1612	add	w7,w7,w11
1613	add	v12.4s,v12.4s,v13.4s
1614	add	w8,w8,w12
1615	add	v16.4s,v16.4s,v17.4s
1616	eor	w17,w17,w5
1617	add	v20.4s,v20.4s,v21.4s
1618	eor	w19,w19,w6
1619	eor	v3.16b,v3.16b,v0.16b
1620	eor	w20,w20,w7
1621	eor	v7.16b,v7.16b,v4.16b
1622	eor	w21,w21,w8
1623	eor	v11.16b,v11.16b,v8.16b
1624	ror	w17,w17,#16
1625	eor	v15.16b,v15.16b,v12.16b
1626	ror	w19,w19,#16
1627	eor	v19.16b,v19.16b,v16.16b
1628	ror	w20,w20,#16
1629	eor	v23.16b,v23.16b,v20.16b
1630	ror	w21,w21,#16
1631	rev32	v3.8h,v3.8h
1632	add	w13,w13,w17
1633	rev32	v7.8h,v7.8h
1634	add	w14,w14,w19
1635	rev32	v11.8h,v11.8h
1636	add	w15,w15,w20
1637	rev32	v15.8h,v15.8h
1638	add	w16,w16,w21
1639	rev32	v19.8h,v19.8h
1640	eor	w9,w9,w13
1641	rev32	v23.8h,v23.8h
1642	eor	w10,w10,w14
1643	add	v2.4s,v2.4s,v3.4s
1644	eor	w11,w11,w15
1645	add	v6.4s,v6.4s,v7.4s
1646	eor	w12,w12,w16
1647	add	v10.4s,v10.4s,v11.4s
1648	ror	w9,w9,#20
1649	add	v14.4s,v14.4s,v15.4s
1650	ror	w10,w10,#20
1651	add	v18.4s,v18.4s,v19.4s
1652	ror	w11,w11,#20
1653	add	v22.4s,v22.4s,v23.4s
1654	ror	w12,w12,#20
1655	eor	v24.16b,v1.16b,v2.16b
1656	add	w5,w5,w9
1657	eor	v25.16b,v5.16b,v6.16b
1658	add	w6,w6,w10
1659	eor	v26.16b,v9.16b,v10.16b
1660	add	w7,w7,w11
1661	eor	v27.16b,v13.16b,v14.16b
1662	add	w8,w8,w12
1663	eor	v28.16b,v17.16b,v18.16b
1664	eor	w17,w17,w5
1665	eor	v29.16b,v21.16b,v22.16b
1666	eor	w19,w19,w6
1667	ushr	v1.4s,v24.4s,#20
1668	eor	w20,w20,w7
1669	ushr	v5.4s,v25.4s,#20
1670	eor	w21,w21,w8
1671	ushr	v9.4s,v26.4s,#20
1672	ror	w17,w17,#24
1673	ushr	v13.4s,v27.4s,#20
1674	ror	w19,w19,#24
1675	ushr	v17.4s,v28.4s,#20
1676	ror	w20,w20,#24
1677	ushr	v21.4s,v29.4s,#20
1678	ror	w21,w21,#24
1679	sli	v1.4s,v24.4s,#12
1680	add	w13,w13,w17
1681	sli	v5.4s,v25.4s,#12
1682	add	w14,w14,w19
1683	sli	v9.4s,v26.4s,#12
1684	add	w15,w15,w20
1685	sli	v13.4s,v27.4s,#12
1686	add	w16,w16,w21
1687	sli	v17.4s,v28.4s,#12
1688	eor	w9,w9,w13
1689	sli	v21.4s,v29.4s,#12
1690	eor	w10,w10,w14
1691	add	v0.4s,v0.4s,v1.4s
1692	eor	w11,w11,w15
1693	add	v4.4s,v4.4s,v5.4s
1694	eor	w12,w12,w16
1695	add	v8.4s,v8.4s,v9.4s
1696	ror	w9,w9,#25
1697	add	v12.4s,v12.4s,v13.4s
1698	ror	w10,w10,#25
1699	add	v16.4s,v16.4s,v17.4s
1700	ror	w11,w11,#25
1701	add	v20.4s,v20.4s,v21.4s
1702	ror	w12,w12,#25
1703	eor	v24.16b,v3.16b,v0.16b
// scalar stream: diagonal round again (same register groups as in half 1)
1704	add	w5,w5,w10
1705	eor	v25.16b,v7.16b,v4.16b
1706	add	w6,w6,w11
1707	eor	v26.16b,v11.16b,v8.16b
1708	add	w7,w7,w12
1709	eor	v27.16b,v15.16b,v12.16b
1710	add	w8,w8,w9
1711	eor	v28.16b,v19.16b,v16.16b
1712	eor	w21,w21,w5
1713	eor	v29.16b,v23.16b,v20.16b
1714	eor	w17,w17,w6
1715	ushr	v3.4s,v24.4s,#24
1716	eor	w19,w19,w7
1717	ushr	v7.4s,v25.4s,#24
1718	eor	w20,w20,w8
1719	ushr	v11.4s,v26.4s,#24
1720	ror	w21,w21,#16
1721	ushr	v15.4s,v27.4s,#24
1722	ror	w17,w17,#16
1723	ushr	v19.4s,v28.4s,#24
1724	ror	w19,w19,#16
1725	ushr	v23.4s,v29.4s,#24
1726	ror	w20,w20,#16
1727	sli	v3.4s,v24.4s,#8
1728	add	w15,w15,w21
1729	sli	v7.4s,v25.4s,#8
1730	add	w16,w16,w17
1731	sli	v11.4s,v26.4s,#8
1732	add	w13,w13,w19
1733	sli	v15.4s,v27.4s,#8
1734	add	w14,w14,w20
1735	sli	v19.4s,v28.4s,#8
1736	eor	w10,w10,w15
1737	sli	v23.4s,v29.4s,#8
1738	eor	w11,w11,w16
1739	add	v2.4s,v2.4s,v3.4s
1740	eor	w12,w12,w13
1741	add	v6.4s,v6.4s,v7.4s
1742	eor	w9,w9,w14
1743	add	v10.4s,v10.4s,v11.4s
1744	ror	w10,w10,#20
1745	add	v14.4s,v14.4s,v15.4s
1746	ror	w11,w11,#20
1747	add	v18.4s,v18.4s,v19.4s
1748	ror	w12,w12,#20
1749	add	v22.4s,v22.4s,v23.4s
1750	ror	w9,w9,#20
1751	eor	v24.16b,v1.16b,v2.16b
1752	add	w5,w5,w10
1753	eor	v25.16b,v5.16b,v6.16b
1754	add	w6,w6,w11
1755	eor	v26.16b,v9.16b,v10.16b
1756	add	w7,w7,w12
1757	eor	v27.16b,v13.16b,v14.16b
1758	add	w8,w8,w9
1759	eor	v28.16b,v17.16b,v18.16b
1760	eor	w21,w21,w5
1761	eor	v29.16b,v21.16b,v22.16b
1762	eor	w17,w17,w6
1763	ushr	v1.4s,v24.4s,#25
1764	eor	w19,w19,w7
1765	ushr	v5.4s,v25.4s,#25
1766	eor	w20,w20,w8
1767	ushr	v9.4s,v26.4s,#25
1768	ror	w21,w21,#24
1769	ushr	v13.4s,v27.4s,#25
1770	ror	w17,w17,#24
1771	ushr	v17.4s,v28.4s,#25
1772	ror	w19,w19,#24
1773	ushr	v21.4s,v29.4s,#25
1774	ror	w20,w20,#24
1775	sli	v1.4s,v24.4s,#7
1776	add	w15,w15,w21
1777	sli	v5.4s,v25.4s,#7
1778	add	w16,w16,w17
1779	sli	v9.4s,v26.4s,#7
1780	add	w13,w13,w19
1781	sli	v13.4s,v27.4s,#7
1782	add	w14,w14,w20
1783	sli	v17.4s,v28.4s,#7
1784	eor	w10,w10,w15
1785	sli	v21.4s,v29.4s,#7
1786	eor	w11,w11,w16
1787	ext	v2.16b,v2.16b,v2.16b,#8
1788	eor	w12,w12,w13
1789	ext	v6.16b,v6.16b,v6.16b,#8
1790	eor	w9,w9,w14
1791	ext	v10.16b,v10.16b,v10.16b,#8
1792	ror	w10,w10,#25
1793	ext	v14.16b,v14.16b,v14.16b,#8
1794	ror	w11,w11,#25
1795	ext	v18.16b,v18.16b,v18.16b,#8
1796	ror	w12,w12,#25
1797	ext	v22.16b,v22.16b,v22.16b,#8
1798	ror	w9,w9,#25
// Undo the lane rotation from half 1: the byte offsets are swapped
// (4 for the fourth rows, 12 for the second rows), so rows are
// column-aligned again for the next iteration / for the final add.
1799	ext	v3.16b,v3.16b,v3.16b,#4
1800	ext	v7.16b,v7.16b,v7.16b,#4
1801	ext	v11.16b,v11.16b,v11.16b,#4
1802	ext	v15.16b,v15.16b,v15.16b,#4
1803	ext	v19.16b,v19.16b,v19.16b,#4
1804	ext	v23.16b,v23.16b,v23.16b,#4
1805	ext	v1.16b,v1.16b,v1.16b,#12
1806	ext	v5.16b,v5.16b,v5.16b,#12
1807	ext	v9.16b,v9.16b,v9.16b,#12
1808	ext	v13.16b,v13.16b,v13.16b,#12
1809	ext	v17.16b,v17.16b,v17.16b,#12
1810	ext	v21.16b,v21.16b,v21.16b,#12
1811	cbnz	x4,Loop_lower_neon
1812
// After Loop_lower_neon: finalize the second scalar block and all six
// NEON blocks, XOR them with 448 bytes read from [x1] and store the
// result to [x0] (64 bytes via ldp/stp, then six 64-byte ld1/st1 groups).
//
// v24-v29 were overwritten as rotate temporaries inside the loop and are
// reloaded here from [sp,#0..#95].  What was stored there is set up
// outside this chunk; judging by the adds below, v24/v25/v26 are the
// first three rows shared by every NEON block and v27-v30 are per-block
// fourth rows - TODO confirm against the code that fills the stack area.
1813	add	w5,w5,w22		// accumulate key block
1814	ldp	q24,q25,[sp,#0]
1815	add	x6,x6,x22,lsr#32
1816	ldp	q26,q27,[sp,#32]
1817	add	w7,w7,w23
1818	ldp	q28,q29,[sp,#64]
1819	add	x8,x8,x23,lsr#32
1820	add	v0.4s,v0.4s,v24.4s
1821	add	w9,w9,w24
1822	add	v4.4s,v4.4s,v24.4s
1823	add	x10,x10,x24,lsr#32
1824	add	v8.4s,v8.4s,v24.4s
1825	add	w11,w11,w25
1826	add	v12.4s,v12.4s,v24.4s
1827	add	x12,x12,x25,lsr#32
1828	add	v16.4s,v16.4s,v24.4s
1829	add	w13,w13,w26
1830	add	v20.4s,v20.4s,v24.4s
1831	add	x14,x14,x26,lsr#32
1832	add	v2.4s,v2.4s,v26.4s
1833	add	w15,w15,w27
1834	add	v6.4s,v6.4s,v26.4s
1835	add	x16,x16,x27,lsr#32
1836	add	v10.4s,v10.4s,v26.4s
1837	add	w17,w17,w28
1838	add	v14.4s,v14.4s,v26.4s
1839	add	x19,x19,x28,lsr#32
1840	add	v18.4s,v18.4s,v26.4s
1841	add	w20,w20,w30
1842	add	v22.4s,v22.4s,v26.4s
1843	add	x21,x21,x30,lsr#32
// Fourth rows: blocks 1-4 add v27,v28,v29,v30; blocks 5 and 6 have no
// register of their own and instead add v31 followed by v27 / v28.
1844	add	v19.4s,v19.4s,v31.4s			// +4
1845	add	x5,x5,x6,lsl#32	// pack
1846	add	v23.4s,v23.4s,v31.4s			// +4
1847	add	x7,x7,x8,lsl#32
1848	add	v3.4s,v3.4s,v27.4s
1849	ldp	x6,x8,[x1,#0]		// load input
1850	add	v7.4s,v7.4s,v28.4s
1851	add	x9,x9,x10,lsl#32
1852	add	v11.4s,v11.4s,v29.4s
1853	add	x11,x11,x12,lsl#32
1854	add	v15.4s,v15.4s,v30.4s
1855	ldp	x10,x12,[x1,#16]
1856	add	v19.4s,v19.4s,v27.4s
1857	add	x13,x13,x14,lsl#32
1858	add	v23.4s,v23.4s,v28.4s
1859	add	x15,x15,x16,lsl#32
1860	add	v1.4s,v1.4s,v25.4s
1861	ldp	x14,x16,[x1,#32]
1862	add	v5.4s,v5.4s,v25.4s
1863	add	x17,x17,x19,lsl#32
1864	add	v9.4s,v9.4s,v25.4s
1865	add	x20,x20,x21,lsl#32
1866	add	v13.4s,v13.4s,v25.4s
1867	ldp	x19,x21,[x1,#48]
1868	add	v17.4s,v17.4s,v25.4s
1869	add	x1,x1,#64
1870	add	v21.4s,v21.4s,v25.4s
1871
// Only assembled for big-endian builds: byte-reverse the packed scalar
// words before they are XORed with the input loaded by ldp above.
1872#ifdef	__ARMEB__
1873	rev	x5,x5
1874	rev	x7,x7
1875	rev	x9,x9
1876	rev	x11,x11
1877	rev	x13,x13
1878	rev	x15,x15
1879	rev	x17,x17
1880	rev	x20,x20
1881#endif
// From here on v24-v27 (and later the already-stored v0-v15) double as
// input buffers: each 64-byte ld1 is issued one group ahead of the
// eor/st1 that consumes it.
1882	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1883	eor	x5,x5,x6
1884	eor	x7,x7,x8
1885	eor	x9,x9,x10
1886	eor	x11,x11,x12
1887	eor	x13,x13,x14
1888	eor	v0.16b,v0.16b,v24.16b
1889	eor	x15,x15,x16
1890	eor	v1.16b,v1.16b,v25.16b
1891	eor	x17,x17,x19
1892	eor	v2.16b,v2.16b,v26.16b
1893	eor	x20,x20,x21
1894	eor	v3.16b,v3.16b,v27.16b
1895	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1896
// x28 += 7 here; together with the += 1 applied after the first scalar
// block, x28 advances by 8 per pass (8 x 64 = 512 bytes).
1897	stp	x5,x7,[x0,#0]		// store output
1898	add	x28,x28,#7			// increment counter
1899	stp	x9,x11,[x0,#16]
1900	stp	x13,x15,[x0,#32]
1901	stp	x17,x20,[x0,#48]
1902	add	x0,x0,#64
1903	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1904
1905	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1906	eor	v4.16b,v4.16b,v24.16b
1907	eor	v5.16b,v5.16b,v25.16b
1908	eor	v6.16b,v6.16b,v26.16b
1909	eor	v7.16b,v7.16b,v27.16b
1910	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1911
// v24-v27 are no longer needed as input buffers: restore them from the
// stack copies (interleaved with the XOR of the third NEON block).
1912	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1913	eor	v8.16b,v8.16b,v0.16b
1914	ldp	q24,q25,[sp,#0]
1915	eor	v9.16b,v9.16b,v1.16b
1916	ldp	q26,q27,[sp,#32]
1917	eor	v10.16b,v10.16b,v2.16b
1918	eor	v11.16b,v11.16b,v3.16b
1919	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1920
1921	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1922	eor	v12.16b,v12.16b,v4.16b
1923	eor	v13.16b,v13.16b,v5.16b
1924	eor	v14.16b,v14.16b,v6.16b
1925	eor	v15.16b,v15.16b,v7.16b
1926	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1927
1928	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1929	eor	v16.16b,v16.16b,v8.16b
1930	eor	v17.16b,v17.16b,v9.16b
1931	eor	v18.16b,v18.16b,v10.16b
1932	eor	v19.16b,v19.16b,v11.16b
1933	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1934
// v0 is free again (its last use as an input buffer was the XOR into
// v8): build 2*v31 in it for the per-pass update of v27-v30.
1935	shl	v0.4s,v31.4s,#1			// 4 -> 8
1936	eor	v20.16b,v20.16b,v12.16b
1937	eor	v21.16b,v21.16b,v13.16b
1938	eor	v22.16b,v22.16b,v14.16b
1939	eor	v23.16b,v23.16b,v15.16b
1940	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1941
1942	add	v27.4s,v27.4s,v0.4s			// += 8
1943	add	v28.4s,v28.4s,v0.4s
1944	add	v29.4s,v29.4s,v0.4s
1945	add	v30.4s,v30.4s,v0.4s
1946
// NOTE(review): no instruction between Loop_lower_neon and this branch
// writes the condition flags, so it tests flags produced earlier in the
// pass - presumably a subs of 512 from the remaining length x2 at
// Loop_outer_512_neon (outside this chunk; confirm).
1947	b.hs	Loop_outer_512_neon
1948
// Fewer than 512 bytes remain.  adds restores x2 by 512 and sets Z when
// the result is zero; that flag is consumed by the b.eq below (the
// intervening ushr/ldp/stp do not touch the flags).
1949	adds	x2,x2,#512
1950	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1951
// Restore d8-d15 (low halves of v8-v15, callee-saved under AAPCS64) from
// sp+128..sp+191; the matching saves are outside this chunk.
1952	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1953	ldp	d10,d11,[sp,#128+16]
1954	ldp	d12,d13,[sp,#128+32]
1955	ldp	d14,d15,[sp,#128+48]
1956
// Overwrite all 96 bytes of the stack off-load area with copies of v24
// and v31.
1957	stp	q24,q31,[sp,#0]		// wipe off-load area
1958	stp	q24,q31,[sp,#32]
1959	stp	q24,q31,[sp,#64]
1960
1961	b.eq	Ldone_512_neon
1962
// 1..511 bytes remain.  Step v27-v29 back by v0 (= v31 >> 2; v30 is not
// adjusted), release the 128-byte scratch area and continue in
// Loop_outer_neon when at least 192 bytes are left (flags from the cmp;
// the vector subs and add sp do not alter them).
1963	cmp	x2,#192
1964	sub	v27.4s,v27.4s,v0.4s			// -= 1
1965	sub	v28.4s,v28.4s,v0.4s
1966	sub	v29.4s,v29.4s,v0.4s
1967	add	sp,sp,#128
1968	b.hs	Loop_outer_neon
1969
// Fewer than 192 bytes: zero v25-v30 and finish in the scalar Loop_outer.
// Presumably the zeroing keeps state material from lingering in NEON
// registers - confirm intent.
1970	eor	v25.16b,v25.16b,v25.16b
1971	eor	v26.16b,v26.16b,v26.16b
1972	eor	v27.16b,v27.16b,v27.16b
1973	eor	v28.16b,v28.16b,v28.16b
1974	eor	v29.16b,v29.16b,v29.16b
1975	eor	v30.16b,v30.16b,v30.16b
1976	b	Loop_outer
1977
// Ldone_512_neon - function epilogue, reached when no input remains.
// Reloads x19-x28 from the frame record area addressed by x29, releases
// 128+64 bytes of stack (the scratch area plus the region d8-d15 were
// reloaded from before the branch here), then pops x29/x30 together with
// the 96-byte frame and returns.
1978Ldone_512_neon:
1979	ldp	x19,x20,[x29,#16]
1980	add	sp,sp,#128+64
1981	ldp	x21,x22,[x29,#32]
1982	ldp	x23,x24,[x29,#48]
1983	ldp	x25,x26,[x29,#64]
1984	ldp	x27,x28,[x29,#80]
1985	ldp	x29,x30,[sp],#96
// AARCH64_VALIDATE_LINK_REGISTER is a macro from <GFp/arm_arch.h> (not
// visible here); presumably the counterpart of AARCH64_SIGN_LINK_REGISTER
// used in the prologue - confirm.
1986	AARCH64_VALIDATE_LINK_REGISTER
1987	ret
1988
1989#endif  // !OPENSSL_NO_ASM
1990