1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15#include <openssl/arm_arch.h>
16
17#if __ARM_MAX_ARCH__>=7
18.text
19
// Constant table referenced by _aes_hw_set_encrypt_key.
// Three rows of four 32-bit words each:
//   row 0: 0x01 in every lane. Loaded into v1 as the starting round
//          constant; the key-expansion loops shift it left by one bit
//          (shl) after each use.
//   row 1: byte indices for tbl. On a little-endian target each lane
//          selects source bytes 13,14,15,12 (rotate-n-splat).
//   row 2: 0x1b in every lane. Reloaded into v1 by the 128-bit path
//          once its eight-iteration loop has finished.
20.section	__TEXT,__const
21.align	5
22Lrcon:
23.long	0x01,0x01,0x01,0x01
24.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
25.long	0x1b,0x1b,0x1b,0x1b
26
27.text
28
// _aes_hw_set_encrypt_key: expand a raw AES key into an encryption key
// schedule using the aese instruction and the Lrcon table.
//
// In:   x0 = pointer to the user key bytes (must be non-null)
//       w1 = key length in bits; 128, 192 and 256 are accepted
//       x2 = pointer to the output key schedule (must be non-null)
// Out:  x0 =  0 on success
//            -1 if x0 or x2 is null
//            -2 if w1 is below 128, above 256, or not a multiple of 64
//       On success the round keys are written from the original x2
//       upward and the round count (10, 12 or 14) is stored as a 32-bit
//       word at offset 240 from the original x2.
//
// Register roles: x3 carries the pending return code during validation
// and is then reused as a pointer into Lrcon; v0 is kept at zero;
// v1 holds the round constant; v2 holds the tbl index mask; v3 (and v4
// for 192/256-bit keys) hold the most recent key words; w1 is reused as
// the loop counter; w12 holds the round count.
//
// Lenc_key is a second entry label: _aes_hw_set_decrypt_key reaches
// this code with "bl Lenc_key" and relies on x2 (original x2 + 240) and
// w12 (round count) as left behind by the success path.
29.globl	_aes_hw_set_encrypt_key
30.private_extern	_aes_hw_set_encrypt_key
31
32.align	5
33_aes_hw_set_encrypt_key:
34Lenc_key:
35	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
36	AARCH64_VALID_CALL_TARGET
37	stp	x29,x30,[sp,#-16]!
38	add	x29,sp,#0
39	mov	x3,#-1			// return code for a null pointer
40	cmp	x0,#0
41	b.eq	Lenc_key_abort
42	cmp	x2,#0
43	b.eq	Lenc_key_abort
44	mov	x3,#-2			// return code for a rejected bit length
45	cmp	w1,#128
46	b.lt	Lenc_key_abort
47	cmp	w1,#256
48	b.gt	Lenc_key_abort
49	tst	w1,#0x3f		// low six bits must be clear
50	b.ne	Lenc_key_abort
51
52	adrp	x3,Lrcon@PAGE		// x3 is reused as a pointer to Lrcon
53	add	x3,x3,Lrcon@PAGEOFF
54	cmp	w1,#192			// flags are consumed by b.lt/b.eq below
55
56	eor	v0.16b,v0.16b,v0.16b	// v0 = 0
57	ld1	{v3.16b},[x0],#16	// v3 = first 16 key bytes
58	mov	w1,#8		// reuse w1
59	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = round constant, v2 = tbl mask
60
61	b.lt	Loop128			// below 192 bits
62	b.eq	L192			// exactly 192 bits
63	b	L256			// above 192 bits
64
65.align	4
66Loop128:
67	tbl	v6.16b,{v3.16b},v2.16b
68	ext	v5.16b,v0.16b,v3.16b,#12
69	st1	{v3.4s},[x2],#16
70	aese	v6.16b,v0.16b		// v0 is zero, so the XOR step of aese is a no-op
71	subs	w1,w1,#1
72
73	eor	v3.16b,v3.16b,v5.16b
74	ext	v5.16b,v0.16b,v5.16b,#12
75	eor	v3.16b,v3.16b,v5.16b
76	ext	v5.16b,v0.16b,v5.16b,#12
77	eor	v6.16b,v6.16b,v1.16b
78	eor	v3.16b,v3.16b,v5.16b
79	shl	v1.16b,v1.16b,#1
80	eor	v3.16b,v3.16b,v6.16b
81	b.ne	Loop128
82
83	ld1	{v1.4s},[x3]		// v1 = 0x1b row of Lrcon
84
85	tbl	v6.16b,{v3.16b},v2.16b
86	ext	v5.16b,v0.16b,v3.16b,#12
87	st1	{v3.4s},[x2],#16
88	aese	v6.16b,v0.16b
89
90	eor	v3.16b,v3.16b,v5.16b
91	ext	v5.16b,v0.16b,v5.16b,#12
92	eor	v3.16b,v3.16b,v5.16b
93	ext	v5.16b,v0.16b,v5.16b,#12
94	eor	v6.16b,v6.16b,v1.16b
95	eor	v3.16b,v3.16b,v5.16b
96	shl	v1.16b,v1.16b,#1
97	eor	v3.16b,v3.16b,v6.16b
98
99	tbl	v6.16b,{v3.16b},v2.16b
100	ext	v5.16b,v0.16b,v3.16b,#12
101	st1	{v3.4s},[x2],#16
102	aese	v6.16b,v0.16b
103
104	eor	v3.16b,v3.16b,v5.16b
105	ext	v5.16b,v0.16b,v5.16b,#12
106	eor	v3.16b,v3.16b,v5.16b
107	ext	v5.16b,v0.16b,v5.16b,#12
108	eor	v6.16b,v6.16b,v1.16b
109	eor	v3.16b,v3.16b,v5.16b
110	eor	v3.16b,v3.16b,v6.16b
111	st1	{v3.4s},[x2]		// final round key, no post-increment
112	add	x2,x2,#0x50		// x2 = original x2 + 240
113
114	mov	w12,#10			// round count for this path
115	b	Ldone
116
117.align	4
118L192:
119	ld1	{v4.8b},[x0],#8		// v4 = remaining 8 key bytes
120	movi	v6.16b,#8			// borrow v6.16b
121	st1	{v3.4s},[x2],#16
122	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
123
124Loop192:
125	tbl	v6.16b,{v4.16b},v2.16b
126	ext	v5.16b,v0.16b,v3.16b,#12
127	st1	{v4.8b},[x2],#8
128	aese	v6.16b,v0.16b
129	subs	w1,w1,#1
130
131	eor	v3.16b,v3.16b,v5.16b
132	ext	v5.16b,v0.16b,v5.16b,#12
133	eor	v3.16b,v3.16b,v5.16b
134	ext	v5.16b,v0.16b,v5.16b,#12
135	eor	v3.16b,v3.16b,v5.16b
136
137	dup	v5.4s,v3.s[3]
138	eor	v5.16b,v5.16b,v4.16b
139	eor	v6.16b,v6.16b,v1.16b
140	ext	v4.16b,v0.16b,v4.16b,#12
141	shl	v1.16b,v1.16b,#1
142	eor	v4.16b,v4.16b,v5.16b
143	eor	v3.16b,v3.16b,v6.16b
144	eor	v4.16b,v4.16b,v6.16b
145	st1	{v3.4s},[x2],#16
146	b.ne	Loop192
147
148	mov	w12,#12			// round count for this path
149	add	x2,x2,#0x20		// x2 = original x2 + 240
150	b	Ldone
151
152.align	4
153L256:
154	ld1	{v4.16b},[x0]		// v4 = second 16 key bytes
155	mov	w1,#7			// loop count for this path
156	mov	w12,#14			// round count for this path
157	st1	{v3.4s},[x2],#16
158
159Loop256:
160	tbl	v6.16b,{v4.16b},v2.16b
161	ext	v5.16b,v0.16b,v3.16b,#12
162	st1	{v4.4s},[x2],#16
163	aese	v6.16b,v0.16b
164	subs	w1,w1,#1
165
166	eor	v3.16b,v3.16b,v5.16b
167	ext	v5.16b,v0.16b,v5.16b,#12
168	eor	v3.16b,v3.16b,v5.16b
169	ext	v5.16b,v0.16b,v5.16b,#12
170	eor	v6.16b,v6.16b,v1.16b
171	eor	v3.16b,v3.16b,v5.16b
172	shl	v1.16b,v1.16b,#1
173	eor	v3.16b,v3.16b,v6.16b
174	st1	{v3.4s},[x2],#16
175	b.eq	Ldone			// counter reached zero; x2 = original x2 + 240
176
177	dup	v6.4s,v3.s[3]		// just splat
178	ext	v5.16b,v0.16b,v4.16b,#12
179	aese	v6.16b,v0.16b
180
181	eor	v4.16b,v4.16b,v5.16b
182	ext	v5.16b,v0.16b,v5.16b,#12
183	eor	v4.16b,v4.16b,v5.16b
184	ext	v5.16b,v0.16b,v5.16b,#12
185	eor	v4.16b,v4.16b,v5.16b
186
187	eor	v4.16b,v4.16b,v6.16b
188	b	Loop256
189
190Ldone:
191	str	w12,[x2]		// round count goes right after the round keys
192	mov	x3,#0			// success
193
194Lenc_key_abort:
195	mov	x0,x3			// return value
196	ldr	x29,[sp],#16		// restore x29 and drop the frame; x30 is not reloaded
197	ret
198
199
// _aes_hw_set_decrypt_key: build a decryption key schedule.
//
// x0, x1 and x2 are passed through untouched to the Lenc_key entry of
// _aes_hw_set_encrypt_key. If that call returns non-zero in x0, the
// value is returned as is. Otherwise the encryption schedule is
// converted in place: the order of the round keys is reversed, and
// aesimc is applied to every round key except the first and the last.
//
// Out:  x0 = 0 on success, otherwise the code returned by Lenc_key.
//
// AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER come from
// openssl/arm_arch.h and are not visible here; presumably they sign and
// check x30 for pointer authentication -- confirm in that header.
200.globl	_aes_hw_set_decrypt_key
201.private_extern	_aes_hw_set_decrypt_key
202
203.align	5
204_aes_hw_set_decrypt_key:
205	AARCH64_SIGN_LINK_REGISTER
206	stp	x29,x30,[sp,#-16]!
207	add	x29,sp,#0
208	bl	Lenc_key		// on success: x2 = schedule + 240, w12 = round count
209
210	cmp	x0,#0
211	b.ne	Ldec_key_abort
212
213	sub	x2,x2,#240		// restore original x2
214	mov	x4,#-16			// post-index step for walking x0 downward
215	add	x0,x2,x12,lsl#4	// end of key schedule
216
	// Swap the first and last round keys; no aesimc on these two.
217	ld1	{v0.4s},[x2]
218	ld1	{v1.4s},[x0]
219	st1	{v0.4s},[x0],x4
220	st1	{v1.4s},[x2],#16
221
	// Swap the next pair, moving inward, applying aesimc to both.
222Loop_imc:
223	ld1	{v0.4s},[x2]
224	ld1	{v1.4s},[x0]
225	aesimc	v0.16b,v0.16b
226	aesimc	v1.16b,v1.16b
227	st1	{v0.4s},[x0],x4
228	st1	{v1.4s},[x2],#16
229	cmp	x0,x2
230	b.hi	Loop_imc		// loop while x0 is still above x2
231
	// Remaining round key: loaded from x2, transformed, stored at x0.
232	ld1	{v0.4s},[x2]
233	aesimc	v0.16b,v0.16b
234	st1	{v0.4s},[x0]
235
236	eor	x0,x0,x0		// return value
237Ldec_key_abort:
238	ldp	x29,x30,[sp],#16
239	AARCH64_VALIDATE_LINK_REGISTER
240	ret
241
// _aes_hw_encrypt: encrypt one 16-byte block.
//
// In:   x0 = pointer to the 16-byte input block
//       x1 = pointer to the 16-byte output block
//       x2 = pointer to the key schedule; the round count is read from
//            the 32-bit word at offset 240
// Modifies w3, x2, v0-v2 and the condition flags. The stack is not used.
242.globl	_aes_hw_encrypt
243.private_extern	_aes_hw_encrypt
244
245.align	5
246_aes_hw_encrypt:
247	AARCH64_VALID_CALL_TARGET
248	ldr	w3,[x2,#240]		// w3 = round count
249	ld1	{v0.4s},[x2],#16	// v0 = round key 0
250	ld1	{v2.16b},[x0]		// v2 = input block
251	sub	w3,w3,#2		// the last two rounds are done after the loop
252	ld1	{v1.4s},[x2],#16	// v1 = round key 1
253
	// Two rounds per iteration; the next two round keys are loaded
	// in between the aese/aesmc pairs.
254Loop_enc:
255	aese	v2.16b,v0.16b
256	aesmc	v2.16b,v2.16b
257	ld1	{v0.4s},[x2],#16
258	subs	w3,w3,#2
259	aese	v2.16b,v1.16b
260	aesmc	v2.16b,v2.16b
261	ld1	{v1.4s},[x2],#16
262	b.gt	Loop_enc
263
264	aese	v2.16b,v0.16b
265	aesmc	v2.16b,v2.16b
266	ld1	{v0.4s},[x2]		// v0 = last round key
267	aese	v2.16b,v1.16b		// final round: no aesmc
268	eor	v2.16b,v2.16b,v0.16b	// add the last round key
269
270	st1	{v2.16b},[x1]
271	ret
272
// _aes_hw_decrypt: decrypt one 16-byte block.
//
// In:   x0 = pointer to the 16-byte input block
//       x1 = pointer to the 16-byte output block
//       x2 = pointer to the key schedule; the round count is read from
//            the 32-bit word at offset 240
// Modifies w3, x2, v0-v2 and the condition flags. The stack is not used.
// Same structure as _aes_hw_encrypt, with aesd/aesimc in place of
// aese/aesmc.
273.globl	_aes_hw_decrypt
274.private_extern	_aes_hw_decrypt
275
276.align	5
277_aes_hw_decrypt:
278	AARCH64_VALID_CALL_TARGET
279	ldr	w3,[x2,#240]		// w3 = round count
280	ld1	{v0.4s},[x2],#16	// v0 = first round key in the schedule
281	ld1	{v2.16b},[x0]		// v2 = input block
282	sub	w3,w3,#2		// the last two rounds are done after the loop
283	ld1	{v1.4s},[x2],#16	// v1 = second round key in the schedule
284
	// Two rounds per iteration; the next two round keys are loaded
	// in between the aesd/aesimc pairs.
285Loop_dec:
286	aesd	v2.16b,v0.16b
287	aesimc	v2.16b,v2.16b
288	ld1	{v0.4s},[x2],#16
289	subs	w3,w3,#2
290	aesd	v2.16b,v1.16b
291	aesimc	v2.16b,v2.16b
292	ld1	{v1.4s},[x2],#16
293	b.gt	Loop_dec
294
295	aesd	v2.16b,v0.16b
296	aesimc	v2.16b,v2.16b
297	ld1	{v0.4s},[x2]		// v0 = last round key in the schedule
298	aesd	v2.16b,v1.16b		// final round: no aesimc
299	eor	v2.16b,v2.16b,v0.16b	// add the last round key
300
301	st1	{v2.16b},[x1]
302	ret
303
// _aes_hw_cbc_encrypt: AES-CBC encrypt or decrypt a buffer.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = length in bytes; below 16 the function returns at once
//            without touching memory, otherwise the length is rounded
//            down to a multiple of 16
//       x3 = key schedule pointer (round count read from offset 240)
//       x4 = pointer to the 16-byte IV; the contents of v6 are stored
//            back here at Lcbc_done
//       w5 = zero selects decryption, non-zero selects encryption
//
// Encryption handles one block per loop iteration, with one loop for
// 10 rounds (Lcbc_enc128) and one for the larger round counts.
// Decryption handles three blocks per iteration (Loop3x_cbc_dec) and
// finishes one or two leftover blocks in Lcbc_dec_tail.
//
// Register roles after setup: v16/v17 hold the first two round keys,
// v18-v23 and v7 hold the last seven round keys, v6 holds the chaining
// value, x8 is the post-increment step for input loads.
304.globl	_aes_hw_cbc_encrypt
305.private_extern	_aes_hw_cbc_encrypt
306
307.align	5
308_aes_hw_cbc_encrypt:
309	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
310	AARCH64_VALID_CALL_TARGET
311	stp	x29,x30,[sp,#-16]!
312	add	x29,sp,#0
313	subs	x2,x2,#16		// x2 = length - 16
314	mov	x8,#16			// x8 = input post-increment step
315	b.lo	Lcbc_abort		// length below 16: nothing to do
316	csel	x8,xzr,x8,eq		// exactly one block: do not advance x0
317
	// The flags set by the next compare are tested by "b.eq Lcbc_dec"
	// below; nothing in between writes the flags.
318	cmp	w5,#0			// en- or decrypting?
319	ldr	w5,[x3,#240]		// w5 = round count
320	and	x2,x2,#-16		// round down to a multiple of 16
321	ld1	{v6.16b},[x4]		// v6 = IV
322	ld1	{v0.16b},[x0],x8	// v0 = first input block
323
324	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
325	sub	w5,w5,#6
326	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
327	sub	w5,w5,#2		// w5 = round count - 8
328	ld1	{v18.4s,v19.4s},[x7],#32
329	ld1	{v20.4s,v21.4s},[x7],#32
330	ld1	{v22.4s,v23.4s},[x7],#32
331	ld1	{v7.4s},[x7]		// v7 = last round key
332
333	add	x7,x3,#32		// x7 = pointer to rndkey[2]
334	mov	w6,w5
335	b.eq	Lcbc_dec		// w5 was zero on entry: decrypt
336
337	cmp	w5,#2			// round count - 8 == 2, i.e. 10 rounds
338	eor	v0.16b,v0.16b,v6.16b	// v0 = first input block ^ IV
339	eor	v5.16b,v16.16b,v7.16b	// v5 = rndkey[0] ^ last round key
340	b.eq	Lcbc_enc128
341
342	ld1	{v2.4s,v3.4s},[x7]	// v2,v3 = rndkey[2],rndkey[3]
343	add	x7,x3,#16		// x7 = pointer to rndkey[1]
344	add	x6,x3,#16*4
345	add	x12,x3,#16*5
346	aese	v0.16b,v16.16b
347	aesmc	v0.16b,v0.16b
348	add	x14,x3,#16*6
349	add	x3,x3,#16*7
350	b	Lenter_cbc_enc
351
352.align	4
353Loop_cbc_enc:
354	aese	v0.16b,v16.16b
355	aesmc	v0.16b,v0.16b
356	st1	{v6.16b},[x1],#16	// store the previous output block
357Lenter_cbc_enc:
358	aese	v0.16b,v17.16b
359	aesmc	v0.16b,v0.16b
360	aese	v0.16b,v2.16b
361	aesmc	v0.16b,v0.16b
362	ld1	{v16.4s},[x6]
363	cmp	w5,#4			// round count - 8 == 4, i.e. 12 rounds
364	aese	v0.16b,v3.16b
365	aesmc	v0.16b,v0.16b
366	ld1	{v17.4s},[x12]
367	b.eq	Lcbc_enc192
368
369	aese	v0.16b,v16.16b
370	aesmc	v0.16b,v0.16b
371	ld1	{v16.4s},[x14]
372	aese	v0.16b,v17.16b
373	aesmc	v0.16b,v0.16b
374	ld1	{v17.4s},[x3]
375	nop
376
377Lcbc_enc192:
378	aese	v0.16b,v16.16b
379	aesmc	v0.16b,v0.16b
380	subs	x2,x2,#16		// flags are tested by csel and b.hs below
381	aese	v0.16b,v17.16b
382	aesmc	v0.16b,v0.16b
383	csel	x8,xzr,x8,eq		// next block is the last: stop advancing x0
384	aese	v0.16b,v18.16b
385	aesmc	v0.16b,v0.16b
386	aese	v0.16b,v19.16b
387	aesmc	v0.16b,v0.16b
388	ld1	{v16.16b},[x0],x8	// v16 = next input block
	// v16 becomes next input ^ rndkey[0] ^ last round key and is used
	// as the key operand of the first aese of the next block. v0 at
	// that point has not had the last round key added, so that aese
	// applies both the chaining XOR and rndkey[0] at once.
389	aese	v0.16b,v20.16b
390	aesmc	v0.16b,v0.16b
391	eor	v16.16b,v16.16b,v5.16b
392	aese	v0.16b,v21.16b
393	aesmc	v0.16b,v0.16b
394	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
395	aese	v0.16b,v22.16b
396	aesmc	v0.16b,v0.16b
397	aese	v0.16b,v23.16b
398	eor	v6.16b,v0.16b,v7.16b	// v6 = finished output block
399	b.hs	Loop_cbc_enc
400
401	st1	{v6.16b},[x1],#16
402	b	Lcbc_done
403
404.align	5
405Lcbc_enc128:
406	ld1	{v2.4s,v3.4s},[x7]	// v2,v3 = rndkey[2],rndkey[3]
407	aese	v0.16b,v16.16b
408	aesmc	v0.16b,v0.16b
409	b	Lenter_cbc_enc128
410Loop_cbc_enc128:
411	aese	v0.16b,v16.16b
412	aesmc	v0.16b,v0.16b
413	st1	{v6.16b},[x1],#16	// store the previous output block
414Lenter_cbc_enc128:
415	aese	v0.16b,v17.16b
416	aesmc	v0.16b,v0.16b
417	subs	x2,x2,#16		// flags are tested by csel and b.hs below
418	aese	v0.16b,v2.16b
419	aesmc	v0.16b,v0.16b
420	csel	x8,xzr,x8,eq		// next block is the last: stop advancing x0
421	aese	v0.16b,v3.16b
422	aesmc	v0.16b,v0.16b
423	aese	v0.16b,v18.16b
424	aesmc	v0.16b,v0.16b
425	aese	v0.16b,v19.16b
426	aesmc	v0.16b,v0.16b
427	ld1	{v16.16b},[x0],x8	// v16 = next input block
428	aese	v0.16b,v20.16b
429	aesmc	v0.16b,v0.16b
430	aese	v0.16b,v21.16b
431	aesmc	v0.16b,v0.16b
432	aese	v0.16b,v22.16b
433	aesmc	v0.16b,v0.16b
434	eor	v16.16b,v16.16b,v5.16b
435	aese	v0.16b,v23.16b
436	eor	v6.16b,v0.16b,v7.16b	// v6 = finished output block
437	b.hs	Loop_cbc_enc128
438
439	st1	{v6.16b},[x1],#16
440	b	Lcbc_done
441.align	5
442Lcbc_dec:
443	ld1	{v18.16b},[x0],#16
444	subs	x2,x2,#32		// bias
445	add	w6,w5,#2
446	orr	v3.16b,v0.16b,v0.16b
447	orr	v1.16b,v0.16b,v0.16b
448	orr	v19.16b,v18.16b,v18.16b
449	b.lo	Lcbc_dec_tail		// fewer than three blocks in total
450
451	orr	v1.16b,v18.16b,v18.16b
452	ld1	{v18.16b},[x0],#16
453	orr	v2.16b,v0.16b,v0.16b
454	orr	v3.16b,v1.16b,v1.16b
455	orr	v19.16b,v18.16b,v18.16b
456
	// Three blocks in flight: v0, v1 and v18. v2, v3 and v19 keep
	// copies of the same input blocks for the chaining XOR.
457Loop3x_cbc_dec:
458	aesd	v0.16b,v16.16b
459	aesimc	v0.16b,v0.16b
460	aesd	v1.16b,v16.16b
461	aesimc	v1.16b,v1.16b
462	aesd	v18.16b,v16.16b
463	aesimc	v18.16b,v18.16b
464	ld1	{v16.4s},[x7],#16
465	subs	w6,w6,#2
466	aesd	v0.16b,v17.16b
467	aesimc	v0.16b,v0.16b
468	aesd	v1.16b,v17.16b
469	aesimc	v1.16b,v1.16b
470	aesd	v18.16b,v17.16b
471	aesimc	v18.16b,v18.16b
472	ld1	{v17.4s},[x7],#16
473	b.gt	Loop3x_cbc_dec
474
475	aesd	v0.16b,v16.16b
476	aesimc	v0.16b,v0.16b
477	aesd	v1.16b,v16.16b
478	aesimc	v1.16b,v1.16b
479	aesd	v18.16b,v16.16b
480	aesimc	v18.16b,v18.16b
481	eor	v4.16b,v6.16b,v7.16b
482	subs	x2,x2,#0x30		// flags are tested by csel below and b.hs at the loop end
483	eor	v5.16b,v2.16b,v7.16b
484	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
485	aesd	v0.16b,v17.16b
486	aesimc	v0.16b,v0.16b
487	aesd	v1.16b,v17.16b
488	aesimc	v1.16b,v1.16b
489	aesd	v18.16b,v17.16b
490	aesimc	v18.16b,v18.16b
491	eor	v17.16b,v3.16b,v7.16b
492	add	x0,x0,x6		// x0 is adjusted in such way that
493					// at exit from the loop v1.16b-v18.16b
494					// are loaded with last "words"
495	orr	v6.16b,v19.16b,v19.16b
496	mov	x7,x3
497	aesd	v0.16b,v20.16b
498	aesimc	v0.16b,v0.16b
499	aesd	v1.16b,v20.16b
500	aesimc	v1.16b,v1.16b
501	aesd	v18.16b,v20.16b
502	aesimc	v18.16b,v18.16b
503	ld1	{v2.16b},[x0],#16
504	aesd	v0.16b,v21.16b
505	aesimc	v0.16b,v0.16b
506	aesd	v1.16b,v21.16b
507	aesimc	v1.16b,v1.16b
508	aesd	v18.16b,v21.16b
509	aesimc	v18.16b,v18.16b
510	ld1	{v3.16b},[x0],#16
511	aesd	v0.16b,v22.16b
512	aesimc	v0.16b,v0.16b
513	aesd	v1.16b,v22.16b
514	aesimc	v1.16b,v1.16b
515	aesd	v18.16b,v22.16b
516	aesimc	v18.16b,v18.16b
517	ld1	{v19.16b},[x0],#16
518	aesd	v0.16b,v23.16b
519	aesd	v1.16b,v23.16b
520	aesd	v18.16b,v23.16b
521	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
522	add	w6,w5,#2
523	eor	v4.16b,v4.16b,v0.16b
524	eor	v5.16b,v5.16b,v1.16b
525	eor	v18.16b,v18.16b,v17.16b
526	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
527	st1	{v4.16b},[x1],#16
528	orr	v0.16b,v2.16b,v2.16b
529	st1	{v5.16b},[x1],#16
530	orr	v1.16b,v3.16b,v3.16b
531	st1	{v18.16b},[x1],#16
532	orr	v18.16b,v19.16b,v19.16b
533	b.hs	Loop3x_cbc_dec
534
535	cmn	x2,#0x30		// x2 == -0x30: no leftover blocks
536	b.eq	Lcbc_done
537	nop
538
	// One or two leftover blocks, carried in v1 and v18.
539Lcbc_dec_tail:
540	aesd	v1.16b,v16.16b
541	aesimc	v1.16b,v1.16b
542	aesd	v18.16b,v16.16b
543	aesimc	v18.16b,v18.16b
544	ld1	{v16.4s},[x7],#16
545	subs	w6,w6,#2
546	aesd	v1.16b,v17.16b
547	aesimc	v1.16b,v1.16b
548	aesd	v18.16b,v17.16b
549	aesimc	v18.16b,v18.16b
550	ld1	{v17.4s},[x7],#16
551	b.gt	Lcbc_dec_tail
552
553	aesd	v1.16b,v16.16b
554	aesimc	v1.16b,v1.16b
555	aesd	v18.16b,v16.16b
556	aesimc	v18.16b,v18.16b
557	aesd	v1.16b,v17.16b
558	aesimc	v1.16b,v1.16b
559	aesd	v18.16b,v17.16b
560	aesimc	v18.16b,v18.16b
561	aesd	v1.16b,v20.16b
562	aesimc	v1.16b,v1.16b
563	aesd	v18.16b,v20.16b
564	aesimc	v18.16b,v18.16b
565	cmn	x2,#0x20		// x2 == -0x20: exactly one leftover block
566	aesd	v1.16b,v21.16b
567	aesimc	v1.16b,v1.16b
568	aesd	v18.16b,v21.16b
569	aesimc	v18.16b,v18.16b
570	eor	v5.16b,v6.16b,v7.16b
571	aesd	v1.16b,v22.16b
572	aesimc	v1.16b,v1.16b
573	aesd	v18.16b,v22.16b
574	aesimc	v18.16b,v18.16b
575	eor	v17.16b,v3.16b,v7.16b
576	aesd	v1.16b,v23.16b
577	aesd	v18.16b,v23.16b
578	b.eq	Lcbc_dec_one
579	eor	v5.16b,v5.16b,v1.16b
580	eor	v17.16b,v17.16b,v18.16b
581	orr	v6.16b,v19.16b,v19.16b
582	st1	{v5.16b},[x1],#16
583	st1	{v17.16b},[x1],#16
584	b	Lcbc_done
585
586Lcbc_dec_one:
587	eor	v5.16b,v5.16b,v18.16b
588	orr	v6.16b,v19.16b,v19.16b
589	st1	{v5.16b},[x1],#16
590
591Lcbc_done:
592	st1	{v6.16b},[x4]		// write the chaining value back to the IV buffer
593Lcbc_abort:
594	ldr	x29,[sp],#16		// restore x29 and drop the frame; x30 is not reloaded
595	ret
596
// _aes_hw_ctr32_encrypt_blocks: AES-CTR over whole 16-byte blocks with
// a 32-bit counter.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = number of 16-byte blocks
//       x3 = key schedule pointer (round count read from offset 240)
//       x4 = pointer to the 16-byte counter block. The 32-bit word at
//            offset 12 is the counter; it is byte-reversed into w8
//            unless __ARMEB__ is defined. This buffer is only read.
//
// Three blocks are processed per iteration of Loop3x_ctr32; one or two
// leftover blocks are handled by Lctr32_tail.
//
// Register roles after setup: v16/v17 hold the first two round keys,
// v20-v23 and v7 hold the last five round keys, w8 holds the running
// counter, v6 is the staging register for counter blocks, x12 is the
// post-increment step for the first input load in the tail.
//
// NOTE(review): with x2 == 0 the b.ls below still enters Lctr32_tail,
// which stores two blocks to [x1]. Callers presumably never pass a
// zero block count -- confirm.
597.globl	_aes_hw_ctr32_encrypt_blocks
598.private_extern	_aes_hw_ctr32_encrypt_blocks
599
600.align	5
601_aes_hw_ctr32_encrypt_blocks:
602	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
603	AARCH64_VALID_CALL_TARGET
604	stp	x29,x30,[sp,#-16]!
605	add	x29,sp,#0
606	ldr	w5,[x3,#240]		// w5 = round count
607
608	ldr	w8, [x4, #12]		// w8 = counter word as stored
609	ld1	{v0.4s},[x4]		// v0 = full counter block
610
611	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
612	sub	w5,w5,#4
613	mov	x12,#16
614	cmp	x2,#2			// flags are tested by csel and b.ls below
615	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
616	sub	w5,w5,#2		// w5 = round count - 6
617	ld1	{v20.4s,v21.4s},[x7],#32
618	ld1	{v22.4s,v23.4s},[x7],#32
619	ld1	{v7.4s},[x7]		// v7 = last round key
620	add	x7,x3,#32		// x7 = pointer to rndkey[2]
621	mov	w6,w5
622	csel	x12,xzr,x12,lo		// fewer than two blocks: x12 = 0
623
624	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
625	// affected by silicon errata #1742098 [0] and #1655431 [1],
626	// respectively, where the second instruction of an aese/aesmc
627	// instruction pair may execute twice if an interrupt is taken right
628	// after the first instruction consumes an input register of which a
629	// single 32-bit lane has been updated the last time it was modified.
630	//
631	// This function uses a counter in one 32-bit lane. The vmov lines
632	// could write to v1.16b and v18.16b directly, but that trips these bugs.
633	// We write to v6.16b and copy to the final register as a workaround.
634	//
635	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
636	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
637#ifndef __ARMEB__
638	rev	w8, w8
639#endif
640	add	w10, w8, #1		// counter + 1
641	orr	v6.16b,v0.16b,v0.16b
642	rev	w10, w10
643	mov	v6.s[3],w10
644	add	w8, w8, #2
645	orr	v1.16b,v6.16b,v6.16b	// v1 = counter block for counter + 1
646	b.ls	Lctr32_tail		// x2 <= 2: tail only
647	rev	w12, w8
648	mov	v6.s[3],w12
649	sub	x2,x2,#3		// bias
650	orr	v18.16b,v6.16b,v6.16b	// v18 = counter block for counter + 2
651	b	Loop3x_ctr32
652
653.align	4
654Loop3x_ctr32:
655	aese	v0.16b,v16.16b
656	aesmc	v0.16b,v0.16b
657	aese	v1.16b,v16.16b
658	aesmc	v1.16b,v1.16b
659	aese	v18.16b,v16.16b
660	aesmc	v18.16b,v18.16b
661	ld1	{v16.4s},[x7],#16
662	subs	w6,w6,#2
663	aese	v0.16b,v17.16b
664	aesmc	v0.16b,v0.16b
665	aese	v1.16b,v17.16b
666	aesmc	v1.16b,v1.16b
667	aese	v18.16b,v17.16b
668	aesmc	v18.16b,v18.16b
669	ld1	{v17.4s},[x7],#16
670	b.gt	Loop3x_ctr32
671
	// From here the three states continue in v4, v5 and v17, which
	// frees v0, v1 and v18 for the next three counter blocks.
672	aese	v0.16b,v16.16b
673	aesmc	v4.16b,v0.16b
674	aese	v1.16b,v16.16b
675	aesmc	v5.16b,v1.16b
676	ld1	{v2.16b},[x0],#16
677	add	w9,w8,#1
678	aese	v18.16b,v16.16b
679	aesmc	v18.16b,v18.16b
680	ld1	{v3.16b},[x0],#16
681	rev	w9,w9
682	aese	v4.16b,v17.16b
683	aesmc	v4.16b,v4.16b
684	aese	v5.16b,v17.16b
685	aesmc	v5.16b,v5.16b
686	ld1	{v19.16b},[x0],#16
687	mov	x7,x3
688	aese	v18.16b,v17.16b
689	aesmc	v17.16b,v18.16b
690	aese	v4.16b,v20.16b
691	aesmc	v4.16b,v4.16b
692	aese	v5.16b,v20.16b
693	aesmc	v5.16b,v5.16b
694	eor	v2.16b,v2.16b,v7.16b
695	add	w10,w8,#2
696	aese	v17.16b,v20.16b
697	aesmc	v17.16b,v17.16b
698	eor	v3.16b,v3.16b,v7.16b
699	add	w8,w8,#3
700	aese	v4.16b,v21.16b
701	aesmc	v4.16b,v4.16b
702	aese	v5.16b,v21.16b
703	aesmc	v5.16b,v5.16b
704	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
705	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
706	 // 32-bit mode. See the comment above.
707	eor	v19.16b,v19.16b,v7.16b
708	mov	v6.s[3], w9
709	aese	v17.16b,v21.16b
710	aesmc	v17.16b,v17.16b
711	orr	v0.16b,v6.16b,v6.16b
712	rev	w10,w10
713	aese	v4.16b,v22.16b
714	aesmc	v4.16b,v4.16b
715	mov	v6.s[3], w10
716	rev	w12,w8
717	aese	v5.16b,v22.16b
718	aesmc	v5.16b,v5.16b
719	orr	v1.16b,v6.16b,v6.16b
720	mov	v6.s[3], w12
721	aese	v17.16b,v22.16b
722	aesmc	v17.16b,v17.16b
723	orr	v18.16b,v6.16b,v6.16b
724	subs	x2,x2,#3		// flags are tested by b.hs at the loop end
725	aese	v4.16b,v23.16b
726	aese	v5.16b,v23.16b
727	aese	v17.16b,v23.16b
728
729	eor	v2.16b,v2.16b,v4.16b
730	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
731	st1	{v2.16b},[x1],#16
732	eor	v3.16b,v3.16b,v5.16b
733	mov	w6,w5
734	st1	{v3.16b},[x1],#16
735	eor	v19.16b,v19.16b,v17.16b
736	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
737	st1	{v19.16b},[x1],#16
738	b.hs	Loop3x_ctr32		// at least three more blocks remain
739
740	adds	x2,x2,#3		// undo the bias: x2 = blocks left (0, 1 or 2)
741	b.eq	Lctr32_done
742	cmp	x2,#1
743	mov	x12,#16
744	csel	x12,xzr,x12,eq		// exactly one block left: x12 = 0
745
	// One or two blocks, using the counter blocks in v0 and v1.
746Lctr32_tail:
747	aese	v0.16b,v16.16b
748	aesmc	v0.16b,v0.16b
749	aese	v1.16b,v16.16b
750	aesmc	v1.16b,v1.16b
751	ld1	{v16.4s},[x7],#16
752	subs	w6,w6,#2
753	aese	v0.16b,v17.16b
754	aesmc	v0.16b,v0.16b
755	aese	v1.16b,v17.16b
756	aesmc	v1.16b,v1.16b
757	ld1	{v17.4s},[x7],#16
758	b.gt	Lctr32_tail
759
760	aese	v0.16b,v16.16b
761	aesmc	v0.16b,v0.16b
762	aese	v1.16b,v16.16b
763	aesmc	v1.16b,v1.16b
764	aese	v0.16b,v17.16b
765	aesmc	v0.16b,v0.16b
766	aese	v1.16b,v17.16b
767	aesmc	v1.16b,v1.16b
768	ld1	{v2.16b},[x0],x12	// x12 is 0 or 16, so x0 may stay put
769	aese	v0.16b,v20.16b
770	aesmc	v0.16b,v0.16b
771	aese	v1.16b,v20.16b
772	aesmc	v1.16b,v1.16b
773	ld1	{v3.16b},[x0]
774	aese	v0.16b,v21.16b
775	aesmc	v0.16b,v0.16b
776	aese	v1.16b,v21.16b
777	aesmc	v1.16b,v1.16b
778	eor	v2.16b,v2.16b,v7.16b
779	aese	v0.16b,v22.16b
780	aesmc	v0.16b,v0.16b
781	aese	v1.16b,v22.16b
782	aesmc	v1.16b,v1.16b
783	eor	v3.16b,v3.16b,v7.16b
784	aese	v0.16b,v23.16b
785	aese	v1.16b,v23.16b
786
787	cmp	x2,#1
788	eor	v2.16b,v2.16b,v0.16b
789	eor	v3.16b,v3.16b,v1.16b
790	st1	{v2.16b},[x1],#16
791	b.eq	Lctr32_done		// single block: skip the second store
792	st1	{v3.16b},[x1]
793
794Lctr32_done:
795	ldr	x29,[sp],#16		// restore x29 and drop the frame; x30 is not reloaded
796	ret
797
798#endif
799#endif  // !OPENSSL_NO_ASM
800