1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17
18#if __ARM_MAX_ARCH__>=7
19.text
20.arch	armv8-a+crypto
21.section	.rodata
22.align	5
// Constants for the key-expansion code below (addressed through x3).
//   row 0: initial round constant 0x01 in every 32-bit lane; the expansion
//          loops double it with `shl` after each use.
//   row 1: byte-index mask for `tbl` (see "rotate-n-splat").
//   row 2: round constant 0x1b, reloaded into v1 by the AES-128 path once
//          its 8-iteration loop has finished.
23Lrcon:
24.long	0x01,0x01,0x01,0x01
25.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
26.long	0x1b,0x1b,0x1b,0x1b
27
28.text
29
// aes_hw_set_encrypt_key: expand a user-supplied AES key into the round-key
// schedule using the Armv8 Crypto Extension instructions.
//
// In:   x0 = pointer to the user key bytes
//       w1 = key length in bits (128, 192 or 256)
//       x2 = pointer to the output key schedule
// Out:  x0 = 0 on success,
//            -1 if x0 or x2 is NULL,
//            -2 if w1 is outside [128,256] or not a multiple of 64.
//       On success the round count (10, 12 or 14) is stored at offset 240
//       of the schedule.
// Also: on the success path x2 is left pointing at schedule+240 and w12
//       holds the round count; aes_hw_set_decrypt_key enters through
//       Lenc_key and relies on both.
// Uses: x3, w12, v0-v6 and the condition flags; x0, w1 and x2 are modified.
// NOTE(review): presumably matches a C prototype of the form
// int aes_hw_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *out)
// -- confirm against the declaring header.
30.globl	aes_hw_set_encrypt_key
31
32.def aes_hw_set_encrypt_key
33   .type 32
34.endef
35.align	5
36aes_hw_set_encrypt_key:
37Lenc_key:
38	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
39	AARCH64_VALID_CALL_TARGET
40	stp	x29,x30,[sp,#-16]!
41	add	x29,sp,#0
	// x3 carries the pending return value until Lenc_key_abort.
42	mov	x3,#-1
43	cmp	x0,#0
44	b.eq	Lenc_key_abort
45	cmp	x2,#0
46	b.eq	Lenc_key_abort
47	mov	x3,#-2
48	cmp	w1,#128
49	b.lt	Lenc_key_abort
50	cmp	w1,#256
51	b.gt	Lenc_key_abort
	// Reject any bit length that is not a multiple of 64.
52	tst	w1,#0x3f
53	b.ne	Lenc_key_abort
54
	// x3 is repurposed as the pointer to the Lrcon table. The cmp below sets
	// the flags consumed by the b.lt/b.eq dispatch further down; nothing in
	// between modifies them.
55	adrp	x3,Lrcon
56	add	x3,x3,:lo12:Lrcon
57	cmp	w1,#192
58
	// v0 = 0: used as the zero round key for aese and as zero fill for ext.
	// v3 = first 16 key bytes, v1 = round constant, v2 = tbl mask.
59	eor	v0.16b,v0.16b,v0.16b
60	ld1	{v3.16b},[x0],#16
61	mov	w1,#8		// reuse w1
62	ld1	{v1.4s,v2.4s},[x3],#32
63
64	b.lt	Loop128
65	b.eq	L192
66	b	L256
67
	// AES-128: 8 loop iterations followed by two unrolled rounds that use
	// the round constants starting at 0x1b. Each round stores the current
	// round key (v3) and then derives the next one in v3:
	//   tbl   - rotate the last word of v3 and splat it into every lane
	//   aese  - with a zero key, this applies the S-box to the splatted word
	//   ext/eor chain - XOR v3 with itself shifted up by 1, 2 and 3 words
68.align	4
69Loop128:
70	tbl	v6.16b,{v3.16b},v2.16b
71	ext	v5.16b,v0.16b,v3.16b,#12
72	st1	{v3.4s},[x2],#16
73	aese	v6.16b,v0.16b
74	subs	w1,w1,#1
75
76	eor	v3.16b,v3.16b,v5.16b
77	ext	v5.16b,v0.16b,v5.16b,#12
78	eor	v3.16b,v3.16b,v5.16b
79	ext	v5.16b,v0.16b,v5.16b,#12
80	eor	v6.16b,v6.16b,v1.16b
81	eor	v3.16b,v3.16b,v5.16b
82	shl	v1.16b,v1.16b,#1
83	eor	v3.16b,v3.16b,v6.16b
84	b.ne	Loop128
85
	// x3 now points at the third Lrcon row (0x1b).
86	ld1	{v1.4s},[x3]
87
88	tbl	v6.16b,{v3.16b},v2.16b
89	ext	v5.16b,v0.16b,v3.16b,#12
90	st1	{v3.4s},[x2],#16
91	aese	v6.16b,v0.16b
92
93	eor	v3.16b,v3.16b,v5.16b
94	ext	v5.16b,v0.16b,v5.16b,#12
95	eor	v3.16b,v3.16b,v5.16b
96	ext	v5.16b,v0.16b,v5.16b,#12
97	eor	v6.16b,v6.16b,v1.16b
98	eor	v3.16b,v3.16b,v5.16b
99	shl	v1.16b,v1.16b,#1
100	eor	v3.16b,v3.16b,v6.16b
101
102	tbl	v6.16b,{v3.16b},v2.16b
103	ext	v5.16b,v0.16b,v3.16b,#12
104	st1	{v3.4s},[x2],#16
105	aese	v6.16b,v0.16b
106
107	eor	v3.16b,v3.16b,v5.16b
108	ext	v5.16b,v0.16b,v5.16b,#12
109	eor	v3.16b,v3.16b,v5.16b
110	ext	v5.16b,v0.16b,v5.16b,#12
111	eor	v6.16b,v6.16b,v1.16b
112	eor	v3.16b,v3.16b,v5.16b
113	eor	v3.16b,v3.16b,v6.16b
	// The last round key is stored without post-increment (x2 is at
	// schedule+160 here), so adding 0x50 leaves x2 at schedule+240.
114	st1	{v3.4s},[x2]
115	add	x2,x2,#0x50
116
117	mov	w12,#10
118	b	Ldone
119
	// AES-192: v4 holds the extra 8 key bytes. The tbl mask is lowered by 8
	// so that it selects from the low half of v4. Each of the 8 iterations
	// stores 8 + 16 bytes of schedule.
120.align	4
121L192:
122	ld1	{v4.8b},[x0],#8
123	movi	v6.16b,#8			// borrow v6.16b
124	st1	{v3.4s},[x2],#16
125	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
126
127Loop192:
128	tbl	v6.16b,{v4.16b},v2.16b
129	ext	v5.16b,v0.16b,v3.16b,#12
130	st1	{v4.8b},[x2],#8
131	aese	v6.16b,v0.16b
132	subs	w1,w1,#1
133
134	eor	v3.16b,v3.16b,v5.16b
135	ext	v5.16b,v0.16b,v5.16b,#12
136	eor	v3.16b,v3.16b,v5.16b
137	ext	v5.16b,v0.16b,v5.16b,#12
138	eor	v3.16b,v3.16b,v5.16b
139
140	dup	v5.4s,v3.s[3]
141	eor	v5.16b,v5.16b,v4.16b
142	eor	v6.16b,v6.16b,v1.16b
143	ext	v4.16b,v0.16b,v4.16b,#12
144	shl	v1.16b,v1.16b,#1
145	eor	v4.16b,v4.16b,v5.16b
146	eor	v3.16b,v3.16b,v6.16b
147	eor	v4.16b,v4.16b,v6.16b
148	st1	{v3.4s},[x2],#16
149	b.ne	Loop192
150
	// x2 is at schedule+208 after the loop; +0x20 brings it to schedule+240.
151	mov	w12,#12
152	add	x2,x2,#0x20
153	b	Ldone
154
	// AES-256: v3/v4 hold the two 16-byte key halves. Seven iterations; each
	// one stores v4 and the freshly derived v3, and the last iteration leaves
	// through the b.eq with x2 already at schedule+240.
155.align	4
156L256:
157	ld1	{v4.16b},[x0]
158	mov	w1,#7
159	mov	w12,#14
160	st1	{v3.4s},[x2],#16
161
162Loop256:
163	tbl	v6.16b,{v4.16b},v2.16b
164	ext	v5.16b,v0.16b,v3.16b,#12
165	st1	{v4.4s},[x2],#16
166	aese	v6.16b,v0.16b
167	subs	w1,w1,#1
168
169	eor	v3.16b,v3.16b,v5.16b
170	ext	v5.16b,v0.16b,v5.16b,#12
171	eor	v3.16b,v3.16b,v5.16b
172	ext	v5.16b,v0.16b,v5.16b,#12
173	eor	v6.16b,v6.16b,v1.16b
174	eor	v3.16b,v3.16b,v5.16b
175	shl	v1.16b,v1.16b,#1
176	eor	v3.16b,v3.16b,v6.16b
177	st1	{v3.4s},[x2],#16
178	b.eq	Ldone
179
	// Second half of the step: no rotation and no round constant here.
180	dup	v6.4s,v3.s[3]		// just splat
181	ext	v5.16b,v0.16b,v4.16b,#12
182	aese	v6.16b,v0.16b
183
184	eor	v4.16b,v4.16b,v5.16b
185	ext	v5.16b,v0.16b,v5.16b,#12
186	eor	v4.16b,v4.16b,v5.16b
187	ext	v5.16b,v0.16b,v5.16b,#12
188	eor	v4.16b,v4.16b,v5.16b
189
190	eor	v4.16b,v4.16b,v6.16b
191	b	Loop256
192
	// Common success exit: x2 == schedule+240, w12 == round count.
193Ldone:
194	str	w12,[x2]
195	mov	x3,#0
196
	// Only x29 is reloaded; x30 is never modified inside this function.
197Lenc_key_abort:
198	mov	x0,x3			// return value
199	ldr	x29,[sp],#16
200	ret
201
202
// aes_hw_set_decrypt_key: build the decryption key schedule.
//
// Runs the encryption key expansion through Lenc_key (same x0/w1/x2
// arguments), then reverses the order of the round keys in place and applies
// aesimc to every round key except the first and the last.
//
// Out:  x0 = 0 on success, otherwise the non-zero code returned by Lenc_key
//       (the schedule is not post-processed in that case).
// Uses: x4, v0, v1 and flags, in addition to everything Lenc_key uses.
203.globl	aes_hw_set_decrypt_key
204
205.def aes_hw_set_decrypt_key
206   .type 32
207.endef
208.align	5
209aes_hw_set_decrypt_key:
210	AARCH64_SIGN_LINK_REGISTER
211	stp	x29,x30,[sp,#-16]!
212	add	x29,sp,#0
213	bl	Lenc_key
214
215	cmp	x0,#0
216	b.ne	Ldec_key_abort
217
	// Lenc_key leaves x2 at schedule+240 and the round count in w12.
	// x2 walks forward from the first round key, x0 walks backward from the
	// last one (post-index step x4 = -16).
218	sub	x2,x2,#240		// restore original x2
219	mov	x4,#-16
220	add	x0,x2,x12,lsl#4	// end of key schedule
221
	// Swap the first and the last round key unchanged.
222	ld1	{v0.4s},[x2]
223	ld1	{v1.4s},[x0]
224	st1	{v0.4s},[x0],x4
225	st1	{v1.4s},[x2],#16
226
	// Swap each inner pair, applying aesimc to both keys, for as long as the
	// backward pointer is still above the forward pointer.
227Loop_imc:
228	ld1	{v0.4s},[x2]
229	ld1	{v1.4s},[x0]
230	aesimc	v0.16b,v0.16b
231	aesimc	v1.16b,v1.16b
232	st1	{v0.4s},[x0],x4
233	st1	{v1.4s},[x2],#16
234	cmp	x0,x2
235	b.hi	Loop_imc
236
	// The remaining middle round key is transformed in place. With the round
	// counts Lenc_key produces (10, 12, 14) the number of round keys is odd,
	// so x0 and x2 coincide here.
237	ld1	{v0.4s},[x2]
238	aesimc	v0.16b,v0.16b
239	st1	{v0.4s},[x0]
240
241	eor	x0,x0,x0		// return value
242Ldec_key_abort:
243	ldp	x29,x30,[sp],#16
244	AARCH64_VALIDATE_LINK_REGISTER
245	ret
246
// aes_hw_encrypt: encrypt a single 16-byte block.
//
// In:   x0 = input block
//       x1 = output block
//       x2 = key schedule; the round count is read from offset 240
// Uses: w3, v0-v2 and flags; x2 is advanced through the schedule.
247.globl	aes_hw_encrypt
248
249.def aes_hw_encrypt
250   .type 32
251.endef
252.align	5
253aes_hw_encrypt:
254	AARCH64_VALID_CALL_TARGET
255	ldr	w3,[x2,#240]
256	ld1	{v0.4s},[x2],#16
257	ld1	{v2.16b},[x0]
258	sub	w3,w3,#2
259	ld1	{v1.4s},[x2],#16
260
	// Two rounds per iteration: v0/v1 hold the current pair of round keys and
	// are refilled while the state (v2) is being processed. w3 counts the
	// remaining rounds down in steps of two.
261Loop_enc:
262	aese	v2.16b,v0.16b
263	aesmc	v2.16b,v2.16b
264	ld1	{v0.4s},[x2],#16
265	subs	w3,w3,#2
266	aese	v2.16b,v1.16b
267	aesmc	v2.16b,v2.16b
268	ld1	{v1.4s},[x2],#16
269	b.gt	Loop_enc
270
	// Last two rounds: the final round has no aesmc and ends by XORing in
	// the last round key, loaded into v0 here.
271	aese	v2.16b,v0.16b
272	aesmc	v2.16b,v2.16b
273	ld1	{v0.4s},[x2]
274	aese	v2.16b,v1.16b
275	eor	v2.16b,v2.16b,v0.16b
276
277	st1	{v2.16b},[x1]
278	ret
279
// aes_hw_decrypt: decrypt a single 16-byte block.
//
// In:   x0 = input block
//       x1 = output block
//       x2 = key schedule; the round count is read from offset 240
//       NOTE(review): presumably a schedule prepared by
//       aes_hw_set_decrypt_key -- confirm with callers.
// Uses: w3, v0-v2 and flags; x2 is advanced through the schedule.
280.globl	aes_hw_decrypt
281
282.def aes_hw_decrypt
283   .type 32
284.endef
285.align	5
286aes_hw_decrypt:
287	AARCH64_VALID_CALL_TARGET
288	ldr	w3,[x2,#240]
289	ld1	{v0.4s},[x2],#16
290	ld1	{v2.16b},[x0]
291	sub	w3,w3,#2
292	ld1	{v1.4s},[x2],#16
293
	// Two rounds per iteration: v0/v1 hold the current pair of round keys and
	// are refilled while the state (v2) is being processed. w3 counts the
	// remaining rounds down in steps of two.
294Loop_dec:
295	aesd	v2.16b,v0.16b
296	aesimc	v2.16b,v2.16b
297	ld1	{v0.4s},[x2],#16
298	subs	w3,w3,#2
299	aesd	v2.16b,v1.16b
300	aesimc	v2.16b,v2.16b
301	ld1	{v1.4s},[x2],#16
302	b.gt	Loop_dec
303
	// Last two rounds: the final round has no aesimc and ends by XORing in
	// the last round key, loaded into v0 here.
304	aesd	v2.16b,v0.16b
305	aesimc	v2.16b,v2.16b
306	ld1	{v0.4s},[x2]
307	aesd	v2.16b,v1.16b
308	eor	v2.16b,v2.16b,v0.16b
309
310	st1	{v2.16b},[x1]
311	ret
312
// aes_hw_cbc_encrypt: AES-CBC encryption or decryption of whole blocks.
//
// In:   x0 = input
//       x1 = output
//       x2 = length in bytes; a length below 16 returns immediately without
//            touching the IV, otherwise floor(length/16) blocks are processed
//       x3 = key schedule; the round count is read from offset 240
//       x4 = 16-byte IV, read on entry and overwritten on exit with the last
//            ciphertext block
//       w5 = non-zero to encrypt, zero to decrypt
// Uses: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23 and flags.
//
// Register roles after setup:
//       v6  = chaining value (IV / previous ciphertext block)
//       v7  = last round key
//       v16,v17 = first two round keys
//       v18-v23 = the six round keys preceding the last one
//       w5  = round count - 8 (2, 4 or 6 for 10, 12 or 14 rounds)
//       x8  = post-index step for input loads: 16, or 0 once the final block
//             has been reached, so the look-ahead load re-reads the current
//             block instead of advancing
313.globl	aes_hw_cbc_encrypt
314
315.def aes_hw_cbc_encrypt
316   .type 32
317.endef
318.align	5
319aes_hw_cbc_encrypt:
320	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
321	AARCH64_VALID_CALL_TARGET
322	stp	x29,x30,[sp,#-16]!
323	add	x29,sp,#0
324	subs	x2,x2,#16
325	mov	x8,#16
326	b.lo	Lcbc_abort
327	csel	x8,xzr,x8,eq
328
	// The flags set by this cmp are consumed by the `b.eq Lcbc_dec` below;
	// none of the instructions in between modify them.
329	cmp	w5,#0			// en- or decrypting?
330	ldr	w5,[x3,#240]
331	and	x2,x2,#-16
332	ld1	{v6.16b},[x4]
333	ld1	{v0.16b},[x0],x8
334
335	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
336	sub	w5,w5,#6
337	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
338	sub	w5,w5,#2
339	ld1	{v18.4s,v19.4s},[x7],#32
340	ld1	{v20.4s,v21.4s},[x7],#32
341	ld1	{v22.4s,v23.4s},[x7],#32
342	ld1	{v7.4s},[x7]
343
344	add	x7,x3,#32
345	mov	w6,w5
346	b.eq	Lcbc_dec
347
	// Encryption. v0 = first plaintext block XOR IV.
	// v5 = rndkey[0] XOR last round key: it is XORed into each look-ahead
	// plaintext block so that the next block's first aese, applied to the
	// state before the final key XOR, sees previous-ciphertext XOR plaintext
	// XOR rndkey[0].
348	cmp	w5,#2
349	eor	v0.16b,v0.16b,v6.16b
350	eor	v5.16b,v16.16b,v7.16b
351	b.eq	Lcbc_enc128
352
	// 12- and 14-round path: v2,v3 = rndkey[2],[3]; x7 -> rndkey[1];
	// x6, x12, x14, x3 -> rndkey[4], [5], [6], [7].
353	ld1	{v2.4s,v3.4s},[x7]
354	add	x7,x3,#16
355	add	x6,x3,#16*4
356	add	x12,x3,#16*5
357	aese	v0.16b,v16.16b
358	aesmc	v0.16b,v0.16b
359	add	x14,x3,#16*6
360	add	x3,x3,#16*7
361	b	Lenter_cbc_enc
362
363.align	4
364Loop_cbc_enc:
365	aese	v0.16b,v16.16b
366	aesmc	v0.16b,v0.16b
367	st1	{v6.16b},[x1],#16
368Lenter_cbc_enc:
369	aese	v0.16b,v17.16b
370	aesmc	v0.16b,v0.16b
371	aese	v0.16b,v2.16b
372	aesmc	v0.16b,v0.16b
373	ld1	{v16.4s},[x6]
	// w5 == 4 means 12 rounds: skip the two extra rounds used for 14 rounds.
374	cmp	w5,#4
375	aese	v0.16b,v3.16b
376	aesmc	v0.16b,v0.16b
377	ld1	{v17.4s},[x12]
378	b.eq	Lcbc_enc192
379
380	aese	v0.16b,v16.16b
381	aesmc	v0.16b,v0.16b
382	ld1	{v16.4s},[x14]
383	aese	v0.16b,v17.16b
384	aesmc	v0.16b,v0.16b
385	ld1	{v17.4s},[x3]
386	nop
387
388Lcbc_enc192:
389	aese	v0.16b,v16.16b
390	aesmc	v0.16b,v0.16b
	// Flags from this subs drive both the csel below and the b.hs that
	// closes the loop.
391	subs	x2,x2,#16
392	aese	v0.16b,v17.16b
393	aesmc	v0.16b,v0.16b
394	csel	x8,xzr,x8,eq
395	aese	v0.16b,v18.16b
396	aesmc	v0.16b,v0.16b
397	aese	v0.16b,v19.16b
398	aesmc	v0.16b,v0.16b
	// Look-ahead load of the next plaintext block into v16.
399	ld1	{v16.16b},[x0],x8
400	aese	v0.16b,v20.16b
401	aesmc	v0.16b,v0.16b
402	eor	v16.16b,v16.16b,v5.16b
403	aese	v0.16b,v21.16b
404	aesmc	v0.16b,v0.16b
405	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
406	aese	v0.16b,v22.16b
407	aesmc	v0.16b,v0.16b
408	aese	v0.16b,v23.16b
	// v6 = ciphertext block = final state XOR last round key.
409	eor	v6.16b,v0.16b,v7.16b
410	b.hs	Loop_cbc_enc
411
412	st1	{v6.16b},[x1],#16
413	b	Lcbc_done
414
	// 10-round encryption: every round key stays resident in registers
	// (v16, v17, v2, v3, v18-v23, v7).
415.align	5
416Lcbc_enc128:
417	ld1	{v2.4s,v3.4s},[x7]
418	aese	v0.16b,v16.16b
419	aesmc	v0.16b,v0.16b
420	b	Lenter_cbc_enc128
421Loop_cbc_enc128:
422	aese	v0.16b,v16.16b
423	aesmc	v0.16b,v0.16b
424	st1	{v6.16b},[x1],#16
425Lenter_cbc_enc128:
426	aese	v0.16b,v17.16b
427	aesmc	v0.16b,v0.16b
428	subs	x2,x2,#16
429	aese	v0.16b,v2.16b
430	aesmc	v0.16b,v0.16b
431	csel	x8,xzr,x8,eq
432	aese	v0.16b,v3.16b
433	aesmc	v0.16b,v0.16b
434	aese	v0.16b,v18.16b
435	aesmc	v0.16b,v0.16b
436	aese	v0.16b,v19.16b
437	aesmc	v0.16b,v0.16b
438	ld1	{v16.16b},[x0],x8
439	aese	v0.16b,v20.16b
440	aesmc	v0.16b,v0.16b
441	aese	v0.16b,v21.16b
442	aesmc	v0.16b,v0.16b
443	aese	v0.16b,v22.16b
444	aesmc	v0.16b,v0.16b
445	eor	v16.16b,v16.16b,v5.16b
446	aese	v0.16b,v23.16b
447	eor	v6.16b,v0.16b,v7.16b
448	b.hs	Loop_cbc_enc128
449
450	st1	{v6.16b},[x1],#16
451	b	Lcbc_done
	// Decryption. Blocks are processed three at a time in Loop3x_cbc_dec
	// (v0, v1, v18); Lcbc_dec_tail handles a remainder of one or two blocks
	// (v1, v18). v2, v3, v19 keep copies of the ciphertext blocks, which are
	// needed as chaining values after the block cipher has run.
452.align	5
453Lcbc_dec:
454	ld1	{v18.16b},[x0],#16
455	subs	x2,x2,#32		// bias
456	add	w6,w5,#2
457	orr	v3.16b,v0.16b,v0.16b
458	orr	v1.16b,v0.16b,v0.16b
459	orr	v19.16b,v18.16b,v18.16b
	// Fewer than three blocks in total: go straight to the tail.
460	b.lo	Lcbc_dec_tail
461
462	orr	v1.16b,v18.16b,v18.16b
463	ld1	{v18.16b},[x0],#16
464	orr	v2.16b,v0.16b,v0.16b
465	orr	v3.16b,v1.16b,v1.16b
466	orr	v19.16b,v18.16b,v18.16b
467
468Loop3x_cbc_dec:
469	aesd	v0.16b,v16.16b
470	aesimc	v0.16b,v0.16b
471	aesd	v1.16b,v16.16b
472	aesimc	v1.16b,v1.16b
473	aesd	v18.16b,v16.16b
474	aesimc	v18.16b,v18.16b
475	ld1	{v16.4s},[x7],#16
476	subs	w6,w6,#2
477	aesd	v0.16b,v17.16b
478	aesimc	v0.16b,v0.16b
479	aesd	v1.16b,v17.16b
480	aesimc	v1.16b,v1.16b
481	aesd	v18.16b,v17.16b
482	aesimc	v18.16b,v18.16b
483	ld1	{v17.4s},[x7],#16
484	b.gt	Loop3x_cbc_dec
485
486	aesd	v0.16b,v16.16b
487	aesimc	v0.16b,v0.16b
488	aesd	v1.16b,v16.16b
489	aesimc	v1.16b,v1.16b
490	aesd	v18.16b,v16.16b
491	aesimc	v18.16b,v18.16b
	// v4, v5, v17 = chaining value of each of the three blocks XOR the last
	// round key, so one eor per block finishes the decryption further down.
492	eor	v4.16b,v6.16b,v7.16b
	// Flags from this subs drive the csel below and the b.hs that closes
	// the 3x loop.
493	subs	x2,x2,#0x30
494	eor	v5.16b,v2.16b,v7.16b
495	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
496	aesd	v0.16b,v17.16b
497	aesimc	v0.16b,v0.16b
498	aesd	v1.16b,v17.16b
499	aesimc	v1.16b,v1.16b
500	aesd	v18.16b,v17.16b
501	aesimc	v18.16b,v18.16b
502	eor	v17.16b,v3.16b,v7.16b
503	add	x0,x0,x6		// x0 is adjusted in such way that
504					// at exit from the loop v1.16b-v18.16b
505					// are loaded with last "words"
506	orr	v6.16b,v19.16b,v19.16b
507	mov	x7,x3
508	aesd	v0.16b,v20.16b
509	aesimc	v0.16b,v0.16b
510	aesd	v1.16b,v20.16b
511	aesimc	v1.16b,v1.16b
512	aesd	v18.16b,v20.16b
513	aesimc	v18.16b,v18.16b
514	ld1	{v2.16b},[x0],#16
515	aesd	v0.16b,v21.16b
516	aesimc	v0.16b,v0.16b
517	aesd	v1.16b,v21.16b
518	aesimc	v1.16b,v1.16b
519	aesd	v18.16b,v21.16b
520	aesimc	v18.16b,v18.16b
521	ld1	{v3.16b},[x0],#16
522	aesd	v0.16b,v22.16b
523	aesimc	v0.16b,v0.16b
524	aesd	v1.16b,v22.16b
525	aesimc	v1.16b,v1.16b
526	aesd	v18.16b,v22.16b
527	aesimc	v18.16b,v18.16b
528	ld1	{v19.16b},[x0],#16
529	aesd	v0.16b,v23.16b
530	aesd	v1.16b,v23.16b
531	aesd	v18.16b,v23.16b
532	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
533	add	w6,w5,#2
534	eor	v4.16b,v4.16b,v0.16b
535	eor	v5.16b,v5.16b,v1.16b
536	eor	v18.16b,v18.16b,v17.16b
537	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
538	st1	{v4.16b},[x1],#16
539	orr	v0.16b,v2.16b,v2.16b
540	st1	{v5.16b},[x1],#16
541	orr	v1.16b,v3.16b,v3.16b
542	st1	{v18.16b},[x1],#16
543	orr	v18.16b,v19.16b,v19.16b
544	b.hs	Loop3x_cbc_dec
545
	// x2 == -0x30 means no blocks are left.
546	cmn	x2,#0x30
547	b.eq	Lcbc_done
548	nop
549
550Lcbc_dec_tail:
551	aesd	v1.16b,v16.16b
552	aesimc	v1.16b,v1.16b
553	aesd	v18.16b,v16.16b
554	aesimc	v18.16b,v18.16b
555	ld1	{v16.4s},[x7],#16
556	subs	w6,w6,#2
557	aesd	v1.16b,v17.16b
558	aesimc	v1.16b,v1.16b
559	aesd	v18.16b,v17.16b
560	aesimc	v18.16b,v18.16b
561	ld1	{v17.4s},[x7],#16
562	b.gt	Lcbc_dec_tail
563
564	aesd	v1.16b,v16.16b
565	aesimc	v1.16b,v1.16b
566	aesd	v18.16b,v16.16b
567	aesimc	v18.16b,v18.16b
568	aesd	v1.16b,v17.16b
569	aesimc	v1.16b,v1.16b
570	aesd	v18.16b,v17.16b
571	aesimc	v18.16b,v18.16b
572	aesd	v1.16b,v20.16b
573	aesimc	v1.16b,v1.16b
574	aesd	v18.16b,v20.16b
575	aesimc	v18.16b,v18.16b
	// x2 == -0x20 means exactly one block is left; the flags are consumed by
	// `b.eq Lcbc_dec_one` below (no instruction in between modifies them).
576	cmn	x2,#0x20
577	aesd	v1.16b,v21.16b
578	aesimc	v1.16b,v1.16b
579	aesd	v18.16b,v21.16b
580	aesimc	v18.16b,v18.16b
581	eor	v5.16b,v6.16b,v7.16b
582	aesd	v1.16b,v22.16b
583	aesimc	v1.16b,v1.16b
584	aesd	v18.16b,v22.16b
585	aesimc	v18.16b,v18.16b
586	eor	v17.16b,v3.16b,v7.16b
587	aesd	v1.16b,v23.16b
588	aesd	v18.16b,v23.16b
589	b.eq	Lcbc_dec_one
	// Two blocks left: emit both; v19 (last ciphertext) becomes the new IV.
590	eor	v5.16b,v5.16b,v1.16b
591	eor	v17.16b,v17.16b,v18.16b
592	orr	v6.16b,v19.16b,v19.16b
593	st1	{v5.16b},[x1],#16
594	st1	{v17.16b},[x1],#16
595	b	Lcbc_done
596
	// One block left: only the v18 lane carries the result.
597Lcbc_dec_one:
598	eor	v5.16b,v5.16b,v18.16b
599	orr	v6.16b,v19.16b,v19.16b
600	st1	{v5.16b},[x1],#16
601
	// Write the chaining value back to the caller's IV buffer.
602Lcbc_done:
603	st1	{v6.16b},[x4]
	// Only x29 is reloaded; x30 is never modified inside this function.
604Lcbc_abort:
605	ldr	x29,[sp],#16
606	ret
607
// aes_hw_ctr32_encrypt_blocks: AES-CTR with a 32-bit counter; XORs the
// keystream into the input.
//
// In:   x0 = input
//       x1 = output
//       x2 = number of 16-byte blocks
//       x3 = key schedule; the round count is read from offset 240
//       x4 = 16-byte counter block; its last 32-bit word is treated as a
//            big-endian counter and incremented once per block. The block
//            in memory is only read, never written back.
// Uses: x0-x2, x5-x10, x12, v0-v7, v16-v23 and flags.
//
// Blocks are processed three at a time in Loop3x_ctr32 (counters in v0, v1,
// v18); Lctr32_tail handles one or two blocks (counters in v0, v1).
// NOTE(review): x2 == 0 is not special-cased; that path falls into
// Lctr32_tail, which stores output unconditionally -- callers presumably
// never pass zero blocks; confirm.
608.globl	aes_hw_ctr32_encrypt_blocks
609
610.def aes_hw_ctr32_encrypt_blocks
611   .type 32
612.endef
613.align	5
614aes_hw_ctr32_encrypt_blocks:
615	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
616	AARCH64_VALID_CALL_TARGET
617	stp	x29,x30,[sp,#-16]!
618	add	x29,sp,#0
619	ldr	w5,[x3,#240]
620
	// w8 = counter word (last 4 bytes of the block), v0 = whole counter block.
621	ldr	w8, [x4, #12]
622	ld1	{v0.4s},[x4]
623
624	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
625	sub	w5,w5,#4
626	mov	x12,#16
	// Flags from this cmp are consumed by the csel below and by the
	// `b.ls Lctr32_tail` further down; nothing in between modifies them.
627	cmp	x2,#2
628	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
629	sub	w5,w5,#2
630	ld1	{v20.4s,v21.4s},[x7],#32
631	ld1	{v22.4s,v23.4s},[x7],#32
632	ld1	{v7.4s},[x7]
633	add	x7,x3,#32
634	mov	w6,w5
	// x12 = post-index step for the tail's first input load: 0 when fewer
	// than two blocks were requested, so the second load re-reads the same
	// block instead of advancing.
635	csel	x12,xzr,x12,lo
636
637	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
638	// affected by silicon errata #1742098 [0] and #1655431 [1],
639	// respectively, where the second instruction of an aese/aesmc
640	// instruction pair may execute twice if an interrupt is taken right
641	// after the first instruction consumes an input register of which a
642	// single 32-bit lane has been updated the last time it was modified.
643	//
644	// This function uses a counter in one 32-bit lane. The vmov lines
645	// could write to v1.16b and v18.16b directly, but that trips these bugs.
646	// We write to v6.16b and copy to the final register as a workaround.
647	//
648	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
649	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
650#ifndef __ARMEB__
651	rev	w8, w8
652#endif
	// v1 = counter+1, built in v6 and then copied (see the errata note).
653	add	w10, w8, #1
654	orr	v6.16b,v0.16b,v0.16b
655	rev	w10, w10
656	mov	v6.s[3],w10
657	add	w8, w8, #2
658	orr	v1.16b,v6.16b,v6.16b
	// At most two blocks requested: only the tail is needed.
659	b.ls	Lctr32_tail
	// v18 = counter+2; x12 is reused as scratch from here on.
660	rev	w12, w8
661	mov	v6.s[3],w12
662	sub	x2,x2,#3		// bias
663	orr	v18.16b,v6.16b,v6.16b
664	b	Loop3x_ctr32
665
666.align	4
667Loop3x_ctr32:
668	aese	v0.16b,v16.16b
669	aesmc	v0.16b,v0.16b
670	aese	v1.16b,v16.16b
671	aesmc	v1.16b,v1.16b
672	aese	v18.16b,v16.16b
673	aesmc	v18.16b,v18.16b
674	ld1	{v16.4s},[x7],#16
675	subs	w6,w6,#2
676	aese	v0.16b,v17.16b
677	aesmc	v0.16b,v0.16b
678	aese	v1.16b,v17.16b
679	aesmc	v1.16b,v1.16b
680	aese	v18.16b,v17.16b
681	aesmc	v18.16b,v18.16b
682	ld1	{v17.4s},[x7],#16
683	b.gt	Loop3x_ctr32
684
	// Last five rounds. The three states move to v4, v5 and v17 so that
	// v0, v1 and v18 can be refilled with the next three counter blocks
	// while these rounds are still in flight. v2, v3, v19 receive the input
	// blocks and are pre-XORed with the last round key (v7).
685	aese	v0.16b,v16.16b
686	aesmc	v4.16b,v0.16b
687	aese	v1.16b,v16.16b
688	aesmc	v5.16b,v1.16b
689	ld1	{v2.16b},[x0],#16
690	add	w9,w8,#1
691	aese	v18.16b,v16.16b
692	aesmc	v18.16b,v18.16b
693	ld1	{v3.16b},[x0],#16
694	rev	w9,w9
695	aese	v4.16b,v17.16b
696	aesmc	v4.16b,v4.16b
697	aese	v5.16b,v17.16b
698	aesmc	v5.16b,v5.16b
699	ld1	{v19.16b},[x0],#16
700	mov	x7,x3
701	aese	v18.16b,v17.16b
702	aesmc	v17.16b,v18.16b
703	aese	v4.16b,v20.16b
704	aesmc	v4.16b,v4.16b
705	aese	v5.16b,v20.16b
706	aesmc	v5.16b,v5.16b
707	eor	v2.16b,v2.16b,v7.16b
708	add	w10,w8,#2
709	aese	v17.16b,v20.16b
710	aesmc	v17.16b,v17.16b
711	eor	v3.16b,v3.16b,v7.16b
712	add	w8,w8,#3
713	aese	v4.16b,v21.16b
714	aesmc	v4.16b,v4.16b
715	aese	v5.16b,v21.16b
716	aesmc	v5.16b,v5.16b
717	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
718	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
719	 // 32-bit mode. See the comment above.
720	eor	v19.16b,v19.16b,v7.16b
721	mov	v6.s[3], w9
722	aese	v17.16b,v21.16b
723	aesmc	v17.16b,v17.16b
724	orr	v0.16b,v6.16b,v6.16b
725	rev	w10,w10
726	aese	v4.16b,v22.16b
727	aesmc	v4.16b,v4.16b
728	mov	v6.s[3], w10
729	rev	w12,w8
730	aese	v5.16b,v22.16b
731	aesmc	v5.16b,v5.16b
732	orr	v1.16b,v6.16b,v6.16b
733	mov	v6.s[3], w12
734	aese	v17.16b,v22.16b
735	aesmc	v17.16b,v17.16b
736	orr	v18.16b,v6.16b,v6.16b
	// Flags from this subs are consumed by the b.hs that closes the loop.
737	subs	x2,x2,#3
738	aese	v4.16b,v23.16b
739	aese	v5.16b,v23.16b
740	aese	v17.16b,v23.16b
741
742	eor	v2.16b,v2.16b,v4.16b
743	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
744	st1	{v2.16b},[x1],#16
745	eor	v3.16b,v3.16b,v5.16b
746	mov	w6,w5
747	st1	{v3.16b},[x1],#16
748	eor	v19.16b,v19.16b,v17.16b
749	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
750	st1	{v19.16b},[x1],#16
751	b.hs	Loop3x_ctr32
752
	// Undo the bias: x2 = number of blocks still to do (0, 1 or 2).
	// x12 = 0 when exactly one block is left, otherwise 16.
753	adds	x2,x2,#3
754	b.eq	Lctr32_done
755	cmp	x2,#1
756	mov	x12,#16
757	csel	x12,xzr,x12,eq
758
759Lctr32_tail:
760	aese	v0.16b,v16.16b
761	aesmc	v0.16b,v0.16b
762	aese	v1.16b,v16.16b
763	aesmc	v1.16b,v1.16b
764	ld1	{v16.4s},[x7],#16
765	subs	w6,w6,#2
766	aese	v0.16b,v17.16b
767	aesmc	v0.16b,v0.16b
768	aese	v1.16b,v17.16b
769	aesmc	v1.16b,v1.16b
770	ld1	{v17.4s},[x7],#16
771	b.gt	Lctr32_tail
772
773	aese	v0.16b,v16.16b
774	aesmc	v0.16b,v0.16b
775	aese	v1.16b,v16.16b
776	aesmc	v1.16b,v1.16b
777	aese	v0.16b,v17.16b
778	aesmc	v0.16b,v0.16b
779	aese	v1.16b,v17.16b
780	aesmc	v1.16b,v1.16b
	// With x12 == 0 the second load below re-reads the same input block.
781	ld1	{v2.16b},[x0],x12
782	aese	v0.16b,v20.16b
783	aesmc	v0.16b,v0.16b
784	aese	v1.16b,v20.16b
785	aesmc	v1.16b,v1.16b
786	ld1	{v3.16b},[x0]
787	aese	v0.16b,v21.16b
788	aesmc	v0.16b,v0.16b
789	aese	v1.16b,v21.16b
790	aesmc	v1.16b,v1.16b
791	eor	v2.16b,v2.16b,v7.16b
792	aese	v0.16b,v22.16b
793	aesmc	v0.16b,v0.16b
794	aese	v1.16b,v22.16b
795	aesmc	v1.16b,v1.16b
796	eor	v3.16b,v3.16b,v7.16b
797	aese	v0.16b,v23.16b
798	aese	v1.16b,v23.16b
799
	// The first block is always stored; the second store is skipped only
	// when x2 == 1.
800	cmp	x2,#1
801	eor	v2.16b,v2.16b,v0.16b
802	eor	v3.16b,v3.16b,v1.16b
803	st1	{v2.16b},[x1],#16
804	b.eq	Lctr32_done
805	st1	{v3.16b},[x1]
806
	// Only x29 is reloaded; x30 is never modified inside this function.
807Lctr32_done:
808	ldr	x29,[sp],#16
809	ret
810
811#endif
812#endif
813#endif  // !OPENSSL_NO_ASM
814