1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17
#if __ARM_MAX_ARCH__>=7
.text
.arch	armv8-a+crypto
.section	.rodata
// Constants for the key-schedule routines below:
//   .Lrcon+0  : initial AES round constant 0x01, splatted across lanes
//   .Lrcon+16 : TBL byte-index vector implementing the rotate-n-splat of
//               the last key word (feeds AESE to get SubWord(RotWord(w)))
//   .Lrcon+32 : 0x1b — the reduced round constant reloaded once doubling
//               via `shl` would overflow GF(2^8) (see .Loop128 epilogue)
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text
29
.globl	aes_hw_set_encrypt_key
.hidden	aes_hw_set_encrypt_key
.type	aes_hw_set_encrypt_key,%function
.align	5
//-----------------------------------------------------------------------
// int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key)
// In:   x0 = user_key, w1 = key size in bits, x2 = output key schedule
// Out:  x0 = 0 on success, -1 for a NULL pointer, -2 for a bad key size
//       (outside 128..256 or not a multiple of 64).  On success the round
//       keys are written at [x2] and the round count (10/12/14) is stored
//       in the 32-bit word 240 bytes past the schedule base.
// Regs: v0 = always zero (AESE with a zero key = SubBytes+ShiftRows only),
//       v1 = running round constant, v2 = rotate-n-splat TBL mask,
//       v3/v4 = current key words, v5/v6 = scratch, w12 = round count.
//-----------------------------------------------------------------------
aes_hw_set_encrypt_key:
.Lenc_key:
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// x3 = pending return value: -1 = NULL arg
	cmp	x0,#0
	b.eq	.Lenc_key_abort
	cmp	x2,#0
	b.eq	.Lenc_key_abort
	mov	x3,#-2			// -2 = unsupported key length
	cmp	w1,#128
	b.lt	.Lenc_key_abort
	cmp	w1,#256
	b.gt	.Lenc_key_abort
	tst	w1,#0x3f		// must be 128, 192 or 256
	b.ne	.Lenc_key_abort

	adrp	x3,.Lrcon		// x3 = &.Lrcon (PC-relative, PIC-safe)
	add	x3,x3,:lo12:.Lrcon
	cmp	w1,#192

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0, the "null" round key for AESE
	ld1	{v3.16b},[x0],#16	// first 128 bits of the user key
	mov	w1,#8		// reuse w1 as the expansion loop counter
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon 0x01, v2 = rotate-n-splat mask

	b.lt	.Loop128		// <192 bits: AES-128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	// One AES-128 expansion round.  tbl+aese(v0) yields
	// SubWord(RotWord(last word)) splatted to every lane; the ext/eor
	// ladder forms the cumulative XOR of the previous round key's words.
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16	// emit the previous round key
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b	// fold in the round constant
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// rcon <<= 1 for the next round
	eor	v3.16b,v3.16b,v6.16b
	b.ne	.Loop128

	// Rounds 9-10, unrolled: doubling 0x80 would overflow GF(2^8), so
	// reload the reduced constant 0x1b from the table instead.
	ld1	{v1.4s},[x3]

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// 11th and final round key
	add	x2,x2,#0x50		// x2 = schedule base + 240

	mov	w12,#10			// AES-128: 10 rounds
	b	.Ldone

.align	4
.L192:
	// AES-192: 24-byte key = the 16 bytes already in v3 plus 8 more in v4.
	ld1	{v4.8b},[x0],#8
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask: last key word now sits 8 bytes earlier

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8		// emit the odd 8-byte half of the schedule
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]		// propagate the new last word into v4's chain
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12			// AES-192: 12 rounds
	add	x2,x2,#0x20		// step past the tail so x2 = base + 240
	b	.Ldone

.align	4
.L256:
	// AES-256: second 16 bytes of the key in v4; rounds alternate between
	// the rcon-bearing expansion (below) and a SubWord-only step.
	ld1	{v4.16b},[x0]
	mov	w1,#7
	mov	w12,#14			// AES-256: 14 rounds
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone

	// Intermediate 256-bit step: SubWord on the splatted last word, no
	// RotWord and no round constant.
	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]		// store the round count at schedule base + 240
	mov	x3,#0			// success

.Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16
	ret
.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
200
.globl	aes_hw_set_decrypt_key
.hidden	aes_hw_set_decrypt_key
.type	aes_hw_set_decrypt_key,%function
.align	5
//-----------------------------------------------------------------------
// int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key)
// Builds the encryption schedule via .Lenc_key, then converts it in place
// to the equivalent-inverse-cipher form: the round keys are reversed, and
// every key except the first and last is run through AESIMC
// (InvMixColumns).  Returns 0 on success, or the .Lenc_key error code.
//-----------------------------------------------------------------------
aes_hw_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key		// expand the encryption schedule first

	cmp	x0,#0
	b.ne	.Ldec_key_abort		// propagate .Lenc_key failure

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// backward stride for stores through x0
	add	x0,x2,x12,lsl#4	// end of key schedule
	// (w12 still holds the round count set by .Lenc_key.)

	// Swap the outermost pair as-is: the first and last round keys of the
	// inverse cipher do not get InvMixColumns.
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

.Loop_imc:
	// Walk inward from both ends, swapping and applying AESIMC, until the
	// pointers cross in the middle.
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	// Middle round key (pointers met): AESIMC in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value: 0 = success
.Ldec_key_abort:
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
.globl	aes_hw_encrypt
.hidden	aes_hw_encrypt
.type	aes_hw_encrypt,%function
.align	5
//-----------------------------------------------------------------------
// void aes_hw_encrypt(const uint8_t in[16], uint8_t out[16], const AES_KEY *key)
// Single-block AES encryption.  Leaf function: no stack frame needed.
// w3 = round count read from key+240; the loop retires two round keys per
// iteration, and the final round (AESE without AESMC, then XOR with the
// last round key) is peeled off after the loop.
//-----------------------------------------------------------------------
aes_hw_encrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = number of rounds
	ld1	{v0.4s},[x2],#16	// rndkey[0]
	ld1	{v2.16b},[x0]		// v2 = plaintext block
	sub	w3,w3,#2		// loop count, two rounds per iteration
	ld1	{v1.4s},[x2],#16	// rndkey[1]

.Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// v0 = last round key
	aese	v2.16b,v1.16b		// final round: no MixColumns
	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey

	st1	{v2.16b},[x1]
	ret
.size	aes_hw_encrypt,.-aes_hw_encrypt
.globl	aes_hw_decrypt
.hidden	aes_hw_decrypt
.type	aes_hw_decrypt,%function
.align	5
//-----------------------------------------------------------------------
// void aes_hw_decrypt(const uint8_t in[16], uint8_t out[16], const AES_KEY *key)
// Single-block AES decryption; mirror of aes_hw_encrypt using AESD/AESIMC.
// Expects the inverse-cipher schedule produced by aes_hw_set_decrypt_key.
// w3 = round count read from key+240; two rounds per loop iteration, final
// round (AESD without AESIMC, then XOR with the last key) peeled off.
//-----------------------------------------------------------------------
aes_hw_decrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = number of rounds
	ld1	{v0.4s},[x2],#16	// rndkey[0]
	ld1	{v2.16b},[x0]		// v2 = ciphertext block
	sub	w3,w3,#2		// loop count, two rounds per iteration
	ld1	{v1.4s},[x2],#16	// rndkey[1]

.Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// v0 = last round key
	aesd	v2.16b,v1.16b		// final round: no InvMixColumns
	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey

	st1	{v2.16b},[x1]
	ret
.size	aes_hw_decrypt,.-aes_hw_decrypt
.globl	aes_hw_cbc_encrypt
.hidden	aes_hw_cbc_encrypt
.type	aes_hw_cbc_encrypt,%function
.align	5
//-----------------------------------------------------------------------
// void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
//                         const AES_KEY *key, uint8_t ivec[16], int enc)
// In:  x0 = in, x1 = out, x2 = length in bytes (rounded down to a whole
//      number of 16-byte blocks), x3 = key schedule, x4 = ivec (read on
//      entry, updated with the last block on exit), w5 = enc flag
//      (zero = decrypt, non-zero = encrypt).
// Lengths below 16 return immediately without touching memory.
// Regs: v6 = CBC chaining value, v7 = last round key, v16-v23 = round keys.
//-----------------------------------------------------------------------
aes_hw_cbc_encrypt:
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16			// x8 = input load stride
	b.lo	.Lcbc_abort		// less than one block: nothing to do
	csel	x8,xzr,x8,eq		// exactly one block: don't read past the input

	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]		// w5 = round count (flags above survive)
	and	x2,x2,#-16		// whole blocks only
	ld1	{v6.16b},[x4]		// v6 = IV
	ld1	{v0.16b},[x0],x8	// first input block

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]		// v7 = last round key

	add	x7,x3,#32
	mov	w6,w5
	b.eq	.Lcbc_dec		// flags still from "cmp w5,#0" above

	// ---- encryption: strictly serial, one block at a time ----
	cmp	w5,#2			// w5 = rounds-8: 2 => AES-128
	eor	v0.16b,v0.16b,v6.16b	// XOR IV into the first block
	eor	v5.16b,v16.16b,v7.16b	// v5 = rndkey[0]^rndkey[last], pre-whitens the next block
	b.eq	.Lcbc_enc128

	// AES-192/-256 path: stage pointers to rounds 1,4,5,6,7 so key loads
	// can be interleaved with the AES pipeline.
	ld1	{v2.4s,v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// write out the previous ciphertext block
.Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]
	cmp	w5,#4			// w5 = rounds-8: 4 => AES-192
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]
	b.eq	.Lcbc_enc192

	// Two extra rounds for AES-256.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]
	nop

.Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block: stop advancing the input
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// fetch next plaintext block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b	// pre-whiten: ^rndkey[0]^rndkey[last]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = ciphertext = new chaining value
	b.hs	.Loop_cbc_enc

	st1	{v6.16b},[x1],#16	// flush the final ciphertext block
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	// AES-128 encryption: all ten rounds fully unrolled.
	ld1	{v2.4s,v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// write out the previous ciphertext block
.Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block: stop advancing the input
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// fetch next plaintext block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b	// pre-whiten: ^rndkey[0]^rndkey[last]
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = ciphertext = new chaining value
	b.hs	.Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done
.align	5
.Lcbc_dec:
	// ---- decryption: three blocks in flight per iteration ----
	// v0/v1/v18 carry the blocks through the rounds; v2/v3/v19 keep the
	// raw ciphertext copies needed as the next chaining values.
	ld1	{v18.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v19.16b,v18.16b,v18.16b
	b.lo	.Lcbc_dec_tail		// fewer than 3 blocks remaining

	orr	v1.16b,v18.16b,v18.16b
	ld1	{v18.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v19.16b,v18.16b,v18.16b

.Loop3x_cbc_dec:
	// Inner rounds for all three blocks, two round keys per iteration.
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_cbc_dec

	// Last 7 rounds, interleaved with CBC chaining (chain^lastkey is
	// precomputed into v4/v5/v17) and the next three input loads.
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	eor	v4.16b,v6.16b,v7.16b	// chain value for block 0
	subs	x2,x2,#0x30
	eor	v5.16b,v2.16b,v7.16b	// chain value for block 1
	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b	// chain value for block 2
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v18.16b
					// are loaded with last "words"
	orr	v6.16b,v19.16b,v19.16b	// last ciphertext becomes the new IV
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aesd	v0.16b,v23.16b		// final round: no InvMixColumns
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b	// plaintext = decrypt ^ chain ^ lastkey
	eor	v5.16b,v5.16b,v1.16b
	eor	v18.16b,v18.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b	// rotate the freshly loaded ciphertext in
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v18.16b},[x1],#16
	orr	v18.16b,v19.16b,v19.16b
	b.hs	.Loop3x_cbc_dec

	cmn	x2,#0x30		// x2 == -0x30 means nothing left over
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	// 1 or 2 blocks remain, held in v1 (possibly unused) and v18.
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	cmn	x2,#0x20		// x2 == -0x20 <=> exactly one block in the tail
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	eor	v5.16b,v6.16b,v7.16b	// chain value for the first tail block
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b	// chain value for the second tail block
	aesd	v1.16b,v23.16b		// final round: no InvMixColumns
	aesd	v18.16b,v23.16b
	b.eq	.Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b	// new IV = last ciphertext block
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	eor	v5.16b,v5.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b	// new IV = last ciphertext block
	st1	{v5.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]		// write the updated IV back to ivec
.Lcbc_abort:
	ldr	x29,[sp],#16
	ret
.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
.globl	aes_hw_ctr32_encrypt_blocks
.hidden	aes_hw_ctr32_encrypt_blocks
.type	aes_hw_ctr32_encrypt_blocks,%function
.align	5
//-----------------------------------------------------------------------
// void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
//                                  size_t blocks, const AES_KEY *key,
//                                  const uint8_t ivec[16])
// In:  x0 = in, x1 = out, x2 = number of 16-byte blocks (NOT bytes),
//      x3 = key schedule, x4 = ivec (counter block; not written back).
// Only the big-endian 32-bit counter in lane 3 of the IV is incremented,
// per the CTR32 contract; the main loop keeps three blocks in flight.
//-----------------------------------------------------------------------
aes_hw_ctr32_encrypt_blocks:
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	ldr	w5,[x3,#240]		// w5 = round count

	ldr	w8, [x4, #12]		// w8 = big-endian counter word
	ld1	{v0.4s},[x4]		// v0 = initial counter block

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16			// x12 = input stride for the tail path
	cmp	x2,#2
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]		// v7 = last round key
	add	x7,x3,#32
	mov	w6,w5
	csel	x12,xzr,x12,lo		// a single block: don't read past the input

	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	// affected by silicon errata #1742098 [0] and #1655431 [1],
	// respectively, where the second instruction of an aese/aesmc
	// instruction pair may execute twice if an interrupt is taken right
	// after the first instruction consumes an input register of which a
	// single 32-bit lane has been updated the last time it was modified.
	//
	// This function uses a counter in one 32-bit lane. The vmov lines
	// could write to v1.16b and v18.16b directly, but that trips these bugs.
	// We write to v6.16b and copy to the final register as a workaround.
	//
	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev	w8, w8			// counter arithmetic is done host-endian
#endif
	add	w10, w8, #1		// counter for block 2
	orr	v6.16b,v0.16b,v0.16b
	rev	w10, w10
	mov	v6.s[3],w10
	add	w8, w8, #2		// counter for block 3
	orr	v1.16b,v6.16b,v6.16b
	b.ls	.Lctr32_tail		// at most 2 blocks: skip the 3x loop
	rev	w12, w8
	mov	v6.s[3],w12
	sub	x2,x2,#3		// bias
	orr	v18.16b,v6.16b,v6.16b
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	// Inner rounds for the three counter blocks, two keys per iteration.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ctr32

	// Last 5 rounds, interleaved with input loads, counter updates for
	// the next iteration, and the keystream XOR.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b		// results move to v4/v5/v17 so v0/v1/v18
	aese	v1.16b,v16.16b		// can be refilled with new counters below
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	add	w9,w8,#1
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	rev	w9,w9
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b	// fold the last round key into the input
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	 // 32-bit mode. See the comment above.
	eor	v19.16b,v19.16b,v7.16b
	mov	v6.s[3], w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	orr	v0.16b,v6.16b,v6.16b	// next counter block 1
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	mov	v6.s[3], w10
	rev	w12,w8
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	orr	v1.16b,v6.16b,v6.16b	// next counter block 2
	mov	v6.s[3], w12
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	orr	v18.16b,v6.16b,v6.16b	// next counter block 3
	subs	x2,x2,#3
	aese	v4.16b,v23.16b		// final round: no MixColumns
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b	// ciphertext = input ^ keystream
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	.Loop3x_ctr32

	adds	x2,x2,#3		// undo the bias
	b.eq	.Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// one block left: don't read a second

.Lctr32_tail:
	// 1 or 2 blocks remain in v0 (and v1); same two-keys-per-iteration
	// round structure as the main loop.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12	// x12 = 0 when only one block remains
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b	// fold the last round key into the input
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b		// final round: no MixColumns
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b	// ciphertext = input ^ keystream
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	.Lctr32_done		// only one block was requested
	st1	{v3.16b},[x1]

.Lctr32_done:
	ldr	x29,[sp],#16
	ret
.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
799#endif
800#endif
801#endif  // !OPENSSL_NO_ASM
802.section	.note.GNU-stack,"",%progbits
803