#if defined(__aarch64__)
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch	armv8-a+crypto
#endif
// Constant table for the key-expansion code, which reaches it with
// "adr x3,.Lrcon". Three 16-byte rows, each one 32-bit value repeated in
// all four lanes:
//   +0   0x01 in every lane: starting value of v1, the constant XORed into
//        each expansion step; the code doubles it with shl after each step.
//   +16  byte-index mask for tbl: selects source bytes 13,14,15,12 into
//        every lane, i.e. the last 32-bit word rotated by one byte and
//        replicated ("rotate-n-splat").
//   +32  0x1b in every lane: reloaded into v1 by the 128-bit path after its
//        eight-iteration loop, then doubled once more for the final step.
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

//-----------------------------------------------------------------------
// aes_hw_set_encrypt_key: expand a raw AES key into an encryption key
// schedule using the ARMv8 AES instructions.
//
// In:   x0 = pointer to the raw key (16, 24 or 32 bytes are read)
//       w1 = key length in bits; only 128, 192 and 256 pass validation
//       x2 = pointer to the key schedule to fill in
// Out:  x0 = 0 on success
//            -1 if x0 or x2 is NULL
//            -2 if w1 is below 128, above 256 or not a multiple of 64
//       On success the round keys are stored from x2 upward and the round
//       count (10, 12 or 14) is stored as a 32-bit word at offset 240.
// Clobbers: x1, x3, x12, v0-v6, flags (x0 and x2 are advanced as well).
//
// Internal contract: aes_hw_set_decrypt_key enters at .Lenc_key with bl
// and relies on x2 = schedule + 240 and w12 = round count on success.
//
// The epilogue reloads only x29: x30 is never modified in this routine,
// so ret uses the value the caller (or the bl above) placed there.
//-----------------------------------------------------------------------
.globl	aes_hw_set_encrypt_key
.hidden	aes_hw_set_encrypt_key
.type	aes_hw_set_encrypt_key,%function
.align	5
aes_hw_set_encrypt_key:
.Lenc_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// provisional result: NULL argument
	cmp	x0,#0
	b.eq	.Lenc_key_abort
	cmp	x2,#0
	b.eq	.Lenc_key_abort
	mov	x3,#-2			// provisional result: bad key length
	cmp	w1,#128
	b.lt	.Lenc_key_abort
	cmp	w1,#256
	b.gt	.Lenc_key_abort
	tst	w1,#0x3f		// reject lengths that are not a multiple of 64
	b.ne	.Lenc_key_abort

	adr	x3,.Lrcon
	cmp	w1,#192			// flags survive until the three branches below

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0: zero round key for aese, zero fill for ext
	ld1	{v3.16b},[x0],#16	// v3 = first 16 key bytes
	mov	w1,#8		// reuse w1 as the loop counter
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = round constant row, v2 = tbl mask

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

// 128-bit key: eight loop iterations plus two unrolled steps, eleven round
// keys in total.
.align	4
.Loop128:
	tbl	v6.16b,{v3.16b},v2.16b	// last word of v3, rotated and splatted
	ext	v5.16b,v0.16b,v3.16b,#12	// v5 = v3 shifted up by one 32-bit lane
	st1	{v3.4s},[x2],#16	// emit the current round key
	aese	v6.16b,v0.16b		// zero key: S-box step on the splatted word
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b	// these three eor/ext pairs build the
	ext	v5.16b,v0.16b,v5.16b,#12	// running XOR of the lanes of v3
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b	// fold in the round constant
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// double the round constant for the next step
	eor	v3.16b,v3.16b,v6.16b	// v3 = next round key
	b.ne	.Loop128

	ld1	{v1.4s},[x3]		// third table row: 0x1b in every lane

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// last round key; no post-increment, x2 = schedule + 160
	add	x2,x2,#0x50		// x2 = schedule + 240

	mov	w12,#10
	b	.Ldone

// 192-bit key: v3 holds four key words and the low half of v4 the other
// two. Each of the eight iterations emits 24 bytes of schedule.
.align	4
.L192:
	ld1	{v4.8b},[x0],#8		// remaining 8 key bytes
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask to index the low half of v4

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12
	add	x2,x2,#0x20		// x2 was schedule + 208; now schedule + 240
	b	.Ldone

// 256-bit key: v3 and v4 hold the two key halves. Seven iterations; the
// last one leaves through b.eq after storing v3, with x2 = schedule + 240.
.align	4
.L256:
	ld1	{v4.16b},[x0]		// second 16 key bytes
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1		// flags are consumed by b.eq .Ldone below

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone

	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b		// no rotation and no round constant on this half-step

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]		// round count at schedule + 240
	mov	x3,#0

.Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16
	ret
.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key

//-----------------------------------------------------------------------
// aes_hw_set_decrypt_key: build a decryption key schedule.
//
// Runs the encryption key expansion through .Lenc_key, then converts the
// schedule in place: the order of the round keys is reversed and aesimc
// is applied to every round key except the first and the last.
//
// In:   x0 = pointer to the raw key
//       w1 = key length in bits
//       x2 = pointer to the key schedule to fill in
// Out:  x0 = 0 on success, otherwise the non-zero code from .Lenc_key
// Clobbers: x2, x4, v0, v1, flags, plus everything .Lenc_key clobbers.
//
// Relies on .Lenc_key returning with x2 = schedule + 240 and
// w12 = round count.
//-----------------------------------------------------------------------
.globl	aes_hw_set_decrypt_key
.hidden	aes_hw_set_decrypt_key
.type	aes_hw_set_decrypt_key,%function
.align	5
aes_hw_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort		// propagate the error code unchanged

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// post-index step for the descending pointer
	add	x0,x2,x12,lsl#4	// end of key schedule

	// Swap the first and last round keys as they are (no aesimc).
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

	// Walk inward from both ends, swapping pairs and applying aesimc.
.Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	// The round-key count is odd, so the pointers meet on the middle key,
	// which is transformed in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
	ldp	x29,x30,[sp],#16
	ret
.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
//-----------------------------------------------------------------------
// aes_hw_encrypt: encrypt one 16-byte block.
//
// In:   x0 = pointer to the 16-byte input block
//       x1 = pointer to the 16-byte output block
//       x2 = pointer to the key schedule (round count at offset 240)
// Out:  16 bytes stored at x1; no value is returned in x0.
// Clobbers: x2, w3, v0-v2, flags. Leaf routine, no stack use.
//
// Round keys are read alternately into v0 and v1 one step ahead of use,
// and the loop performs two rounds per iteration.
//-----------------------------------------------------------------------
.globl	aes_hw_encrypt
.hidden	aes_hw_encrypt
.type	aes_hw_encrypt,%function
.align	5
aes_hw_encrypt:
	ldr	w3,[x2,#240]		// w3 = round count
	ld1	{v0.4s},[x2],#16	// first round key
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the last two rounds are done after the loop
	ld1	{v1.4s},[x2],#16	// second round key

.Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// last round key
	aese	v2.16b,v1.16b		// final round: no aesmc
	eor	v2.16b,v2.16b,v0.16b	// add the last round key

	st1	{v2.16b},[x1]
	ret
.size	aes_hw_encrypt,.-aes_hw_encrypt
//-----------------------------------------------------------------------
// aes_hw_decrypt: decrypt one 16-byte block.
//
// In:   x0 = pointer to the 16-byte input block
//       x1 = pointer to the 16-byte output block
//       x2 = pointer to the key schedule (round count at offset 240);
//            the round keys are consumed in stored order, so the schedule
//            must be in the form aes_hw_set_decrypt_key leaves behind
// Out:  16 bytes stored at x1; no value is returned in x0.
// Clobbers: x2, w3, v0-v2, flags. Leaf routine, no stack use.
//
// Same structure as the single-block encrypt routine, with aesd/aesimc in
// place of aese/aesmc: two rounds per loop iteration, round keys loaded
// alternately into v0 and v1 one step ahead of use.
//-----------------------------------------------------------------------
.globl	aes_hw_decrypt
.hidden	aes_hw_decrypt
.type	aes_hw_decrypt,%function
.align	5
aes_hw_decrypt:
	ldr	w3,[x2,#240]		// w3 = round count
	ld1	{v0.4s},[x2],#16	// first round key
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the last two rounds are done after the loop
	ld1	{v1.4s},[x2],#16	// second round key

.Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// last round key
	aesd	v2.16b,v1.16b		// final round: no aesimc
	eor	v2.16b,v2.16b,v0.16b	// add the last round key

	st1	{v2.16b},[x1]
	ret
.size	aes_hw_decrypt,.-aes_hw_decrypt
//-----------------------------------------------------------------------
// aes_hw_cbc_encrypt: CBC-mode encryption or decryption of whole blocks.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = length in bytes; rounded down to a multiple of 16, and the
//            routine returns without doing anything if it is below 16
//       x3 = pointer to the key schedule (round count at offset 240)
//       x4 = pointer to the 16-byte IV; read on entry and overwritten on
//            exit with the last ciphertext block processed
//       w5 = non-zero to encrypt, zero to decrypt
// Out:  processed blocks at x1, updated IV at x4; no return value.
// Clobbers: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23, flags.
//
// Register roles after the common setup:
//   x8        input stride: 16, or 0 once no further block follows, so
//             the look-ahead load re-reads the current block instead of
//             running past the end of the input
//   v6        IV / previous ciphertext block
//   v16,v17   round keys 0 and 1 (reloaded as the loops cycle through)
//   v18-v23   the six round keys before the last one (the decrypt path
//             reuses v18 and v19 for data)
//   v7        last round key
//   w5        round count minus 8 (2, 4 or 6)
//
// The epilogue reloads only x29; the routine makes no calls, so x30 still
// holds the return address.
//-----------------------------------------------------------------------
.globl	aes_hw_cbc_encrypt
.hidden	aes_hw_cbc_encrypt
.type	aes_hw_cbc_encrypt,%function
.align	5
aes_hw_cbc_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lcbc_abort		// fewer than 16 bytes: nothing to do
	csel	x8,xzr,x8,eq		// exactly one block: do not advance x0

	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]		// w5 = round count (flags from cmp stay live)
	and	x2,x2,#-16		// drop any partial trailing block
	ld1	{v6.16b},[x4]		// v6 = IV
	ld1	{v0.16b},[x0],x8	// v0 = first input block

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32		// x7 = address of round key 2
	mov	w6,w5
	b.eq	.Lcbc_dec		// eq still refers to cmp w5,0 above

	// ---- encryption ----
	cmp	w5,#2			// ten rounds, i.e. a 128-bit key?
	eor	v0.16b,v0.16b,v6.16b	// first block XOR IV
	eor	v5.16b,v16.16b,v7.16b	// v5 = round key 0 XOR last round key
	b.eq	.Lcbc_enc128

	// 192/256-bit keys: keep round keys 2 and 3 in v2/v3 and set up
	// pointers to round keys 1, 4, 5, 6 and 7 for reloading in the loop.
	ld1	{v2.4s,v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	.Lenter_cbc_enc

	// On loop entry v0 holds the previous block state before its last
	// round key was added, and v16 holds the next input block XOR v5.
	// The first aese therefore adds last round key, next plaintext and
	// round key 0 in a single step.
.align	4
.Loop_cbc_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// store the previous ciphertext block
.Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]		// round key 4
	cmp	w5,#4			// twelve rounds, i.e. a 192-bit key?
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]		// round key 5
	b.eq	.Lcbc_enc192

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]		// round key 6
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]		// round key 7
	nop

.Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16		// flags are consumed by csel and b.hs below
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block coming up: stop advancing x0
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// look-ahead load of the next input block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = ciphertext block
	b.hs	.Loop_cbc_enc

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done

	// 128-bit keys: all round keys stay in registers (v16, v17, v2, v3,
	// v18-v23, v7); only v16 is recycled for the next input block.
.align	5
.Lcbc_enc128:
	ld1	{v2.4s,v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// store the previous ciphertext block
.Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16		// flags are consumed by csel and b.hs below
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// look-ahead load of the next input block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = ciphertext block
	b.hs	.Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done

	// ---- decryption ----
	// Three blocks are processed per iteration in v0, v1 and v18. Copies
	// of the ciphertext are kept in v2, v3 and v19 for the chaining XOR
	// and for the next IV. One or two leftover blocks go through
	// .Lcbc_dec_tail in v1 and v18.
.align	5
.Lcbc_dec:
	ld1	{v18.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v19.16b,v18.16b,v18.16b
	b.lo	.Lcbc_dec_tail		// fewer than three blocks in total

	orr	v1.16b,v18.16b,v18.16b
	ld1	{v18.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v19.16b,v18.16b,v18.16b

.Loop3x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_cbc_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	eor	v4.16b,v6.16b,v7.16b	// chaining value XOR last round key, block 0
	subs	x2,x2,#0x30
	eor	v5.16b,v2.16b,v7.16b	// same for block 1
	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b	// same for block 2
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v18.16b
					// are loaded with last "words"
	orr	v6.16b,v19.16b,v19.16b	// last ciphertext block becomes the next IV
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v18.16b,v18.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v18.16b},[x1],#16
	orr	v18.16b,v19.16b,v19.16b
	b.hs	.Loop3x_cbc_dec		// flags from subs x2,x2,0x30 above

	cmn	x2,#0x30
	b.eq	.Lcbc_done		// length was a multiple of three blocks
	nop

.Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	cmn	x2,#0x20		// eq means exactly one block is left
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	b.eq	.Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	eor	v5.16b,v5.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]		// write the updated IV back
.Lcbc_abort:
	ldr	x29,[sp],#16
	ret
.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
//-----------------------------------------------------------------------
// aes_hw_ctr32_encrypt_blocks: CTR-mode keystream XOR over whole blocks,
// incrementing only the last 32-bit word of the counter block.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = number of 16-byte blocks; zero returns immediately
//       x3 = pointer to the key schedule (round count at offset 240)
//       x4 = pointer to the 16-byte counter block; it is only read, the
//            advanced counter is not written back
// Out:  x2 blocks stored at x1; no return value.
// Clobbers: x0-x2, x5-x10, x12, v0-v7, v16-v23, flags.
//
// Register roles after setup:
//   w8        counter word in host byte order
//   v6        template copy of the counter block
//   v0,v1,v18 the counter blocks being encrypted
//   v16,v17   round keys 0 and 1 (reloaded as the loops cycle through)
//   v20-v23   the four round keys before the last one
//   v7        last round key
//   w5        round count minus 6; w6 is the working copy
//   x12       input stride for the tail (0 or 16); w12 also serves as
//             scratch for rev on the three-block path
//
// The epilogue reloads only x29; the routine makes no calls, so x30 still
// holds the return address.
//-----------------------------------------------------------------------
.globl	aes_hw_ctr32_encrypt_blocks
.hidden	aes_hw_ctr32_encrypt_blocks
.type	aes_hw_ctr32_encrypt_blocks,%function
.align	5
aes_hw_ctr32_encrypt_blocks:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	// A zero block count would otherwise fall into the tail path, which
	// always loads from x0 and stores to x1. cbz leaves the flags alone.
	cbz	x2,.Lctr32_done
	ldr	w5,[x3,#240]

	ldr	w8, [x4, #12]		// last 32-bit word of the counter block
	ld1	{v0.4s},[x4]

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16
	cmp	x2,#2			// flags are consumed by csel and b.ls below
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32		// x7 = address of round key 2
	mov	w6,w5
	csel	x12,xzr,x12,lo		// single block: tail must not advance x0
#ifndef __ARMEB__
	rev	w8, w8
#endif
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b
	rev	w10, w10
	mov	v1.s[3],w10		// v1 = counter + 1
	b.ls	.Lctr32_tail		// one or two blocks in total
	rev	w12, w8
	sub	x2,x2,#3		// bias
	mov	v18.s[3],w12		// v18 = counter + 2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ctr32

	// Remaining rounds. The three states move to v4, v5 and v17 so that
	// v0, v1 and v18 can be refilled from the template v6 with the next
	// three counter values while the rounds complete.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b	// fold the last round key into the input early
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3		// flags are consumed by b.hs below
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	.Loop3x_ctr32

	adds	x2,x2,#3		// undo the bias: x2 = blocks left (0, 1 or 2)
	b.eq	.Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// one block left: tail must not advance x0

	// Tail: always encrypts two counter blocks (v0, v1) and stores the
	// second result only when two blocks remain.
.Lctr32_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12	// stride 0 makes the next load re-read this block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	.Lctr32_done		// only one block was requested
	st1	{v3.16b},[x1]

.Lctr32_done:
	ldr	x29,[sp],#16
	ret
.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
#endif
#endif
