1#if defined(__arm__)
2#include <openssl/arm_arch.h>
3
4#if __ARM_MAX_ARCH__>=7
5.text
6.arch	armv7-a
7.fpu	neon
8.code	32
9.align	5
@ .Lrcon - constant table read by the key expansion in
@ aes_v8_set_encrypt_key.  Three 128-bit rows, each one 32-bit value
@ repeated four times:
@   row 0: 0x01, the first round constant; the expansion loops double it
@          once per iteration with vshl.u8
@   row 1: byte indices for vtbl; in the little-endian view they pick
@          bytes 13,14,15,12, i.e. the last 32-bit word rotated by one
@          byte and copied into every lane
@   row 2: 0x1b, loaded after the eight-iteration 128-bit loop to supply
@          the last two round constants (0x1b, then 0x36 after doubling)
10.Lrcon:
11.long	0x01,0x01,0x01,0x01
12.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
13.long	0x1b,0x1b,0x1b,0x1b
14
@ aes_v8_set_encrypt_key - expand a 128-, 192- or 256-bit AES key into an
@ encryption key schedule.
@
@ In:   r0 = pointer to the user key bytes
@       r1 = key length in bits
@       r2 = pointer to the key schedule buffer to fill
@ Out:  r0 =  0 on success
@            -1 if r0 or r2 is zero
@            -2 if r1 is below 128, above 256 or not a multiple of 64
@ On success the round keys are written from r2 upward (16 bytes each) and
@ the round count (10, 12 or 14) is stored at byte offset 240 of the buffer.
@ Exit state relied on by aes_v8_set_decrypt_key, which enters through
@ .Lenc_key: r2 points at offset 240 and r12 holds the round count.
@ Writes r1, r3, r12, the flags, q0-q3 and q8-q10; the stack is not used.
@ NOTE(review): presumably declared in C as
@   int aes_v8_set_encrypt_key(const unsigned char *key, int bits, AES_KEY *out)
@ - confirm against the declaring header.
@
@ The AES instructions are emitted as raw .byte words with the mnemonic in
@ the trailing comment (presumably so that assemblers without the crypto
@ extension can still build the file - TODO confirm).
15.globl	aes_v8_set_encrypt_key
16.type	aes_v8_set_encrypt_key,%function
17.align	5
18aes_v8_set_encrypt_key:
19.Lenc_key:
@ Argument checks.  r3 carries the pending return code.
20	mov	r3,#-1
21	cmp	r0,#0
22	beq	.Lenc_key_abort
23	cmp	r2,#0
24	beq	.Lenc_key_abort
25	mov	r3,#-2
26	cmp	r1,#128
27	blt	.Lenc_key_abort
28	cmp	r1,#256
29	bgt	.Lenc_key_abort
30	tst	r1,#0x3f
31	bne	.Lenc_key_abort
32
@ The flags set by the compare against 192 stay live until the three-way
@ branch below; none of the instructions in between alters them.
33	adr	r3,.Lrcon
34	cmp	r1,#192
35
@ q0 = 0 (zero source for vext and a zero round key for aese)
@ q3 = first 16 key bytes, q1 = round constant, q2 = vtbl rotate mask
36	veor	q0,q0,q0
37	vld1.8	{q3},[r0]!
38	mov	r1,#8		@ reuse r1 as the loop counter
39	vld1.32	{q1,q2},[r3]!
40
41	blt	.Loop128
42	beq	.L192
43	b	.L256
44
@ 128-bit key: eight loop iterations, then two unrolled rounds that use
@ the 0x1b row of .Lrcon.  Per round:
@   q10 = last word of q3, rotated and copied to all lanes (vtbl)
@   aese with the zero key in q0 then applies the S-box to q10; since all
@   four columns of q10 are equal the row shift leaves the value unchanged
@   q9  = q3 shifted up by one 32-bit word with zero fill (vext #12)
@   q3 ^= q3<<32 ^ q3<<64 ^ q3<<96, then ^= (q10 ^ round constant)
45.align	4
46.Loop128:
47	vtbl.8	d20,{q3},d4
48	vtbl.8	d21,{q3},d5
49	vext.8	q9,q0,q3,#12
50	vst1.32	{q3},[r2]!
51.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
52	subs	r1,r1,#1
53
54	veor	q3,q3,q9
55	vext.8	q9,q0,q9,#12
56	veor	q3,q3,q9
57	vext.8	q9,q0,q9,#12
58	veor	q10,q10,q1
59	veor	q3,q3,q9
60	vshl.u8	q1,q1,#1
61	veor	q3,q3,q10
62	bne	.Loop128
63
@ r3 was advanced past the first two rows, so this loads the 0x1b row.
64	vld1.32	{q1},[r3]
65
66	vtbl.8	d20,{q3},d4
67	vtbl.8	d21,{q3},d5
68	vext.8	q9,q0,q3,#12
69	vst1.32	{q3},[r2]!
70.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
71
72	veor	q3,q3,q9
73	vext.8	q9,q0,q9,#12
74	veor	q3,q3,q9
75	vext.8	q9,q0,q9,#12
76	veor	q10,q10,q1
77	veor	q3,q3,q9
78	vshl.u8	q1,q1,#1
79	veor	q3,q3,q10
80
81	vtbl.8	d20,{q3},d4
82	vtbl.8	d21,{q3},d5
83	vext.8	q9,q0,q3,#12
84	vst1.32	{q3},[r2]!
85.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
86
87	veor	q3,q3,q9
88	vext.8	q9,q0,q9,#12
89	veor	q3,q3,q9
90	vext.8	q9,q0,q9,#12
91	veor	q10,q10,q1
92	veor	q3,q3,q9
93	veor	q3,q3,q10
@ Eleventh round key stored without writeback: r2 is at offset 160 here,
@ so adding 0x50 brings it to offset 240 where .Ldone stores the rounds.
94	vst1.32	{q3},[r2]
95	add	r2,r2,#0x50
96
97	mov	r12,#10
98	b	.Ldone
99
@ 192-bit key: d16 receives key bytes 16..23.  Subtracting 8 from every
@ mask byte makes vtbl pick the rotated last key word out of d16.
@ Each of the eight iterations stores 8 bytes (d16) and 16 bytes (q3).
100.align	4
101.L192:
102	vld1.8	{d16},[r0]!
103	vmov.i8	q10,#8			@ borrow q10
104	vst1.32	{q3},[r2]!
105	vsub.i8	q2,q2,q10	@ adjust the mask
106
107.Loop192:
108	vtbl.8	d20,{q8},d4
109	vtbl.8	d21,{q8},d5
110	vext.8	q9,q0,q3,#12
111	vst1.32	{d16},[r2]!
112.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
113	subs	r1,r1,#1
114
115	veor	q3,q3,q9
116	vext.8	q9,q0,q9,#12
117	veor	q3,q3,q9
118	vext.8	q9,q0,q9,#12
119	veor	q3,q3,q9
120
121	vdup.32	q9,d7[1]
122	veor	q9,q9,q8
123	veor	q10,q10,q1
124	vext.8	q8,q0,q8,#12
125	vshl.u8	q1,q1,#1
126	veor	q8,q8,q9
127	veor	q3,q3,q10
128	veor	q8,q8,q10
129	vst1.32	{q3},[r2]!
130	bne	.Loop192
131
@ r2 is at offset 208 after the loop; 0x20 more gives offset 240.
132	mov	r12,#12
133	add	r2,r2,#0x20
134	b	.Ldone
135
@ 256-bit key: q8 receives key bytes 16..31.  Seven iterations; each one
@ stores q8 and then the freshly derived q3.  The last iteration leaves
@ through the beq after that store, with r2 at offset 240.
136.align	4
137.L256:
138	vld1.8	{q8},[r0]
139	mov	r1,#7
140	mov	r12,#14
141	vst1.32	{q3},[r2]!
142
143.Loop256:
144	vtbl.8	d20,{q8},d4
145	vtbl.8	d21,{q8},d5
146	vext.8	q9,q0,q3,#12
147	vst1.32	{q8},[r2]!
148.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
149	subs	r1,r1,#1
150
151	veor	q3,q3,q9
152	vext.8	q9,q0,q9,#12
153	veor	q3,q3,q9
154	vext.8	q9,q0,q9,#12
155	veor	q10,q10,q1
156	veor	q3,q3,q9
157	vshl.u8	q1,q1,#1
158	veor	q3,q3,q10
159	vst1.32	{q3},[r2]!
160	beq	.Ldone
161
@ Second half of the 256-bit step: S-box of the last word of the new q3
@ (copied to all lanes, no rotation, no round constant) folded into q8.
162	vdup.32	q10,d7[1]
163	vext.8	q9,q0,q8,#12
164.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
165
166	veor	q8,q8,q9
167	vext.8	q9,q0,q9,#12
168	veor	q8,q8,q9
169	vext.8	q9,q0,q9,#12
170	veor	q8,q8,q9
171
172	veor	q8,q8,q10
173	b	.Loop256
174
@ Common success exit: r2 is at offset 240 on every path.
175.Ldone:
176	str	r12,[r2]
177	mov	r3,#0
178
179.Lenc_key_abort:
180	mov	r0,r3			@ return value
181
182	bx	lr
183.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
184
@ aes_v8_set_decrypt_key - build a decryption key schedule.
@
@ In:   r0, r1, r2 as for aes_v8_set_encrypt_key (they are handed to its
@       local entry .Lenc_key unchanged)
@ Out:  r0 = 0 on success, otherwise the non-zero code from .Lenc_key
@
@ After the encryption schedule has been produced, the round keys are
@ reversed in place: the first and last keys are swapped as they are, every
@ other pair is swapped with aesimc applied to both, and the middle key is
@ transformed where it sits.
@ Saves and restores r4 and lr on the stack; additionally writes q0 and q1
@ on top of whatever .Lenc_key writes.
185.globl	aes_v8_set_decrypt_key
186.type	aes_v8_set_decrypt_key,%function
187.align	5
188aes_v8_set_decrypt_key:
189	stmdb	sp!,{r4,lr}
190	bl	.Lenc_key
191
192	cmp	r0,#0
193	bne	.Ldec_key_abort
194
@ .Lenc_key returns with r2 at offset 240 and r12 = number of rounds.
195	sub	r2,r2,#240		@ restore original r2
196	mov	r4,#-16		@ step for the pointer that walks backwards
197	add	r0,r2,r12,lsl#4	@ end of key schedule
198
@ Outermost pair: plain swap, no aesimc.
199	vld1.32	{q0},[r2]
200	vld1.32	{q1},[r0]
201	vst1.32	{q0},[r0],r4
202	vst1.32	{q1},[r2]!
203
@ Inner pairs: r2 walks up, r0 walks down, until the two pointers meet.
204.Loop_imc:
205	vld1.32	{q0},[r2]
206	vld1.32	{q1},[r0]
207.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
208.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
209	vst1.32	{q0},[r0],r4
210	vst1.32	{q1},[r2]!
211	cmp	r0,r2
212	bhi	.Loop_imc
213
@ Middle key: the round count is even, so the key count is odd and the
@ loop ends with r0 equal to r2.
214	vld1.32	{q0},[r2]
215.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
216	vst1.32	{q0},[r0]
217
218	eor	r0,r0,r0		@ return value
219.Ldec_key_abort:
220	ldmia	sp!,{r4,pc}
221.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
@ aes_v8_encrypt - encrypt one 16-byte block.
@
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule; the round count is read from
@            byte offset 240
@ Writes r2, r3, the flags and q0-q2; the stack is not used and no result
@ is set up in r0.
@ NOTE(review): expects a schedule laid out as aes_v8_set_encrypt_key
@ writes it - confirm with callers.
222.globl	aes_v8_encrypt
223.type	aes_v8_encrypt,%function
224.align	5
225aes_v8_encrypt:
226	ldr	r3,[r2,#240]
227	vld1.32	{q0},[r2]!
228	vld1.8	{q2},[r0]
229	sub	r3,r3,#2
230	vld1.32	{q1},[r2]!
231
@ Two rounds per iteration.  q0 and q1 alternate as the current round key
@ and each is reloaded with the key two positions ahead right after use.
232.Loop_enc:
233.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
234.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
235	vld1.32	{q0},[r2]!
236	subs	r3,r3,#2
237.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
238.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
239	vld1.32	{q1},[r2]!
240	bgt	.Loop_enc
241
@ Last two rounds: the final aese is not followed by aesmc, and the last
@ round key is applied with a plain xor.
242.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
243.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
244	vld1.32	{q0},[r2]
245.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
246	veor	q2,q2,q0
247
248	vst1.8	{q2},[r1]
249	bx	lr
250.size	aes_v8_encrypt,.-aes_v8_encrypt
@ aes_v8_decrypt - decrypt one 16-byte block.
@
@ In:   r0 = pointer to the input block
@       r1 = pointer to the output block
@       r2 = pointer to the key schedule; the round count is read from
@            byte offset 240
@ Writes r2, r3, the flags and q0-q2; the stack is not used and no result
@ is set up in r0.
@ NOTE(review): expects a schedule laid out as aes_v8_set_decrypt_key
@ writes it - confirm with callers.
251.globl	aes_v8_decrypt
252.type	aes_v8_decrypt,%function
253.align	5
254aes_v8_decrypt:
255	ldr	r3,[r2,#240]
256	vld1.32	{q0},[r2]!
257	vld1.8	{q2},[r0]
258	sub	r3,r3,#2
259	vld1.32	{q1},[r2]!
260
@ Two rounds per iteration.  q0 and q1 alternate as the current round key
@ and each is reloaded with the key two positions ahead right after use.
261.Loop_dec:
262.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
263.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
264	vld1.32	{q0},[r2]!
265	subs	r3,r3,#2
266.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
267.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
268	vld1.32	{q1},[r2]!
269	bgt	.Loop_dec
270
@ Last two rounds: the final aesd is not followed by aesimc, and the last
@ round key is applied with a plain xor.
271.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
272.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
273	vld1.32	{q0},[r2]
274.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
275	veor	q2,q2,q0
276
277	vst1.8	{q2},[r1]
278	bx	lr
279.size	aes_v8_decrypt,.-aes_v8_decrypt
@ aes_v8_cbc_encrypt - AES-CBC encryption or decryption of whole blocks.
@
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = length in bytes; below 16 the function returns without
@            reading or writing data, and the length is rounded down to a
@            multiple of 16 so a trailing partial block is ignored
@       r3 = key schedule pointer (round count read from byte offset 240)
@       first stack word  = pointer to the 16-byte IV (kept in r4)
@       second stack word = direction: zero decrypts, non-zero encrypts
@ Out:  the IV buffer is overwritten with the chaining value for a
@       following call (the last ciphertext block).
@ Saves and restores r4-r8, lr and d8-d15; also writes r0-r3, r12, the
@ flags, q0-q3 and q8-q15.  lr (r14) is used as a scratch pointer in the
@ 192/256-bit encryption path.
@
@ Register roles after the common setup:
@   q8,q9   = round keys 0 and 1
@   q10-q15 = the six round keys preceding the last one
@   q7      = last round key
@   q6      = IV / running chaining value
@   r5      = rounds - 8 (2, 4 or 6 for 128-, 192-, 256-bit keys)
@   r8      = post-increment applied to r0 by the look-ahead input load:
@             16, or 0 once the final block has been fetched so the load
@             re-reads that block rather than running past the input
280.globl	aes_v8_cbc_encrypt
281.type	aes_v8_cbc_encrypt,%function
282.align	5
283aes_v8_cbc_encrypt:
284	mov	ip,sp
285	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
286	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
287	ldmia	ip,{r4,r5}		@ load remaining args
288	subs	r2,r2,#16
289	mov	r8,#16
290	blo	.Lcbc_abort
291	moveq	r8,#0
292
@ The flags from this compare are consumed by the beq to .Lcbc_dec further
@ down; nothing in between alters them.
293	cmp	r5,#0			@ en- or decrypting?
294	ldr	r5,[r3,#240]
295	and	r2,r2,#-16
296	vld1.8	{q6},[r4]
297	vld1.8	{q0},[r0],r8
298
299	vld1.32	{q8,q9},[r3]		@ load key schedule...
300	sub	r5,r5,#6
301	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
302	sub	r5,r5,#2
303	vld1.32	{q10,q11},[r7]!
304	vld1.32	{q12,q13},[r7]!
305	vld1.32	{q14,q15},[r7]!
306	vld1.32	{q7},[r7]
307
308	add	r7,r3,#32
309	mov	r6,r5
310	beq	.Lcbc_dec
311
@ ---- encryption ----
@ q5 = round key 0 xor last round key.  Each loop pass leaves q0 without
@ the final key xor (the ciphertext goes to q6 = q0 ^ q7), and the next
@ plaintext is xored with q5 into q8, so the following aese q0,q8 applies
@ both the chaining xor and round key 0 in one step.
312	cmp	r5,#2
313	veor	q0,q0,q6
314	veor	q5,q8,q7
315	beq	.Lcbc_enc128
316
@ 192/256-bit keys: q2,q3 = round keys 2 and 3; r7, r6, r12, r14 and r3
@ become pointers to round keys 1, 4, 5, 6 and 7, which are reloaded into
@ q8/q9 on every pass.
317	vld1.32	{q2,q3},[r7]
318	add	r7,r3,#16
319	add	r6,r3,#16*4
320	add	r12,r3,#16*5
321.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
322.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
323	add	r14,r3,#16*6
324	add	r3,r3,#16*7
325	b	.Lenter_cbc_enc
326
327.align	4
328.Loop_cbc_enc:
329.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
330.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
331	vst1.8	{q6},[r1]!
332.Lenter_cbc_enc:
333.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
334.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
335.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
336.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
337	vld1.32	{q8},[r6]
338	cmp	r5,#4
339.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
340.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
341	vld1.32	{q9},[r12]
342	beq	.Lcbc_enc192
343
@ Two extra rounds for 256-bit keys only.
344.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
345.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
346	vld1.32	{q8},[r14]
347.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
348.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
349	vld1.32	{q9},[r3]
350	nop
351
@ The subs below sets the flags tested by moveq and by the bhs that
@ closes the loop.
352.Lcbc_enc192:
353.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
354.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
355	subs	r2,r2,#16
356.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
357.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
358	moveq	r8,#0
359.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
360.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
361.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
362.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
363	vld1.8	{q8},[r0],r8
364.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
365.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
366	veor	q8,q8,q5
367.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
368.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
369	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
370.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
371.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
372.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
373	veor	q6,q0,q7
374	bhs	.Loop_cbc_enc
375
376	vst1.8	{q6},[r1]!
377	b	.Lcbc_done
378
@ 128-bit keys: all round keys stay resident (q8/q9, q2/q3, q10-q15, q7),
@ so nothing has to be reloaded from the schedule inside the loop.
379.align	5
380.Lcbc_enc128:
381	vld1.32	{q2,q3},[r7]
382.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
383.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
384	b	.Lenter_cbc_enc128
385.Loop_cbc_enc128:
386.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
387.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
388	vst1.8	{q6},[r1]!
389.Lenter_cbc_enc128:
390.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
391.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
392	subs	r2,r2,#16
393.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
394.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
395	moveq	r8,#0
396.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
397.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
398.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
399.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
400.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
401.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
402	vld1.8	{q8},[r0],r8
403.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
404.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
405.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
406.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
407.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
408.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
409	veor	q8,q8,q5
410.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
411	veor	q6,q0,q7
412	bhs	.Loop_cbc_enc128
413
414	vst1.8	{q6},[r1]!
415	b	.Lcbc_done
@ ---- decryption ----
@ Three blocks are processed per pass in q0, q1 and q10.  q2, q3 and q11
@ keep copies of the ciphertext blocks, which are needed as chaining
@ values after the cipher state has been overwritten.  r6 counts the
@ rounds taken from memory through r7 (starts at rounds - 6); the last
@ rounds use the resident keys q12-q15 and q7.  r2 is biased by a further
@ 32, so a borrow here means fewer than three blocks are present.
416.align	5
417.Lcbc_dec:
418	vld1.8	{q10},[r0]!
419	subs	r2,r2,#32		@ bias
420	add	r6,r5,#2
421	vorr	q3,q0,q0
422	vorr	q1,q0,q0
423	vorr	q11,q10,q10
424	blo	.Lcbc_dec_tail
425
426	vorr	q1,q10,q10
427	vld1.8	{q10},[r0]!
428	vorr	q2,q0,q0
429	vorr	q3,q1,q1
430	vorr	q11,q10,q10
431
432.Loop3x_cbc_dec:
433.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
434.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
435.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
436.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
437.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
438.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
439	vld1.32	{q8},[r7]!
440	subs	r6,r6,#2
441.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
442.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
443.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
444.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
445.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
446.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
447	vld1.32	{q9},[r7]!
448	bgt	.Loop3x_cbc_dec
449
@ Remaining rounds for the three blocks.  q4, q5 and q9 are prepared as
@ (chaining value xor last round key) so that one veor per block finishes
@ both the last round and the CBC xor.  The subs below sets the flags
@ tested by movlo and by the bhs that closes the loop.
450.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
451.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
452.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
453.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
454.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
455.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
456	veor	q4,q6,q7
457	subs	r2,r2,#0x30
458	veor	q5,q2,q7
459	movlo	r6,r2			@ r6 is zero at this point
460.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
461.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
462.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
463.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
464.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
465.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
466	veor	q9,q3,q7
467	add	r0,r0,r6		@ r0 is adjusted in such way that
468					@ at exit from the loop q1-q10
469					@ are loaded with last "words"
470	vorr	q6,q11,q11
471	mov	r7,r3
472.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
473.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
474.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
475.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
476.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
477.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
478	vld1.8	{q2},[r0]!
479.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
480.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
481.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
482.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
483.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
484.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
485	vld1.8	{q3},[r0]!
486.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
487.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
488.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
489.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
490.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
491.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
492	vld1.8	{q11},[r0]!
493.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
494.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
495.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
496	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
497	add	r6,r5,#2
498	veor	q4,q4,q0
499	veor	q5,q5,q1
500	veor	q10,q10,q9
501	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
502	vst1.8	{q4},[r1]!
503	vorr	q0,q2,q2
504	vst1.8	{q5},[r1]!
505	vorr	q1,q3,q3
506	vst1.8	{q10},[r1]!
507	vorr	q10,q11,q11
508	bhs	.Loop3x_cbc_dec
509
@ r2 equal to -0x30 means the block count was a multiple of three.
510	cmn	r2,#0x30
511	beq	.Lcbc_done
512	nop
513
@ Tail: one or two blocks remain, processed in q1 and q10.
514.Lcbc_dec_tail:
515.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
516.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
517.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
518.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
519	vld1.32	{q8},[r7]!
520	subs	r6,r6,#2
521.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
522.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
523.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
524.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
525	vld1.32	{q9},[r7]!
526	bgt	.Lcbc_dec_tail
527
528.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
529.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
530.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
531.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
532.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
533.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
534.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
535.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
536.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
537.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
538.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
539.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
@ Equal here means exactly one block is left; the result is tested by the
@ beq to .Lcbc_dec_one below, with no flag-setting instruction between.
540	cmn	r2,#0x20
541.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
542.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
543.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
544.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
545	veor	q5,q6,q7
546.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
547.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
548.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
549.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
550	veor	q9,q3,q7
551.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
552.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
553	beq	.Lcbc_dec_one
554	veor	q5,q5,q1
555	veor	q9,q9,q10
556	vorr	q6,q11,q11
557	vst1.8	{q5},[r1]!
558	vst1.8	{q9},[r1]!
559	b	.Lcbc_done
560
561.Lcbc_dec_one:
562	veor	q5,q5,q10
563	vorr	q6,q11,q11
564	vst1.8	{q5},[r1]!
565
@ Write the chaining value back to the IV buffer, then restore and return.
566.Lcbc_done:
567	vst1.8	{q6},[r4]
568.Lcbc_abort:
569	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
570	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
571.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
@ aes_v8_ctr32_encrypt_blocks - AES in counter mode over whole 16-byte
@ blocks.  Only the last 32-bit word of the counter block is incremented.
@
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = number of 16-byte blocks; zero returns at once
@       r3 = key schedule pointer (round count read from byte offset 240)
@       first stack word = pointer to the 16-byte counter block (kept in r4)
@ Out:  r2 blocks written to the output.  The counter block in memory is
@       not written back.
@ Saves and restores r4-r10, lr and d8-d15; also writes r0-r2, r12, the
@ flags, q0-q3 and q8-q15.
@
@ Register roles after setup:
@   q8,q9   = round keys 0 and 1
@   q12-q15 = the four round keys preceding the last one
@   q7      = last round key
@   q6      = counter block as loaded (template for further counter blocks)
@   q0,q1,q10 = counter blocks n, n+1, n+2 (word 3 patched through vmov.32)
@   r8      = counter value in native byte order
@   r5      = rounds - 6, reload value for the round counter r6
@ NOTE(review): only the first rev is conditional on __ARMEB__; the later
@ rev instructions are unconditional - confirm the intended behaviour on
@ big-endian builds.
572.globl	aes_v8_ctr32_encrypt_blocks
573.type	aes_v8_ctr32_encrypt_blocks,%function
574.align	5
575aes_v8_ctr32_encrypt_blocks:
576	mov	ip,sp
577	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
578	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
@ A zero block count would otherwise fall into .Lctr32_tail, which always
@ loads one input block and, unless r2 is exactly 1, stores two blocks.
	cmp	r2,#0
	beq	.Lctr32_done
579	ldr	r4, [ip]		@ load remaining arg
580	ldr	r5,[r3,#240]
581
582	ldr	r8, [r4, #12]
583	vld1.32	{q0},[r4]
584
@ The flags from the compare against 2 are consumed by movlo and by the
@ bls to .Lctr32_tail below; nothing in between alters them.
@ r12 is the step between the two input loads of the tail: 16, or 0 when
@ fewer than two blocks exist so that the second load stays in bounds.
585	vld1.32	{q8,q9},[r3]		@ load key schedule...
586	sub	r5,r5,#4
587	mov	r12,#16
588	cmp	r2,#2
589	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
590	sub	r5,r5,#2
591	vld1.32	{q12,q13},[r7]!
592	vld1.32	{q14,q15},[r7]!
593	vld1.32	{q7},[r7]
594	add	r7,r3,#32
595	mov	r6,r5
596	movlo	r12,#0
597#ifndef __ARMEB__
598	rev	r8, r8
599#endif
600	vorr	q1,q0,q0
601	add	r10, r8, #1
602	vorr	q10,q0,q0
603	add	r8, r8, #2
604	vorr	q6,q0,q0
605	rev	r10, r10
606	vmov.32	d3[1],r10
607	bls	.Lctr32_tail
608	rev	r12, r8
609	sub	r2,r2,#3		@ bias
610	vmov.32	d21[1],r12
611	b	.Loop3x_ctr32
612
@ Three counter blocks per pass.  Rounds whose keys come from memory:
613.align	4
614.Loop3x_ctr32:
615.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
616.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
617.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
618.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
619.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
620.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
621	vld1.32	{q8},[r7]!
622	subs	r6,r6,#2
623.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
624.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
625.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
626.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
627.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
628.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
629	vld1.32	{q9},[r7]!
630	bgt	.Loop3x_ctr32
631
@ Remaining rounds.  The three states move to q4, q5 and q9 so that q0,
@ q1 and q10 can already be refilled from the template q6 and patched with
@ the next three counter values.  The last round key q7 is xored into the
@ input blocks (q2, q3, q11), which are then xored with the cipher states.
632.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
633.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
634.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
635.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
636	vld1.8	{q2},[r0]!
637	vorr	q0,q6,q6
638.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
639.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
640	vld1.8	{q3},[r0]!
641	vorr	q1,q6,q6
642.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
643.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
644.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
645.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
646	vld1.8	{q11},[r0]!
647	mov	r7,r3
648.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
649.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
650	vorr	q10,q6,q6
651	add	r9,r8,#1
652.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
653.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
654.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
655.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
656	veor	q2,q2,q7
657	add	r10,r8,#2
658.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
659.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
660	veor	q3,q3,q7
661	add	r8,r8,#3
662.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
663.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
664.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
665.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
666	veor	q11,q11,q7
667	rev	r9,r9
668.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
669.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
670	vmov.32	d1[1], r9
671	rev	r10,r10
672.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
673.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
674.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
675.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
676	vmov.32	d3[1], r10
677	rev	r12,r8
678.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
679.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
680	vmov.32	d21[1], r12
@ The flags from this subs are tested by the bhs that closes the loop.
681	subs	r2,r2,#3
682.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
683.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
684.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
685
686	veor	q2,q2,q4
687	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
688	vst1.8	{q2},[r1]!
689	veor	q3,q3,q5
690	mov	r6,r5
691	vst1.8	{q3},[r1]!
692	veor	q11,q11,q9
693	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
694	vst1.8	{q11},[r1]!
695	bhs	.Loop3x_ctr32
696
@ Undo the bias: r2 becomes the number of blocks still to do (0, 1 or 2).
697	adds	r2,r2,#3
698	beq	.Lctr32_done
699	cmp	r2,#1
700	mov	r12,#16
701	moveq	r12,#0
702
@ Tail: one or two blocks, using the counter blocks already in q0 and q1.
703.Lctr32_tail:
704.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
705.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
706.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
707.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
708	vld1.32	{q8},[r7]!
709	subs	r6,r6,#2
710.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
711.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
712.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
713.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
714	vld1.32	{q9},[r7]!
715	bgt	.Lctr32_tail
716
717.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
718.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
719.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
720.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
721.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
722.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
723.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
724.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
@ With r12 equal to 0 the second load below re-reads the first block.
725	vld1.8	{q2},[r0],r12
726.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
727.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
728.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
729.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
730	vld1.8	{q3},[r0]
731.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
732.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
733.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
734.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
735	veor	q2,q2,q7
736.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
737.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
738.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
739.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
740	veor	q3,q3,q7
741.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
742.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
743
@ The second block is stored only when r2 is not 1.
744	cmp	r2,#1
745	veor	q2,q2,q0
746	veor	q3,q3,q1
747	vst1.8	{q2},[r1]!
748	beq	.Lctr32_done
749	vst1.8	{q3},[r1]
750
751.Lctr32_done:
752	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
753	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
754.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
755#endif
756#endif