1#if defined(__x86_64__)
2.text
3.extern	OPENSSL_ia32cap_P
4.hidden OPENSSL_ia32cap_P
/*
 * aesni_encrypt: encrypt a single 16-byte block with AES-NI.
 * In:    %rdi = source block, %rsi = destination block,
 *        %rdx = key schedule (round keys start at offset 0,
 *               loop count is read from offset 240).
 * Out:   16 bytes stored at (%rsi).
 * Clobb: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning.
 * The loop issues the loaded count of aesenc rounds, then one aesenclast.
 */
5.globl	aesni_encrypt
6.hidden aesni_encrypt
7.type	aesni_encrypt,@function
8.align	16
9aesni_encrypt:
10	movups	(%rdi),%xmm2	/* xmm2 = input block */
11	movl	240(%rdx),%eax	/* eax = loop count kept at key+240 */
12	movups	(%rdx),%xmm0	/* xmm0 = round key 0 */
13	movups	16(%rdx),%xmm1	/* xmm1 = round key 1 */
14	leaq	32(%rdx),%rdx	/* rdx -> round key 2 */
15	xorps	%xmm0,%xmm2	/* whitening: block ^= round key 0 */
16.Loop_enc1_1:
17.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
18	decl	%eax
19	movups	(%rdx),%xmm1	/* next round key; movups/leaq keep the flags from decl */
20	leaq	16(%rdx),%rdx
21	jnz	.Loop_enc1_1
22.byte	102,15,56,221,209	/* aesenclast %xmm1,%xmm2 */
23	pxor	%xmm0,%xmm0	/* zero the round-key registers */
24	pxor	%xmm1,%xmm1
25	movups	%xmm2,(%rsi)	/* write the result */
26	pxor	%xmm2,%xmm2	/* zero the register copy of the result */
27	.byte	0xf3,0xc3	/* rep ret */
28.size	aesni_encrypt,.-aesni_encrypt
29
/*
 * aesni_decrypt: decrypt a single 16-byte block with AES-NI.
 * In:    %rdi = source block, %rsi = destination block,
 *        %rdx = key schedule (round keys start at offset 0,
 *               loop count is read from offset 240).
 * Out:   16 bytes stored at (%rsi).
 * Clobb: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning.
 * Same shape as aesni_encrypt, using aesdec / aesdeclast.
 */
30.globl	aesni_decrypt
31.hidden aesni_decrypt
32.type	aesni_decrypt,@function
33.align	16
34aesni_decrypt:
35	movups	(%rdi),%xmm2	/* xmm2 = input block */
36	movl	240(%rdx),%eax	/* eax = loop count kept at key+240 */
37	movups	(%rdx),%xmm0	/* xmm0 = round key 0 */
38	movups	16(%rdx),%xmm1	/* xmm1 = round key 1 */
39	leaq	32(%rdx),%rdx	/* rdx -> round key 2 */
40	xorps	%xmm0,%xmm2	/* whitening: block ^= round key 0 */
41.Loop_dec1_2:
42.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
43	decl	%eax
44	movups	(%rdx),%xmm1	/* next round key; movups/leaq keep the flags from decl */
45	leaq	16(%rdx),%rdx
46	jnz	.Loop_dec1_2
47.byte	102,15,56,223,209	/* aesdeclast %xmm1,%xmm2 */
48	pxor	%xmm0,%xmm0	/* zero the round-key registers */
49	pxor	%xmm1,%xmm1
50	movups	%xmm2,(%rsi)	/* write the result */
51	pxor	%xmm2,%xmm2	/* zero the register copy of the result */
52	.byte	0xf3,0xc3	/* rep ret */
53.size	aesni_decrypt, .-aesni_decrypt
/*
 * _aesni_encrypt2: file-local helper, encrypts two blocks in parallel.
 * Register-based private convention (not the C ABI):
 * In:    %rcx = key schedule, %eax = loop count (callers in this file
 *        load it from key+240), %xmm2,%xmm3 = blocks.
 * Out:   %xmm2,%xmm3 encrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Round keys alternate between %xmm1 and %xmm0. %rcx is moved past the
 * keys that the loop consumes and %rax is a negative byte index that
 * climbs to zero in steps of 32, so the loop only terminates when
 * 16 - 16*count is a multiple of 32 (i.e. an odd count).
 */
54.type	_aesni_encrypt2,@function
55.align	16
56_aesni_encrypt2:
57	movups	(%rcx),%xmm0	/* round key 0 */
58	shll	$4,%eax	/* count -> byte offset (also zero-extends into rax) */
59	movups	16(%rcx),%xmm1	/* round key 1 */
60	xorps	%xmm0,%xmm2	/* whitening */
61	xorps	%xmm0,%xmm3
62	movups	32(%rcx),%xmm0	/* round key 2 */
63	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
64	negq	%rax
65	addq	$16,%rax	/* rax = 16 - 16*count */
66
67.Lenc_loop2:
68.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
69.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
70	movups	(%rcx,%rax,1),%xmm1
71	addq	$32,%rax	/* sets ZF for the jnz below; aesenc/movups leave flags alone */
72.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
73.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
74	movups	-16(%rcx,%rax,1),%xmm0
75	jnz	.Lenc_loop2
76
77.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
78.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
79.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
80.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
81	.byte	0xf3,0xc3	/* rep ret */
82.size	_aesni_encrypt2,.-_aesni_encrypt2
/*
 * _aesni_decrypt2: file-local helper, decrypts two blocks in parallel.
 * Register-based private convention (not the C ABI):
 * In:    %rcx = key schedule, %eax = loop count (callers in this file
 *        load it from key+240), %xmm2,%xmm3 = blocks.
 * Out:   %xmm2,%xmm3 decrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Same structure as _aesni_encrypt2 with aesdec / aesdeclast.
 */
83.type	_aesni_decrypt2,@function
84.align	16
85_aesni_decrypt2:
86	movups	(%rcx),%xmm0	/* round key 0 */
87	shll	$4,%eax	/* count -> byte offset (also zero-extends into rax) */
88	movups	16(%rcx),%xmm1	/* round key 1 */
89	xorps	%xmm0,%xmm2	/* whitening */
90	xorps	%xmm0,%xmm3
91	movups	32(%rcx),%xmm0	/* round key 2 */
92	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
93	negq	%rax
94	addq	$16,%rax	/* rax = 16 - 16*count */
95
96.Ldec_loop2:
97.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
98.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
99	movups	(%rcx,%rax,1),%xmm1
100	addq	$32,%rax	/* sets ZF for the jnz below */
101.byte	102,15,56,222,208	/* aesdec %xmm0,%xmm2 */
102.byte	102,15,56,222,216	/* aesdec %xmm0,%xmm3 */
103	movups	-16(%rcx,%rax,1),%xmm0
104	jnz	.Ldec_loop2
105
106.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
107.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
108.byte	102,15,56,223,208	/* aesdeclast %xmm0,%xmm2 */
109.byte	102,15,56,223,216	/* aesdeclast %xmm0,%xmm3 */
110	.byte	0xf3,0xc3	/* rep ret */
111.size	_aesni_decrypt2,.-_aesni_decrypt2
/*
 * _aesni_encrypt3: file-local helper, encrypts three blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm4 = blocks.
 * Out:   %xmm2-%xmm4 encrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Round keys alternate between %xmm1 and %xmm0; %rax is a negative
 * byte index into the key schedule that reaches zero on the last pass.
 */
112.type	_aesni_encrypt3,@function
113.align	16
114_aesni_encrypt3:
115	movups	(%rcx),%xmm0	/* round key 0 */
116	shll	$4,%eax	/* count -> byte offset */
117	movups	16(%rcx),%xmm1	/* round key 1 */
118	xorps	%xmm0,%xmm2	/* whitening of all three blocks */
119	xorps	%xmm0,%xmm3
120	xorps	%xmm0,%xmm4
121	movups	32(%rcx),%xmm0	/* round key 2 */
122	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
123	negq	%rax
124	addq	$16,%rax	/* rax = 16 - 16*count */
125
126.Lenc_loop3:
127.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
128.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
129.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
130	movups	(%rcx,%rax,1),%xmm1
131	addq	$32,%rax	/* sets ZF for the jnz below */
132.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
133.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
134.byte	102,15,56,220,224	/* aesenc %xmm0,%xmm4 */
135	movups	-16(%rcx,%rax,1),%xmm0
136	jnz	.Lenc_loop3
137
138.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
139.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
140.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
141.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
142.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
143.byte	102,15,56,221,224	/* aesenclast %xmm0,%xmm4 */
144	.byte	0xf3,0xc3	/* rep ret */
145.size	_aesni_encrypt3,.-_aesni_encrypt3
/*
 * _aesni_decrypt3: file-local helper, decrypts three blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm4 = blocks.
 * Out:   %xmm2-%xmm4 decrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Same structure as _aesni_encrypt3 with aesdec / aesdeclast.
 */
146.type	_aesni_decrypt3,@function
147.align	16
148_aesni_decrypt3:
149	movups	(%rcx),%xmm0	/* round key 0 */
150	shll	$4,%eax	/* count -> byte offset */
151	movups	16(%rcx),%xmm1	/* round key 1 */
152	xorps	%xmm0,%xmm2	/* whitening of all three blocks */
153	xorps	%xmm0,%xmm3
154	xorps	%xmm0,%xmm4
155	movups	32(%rcx),%xmm0	/* round key 2 */
156	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
157	negq	%rax
158	addq	$16,%rax	/* rax = 16 - 16*count */
159
160.Ldec_loop3:
161.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
162.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
163.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
164	movups	(%rcx,%rax,1),%xmm1
165	addq	$32,%rax	/* sets ZF for the jnz below */
166.byte	102,15,56,222,208	/* aesdec %xmm0,%xmm2 */
167.byte	102,15,56,222,216	/* aesdec %xmm0,%xmm3 */
168.byte	102,15,56,222,224	/* aesdec %xmm0,%xmm4 */
169	movups	-16(%rcx,%rax,1),%xmm0
170	jnz	.Ldec_loop3
171
172.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
173.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
174.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
175.byte	102,15,56,223,208	/* aesdeclast %xmm0,%xmm2 */
176.byte	102,15,56,223,216	/* aesdeclast %xmm0,%xmm3 */
177.byte	102,15,56,223,224	/* aesdeclast %xmm0,%xmm4 */
178	.byte	0xf3,0xc3	/* rep ret */
179.size	_aesni_decrypt3,.-_aesni_decrypt3
/*
 * _aesni_encrypt4: file-local helper, encrypts four blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm5 = blocks.
 * Out:   %xmm2-%xmm5 encrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Round keys alternate between %xmm1 and %xmm0; %rax is a negative
 * byte index into the key schedule that reaches zero on the last pass.
 */
180.type	_aesni_encrypt4,@function
181.align	16
182_aesni_encrypt4:
183	movups	(%rcx),%xmm0	/* round key 0 */
184	shll	$4,%eax	/* count -> byte offset */
185	movups	16(%rcx),%xmm1	/* round key 1 */
186	xorps	%xmm0,%xmm2	/* whitening of all four blocks */
187	xorps	%xmm0,%xmm3
188	xorps	%xmm0,%xmm4
189	xorps	%xmm0,%xmm5
190	movups	32(%rcx),%xmm0	/* round key 2 */
191	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
192	negq	%rax
193.byte	0x0f,0x1f,0x00	/* nopl (%rax): 3-byte padding NOP */
194	addq	$16,%rax	/* rax = 16 - 16*count */
195
196.Lenc_loop4:
197.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
198.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
199.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
200.byte	102,15,56,220,233	/* aesenc %xmm1,%xmm5 */
201	movups	(%rcx,%rax,1),%xmm1
202	addq	$32,%rax	/* sets ZF for the jnz below */
203.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
204.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
205.byte	102,15,56,220,224	/* aesenc %xmm0,%xmm4 */
206.byte	102,15,56,220,232	/* aesenc %xmm0,%xmm5 */
207	movups	-16(%rcx,%rax,1),%xmm0
208	jnz	.Lenc_loop4
209
210.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
211.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
212.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
213.byte	102,15,56,220,233	/* aesenc %xmm1,%xmm5 */
214.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
215.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
216.byte	102,15,56,221,224	/* aesenclast %xmm0,%xmm4 */
217.byte	102,15,56,221,232	/* aesenclast %xmm0,%xmm5 */
218	.byte	0xf3,0xc3	/* rep ret */
219.size	_aesni_encrypt4,.-_aesni_encrypt4
/*
 * _aesni_decrypt4: file-local helper, decrypts four blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm5 = blocks.
 * Out:   %xmm2-%xmm5 decrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Same structure as _aesni_encrypt4 with aesdec / aesdeclast.
 */
220.type	_aesni_decrypt4,@function
221.align	16
222_aesni_decrypt4:
223	movups	(%rcx),%xmm0	/* round key 0 */
224	shll	$4,%eax	/* count -> byte offset */
225	movups	16(%rcx),%xmm1	/* round key 1 */
226	xorps	%xmm0,%xmm2	/* whitening of all four blocks */
227	xorps	%xmm0,%xmm3
228	xorps	%xmm0,%xmm4
229	xorps	%xmm0,%xmm5
230	movups	32(%rcx),%xmm0	/* round key 2 */
231	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
232	negq	%rax
233.byte	0x0f,0x1f,0x00	/* nopl (%rax): 3-byte padding NOP */
234	addq	$16,%rax	/* rax = 16 - 16*count */
235
236.Ldec_loop4:
237.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
238.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
239.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
240.byte	102,15,56,222,233	/* aesdec %xmm1,%xmm5 */
241	movups	(%rcx,%rax,1),%xmm1
242	addq	$32,%rax	/* sets ZF for the jnz below */
243.byte	102,15,56,222,208	/* aesdec %xmm0,%xmm2 */
244.byte	102,15,56,222,216	/* aesdec %xmm0,%xmm3 */
245.byte	102,15,56,222,224	/* aesdec %xmm0,%xmm4 */
246.byte	102,15,56,222,232	/* aesdec %xmm0,%xmm5 */
247	movups	-16(%rcx,%rax,1),%xmm0
248	jnz	.Ldec_loop4
249
250.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
251.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
252.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
253.byte	102,15,56,222,233	/* aesdec %xmm1,%xmm5 */
254.byte	102,15,56,223,208	/* aesdeclast %xmm0,%xmm2 */
255.byte	102,15,56,223,216	/* aesdeclast %xmm0,%xmm3 */
256.byte	102,15,56,223,224	/* aesdeclast %xmm0,%xmm4 */
257.byte	102,15,56,223,232	/* aesdeclast %xmm0,%xmm5 */
258	.byte	0xf3,0xc3	/* rep ret */
259.size	_aesni_decrypt4,.-_aesni_decrypt4
/*
 * _aesni_encrypt6: file-local helper, encrypts six blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm7 = blocks.
 * Out:   %xmm2-%xmm7 encrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * The setup interleaves the whitening of the later blocks with the first
 * aesenc round of %xmm2-%xmm4, then jumps into the middle of the loop
 * (.Lenc_loop6_enter) so that %xmm5-%xmm7 catch up on that first round.
 * .Lenc_loop6 is also used as a call target from
 * aesni_ctr32_encrypt_blocks later in this file.
 */
260.type	_aesni_encrypt6,@function
261.align	16
262_aesni_encrypt6:
263	movups	(%rcx),%xmm0	/* round key 0 */
264	shll	$4,%eax	/* count -> byte offset */
265	movups	16(%rcx),%xmm1	/* round key 1 */
266	xorps	%xmm0,%xmm2	/* whitening of blocks 0..2 */
267	pxor	%xmm0,%xmm3
268	pxor	%xmm0,%xmm4
269.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
270	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
271	negq	%rax	/* rax = -16*count */
272.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
273	pxor	%xmm0,%xmm5	/* whitening of blocks 3..5 */
274	pxor	%xmm0,%xmm6
275.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
276	pxor	%xmm0,%xmm7
277	movups	(%rcx,%rax,1),%xmm0	/* round key 2 (address = key + 32) */
278	addq	$16,%rax	/* rax = 16 - 16*count */
279	jmp	.Lenc_loop6_enter
280.align	16
281.Lenc_loop6:
282.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
283.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
284.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
285.Lenc_loop6_enter:
286.byte	102,15,56,220,233	/* aesenc %xmm1,%xmm5 */
287.byte	102,15,56,220,241	/* aesenc %xmm1,%xmm6 */
288.byte	102,15,56,220,249	/* aesenc %xmm1,%xmm7 */
289	movups	(%rcx,%rax,1),%xmm1
290	addq	$32,%rax	/* sets ZF for the jnz below */
291.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
292.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
293.byte	102,15,56,220,224	/* aesenc %xmm0,%xmm4 */
294.byte	102,15,56,220,232	/* aesenc %xmm0,%xmm5 */
295.byte	102,15,56,220,240	/* aesenc %xmm0,%xmm6 */
296.byte	102,15,56,220,248	/* aesenc %xmm0,%xmm7 */
297	movups	-16(%rcx,%rax,1),%xmm0
298	jnz	.Lenc_loop6
299
300.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
301.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
302.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
303.byte	102,15,56,220,233	/* aesenc %xmm1,%xmm5 */
304.byte	102,15,56,220,241	/* aesenc %xmm1,%xmm6 */
305.byte	102,15,56,220,249	/* aesenc %xmm1,%xmm7 */
306.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
307.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
308.byte	102,15,56,221,224	/* aesenclast %xmm0,%xmm4 */
309.byte	102,15,56,221,232	/* aesenclast %xmm0,%xmm5 */
310.byte	102,15,56,221,240	/* aesenclast %xmm0,%xmm6 */
311.byte	102,15,56,221,248	/* aesenclast %xmm0,%xmm7 */
312	.byte	0xf3,0xc3	/* rep ret */
313.size	_aesni_encrypt6,.-_aesni_encrypt6
/*
 * _aesni_decrypt6: file-local helper, decrypts six blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm7 = blocks.
 * Out:   %xmm2-%xmm7 decrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * The setup interleaves the whitening of the later blocks with the first
 * aesdec round of %xmm2-%xmm4, then jumps into the middle of the loop
 * (.Ldec_loop6_enter) so that %xmm5-%xmm7 catch up on that first round.
 */
314.type	_aesni_decrypt6,@function
315.align	16
316_aesni_decrypt6:
317	movups	(%rcx),%xmm0	/* round key 0 */
318	shll	$4,%eax	/* count -> byte offset */
319	movups	16(%rcx),%xmm1	/* round key 1 */
320	xorps	%xmm0,%xmm2	/* whitening of blocks 0..2 */
321	pxor	%xmm0,%xmm3
322	pxor	%xmm0,%xmm4
323.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
324	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
325	negq	%rax	/* rax = -16*count */
326.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
327	pxor	%xmm0,%xmm5	/* whitening of blocks 3..5 */
328	pxor	%xmm0,%xmm6
329.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
330	pxor	%xmm0,%xmm7
331	movups	(%rcx,%rax,1),%xmm0	/* round key 2 (address = key + 32) */
332	addq	$16,%rax	/* rax = 16 - 16*count */
333	jmp	.Ldec_loop6_enter
334.align	16
335.Ldec_loop6:
336.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
337.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
338.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
339.Ldec_loop6_enter:
340.byte	102,15,56,222,233	/* aesdec %xmm1,%xmm5 */
341.byte	102,15,56,222,241	/* aesdec %xmm1,%xmm6 */
342.byte	102,15,56,222,249	/* aesdec %xmm1,%xmm7 */
343	movups	(%rcx,%rax,1),%xmm1
344	addq	$32,%rax	/* sets ZF for the jnz below */
345.byte	102,15,56,222,208	/* aesdec %xmm0,%xmm2 */
346.byte	102,15,56,222,216	/* aesdec %xmm0,%xmm3 */
347.byte	102,15,56,222,224	/* aesdec %xmm0,%xmm4 */
348.byte	102,15,56,222,232	/* aesdec %xmm0,%xmm5 */
349.byte	102,15,56,222,240	/* aesdec %xmm0,%xmm6 */
350.byte	102,15,56,222,248	/* aesdec %xmm0,%xmm7 */
351	movups	-16(%rcx,%rax,1),%xmm0
352	jnz	.Ldec_loop6
353
354.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
355.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
356.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
357.byte	102,15,56,222,233	/* aesdec %xmm1,%xmm5 */
358.byte	102,15,56,222,241	/* aesdec %xmm1,%xmm6 */
359.byte	102,15,56,222,249	/* aesdec %xmm1,%xmm7 */
360.byte	102,15,56,223,208	/* aesdeclast %xmm0,%xmm2 */
361.byte	102,15,56,223,216	/* aesdeclast %xmm0,%xmm3 */
362.byte	102,15,56,223,224	/* aesdeclast %xmm0,%xmm4 */
363.byte	102,15,56,223,232	/* aesdeclast %xmm0,%xmm5 */
364.byte	102,15,56,223,240	/* aesdeclast %xmm0,%xmm6 */
365.byte	102,15,56,223,248	/* aesdeclast %xmm0,%xmm7 */
366	.byte	0xf3,0xc3	/* rep ret */
367.size	_aesni_decrypt6,.-_aesni_decrypt6
/*
 * _aesni_encrypt8: file-local helper, encrypts eight blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm9 = blocks.
 * Out:   %xmm2-%xmm9 encrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * The setup runs the first aesenc round on %xmm2,%xmm3 while the later
 * blocks are still being whitened, then jumps to .Lenc_loop8_inner so
 * that %xmm4-%xmm9 catch up on that round.
 * The .byte sequences with the extra 68 (REX.R) prefix address
 * %xmm8/%xmm9 as the destination.
 * .Lenc_loop8_enter is not referenced in the visible part of this file.
 */
368.type	_aesni_encrypt8,@function
369.align	16
370_aesni_encrypt8:
371	movups	(%rcx),%xmm0	/* round key 0 */
372	shll	$4,%eax	/* count -> byte offset */
373	movups	16(%rcx),%xmm1	/* round key 1 */
374	xorps	%xmm0,%xmm2	/* whitening of blocks 0..4 */
375	xorps	%xmm0,%xmm3
376	pxor	%xmm0,%xmm4
377	pxor	%xmm0,%xmm5
378	pxor	%xmm0,%xmm6
379	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
380	negq	%rax	/* rax = -16*count */
381.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
382	pxor	%xmm0,%xmm7	/* whitening of blocks 5..7 */
383	pxor	%xmm0,%xmm8
384.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
385	pxor	%xmm0,%xmm9
386	movups	(%rcx,%rax,1),%xmm0	/* round key 2 (address = key + 32) */
387	addq	$16,%rax	/* rax = 16 - 16*count */
388	jmp	.Lenc_loop8_inner
389.align	16
390.Lenc_loop8:
391.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
392.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
393.Lenc_loop8_inner:
394.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
395.byte	102,15,56,220,233	/* aesenc %xmm1,%xmm5 */
396.byte	102,15,56,220,241	/* aesenc %xmm1,%xmm6 */
397.byte	102,15,56,220,249	/* aesenc %xmm1,%xmm7 */
398.byte	102,68,15,56,220,193	/* aesenc %xmm1,%xmm8 */
399.byte	102,68,15,56,220,201	/* aesenc %xmm1,%xmm9 */
400.Lenc_loop8_enter:
401	movups	(%rcx,%rax,1),%xmm1
402	addq	$32,%rax	/* sets ZF for the jnz below */
403.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
404.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
405.byte	102,15,56,220,224	/* aesenc %xmm0,%xmm4 */
406.byte	102,15,56,220,232	/* aesenc %xmm0,%xmm5 */
407.byte	102,15,56,220,240	/* aesenc %xmm0,%xmm6 */
408.byte	102,15,56,220,248	/* aesenc %xmm0,%xmm7 */
409.byte	102,68,15,56,220,192	/* aesenc %xmm0,%xmm8 */
410.byte	102,68,15,56,220,200	/* aesenc %xmm0,%xmm9 */
411	movups	-16(%rcx,%rax,1),%xmm0
412	jnz	.Lenc_loop8
413
414.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
415.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
416.byte	102,15,56,220,225	/* aesenc %xmm1,%xmm4 */
417.byte	102,15,56,220,233	/* aesenc %xmm1,%xmm5 */
418.byte	102,15,56,220,241	/* aesenc %xmm1,%xmm6 */
419.byte	102,15,56,220,249	/* aesenc %xmm1,%xmm7 */
420.byte	102,68,15,56,220,193	/* aesenc %xmm1,%xmm8 */
421.byte	102,68,15,56,220,201	/* aesenc %xmm1,%xmm9 */
422.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
423.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
424.byte	102,15,56,221,224	/* aesenclast %xmm0,%xmm4 */
425.byte	102,15,56,221,232	/* aesenclast %xmm0,%xmm5 */
426.byte	102,15,56,221,240	/* aesenclast %xmm0,%xmm6 */
427.byte	102,15,56,221,248	/* aesenclast %xmm0,%xmm7 */
428.byte	102,68,15,56,221,192	/* aesenclast %xmm0,%xmm8 */
429.byte	102,68,15,56,221,200	/* aesenclast %xmm0,%xmm9 */
430	.byte	0xf3,0xc3	/* rep ret */
431.size	_aesni_encrypt8,.-_aesni_encrypt8
/*
 * _aesni_decrypt8: file-local helper, decrypts eight blocks in parallel.
 * In:    %rcx = key schedule, %eax = loop count, %xmm2-%xmm9 = blocks.
 * Out:   %xmm2-%xmm9 decrypted in place.
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags.
 * Same structure as _aesni_encrypt8 with aesdec / aesdeclast.
 * .Ldec_loop8_enter is not referenced in the visible part of this file.
 */
432.type	_aesni_decrypt8,@function
433.align	16
434_aesni_decrypt8:
435	movups	(%rcx),%xmm0	/* round key 0 */
436	shll	$4,%eax	/* count -> byte offset */
437	movups	16(%rcx),%xmm1	/* round key 1 */
438	xorps	%xmm0,%xmm2	/* whitening of blocks 0..4 */
439	xorps	%xmm0,%xmm3
440	pxor	%xmm0,%xmm4
441	pxor	%xmm0,%xmm5
442	pxor	%xmm0,%xmm6
443	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
444	negq	%rax	/* rax = -16*count */
445.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
446	pxor	%xmm0,%xmm7	/* whitening of blocks 5..7 */
447	pxor	%xmm0,%xmm8
448.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
449	pxor	%xmm0,%xmm9
450	movups	(%rcx,%rax,1),%xmm0	/* round key 2 (address = key + 32) */
451	addq	$16,%rax	/* rax = 16 - 16*count */
452	jmp	.Ldec_loop8_inner
453.align	16
454.Ldec_loop8:
455.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
456.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
457.Ldec_loop8_inner:
458.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
459.byte	102,15,56,222,233	/* aesdec %xmm1,%xmm5 */
460.byte	102,15,56,222,241	/* aesdec %xmm1,%xmm6 */
461.byte	102,15,56,222,249	/* aesdec %xmm1,%xmm7 */
462.byte	102,68,15,56,222,193	/* aesdec %xmm1,%xmm8 */
463.byte	102,68,15,56,222,201	/* aesdec %xmm1,%xmm9 */
464.Ldec_loop8_enter:
465	movups	(%rcx,%rax,1),%xmm1
466	addq	$32,%rax	/* sets ZF for the jnz below */
467.byte	102,15,56,222,208	/* aesdec %xmm0,%xmm2 */
468.byte	102,15,56,222,216	/* aesdec %xmm0,%xmm3 */
469.byte	102,15,56,222,224	/* aesdec %xmm0,%xmm4 */
470.byte	102,15,56,222,232	/* aesdec %xmm0,%xmm5 */
471.byte	102,15,56,222,240	/* aesdec %xmm0,%xmm6 */
472.byte	102,15,56,222,248	/* aesdec %xmm0,%xmm7 */
473.byte	102,68,15,56,222,192	/* aesdec %xmm0,%xmm8 */
474.byte	102,68,15,56,222,200	/* aesdec %xmm0,%xmm9 */
475	movups	-16(%rcx,%rax,1),%xmm0
476	jnz	.Ldec_loop8
477
478.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
479.byte	102,15,56,222,217	/* aesdec %xmm1,%xmm3 */
480.byte	102,15,56,222,225	/* aesdec %xmm1,%xmm4 */
481.byte	102,15,56,222,233	/* aesdec %xmm1,%xmm5 */
482.byte	102,15,56,222,241	/* aesdec %xmm1,%xmm6 */
483.byte	102,15,56,222,249	/* aesdec %xmm1,%xmm7 */
484.byte	102,68,15,56,222,193	/* aesdec %xmm1,%xmm8 */
485.byte	102,68,15,56,222,201	/* aesdec %xmm1,%xmm9 */
486.byte	102,15,56,223,208	/* aesdeclast %xmm0,%xmm2 */
487.byte	102,15,56,223,216	/* aesdeclast %xmm0,%xmm3 */
488.byte	102,15,56,223,224	/* aesdeclast %xmm0,%xmm4 */
489.byte	102,15,56,223,232	/* aesdeclast %xmm0,%xmm5 */
490.byte	102,15,56,223,240	/* aesdeclast %xmm0,%xmm6 */
491.byte	102,15,56,223,248	/* aesdeclast %xmm0,%xmm7 */
492.byte	102,68,15,56,223,192	/* aesdeclast %xmm0,%xmm8 */
493.byte	102,68,15,56,223,200	/* aesdeclast %xmm0,%xmm9 */
494	.byte	0xf3,0xc3	/* rep ret */
495.size	_aesni_decrypt8,.-_aesni_decrypt8
/*
 * aesni_ecb_encrypt: ECB-mode bulk encryption or decryption.
 * In:    %rdi = input, %rsi = output, %rdx = length in bytes,
 *        %rcx = key schedule (loop count read from offset 240),
 *        %r8d = direction: non-zero takes the encrypt path, zero the
 *               decrypt path.
 * Out:   processed blocks stored at (%rsi).
 * Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r10, %r11, %xmm0-%xmm9, flags.
 *
 * The length is rounded down to a multiple of 16; a result of zero
 * returns immediately. Data is handled eight blocks (0x80 bytes) at a
 * time through _aesni_encrypt8 / _aesni_decrypt8, and a remainder of one
 * to seven blocks is dispatched to the narrower helpers. The helpers
 * clobber %rcx and %eax, so the key pointer and loop count are kept in
 * %r11 / %r10d and restored before each further use.
 * On the decrypt path every data register is zeroed after its store,
 * presumably so that decrypted data does not linger in registers.
 */
496.globl	aesni_ecb_encrypt
497.hidden aesni_ecb_encrypt
498.type	aesni_ecb_encrypt,@function
499.align	16
500aesni_ecb_encrypt:
501	andq	$-16,%rdx	/* drop any partial trailing block */
502	jz	.Lecb_ret
503
504	movl	240(%rcx),%eax	/* eax = loop count */
505	movups	(%rcx),%xmm0
506	movq	%rcx,%r11	/* r11 = saved key pointer */
507	movl	%eax,%r10d	/* r10d = saved loop count */
508	testl	%r8d,%r8d
509	jz	.Lecb_decrypt
510
/* ---- encrypt path ---- */
511	cmpq	$0x80,%rdx
512	jb	.Lecb_enc_tail	/* fewer than 8 blocks in total */
513
514	movdqu	(%rdi),%xmm2	/* preload the first 8 blocks */
515	movdqu	16(%rdi),%xmm3
516	movdqu	32(%rdi),%xmm4
517	movdqu	48(%rdi),%xmm5
518	movdqu	64(%rdi),%xmm6
519	movdqu	80(%rdi),%xmm7
520	movdqu	96(%rdi),%xmm8
521	movdqu	112(%rdi),%xmm9
522	leaq	128(%rdi),%rdi
523	subq	$0x80,%rdx
524	jmp	.Lecb_enc_loop8_enter
525.align	16
/* Store the previous 8 results while loading the next 8 inputs. */
526.Lecb_enc_loop8:
527	movups	%xmm2,(%rsi)
528	movq	%r11,%rcx	/* restore key pointer for the helper */
529	movdqu	(%rdi),%xmm2
530	movl	%r10d,%eax	/* restore loop count for the helper */
531	movups	%xmm3,16(%rsi)
532	movdqu	16(%rdi),%xmm3
533	movups	%xmm4,32(%rsi)
534	movdqu	32(%rdi),%xmm4
535	movups	%xmm5,48(%rsi)
536	movdqu	48(%rdi),%xmm5
537	movups	%xmm6,64(%rsi)
538	movdqu	64(%rdi),%xmm6
539	movups	%xmm7,80(%rsi)
540	movdqu	80(%rdi),%xmm7
541	movups	%xmm8,96(%rsi)
542	movdqu	96(%rdi),%xmm8
543	movups	%xmm9,112(%rsi)
544	leaq	128(%rsi),%rsi
545	movdqu	112(%rdi),%xmm9
546	leaq	128(%rdi),%rdi
547.Lecb_enc_loop8_enter:
548
549	call	_aesni_encrypt8
550
551	subq	$0x80,%rdx
552	jnc	.Lecb_enc_loop8	/* no borrow: another full 8-block batch remains */
553
554	movups	%xmm2,(%rsi)	/* flush the last full batch */
555	movq	%r11,%rcx	/* restore key pointer for the tail */
556	movups	%xmm3,16(%rsi)
557	movl	%r10d,%eax	/* restore loop count for the tail */
558	movups	%xmm4,32(%rsi)
559	movups	%xmm5,48(%rsi)
560	movups	%xmm6,64(%rsi)
561	movups	%xmm7,80(%rsi)
562	movups	%xmm8,96(%rsi)
563	movups	%xmm9,112(%rsi)
564	leaq	128(%rsi),%rsi
565	addq	$0x80,%rdx	/* undo the final subtraction: rdx = leftover bytes */
566	jz	.Lecb_ret
567
/*
 * Tail: 1..7 blocks remain. Each cmpq is followed by a load that does
 * not touch the flags, so one compare serves both the jb and the je.
 */
568.Lecb_enc_tail:
569	movups	(%rdi),%xmm2
570	cmpq	$0x20,%rdx
571	jb	.Lecb_enc_one
572	movups	16(%rdi),%xmm3
573	je	.Lecb_enc_two
574	movups	32(%rdi),%xmm4
575	cmpq	$0x40,%rdx
576	jb	.Lecb_enc_three
577	movups	48(%rdi),%xmm5
578	je	.Lecb_enc_four
579	movups	64(%rdi),%xmm6
580	cmpq	$0x60,%rdx
581	jb	.Lecb_enc_five
582	movups	80(%rdi),%xmm7
583	je	.Lecb_enc_six
584	movdqu	96(%rdi),%xmm8	/* seven blocks: use the 8-wide helper */
585	xorps	%xmm9,%xmm9	/* eighth lane is a zero dummy; its result is discarded */
586	call	_aesni_encrypt8
587	movups	%xmm2,(%rsi)
588	movups	%xmm3,16(%rsi)
589	movups	%xmm4,32(%rsi)
590	movups	%xmm5,48(%rsi)
591	movups	%xmm6,64(%rsi)
592	movups	%xmm7,80(%rsi)
593	movups	%xmm8,96(%rsi)
594	jmp	.Lecb_ret
595.align	16
/* One block: inline single-block round loop. */
596.Lecb_enc_one:
597	movups	(%rcx),%xmm0	/* round key 0 */
598	movups	16(%rcx),%xmm1	/* round key 1 */
599	leaq	32(%rcx),%rcx
600	xorps	%xmm0,%xmm2	/* whitening */
601.Loop_enc1_3:
602.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
603	decl	%eax
604	movups	(%rcx),%xmm1
605	leaq	16(%rcx),%rcx
606	jnz	.Loop_enc1_3
607.byte	102,15,56,221,209	/* aesenclast %xmm1,%xmm2 */
608	movups	%xmm2,(%rsi)
609	jmp	.Lecb_ret
610.align	16
611.Lecb_enc_two:
612	call	_aesni_encrypt2
613	movups	%xmm2,(%rsi)
614	movups	%xmm3,16(%rsi)
615	jmp	.Lecb_ret
616.align	16
617.Lecb_enc_three:
618	call	_aesni_encrypt3
619	movups	%xmm2,(%rsi)
620	movups	%xmm3,16(%rsi)
621	movups	%xmm4,32(%rsi)
622	jmp	.Lecb_ret
623.align	16
624.Lecb_enc_four:
625	call	_aesni_encrypt4
626	movups	%xmm2,(%rsi)
627	movups	%xmm3,16(%rsi)
628	movups	%xmm4,32(%rsi)
629	movups	%xmm5,48(%rsi)
630	jmp	.Lecb_ret
631.align	16
/* Five blocks: use the 6-wide helper with a zero dummy in the sixth lane. */
632.Lecb_enc_five:
633	xorps	%xmm7,%xmm7
634	call	_aesni_encrypt6
635	movups	%xmm2,(%rsi)
636	movups	%xmm3,16(%rsi)
637	movups	%xmm4,32(%rsi)
638	movups	%xmm5,48(%rsi)
639	movups	%xmm6,64(%rsi)
640	jmp	.Lecb_ret
641.align	16
642.Lecb_enc_six:
643	call	_aesni_encrypt6
644	movups	%xmm2,(%rsi)
645	movups	%xmm3,16(%rsi)
646	movups	%xmm4,32(%rsi)
647	movups	%xmm5,48(%rsi)
648	movups	%xmm6,64(%rsi)
649	movups	%xmm7,80(%rsi)
650	jmp	.Lecb_ret
651
652.align	16
/* ---- decrypt path: mirrors the encrypt path ---- */
653.Lecb_decrypt:
654	cmpq	$0x80,%rdx
655	jb	.Lecb_dec_tail	/* fewer than 8 blocks in total */
656
657	movdqu	(%rdi),%xmm2	/* preload the first 8 blocks */
658	movdqu	16(%rdi),%xmm3
659	movdqu	32(%rdi),%xmm4
660	movdqu	48(%rdi),%xmm5
661	movdqu	64(%rdi),%xmm6
662	movdqu	80(%rdi),%xmm7
663	movdqu	96(%rdi),%xmm8
664	movdqu	112(%rdi),%xmm9
665	leaq	128(%rdi),%rdi
666	subq	$0x80,%rdx
667	jmp	.Lecb_dec_loop8_enter
668.align	16
/* Store the previous 8 results while loading the next 8 inputs. */
669.Lecb_dec_loop8:
670	movups	%xmm2,(%rsi)
671	movq	%r11,%rcx	/* restore key pointer for the helper */
672	movdqu	(%rdi),%xmm2
673	movl	%r10d,%eax	/* restore loop count for the helper */
674	movups	%xmm3,16(%rsi)
675	movdqu	16(%rdi),%xmm3
676	movups	%xmm4,32(%rsi)
677	movdqu	32(%rdi),%xmm4
678	movups	%xmm5,48(%rsi)
679	movdqu	48(%rdi),%xmm5
680	movups	%xmm6,64(%rsi)
681	movdqu	64(%rdi),%xmm6
682	movups	%xmm7,80(%rsi)
683	movdqu	80(%rdi),%xmm7
684	movups	%xmm8,96(%rsi)
685	movdqu	96(%rdi),%xmm8
686	movups	%xmm9,112(%rsi)
687	leaq	128(%rsi),%rsi
688	movdqu	112(%rdi),%xmm9
689	leaq	128(%rdi),%rdi
690.Lecb_dec_loop8_enter:
691
692	call	_aesni_decrypt8
693
694	movups	(%r11),%xmm0	/* reload round key 0 from the saved key pointer */
695	subq	$0x80,%rdx
696	jnc	.Lecb_dec_loop8	/* no borrow: another full 8-block batch remains */
697
/* Flush the last full batch, zeroing each register after its store. */
698	movups	%xmm2,(%rsi)
699	pxor	%xmm2,%xmm2
700	movq	%r11,%rcx	/* restore key pointer for the tail */
701	movups	%xmm3,16(%rsi)
702	pxor	%xmm3,%xmm3
703	movl	%r10d,%eax	/* restore loop count for the tail */
704	movups	%xmm4,32(%rsi)
705	pxor	%xmm4,%xmm4
706	movups	%xmm5,48(%rsi)
707	pxor	%xmm5,%xmm5
708	movups	%xmm6,64(%rsi)
709	pxor	%xmm6,%xmm6
710	movups	%xmm7,80(%rsi)
711	pxor	%xmm7,%xmm7
712	movups	%xmm8,96(%rsi)
713	pxor	%xmm8,%xmm8
714	movups	%xmm9,112(%rsi)
715	pxor	%xmm9,%xmm9
716	leaq	128(%rsi),%rsi
717	addq	$0x80,%rdx	/* undo the final subtraction: rdx = leftover bytes */
718	jz	.Lecb_ret
719
/* Tail: 1..7 blocks remain; same compare/load dispatch as the encrypt tail. */
720.Lecb_dec_tail:
721	movups	(%rdi),%xmm2
722	cmpq	$0x20,%rdx
723	jb	.Lecb_dec_one
724	movups	16(%rdi),%xmm3
725	je	.Lecb_dec_two
726	movups	32(%rdi),%xmm4
727	cmpq	$0x40,%rdx
728	jb	.Lecb_dec_three
729	movups	48(%rdi),%xmm5
730	je	.Lecb_dec_four
731	movups	64(%rdi),%xmm6
732	cmpq	$0x60,%rdx
733	jb	.Lecb_dec_five
734	movups	80(%rdi),%xmm7
735	je	.Lecb_dec_six
736	movups	96(%rdi),%xmm8	/* seven blocks: use the 8-wide helper */
737	movups	(%rcx),%xmm0
738	xorps	%xmm9,%xmm9	/* eighth lane is a zero dummy; its result is discarded */
739	call	_aesni_decrypt8
740	movups	%xmm2,(%rsi)
741	pxor	%xmm2,%xmm2
742	movups	%xmm3,16(%rsi)
743	pxor	%xmm3,%xmm3
744	movups	%xmm4,32(%rsi)
745	pxor	%xmm4,%xmm4
746	movups	%xmm5,48(%rsi)
747	pxor	%xmm5,%xmm5
748	movups	%xmm6,64(%rsi)
749	pxor	%xmm6,%xmm6
750	movups	%xmm7,80(%rsi)
751	pxor	%xmm7,%xmm7
752	movups	%xmm8,96(%rsi)
753	pxor	%xmm8,%xmm8
754	pxor	%xmm9,%xmm9
755	jmp	.Lecb_ret
756.align	16
/* One block: inline single-block round loop. */
757.Lecb_dec_one:
758	movups	(%rcx),%xmm0	/* round key 0 */
759	movups	16(%rcx),%xmm1	/* round key 1 */
760	leaq	32(%rcx),%rcx
761	xorps	%xmm0,%xmm2	/* whitening */
762.Loop_dec1_4:
763.byte	102,15,56,222,209	/* aesdec %xmm1,%xmm2 */
764	decl	%eax
765	movups	(%rcx),%xmm1
766	leaq	16(%rcx),%rcx
767	jnz	.Loop_dec1_4
768.byte	102,15,56,223,209	/* aesdeclast %xmm1,%xmm2 */
769	movups	%xmm2,(%rsi)
770	pxor	%xmm2,%xmm2
771	jmp	.Lecb_ret
772.align	16
773.Lecb_dec_two:
774	call	_aesni_decrypt2
775	movups	%xmm2,(%rsi)
776	pxor	%xmm2,%xmm2
777	movups	%xmm3,16(%rsi)
778	pxor	%xmm3,%xmm3
779	jmp	.Lecb_ret
780.align	16
781.Lecb_dec_three:
782	call	_aesni_decrypt3
783	movups	%xmm2,(%rsi)
784	pxor	%xmm2,%xmm2
785	movups	%xmm3,16(%rsi)
786	pxor	%xmm3,%xmm3
787	movups	%xmm4,32(%rsi)
788	pxor	%xmm4,%xmm4
789	jmp	.Lecb_ret
790.align	16
791.Lecb_dec_four:
792	call	_aesni_decrypt4
793	movups	%xmm2,(%rsi)
794	pxor	%xmm2,%xmm2
795	movups	%xmm3,16(%rsi)
796	pxor	%xmm3,%xmm3
797	movups	%xmm4,32(%rsi)
798	pxor	%xmm4,%xmm4
799	movups	%xmm5,48(%rsi)
800	pxor	%xmm5,%xmm5
801	jmp	.Lecb_ret
802.align	16
/* Five blocks: use the 6-wide helper with a zero dummy in the sixth lane. */
803.Lecb_dec_five:
804	xorps	%xmm7,%xmm7
805	call	_aesni_decrypt6
806	movups	%xmm2,(%rsi)
807	pxor	%xmm2,%xmm2
808	movups	%xmm3,16(%rsi)
809	pxor	%xmm3,%xmm3
810	movups	%xmm4,32(%rsi)
811	pxor	%xmm4,%xmm4
812	movups	%xmm5,48(%rsi)
813	pxor	%xmm5,%xmm5
814	movups	%xmm6,64(%rsi)
815	pxor	%xmm6,%xmm6
816	pxor	%xmm7,%xmm7
817	jmp	.Lecb_ret
818.align	16
/* Six blocks; this case falls through into .Lecb_ret. */
819.Lecb_dec_six:
820	call	_aesni_decrypt6
821	movups	%xmm2,(%rsi)
822	pxor	%xmm2,%xmm2
823	movups	%xmm3,16(%rsi)
824	pxor	%xmm3,%xmm3
825	movups	%xmm4,32(%rsi)
826	pxor	%xmm4,%xmm4
827	movups	%xmm5,48(%rsi)
828	pxor	%xmm5,%xmm5
829	movups	%xmm6,64(%rsi)
830	pxor	%xmm6,%xmm6
831	movups	%xmm7,80(%rsi)
832	pxor	%xmm7,%xmm7
833
/* Common exit: zero the two round-key registers and return. */
834.Lecb_ret:
835	xorps	%xmm0,%xmm0
836	pxor	%xmm1,%xmm1
837	.byte	0xf3,0xc3	/* rep ret */
838.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
/*
 * aesni_ccm64_encrypt_blocks: per-block counter encryption combined with
 * an update of a second 16-byte state, both AES operations interleaved.
 * In:    %rdi = input, %rsi = output, %rdx = number of 16-byte blocks,
 *        %rcx = key schedule (loop count read from offset 240),
 *        %r8  = 16-byte counter block (read only),
 *        %r9  = 16-byte state, read at entry and written back at exit
 *               (the CBC-MAC accumulator, judging by the function name;
 *               confirm against the caller).
 * Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r10, %r11, %xmm0-%xmm3,
 *        %xmm6-%xmm9, flags. %xmm0-%xmm3, %xmm6 and %xmm8 are zeroed
 *        before returning.
 *
 * Register roles inside the loop:
 *   %xmm6 = counter in shuffled byte order, advanced with paddq
 *   %xmm2 = counter block being encrypted (keystream after the rounds)
 *   %xmm3 = running state, XORed with each input block then encrypted
 *   %xmm8 = current input block
 *   %xmm7 = .Lbswap_mask, %xmm9 = .Lincrement64 (constants defined
 *           outside the visible part of this file)
 *   %r11  = key schedule base, %r10 = 16 - 16*count (start index)
 * NOTE(review): there is no check for a zero block count; the loop body
 * always runs at least once before %rdx is tested.
 */
839.globl	aesni_ccm64_encrypt_blocks
840.hidden aesni_ccm64_encrypt_blocks
841.type	aesni_ccm64_encrypt_blocks,@function
842.align	16
843aesni_ccm64_encrypt_blocks:
844	movl	240(%rcx),%eax	/* eax = loop count */
845	movdqu	(%r8),%xmm6	/* xmm6 = counter block */
846	movdqa	.Lincrement64(%rip),%xmm9
847	movdqa	.Lbswap_mask(%rip),%xmm7
848
849	shll	$4,%eax	/* count -> byte offset */
850	movl	$16,%r10d
851	leaq	0(%rcx),%r11	/* r11 = key schedule base */
852	movdqu	(%r9),%xmm3	/* xmm3 = incoming state */
853	movdqa	%xmm6,%xmm2	/* xmm2 = first counter block, original byte order */
854	leaq	32(%rcx,%rax,1),%rcx	/* rcx = key + 32 + 16*count */
855.byte	102,15,56,0,247	/* pshufb %xmm7,%xmm6 */
856	subq	%rax,%r10	/* r10 = 16 - 16*count */
857	jmp	.Lccm64_enc_outer
858.align	16
859.Lccm64_enc_outer:
860	movups	(%r11),%xmm0	/* round key 0 */
861	movq	%r10,%rax	/* reset the key index for this block */
862	movups	(%rdi),%xmm8	/* xmm8 = input block */
863
864	xorps	%xmm0,%xmm2	/* whiten the counter block */
865	movups	16(%r11),%xmm1	/* round key 1 */
866	xorps	%xmm8,%xmm0	/* xmm0 = key0 ^ input */
867	xorps	%xmm0,%xmm3	/* state ^= input, whitened in the same step */
868	movups	32(%r11),%xmm0	/* round key 2 */
869
870.Lccm64_enc2_loop:
871.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
872.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
873	movups	(%rcx,%rax,1),%xmm1
874	addq	$32,%rax	/* sets ZF for the jnz below */
875.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
876.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
877	movups	-16(%rcx,%rax,1),%xmm0
878	jnz	.Lccm64_enc2_loop
879.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
880.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
881	paddq	%xmm9,%xmm6	/* advance the shuffled counter */
882	decq	%rdx	/* one block done; ZF is consumed by the jnz at the loop end */
883.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
884.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
885
886	leaq	16(%rdi),%rdi
887	xorps	%xmm2,%xmm8	/* output = input ^ keystream */
888	movdqa	%xmm6,%xmm2	/* next counter ... */
889	movups	%xmm8,(%rsi)
890.byte	102,15,56,0,215	/* pshufb %xmm7,%xmm2: ... shuffled back for encryption */
891	leaq	16(%rsi),%rsi
892	jnz	.Lccm64_enc_outer	/* flags still from decq; nothing in between alters them */
893
894	pxor	%xmm0,%xmm0	/* zero working registers */
895	pxor	%xmm1,%xmm1
896	pxor	%xmm2,%xmm2
897	movups	%xmm3,(%r9)	/* write the updated state back */
898	pxor	%xmm3,%xmm3
899	pxor	%xmm8,%xmm8
900	pxor	%xmm6,%xmm6
901	.byte	0xf3,0xc3	/* rep ret */
902.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
/*
 * aesni_ccm64_decrypt_blocks: counterpart of aesni_ccm64_encrypt_blocks.
 * In:    %rdi = input, %rsi = output, %rdx = number of 16-byte blocks,
 *        %rcx = key schedule (loop count read from offset 240),
 *        %r8  = 16-byte counter block (read only),
 *        %r9  = 16-byte state, read at entry and written back at exit
 *               (the CBC-MAC accumulator, judging by the function name;
 *               confirm against the caller).
 * Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r10, %r11, %xmm0-%xmm3,
 *        %xmm6-%xmm9, flags. %xmm0-%xmm3, %xmm6 and %xmm8 are zeroed
 *        before returning.
 *
 * Only AES encryption rounds (aesenc / aesenclast) are used. The first
 * counter block is encrypted alone up front. Each outer iteration then
 * XORs the current input with the ready keystream, stores the result,
 * and runs one interleaved pass that encrypts the next counter block
 * (%xmm2) together with the state (%xmm3) after the just-recovered block
 * has been XORed into it. The state for the final block is encrypted
 * alone at .Lccm64_dec_break.
 * NOTE(review): there is no check for a zero block count; the first
 * block is loaded and processed unconditionally.
 */
903.globl	aesni_ccm64_decrypt_blocks
904.hidden aesni_ccm64_decrypt_blocks
905.type	aesni_ccm64_decrypt_blocks,@function
906.align	16
907aesni_ccm64_decrypt_blocks:
908	movl	240(%rcx),%eax	/* eax = loop count */
909	movups	(%r8),%xmm6	/* xmm6 = counter block */
910	movdqu	(%r9),%xmm3	/* xmm3 = incoming state */
911	movdqa	.Lincrement64(%rip),%xmm9
912	movdqa	.Lbswap_mask(%rip),%xmm7
913
914	movaps	%xmm6,%xmm2	/* xmm2 = first counter block, original byte order */
915	movl	%eax,%r10d	/* keep the loop count */
916	movq	%rcx,%r11	/* r11 = key schedule base */
917.byte	102,15,56,0,247	/* pshufb %xmm7,%xmm6 */
/* Encrypt the first counter block on its own. */
918	movups	(%rcx),%xmm0
919	movups	16(%rcx),%xmm1
920	leaq	32(%rcx),%rcx
921	xorps	%xmm0,%xmm2
922.Loop_enc1_5:
923.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
924	decl	%eax
925	movups	(%rcx),%xmm1
926	leaq	16(%rcx),%rcx
927	jnz	.Loop_enc1_5
928.byte	102,15,56,221,209	/* aesenclast %xmm1,%xmm2 */
929	shll	$4,%r10d	/* count -> byte offset */
930	movl	$16,%eax
931	movups	(%rdi),%xmm8	/* xmm8 = first input block */
932	paddq	%xmm9,%xmm6	/* advance the shuffled counter */
933	leaq	16(%rdi),%rdi
934	subq	%r10,%rax	/* rax = 16 - 16*count */
935	leaq	32(%r11,%r10,1),%rcx	/* rcx = key + 32 + 16*count */
936	movq	%rax,%r10	/* r10 = start index reused for every block */
937	jmp	.Lccm64_dec_outer
938.align	16
939.Lccm64_dec_outer:
940	xorps	%xmm2,%xmm8	/* output = input ^ keystream */
941	movdqa	%xmm6,%xmm2	/* next counter ... */
942	movups	%xmm8,(%rsi)
943	leaq	16(%rsi),%rsi
944.byte	102,15,56,0,215	/* pshufb %xmm7,%xmm2: ... shuffled back for encryption */
945
946	subq	$1,%rdx
947	jz	.Lccm64_dec_break	/* last block: finish the state separately */
948
949	movups	(%r11),%xmm0	/* round key 0 */
950	movq	%r10,%rax	/* reset the key index */
951	movups	16(%r11),%xmm1	/* round key 1 */
952	xorps	%xmm0,%xmm8	/* xmm8 = output block ^ key0 */
953	xorps	%xmm0,%xmm2	/* whiten the counter block */
954	xorps	%xmm8,%xmm3	/* state ^= output block, whitened in the same step */
955	movups	32(%r11),%xmm0	/* round key 2 */
956	jmp	.Lccm64_dec2_loop
957.align	16
958.Lccm64_dec2_loop:
959.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
960.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
961	movups	(%rcx,%rax,1),%xmm1
962	addq	$32,%rax	/* sets ZF for the jnz below */
963.byte	102,15,56,220,208	/* aesenc %xmm0,%xmm2 */
964.byte	102,15,56,220,216	/* aesenc %xmm0,%xmm3 */
965	movups	-16(%rcx,%rax,1),%xmm0
966	jnz	.Lccm64_dec2_loop
967	movups	(%rdi),%xmm8	/* fetch the next input block */
968	paddq	%xmm9,%xmm6	/* advance the shuffled counter */
969.byte	102,15,56,220,209	/* aesenc %xmm1,%xmm2 */
970.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
971.byte	102,15,56,221,208	/* aesenclast %xmm0,%xmm2 */
972.byte	102,15,56,221,216	/* aesenclast %xmm0,%xmm3 */
973	leaq	16(%rdi),%rdi
974	jmp	.Lccm64_dec_outer
975
976.align	16
/* Final state update: fold in the last output block and encrypt %xmm3 alone. */
977.Lccm64_dec_break:
978
979	movl	240(%r11),%eax	/* reload the loop count */
980	movups	(%r11),%xmm0	/* round key 0 */
981	movups	16(%r11),%xmm1	/* round key 1 */
982	xorps	%xmm0,%xmm8
983	leaq	32(%r11),%r11
984	xorps	%xmm8,%xmm3	/* state ^= last output block ^ key0 */
985.Loop_enc1_6:
986.byte	102,15,56,220,217	/* aesenc %xmm1,%xmm3 */
987	decl	%eax
988	movups	(%r11),%xmm1
989	leaq	16(%r11),%r11
990	jnz	.Loop_enc1_6
991.byte	102,15,56,221,217	/* aesenclast %xmm1,%xmm3 */
992	pxor	%xmm0,%xmm0	/* zero working registers */
993	pxor	%xmm1,%xmm1
994	pxor	%xmm2,%xmm2
995	movups	%xmm3,(%r9)	/* write the updated state back */
996	pxor	%xmm3,%xmm3
997	pxor	%xmm8,%xmm8
998	pxor	%xmm6,%xmm6
999	.byte	0xf3,0xc3	/* rep ret */
1000.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1001.globl	aesni_ctr32_encrypt_blocks
1002.hidden aesni_ctr32_encrypt_blocks
1003.type	aesni_ctr32_encrypt_blocks,@function
1004.align	16
1005aesni_ctr32_encrypt_blocks:
1006	cmpq	$1,%rdx
1007	jne	.Lctr32_bulk
1008
1009
1010
1011	movups	(%r8),%xmm2
1012	movups	(%rdi),%xmm3
1013	movl	240(%rcx),%edx
1014	movups	(%rcx),%xmm0
1015	movups	16(%rcx),%xmm1
1016	leaq	32(%rcx),%rcx
1017	xorps	%xmm0,%xmm2
1018.Loop_enc1_7:
1019.byte	102,15,56,220,209
1020	decl	%edx
1021	movups	(%rcx),%xmm1
1022	leaq	16(%rcx),%rcx
1023	jnz	.Loop_enc1_7
1024.byte	102,15,56,221,209
1025	pxor	%xmm0,%xmm0
1026	pxor	%xmm1,%xmm1
1027	xorps	%xmm3,%xmm2
1028	pxor	%xmm3,%xmm3
1029	movups	%xmm2,(%rsi)
1030	xorps	%xmm2,%xmm2
1031	jmp	.Lctr32_epilogue
1032
1033.align	16
1034.Lctr32_bulk:
1035	leaq	(%rsp),%r11
1036	pushq	%rbp
1037	subq	$128,%rsp
1038	andq	$-16,%rsp
1039
1040
1041
1042
1043	movdqu	(%r8),%xmm2
1044	movdqu	(%rcx),%xmm0
1045	movl	12(%r8),%r8d
1046	pxor	%xmm0,%xmm2
1047	movl	12(%rcx),%ebp
1048	movdqa	%xmm2,0(%rsp)
1049	bswapl	%r8d
1050	movdqa	%xmm2,%xmm3
1051	movdqa	%xmm2,%xmm4
1052	movdqa	%xmm2,%xmm5
1053	movdqa	%xmm2,64(%rsp)
1054	movdqa	%xmm2,80(%rsp)
1055	movdqa	%xmm2,96(%rsp)
1056	movq	%rdx,%r10
1057	movdqa	%xmm2,112(%rsp)
1058
1059	leaq	1(%r8),%rax
1060	leaq	2(%r8),%rdx
1061	bswapl	%eax
1062	bswapl	%edx
1063	xorl	%ebp,%eax
1064	xorl	%ebp,%edx
1065.byte	102,15,58,34,216,3
1066	leaq	3(%r8),%rax
1067	movdqa	%xmm3,16(%rsp)
1068.byte	102,15,58,34,226,3
1069	bswapl	%eax
1070	movq	%r10,%rdx
1071	leaq	4(%r8),%r10
1072	movdqa	%xmm4,32(%rsp)
1073	xorl	%ebp,%eax
1074	bswapl	%r10d
1075.byte	102,15,58,34,232,3
1076	xorl	%ebp,%r10d
1077	movdqa	%xmm5,48(%rsp)
1078	leaq	5(%r8),%r9
1079	movl	%r10d,64+12(%rsp)
1080	bswapl	%r9d
1081	leaq	6(%r8),%r10
1082	movl	240(%rcx),%eax
1083	xorl	%ebp,%r9d
1084	bswapl	%r10d
1085	movl	%r9d,80+12(%rsp)
1086	xorl	%ebp,%r10d
1087	leaq	7(%r8),%r9
1088	movl	%r10d,96+12(%rsp)
1089	bswapl	%r9d
1090	movl	OPENSSL_ia32cap_P+4(%rip),%r10d
1091	xorl	%ebp,%r9d
1092	andl	$71303168,%r10d
1093	movl	%r9d,112+12(%rsp)
1094
1095	movups	16(%rcx),%xmm1
1096
1097	movdqa	64(%rsp),%xmm6
1098	movdqa	80(%rsp),%xmm7
1099
1100	cmpq	$8,%rdx
1101	jb	.Lctr32_tail
1102
1103	subq	$6,%rdx
1104	cmpl	$4194304,%r10d
1105	je	.Lctr32_6x
1106
1107	leaq	128(%rcx),%rcx
1108	subq	$2,%rdx
1109	jmp	.Lctr32_loop8
1110
1111.align	16
1112.Lctr32_6x:
1113	shll	$4,%eax
1114	movl	$48,%r10d
1115	bswapl	%ebp
1116	leaq	32(%rcx,%rax,1),%rcx
1117	subq	%rax,%r10
1118	jmp	.Lctr32_loop6
1119
1120.align	16
1121.Lctr32_loop6:
1122	addl	$6,%r8d
1123	movups	-48(%rcx,%r10,1),%xmm0
1124.byte	102,15,56,220,209
1125	movl	%r8d,%eax
1126	xorl	%ebp,%eax
1127.byte	102,15,56,220,217
1128.byte	0x0f,0x38,0xf1,0x44,0x24,12
1129	leal	1(%r8),%eax
1130.byte	102,15,56,220,225
1131	xorl	%ebp,%eax
1132.byte	0x0f,0x38,0xf1,0x44,0x24,28
1133.byte	102,15,56,220,233
1134	leal	2(%r8),%eax
1135	xorl	%ebp,%eax
1136.byte	102,15,56,220,241
1137.byte	0x0f,0x38,0xf1,0x44,0x24,44
1138	leal	3(%r8),%eax
1139.byte	102,15,56,220,249
1140	movups	-32(%rcx,%r10,1),%xmm1
1141	xorl	%ebp,%eax
1142
1143.byte	102,15,56,220,208
1144.byte	0x0f,0x38,0xf1,0x44,0x24,60
1145	leal	4(%r8),%eax
1146.byte	102,15,56,220,216
1147	xorl	%ebp,%eax
1148.byte	0x0f,0x38,0xf1,0x44,0x24,76
1149.byte	102,15,56,220,224
1150	leal	5(%r8),%eax
1151	xorl	%ebp,%eax
1152.byte	102,15,56,220,232
1153.byte	0x0f,0x38,0xf1,0x44,0x24,92
1154	movq	%r10,%rax
1155.byte	102,15,56,220,240
1156.byte	102,15,56,220,248
1157	movups	-16(%rcx,%r10,1),%xmm0
1158
1159	call	.Lenc_loop6
1160
1161	movdqu	(%rdi),%xmm8
1162	movdqu	16(%rdi),%xmm9
1163	movdqu	32(%rdi),%xmm10
1164	movdqu	48(%rdi),%xmm11
1165	movdqu	64(%rdi),%xmm12
1166	movdqu	80(%rdi),%xmm13
1167	leaq	96(%rdi),%rdi
1168	movups	-64(%rcx,%r10,1),%xmm1
1169	pxor	%xmm2,%xmm8
1170	movaps	0(%rsp),%xmm2
1171	pxor	%xmm3,%xmm9
1172	movaps	16(%rsp),%xmm3
1173	pxor	%xmm4,%xmm10
1174	movaps	32(%rsp),%xmm4
1175	pxor	%xmm5,%xmm11
1176	movaps	48(%rsp),%xmm5
1177	pxor	%xmm6,%xmm12
1178	movaps	64(%rsp),%xmm6
1179	pxor	%xmm7,%xmm13
1180	movaps	80(%rsp),%xmm7
1181	movdqu	%xmm8,(%rsi)
1182	movdqu	%xmm9,16(%rsi)
1183	movdqu	%xmm10,32(%rsi)
1184	movdqu	%xmm11,48(%rsi)
1185	movdqu	%xmm12,64(%rsi)
1186	movdqu	%xmm13,80(%rsi)
1187	leaq	96(%rsi),%rsi
1188
1189	subq	$6,%rdx
1190	jnc	.Lctr32_loop6
1191
1192	addq	$6,%rdx
1193	jz	.Lctr32_done
1194
1195	leal	-48(%r10),%eax
1196	leaq	-80(%rcx,%r10,1),%rcx
1197	negl	%eax
1198	shrl	$4,%eax
1199	jmp	.Lctr32_tail
1200
1201.align	32
1202.Lctr32_loop8:
1203	addl	$8,%r8d
1204	movdqa	96(%rsp),%xmm8
1205.byte	102,15,56,220,209
1206	movl	%r8d,%r9d
1207	movdqa	112(%rsp),%xmm9
1208.byte	102,15,56,220,217
1209	bswapl	%r9d
1210	movups	32-128(%rcx),%xmm0
1211.byte	102,15,56,220,225
1212	xorl	%ebp,%r9d
1213	nop
1214.byte	102,15,56,220,233
1215	movl	%r9d,0+12(%rsp)
1216	leaq	1(%r8),%r9
1217.byte	102,15,56,220,241
1218.byte	102,15,56,220,249
1219.byte	102,68,15,56,220,193
1220.byte	102,68,15,56,220,201
1221	movups	48-128(%rcx),%xmm1
1222	bswapl	%r9d
1223.byte	102,15,56,220,208
1224.byte	102,15,56,220,216
1225	xorl	%ebp,%r9d
1226.byte	0x66,0x90
1227.byte	102,15,56,220,224
1228.byte	102,15,56,220,232
1229	movl	%r9d,16+12(%rsp)
1230	leaq	2(%r8),%r9
1231.byte	102,15,56,220,240
1232.byte	102,15,56,220,248
1233.byte	102,68,15,56,220,192
1234.byte	102,68,15,56,220,200
1235	movups	64-128(%rcx),%xmm0
1236	bswapl	%r9d
1237.byte	102,15,56,220,209
1238.byte	102,15,56,220,217
1239	xorl	%ebp,%r9d
1240.byte	0x66,0x90
1241.byte	102,15,56,220,225
1242.byte	102,15,56,220,233
1243	movl	%r9d,32+12(%rsp)
1244	leaq	3(%r8),%r9
1245.byte	102,15,56,220,241
1246.byte	102,15,56,220,249
1247.byte	102,68,15,56,220,193
1248.byte	102,68,15,56,220,201
1249	movups	80-128(%rcx),%xmm1
1250	bswapl	%r9d
1251.byte	102,15,56,220,208
1252.byte	102,15,56,220,216
1253	xorl	%ebp,%r9d
1254.byte	0x66,0x90
1255.byte	102,15,56,220,224
1256.byte	102,15,56,220,232
1257	movl	%r9d,48+12(%rsp)
1258	leaq	4(%r8),%r9
1259.byte	102,15,56,220,240
1260.byte	102,15,56,220,248
1261.byte	102,68,15,56,220,192
1262.byte	102,68,15,56,220,200
1263	movups	96-128(%rcx),%xmm0
1264	bswapl	%r9d
1265.byte	102,15,56,220,209
1266.byte	102,15,56,220,217
1267	xorl	%ebp,%r9d
1268.byte	0x66,0x90
1269.byte	102,15,56,220,225
1270.byte	102,15,56,220,233
1271	movl	%r9d,64+12(%rsp)
1272	leaq	5(%r8),%r9
1273.byte	102,15,56,220,241
1274.byte	102,15,56,220,249
1275.byte	102,68,15,56,220,193
1276.byte	102,68,15,56,220,201
1277	movups	112-128(%rcx),%xmm1
1278	bswapl	%r9d
1279.byte	102,15,56,220,208
1280.byte	102,15,56,220,216
1281	xorl	%ebp,%r9d
1282.byte	0x66,0x90
1283.byte	102,15,56,220,224
1284.byte	102,15,56,220,232
1285	movl	%r9d,80+12(%rsp)
1286	leaq	6(%r8),%r9
1287.byte	102,15,56,220,240
1288.byte	102,15,56,220,248
1289.byte	102,68,15,56,220,192
1290.byte	102,68,15,56,220,200
1291	movups	128-128(%rcx),%xmm0
1292	bswapl	%r9d
1293.byte	102,15,56,220,209
1294.byte	102,15,56,220,217
1295	xorl	%ebp,%r9d
1296.byte	0x66,0x90
1297.byte	102,15,56,220,225
1298.byte	102,15,56,220,233
1299	movl	%r9d,96+12(%rsp)
1300	leaq	7(%r8),%r9
1301.byte	102,15,56,220,241
1302.byte	102,15,56,220,249
1303.byte	102,68,15,56,220,193
1304.byte	102,68,15,56,220,201
1305	movups	144-128(%rcx),%xmm1
1306	bswapl	%r9d
1307.byte	102,15,56,220,208
1308.byte	102,15,56,220,216
1309.byte	102,15,56,220,224
1310	xorl	%ebp,%r9d
1311	movdqu	0(%rdi),%xmm10
1312.byte	102,15,56,220,232
1313	movl	%r9d,112+12(%rsp)
1314	cmpl	$11,%eax
1315.byte	102,15,56,220,240
1316.byte	102,15,56,220,248
1317.byte	102,68,15,56,220,192
1318.byte	102,68,15,56,220,200
1319	movups	160-128(%rcx),%xmm0
1320
1321	jb	.Lctr32_enc_done
1322
1323.byte	102,15,56,220,209
1324.byte	102,15,56,220,217
1325.byte	102,15,56,220,225
1326.byte	102,15,56,220,233
1327.byte	102,15,56,220,241
1328.byte	102,15,56,220,249
1329.byte	102,68,15,56,220,193
1330.byte	102,68,15,56,220,201
1331	movups	176-128(%rcx),%xmm1
1332
1333.byte	102,15,56,220,208
1334.byte	102,15,56,220,216
1335.byte	102,15,56,220,224
1336.byte	102,15,56,220,232
1337.byte	102,15,56,220,240
1338.byte	102,15,56,220,248
1339.byte	102,68,15,56,220,192
1340.byte	102,68,15,56,220,200
1341	movups	192-128(%rcx),%xmm0
1342	je	.Lctr32_enc_done
1343
1344.byte	102,15,56,220,209
1345.byte	102,15,56,220,217
1346.byte	102,15,56,220,225
1347.byte	102,15,56,220,233
1348.byte	102,15,56,220,241
1349.byte	102,15,56,220,249
1350.byte	102,68,15,56,220,193
1351.byte	102,68,15,56,220,201
1352	movups	208-128(%rcx),%xmm1
1353
1354.byte	102,15,56,220,208
1355.byte	102,15,56,220,216
1356.byte	102,15,56,220,224
1357.byte	102,15,56,220,232
1358.byte	102,15,56,220,240
1359.byte	102,15,56,220,248
1360.byte	102,68,15,56,220,192
1361.byte	102,68,15,56,220,200
1362	movups	224-128(%rcx),%xmm0
1363	jmp	.Lctr32_enc_done
1364
1365.align	16
1366.Lctr32_enc_done:
1367	movdqu	16(%rdi),%xmm11
1368	pxor	%xmm0,%xmm10
1369	movdqu	32(%rdi),%xmm12
1370	pxor	%xmm0,%xmm11
1371	movdqu	48(%rdi),%xmm13
1372	pxor	%xmm0,%xmm12
1373	movdqu	64(%rdi),%xmm14
1374	pxor	%xmm0,%xmm13
1375	movdqu	80(%rdi),%xmm15
1376	pxor	%xmm0,%xmm14
1377	pxor	%xmm0,%xmm15
1378.byte	102,15,56,220,209
1379.byte	102,15,56,220,217
1380.byte	102,15,56,220,225
1381.byte	102,15,56,220,233
1382.byte	102,15,56,220,241
1383.byte	102,15,56,220,249
1384.byte	102,68,15,56,220,193
1385.byte	102,68,15,56,220,201
1386	movdqu	96(%rdi),%xmm1
1387	leaq	128(%rdi),%rdi
1388
1389.byte	102,65,15,56,221,210
1390	pxor	%xmm0,%xmm1
1391	movdqu	112-128(%rdi),%xmm10
1392.byte	102,65,15,56,221,219
1393	pxor	%xmm0,%xmm10
1394	movdqa	0(%rsp),%xmm11
1395.byte	102,65,15,56,221,228
1396.byte	102,65,15,56,221,237
1397	movdqa	16(%rsp),%xmm12
1398	movdqa	32(%rsp),%xmm13
1399.byte	102,65,15,56,221,246
1400.byte	102,65,15,56,221,255
1401	movdqa	48(%rsp),%xmm14
1402	movdqa	64(%rsp),%xmm15
1403.byte	102,68,15,56,221,193
1404	movdqa	80(%rsp),%xmm0
1405	movups	16-128(%rcx),%xmm1
1406.byte	102,69,15,56,221,202
1407
1408	movups	%xmm2,(%rsi)
1409	movdqa	%xmm11,%xmm2
1410	movups	%xmm3,16(%rsi)
1411	movdqa	%xmm12,%xmm3
1412	movups	%xmm4,32(%rsi)
1413	movdqa	%xmm13,%xmm4
1414	movups	%xmm5,48(%rsi)
1415	movdqa	%xmm14,%xmm5
1416	movups	%xmm6,64(%rsi)
1417	movdqa	%xmm15,%xmm6
1418	movups	%xmm7,80(%rsi)
1419	movdqa	%xmm0,%xmm7
1420	movups	%xmm8,96(%rsi)
1421	movups	%xmm9,112(%rsi)
1422	leaq	128(%rsi),%rsi
1423
1424	subq	$8,%rdx
1425	jnc	.Lctr32_loop8
1426
1427	addq	$8,%rdx
1428	jz	.Lctr32_done
1429	leaq	-128(%rcx),%rcx
1430
1431.Lctr32_tail:
1432
1433
1434	leaq	16(%rcx),%rcx
1435	cmpq	$4,%rdx
1436	jb	.Lctr32_loop3
1437	je	.Lctr32_loop4
1438
1439
1440	shll	$4,%eax
1441	movdqa	96(%rsp),%xmm8
1442	pxor	%xmm9,%xmm9
1443
1444	movups	16(%rcx),%xmm0
1445.byte	102,15,56,220,209
1446.byte	102,15,56,220,217
1447	leaq	32-16(%rcx,%rax,1),%rcx
1448	negq	%rax
1449.byte	102,15,56,220,225
1450	addq	$16,%rax
1451	movups	(%rdi),%xmm10
1452.byte	102,15,56,220,233
1453.byte	102,15,56,220,241
1454	movups	16(%rdi),%xmm11
1455	movups	32(%rdi),%xmm12
1456.byte	102,15,56,220,249
1457.byte	102,68,15,56,220,193
1458
1459	call	.Lenc_loop8_enter
1460
1461	movdqu	48(%rdi),%xmm13
1462	pxor	%xmm10,%xmm2
1463	movdqu	64(%rdi),%xmm10
1464	pxor	%xmm11,%xmm3
1465	movdqu	%xmm2,(%rsi)
1466	pxor	%xmm12,%xmm4
1467	movdqu	%xmm3,16(%rsi)
1468	pxor	%xmm13,%xmm5
1469	movdqu	%xmm4,32(%rsi)
1470	pxor	%xmm10,%xmm6
1471	movdqu	%xmm5,48(%rsi)
1472	movdqu	%xmm6,64(%rsi)
1473	cmpq	$6,%rdx
1474	jb	.Lctr32_done
1475
1476	movups	80(%rdi),%xmm11
1477	xorps	%xmm11,%xmm7
1478	movups	%xmm7,80(%rsi)
1479	je	.Lctr32_done
1480
1481	movups	96(%rdi),%xmm12
1482	xorps	%xmm12,%xmm8
1483	movups	%xmm8,96(%rsi)
1484	jmp	.Lctr32_done
1485
1486.align	32
1487.Lctr32_loop4:
1488.byte	102,15,56,220,209
1489	leaq	16(%rcx),%rcx
1490	decl	%eax
1491.byte	102,15,56,220,217
1492.byte	102,15,56,220,225
1493.byte	102,15,56,220,233
1494	movups	(%rcx),%xmm1
1495	jnz	.Lctr32_loop4
1496.byte	102,15,56,221,209
1497.byte	102,15,56,221,217
1498	movups	(%rdi),%xmm10
1499	movups	16(%rdi),%xmm11
1500.byte	102,15,56,221,225
1501.byte	102,15,56,221,233
1502	movups	32(%rdi),%xmm12
1503	movups	48(%rdi),%xmm13
1504
1505	xorps	%xmm10,%xmm2
1506	movups	%xmm2,(%rsi)
1507	xorps	%xmm11,%xmm3
1508	movups	%xmm3,16(%rsi)
1509	pxor	%xmm12,%xmm4
1510	movdqu	%xmm4,32(%rsi)
1511	pxor	%xmm13,%xmm5
1512	movdqu	%xmm5,48(%rsi)
1513	jmp	.Lctr32_done
1514
1515.align	32
1516.Lctr32_loop3:
1517.byte	102,15,56,220,209
1518	leaq	16(%rcx),%rcx
1519	decl	%eax
1520.byte	102,15,56,220,217
1521.byte	102,15,56,220,225
1522	movups	(%rcx),%xmm1
1523	jnz	.Lctr32_loop3
1524.byte	102,15,56,221,209
1525.byte	102,15,56,221,217
1526.byte	102,15,56,221,225
1527
1528	movups	(%rdi),%xmm10
1529	xorps	%xmm10,%xmm2
1530	movups	%xmm2,(%rsi)
1531	cmpq	$2,%rdx
1532	jb	.Lctr32_done
1533
1534	movups	16(%rdi),%xmm11
1535	xorps	%xmm11,%xmm3
1536	movups	%xmm3,16(%rsi)
1537	je	.Lctr32_done
1538
1539	movups	32(%rdi),%xmm12
1540	xorps	%xmm12,%xmm4
1541	movups	%xmm4,32(%rsi)
1542
1543.Lctr32_done:
1544	xorps	%xmm0,%xmm0
1545	xorl	%ebp,%ebp
1546	pxor	%xmm1,%xmm1
1547	pxor	%xmm2,%xmm2
1548	pxor	%xmm3,%xmm3
1549	pxor	%xmm4,%xmm4
1550	pxor	%xmm5,%xmm5
1551	pxor	%xmm6,%xmm6
1552	pxor	%xmm7,%xmm7
1553	movaps	%xmm0,0(%rsp)
1554	pxor	%xmm8,%xmm8
1555	movaps	%xmm0,16(%rsp)
1556	pxor	%xmm9,%xmm9
1557	movaps	%xmm0,32(%rsp)
1558	pxor	%xmm10,%xmm10
1559	movaps	%xmm0,48(%rsp)
1560	pxor	%xmm11,%xmm11
1561	movaps	%xmm0,64(%rsp)
1562	pxor	%xmm12,%xmm12
1563	movaps	%xmm0,80(%rsp)
1564	pxor	%xmm13,%xmm13
1565	movaps	%xmm0,96(%rsp)
1566	pxor	%xmm14,%xmm14
1567	movaps	%xmm0,112(%rsp)
1568	pxor	%xmm15,%xmm15
1569	movq	-8(%r11),%rbp
1570	leaq	(%r11),%rsp
1571.Lctr32_epilogue:
1572	.byte	0xf3,0xc3
1573.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
# aesni_xts_encrypt -- XTS-mode encryption built on AES-NI.
#
# AES instructions are emitted as raw bytes: 102,15,56,220,M is aesenc and
# 102,15,56,221,M is aesenclast (M is the ModRM byte; 209 = %xmm1,%xmm2).
#
# Registers on entry, as consumed by the code below:
#   rdi  input pointer            rsi  output pointer
#   rdx  length in bytes          rcx  data key schedule (rounds at +240)
#   r8   key schedule used to encrypt the tweak (rounds at +240)
#   r9   pointer to the 16-byte initial tweak
# Stack: r11 holds the entry rsp, rbp is saved just below it, and an aligned
# scratch area of at least 112 bytes sits at rsp. All xmm registers and the
# scratch area are zeroed before returning.
1574.globl	aesni_xts_encrypt
1575.hidden aesni_xts_encrypt
1576.type	aesni_xts_encrypt,@function
1577.align	16
1578aesni_xts_encrypt:
# Prologue: remember entry rsp in r11, save rbp, carve out aligned scratch.
1579	leaq	(%rsp),%r11
1580	pushq	%rbp
1581	subq	$112,%rsp
1582	andq	$-16,%rsp
# Encrypt the initial tweak at (r9) with the key schedule at r8 -> xmm2.
# Meanwhile r10d picks up the round count of the data key at rcx.
1583	movups	(%r9),%xmm2
1584	movl	240(%r8),%eax
1585	movl	240(%rcx),%r10d
1586	movups	(%r8),%xmm0
1587	movups	16(%r8),%xmm1
1588	leaq	32(%r8),%r8
1589	xorps	%xmm0,%xmm2
1590.Loop_enc1_8:
1591.byte	102,15,56,220,209
1592	decl	%eax
1593	movups	(%r8),%xmm1
1594	leaq	16(%r8),%r8
1595	jnz	.Loop_enc1_8
1596.byte	102,15,56,221,209
# Data-key setup: xmm0 = round key 0, rbp = key pointer, eax = round count,
# r10 = round count * 16, r9 = full byte length, rdx = length rounded down
# to a whole number of blocks.
1597	movups	(%rcx),%xmm0
1598	movq	%rcx,%rbp
1599	movl	%r10d,%eax
1600	shll	$4,%r10d
1601	movq	%rdx,%r9
1602	andq	$-16,%rdx
1603
# Load the key at offset 16 + 16*rounds, i.e. the one used by aesenclast.
1604	movups	16(%rcx,%r10,1),%xmm1
1605
# Build the first tweaks. xmm15 is the running tweak. Each step copies it
# into the next of xmm10..xmm14 (XORed with round key 0 from xmm0), then
# advances xmm15: paddq doubles both 64-bit halves and a correction, selected
# by sign bits tracked in xmm9 and masked with .Lxts_magic, is XORed in.
# NOTE(review): .Lxts_magic is defined outside this block; this looks like
# the usual XTS multiply-by-x step -- confirm against the constant.
# xmm1 becomes (round key 0 XOR final round key) and is parked at 96(%rsp).
1606	movdqa	.Lxts_magic(%rip),%xmm8
1607	movdqa	%xmm2,%xmm15
1608	pshufd	$0x5f,%xmm2,%xmm9
1609	pxor	%xmm0,%xmm1
1610	movdqa	%xmm9,%xmm14
1611	paddd	%xmm9,%xmm9
1612	movdqa	%xmm15,%xmm10
1613	psrad	$31,%xmm14
1614	paddq	%xmm15,%xmm15
1615	pand	%xmm8,%xmm14
1616	pxor	%xmm0,%xmm10
1617	pxor	%xmm14,%xmm15
1618	movdqa	%xmm9,%xmm14
1619	paddd	%xmm9,%xmm9
1620	movdqa	%xmm15,%xmm11
1621	psrad	$31,%xmm14
1622	paddq	%xmm15,%xmm15
1623	pand	%xmm8,%xmm14
1624	pxor	%xmm0,%xmm11
1625	pxor	%xmm14,%xmm15
1626	movdqa	%xmm9,%xmm14
1627	paddd	%xmm9,%xmm9
1628	movdqa	%xmm15,%xmm12
1629	psrad	$31,%xmm14
1630	paddq	%xmm15,%xmm15
1631	pand	%xmm8,%xmm14
1632	pxor	%xmm0,%xmm12
1633	pxor	%xmm14,%xmm15
1634	movdqa	%xmm9,%xmm14
1635	paddd	%xmm9,%xmm9
1636	movdqa	%xmm15,%xmm13
1637	psrad	$31,%xmm14
1638	paddq	%xmm15,%xmm15
1639	pand	%xmm8,%xmm14
1640	pxor	%xmm0,%xmm13
1641	pxor	%xmm14,%xmm15
1642	movdqa	%xmm15,%xmm14
1643	psrad	$31,%xmm9
1644	paddq	%xmm15,%xmm15
1645	pand	%xmm8,%xmm9
1646	pxor	%xmm0,%xmm14
1647	pxor	%xmm9,%xmm15
1648	movaps	%xmm1,96(%rsp)
1649
# Fewer than 96 bytes (six blocks) of whole-block data: use the short path.
1650	subq	$96,%rdx
1651	jc	.Lxts_enc_short
1652
# Six-way loop setup: rcx = key + 32 + 16*rounds, rax = r10 = 112 - 16*rounds
# (round-key index that steps by 32 until it reaches zero), xmm1 = round
# key 1, r8 = address of .Lxts_magic.
1653	movl	$16+96,%eax
1654	leaq	32(%rbp,%r10,1),%rcx
1655	subq	%r10,%rax
1656	movups	16(%rbp),%xmm1
1657	movq	%rax,%r10
1658	leaq	.Lxts_magic(%rip),%r8
1659	jmp	.Lxts_enc_grandloop
1660
1661.align	32
# Six blocks per iteration. Inputs are XORed with (tweak XOR round key 0), so
# the input whitening and the first key addition happen in a single pxor.
# xmm8 is built as (round key 0 XOR xmm15) for the sixth block.
# xmm9 = 96(%rsp) = (round key 0 XOR final round key); XORing it into the
# tweak registers turns them into (tweak XOR final round key), which is
# spilled to 0..80(%rsp) for use as the aesenclast operand further down.
1662.Lxts_enc_grandloop:
1663	movdqu	0(%rdi),%xmm2
1664	movdqa	%xmm0,%xmm8
1665	movdqu	16(%rdi),%xmm3
1666	pxor	%xmm10,%xmm2
1667	movdqu	32(%rdi),%xmm4
1668	pxor	%xmm11,%xmm3
1669.byte	102,15,56,220,209
1670	movdqu	48(%rdi),%xmm5
1671	pxor	%xmm12,%xmm4
1672.byte	102,15,56,220,217
1673	movdqu	64(%rdi),%xmm6
1674	pxor	%xmm13,%xmm5
1675.byte	102,15,56,220,225
1676	movdqu	80(%rdi),%xmm7
1677	pxor	%xmm15,%xmm8
1678	movdqa	96(%rsp),%xmm9
1679	pxor	%xmm14,%xmm6
1680.byte	102,15,56,220,233
1681	movups	32(%rbp),%xmm0
1682	leaq	96(%rdi),%rdi
1683	pxor	%xmm8,%xmm7
1684
1685	pxor	%xmm9,%xmm10
1686.byte	102,15,56,220,241
1687	pxor	%xmm9,%xmm11
1688	movdqa	%xmm10,0(%rsp)
1689.byte	102,15,56,220,249
1690	movups	48(%rbp),%xmm1
1691	pxor	%xmm9,%xmm12
1692
1693.byte	102,15,56,220,208
1694	pxor	%xmm9,%xmm13
1695	movdqa	%xmm11,16(%rsp)
1696.byte	102,15,56,220,216
1697	pxor	%xmm9,%xmm14
1698	movdqa	%xmm12,32(%rsp)
1699.byte	102,15,56,220,224
1700.byte	102,15,56,220,232
1701	pxor	%xmm9,%xmm8
1702	movdqa	%xmm14,64(%rsp)
1703.byte	102,15,56,220,240
1704.byte	102,15,56,220,248
1705	movups	64(%rbp),%xmm0
1706	movdqa	%xmm8,80(%rsp)
1707	pshufd	$0x5f,%xmm15,%xmm9
1708	jmp	.Lxts_enc_loop6
1709.align	32
# Middle rounds: two rounds per pass over all six blocks, alternating the
# round key between xmm1 and xmm0; rax counts up by 32 until it hits zero.
1710.Lxts_enc_loop6:
1711.byte	102,15,56,220,209
1712.byte	102,15,56,220,217
1713.byte	102,15,56,220,225
1714.byte	102,15,56,220,233
1715.byte	102,15,56,220,241
1716.byte	102,15,56,220,249
1717	movups	-64(%rcx,%rax,1),%xmm1
1718	addq	$32,%rax
1719
1720.byte	102,15,56,220,208
1721.byte	102,15,56,220,216
1722.byte	102,15,56,220,224
1723.byte	102,15,56,220,232
1724.byte	102,15,56,220,240
1725.byte	102,15,56,220,248
1726	movups	-80(%rcx,%rax,1),%xmm0
1727	jnz	.Lxts_enc_loop6
1728
# Remaining rounds, interleaved with deriving the tweaks for the next
# iteration: xmm10..xmm14 each become (round key 0 from (%rbp)) XOR (tweak),
# and xmm15 keeps advancing. The value for the fourth block (xmm13) is
# spilled to 48(%rsp) only here, just before xmm13 is overwritten.
1729	movdqa	(%r8),%xmm8
1730	movdqa	%xmm9,%xmm14
1731	paddd	%xmm9,%xmm9
1732.byte	102,15,56,220,209
1733	paddq	%xmm15,%xmm15
1734	psrad	$31,%xmm14
1735.byte	102,15,56,220,217
1736	pand	%xmm8,%xmm14
1737	movups	(%rbp),%xmm10
1738.byte	102,15,56,220,225
1739.byte	102,15,56,220,233
1740.byte	102,15,56,220,241
1741	pxor	%xmm14,%xmm15
1742	movaps	%xmm10,%xmm11
1743.byte	102,15,56,220,249
1744	movups	-64(%rcx),%xmm1
1745
1746	movdqa	%xmm9,%xmm14
1747.byte	102,15,56,220,208
1748	paddd	%xmm9,%xmm9
1749	pxor	%xmm15,%xmm10
1750.byte	102,15,56,220,216
1751	psrad	$31,%xmm14
1752	paddq	%xmm15,%xmm15
1753.byte	102,15,56,220,224
1754.byte	102,15,56,220,232
1755	pand	%xmm8,%xmm14
1756	movaps	%xmm11,%xmm12
1757.byte	102,15,56,220,240
1758	pxor	%xmm14,%xmm15
1759	movdqa	%xmm9,%xmm14
1760.byte	102,15,56,220,248
1761	movups	-48(%rcx),%xmm0
1762
1763	paddd	%xmm9,%xmm9
1764.byte	102,15,56,220,209
1765	pxor	%xmm15,%xmm11
1766	psrad	$31,%xmm14
1767.byte	102,15,56,220,217
1768	paddq	%xmm15,%xmm15
1769	pand	%xmm8,%xmm14
1770.byte	102,15,56,220,225
1771.byte	102,15,56,220,233
1772	movdqa	%xmm13,48(%rsp)
1773	pxor	%xmm14,%xmm15
1774.byte	102,15,56,220,241
1775	movaps	%xmm12,%xmm13
1776	movdqa	%xmm9,%xmm14
1777.byte	102,15,56,220,249
1778	movups	-32(%rcx),%xmm1
1779
1780	paddd	%xmm9,%xmm9
1781.byte	102,15,56,220,208
1782	pxor	%xmm15,%xmm12
1783	psrad	$31,%xmm14
1784.byte	102,15,56,220,216
1785	paddq	%xmm15,%xmm15
1786	pand	%xmm8,%xmm14
1787.byte	102,15,56,220,224
1788.byte	102,15,56,220,232
1789.byte	102,15,56,220,240
1790	pxor	%xmm14,%xmm15
1791	movaps	%xmm13,%xmm14
1792.byte	102,15,56,220,248
1793
1794	movdqa	%xmm9,%xmm0
1795	paddd	%xmm9,%xmm9
1796.byte	102,15,56,220,209
1797	pxor	%xmm15,%xmm13
1798	psrad	$31,%xmm0
1799.byte	102,15,56,220,217
1800	paddq	%xmm15,%xmm15
1801	pand	%xmm8,%xmm0
1802.byte	102,15,56,220,225
1803.byte	102,15,56,220,233
1804	pxor	%xmm0,%xmm15
1805	movups	(%rbp),%xmm0
1806.byte	102,15,56,220,241
1807.byte	102,15,56,220,249
1808	movups	16(%rbp),%xmm1
1809
# Final round. The seven-byte sequences are aesenclast N(%rsp),%xmmK for
# K = 2..7 and N = 0,16,..,80: they consume the spilled
# (tweak XOR final round key) values. rax is reset from r10 for the next pass.
1810	pxor	%xmm15,%xmm14
1811.byte	102,15,56,221,84,36,0
1812	psrad	$31,%xmm9
1813	paddq	%xmm15,%xmm15
1814.byte	102,15,56,221,92,36,16
1815.byte	102,15,56,221,100,36,32
1816	pand	%xmm8,%xmm9
1817	movq	%r10,%rax
1818.byte	102,15,56,221,108,36,48
1819.byte	102,15,56,221,116,36,64
1820.byte	102,15,56,221,124,36,80
1821	pxor	%xmm9,%xmm15
1822
# Store six output blocks and loop while at least 96 more bytes remain.
1823	leaq	96(%rsi),%rsi
1824	movups	%xmm2,-96(%rsi)
1825	movups	%xmm3,-80(%rsi)
1826	movups	%xmm4,-64(%rsi)
1827	movups	%xmm5,-48(%rsi)
1828	movups	%xmm6,-32(%rsi)
1829	movups	%xmm7,-16(%rsi)
1830	subq	$96,%rdx
1831	jnc	.Lxts_enc_grandloop
1832
# Leaving the 6-way loop: recover eax = round count from r10 and point rcx
# back at the start of the key schedule.
1833	movl	$16+96,%eax
1834	subl	%r10d,%eax
1835	movq	%rbp,%rcx
1836	shrl	$4,%eax
1837
# Tail of zero to five whole blocks. r10d keeps the round count for later.
# xmm10..xmm14 still carry round key 0 (xmm0); it is XORed back out lazily,
# only for as many tweaks as the remaining length needs. Adding 96 to rdx
# restores the remaining whole-block byte count.
1838.Lxts_enc_short:
1839
1840	movl	%eax,%r10d
1841	pxor	%xmm0,%xmm10
1842	addq	$96,%rdx
1843	jz	.Lxts_enc_done
1844
1845	pxor	%xmm0,%xmm11
1846	cmpq	$0x20,%rdx
1847	jb	.Lxts_enc_one
1848	pxor	%xmm0,%xmm12
1849	je	.Lxts_enc_two
1850
1851	pxor	%xmm0,%xmm13
1852	cmpq	$0x40,%rdx
1853	jb	.Lxts_enc_three
1854	pxor	%xmm0,%xmm14
1855	je	.Lxts_enc_four
1856
# Five blocks: whiten with the tweaks, run them through _aesni_encrypt6
# (defined elsewhere) with a zeroed sixth lane, whiten again and store.
# xmm10 then takes xmm15, the tweak that follows the last block.
1857	movdqu	(%rdi),%xmm2
1858	movdqu	16(%rdi),%xmm3
1859	movdqu	32(%rdi),%xmm4
1860	pxor	%xmm10,%xmm2
1861	movdqu	48(%rdi),%xmm5
1862	pxor	%xmm11,%xmm3
1863	movdqu	64(%rdi),%xmm6
1864	leaq	80(%rdi),%rdi
1865	pxor	%xmm12,%xmm4
1866	pxor	%xmm13,%xmm5
1867	pxor	%xmm14,%xmm6
1868	pxor	%xmm7,%xmm7
1869
1870	call	_aesni_encrypt6
1871
1872	xorps	%xmm10,%xmm2
1873	movdqa	%xmm15,%xmm10
1874	xorps	%xmm11,%xmm3
1875	xorps	%xmm12,%xmm4
1876	movdqu	%xmm2,(%rsi)
1877	xorps	%xmm13,%xmm5
1878	movdqu	%xmm3,16(%rsi)
1879	xorps	%xmm14,%xmm6
1880	movdqu	%xmm4,32(%rsi)
1881	movdqu	%xmm5,48(%rsi)
1882	movdqu	%xmm6,64(%rsi)
1883	leaq	80(%rsi),%rsi
1884	jmp	.Lxts_enc_done
1885
1886.align	16
# One block: inline single-block encryption with the key at rcx and eax
# rounds. xmm10 then takes the next tweak (xmm11).
1887.Lxts_enc_one:
1888	movups	(%rdi),%xmm2
1889	leaq	16(%rdi),%rdi
1890	xorps	%xmm10,%xmm2
1891	movups	(%rcx),%xmm0
1892	movups	16(%rcx),%xmm1
1893	leaq	32(%rcx),%rcx
1894	xorps	%xmm0,%xmm2
1895.Loop_enc1_9:
1896.byte	102,15,56,220,209
1897	decl	%eax
1898	movups	(%rcx),%xmm1
1899	leaq	16(%rcx),%rcx
1900	jnz	.Loop_enc1_9
1901.byte	102,15,56,221,209
1902	xorps	%xmm10,%xmm2
1903	movdqa	%xmm11,%xmm10
1904	movups	%xmm2,(%rsi)
1905	leaq	16(%rsi),%rsi
1906	jmp	.Lxts_enc_done
1907
1908.align	16
# Two blocks via _aesni_encrypt2 (blocks in xmm2/xmm3, key in rcx, rounds in
# eax). xmm10 then takes the next tweak (xmm12).
1909.Lxts_enc_two:
1910	movups	(%rdi),%xmm2
1911	movups	16(%rdi),%xmm3
1912	leaq	32(%rdi),%rdi
1913	xorps	%xmm10,%xmm2
1914	xorps	%xmm11,%xmm3
1915
1916	call	_aesni_encrypt2
1917
1918	xorps	%xmm10,%xmm2
1919	movdqa	%xmm12,%xmm10
1920	xorps	%xmm11,%xmm3
1921	movups	%xmm2,(%rsi)
1922	movups	%xmm3,16(%rsi)
1923	leaq	32(%rsi),%rsi
1924	jmp	.Lxts_enc_done
1925
1926.align	16
# Three blocks via _aesni_encrypt3 (defined elsewhere). xmm10 then takes the
# next tweak (xmm13).
1927.Lxts_enc_three:
1928	movups	(%rdi),%xmm2
1929	movups	16(%rdi),%xmm3
1930	movups	32(%rdi),%xmm4
1931	leaq	48(%rdi),%rdi
1932	xorps	%xmm10,%xmm2
1933	xorps	%xmm11,%xmm3
1934	xorps	%xmm12,%xmm4
1935
1936	call	_aesni_encrypt3
1937
1938	xorps	%xmm10,%xmm2
1939	movdqa	%xmm13,%xmm10
1940	xorps	%xmm11,%xmm3
1941	xorps	%xmm12,%xmm4
1942	movups	%xmm2,(%rsi)
1943	movups	%xmm3,16(%rsi)
1944	movups	%xmm4,32(%rsi)
1945	leaq	48(%rsi),%rsi
1946	jmp	.Lxts_enc_done
1947
1948.align	16
# Four blocks via _aesni_encrypt4 (defined elsewhere). xmm10 then takes the
# next tweak (xmm14).
1949.Lxts_enc_four:
1950	movups	(%rdi),%xmm2
1951	movups	16(%rdi),%xmm3
1952	movups	32(%rdi),%xmm4
1953	xorps	%xmm10,%xmm2
1954	movups	48(%rdi),%xmm5
1955	leaq	64(%rdi),%rdi
1956	xorps	%xmm11,%xmm3
1957	xorps	%xmm12,%xmm4
1958	xorps	%xmm13,%xmm5
1959
1960	call	_aesni_encrypt4
1961
1962	pxor	%xmm10,%xmm2
1963	movdqa	%xmm14,%xmm10
1964	pxor	%xmm11,%xmm3
1965	pxor	%xmm12,%xmm4
1966	movdqu	%xmm2,(%rsi)
1967	pxor	%xmm13,%xmm5
1968	movdqu	%xmm3,16(%rsi)
1969	movdqu	%xmm4,32(%rsi)
1970	movdqu	%xmm5,48(%rsi)
1971	leaq	64(%rsi),%rsi
1972	jmp	.Lxts_enc_done
1973
1974.align	16
# All whole blocks are written. r9 & 15 is the number of trailing bytes; with
# none left the function is finished, otherwise rdx counts them for stealing.
1975.Lxts_enc_done:
1976	andq	$15,%r9
1977	jz	.Lxts_enc_ret
1978	movq	%r9,%rdx
1979
# Ciphertext stealing: for each trailing byte, the input byte is written into
# the last full output block (at -16(%rsi)) and the byte it replaces is moved
# to the partial output block at 0(%rsi).
1980.Lxts_enc_steal:
1981	movzbl	(%rdi),%eax
1982	movzbl	-16(%rsi),%ecx
1983	leaq	1(%rdi),%rdi
1984	movb	%al,-16(%rsi)
1985	movb	%cl,0(%rsi)
1986	leaq	1(%rsi),%rsi
1987	subq	$1,%rdx
1988	jnz	.Lxts_enc_steal
1989
# Rewind rsi, then re-encrypt the patched block at -16(%rsi) in place using
# the tweak in xmm10, the key at rbp and the round count saved in r10d.
1990	subq	%r9,%rsi
1991	movq	%rbp,%rcx
1992	movl	%r10d,%eax
1993
1994	movups	-16(%rsi),%xmm2
1995	xorps	%xmm10,%xmm2
1996	movups	(%rcx),%xmm0
1997	movups	16(%rcx),%xmm1
1998	leaq	32(%rcx),%rcx
1999	xorps	%xmm0,%xmm2
2000.Loop_enc1_10:
2001.byte	102,15,56,220,209
2002	decl	%eax
2003	movups	(%rcx),%xmm1
2004	leaq	16(%rcx),%rcx
2005	jnz	.Loop_enc1_10
2006.byte	102,15,56,221,209
2007	xorps	%xmm10,%xmm2
2008	movups	%xmm2,-16(%rsi)
2009
# Exit: zero xmm0..xmm15 and the 112 bytes of stack scratch, then restore rbp
# and rsp from the entry stack pointer kept in r11.
2010.Lxts_enc_ret:
2011	xorps	%xmm0,%xmm0
2012	pxor	%xmm1,%xmm1
2013	pxor	%xmm2,%xmm2
2014	pxor	%xmm3,%xmm3
2015	pxor	%xmm4,%xmm4
2016	pxor	%xmm5,%xmm5
2017	pxor	%xmm6,%xmm6
2018	pxor	%xmm7,%xmm7
2019	movaps	%xmm0,0(%rsp)
2020	pxor	%xmm8,%xmm8
2021	movaps	%xmm0,16(%rsp)
2022	pxor	%xmm9,%xmm9
2023	movaps	%xmm0,32(%rsp)
2024	pxor	%xmm10,%xmm10
2025	movaps	%xmm0,48(%rsp)
2026	pxor	%xmm11,%xmm11
2027	movaps	%xmm0,64(%rsp)
2028	pxor	%xmm12,%xmm12
2029	movaps	%xmm0,80(%rsp)
2030	pxor	%xmm13,%xmm13
2031	movaps	%xmm0,96(%rsp)
2032	pxor	%xmm14,%xmm14
2033	pxor	%xmm15,%xmm15
2034	movq	-8(%r11),%rbp
2035	leaq	(%r11),%rsp
# The bytes 0xf3,0xc3 encode rep ret.
2036.Lxts_enc_epilogue:
2037	.byte	0xf3,0xc3
2038.size	aesni_xts_encrypt,.-aesni_xts_encrypt
# aesni_xts_decrypt -- XTS-mode decryption built on AES-NI.
#
# AES instructions are emitted as raw bytes: 102,15,56,222,M is aesdec and
# 102,15,56,223,M is aesdeclast (M is the ModRM byte; 209 = %xmm1,%xmm2).
# The initial tweak is still ENCRYPTED (220/221 = aesenc/aesenclast).
#
# Registers on entry, as consumed by the code below:
#   rdi  input pointer            rsi  output pointer
#   rdx  length in bytes          rcx  data key schedule (rounds at +240)
#   r8   key schedule used to encrypt the tweak (rounds at +240)
#   r9   pointer to the 16-byte initial tweak
# Stack: r11 holds the entry rsp, rbp is saved just below it, and an aligned
# scratch area of at least 112 bytes sits at rsp. All xmm registers and the
# scratch area are zeroed before returning.
2039.globl	aesni_xts_decrypt
2040.hidden aesni_xts_decrypt
2041.type	aesni_xts_decrypt,@function
2042.align	16
2043aesni_xts_decrypt:
# Prologue: remember entry rsp in r11, save rbp, carve out aligned scratch.
2044	leaq	(%rsp),%r11
2045	pushq	%rbp
2046	subq	$112,%rsp
2047	andq	$-16,%rsp
# Encrypt the initial tweak at (r9) with the key schedule at r8 -> xmm2.
# Meanwhile r10d picks up the round count of the data key at rcx.
2048	movups	(%r9),%xmm2
2049	movl	240(%r8),%eax
2050	movl	240(%rcx),%r10d
2051	movups	(%r8),%xmm0
2052	movups	16(%r8),%xmm1
2053	leaq	32(%r8),%r8
2054	xorps	%xmm0,%xmm2
2055.Loop_enc1_11:
2056.byte	102,15,56,220,209
2057	decl	%eax
2058	movups	(%r8),%xmm1
2059	leaq	16(%r8),%r8
2060	jnz	.Loop_enc1_11
2061.byte	102,15,56,221,209
# When the length is not a multiple of 16, hold back one extra whole block
# (rdx -= 16) so that it is left for the stealing code at .Lxts_dec_done2.
2062	xorl	%eax,%eax
2063	testq	$15,%rdx
2064	setnz	%al
2065	shlq	$4,%rax
2066	subq	%rax,%rdx
2067
# Data-key setup: xmm0 = round key 0, rbp = key pointer, eax = round count,
# r10 = round count * 16, r9 = adjusted byte length, rdx = that length
# rounded down to a whole number of blocks.
2068	movups	(%rcx),%xmm0
2069	movq	%rcx,%rbp
2070	movl	%r10d,%eax
2071	shll	$4,%r10d
2072	movq	%rdx,%r9
2073	andq	$-16,%rdx
2074
# Load the key at offset 16 + 16*rounds, i.e. the one used by aesdeclast.
2075	movups	16(%rcx,%r10,1),%xmm1
2076
# Build the first tweaks. xmm15 is the running tweak. Each step copies it
# into the next of xmm10..xmm14 (XORed with round key 0 from xmm0), then
# advances xmm15: paddq doubles both 64-bit halves and a correction, selected
# by sign bits tracked in xmm9 and masked with .Lxts_magic, is XORed in.
# NOTE(review): .Lxts_magic is defined outside this block; this looks like
# the usual XTS multiply-by-x step -- confirm against the constant.
# xmm1 becomes (round key 0 XOR final round key) and is parked at 96(%rsp).
2077	movdqa	.Lxts_magic(%rip),%xmm8
2078	movdqa	%xmm2,%xmm15
2079	pshufd	$0x5f,%xmm2,%xmm9
2080	pxor	%xmm0,%xmm1
2081	movdqa	%xmm9,%xmm14
2082	paddd	%xmm9,%xmm9
2083	movdqa	%xmm15,%xmm10
2084	psrad	$31,%xmm14
2085	paddq	%xmm15,%xmm15
2086	pand	%xmm8,%xmm14
2087	pxor	%xmm0,%xmm10
2088	pxor	%xmm14,%xmm15
2089	movdqa	%xmm9,%xmm14
2090	paddd	%xmm9,%xmm9
2091	movdqa	%xmm15,%xmm11
2092	psrad	$31,%xmm14
2093	paddq	%xmm15,%xmm15
2094	pand	%xmm8,%xmm14
2095	pxor	%xmm0,%xmm11
2096	pxor	%xmm14,%xmm15
2097	movdqa	%xmm9,%xmm14
2098	paddd	%xmm9,%xmm9
2099	movdqa	%xmm15,%xmm12
2100	psrad	$31,%xmm14
2101	paddq	%xmm15,%xmm15
2102	pand	%xmm8,%xmm14
2103	pxor	%xmm0,%xmm12
2104	pxor	%xmm14,%xmm15
2105	movdqa	%xmm9,%xmm14
2106	paddd	%xmm9,%xmm9
2107	movdqa	%xmm15,%xmm13
2108	psrad	$31,%xmm14
2109	paddq	%xmm15,%xmm15
2110	pand	%xmm8,%xmm14
2111	pxor	%xmm0,%xmm13
2112	pxor	%xmm14,%xmm15
2113	movdqa	%xmm15,%xmm14
2114	psrad	$31,%xmm9
2115	paddq	%xmm15,%xmm15
2116	pand	%xmm8,%xmm9
2117	pxor	%xmm0,%xmm14
2118	pxor	%xmm9,%xmm15
2119	movaps	%xmm1,96(%rsp)
2120
# Fewer than 96 bytes (six blocks) of whole-block data: use the short path.
2121	subq	$96,%rdx
2122	jc	.Lxts_dec_short
2123
# Six-way loop setup: rcx = key + 32 + 16*rounds, rax = r10 = 112 - 16*rounds
# (round-key index that steps by 32 until it reaches zero), xmm1 = round
# key 1, r8 = address of .Lxts_magic.
2124	movl	$16+96,%eax
2125	leaq	32(%rbp,%r10,1),%rcx
2126	subq	%r10,%rax
2127	movups	16(%rbp),%xmm1
2128	movq	%rax,%r10
2129	leaq	.Lxts_magic(%rip),%r8
2130	jmp	.Lxts_dec_grandloop
2131
2132.align	32
# Six blocks per iteration. Inputs are XORed with (tweak XOR round key 0), so
# the input whitening and the first key addition happen in a single pxor.
# xmm8 is built as (round key 0 XOR xmm15) for the sixth block.
# xmm9 = 96(%rsp) = (round key 0 XOR final round key); XORing it into the
# tweak registers turns them into (tweak XOR final round key), which is
# spilled to 0..80(%rsp) for use as the aesdeclast operand further down.
2133.Lxts_dec_grandloop:
2134	movdqu	0(%rdi),%xmm2
2135	movdqa	%xmm0,%xmm8
2136	movdqu	16(%rdi),%xmm3
2137	pxor	%xmm10,%xmm2
2138	movdqu	32(%rdi),%xmm4
2139	pxor	%xmm11,%xmm3
2140.byte	102,15,56,222,209
2141	movdqu	48(%rdi),%xmm5
2142	pxor	%xmm12,%xmm4
2143.byte	102,15,56,222,217
2144	movdqu	64(%rdi),%xmm6
2145	pxor	%xmm13,%xmm5
2146.byte	102,15,56,222,225
2147	movdqu	80(%rdi),%xmm7
2148	pxor	%xmm15,%xmm8
2149	movdqa	96(%rsp),%xmm9
2150	pxor	%xmm14,%xmm6
2151.byte	102,15,56,222,233
2152	movups	32(%rbp),%xmm0
2153	leaq	96(%rdi),%rdi
2154	pxor	%xmm8,%xmm7
2155
2156	pxor	%xmm9,%xmm10
2157.byte	102,15,56,222,241
2158	pxor	%xmm9,%xmm11
2159	movdqa	%xmm10,0(%rsp)
2160.byte	102,15,56,222,249
2161	movups	48(%rbp),%xmm1
2162	pxor	%xmm9,%xmm12
2163
2164.byte	102,15,56,222,208
2165	pxor	%xmm9,%xmm13
2166	movdqa	%xmm11,16(%rsp)
2167.byte	102,15,56,222,216
2168	pxor	%xmm9,%xmm14
2169	movdqa	%xmm12,32(%rsp)
2170.byte	102,15,56,222,224
2171.byte	102,15,56,222,232
2172	pxor	%xmm9,%xmm8
2173	movdqa	%xmm14,64(%rsp)
2174.byte	102,15,56,222,240
2175.byte	102,15,56,222,248
2176	movups	64(%rbp),%xmm0
2177	movdqa	%xmm8,80(%rsp)
2178	pshufd	$0x5f,%xmm15,%xmm9
2179	jmp	.Lxts_dec_loop6
2180.align	32
# Middle rounds: two rounds per pass over all six blocks, alternating the
# round key between xmm1 and xmm0; rax counts up by 32 until it hits zero.
2181.Lxts_dec_loop6:
2182.byte	102,15,56,222,209
2183.byte	102,15,56,222,217
2184.byte	102,15,56,222,225
2185.byte	102,15,56,222,233
2186.byte	102,15,56,222,241
2187.byte	102,15,56,222,249
2188	movups	-64(%rcx,%rax,1),%xmm1
2189	addq	$32,%rax
2190
2191.byte	102,15,56,222,208
2192.byte	102,15,56,222,216
2193.byte	102,15,56,222,224
2194.byte	102,15,56,222,232
2195.byte	102,15,56,222,240
2196.byte	102,15,56,222,248
2197	movups	-80(%rcx,%rax,1),%xmm0
2198	jnz	.Lxts_dec_loop6
2199
# Remaining rounds, interleaved with deriving the tweaks for the next
# iteration: xmm10..xmm14 each become (round key 0 from (%rbp)) XOR (tweak),
# and xmm15 keeps advancing. The value for the fourth block (xmm13) is
# spilled to 48(%rsp) only here, just before xmm13 is overwritten.
2200	movdqa	(%r8),%xmm8
2201	movdqa	%xmm9,%xmm14
2202	paddd	%xmm9,%xmm9
2203.byte	102,15,56,222,209
2204	paddq	%xmm15,%xmm15
2205	psrad	$31,%xmm14
2206.byte	102,15,56,222,217
2207	pand	%xmm8,%xmm14
2208	movups	(%rbp),%xmm10
2209.byte	102,15,56,222,225
2210.byte	102,15,56,222,233
2211.byte	102,15,56,222,241
2212	pxor	%xmm14,%xmm15
2213	movaps	%xmm10,%xmm11
2214.byte	102,15,56,222,249
2215	movups	-64(%rcx),%xmm1
2216
2217	movdqa	%xmm9,%xmm14
2218.byte	102,15,56,222,208
2219	paddd	%xmm9,%xmm9
2220	pxor	%xmm15,%xmm10
2221.byte	102,15,56,222,216
2222	psrad	$31,%xmm14
2223	paddq	%xmm15,%xmm15
2224.byte	102,15,56,222,224
2225.byte	102,15,56,222,232
2226	pand	%xmm8,%xmm14
2227	movaps	%xmm11,%xmm12
2228.byte	102,15,56,222,240
2229	pxor	%xmm14,%xmm15
2230	movdqa	%xmm9,%xmm14
2231.byte	102,15,56,222,248
2232	movups	-48(%rcx),%xmm0
2233
2234	paddd	%xmm9,%xmm9
2235.byte	102,15,56,222,209
2236	pxor	%xmm15,%xmm11
2237	psrad	$31,%xmm14
2238.byte	102,15,56,222,217
2239	paddq	%xmm15,%xmm15
2240	pand	%xmm8,%xmm14
2241.byte	102,15,56,222,225
2242.byte	102,15,56,222,233
2243	movdqa	%xmm13,48(%rsp)
2244	pxor	%xmm14,%xmm15
2245.byte	102,15,56,222,241
2246	movaps	%xmm12,%xmm13
2247	movdqa	%xmm9,%xmm14
2248.byte	102,15,56,222,249
2249	movups	-32(%rcx),%xmm1
2250
2251	paddd	%xmm9,%xmm9
2252.byte	102,15,56,222,208
2253	pxor	%xmm15,%xmm12
2254	psrad	$31,%xmm14
2255.byte	102,15,56,222,216
2256	paddq	%xmm15,%xmm15
2257	pand	%xmm8,%xmm14
2258.byte	102,15,56,222,224
2259.byte	102,15,56,222,232
2260.byte	102,15,56,222,240
2261	pxor	%xmm14,%xmm15
2262	movaps	%xmm13,%xmm14
2263.byte	102,15,56,222,248
2264
2265	movdqa	%xmm9,%xmm0
2266	paddd	%xmm9,%xmm9
2267.byte	102,15,56,222,209
2268	pxor	%xmm15,%xmm13
2269	psrad	$31,%xmm0
2270.byte	102,15,56,222,217
2271	paddq	%xmm15,%xmm15
2272	pand	%xmm8,%xmm0
2273.byte	102,15,56,222,225
2274.byte	102,15,56,222,233
2275	pxor	%xmm0,%xmm15
2276	movups	(%rbp),%xmm0
2277.byte	102,15,56,222,241
2278.byte	102,15,56,222,249
2279	movups	16(%rbp),%xmm1
2280
# Final round. The seven-byte sequences are aesdeclast N(%rsp),%xmmK for
# K = 2..7 and N = 0,16,..,80: they consume the spilled
# (tweak XOR final round key) values. rax is reset from r10 for the next pass.
2281	pxor	%xmm15,%xmm14
2282.byte	102,15,56,223,84,36,0
2283	psrad	$31,%xmm9
2284	paddq	%xmm15,%xmm15
2285.byte	102,15,56,223,92,36,16
2286.byte	102,15,56,223,100,36,32
2287	pand	%xmm8,%xmm9
2288	movq	%r10,%rax
2289.byte	102,15,56,223,108,36,48
2290.byte	102,15,56,223,116,36,64
2291.byte	102,15,56,223,124,36,80
2292	pxor	%xmm9,%xmm15
2293
# Store six output blocks and loop while at least 96 more bytes remain.
2294	leaq	96(%rsi),%rsi
2295	movups	%xmm2,-96(%rsi)
2296	movups	%xmm3,-80(%rsi)
2297	movups	%xmm4,-64(%rsi)
2298	movups	%xmm5,-48(%rsi)
2299	movups	%xmm6,-32(%rsi)
2300	movups	%xmm7,-16(%rsi)
2301	subq	$96,%rdx
2302	jnc	.Lxts_dec_grandloop
2303
# Leaving the 6-way loop: recover eax = round count from r10 and point rcx
# back at the start of the key schedule.
2304	movl	$16+96,%eax
2305	subl	%r10d,%eax
2306	movq	%rbp,%rcx
2307	shrl	$4,%eax
2308
# Tail of zero to five whole blocks. r10d keeps the round count for later.
# Round key 0 (xmm0) is XORed back out of the tweaks lazily. Unlike the
# encrypt side, xmm11 is stripped before the zero-length check because the
# stealing code at .Lxts_dec_done2 reads both xmm10 and xmm11.
2309.Lxts_dec_short:
2310
2311	movl	%eax,%r10d
2312	pxor	%xmm0,%xmm10
2313	pxor	%xmm0,%xmm11
2314	addq	$96,%rdx
2315	jz	.Lxts_dec_done
2316
2317	pxor	%xmm0,%xmm12
2318	cmpq	$0x20,%rdx
2319	jb	.Lxts_dec_one
2320	pxor	%xmm0,%xmm13
2321	je	.Lxts_dec_two
2322
2323	pxor	%xmm0,%xmm14
2324	cmpq	$0x40,%rdx
2325	jb	.Lxts_dec_three
2326	je	.Lxts_dec_four
2327
# Five blocks: whiten with the tweaks, run them through _aesni_decrypt6
# (defined elsewhere), whiten again and store.
2328	movdqu	(%rdi),%xmm2
2329	movdqu	16(%rdi),%xmm3
2330	movdqu	32(%rdi),%xmm4
2331	pxor	%xmm10,%xmm2
2332	movdqu	48(%rdi),%xmm5
2333	pxor	%xmm11,%xmm3
2334	movdqu	64(%rdi),%xmm6
2335	leaq	80(%rdi),%rdi
2336	pxor	%xmm12,%xmm4
2337	pxor	%xmm13,%xmm5
2338	pxor	%xmm14,%xmm6
2339
2340	call	_aesni_decrypt6
2341
# While storing, xmm14 becomes the per-dword sign mask of xmm15 (pcmpgtd
# against zero) and a shuffled copy lands in xmm11. With no trailing bytes
# the function is done here.
2342	xorps	%xmm10,%xmm2
2343	xorps	%xmm11,%xmm3
2344	xorps	%xmm12,%xmm4
2345	movdqu	%xmm2,(%rsi)
2346	xorps	%xmm13,%xmm5
2347	movdqu	%xmm3,16(%rsi)
2348	xorps	%xmm14,%xmm6
2349	movdqu	%xmm4,32(%rsi)
2350	pxor	%xmm14,%xmm14
2351	movdqu	%xmm5,48(%rsi)
2352	pcmpgtd	%xmm15,%xmm14
2353	movdqu	%xmm6,64(%rsi)
2354	leaq	80(%rsi),%rsi
2355	pshufd	$0x13,%xmm14,%xmm11
2356	andq	$15,%r9
2357	jz	.Lxts_dec_ret
2358
# Stealing needed: xmm10 = xmm15 and xmm11 = the tweak after it (xmm15
# doubled, corrected with the mask ANDed with xmm8).
# NOTE(review): this relies on xmm8 still holding the .Lxts_magic constant
# after the call to _aesni_decrypt6, which is not visible here -- confirm.
2359	movdqa	%xmm15,%xmm10
2360	paddq	%xmm15,%xmm15
2361	pand	%xmm8,%xmm11
2362	pxor	%xmm15,%xmm11
2363	jmp	.Lxts_dec_done2
2364
2365.align	16
# One block: inline single-block decryption with the key at rcx and eax
# rounds. Afterwards xmm10/xmm11 take the next two tweaks (xmm11/xmm12).
2366.Lxts_dec_one:
2367	movups	(%rdi),%xmm2
2368	leaq	16(%rdi),%rdi
2369	xorps	%xmm10,%xmm2
2370	movups	(%rcx),%xmm0
2371	movups	16(%rcx),%xmm1
2372	leaq	32(%rcx),%rcx
2373	xorps	%xmm0,%xmm2
2374.Loop_dec1_12:
2375.byte	102,15,56,222,209
2376	decl	%eax
2377	movups	(%rcx),%xmm1
2378	leaq	16(%rcx),%rcx
2379	jnz	.Loop_dec1_12
2380.byte	102,15,56,223,209
2381	xorps	%xmm10,%xmm2
2382	movdqa	%xmm11,%xmm10
2383	movups	%xmm2,(%rsi)
2384	movdqa	%xmm12,%xmm11
2385	leaq	16(%rsi),%rsi
2386	jmp	.Lxts_dec_done
2387
2388.align	16
# Two blocks via _aesni_decrypt2 (defined elsewhere). Afterwards xmm10/xmm11
# take the next two tweaks (xmm12/xmm13).
2389.Lxts_dec_two:
2390	movups	(%rdi),%xmm2
2391	movups	16(%rdi),%xmm3
2392	leaq	32(%rdi),%rdi
2393	xorps	%xmm10,%xmm2
2394	xorps	%xmm11,%xmm3
2395
2396	call	_aesni_decrypt2
2397
2398	xorps	%xmm10,%xmm2
2399	movdqa	%xmm12,%xmm10
2400	xorps	%xmm11,%xmm3
2401	movdqa	%xmm13,%xmm11
2402	movups	%xmm2,(%rsi)
2403	movups	%xmm3,16(%rsi)
2404	leaq	32(%rsi),%rsi
2405	jmp	.Lxts_dec_done
2406
2407.align	16
# Three blocks via _aesni_decrypt3 (defined elsewhere). Afterwards
# xmm10/xmm11 take the next two tweaks (xmm13/xmm14).
2408.Lxts_dec_three:
2409	movups	(%rdi),%xmm2
2410	movups	16(%rdi),%xmm3
2411	movups	32(%rdi),%xmm4
2412	leaq	48(%rdi),%rdi
2413	xorps	%xmm10,%xmm2
2414	xorps	%xmm11,%xmm3
2415	xorps	%xmm12,%xmm4
2416
2417	call	_aesni_decrypt3
2418
2419	xorps	%xmm10,%xmm2
2420	movdqa	%xmm13,%xmm10
2421	xorps	%xmm11,%xmm3
2422	movdqa	%xmm14,%xmm11
2423	xorps	%xmm12,%xmm4
2424	movups	%xmm2,(%rsi)
2425	movups	%xmm3,16(%rsi)
2426	movups	%xmm4,32(%rsi)
2427	leaq	48(%rsi),%rsi
2428	jmp	.Lxts_dec_done
2429
2430.align	16
# Four blocks via _aesni_decrypt4 (defined elsewhere). Afterwards
# xmm10/xmm11 take the next two tweaks (xmm14/xmm15).
2431.Lxts_dec_four:
2432	movups	(%rdi),%xmm2
2433	movups	16(%rdi),%xmm3
2434	movups	32(%rdi),%xmm4
2435	xorps	%xmm10,%xmm2
2436	movups	48(%rdi),%xmm5
2437	leaq	64(%rdi),%rdi
2438	xorps	%xmm11,%xmm3
2439	xorps	%xmm12,%xmm4
2440	xorps	%xmm13,%xmm5
2441
2442	call	_aesni_decrypt4
2443
2444	pxor	%xmm10,%xmm2
2445	movdqa	%xmm14,%xmm10
2446	pxor	%xmm11,%xmm3
2447	movdqa	%xmm15,%xmm11
2448	pxor	%xmm12,%xmm4
2449	movdqu	%xmm2,(%rsi)
2450	pxor	%xmm13,%xmm5
2451	movdqu	%xmm3,16(%rsi)
2452	movdqu	%xmm4,32(%rsi)
2453	movdqu	%xmm5,48(%rsi)
2454	leaq	64(%rsi),%rsi
2455	jmp	.Lxts_dec_done
2456
2457.align	16
# All regular blocks are written. r9 & 15 is the number of trailing bytes;
# with none left the function is finished.
2458.Lxts_dec_done:
2459	andq	$15,%r9
2460	jz	.Lxts_dec_ret
# Stealing, step 1: decrypt the held-back whole block at (rdi) with the LATER
# tweak (xmm11) and write it to (rsi). rdx = trailing byte count.
2461.Lxts_dec_done2:
2462	movq	%r9,%rdx
2463	movq	%rbp,%rcx
2464	movl	%r10d,%eax
2465
2466	movups	(%rdi),%xmm2
2467	xorps	%xmm11,%xmm2
2468	movups	(%rcx),%xmm0
2469	movups	16(%rcx),%xmm1
2470	leaq	32(%rcx),%rcx
2471	xorps	%xmm0,%xmm2
2472.Loop_dec1_13:
2473.byte	102,15,56,222,209
2474	decl	%eax
2475	movups	(%rcx),%xmm1
2476	leaq	16(%rcx),%rcx
2477	jnz	.Loop_dec1_13
2478.byte	102,15,56,223,209
2479	xorps	%xmm11,%xmm2
2480	movups	%xmm2,(%rsi)
2481
# Step 2, byte swap: each trailing input byte at 16(%rdi) replaces a leading
# byte of the block just written at (rsi); the displaced byte becomes part of
# the partial output block at 16(%rsi).
2482.Lxts_dec_steal:
2483	movzbl	16(%rdi),%eax
2484	movzbl	(%rsi),%ecx
2485	leaq	1(%rdi),%rdi
2486	movb	%al,(%rsi)
2487	movb	%cl,16(%rsi)
2488	leaq	1(%rsi),%rsi
2489	subq	$1,%rdx
2490	jnz	.Lxts_dec_steal
2491
# Step 3: rewind rsi and decrypt the patched block at (rsi) in place with the
# EARLIER tweak (xmm10).
2492	subq	%r9,%rsi
2493	movq	%rbp,%rcx
2494	movl	%r10d,%eax
2495
2496	movups	(%rsi),%xmm2
2497	xorps	%xmm10,%xmm2
2498	movups	(%rcx),%xmm0
2499	movups	16(%rcx),%xmm1
2500	leaq	32(%rcx),%rcx
2501	xorps	%xmm0,%xmm2
2502.Loop_dec1_14:
2503.byte	102,15,56,222,209
2504	decl	%eax
2505	movups	(%rcx),%xmm1
2506	leaq	16(%rcx),%rcx
2507	jnz	.Loop_dec1_14
2508.byte	102,15,56,223,209
2509	xorps	%xmm10,%xmm2
2510	movups	%xmm2,(%rsi)
2511
# Exit: zero xmm0..xmm15 and the 112 bytes of stack scratch, then restore rbp
# and rsp from the entry stack pointer kept in r11.
2512.Lxts_dec_ret:
2513	xorps	%xmm0,%xmm0
2514	pxor	%xmm1,%xmm1
2515	pxor	%xmm2,%xmm2
2516	pxor	%xmm3,%xmm3
2517	pxor	%xmm4,%xmm4
2518	pxor	%xmm5,%xmm5
2519	pxor	%xmm6,%xmm6
2520	pxor	%xmm7,%xmm7
2521	movaps	%xmm0,0(%rsp)
2522	pxor	%xmm8,%xmm8
2523	movaps	%xmm0,16(%rsp)
2524	pxor	%xmm9,%xmm9
2525	movaps	%xmm0,32(%rsp)
2526	pxor	%xmm10,%xmm10
2527	movaps	%xmm0,48(%rsp)
2528	pxor	%xmm11,%xmm11
2529	movaps	%xmm0,64(%rsp)
2530	pxor	%xmm12,%xmm12
2531	movaps	%xmm0,80(%rsp)
2532	pxor	%xmm13,%xmm13
2533	movaps	%xmm0,96(%rsp)
2534	pxor	%xmm14,%xmm14
2535	pxor	%xmm15,%xmm15
2536	movq	-8(%r11),%rbp
2537	leaq	(%r11),%rsp
# The bytes 0xf3,0xc3 encode rep ret.
2538.Lxts_dec_epilogue:
2539	.byte	0xf3,0xc3
2540.size	aesni_xts_decrypt,.-aesni_xts_decrypt
# aesni_ocb_encrypt -- OCB-mode bulk encryption of whole blocks with AES-NI.
#
# Registers on entry, as consumed by the code below:
#   rdi  input pointer             rsi  output pointer
#   rdx  number of 16-byte blocks  rcx  key schedule (rounds at +240)
#   r8   block counter; its low bit and bit-scan results select table entries
#   r9   pointer to a 16-byte running value (read here, written back at exit)
#   8(%rsp)   pointer to a table of 16-byte entries, kept in rbx
#   16(%rsp)  pointer to a 16-byte accumulator, kept in rbp (read and written)
# NOTE(review): in OCB terms these look like the offset, the L_ table and the
# checksum; the names are inferred from usage -- confirm against the caller.
# NOTE(review): rdx is not checked for zero before the first block is
# processed; callers presumably always pass at least one block -- confirm.
# rbx, rbp, r12, r13 and r14 are saved and restored, and all xmm registers
# are zeroed before returning.
2541.globl	aesni_ocb_encrypt
2542.hidden aesni_ocb_encrypt
2543.type	aesni_ocb_encrypt,@function
2544.align	32
2545aesni_ocb_encrypt:
# Prologue: save the registers used below and fetch the two stack arguments
# relative to the entry rsp (held in rax).
2546	leaq	(%rsp),%rax
2547	pushq	%rbx
2548	pushq	%rbp
2549	pushq	%r12
2550	pushq	%r13
2551	pushq	%r14
2552	movq	8(%rax),%rbx
2553	movq	8+8(%rax),%rbp
2554
# Key setup: r11 = key pointer, r10 = rounds * 16, xmm9 = round key 0,
# xmm1 = key at offset 16 + 16*rounds (the final round key).
2555	movl	240(%rcx),%r10d
2556	movq	%rcx,%r11
2557	shll	$4,%r10d
2558	movups	(%rcx),%xmm9
2559	movups	16(%rcx,%r10,1),%xmm1
2560
# Fold the final round key in: xmm9 = round key 0 XOR final round key,
# xmm15 = running offset XOR final round key.
2561	movdqu	(%r9),%xmm15
2562	pxor	%xmm1,%xmm9
2563	pxor	%xmm1,%xmm15
2564
# Round-loop state shared with the helpers: rcx = key + 32 + 16*rounds,
# rax = r10 = 48 - 16*rounds, xmm1 = round key 1.
2565	movl	$16+32,%eax
2566	leaq	32(%r11,%r10,1),%rcx
2567	movups	16(%r11),%xmm1
2568	subq	%r10,%rax
2569	movq	%rax,%r10
2570
# Load the first table entry into xmm10 and the accumulator into xmm8.
2571	movdqu	(%rbx),%xmm10
2572	movdqu	(%rbp),%xmm8
2573
# Odd counter: go straight to the 6-block setup. Even counter: handle one
# block first with the table entry indexed by bsf(r8), then bump r8.
2574	testq	$1,%r8
2575	jnz	.Locb_enc_odd
2576
2577	bsfq	%r8,%r12
2578	addq	$1,%r8
2579	shlq	$4,%r12
2580	movdqu	(%rbx,%r12,1),%xmm7
2581	movdqu	(%rdi),%xmm2
2582	leaq	16(%rdi),%rdi
2583
2584	call	__ocb_encrypt1
2585
# The running offset (xmm15) is taken from xmm7 as left by the helper.
2586	movdqa	%xmm7,%xmm15
2587	movups	%xmm2,(%rsi)
2588	leaq	16(%rsi),%rsi
2589	subq	$1,%rdx
2590	jz	.Locb_enc_done
2591
# Precompute table byte offsets for the next group of blocks:
# r12/r13/r14 = 16 * bsf(r8+1), bsf(r8+3), bsf(r8+5); then r8 += 6.
2592.Locb_enc_odd:
2593	leaq	1(%r8),%r12
2594	leaq	3(%r8),%r13
2595	leaq	5(%r8),%r14
2596	leaq	6(%r8),%r8
2597	bsfq	%r12,%r12
2598	bsfq	%r13,%r13
2599	bsfq	%r14,%r14
2600	shlq	$4,%r12
2601	shlq	$4,%r13
2602	shlq	$4,%r14
2603
# Fewer than six blocks left: use the short path.
2604	subq	$6,%rdx
2605	jc	.Locb_enc_short
2606	jmp	.Locb_enc_grandloop
2607
2608.align	32
# Main loop: six blocks in, __ocb_encrypt6, six blocks out.
2609.Locb_enc_grandloop:
2610	movdqu	0(%rdi),%xmm2
2611	movdqu	16(%rdi),%xmm3
2612	movdqu	32(%rdi),%xmm4
2613	movdqu	48(%rdi),%xmm5
2614	movdqu	64(%rdi),%xmm6
2615	movdqu	80(%rdi),%xmm7
2616	leaq	96(%rdi),%rdi
2617
2618	call	__ocb_encrypt6
2619
2620	movups	%xmm2,0(%rsi)
2621	movups	%xmm3,16(%rsi)
2622	movups	%xmm4,32(%rsi)
2623	movups	%xmm5,48(%rsi)
2624	movups	%xmm6,64(%rsi)
2625	movups	%xmm7,80(%rsi)
2626	leaq	96(%rsi),%rsi
2627	subq	$6,%rdx
2628	jnc	.Locb_enc_grandloop
2629
# Tail: rdx += 6 restores the remaining block count (0..5). Blocks are loaded
# one at a time while the count is being dispatched on.
2630.Locb_enc_short:
2631	addq	$6,%rdx
2632	jz	.Locb_enc_done
2633
2634	movdqu	0(%rdi),%xmm2
2635	cmpq	$2,%rdx
2636	jb	.Locb_enc_one
2637	movdqu	16(%rdi),%xmm3
2638	je	.Locb_enc_two
2639
2640	movdqu	32(%rdi),%xmm4
2641	cmpq	$4,%rdx
2642	jb	.Locb_enc_three
2643	movdqu	48(%rdi),%xmm5
2644	je	.Locb_enc_four
2645
# Five blocks: use the 6-way helper with a zeroed sixth lane (a zero block
# does not change the XOR accumulator in xmm8). The running offset becomes
# the one used for the fifth block (xmm14); the sixth output is discarded.
2646	movdqu	64(%rdi),%xmm6
2647	pxor	%xmm7,%xmm7
2648
2649	call	__ocb_encrypt6
2650
2651	movdqa	%xmm14,%xmm15
2652	movups	%xmm2,0(%rsi)
2653	movups	%xmm3,16(%rsi)
2654	movups	%xmm4,32(%rsi)
2655	movups	%xmm5,48(%rsi)
2656	movups	%xmm6,64(%rsi)
2657
2658	jmp	.Locb_enc_done
2659
2660.align	16
# One block: xmm7 is seeded with the first table entry (xmm10) for
# __ocb_encrypt1; afterwards the running offset is taken from xmm7.
2661.Locb_enc_one:
2662	movdqa	%xmm10,%xmm7
2663
2664	call	__ocb_encrypt1
2665
2666	movdqa	%xmm7,%xmm15
2667	movups	%xmm2,0(%rsi)
2668	jmp	.Locb_enc_done
2669
2670.align	16
# Two blocks: 4-way helper with lanes three and four zeroed; the running
# offset becomes the one for the second block (xmm11).
2671.Locb_enc_two:
2672	pxor	%xmm4,%xmm4
2673	pxor	%xmm5,%xmm5
2674
2675	call	__ocb_encrypt4
2676
2677	movdqa	%xmm11,%xmm15
2678	movups	%xmm2,0(%rsi)
2679	movups	%xmm3,16(%rsi)
2680
2681	jmp	.Locb_enc_done
2682
2683.align	16
# Three blocks: 4-way helper with lane four zeroed; the running offset
# becomes the one for the third block (xmm12).
2684.Locb_enc_three:
2685	pxor	%xmm5,%xmm5
2686
2687	call	__ocb_encrypt4
2688
2689	movdqa	%xmm12,%xmm15
2690	movups	%xmm2,0(%rsi)
2691	movups	%xmm3,16(%rsi)
2692	movups	%xmm4,32(%rsi)
2693
2694	jmp	.Locb_enc_done
2695
2696.align	16
# Four blocks: the running offset becomes the one for the fourth (xmm13).
2697.Locb_enc_four:
2698	call	__ocb_encrypt4
2699
2700	movdqa	%xmm13,%xmm15
2701	movups	%xmm2,0(%rsi)
2702	movups	%xmm3,16(%rsi)
2703	movups	%xmm4,32(%rsi)
2704	movups	%xmm5,48(%rsi)
2705
# Finish: XOR xmm0 out of the running offset and write the accumulator and
# offset back through rbp and r9.
# NOTE(review): xmm0 is expected to hold the final round key as left by the
# last helper call, undoing the XOR applied at entry. __ocb_encrypt6 does
# leave it there; confirm for __ocb_encrypt1 and __ocb_encrypt4.
2706.Locb_enc_done:
2707	pxor	%xmm0,%xmm15
2708	movdqu	%xmm8,(%rbp)
2709	movdqu	%xmm15,(%r9)
2710
# Wipe every xmm register, then restore the saved registers and rsp.
2711	xorps	%xmm0,%xmm0
2712	pxor	%xmm1,%xmm1
2713	pxor	%xmm2,%xmm2
2714	pxor	%xmm3,%xmm3
2715	pxor	%xmm4,%xmm4
2716	pxor	%xmm5,%xmm5
2717	pxor	%xmm6,%xmm6
2718	pxor	%xmm7,%xmm7
2719	pxor	%xmm8,%xmm8
2720	pxor	%xmm9,%xmm9
2721	pxor	%xmm10,%xmm10
2722	pxor	%xmm11,%xmm11
2723	pxor	%xmm12,%xmm12
2724	pxor	%xmm13,%xmm13
2725	pxor	%xmm14,%xmm14
2726	pxor	%xmm15,%xmm15
2727	leaq	40(%rsp),%rax
2728	movq	-40(%rax),%r14
2729	movq	-32(%rax),%r13
2730	movq	-24(%rax),%r12
2731	movq	-16(%rax),%rbp
2732	movq	-8(%rax),%rbx
2733	leaq	(%rax),%rsp
# The bytes 0xf3,0xc3 encode rep ret.
2734.Locb_enc_epilogue:
2735	.byte	0xf3,0xc3
2736.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
2737
# __ocb_encrypt6 -- local helper: OCB-encrypt the six blocks in xmm2..xmm7.
#
# AES instructions are raw bytes: 102,15,56,220,M is aesenc; the six-byte
# forms 102,65,15,56,221,M are aesenclast with xmm10..xmm15 as the source.
#
# State read on entry (as set up by aesni_ocb_encrypt):
#   xmm2..xmm7     six input blocks, replaced by the outputs
#   xmm8           XOR accumulator; every input block is XORed into it
#   xmm9           round key 0 XOR final round key
#   xmm10          first entry of the table at (rbx)
#   xmm15          running offset XOR final round key
#   xmm1           round key 1; r11 = key schedule start
#   rcx, rax, r10  round-key base, running index and index reset value
#   r12, r13, r14  byte offsets into the table at rbx for this group
#   r8             block counter
# State on return: xmm11..xmm15 hold the offsets used for blocks two to six,
# each XORed with the final round key; xmm10 is reloaded with the first table
# entry; xmm1 is round key 1 again; xmm0 holds the final round key; rax is
# reset from r10; r8 has advanced by 6 and r12..r14 index the next group.
2738.type	__ocb_encrypt6,@function
2739.align	32
2740__ocb_encrypt6:
# Turn xmm15 into (running offset XOR round key 0), then chain the six block
# offsets: each one is the previous XOR a table entry (the first entry for
# blocks 1, 3 and 5, the entries at r12, r13 and r14 for blocks 2, 4 and 6).
# Each input is XORed into xmm8 before being whitened with its offset.
2741	pxor	%xmm9,%xmm15
2742	movdqu	(%rbx,%r12,1),%xmm11
2743	movdqa	%xmm10,%xmm12
2744	movdqu	(%rbx,%r13,1),%xmm13
2745	movdqa	%xmm10,%xmm14
2746	pxor	%xmm15,%xmm10
2747	movdqu	(%rbx,%r14,1),%xmm15
2748	pxor	%xmm10,%xmm11
2749	pxor	%xmm2,%xmm8
2750	pxor	%xmm10,%xmm2
2751	pxor	%xmm11,%xmm12
2752	pxor	%xmm3,%xmm8
2753	pxor	%xmm11,%xmm3
2754	pxor	%xmm12,%xmm13
2755	pxor	%xmm4,%xmm8
2756	pxor	%xmm12,%xmm4
2757	pxor	%xmm13,%xmm14
2758	pxor	%xmm5,%xmm8
2759	pxor	%xmm13,%xmm5
2760	pxor	%xmm14,%xmm15
2761	pxor	%xmm6,%xmm8
2762	pxor	%xmm14,%xmm6
2763	pxor	%xmm7,%xmm8
2764	pxor	%xmm15,%xmm7
2765	movups	32(%r11),%xmm0
2766
# Precompute bit-scan indices for the NEXT group and advance the counter.
# XORing xmm9 into an offset register converts (offset XOR round key 0) into
# (offset XOR final round key), ready to act as the aesenclast key.
2767	leaq	1(%r8),%r12
2768	leaq	3(%r8),%r13
2769	leaq	5(%r8),%r14
2770	addq	$6,%r8
2771	pxor	%xmm9,%xmm10
2772	bsfq	%r12,%r12
2773	bsfq	%r13,%r13
2774	bsfq	%r14,%r14
2775
# Round with round key 1 (xmm1), interleaved with the remaining conversions.
2776.byte	102,15,56,220,209
2777.byte	102,15,56,220,217
2778.byte	102,15,56,220,225
2779.byte	102,15,56,220,233
2780	pxor	%xmm9,%xmm11
2781	pxor	%xmm9,%xmm12
2782.byte	102,15,56,220,241
2783	pxor	%xmm9,%xmm13
2784	pxor	%xmm9,%xmm14
2785.byte	102,15,56,220,249
2786	movups	48(%r11),%xmm1
2787	pxor	%xmm9,%xmm15
2788
# Round with the key loaded from 32(%r11); r12 and r13 are scaled to byte
# offsets here (r14 is scaled after the loop).
2789.byte	102,15,56,220,208
2790.byte	102,15,56,220,216
2791.byte	102,15,56,220,224
2792.byte	102,15,56,220,232
2793.byte	102,15,56,220,240
2794.byte	102,15,56,220,248
2795	movups	64(%r11),%xmm0
2796	shlq	$4,%r12
2797	shlq	$4,%r13
2798	jmp	.Locb_enc_loop6
2799
2800.align	32
# Middle rounds: two rounds per pass over all six blocks, alternating the
# round key between xmm1 and xmm0; rax counts up by 32 until it hits zero.
2801.Locb_enc_loop6:
2802.byte	102,15,56,220,209
2803.byte	102,15,56,220,217
2804.byte	102,15,56,220,225
2805.byte	102,15,56,220,233
2806.byte	102,15,56,220,241
2807.byte	102,15,56,220,249
2808	movups	(%rcx,%rax,1),%xmm1
2809	addq	$32,%rax
2810
2811.byte	102,15,56,220,208
2812.byte	102,15,56,220,216
2813.byte	102,15,56,220,224
2814.byte	102,15,56,220,232
2815.byte	102,15,56,220,240
2816.byte	102,15,56,220,248
2817	movups	-16(%rcx,%rax,1),%xmm0
2818	jnz	.Locb_enc_loop6
2819
# Last aesenc round, then reload round key 1 for the next call.
2820.byte	102,15,56,220,209
2821.byte	102,15,56,220,217
2822.byte	102,15,56,220,225
2823.byte	102,15,56,220,233
2824.byte	102,15,56,220,241
2825.byte	102,15,56,220,249
2826	movups	16(%r11),%xmm1
2827	shlq	$4,%r14
2828
# Final round: aesenclast with xmm10..xmm15 applies the final round key and
# the output offset in one step. xmm10 is reloaded with the first table entry
# as soon as it has been consumed; rax is reset from r10.
2829.byte	102,65,15,56,221,210
2830	movdqu	(%rbx),%xmm10
2831	movq	%r10,%rax
2832.byte	102,65,15,56,221,219
2833.byte	102,65,15,56,221,228
2834.byte	102,65,15,56,221,237
2835.byte	102,65,15,56,221,246
2836.byte	102,65,15,56,221,255
# The bytes 0xf3,0xc3 encode rep ret.
2837	.byte	0xf3,0xc3
2838.size	__ocb_encrypt6,.-__ocb_encrypt6
2839
/* __ocb_encrypt4: four-block variant of the OCB encryption helper.        */
/* Processes %xmm2..%xmm5 with chained offsets built in %xmm10..%xmm13.    */
/*                                                                         */
/* Register use, as read from the code below:                              */
/*   %xmm2-%xmm5   in: four input blocks; out: four results                */
/*   %xmm8         running XOR accumulator; every input block is folded in */
/*   %xmm9         mask XORed into the offsets before and after use        */
/*   %xmm10        in: first table entry (used as the base of the chain)   */
/*   %xmm15        in: offset value carried in from the caller             */
/*   %xmm10-%xmm13 out: the four per-block offsets, still masked by %xmm9  */
/*   %rbx,%r12,%r13  table base and two byte offsets into it               */
/*   %r11          key schedule base; %rcx + %rax indexes the later keys   */
/*   %r10          copied back into %rax before return                     */
/* Unlike the six-block helper, %r8 and %r12/%r13 are not advanced here    */
/* and %xmm15 is not rewritten after the initial XOR with %xmm9.           */
/*                                                                         */
/* Encodings: 102,15,56,220,M = aesenc; 102,65,15,56,221,M = aesenclast.   */
2840.type	__ocb_encrypt4,@function
2841.align	32
2842__ocb_encrypt4:
/* Chain the offsets, accumulate the inputs into %xmm8, mask the blocks.   */
2843	pxor	%xmm9,%xmm15
2844	movdqu	(%rbx,%r12,1),%xmm11
2845	movdqa	%xmm10,%xmm12
2846	movdqu	(%rbx,%r13,1),%xmm13
2847	pxor	%xmm15,%xmm10
2848	pxor	%xmm10,%xmm11
2849	pxor	%xmm2,%xmm8
2850	pxor	%xmm10,%xmm2
2851	pxor	%xmm11,%xmm12
2852	pxor	%xmm3,%xmm8
2853	pxor	%xmm11,%xmm3
2854	pxor	%xmm12,%xmm13
2855	pxor	%xmm4,%xmm8
2856	pxor	%xmm12,%xmm4
2857	pxor	%xmm5,%xmm8
2858	pxor	%xmm13,%xmm5
2859	movups	32(%r11),%xmm0
2860
/* Re-mask the offsets with %xmm9 for use as aesenclast operands.          */
2861	pxor	%xmm9,%xmm10
2862	pxor	%xmm9,%xmm11
2863	pxor	%xmm9,%xmm12
2864	pxor	%xmm9,%xmm13
2865
/* Two explicit rounds (keys already in %xmm1 and at 32(%r11)).            */
2866.byte	102,15,56,220,209
2867.byte	102,15,56,220,217
2868.byte	102,15,56,220,225
2869.byte	102,15,56,220,233
2870	movups	48(%r11),%xmm1
2871
2872.byte	102,15,56,220,208
2873.byte	102,15,56,220,216
2874.byte	102,15,56,220,224
2875.byte	102,15,56,220,232
2876	movups	64(%r11),%xmm0
2877	jmp	.Locb_enc_loop4
2878
/* Two rounds per iteration until %rax, stepped by 32, reaches zero.       */
2879.align	32
2880.Locb_enc_loop4:
2881.byte	102,15,56,220,209
2882.byte	102,15,56,220,217
2883.byte	102,15,56,220,225
2884.byte	102,15,56,220,233
2885	movups	(%rcx,%rax,1),%xmm1
2886	addq	$32,%rax
2887
2888.byte	102,15,56,220,208
2889.byte	102,15,56,220,216
2890.byte	102,15,56,220,224
2891.byte	102,15,56,220,232
2892	movups	-16(%rcx,%rax,1),%xmm0
2893	jnz	.Locb_enc_loop4
2894
/* Last aesenc, restore %xmm1 = 16(%r11) and %rax = %r10 for the next call. */
2895.byte	102,15,56,220,209
2896.byte	102,15,56,220,217
2897.byte	102,15,56,220,225
2898.byte	102,15,56,220,233
2899	movups	16(%r11),%xmm1
2900	movq	%r10,%rax
2901
/* aesenclast with the masked offsets %xmm10..%xmm13, then "rep ret".      */
2902.byte	102,65,15,56,221,210
2903.byte	102,65,15,56,221,219
2904.byte	102,65,15,56,221,228
2905.byte	102,65,15,56,221,237
2906	.byte	0xf3,0xc3
2907.size	__ocb_encrypt4,.-__ocb_encrypt4
2908
/* __ocb_encrypt1: single-block variant of the OCB encryption helper.      */
/*                                                                         */
/* Register use, as read from the code below:                              */
/*   %xmm2   in: input block; out: result                                  */
/*   %xmm7   in: table entry for this block; out: the new offset, masked   */
/*           by %xmm9 (it is the aesenclast operand)                       */
/*   %xmm8   running XOR accumulator; the input block is folded in         */
/*   %xmm9   mask XORed into the offset before and after use               */
/*   %xmm15  in: offset value carried in from the caller (not modified)    */
/*   %xmm1   must already hold 16(%r11) on entry; reloaded before return   */
/*   %r11    key schedule base; %rcx + %rax indexes the later keys         */
/*   %r10    copied back into %rax before return                           */
/*                                                                         */
/* Encodings: 102,15,56,220,M = aesenc; 102,15,56,221,215 = aesenclast     */
/* %xmm7,%xmm2; 0xf3,0xc3 = "rep ret".                                     */
2909.type	__ocb_encrypt1,@function
2910.align	32
2911__ocb_encrypt1:
/* offset = entry ^ carried offset ^ mask; accumulate input; mask block.   */
2912	pxor	%xmm15,%xmm7
2913	pxor	%xmm9,%xmm7
2914	pxor	%xmm2,%xmm8
2915	pxor	%xmm7,%xmm2
2916	movups	32(%r11),%xmm0
2917
2918.byte	102,15,56,220,209
2919	movups	48(%r11),%xmm1
/* Second XOR with %xmm9 puts the offset in the form aesenclast consumes.  */
2920	pxor	%xmm9,%xmm7
2921
2922.byte	102,15,56,220,208
2923	movups	64(%r11),%xmm0
2924	jmp	.Locb_enc_loop1
2925
/* Two rounds per iteration until %rax, stepped by 32, reaches zero.       */
2926.align	32
2927.Locb_enc_loop1:
2928.byte	102,15,56,220,209
2929	movups	(%rcx,%rax,1),%xmm1
2930	addq	$32,%rax
2931
2932.byte	102,15,56,220,208
2933	movups	-16(%rcx,%rax,1),%xmm0
2934	jnz	.Locb_enc_loop1
2935
/* Last aesenc, restore %xmm1 and %rax, then the final round with %xmm7.   */
2936.byte	102,15,56,220,209
2937	movups	16(%r11),%xmm1
2938	movq	%r10,%rax
2939
2940.byte	102,15,56,221,215
2941	.byte	0xf3,0xc3
2942.size	__ocb_encrypt1,.-__ocb_encrypt1
2943
/* aesni_ocb_decrypt: global (hidden-visibility) entry point that decrypts */
/* whole 16-byte blocks using the __ocb_decrypt1/4/6 helpers.              */
/*                                                                         */
/* Arguments, as consumed by the code below:                               */
/*   %rdi      input pointer, read 16 bytes per block                      */
/*   %rsi      output pointer, written 16 bytes per block                  */
/*   %rdx      number of 16-byte blocks                                    */
/*   %rcx      key schedule; the round count is read from offset 240       */
/*   %r8       block counter; bsf of it (and of +1/+3/+5) indexes a table  */
/*   %r9       pointer to a 16-byte offset value, read here, written back  */
/*   8(%rsp)   at entry: loaded into %rbx, base of a table of 16-byte      */
/*             entries                                                     */
/*   16(%rsp)  at entry: loaded into %rbp, pointer to a 16-byte            */
/*             accumulator that is read here and written back              */
/* NOTE(review): presumably the table is the OCB L-table and the           */
/* accumulator is the OCB checksum -- confirm against the C prototype.     */
/*                                                                         */
/* %rbx, %rbp, %r12, %r13, %r14 are pushed on entry and restored on exit.  */
/* All sixteen %xmm registers are zeroed before returning.                 */
2944.globl	aesni_ocb_decrypt
2945.hidden aesni_ocb_decrypt
2946.type	aesni_ocb_decrypt,@function
2947.align	32
2948aesni_ocb_decrypt:
/* %rax keeps the entry %rsp so the stack arguments can be reached after   */
/* the pushes.                                                             */
2949	leaq	(%rsp),%rax
2950	pushq	%rbx
2951	pushq	%rbp
2952	pushq	%r12
2953	pushq	%r13
2954	pushq	%r14
2955	movq	8(%rax),%rbx
2956	movq	8+8(%rax),%rbp
2957
/* %r10 = rounds * 16. %xmm9 = key[0] ^ key at 16(%rcx,%r10); the incoming */
/* offset in %xmm15 is XORed with that same last-loaded key.               */
2958	movl	240(%rcx),%r10d
2959	movq	%rcx,%r11
2960	shll	$4,%r10d
2961	movups	(%rcx),%xmm9
2962	movups	16(%rcx,%r10,1),%xmm1
2963
2964	movdqu	(%r9),%xmm15
2965	pxor	%xmm1,%xmm9
2966	pxor	%xmm1,%xmm15
2967
/* Helper loop set-up: %rcx = key + 32 + rounds*16, %rax = %r10 =          */
/* 48 - rounds*16 (the negative index the helpers count up to zero), and   */
/* %xmm1 = key at 16(%r11).                                                */
2968	movl	$16+32,%eax
2969	leaq	32(%r11,%r10,1),%rcx
2970	movups	16(%r11),%xmm1
2971	subq	%r10,%rax
2972	movq	%rax,%r10
2973
/* %xmm10 = first table entry, %xmm8 = incoming accumulator.               */
2974	movdqu	(%rbx),%xmm10
2975	movdqu	(%rbp),%xmm8
2976
/* If the counter is even, handle one block on its own first so that the   */
/* bulk code below always starts from an odd counter.                      */
2977	testq	$1,%r8
2978	jnz	.Locb_dec_odd
2979
2980	bsfq	%r8,%r12
2981	addq	$1,%r8
2982	shlq	$4,%r12
2983	movdqu	(%rbx,%r12,1),%xmm7
2984	movdqu	(%rdi),%xmm2
2985	leaq	16(%rdi),%rdi
2986
2987	call	__ocb_decrypt1
2988
/* %xmm7 holds the updated offset; the output block is folded into %xmm8.  */
2989	movdqa	%xmm7,%xmm15
2990	movups	%xmm2,(%rsi)
2991	xorps	%xmm2,%xmm8
2992	leaq	16(%rsi),%rsi
2993	subq	$1,%rdx
2994	jz	.Locb_dec_done
2995
/* Table byte offsets for counter+1, +3, +5 (bsf, then * 16); counter += 6. */
2996.Locb_dec_odd:
2997	leaq	1(%r8),%r12
2998	leaq	3(%r8),%r13
2999	leaq	5(%r8),%r14
3000	leaq	6(%r8),%r8
3001	bsfq	%r12,%r12
3002	bsfq	%r13,%r13
3003	bsfq	%r14,%r14
3004	shlq	$4,%r12
3005	shlq	$4,%r13
3006	shlq	$4,%r14
3007
/* Fewer than six blocks left -> short path (carry set by the subtract).   */
3008	subq	$6,%rdx
3009	jc	.Locb_dec_short
3010	jmp	.Locb_dec_grandloop
3011
/* Main loop: six blocks per iteration.                                    */
3012.align	32
3013.Locb_dec_grandloop:
3014	movdqu	0(%rdi),%xmm2
3015	movdqu	16(%rdi),%xmm3
3016	movdqu	32(%rdi),%xmm4
3017	movdqu	48(%rdi),%xmm5
3018	movdqu	64(%rdi),%xmm6
3019	movdqu	80(%rdi),%xmm7
3020	leaq	96(%rdi),%rdi
3021
3022	call	__ocb_decrypt6
3023
/* Store each output block and fold it into the accumulator %xmm8.         */
3024	movups	%xmm2,0(%rsi)
3025	pxor	%xmm2,%xmm8
3026	movups	%xmm3,16(%rsi)
3027	pxor	%xmm3,%xmm8
3028	movups	%xmm4,32(%rsi)
3029	pxor	%xmm4,%xmm8
3030	movups	%xmm5,48(%rsi)
3031	pxor	%xmm5,%xmm8
3032	movups	%xmm6,64(%rsi)
3033	pxor	%xmm6,%xmm8
3034	movups	%xmm7,80(%rsi)
3035	pxor	%xmm7,%xmm8
3036	leaq	96(%rsi),%rsi
3037	subq	$6,%rdx
3038	jnc	.Locb_dec_grandloop
3039
/* 0..5 blocks remain after undoing the last subtract. Dispatch on the     */
/* count; the flags from each cmpq survive the intervening movdqu loads.   */
3040.Locb_dec_short:
3041	addq	$6,%rdx
3042	jz	.Locb_dec_done
3043
3044	movdqu	0(%rdi),%xmm2
3045	cmpq	$2,%rdx
3046	jb	.Locb_dec_one
3047	movdqu	16(%rdi),%xmm3
3048	je	.Locb_dec_two
3049
3050	movdqu	32(%rdi),%xmm4
3051	cmpq	$4,%rdx
3052	jb	.Locb_dec_three
3053	movdqu	48(%rdi),%xmm5
3054	je	.Locb_dec_four
3055
/* Five blocks: run the six-block helper with a zero sixth block and       */
/* discard its result. %xmm14 is the offset of the fifth block.            */
3056	movdqu	64(%rdi),%xmm6
3057	pxor	%xmm7,%xmm7
3058
3059	call	__ocb_decrypt6
3060
3061	movdqa	%xmm14,%xmm15
3062	movups	%xmm2,0(%rsi)
3063	pxor	%xmm2,%xmm8
3064	movups	%xmm3,16(%rsi)
3065	pxor	%xmm3,%xmm8
3066	movups	%xmm4,32(%rsi)
3067	pxor	%xmm4,%xmm8
3068	movups	%xmm5,48(%rsi)
3069	pxor	%xmm5,%xmm8
3070	movups	%xmm6,64(%rsi)
3071	pxor	%xmm6,%xmm8
3072
3073	jmp	.Locb_dec_done
3074
/* One block: the table entry is the first one (%xmm10 = (%rbx)).          */
3075.align	16
3076.Locb_dec_one:
3077	movdqa	%xmm10,%xmm7
3078
3079	call	__ocb_decrypt1
3080
3081	movdqa	%xmm7,%xmm15
3082	movups	%xmm2,0(%rsi)
3083	xorps	%xmm2,%xmm8
3084	jmp	.Locb_dec_done
3085
/* Two blocks: four-block helper with blocks three and four zeroed;        */
/* %xmm11 is the offset of the second block.                               */
3086.align	16
3087.Locb_dec_two:
3088	pxor	%xmm4,%xmm4
3089	pxor	%xmm5,%xmm5
3090
3091	call	__ocb_decrypt4
3092
3093	movdqa	%xmm11,%xmm15
3094	movups	%xmm2,0(%rsi)
3095	xorps	%xmm2,%xmm8
3096	movups	%xmm3,16(%rsi)
3097	xorps	%xmm3,%xmm8
3098
3099	jmp	.Locb_dec_done
3100
/* Three blocks: four-block helper with block four zeroed; %xmm12 is the   */
/* offset of the third block.                                              */
3101.align	16
3102.Locb_dec_three:
3103	pxor	%xmm5,%xmm5
3104
3105	call	__ocb_decrypt4
3106
3107	movdqa	%xmm12,%xmm15
3108	movups	%xmm2,0(%rsi)
3109	xorps	%xmm2,%xmm8
3110	movups	%xmm3,16(%rsi)
3111	xorps	%xmm3,%xmm8
3112	movups	%xmm4,32(%rsi)
3113	xorps	%xmm4,%xmm8
3114
3115	jmp	.Locb_dec_done
3116
/* Four blocks: %xmm13 is the offset of the fourth block.                  */
3117.align	16
3118.Locb_dec_four:
3119	call	__ocb_decrypt4
3120
3121	movdqa	%xmm13,%xmm15
3122	movups	%xmm2,0(%rsi)
3123	pxor	%xmm2,%xmm8
3124	movups	%xmm3,16(%rsi)
3125	pxor	%xmm3,%xmm8
3126	movups	%xmm4,32(%rsi)
3127	pxor	%xmm4,%xmm8
3128	movups	%xmm5,48(%rsi)
3129	pxor	%xmm5,%xmm8
3130
/* Unmask the offset with %xmm0 and write offset and accumulator back.     */
/* After any helper call %xmm0 holds the key last loaded from -16(%rcx),   */
/* i.e. the same key that masked %xmm15 at entry.                          */
/* NOTE(review): if %r8 is odd and %rdx is zero on entry, this label is    */
/* reached without any helper having loaded %xmm0 -- confirm callers never */
/* pass a zero block count.                                                */
3131.Locb_dec_done:
3132	pxor	%xmm0,%xmm15
3133	movdqu	%xmm8,(%rbp)
3134	movdqu	%xmm15,(%r9)
3135
/* Zero every %xmm register before returning.                              */
3136	xorps	%xmm0,%xmm0
3137	pxor	%xmm1,%xmm1
3138	pxor	%xmm2,%xmm2
3139	pxor	%xmm3,%xmm3
3140	pxor	%xmm4,%xmm4
3141	pxor	%xmm5,%xmm5
3142	pxor	%xmm6,%xmm6
3143	pxor	%xmm7,%xmm7
3144	pxor	%xmm8,%xmm8
3145	pxor	%xmm9,%xmm9
3146	pxor	%xmm10,%xmm10
3147	pxor	%xmm11,%xmm11
3148	pxor	%xmm12,%xmm12
3149	pxor	%xmm13,%xmm13
3150	pxor	%xmm14,%xmm14
3151	pxor	%xmm15,%xmm15
/* Restore the five saved registers and the entry stack pointer.           */
3152	leaq	40(%rsp),%rax
3153	movq	-40(%rax),%r14
3154	movq	-32(%rax),%r13
3155	movq	-24(%rax),%r12
3156	movq	-16(%rax),%rbp
3157	movq	-8(%rax),%rbx
3158	leaq	(%rax),%rsp
3159.Locb_dec_epilogue:
3160	.byte	0xf3,0xc3
3161.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
3162
/* __ocb_decrypt6: internal helper that pushes the six 16-byte blocks held */
/* in %xmm2..%xmm7 through the AES decryption rounds, masking each block   */
/* with its own chained offset value. Called from aesni_ocb_decrypt.       */
/*                                                                         */
/* Register use, as read from the code below:                              */
/*   %xmm2-%xmm7   in: six input blocks; out: six results                  */
/*   %xmm9         mask XORed into the offsets before and after use (the   */
/*                 caller sets it to key[0] ^ last round key)              */
/*   %xmm10        in: copy of the first table entry; reloaded from (%rbx) */
/*   %xmm15        in/out: offset value carried from call to call          */
/*   %xmm0,%xmm1   round keys; %xmm1 must already hold 16(%r11) on entry   */
/*   %rbx          base of a table of 16-byte entries                      */
/*   %r12-%r14     in: byte offsets into that table; out: next offsets     */
/*   %r8           counter, advanced by 6                                  */
/*   %r11          key schedule base; %rcx + %rax indexes the later keys   */
/*   %r10          saved loop index, copied back into %rax before return   */
/* %xmm8 is not touched here; the caller accumulates the outputs itself.   */
/*                                                                         */
/* Encodings: .byte 102,15,56,222,M is aesdec and 102,65,15,56,223,M is    */
/* aesdeclast (66 0F 38 DE/DF /r); 0xf3,0xc3 is "rep ret".                 */
3163.type	__ocb_decrypt6,@function
3164.align	32
3165__ocb_decrypt6:
/* Build six chained offsets in %xmm10..%xmm15 and XOR each into its block. */
3166	pxor	%xmm9,%xmm15
3167	movdqu	(%rbx,%r12,1),%xmm11
3168	movdqa	%xmm10,%xmm12
3169	movdqu	(%rbx,%r13,1),%xmm13
3170	movdqa	%xmm10,%xmm14
3171	pxor	%xmm15,%xmm10
3172	movdqu	(%rbx,%r14,1),%xmm15
3173	pxor	%xmm10,%xmm11
3174	pxor	%xmm10,%xmm2
3175	pxor	%xmm11,%xmm12
3176	pxor	%xmm11,%xmm3
3177	pxor	%xmm12,%xmm13
3178	pxor	%xmm12,%xmm4
3179	pxor	%xmm13,%xmm14
3180	pxor	%xmm13,%xmm5
3181	pxor	%xmm14,%xmm15
3182	pxor	%xmm14,%xmm6
3183	pxor	%xmm15,%xmm7
3184	movups	32(%r11),%xmm0
3185
/* Table indices for the NEXT call: bit-scan-forward of counter+1, +3, +5. */
/* The counter itself moves on by 6.                                       */
3186	leaq	1(%r8),%r12
3187	leaq	3(%r8),%r13
3188	leaq	5(%r8),%r14
3189	addq	$6,%r8
3190	pxor	%xmm9,%xmm10
3191	bsfq	%r12,%r12
3192	bsfq	%r13,%r13
3193	bsfq	%r14,%r14
3194
/* AES round with %xmm1, interleaved with re-masking the offsets by %xmm9  */
/* so they can serve as the aesdeclast operands at the end.                */
3195.byte	102,15,56,222,209
3196.byte	102,15,56,222,217
3197.byte	102,15,56,222,225
3198.byte	102,15,56,222,233
3199	pxor	%xmm9,%xmm11
3200	pxor	%xmm9,%xmm12
3201.byte	102,15,56,222,241
3202	pxor	%xmm9,%xmm13
3203	pxor	%xmm9,%xmm14
3204.byte	102,15,56,222,249
3205	movups	48(%r11),%xmm1
3206	pxor	%xmm9,%xmm15
3207
/* AES round with %xmm0 (key at 32(%r11)).                                 */
3208.byte	102,15,56,222,208
3209.byte	102,15,56,222,216
3210.byte	102,15,56,222,224
3211.byte	102,15,56,222,232
3212.byte	102,15,56,222,240
3213.byte	102,15,56,222,248
3214	movups	64(%r11),%xmm0
/* Scale the new indices to byte offsets (16 bytes per table entry).       */
3215	shlq	$4,%r12
3216	shlq	$4,%r13
3217	jmp	.Locb_dec_loop6
3218
/* Two rounds per iteration. %rax is stepped by 32 until it reaches zero;  */
/* the movups before jnz leaves the flags from addq untouched.             */
3219.align	32
3220.Locb_dec_loop6:
3221.byte	102,15,56,222,209
3222.byte	102,15,56,222,217
3223.byte	102,15,56,222,225
3224.byte	102,15,56,222,233
3225.byte	102,15,56,222,241
3226.byte	102,15,56,222,249
3227	movups	(%rcx,%rax,1),%xmm1
3228	addq	$32,%rax
3229
3230.byte	102,15,56,222,208
3231.byte	102,15,56,222,216
3232.byte	102,15,56,222,224
3233.byte	102,15,56,222,232
3234.byte	102,15,56,222,240
3235.byte	102,15,56,222,248
3236	movups	-16(%rcx,%rax,1),%xmm0
3237	jnz	.Locb_dec_loop6
3238
/* One more aesdec with %xmm1, then restore %xmm1 = 16(%r11) for the next  */
/* call and finish scaling %r14.                                           */
3239.byte	102,15,56,222,209
3240.byte	102,15,56,222,217
3241.byte	102,15,56,222,225
3242.byte	102,15,56,222,233
3243.byte	102,15,56,222,241
3244.byte	102,15,56,222,249
3245	movups	16(%r11),%xmm1
3246	shlq	$4,%r14
3247
/* Final round: aesdeclast with the masked offsets %xmm10..%xmm15 as the   */
/* key operands. %xmm10 is reloaded from (%rbx) once it has been consumed, */
/* and %rax is reset from %r10.                                            */
3248.byte	102,65,15,56,223,210
3249	movdqu	(%rbx),%xmm10
3250	movq	%r10,%rax
3251.byte	102,65,15,56,223,219
3252.byte	102,65,15,56,223,228
3253.byte	102,65,15,56,223,237
3254.byte	102,65,15,56,223,246
3255.byte	102,65,15,56,223,255
3256	.byte	0xf3,0xc3
3257.size	__ocb_decrypt6,.-__ocb_decrypt6
3258
/* __ocb_decrypt4: four-block variant of the OCB decryption helper.        */
/* Processes %xmm2..%xmm5 with chained offsets built in %xmm10..%xmm13.    */
/*                                                                         */
/* Register use, as read from the code below:                              */
/*   %xmm2-%xmm5   in: four input blocks; out: four results                */
/*   %xmm9         mask XORed into the offsets before and after use        */
/*   %xmm10        in: first table entry (used as the base of the chain)   */
/*   %xmm15        in: offset value carried in from the caller             */
/*   %xmm10-%xmm13 out: the four per-block offsets, still masked by %xmm9; */
/*                 aesni_ocb_decrypt copies one of them into %xmm15        */
/*   %rbx,%r12,%r13  table base and two byte offsets into it               */
/*   %r11          key schedule base; %rcx + %rax indexes the later keys   */
/*   %r10          copied back into %rax before return                     */
/* %r8 and %r12/%r13 are not advanced here.                                */
/*                                                                         */
/* Encodings: 102,15,56,222,M = aesdec; 102,65,15,56,223,M = aesdeclast.   */
3259.type	__ocb_decrypt4,@function
3260.align	32
3261__ocb_decrypt4:
/* Chain the offsets and XOR each one into its block.                      */
3262	pxor	%xmm9,%xmm15
3263	movdqu	(%rbx,%r12,1),%xmm11
3264	movdqa	%xmm10,%xmm12
3265	movdqu	(%rbx,%r13,1),%xmm13
3266	pxor	%xmm15,%xmm10
3267	pxor	%xmm10,%xmm11
3268	pxor	%xmm10,%xmm2
3269	pxor	%xmm11,%xmm12
3270	pxor	%xmm11,%xmm3
3271	pxor	%xmm12,%xmm13
3272	pxor	%xmm12,%xmm4
3273	pxor	%xmm13,%xmm5
3274	movups	32(%r11),%xmm0
3275
/* Re-mask the offsets with %xmm9 for use as aesdeclast operands.          */
3276	pxor	%xmm9,%xmm10
3277	pxor	%xmm9,%xmm11
3278	pxor	%xmm9,%xmm12
3279	pxor	%xmm9,%xmm13
3280
/* Two explicit rounds (keys already in %xmm1 and at 32(%r11)).            */
3281.byte	102,15,56,222,209
3282.byte	102,15,56,222,217
3283.byte	102,15,56,222,225
3284.byte	102,15,56,222,233
3285	movups	48(%r11),%xmm1
3286
3287.byte	102,15,56,222,208
3288.byte	102,15,56,222,216
3289.byte	102,15,56,222,224
3290.byte	102,15,56,222,232
3291	movups	64(%r11),%xmm0
3292	jmp	.Locb_dec_loop4
3293
/* Two rounds per iteration until %rax, stepped by 32, reaches zero.       */
3294.align	32
3295.Locb_dec_loop4:
3296.byte	102,15,56,222,209
3297.byte	102,15,56,222,217
3298.byte	102,15,56,222,225
3299.byte	102,15,56,222,233
3300	movups	(%rcx,%rax,1),%xmm1
3301	addq	$32,%rax
3302
3303.byte	102,15,56,222,208
3304.byte	102,15,56,222,216
3305.byte	102,15,56,222,224
3306.byte	102,15,56,222,232
3307	movups	-16(%rcx,%rax,1),%xmm0
3308	jnz	.Locb_dec_loop4
3309
/* Last aesdec, restore %xmm1 = 16(%r11) and %rax = %r10 for the next call. */
3310.byte	102,15,56,222,209
3311.byte	102,15,56,222,217
3312.byte	102,15,56,222,225
3313.byte	102,15,56,222,233
3314	movups	16(%r11),%xmm1
3315	movq	%r10,%rax
3316
/* aesdeclast with the masked offsets %xmm10..%xmm13, then "rep ret".      */
3317.byte	102,65,15,56,223,210
3318.byte	102,65,15,56,223,219
3319.byte	102,65,15,56,223,228
3320.byte	102,65,15,56,223,237
3321	.byte	0xf3,0xc3
3322.size	__ocb_decrypt4,.-__ocb_decrypt4
3323
/* __ocb_decrypt1: single-block variant of the OCB decryption helper.      */
/*                                                                         */
/* Register use, as read from the code below:                              */
/*   %xmm2   in: input block; out: result                                  */
/*   %xmm7   in: table entry for this block; out: the new offset, masked   */
/*           by %xmm9 (aesni_ocb_decrypt copies it into %xmm15)            */
/*   %xmm9   mask XORed into the offset before and after use               */
/*   %xmm15  in: offset value carried in from the caller (not modified)    */
/*   %xmm1   must already hold 16(%r11) on entry; reloaded before return   */
/*   %r11    key schedule base; %rcx + %rax indexes the later keys         */
/*   %r10    copied back into %rax before return                           */
/*                                                                         */
/* Encodings: 102,15,56,222,M = aesdec; 102,15,56,223,215 = aesdeclast     */
/* %xmm7,%xmm2; 0xf3,0xc3 = "rep ret".                                     */
3324.type	__ocb_decrypt1,@function
3325.align	32
3326__ocb_decrypt1:
/* offset = entry ^ carried offset ^ mask; XOR it into the block.          */
3327	pxor	%xmm15,%xmm7
3328	pxor	%xmm9,%xmm7
3329	pxor	%xmm7,%xmm2
3330	movups	32(%r11),%xmm0
3331
3332.byte	102,15,56,222,209
3333	movups	48(%r11),%xmm1
/* Second XOR with %xmm9 puts the offset in the form aesdeclast consumes.  */
3334	pxor	%xmm9,%xmm7
3335
3336.byte	102,15,56,222,208
3337	movups	64(%r11),%xmm0
3338	jmp	.Locb_dec_loop1
3339
/* Two rounds per iteration until %rax, stepped by 32, reaches zero.       */
3340.align	32
3341.Locb_dec_loop1:
3342.byte	102,15,56,222,209
3343	movups	(%rcx,%rax,1),%xmm1
3344	addq	$32,%rax
3345
3346.byte	102,15,56,222,208
3347	movups	-16(%rcx,%rax,1),%xmm0
3348	jnz	.Locb_dec_loop1
3349
/* Last aesdec, restore %xmm1 and %rax, then the final round with %xmm7.   */
3350.byte	102,15,56,222,209
3351	movups	16(%r11),%xmm1
3352	movq	%r10,%rax
3353
3354.byte	102,15,56,223,215
3355	.byte	0xf3,0xc3
3356.size	__ocb_decrypt1,.-__ocb_decrypt1
/* aesni_cbc_encrypt: global (hidden-visibility) CBC entry point covering  */
/* both directions.                                                        */
/*                                                                         */
/* Arguments, as consumed by the code below:                               */
/*   %rdi   input pointer                                                  */
/*   %rsi   output pointer                                                 */
/*   %rdx   length in bytes; zero returns immediately                      */
/*   %rcx   key schedule; the round count is read from offset 240          */
/*   %r8    pointer to the 16-byte chaining value (IV): read at entry and  */
/*          rewritten with the value to chain from on the next call        */
/*   %r9d   direction: non-zero selects encryption, zero decryption        */
/*                                                                         */
/* Encodings used in the .byte/.long sequences:                            */
/*   102,15,56,220/221,M        aesenc / aesenclast                        */
/*   102,[68|65|69,]15,56,222,M aesdec  (optional REX for %xmm8-%xmm15)    */
/*   102,[68|65|69,]15,56,223,M aesdeclast                                 */
/*   .long 0x9066A4F3           bytes F3 A4 66 90: rep movsb + 2-byte nop  */
/*   .long 0x9066AAF3           bytes F3 AA 66 90: rep stosb + 2-byte nop  */
/*   .byte 0xf3,0xc3            rep ret                                    */
/*                                                                         */
/* The bulk decrypt path pushes %rbp and carves a 16-byte aligned scratch  */
/* slot from the stack; %r11 keeps the entry %rsp for the epilogue.        */
3357.globl	aesni_cbc_encrypt
3358.hidden aesni_cbc_encrypt
3359.type	aesni_cbc_encrypt,@function
3360.align	16
3361aesni_cbc_encrypt:
3362	testq	%rdx,%rdx
3363	jz	.Lcbc_ret
3364
/* %r10d = round count, %r11 = key schedule (both kept for reloading).     */
3365	movl	240(%rcx),%r10d
3366	movq	%rcx,%r11
3367	testl	%r9d,%r9d
3368	jz	.Lcbc_decrypt
3369
/* ---- Encryption: one block at a time, %xmm2 carries the chain value. -- */
3370	movups	(%r8),%xmm2
3371	movl	%r10d,%eax
3372	cmpq	$16,%rdx
3373	jb	.Lcbc_enc_tail
3374	subq	$16,%rdx
3375	jmp	.Lcbc_enc_loop
3376.align	16
3377.Lcbc_enc_loop:
3378	movups	(%rdi),%xmm3
3379	leaq	16(%rdi),%rdi
3380
/* %xmm2 = previous output ^ input ^ key[0], then the round loop.          */
3381	movups	(%rcx),%xmm0
3382	movups	16(%rcx),%xmm1
3383	xorps	%xmm0,%xmm3
3384	leaq	32(%rcx),%rcx
3385	xorps	%xmm3,%xmm2
3386.Loop_enc1_15:
3387.byte	102,15,56,220,209
3388	decl	%eax
3389	movups	(%rcx),%xmm1
3390	leaq	16(%rcx),%rcx
3391	jnz	.Loop_enc1_15
3392.byte	102,15,56,221,209
/* Reload round count and key pointer, emit the block, count down.         */
3393	movl	%r10d,%eax
3394	movq	%r11,%rcx
3395	movups	%xmm2,0(%rsi)
3396	leaq	16(%rsi),%rsi
3397	subq	$16,%rdx
3398	jnc	.Lcbc_enc_loop
3399	addq	$16,%rdx
3400	jnz	.Lcbc_enc_tail
/* Done: store the last output block as the new chaining value and scrub   */
/* the registers that held key or data material.                           */
3401	pxor	%xmm0,%xmm0
3402	pxor	%xmm1,%xmm1
3403	movups	%xmm2,(%r8)
3404	pxor	%xmm2,%xmm2
3405	pxor	%xmm3,%xmm3
3406	jmp	.Lcbc_ret
3407
/* Partial final block (1..15 bytes): copy the %rdx remaining input bytes  */
/* to the output buffer, zero-fill up to 16 bytes, then loop once more     */
/* with the padded output block as the input (in-place encryption).        */
3408.Lcbc_enc_tail:
3409	movq	%rdx,%rcx
3410	xchgq	%rdi,%rsi
3411.long	0x9066A4F3
3412	movl	$16,%ecx
3413	subq	%rdx,%rcx
3414	xorl	%eax,%eax
3415.long	0x9066AAF3
3416	leaq	-16(%rdi),%rdi
3417	movl	%r10d,%eax
3418	movq	%rdi,%rsi
3419	movq	%r11,%rcx
3420	xorq	%rdx,%rdx
3421	jmp	.Lcbc_enc_loop
3422
/* ---- Decryption. ------------------------------------------------------ */
3423.align	16
3424.Lcbc_decrypt:
3425	cmpq	$16,%rdx
3426	jne	.Lcbc_decrypt_bulk
3427
3428
3429
/* Exactly one block: no stack frame needed. %xmm4 keeps the input block,  */
/* which becomes the new chaining value; %xmm3 is the old one.             */
3430	movdqu	(%rdi),%xmm2
3431	movdqu	(%r8),%xmm3
3432	movdqa	%xmm2,%xmm4
3433	movups	(%rcx),%xmm0
3434	movups	16(%rcx),%xmm1
3435	leaq	32(%rcx),%rcx
3436	xorps	%xmm0,%xmm2
3437.Loop_dec1_16:
3438.byte	102,15,56,222,209
3439	decl	%r10d
3440	movups	(%rcx),%xmm1
3441	leaq	16(%rcx),%rcx
3442	jnz	.Loop_dec1_16
3443.byte	102,15,56,223,209
3444	pxor	%xmm0,%xmm0
3445	pxor	%xmm1,%xmm1
3446	movdqu	%xmm4,(%r8)
3447	xorps	%xmm3,%xmm2
3448	pxor	%xmm3,%xmm3
3449	movups	%xmm2,(%rsi)
3450	pxor	%xmm2,%xmm2
3451	jmp	.Lcbc_ret
/* Bulk path: %r11 = entry %rsp, %rbp = key schedule, %xmm10 = chaining    */
/* value. 0x50 bytes (five blocks) or fewer go straight to the tail code.  */
3452.align	16
3453.Lcbc_decrypt_bulk:
3454	leaq	(%rsp),%r11
3455	pushq	%rbp
3456	subq	$16,%rsp
3457	andq	$-16,%rsp
3458	movq	%rcx,%rbp
3459	movups	(%r8),%xmm10
3460	movl	%r10d,%eax
3461	cmpq	$0x50,%rdx
3462	jbe	.Lcbc_dec_tail
3463
/* Load six input blocks into %xmm2..%xmm7 and keep copies of the first    */
/* five in %xmm11..%xmm15 (each is the chain value for the next block).    */
3464	movups	(%rcx),%xmm0
3465	movdqu	0(%rdi),%xmm2
3466	movdqu	16(%rdi),%xmm3
3467	movdqa	%xmm2,%xmm11
3468	movdqu	32(%rdi),%xmm4
3469	movdqa	%xmm3,%xmm12
3470	movdqu	48(%rdi),%xmm5
3471	movdqa	%xmm4,%xmm13
3472	movdqu	64(%rdi),%xmm6
3473	movdqa	%xmm5,%xmm14
3474	movdqu	80(%rdi),%xmm7
3475	movdqa	%xmm6,%xmm15
3476	movl	OPENSSL_ia32cap_P+4(%rip),%r9d
3477	cmpq	$0x70,%rdx
3478	jbe	.Lcbc_dec_six_or_seven
3479
/* Choose the loop width from the capability word: mask 71303168           */
/* (0x4400000, bits 22 and 26); if only bit 22 is set (4194304), take the  */
/* six-block loop, otherwise the eight-block loop.                         */
/* NOTE(review): presumably this singles out a CPU family on which the     */
/* six-wide loop is faster -- confirm against the OPENSSL_ia32cap docs.    */
3480	andl	$71303168,%r9d
3481	subq	$0x50,%rdx
3482	cmpl	$4194304,%r9d
3483	je	.Lcbc_dec_loop6_enter
3484	subq	$0x20,%rdx
/* %rcx is biased by +112 so the round keys are addressed as N-112(%rcx).  */
3485	leaq	112(%rcx),%rcx
3486	jmp	.Lcbc_dec_loop8_enter
/* Eight-block loop. The eighth result of the previous iteration (%xmm9)   */
/* is stored at the top of the next one.                                   */
3487.align	16
3488.Lcbc_dec_loop8:
3489	movups	%xmm9,(%rsi)
3490	leaq	16(%rsi),%rsi
3491.Lcbc_dec_loop8_enter:
3492	movdqu	96(%rdi),%xmm8
3493	pxor	%xmm0,%xmm2
3494	movdqu	112(%rdi),%xmm9
3495	pxor	%xmm0,%xmm3
3496	movups	16-112(%rcx),%xmm1
3497	pxor	%xmm0,%xmm4
/* %rbp becomes the address to preload the next group from: the carry of   */
/* this cmpq is consumed by the adcq below (the instructions in between    */
/* leave the flags alone). %rbp = %rdi + 128 when %rdx >= 0x70, else %rdi. */
3498	movq	$-1,%rbp
3499	cmpq	$0x70,%rdx
3500	pxor	%xmm0,%xmm5
3501	pxor	%xmm0,%xmm6
3502	pxor	%xmm0,%xmm7
3503	pxor	%xmm0,%xmm8
3504
3505.byte	102,15,56,222,209
3506	pxor	%xmm0,%xmm9
3507	movups	32-112(%rcx),%xmm0
3508.byte	102,15,56,222,217
3509.byte	102,15,56,222,225
3510.byte	102,15,56,222,233
3511.byte	102,15,56,222,241
3512.byte	102,15,56,222,249
3513.byte	102,68,15,56,222,193
3514	adcq	$0,%rbp
3515	andq	$128,%rbp
3516.byte	102,68,15,56,222,201
3517	addq	%rdi,%rbp
3518	movups	48-112(%rcx),%xmm1
3519.byte	102,15,56,222,208
3520.byte	102,15,56,222,216
3521.byte	102,15,56,222,224
3522.byte	102,15,56,222,232
3523.byte	102,15,56,222,240
3524.byte	102,15,56,222,248
3525.byte	102,68,15,56,222,192
3526.byte	102,68,15,56,222,200
3527	movups	64-112(%rcx),%xmm0
3528	nop
3529.byte	102,15,56,222,209
3530.byte	102,15,56,222,217
3531.byte	102,15,56,222,225
3532.byte	102,15,56,222,233
3533.byte	102,15,56,222,241
3534.byte	102,15,56,222,249
3535.byte	102,68,15,56,222,193
3536.byte	102,68,15,56,222,201
3537	movups	80-112(%rcx),%xmm1
3538	nop
3539.byte	102,15,56,222,208
3540.byte	102,15,56,222,216
3541.byte	102,15,56,222,224
3542.byte	102,15,56,222,232
3543.byte	102,15,56,222,240
3544.byte	102,15,56,222,248
3545.byte	102,68,15,56,222,192
3546.byte	102,68,15,56,222,200
3547	movups	96-112(%rcx),%xmm0
3548	nop
3549.byte	102,15,56,222,209
3550.byte	102,15,56,222,217
3551.byte	102,15,56,222,225
3552.byte	102,15,56,222,233
3553.byte	102,15,56,222,241
3554.byte	102,15,56,222,249
3555.byte	102,68,15,56,222,193
3556.byte	102,68,15,56,222,201
3557	movups	112-112(%rcx),%xmm1
3558	nop
3559.byte	102,15,56,222,208
3560.byte	102,15,56,222,216
3561.byte	102,15,56,222,224
3562.byte	102,15,56,222,232
3563.byte	102,15,56,222,240
3564.byte	102,15,56,222,248
3565.byte	102,68,15,56,222,192
3566.byte	102,68,15,56,222,200
3567	movups	128-112(%rcx),%xmm0
3568	nop
3569.byte	102,15,56,222,209
3570.byte	102,15,56,222,217
3571.byte	102,15,56,222,225
3572.byte	102,15,56,222,233
3573.byte	102,15,56,222,241
3574.byte	102,15,56,222,249
3575.byte	102,68,15,56,222,193
3576.byte	102,68,15,56,222,201
3577	movups	144-112(%rcx),%xmm1
/* Flags of this compare feed both the jb and the je further down; the     */
/* aesdec and movups instructions in between do not modify them.           */
3578	cmpl	$11,%eax
3579.byte	102,15,56,222,208
3580.byte	102,15,56,222,216
3581.byte	102,15,56,222,224
3582.byte	102,15,56,222,232
3583.byte	102,15,56,222,240
3584.byte	102,15,56,222,248
3585.byte	102,68,15,56,222,192
3586.byte	102,68,15,56,222,200
3587	movups	160-112(%rcx),%xmm0
/* Round count below 11: the key in %xmm0 is the last one.                 */
3588	jb	.Lcbc_dec_done
3589.byte	102,15,56,222,209
3590.byte	102,15,56,222,217
3591.byte	102,15,56,222,225
3592.byte	102,15,56,222,233
3593.byte	102,15,56,222,241
3594.byte	102,15,56,222,249
3595.byte	102,68,15,56,222,193
3596.byte	102,68,15,56,222,201
3597	movups	176-112(%rcx),%xmm1
3598	nop
3599.byte	102,15,56,222,208
3600.byte	102,15,56,222,216
3601.byte	102,15,56,222,224
3602.byte	102,15,56,222,232
3603.byte	102,15,56,222,240
3604.byte	102,15,56,222,248
3605.byte	102,68,15,56,222,192
3606.byte	102,68,15,56,222,200
3607	movups	192-112(%rcx),%xmm0
/* Round count equal to 11: done after two more rounds.                    */
3608	je	.Lcbc_dec_done
3609.byte	102,15,56,222,209
3610.byte	102,15,56,222,217
3611.byte	102,15,56,222,225
3612.byte	102,15,56,222,233
3613.byte	102,15,56,222,241
3614.byte	102,15,56,222,249
3615.byte	102,68,15,56,222,193
3616.byte	102,68,15,56,222,201
3617	movups	208-112(%rcx),%xmm1
3618	nop
3619.byte	102,15,56,222,208
3620.byte	102,15,56,222,216
3621.byte	102,15,56,222,224
3622.byte	102,15,56,222,232
3623.byte	102,15,56,222,240
3624.byte	102,15,56,222,248
3625.byte	102,68,15,56,222,192
3626.byte	102,68,15,56,222,200
3627	movups	224-112(%rcx),%xmm0
3628	jmp	.Lcbc_dec_done
/* Last aesdec with %xmm1, while the chain values (%xmm10..%xmm15 and the  */
/* input blocks at 80/96(%rdi)) are XORed with the final key in %xmm0 so   */
/* that aesdeclast applies the last round key and the CBC XOR together.    */
3629.align	16
3630.Lcbc_dec_done:
3631.byte	102,15,56,222,209
3632.byte	102,15,56,222,217
3633	pxor	%xmm0,%xmm10
3634	pxor	%xmm0,%xmm11
3635.byte	102,15,56,222,225
3636.byte	102,15,56,222,233
3637	pxor	%xmm0,%xmm12
3638	pxor	%xmm0,%xmm13
3639.byte	102,15,56,222,241
3640.byte	102,15,56,222,249
3641	pxor	%xmm0,%xmm14
3642	pxor	%xmm0,%xmm15
3643.byte	102,68,15,56,222,193
3644.byte	102,68,15,56,222,201
3645	movdqu	80(%rdi),%xmm1
3646
/* aesdeclast for all eight blocks, interleaved with preloading the next   */
/* group's blocks from (%rbp) into %xmm11..%xmm15 and %xmm1. The input     */
/* block at 112(%rdi) becomes the next chaining value in %xmm10.           */
3647.byte	102,65,15,56,223,210
3648	movdqu	96(%rdi),%xmm10
3649	pxor	%xmm0,%xmm1
3650.byte	102,65,15,56,223,219
3651	pxor	%xmm0,%xmm10
3652	movdqu	112(%rdi),%xmm0
3653.byte	102,65,15,56,223,228
3654	leaq	128(%rdi),%rdi
3655	movdqu	0(%rbp),%xmm11
3656.byte	102,65,15,56,223,237
3657.byte	102,65,15,56,223,246
3658	movdqu	16(%rbp),%xmm12
3659	movdqu	32(%rbp),%xmm13
3660.byte	102,65,15,56,223,255
3661.byte	102,68,15,56,223,193
3662	movdqu	48(%rbp),%xmm14
3663	movdqu	64(%rbp),%xmm15
3664.byte	102,69,15,56,223,202
3665	movdqa	%xmm0,%xmm10
3666	movdqu	80(%rbp),%xmm1
3667	movups	-112(%rcx),%xmm0
3668
/* Store seven results and move the preloaded blocks into %xmm2..%xmm7.    */
3669	movups	%xmm2,(%rsi)
3670	movdqa	%xmm11,%xmm2
3671	movups	%xmm3,16(%rsi)
3672	movdqa	%xmm12,%xmm3
3673	movups	%xmm4,32(%rsi)
3674	movdqa	%xmm13,%xmm4
3675	movups	%xmm5,48(%rsi)
3676	movdqa	%xmm14,%xmm5
3677	movups	%xmm6,64(%rsi)
3678	movdqa	%xmm15,%xmm6
3679	movups	%xmm7,80(%rsi)
3680	movdqa	%xmm1,%xmm7
3681	movups	%xmm8,96(%rsi)
3682	leaq	112(%rsi),%rsi
3683
3684	subq	$0x80,%rdx
3685	ja	.Lcbc_dec_loop8
3686
/* Leaving the eight-block loop: undo the %rcx bias, restore the byte      */
/* count and decide how the remainder is handled. %xmm2 holds the pending  */
/* eighth result for the "collected" exits.                                */
3687	movaps	%xmm9,%xmm2
3688	leaq	-112(%rcx),%rcx
3689	addq	$0x70,%rdx
3690	jle	.Lcbc_dec_clear_tail_collected
3691	movups	%xmm9,(%rsi)
3692	leaq	16(%rsi),%rsi
3693	cmpq	$0x50,%rdx
3694	jbe	.Lcbc_dec_tail
3695
3696	movaps	%xmm11,%xmm2
/* Six or seven blocks remain, with six already loaded in %xmm2..%xmm7.    */
3697.Lcbc_dec_six_or_seven:
3698	cmpq	$0x60,%rdx
3699	ja	.Lcbc_dec_seven
3700
/* Six blocks: %xmm8 saves the sixth input block as next chaining value.   */
3701	movaps	%xmm7,%xmm8
3702	call	_aesni_decrypt6
3703	pxor	%xmm10,%xmm2
3704	movaps	%xmm8,%xmm10
3705	pxor	%xmm11,%xmm3
3706	movdqu	%xmm2,(%rsi)
3707	pxor	%xmm12,%xmm4
3708	movdqu	%xmm3,16(%rsi)
3709	pxor	%xmm3,%xmm3
3710	pxor	%xmm13,%xmm5
3711	movdqu	%xmm4,32(%rsi)
3712	pxor	%xmm4,%xmm4
3713	pxor	%xmm14,%xmm6
3714	movdqu	%xmm5,48(%rsi)
3715	pxor	%xmm5,%xmm5
3716	pxor	%xmm15,%xmm7
3717	movdqu	%xmm6,64(%rsi)
3718	pxor	%xmm6,%xmm6
3719	leaq	80(%rsi),%rsi
3720	movdqa	%xmm7,%xmm2
3721	pxor	%xmm7,%xmm7
3722	jmp	.Lcbc_dec_tail_collected
3723
/* Seven blocks: eight-wide helper with a zero eighth block. The inputs at */
/* 80 and 96(%rdi) are re-read afterwards as chain values.                 */
3724.align	16
3725.Lcbc_dec_seven:
3726	movups	96(%rdi),%xmm8
3727	xorps	%xmm9,%xmm9
3728	call	_aesni_decrypt8
3729	movups	80(%rdi),%xmm9
3730	pxor	%xmm10,%xmm2
3731	movups	96(%rdi),%xmm10
3732	pxor	%xmm11,%xmm3
3733	movdqu	%xmm2,(%rsi)
3734	pxor	%xmm12,%xmm4
3735	movdqu	%xmm3,16(%rsi)
3736	pxor	%xmm3,%xmm3
3737	pxor	%xmm13,%xmm5
3738	movdqu	%xmm4,32(%rsi)
3739	pxor	%xmm4,%xmm4
3740	pxor	%xmm14,%xmm6
3741	movdqu	%xmm5,48(%rsi)
3742	pxor	%xmm5,%xmm5
3743	pxor	%xmm15,%xmm7
3744	movdqu	%xmm6,64(%rsi)
3745	pxor	%xmm6,%xmm6
3746	pxor	%xmm9,%xmm8
3747	movdqu	%xmm7,80(%rsi)
3748	pxor	%xmm7,%xmm7
3749	leaq	96(%rsi),%rsi
3750	movdqa	%xmm8,%xmm2
3751	pxor	%xmm8,%xmm8
3752	pxor	%xmm9,%xmm9
3753	jmp	.Lcbc_dec_tail_collected
3754
/* Six-block loop. The sixth result of the previous iteration (%xmm7) is   */
/* stored at the top of the next one.                                      */
3755.align	16
3756.Lcbc_dec_loop6:
3757	movups	%xmm7,(%rsi)
3758	leaq	16(%rsi),%rsi
3759	movdqu	0(%rdi),%xmm2
3760	movdqu	16(%rdi),%xmm3
3761	movdqa	%xmm2,%xmm11
3762	movdqu	32(%rdi),%xmm4
3763	movdqa	%xmm3,%xmm12
3764	movdqu	48(%rdi),%xmm5
3765	movdqa	%xmm4,%xmm13
3766	movdqu	64(%rdi),%xmm6
3767	movdqa	%xmm5,%xmm14
3768	movdqu	80(%rdi),%xmm7
3769	movdqa	%xmm6,%xmm15
3770.Lcbc_dec_loop6_enter:
3771	leaq	96(%rdi),%rdi
3772	movdqa	%xmm7,%xmm8
3773
3774	call	_aesni_decrypt6
3775
/* CBC XOR with the saved chain values; %rcx and %eax are reloaded from    */
/* %rbp / %r10d for the next helper call.                                  */
3776	pxor	%xmm10,%xmm2
3777	movdqa	%xmm8,%xmm10
3778	pxor	%xmm11,%xmm3
3779	movdqu	%xmm2,(%rsi)
3780	pxor	%xmm12,%xmm4
3781	movdqu	%xmm3,16(%rsi)
3782	pxor	%xmm13,%xmm5
3783	movdqu	%xmm4,32(%rsi)
3784	pxor	%xmm14,%xmm6
3785	movq	%rbp,%rcx
3786	movdqu	%xmm5,48(%rsi)
3787	pxor	%xmm15,%xmm7
3788	movl	%r10d,%eax
3789	movdqu	%xmm6,64(%rsi)
3790	leaq	80(%rsi),%rsi
3791	subq	$0x60,%rdx
3792	ja	.Lcbc_dec_loop6
3793
3794	movdqa	%xmm7,%xmm2
3795	addq	$0x50,%rdx
3796	jle	.Lcbc_dec_clear_tail_collected
3797	movups	%xmm7,(%rsi)
3798	leaq	16(%rsi),%rsi
3799
/* Tail: up to five blocks. %rdx is decremented by 16 per block loaded and */
/* the first subtraction that reaches zero or borrows selects the handler. */
3800.Lcbc_dec_tail:
3801	movups	(%rdi),%xmm2
3802	subq	$0x10,%rdx
3803	jbe	.Lcbc_dec_one
3804
3805	movups	16(%rdi),%xmm3
3806	movaps	%xmm2,%xmm11
3807	subq	$0x10,%rdx
3808	jbe	.Lcbc_dec_two
3809
3810	movups	32(%rdi),%xmm4
3811	movaps	%xmm3,%xmm12
3812	subq	$0x10,%rdx
3813	jbe	.Lcbc_dec_three
3814
3815	movups	48(%rdi),%xmm5
3816	movaps	%xmm4,%xmm13
3817	subq	$0x10,%rdx
3818	jbe	.Lcbc_dec_four
3819
/* Five blocks: six-wide helper with a zero sixth block.                   */
3820	movups	64(%rdi),%xmm6
3821	movaps	%xmm5,%xmm14
3822	movaps	%xmm6,%xmm15
3823	xorps	%xmm7,%xmm7
3824	call	_aesni_decrypt6
3825	pxor	%xmm10,%xmm2
3826	movaps	%xmm15,%xmm10
3827	pxor	%xmm11,%xmm3
3828	movdqu	%xmm2,(%rsi)
3829	pxor	%xmm12,%xmm4
3830	movdqu	%xmm3,16(%rsi)
3831	pxor	%xmm3,%xmm3
3832	pxor	%xmm13,%xmm5
3833	movdqu	%xmm4,32(%rsi)
3834	pxor	%xmm4,%xmm4
3835	pxor	%xmm14,%xmm6
3836	movdqu	%xmm5,48(%rsi)
3837	pxor	%xmm5,%xmm5
3838	leaq	64(%rsi),%rsi
3839	movdqa	%xmm6,%xmm2
3840	pxor	%xmm6,%xmm6
3841	pxor	%xmm7,%xmm7
3842	subq	$0x10,%rdx
3843	jmp	.Lcbc_dec_tail_collected
3844
/* One block: inline round loop; %xmm11 keeps the input as the next chain  */
/* value.                                                                  */
3845.align	16
3846.Lcbc_dec_one:
3847	movaps	%xmm2,%xmm11
3848	movups	(%rcx),%xmm0
3849	movups	16(%rcx),%xmm1
3850	leaq	32(%rcx),%rcx
3851	xorps	%xmm0,%xmm2
3852.Loop_dec1_17:
3853.byte	102,15,56,222,209
3854	decl	%eax
3855	movups	(%rcx),%xmm1
3856	leaq	16(%rcx),%rcx
3857	jnz	.Loop_dec1_17
3858.byte	102,15,56,223,209
3859	xorps	%xmm10,%xmm2
3860	movaps	%xmm11,%xmm10
3861	jmp	.Lcbc_dec_tail_collected
/* Two, three and four blocks: same pattern with the matching helper. The  */
/* last result is left in %xmm2 for .Lcbc_dec_tail_collected to store.     */
3862.align	16
3863.Lcbc_dec_two:
3864	movaps	%xmm3,%xmm12
3865	call	_aesni_decrypt2
3866	pxor	%xmm10,%xmm2
3867	movaps	%xmm12,%xmm10
3868	pxor	%xmm11,%xmm3
3869	movdqu	%xmm2,(%rsi)
3870	movdqa	%xmm3,%xmm2
3871	pxor	%xmm3,%xmm3
3872	leaq	16(%rsi),%rsi
3873	jmp	.Lcbc_dec_tail_collected
3874.align	16
3875.Lcbc_dec_three:
3876	movaps	%xmm4,%xmm13
3877	call	_aesni_decrypt3
3878	pxor	%xmm10,%xmm2
3879	movaps	%xmm13,%xmm10
3880	pxor	%xmm11,%xmm3
3881	movdqu	%xmm2,(%rsi)
3882	pxor	%xmm12,%xmm4
3883	movdqu	%xmm3,16(%rsi)
3884	pxor	%xmm3,%xmm3
3885	movdqa	%xmm4,%xmm2
3886	pxor	%xmm4,%xmm4
3887	leaq	32(%rsi),%rsi
3888	jmp	.Lcbc_dec_tail_collected
3889.align	16
3890.Lcbc_dec_four:
3891	movaps	%xmm5,%xmm14
3892	call	_aesni_decrypt4
3893	pxor	%xmm10,%xmm2
3894	movaps	%xmm14,%xmm10
3895	pxor	%xmm11,%xmm3
3896	movdqu	%xmm2,(%rsi)
3897	pxor	%xmm12,%xmm4
3898	movdqu	%xmm3,16(%rsi)
3899	pxor	%xmm3,%xmm3
3900	pxor	%xmm13,%xmm5
3901	movdqu	%xmm4,32(%rsi)
3902	pxor	%xmm4,%xmm4
3903	movdqa	%xmm5,%xmm2
3904	pxor	%xmm5,%xmm5
3905	leaq	48(%rsi),%rsi
3906	jmp	.Lcbc_dec_tail_collected
3907
/* Exit from the wide loops: scrub %xmm3..%xmm9 first.                     */
3908.align	16
3909.Lcbc_dec_clear_tail_collected:
3910	pxor	%xmm3,%xmm3
3911	pxor	%xmm4,%xmm4
3912	pxor	%xmm5,%xmm5
3913	pxor	%xmm6,%xmm6
3914	pxor	%xmm7,%xmm7
3915	pxor	%xmm8,%xmm8
3916	pxor	%xmm9,%xmm9
/* Common exit: write the new chaining value, then store the last result   */
/* in full when the low four bits of %rdx are zero.                        */
3917.Lcbc_dec_tail_collected:
3918	movups	%xmm10,(%r8)
3919	andq	$15,%rdx
3920	jnz	.Lcbc_dec_tail_partial
3921	movups	%xmm2,(%rsi)
3922	pxor	%xmm2,%xmm2
3923	jmp	.Lcbc_dec_ret
/* Partial store: spill the last result to the aligned stack slot, copy    */
/* 16 - %rdx bytes of it to the output with rep movsb, then overwrite the  */
/* stack slot with the zeroed %xmm2.                                       */
3924.align	16
3925.Lcbc_dec_tail_partial:
3926	movaps	%xmm2,(%rsp)
3927	pxor	%xmm2,%xmm2
3928	movq	$16,%rcx
3929	movq	%rsi,%rdi
3930	subq	%rdx,%rcx
3931	leaq	(%rsp),%rsi
3932.long	0x9066A4F3
3933	movdqa	%xmm2,(%rsp)
3934
/* Bulk-path epilogue: scrub the key registers, restore %rbp and %rsp      */
/* from the entry stack pointer saved in %r11.                             */
3935.Lcbc_dec_ret:
3936	xorps	%xmm0,%xmm0
3937	pxor	%xmm1,%xmm1
3938	movq	-8(%r11),%rbp
3939	leaq	(%r11),%rsp
3940.Lcbc_ret:
3941	.byte	0xf3,0xc3
3942.size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
/* aesni_set_decrypt_key: global (hidden-visibility) entry point that      */
/* builds a decryption key schedule. It calls __aesni_set_encrypt_key with */
/* its own arguments unchanged and, if that returns zero in %eax, converts */
/* the schedule in place: the order of the round keys is reversed and      */
/* aesimc is applied to every key except the first and the last.           */
/*                                                                         */
/* Returns the callee's %eax value (non-zero on failure, zero otherwise).  */
/* After the call, %rdx is used as the schedule base and %esi as the round */
/* count.                                                                  */
/* NOTE(review): this relies on __aesni_set_encrypt_key leaving %rdx       */
/* intact and the round count in %esi -- confirm against the part of that  */
/* routine that lies outside this view.                                    */
/*                                                                         */
/* Encodings: .byte 0x48,0x83,0xEC,0x08 = sub $8,%rsp;                     */
/* 102,15,56,219,192 / ...,201 = aesimc %xmm0,%xmm0 / %xmm1,%xmm1;         */
/* 0xf3,0xc3 = "rep ret".                                                  */
3943.globl	aesni_set_decrypt_key
3944.hidden aesni_set_decrypt_key
3945.type	aesni_set_decrypt_key,@function
3946.align	16
3947aesni_set_decrypt_key:
3948.byte	0x48,0x83,0xEC,0x08
3949	call	__aesni_set_encrypt_key
/* %esi = rounds * 16; %rdi = address of the key at base + 16 + %rsi.      */
3950	shll	$4,%esi
3951	testl	%eax,%eax
3952	jnz	.Ldec_key_ret
3953	leaq	16(%rdx,%rsi,1),%rdi
3954
/* Swap the two outermost round keys unchanged.                            */
3955	movups	(%rdx),%xmm0
3956	movups	(%rdi),%xmm1
3957	movups	%xmm0,(%rdi)
3958	movups	%xmm1,(%rdx)
3959	leaq	16(%rdx),%rdx
3960	leaq	-16(%rdi),%rdi
3961
/* Walk inwards from both ends, applying aesimc to each pair and swapping  */
/* them, until the two pointers meet.                                      */
3962.Ldec_key_inverse:
3963	movups	(%rdx),%xmm0
3964	movups	(%rdi),%xmm1
3965.byte	102,15,56,219,192
3966.byte	102,15,56,219,201
3967	leaq	16(%rdx),%rdx
3968	leaq	-16(%rdi),%rdi
3969	movups	%xmm0,16(%rdi)
3970	movups	%xmm1,-16(%rdx)
3971	cmpq	%rdx,%rdi
3972	ja	.Ldec_key_inverse
3973
/* Middle key: aesimc in place, then scrub %xmm0 and %xmm1.                */
3974	movups	(%rdx),%xmm0
3975.byte	102,15,56,219,192
3976	pxor	%xmm1,%xmm1
3977	movups	%xmm0,(%rdi)
3978	pxor	%xmm0,%xmm0
3979.Ldec_key_ret:
3980	addq	$8,%rsp
3981	.byte	0xf3,0xc3
3982.LSEH_end_set_decrypt_key:
3983.size	aesni_set_decrypt_key,.-aesni_set_decrypt_key
3984.globl	aesni_set_encrypt_key
3985.hidden aesni_set_encrypt_key
3986.type	aesni_set_encrypt_key,@function
3987.align	16
3988aesni_set_encrypt_key:
3989__aesni_set_encrypt_key:
3990.byte	0x48,0x83,0xEC,0x08
3991	movq	$-1,%rax
3992	testq	%rdi,%rdi
3993	jz	.Lenc_key_ret
3994	testq	%rdx,%rdx
3995	jz	.Lenc_key_ret
3996
3997	movl	$268437504,%r10d
3998	movups	(%rdi),%xmm0
3999	xorps	%xmm4,%xmm4
4000	andl	OPENSSL_ia32cap_P+4(%rip),%r10d
4001	leaq	16(%rdx),%rax
4002	cmpl	$256,%esi
4003	je	.L14rounds
4004	cmpl	$192,%esi
4005	je	.L12rounds
4006	cmpl	$128,%esi
4007	jne	.Lbad_keybits
4008
4009.L10rounds:
4010	movl	$9,%esi
4011	cmpl	$268435456,%r10d
4012	je	.L10rounds_alt
4013
4014	movups	%xmm0,(%rdx)
4015.byte	102,15,58,223,200,1
4016	call	.Lkey_expansion_128_cold
4017.byte	102,15,58,223,200,2
4018	call	.Lkey_expansion_128
4019.byte	102,15,58,223,200,4
4020	call	.Lkey_expansion_128
4021.byte	102,15,58,223,200,8
4022	call	.Lkey_expansion_128
4023.byte	102,15,58,223,200,16
4024	call	.Lkey_expansion_128
4025.byte	102,15,58,223,200,32
4026	call	.Lkey_expansion_128
4027.byte	102,15,58,223,200,64
4028	call	.Lkey_expansion_128
4029.byte	102,15,58,223,200,128
4030	call	.Lkey_expansion_128
4031.byte	102,15,58,223,200,27
4032	call	.Lkey_expansion_128
4033.byte	102,15,58,223,200,54
4034	call	.Lkey_expansion_128
4035	movups	%xmm0,(%rax)
4036	movl	%esi,80(%rax)
4037	xorl	%eax,%eax
4038	jmp	.Lenc_key_ret
4039
4040.align	16
4041.L10rounds_alt:
4042	movdqa	.Lkey_rotate(%rip),%xmm5
4043	movl	$8,%r10d
4044	movdqa	.Lkey_rcon1(%rip),%xmm4
4045	movdqa	%xmm0,%xmm2
4046	movdqu	%xmm0,(%rdx)
4047	jmp	.Loop_key128
4048
4049.align	16
4050.Loop_key128:
4051.byte	102,15,56,0,197
4052.byte	102,15,56,221,196
4053	pslld	$1,%xmm4
4054	leaq	16(%rax),%rax
4055
4056	movdqa	%xmm2,%xmm3
4057	pslldq	$4,%xmm2
4058	pxor	%xmm2,%xmm3
4059	pslldq	$4,%xmm2
4060	pxor	%xmm2,%xmm3
4061	pslldq	$4,%xmm2
4062	pxor	%xmm3,%xmm2
4063
4064	pxor	%xmm2,%xmm0
4065	movdqu	%xmm0,-16(%rax)
4066	movdqa	%xmm0,%xmm2
4067
4068	decl	%r10d
4069	jnz	.Loop_key128
4070
4071	movdqa	.Lkey_rcon1b(%rip),%xmm4
4072
4073.byte	102,15,56,0,197
4074.byte	102,15,56,221,196
4075	pslld	$1,%xmm4
4076
4077	movdqa	%xmm2,%xmm3
4078	pslldq	$4,%xmm2
4079	pxor	%xmm2,%xmm3
4080	pslldq	$4,%xmm2
4081	pxor	%xmm2,%xmm3
4082	pslldq	$4,%xmm2
4083	pxor	%xmm3,%xmm2
4084
4085	pxor	%xmm2,%xmm0
4086	movdqu	%xmm0,(%rax)
4087
4088	movdqa	%xmm0,%xmm2
4089.byte	102,15,56,0,197
4090.byte	102,15,56,221,196
4091
4092	movdqa	%xmm2,%xmm3
4093	pslldq	$4,%xmm2
4094	pxor	%xmm2,%xmm3
4095	pslldq	$4,%xmm2
4096	pxor	%xmm2,%xmm3
4097	pslldq	$4,%xmm2
4098	pxor	%xmm3,%xmm2
4099
4100	pxor	%xmm2,%xmm0
4101	movdqu	%xmm0,16(%rax)
4102
4103	movl	%esi,96(%rax)
4104	xorl	%eax,%eax
4105	jmp	.Lenc_key_ret
4106
4107.align	16
4108.L12rounds:
4109	movq	16(%rdi),%xmm2
4110	movl	$11,%esi
4111	cmpl	$268435456,%r10d
4112	je	.L12rounds_alt
4113
4114	movups	%xmm0,(%rdx)
4115.byte	102,15,58,223,202,1
4116	call	.Lkey_expansion_192a_cold
4117.byte	102,15,58,223,202,2
4118	call	.Lkey_expansion_192b
4119.byte	102,15,58,223,202,4
4120	call	.Lkey_expansion_192a
4121.byte	102,15,58,223,202,8
4122	call	.Lkey_expansion_192b
4123.byte	102,15,58,223,202,16
4124	call	.Lkey_expansion_192a
4125.byte	102,15,58,223,202,32
4126	call	.Lkey_expansion_192b
4127.byte	102,15,58,223,202,64
4128	call	.Lkey_expansion_192a
4129.byte	102,15,58,223,202,128
4130	call	.Lkey_expansion_192b
4131	movups	%xmm0,(%rax)
4132	movl	%esi,48(%rax)
4133	xorq	%rax,%rax
4134	jmp	.Lenc_key_ret
4135
4136.align	16
4137.L12rounds_alt:
4138	movdqa	.Lkey_rotate192(%rip),%xmm5
4139	movdqa	.Lkey_rcon1(%rip),%xmm4
4140	movl	$8,%r10d
4141	movdqu	%xmm0,(%rdx)
4142	jmp	.Loop_key192
4143
4144.align	16
4145.Loop_key192:
4146	movq	%xmm2,0(%rax)
4147	movdqa	%xmm2,%xmm1
4148.byte	102,15,56,0,213
4149.byte	102,15,56,221,212
4150	pslld	$1,%xmm4
4151	leaq	24(%rax),%rax
4152
4153	movdqa	%xmm0,%xmm3
4154	pslldq	$4,%xmm0
4155	pxor	%xmm0,%xmm3
4156	pslldq	$4,%xmm0
4157	pxor	%xmm0,%xmm3
4158	pslldq	$4,%xmm0
4159	pxor	%xmm3,%xmm0
4160
4161	pshufd	$0xff,%xmm0,%xmm3
4162	pxor	%xmm1,%xmm3
4163	pslldq	$4,%xmm1
4164	pxor	%xmm1,%xmm3
4165
4166	pxor	%xmm2,%xmm0
4167	pxor	%xmm3,%xmm2
4168	movdqu	%xmm0,-16(%rax)
4169
4170	decl	%r10d
4171	jnz	.Loop_key192
4172
4173	movl	%esi,32(%rax)
4174	xorl	%eax,%eax
4175	jmp	.Lenc_key_ret
4176
/*
 * .L14rounds: 256-bit key schedule built with AESKEYGENASSIST.
 *
 * Reads key bytes 16..31 from 16(%rdi) into %xmm2 and writes the schedule
 * starting at (%rdx).  %rax is the write cursor; it is bumped by 16 here
 * because this path stores two 16-byte blocks directly through %rdx.
 * %xmm0, %rax and %r10d are initialised above this block.
 * NOTE(review): presumably %xmm0 = key bytes 0..15, %rax = %rdx+16 before the
 * leaq and %r10d = a masked CPU-capability word -- confirm at the entry.
 *
 * Raw opcode encodings used below:
 *     .byte 102,15,58,223,202,N = aeskeygenassist $N,%xmm2,%xmm1
 *     .byte 102,15,58,223,200,N = aeskeygenassist $N,%xmm0,%xmm1
 *
 * The helper calls store 192 bytes through %rax (six 256b calls and six
 * non-cold 256a calls, 16 bytes each).  If %rax = %rdx+32 after the leaq, the
 * final movups writes bytes 224..239 of the schedule and %esi lands at
 * 240(%rdx), the offset aesni_encrypt and aesni_decrypt load their loop
 * count from.
 */
4177.align	16
4178.L14rounds:
4179	movups	16(%rdi),%xmm2
4180	movl	$13,%esi
4181	leaq	16(%rax),%rax
/* 268435456 = 0x10000000: on an exact match take the variant that does not
   use aeskeygenassist. */
4182	cmpl	$268435456,%r10d
4183	je	.L14rounds_alt
4184
/* The first 32 bytes of the schedule are %xmm0 and %xmm2 unchanged. */
4185	movups	%xmm0,(%rdx)
4186	movups	%xmm2,16(%rdx)
4187.byte	102,15,58,223,202,1
4188	call	.Lkey_expansion_256a_cold
4189.byte	102,15,58,223,200,1
4190	call	.Lkey_expansion_256b
4191.byte	102,15,58,223,202,2
4192	call	.Lkey_expansion_256a
4193.byte	102,15,58,223,200,2
4194	call	.Lkey_expansion_256b
4195.byte	102,15,58,223,202,4
4196	call	.Lkey_expansion_256a
4197.byte	102,15,58,223,200,4
4198	call	.Lkey_expansion_256b
4199.byte	102,15,58,223,202,8
4200	call	.Lkey_expansion_256a
4201.byte	102,15,58,223,200,8
4202	call	.Lkey_expansion_256b
4203.byte	102,15,58,223,202,16
4204	call	.Lkey_expansion_256a
4205.byte	102,15,58,223,200,16
4206	call	.Lkey_expansion_256b
4207.byte	102,15,58,223,202,32
4208	call	.Lkey_expansion_256a
4209.byte	102,15,58,223,200,32
4210	call	.Lkey_expansion_256b
4211.byte	102,15,58,223,202,64
4212	call	.Lkey_expansion_256a
/* Flush the last 16 bytes, record the count and leave with %rax = 0.
   xorl is the 32-bit zeroing form that the .Loop_key192 and .Ldone_key256
   exits already use; a 32-bit register write also clears bits 63..32. */
4213	movups	%xmm0,(%rax)
4214	movl	%esi,16(%rax)
4215	xorl	%eax,%eax
4216	jmp	.Lenc_key_ret
4217
/*
 * .L14rounds_alt: set-up for the 256-bit schedule variant that avoids
 * aeskeygenassist (reached from .L14rounds when %r10d == 0x10000000).
 *
 *   %xmm5 = .Lkey_rotate byte-shuffle mask
 *   %xmm4 = .Lkey_rcon1, i.e. the value 1 in every dword
 *   %r10d = 7, the iteration count of .Loop_key256
 *   %xmm1 = copy of %xmm2 (key bytes 16..31 as loaded in .L14rounds)
 *
 * %xmm0 and %xmm2 are stored as the first 32 bytes of the schedule at (%rdx).
 * movdqa requires the two tables to be 16-byte aligned.  The jmp steps over
 * any padding the following .align may insert.
 */
4218.align	16
4219.L14rounds_alt:
4220	movdqa	.Lkey_rotate(%rip),%xmm5
4221	movdqa	.Lkey_rcon1(%rip),%xmm4
4222	movl	$7,%r10d
4223	movdqu	%xmm0,0(%rdx)
4224	movdqa	%xmm2,%xmm1
4225	movdqu	%xmm2,16(%rdx)
4226	jmp	.Loop_key256
4227
/*
 * .Loop_key256: emits the 256-bit key schedule 32 bytes per full iteration;
 * the last of the %r10d iterations (7 when entered from .L14rounds_alt) emits
 * only its first 16 bytes and leaves through .Ldone_key256.
 *
 * Loop state:
 *   %xmm0  16-byte block that is updated and stored at (%rax)
 *   %xmm1  16-byte block that is updated and stored at 16(%rax)
 *   %xmm2  on entry equal to %xmm1; scratch for the transformed word
 *   %xmm4  round constant replicated in every dword, doubled each pass
 *   %xmm5  byte-shuffle mask
 *   %rax   write cursor, advanced by 32 per full iteration
 */
4228.align	16
4229.Loop_key256:
/* .byte 102,15,56,0,213   = pshufb %xmm5,%xmm2
   .byte 102,15,56,221,212 = aesenclast %xmm4,%xmm2
   With the .Lkey_rotate mask every dword of %xmm2 becomes bytes 13,14,15,12
   of the old %xmm2.  All four columns are then equal, so the ShiftRows part
   of aesenclast has no effect and the result is the S-box of each byte XORed
   with the constant in %xmm4. */
4230.byte	102,15,56,0,213
4231.byte	102,15,56,221,212
4232
/* %xmm0 = x ^ (x<<32) ^ (x<<64) ^ (x<<96) for the old %xmm0 = x. */
4233	movdqa	%xmm0,%xmm3
4234	pslldq	$4,%xmm0
4235	pxor	%xmm0,%xmm3
4236	pslldq	$4,%xmm0
4237	pxor	%xmm0,%xmm3
4238	pslldq	$4,%xmm0
4239	pxor	%xmm3,%xmm0
4240	pslld	$1,%xmm4
4241
4242	pxor	%xmm2,%xmm0
4243	movdqu	%xmm0,(%rax)
4244
4245	decl	%r10d
4246	jz	.Ldone_key256
4247
/* Second half: broadcast dword 3 of the block just stored and run it through
   aesenclast with an all-zero round key (.byte 102,15,56,221,211 =
   aesenclast %xmm3,%xmm2), so no byte rotation and no round constant is
   applied here. */
4248	pshufd	$0xff,%xmm0,%xmm2
4249	pxor	%xmm3,%xmm3
4250.byte	102,15,56,221,211
4251
/* %xmm1 = y ^ (y<<32) ^ (y<<64) ^ (y<<96) for the old %xmm1 = y. */
4252	movdqa	%xmm1,%xmm3
4253	pslldq	$4,%xmm1
4254	pxor	%xmm1,%xmm3
4255	pslldq	$4,%xmm1
4256	pxor	%xmm1,%xmm3
4257	pslldq	$4,%xmm1
4258	pxor	%xmm3,%xmm1
4259
/* Store the second 16 bytes, advance the cursor and make %xmm1 equal to
   %xmm2 again for the next pass. */
4260	pxor	%xmm1,%xmm2
4261	movdqu	%xmm2,16(%rax)
4262	leaq	32(%rax),%rax
4263	movdqa	%xmm2,%xmm1
4264
4265	jmp	.Loop_key256
4266
/* Record the count from %esi 16 bytes past the last store and leave with
   %rax = 0.
   NOTE(review): with %rax = %rdx+32 on first entry that slot is 240(%rdx) --
   confirm against the code above the visible chunk. */
4267.Ldone_key256:
4268	movl	%esi,16(%rax)
4269	xorl	%eax,%eax
4270	jmp	.Lenc_key_ret
4271
/*
 * Common exit of the key-schedule routine.
 *
 * .Lbad_keybits sets %rax = -2 and falls through.  .Lenc_key_ret is entered
 * with the return value already in %rax (0 from the paths visible here).
 * It zeroes %xmm0..%xmm5, the vector registers the schedule code used, so no
 * key-derived data is left in them, releases 8 bytes of stack and returns.
 * NOTE(review): the addq presumably undoes an 8-byte adjustment made at the
 * function entry, which is above the visible code -- confirm.
 *
 * ".byte 0xf3,0xc3" is the two-byte "rep ret" encoding.
 */
4272.align	16
4273.Lbad_keybits:
4274	movq	$-2,%rax
4275.Lenc_key_ret:
4276	pxor	%xmm0,%xmm0
4277	pxor	%xmm1,%xmm1
4278	pxor	%xmm2,%xmm2
4279	pxor	%xmm3,%xmm3
4280	pxor	%xmm4,%xmm4
4281	pxor	%xmm5,%xmm5
4282	addq	$8,%rsp
4283	.byte	0xf3,0xc3
/* Label on the first byte after the return instruction. */
4284.LSEH_end_set_encrypt_key:
4285
/*
 * .Lkey_expansion_128: local subroutine (call/ret), one key-schedule step on
 * a single 16-byte block.  Not called from the visible part of the file.
 *
 *   In:    %xmm0 = current 16-byte block
 *          %xmm1 = aeskeygenassist result (only dword 3 is used)
 *          %xmm4 = scratch; dword 0 must be zero
 *          %rax  = write cursor
 *   Out:   %xmm0 = next block, %rax += 16
 *   Clobb: %xmm1, %xmm4 (dword 0 of %xmm4 keeps its value)
 *
 * The main entry first stores the incoming %xmm0 at (%rax).  The _cold entry
 * skips the store and the cursor update.
 * NOTE(review): %xmm4 is presumably cleared above the visible code -- confirm.
 */
4286.align	16
4287.Lkey_expansion_128:
4288	movups	%xmm0,(%rax)
4289	leaq	16(%rax),%rax
4290.Lkey_expansion_128_cold:
/* The two shufps/xorps pairs turn dword i of %xmm0 into the XOR of dwords
   0..i, provided dword 0 of %xmm4 is zero. */
4291	shufps	$16,%xmm0,%xmm4
4292	xorps	%xmm4,%xmm0
4293	shufps	$140,%xmm0,%xmm4
4294	xorps	%xmm4,%xmm0
/* $255 broadcasts dword 3 of %xmm1 to all four dwords. */
4295	shufps	$255,%xmm1,%xmm1
4296	xorps	%xmm1,%xmm0
/* rep ret */
4297	.byte	0xf3,0xc3
4298
/*
 * .Lkey_expansion_192a: local subroutine (call/ret) for the 192-bit schedule.
 * It has three entry points:
 *
 *   .Lkey_expansion_192a       stores %xmm0 at (%rax), %rax += 16, then
 *                              continues as _cold
 *   .Lkey_expansion_192a_cold  saves %xmm2 in %xmm5 (consumed by the next
 *                              .Lkey_expansion_192b call), then continues as
 *                              _warm
 *   .Lkey_expansion_192b_warm  the shared update step; .Lkey_expansion_192b
 *                              jumps here after doing its own stores
 *
 *   In:    %xmm0 = 16 bytes of schedule state
 *          %xmm2 = 8 more bytes of state in its low half
 *          %xmm1 = aeskeygenassist result (only dword 1 is used)
 *          %xmm4 = scratch; dword 0 must be zero
 *   Out:   %xmm0, %xmm2 updated
 *   Clobb: %xmm1, %xmm3, %xmm4 (dword 0 of %xmm4 keeps its value)
 *
 * The integer and floating-point flavoured instructions are interleaved as
 * generated; the order is kept as is.
 */
4299.align	16
4300.Lkey_expansion_192a:
4301	movups	%xmm0,(%rax)
4302	leaq	16(%rax),%rax
4303.Lkey_expansion_192a_cold:
4304	movaps	%xmm2,%xmm5
4305.Lkey_expansion_192b_warm:
/* shufps/xorps pairs: dword i of %xmm0 becomes the XOR of dwords 0..i.
   In parallel %xmm3 = %xmm2 shifted up by one dword. */
4306	shufps	$16,%xmm0,%xmm4
4307	movdqa	%xmm2,%xmm3
4308	xorps	%xmm4,%xmm0
4309	shufps	$140,%xmm0,%xmm4
4310	pslldq	$4,%xmm3
4311	xorps	%xmm4,%xmm0
/* $85 broadcasts dword 1 of %xmm1. */
4312	pshufd	$85,%xmm1,%xmm1
4313	pxor	%xmm3,%xmm2
4314	pxor	%xmm1,%xmm0
/* $255 broadcasts dword 3 of the new %xmm0 into the %xmm2 update. */
4315	pshufd	$255,%xmm0,%xmm3
4316	pxor	%xmm3,%xmm2
/* rep ret */
4317	.byte	0xf3,0xc3
4318
/*
 * .Lkey_expansion_192b: local subroutine for the 192-bit schedule.  It writes
 * 32 bytes at (%rax), advances %rax by 32 and tail-jumps into
 * .Lkey_expansion_192b_warm, whose ret returns to this routine's caller.
 *
 * Bytes written:
 *   (%rax)    low 64 bits of %xmm5 (saved by .Lkey_expansion_192a_cold),
 *             then low 64 bits of %xmm0
 *   16(%rax)  high 64 bits of %xmm0, then low 64 bits of %xmm2
 *
 * Unlike the 192a entries it does not refresh %xmm5 from %xmm2.
 */
4319.align	16
4320.Lkey_expansion_192b:
4321	movaps	%xmm0,%xmm3
/* $68: %xmm5 = { xmm5[0], xmm5[1], xmm0[0], xmm0[1] } */
4322	shufps	$68,%xmm0,%xmm5
4323	movups	%xmm5,(%rax)
/* $78: %xmm3 = { xmm0[2], xmm0[3], xmm2[0], xmm2[1] } */
4324	shufps	$78,%xmm2,%xmm3
4325	movups	%xmm3,16(%rax)
4326	leaq	32(%rax),%rax
4327	jmp	.Lkey_expansion_192b_warm
4328
/*
 * .Lkey_expansion_256a: local subroutine (call/ret) that updates %xmm0, the
 * first 16-byte half of the 256-bit schedule state.
 *
 *   In:    %xmm0 = half to update
 *          %xmm2 = other half; the main entry stores it at (%rax)
 *          %xmm1 = aeskeygenassist result (only dword 3 is used)
 *          %xmm4 = scratch; dword 0 must be zero
 *   Out:   %xmm0 updated; main entry also does %rax += 16
 *   Clobb: %xmm1, %xmm4 (dword 0 of %xmm4 keeps its value)
 *
 * The _cold entry skips the store and the cursor update.  From _cold onwards
 * the instructions are the same as in .Lkey_expansion_128_cold.
 */
4329.align	16
4330.Lkey_expansion_256a:
4331	movups	%xmm2,(%rax)
4332	leaq	16(%rax),%rax
4333.Lkey_expansion_256a_cold:
/* dword i of %xmm0 becomes the XOR of dwords 0..i. */
4334	shufps	$16,%xmm0,%xmm4
4335	xorps	%xmm4,%xmm0
4336	shufps	$140,%xmm0,%xmm4
4337	xorps	%xmm4,%xmm0
/* $255 broadcasts dword 3 of %xmm1. */
4338	shufps	$255,%xmm1,%xmm1
4339	xorps	%xmm1,%xmm0
/* rep ret */
4340	.byte	0xf3,0xc3
4341
/*
 * .Lkey_expansion_256b: local subroutine (call/ret) that stores %xmm0 at
 * (%rax), advances %rax by 16 and updates %xmm2, the second 16-byte half of
 * the 256-bit schedule state.
 *
 *   In:    %xmm0 = half to store
 *          %xmm2 = half to update
 *          %xmm1 = aeskeygenassist result (only dword 2 is used)
 *          %xmm4 = scratch; dword 0 must be zero
 *   Out:   %xmm2 updated, %rax += 16
 *   Clobb: %xmm1, %xmm4 (dword 0 of %xmm4 keeps its value)
 *
 * It mirrors .Lkey_expansion_256a with %xmm0 and %xmm2 swapped, except that
 * the broadcast selects dword 2 ($170) instead of dword 3 ($255).
 */
4342.align	16
4343.Lkey_expansion_256b:
4344	movups	%xmm0,(%rax)
4345	leaq	16(%rax),%rax
4346
/* dword i of %xmm2 becomes the XOR of dwords 0..i. */
4347	shufps	$16,%xmm2,%xmm4
4348	xorps	%xmm4,%xmm2
4349	shufps	$140,%xmm2,%xmm4
4350	xorps	%xmm4,%xmm2
/* $170 broadcasts dword 2 of %xmm1. */
4351	shufps	$170,%xmm1,%xmm1
4352	xorps	%xmm1,%xmm2
/* rep ret */
4353	.byte	0xf3,0xc3
4354.size	aesni_set_encrypt_key,.-aesni_set_encrypt_key
4355.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
/*
 * Constant tables.  The block starts on a 64-byte boundary and every table is
 * exactly 16 bytes, so each label is 16-byte aligned; the movdqa loads of
 * .Lkey_rotate, .Lkey_rotate192 and .Lkey_rcon1 in the key-schedule code
 * depend on that.  Tables not mentioned as used by the key schedule are
 * referenced from code outside this part of the file, so only their contents
 * are described.
 */
4356.align	64
/* Byte indices 15 down to 0. */
4357.Lbswap_mask:
4358.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* Dwords 6, 6, 6, 0. */
4359.Lincrement32:
4360.long	6,6,6,0
/* The value 1 in the lowest dword. */
4361.Lincrement64:
4362.long	1,0,0,0
/* Dwords 0x87, 0, 1, 0. */
4363.Lxts_magic:
4364.long	0x87,0,1,0
/* The value 1 in the highest byte (byte 15). */
4365.Lincrement1:
4366.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
/* pshufb mask: every destination dword takes source bytes 13,14,15,12.
   Loaded by .L14rounds_alt. */
4367.Lkey_rotate:
4368.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
/* pshufb mask: every destination dword takes source bytes 5,6,7,4.
   Loaded by .L12rounds_alt. */
4369.Lkey_rotate192:
4370.long	0x04070605,0x04070605,0x04070605,0x04070605
/* Initial round constant 1 in every dword; the loops double it with pslld. */
4371.Lkey_rcon1:
4372.long	1,1,1,1
/* The value 0x1b in every dword. */
4373.Lkey_rcon1b:
4374.long	0x1b,0x1b,0x1b,0x1b
4375
/* NUL-terminated ASCII: "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>" */
4376.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
4377.align	64
4378#endif
4379