1#if defined(__x86_64__)
2.text
3.extern	OPENSSL_ia32cap_P
4.hidden OPENSSL_ia32cap_P
/*
 * aesni_encrypt -- encrypt a single 16-byte block with AES-NI.
 *
 * In:    %rdi = source block (16 bytes, unaligned load)
 *        %rsi = destination block (16 bytes, unaligned store)
 *        %rdx = key schedule; a 32-bit loop count is read at offset 240
 * Out:   16 bytes written to (%rsi)
 * Clobb: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning
 */
.globl	aesni_encrypt
.hidden	aesni_encrypt
.type	aesni_encrypt,@function
.p2align	4
aesni_encrypt:
        movups      (%rdi), %xmm2           # xmm2 = input block
        movl        240(%rdx), %eax         # eax = number of aesenc rounds
        movups      (%rdx), %xmm0           # round key 0
        movups      16(%rdx), %xmm1         # round key 1
        leaq        32(%rdx), %rdx          # rdx -> round key 2
        xorps       %xmm0, %xmm2            # initial whitening
.Loop_enc1_1:
        aesenc      %xmm1, %xmm2
        decl        %eax                    # ZF consumed by the jnz below
        movups      (%rdx), %xmm1           # fetch next round key (flags untouched)
        leaq        16(%rdx), %rdx
        jnz         .Loop_enc1_1
        aesenclast  %xmm1, %xmm2
        pxor        %xmm0, %xmm0            # scrub key material
        pxor        %xmm1, %xmm1
        movups      %xmm2, (%rsi)           # store result
        pxor        %xmm2, %xmm2            # scrub the block itself
        .byte       0xf3,0xc3               # rep ret
.size	aesni_encrypt,.-aesni_encrypt
29
/*
 * aesni_decrypt -- decrypt a single 16-byte block with AES-NI.
 *
 * In:    %rdi = source block (16 bytes, unaligned load)
 *        %rsi = destination block (16 bytes, unaligned store)
 *        %rdx = key schedule; a 32-bit loop count is read at offset 240
 * Out:   16 bytes written to (%rsi)
 * Clobb: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning
 */
.globl	aesni_decrypt
.hidden	aesni_decrypt
.type	aesni_decrypt,@function
.p2align	4
aesni_decrypt:
        movups      (%rdi), %xmm2           # xmm2 = input block
        movl        240(%rdx), %eax         # eax = number of aesdec rounds
        movups      (%rdx), %xmm0           # round key 0
        movups      16(%rdx), %xmm1         # round key 1
        leaq        32(%rdx), %rdx          # rdx -> round key 2
        xorps       %xmm0, %xmm2            # initial whitening
.Loop_dec1_2:
        aesdec      %xmm1, %xmm2
        decl        %eax                    # ZF consumed by the jnz below
        movups      (%rdx), %xmm1           # fetch next round key (flags untouched)
        leaq        16(%rdx), %rdx
        jnz         .Loop_dec1_2
        aesdeclast  %xmm1, %xmm2
        pxor        %xmm0, %xmm0            # scrub key material
        pxor        %xmm1, %xmm1
        movups      %xmm2, (%rsi)           # store result
        pxor        %xmm2, %xmm2            # scrub the block itself
        .byte       0xf3,0xc3               # rep ret
.size	aesni_decrypt, .-aesni_decrypt
/*
 * _aesni_encrypt2 -- file-local helper: encrypt two blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2, %xmm3 = input blocks
 * Out:   %xmm2, %xmm3 = encrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * %rcx is advanced to key + 32 + 16*count and %rax runs upward from
 * 16 - 16*count in steps of 32; the round loop exits when %rax hits zero.
 */
.type	_aesni_encrypt2,@function
.p2align	4
_aesni_encrypt2:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2            # whiten both blocks
        xorps       %xmm0, %xmm3
        movups      32(%rcx), %xmm0         # round key 2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        addq        $16, %rax

.Lenc_loop2:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lenc_loop2

        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_encrypt2,.-_aesni_encrypt2
/*
 * _aesni_decrypt2 -- file-local helper: decrypt two blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2, %xmm3 = input blocks
 * Out:   %xmm2, %xmm3 = decrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * %rcx is advanced to key + 32 + 16*count and %rax runs upward from
 * 16 - 16*count in steps of 32; the round loop exits when %rax hits zero.
 */
.type	_aesni_decrypt2,@function
.p2align	4
_aesni_decrypt2:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2            # whiten both blocks
        xorps       %xmm0, %xmm3
        movups      32(%rcx), %xmm0         # round key 2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        addq        $16, %rax

.Ldec_loop2:
        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesdec      %xmm0, %xmm2
        aesdec      %xmm0, %xmm3
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Ldec_loop2

        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdeclast  %xmm0, %xmm2
        aesdeclast  %xmm0, %xmm3
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_decrypt2,.-_aesni_decrypt2
/*
 * _aesni_encrypt3 -- file-local helper: encrypt three blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm4 = input blocks
 * Out:   %xmm2-%xmm4 = encrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * Two round keys are kept in flight (%xmm0/%xmm1); %rax is a negative byte
 * offset from the end of the schedule and the loop stops when it is zero.
 */
.type	_aesni_encrypt3,@function
.p2align	4
_aesni_encrypt3:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2            # whiten all three blocks
        xorps       %xmm0, %xmm3
        xorps       %xmm0, %xmm4
        movups      32(%rcx), %xmm0         # round key 2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        addq        $16, %rax

.Lenc_loop3:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        aesenc      %xmm0, %xmm4
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lenc_loop3

        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3
        aesenclast  %xmm0, %xmm4
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_encrypt3,.-_aesni_encrypt3
/*
 * _aesni_decrypt3 -- file-local helper: decrypt three blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm4 = input blocks
 * Out:   %xmm2-%xmm4 = decrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * Two round keys are kept in flight (%xmm0/%xmm1); %rax is a negative byte
 * offset from the end of the schedule and the loop stops when it is zero.
 */
.type	_aesni_decrypt3,@function
.p2align	4
_aesni_decrypt3:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2            # whiten all three blocks
        xorps       %xmm0, %xmm3
        xorps       %xmm0, %xmm4
        movups      32(%rcx), %xmm0         # round key 2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        addq        $16, %rax

.Ldec_loop3:
        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesdec      %xmm0, %xmm2
        aesdec      %xmm0, %xmm3
        aesdec      %xmm0, %xmm4
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Ldec_loop3

        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
        aesdeclast  %xmm0, %xmm2
        aesdeclast  %xmm0, %xmm3
        aesdeclast  %xmm0, %xmm4
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_decrypt3,.-_aesni_decrypt3
/*
 * _aesni_encrypt4 -- file-local helper: encrypt four blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm5 = input blocks
 * Out:   %xmm2-%xmm5 = encrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * Same indexing scheme as the 2- and 3-block helpers: %rcx points past the
 * schedule and %rax climbs by 32 towards zero.
 */
.type	_aesni_encrypt4,@function
.p2align	4
_aesni_encrypt4:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2            # whiten all four blocks
        xorps       %xmm0, %xmm3
        xorps       %xmm0, %xmm4
        xorps       %xmm0, %xmm5
        movups      32(%rcx), %xmm0         # round key 2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        nopl        (%rax)                  # 3-byte NOP (0f 1f 00), padding only
        addq        $16, %rax

.Lenc_loop4:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
        aesenc      %xmm1, %xmm5
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        aesenc      %xmm0, %xmm4
        aesenc      %xmm0, %xmm5
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lenc_loop4

        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
        aesenc      %xmm1, %xmm5
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3
        aesenclast  %xmm0, %xmm4
        aesenclast  %xmm0, %xmm5
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_encrypt4,.-_aesni_encrypt4
/*
 * _aesni_decrypt4 -- file-local helper: decrypt four blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm5 = input blocks
 * Out:   %xmm2-%xmm5 = decrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * Same indexing scheme as the 2- and 3-block helpers: %rcx points past the
 * schedule and %rax climbs by 32 towards zero.
 */
.type	_aesni_decrypt4,@function
.p2align	4
_aesni_decrypt4:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2            # whiten all four blocks
        xorps       %xmm0, %xmm3
        xorps       %xmm0, %xmm4
        xorps       %xmm0, %xmm5
        movups      32(%rcx), %xmm0         # round key 2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        nopl        (%rax)                  # 3-byte NOP (0f 1f 00), padding only
        addq        $16, %rax

.Ldec_loop4:
        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
        aesdec      %xmm1, %xmm5
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesdec      %xmm0, %xmm2
        aesdec      %xmm0, %xmm3
        aesdec      %xmm0, %xmm4
        aesdec      %xmm0, %xmm5
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Ldec_loop4

        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
        aesdec      %xmm1, %xmm5
        aesdeclast  %xmm0, %xmm2
        aesdeclast  %xmm0, %xmm3
        aesdeclast  %xmm0, %xmm4
        aesdeclast  %xmm0, %xmm5
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_decrypt4,.-_aesni_decrypt4
/*
 * _aesni_encrypt6 -- file-local helper: encrypt six blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm7 = input blocks
 * Out:   %xmm2-%xmm7 = encrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * The first round for %xmm2-%xmm4 is interleaved with the whitening of
 * %xmm5-%xmm7, then execution joins the loop at .Lenc_loop6_enter.
 * The internal labels must keep their names: .Lenc_loop6 is also used as a
 * call target from other code in this file.
 */
.type	_aesni_encrypt6,@function
.p2align	4
_aesni_encrypt6:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2
        pxor        %xmm0, %xmm3
        pxor        %xmm0, %xmm4
        aesenc      %xmm1, %xmm2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        aesenc      %xmm1, %xmm3
        pxor        %xmm0, %xmm5
        pxor        %xmm0, %xmm6
        aesenc      %xmm1, %xmm4
        pxor        %xmm0, %xmm7
        movups      (%rcx,%rax,1), %xmm0    # round key 2
        addq        $16, %rax
        jmp         .Lenc_loop6_enter
.p2align	4
.Lenc_loop6:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
.Lenc_loop6_enter:
        aesenc      %xmm1, %xmm5
        aesenc      %xmm1, %xmm6
        aesenc      %xmm1, %xmm7
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        aesenc      %xmm0, %xmm4
        aesenc      %xmm0, %xmm5
        aesenc      %xmm0, %xmm6
        aesenc      %xmm0, %xmm7
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lenc_loop6

        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
        aesenc      %xmm1, %xmm5
        aesenc      %xmm1, %xmm6
        aesenc      %xmm1, %xmm7
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3
        aesenclast  %xmm0, %xmm4
        aesenclast  %xmm0, %xmm5
        aesenclast  %xmm0, %xmm6
        aesenclast  %xmm0, %xmm7
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_encrypt6,.-_aesni_encrypt6
/*
 * _aesni_decrypt6 -- file-local helper: decrypt six blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm7 = input blocks
 * Out:   %xmm2-%xmm7 = decrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * The first round for %xmm2-%xmm4 is interleaved with the whitening of
 * %xmm5-%xmm7, then execution joins the loop at .Ldec_loop6_enter.
 * Internal label names are kept unchanged in case other code in this file
 * branches to them.
 */
.type	_aesni_decrypt6,@function
.p2align	4
_aesni_decrypt6:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2
        pxor        %xmm0, %xmm3
        pxor        %xmm0, %xmm4
        aesdec      %xmm1, %xmm2
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        aesdec      %xmm1, %xmm3
        pxor        %xmm0, %xmm5
        pxor        %xmm0, %xmm6
        aesdec      %xmm1, %xmm4
        pxor        %xmm0, %xmm7
        movups      (%rcx,%rax,1), %xmm0    # round key 2
        addq        $16, %rax
        jmp         .Ldec_loop6_enter
.p2align	4
.Ldec_loop6:
        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
.Ldec_loop6_enter:
        aesdec      %xmm1, %xmm5
        aesdec      %xmm1, %xmm6
        aesdec      %xmm1, %xmm7
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesdec      %xmm0, %xmm2
        aesdec      %xmm0, %xmm3
        aesdec      %xmm0, %xmm4
        aesdec      %xmm0, %xmm5
        aesdec      %xmm0, %xmm6
        aesdec      %xmm0, %xmm7
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Ldec_loop6

        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
        aesdec      %xmm1, %xmm5
        aesdec      %xmm1, %xmm6
        aesdec      %xmm1, %xmm7
        aesdeclast  %xmm0, %xmm2
        aesdeclast  %xmm0, %xmm3
        aesdeclast  %xmm0, %xmm4
        aesdeclast  %xmm0, %xmm5
        aesdeclast  %xmm0, %xmm6
        aesdeclast  %xmm0, %xmm7
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_decrypt6,.-_aesni_decrypt6
/*
 * _aesni_encrypt8 -- file-local helper: encrypt eight blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm9 = input blocks
 * Out:   %xmm2-%xmm9 = encrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * The first round for %xmm2/%xmm3 overlaps the whitening of the remaining
 * blocks; execution then joins the loop at .Lenc_loop8_inner.
 * Internal label names are kept unchanged in case other code in this file
 * branches to them.
 */
.type	_aesni_encrypt8,@function
.p2align	4
_aesni_encrypt8:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2
        xorps       %xmm0, %xmm3
        pxor        %xmm0, %xmm4
        pxor        %xmm0, %xmm5
        pxor        %xmm0, %xmm6
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        aesenc      %xmm1, %xmm2
        pxor        %xmm0, %xmm7
        pxor        %xmm0, %xmm8
        aesenc      %xmm1, %xmm3
        pxor        %xmm0, %xmm9
        movups      (%rcx,%rax,1), %xmm0    # round key 2
        addq        $16, %rax
        jmp         .Lenc_loop8_inner
.p2align	4
.Lenc_loop8:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
.Lenc_loop8_inner:
        aesenc      %xmm1, %xmm4
        aesenc      %xmm1, %xmm5
        aesenc      %xmm1, %xmm6
        aesenc      %xmm1, %xmm7
        aesenc      %xmm1, %xmm8
        aesenc      %xmm1, %xmm9
.Lenc_loop8_enter:
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        aesenc      %xmm0, %xmm4
        aesenc      %xmm0, %xmm5
        aesenc      %xmm0, %xmm6
        aesenc      %xmm0, %xmm7
        aesenc      %xmm0, %xmm8
        aesenc      %xmm0, %xmm9
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lenc_loop8

        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenc      %xmm1, %xmm4
        aesenc      %xmm1, %xmm5
        aesenc      %xmm1, %xmm6
        aesenc      %xmm1, %xmm7
        aesenc      %xmm1, %xmm8
        aesenc      %xmm1, %xmm9
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3
        aesenclast  %xmm0, %xmm4
        aesenclast  %xmm0, %xmm5
        aesenclast  %xmm0, %xmm6
        aesenclast  %xmm0, %xmm7
        aesenclast  %xmm0, %xmm8
        aesenclast  %xmm0, %xmm9
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_encrypt8,.-_aesni_encrypt8
/*
 * _aesni_decrypt8 -- file-local helper: decrypt eight blocks in parallel.
 *
 * In:    %rcx = key schedule
 *        %eax = loop count (the value callers load from key offset 240)
 *        %xmm2-%xmm9 = input blocks
 * Out:   %xmm2-%xmm9 = decrypted blocks
 * Clobb: %rax, %rcx, %xmm0, %xmm1, flags
 *
 * The first round for %xmm2/%xmm3 overlaps the whitening of the remaining
 * blocks; execution then joins the loop at .Ldec_loop8_inner.
 * Internal label names are kept unchanged in case other code in this file
 * branches to them.
 */
.type	_aesni_decrypt8,@function
.p2align	4
_aesni_decrypt8:
        movups      (%rcx), %xmm0           # round key 0
        shll        $4, %eax                # eax = 16 * count
        movups      16(%rcx), %xmm1         # round key 1
        xorps       %xmm0, %xmm2
        xorps       %xmm0, %xmm3
        pxor        %xmm0, %xmm4
        pxor        %xmm0, %xmm5
        pxor        %xmm0, %xmm6
        leaq        32(%rcx,%rax,1), %rcx
        negq        %rax
        aesdec      %xmm1, %xmm2
        pxor        %xmm0, %xmm7
        pxor        %xmm0, %xmm8
        aesdec      %xmm1, %xmm3
        pxor        %xmm0, %xmm9
        movups      (%rcx,%rax,1), %xmm0    # round key 2
        addq        $16, %rax
        jmp         .Ldec_loop8_inner
.p2align	4
.Ldec_loop8:
        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
.Ldec_loop8_inner:
        aesdec      %xmm1, %xmm4
        aesdec      %xmm1, %xmm5
        aesdec      %xmm1, %xmm6
        aesdec      %xmm1, %xmm7
        aesdec      %xmm1, %xmm8
        aesdec      %xmm1, %xmm9
.Ldec_loop8_enter:
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesdec      %xmm0, %xmm2
        aesdec      %xmm0, %xmm3
        aesdec      %xmm0, %xmm4
        aesdec      %xmm0, %xmm5
        aesdec      %xmm0, %xmm6
        aesdec      %xmm0, %xmm7
        aesdec      %xmm0, %xmm8
        aesdec      %xmm0, %xmm9
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Ldec_loop8

        aesdec      %xmm1, %xmm2
        aesdec      %xmm1, %xmm3
        aesdec      %xmm1, %xmm4
        aesdec      %xmm1, %xmm5
        aesdec      %xmm1, %xmm6
        aesdec      %xmm1, %xmm7
        aesdec      %xmm1, %xmm8
        aesdec      %xmm1, %xmm9
        aesdeclast  %xmm0, %xmm2
        aesdeclast  %xmm0, %xmm3
        aesdeclast  %xmm0, %xmm4
        aesdeclast  %xmm0, %xmm5
        aesdeclast  %xmm0, %xmm6
        aesdeclast  %xmm0, %xmm7
        aesdeclast  %xmm0, %xmm8
        aesdeclast  %xmm0, %xmm9
        .byte       0xf3,0xc3               # rep ret
.size	_aesni_decrypt8,.-_aesni_decrypt8
/*
 * aesni_ecb_encrypt -- ECB-mode bulk encryption / decryption.
 *
 * In:    %rdi = input buffer
 *        %rsi = output buffer
 *        %rdx = length in bytes; rounded down to a multiple of 16, and the
 *               routine returns immediately if the result is zero
 *        %rcx = key schedule (loop count read at offset 240)
 *        %r8d = direction: non-zero encrypts, zero decrypts
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm9, flags
 *
 * Register roles while running:
 *        %r11  = saved key pointer, %r10d = saved loop count; both are
 *                reloaded into %rcx/%eax before each helper call because the
 *                helpers overwrite them
 *        %xmm2-%xmm9 = up to eight blocks in flight
 *
 * Inputs of 128 bytes or more go through an 8-block loop; the remaining
 * 1..7 blocks are dispatched to the matching N-block helper. Five blocks
 * use the 6-block helper and seven use the 8-block helper, with the spare
 * register zeroed. The decrypt side additionally clears the data registers
 * after storing them; %xmm0/%xmm1 are cleared on every exit path.
 */
.globl	aesni_ecb_encrypt
.hidden	aesni_ecb_encrypt
.type	aesni_ecb_encrypt,@function
.p2align	4
aesni_ecb_encrypt:
        andq        $-16, %rdx              # whole blocks only
        jz          .Lecb_ret

        movl        240(%rcx), %eax         # eax = loop count
        movups      (%rcx), %xmm0
        movq        %rcx, %r11              # keep key pointer
        movl        %eax, %r10d             # keep loop count
        testl       %r8d, %r8d
        jz          .Lecb_decrypt

        cmpq        $128, %rdx
        jb          .Lecb_enc_tail

        movdqu      (%rdi), %xmm2           # prime the first eight blocks
        movdqu      16(%rdi), %xmm3
        movdqu      32(%rdi), %xmm4
        movdqu      48(%rdi), %xmm5
        movdqu      64(%rdi), %xmm6
        movdqu      80(%rdi), %xmm7
        movdqu      96(%rdi), %xmm8
        movdqu      112(%rdi), %xmm9
        leaq        128(%rdi), %rdi
        subq        $128, %rdx
        jmp         .Lecb_enc_loop8_enter
.p2align	4
.Lecb_enc_loop8:
        movups      %xmm2, (%rsi)           # store previous batch, load next
        movq        %r11, %rcx
        movdqu      (%rdi), %xmm2
        movl        %r10d, %eax
        movups      %xmm3, 16(%rsi)
        movdqu      16(%rdi), %xmm3
        movups      %xmm4, 32(%rsi)
        movdqu      32(%rdi), %xmm4
        movups      %xmm5, 48(%rsi)
        movdqu      48(%rdi), %xmm5
        movups      %xmm6, 64(%rsi)
        movdqu      64(%rdi), %xmm6
        movups      %xmm7, 80(%rsi)
        movdqu      80(%rdi), %xmm7
        movups      %xmm8, 96(%rsi)
        movdqu      96(%rdi), %xmm8
        movups      %xmm9, 112(%rsi)
        leaq        128(%rsi), %rsi
        movdqu      112(%rdi), %xmm9
        leaq        128(%rdi), %rdi
.Lecb_enc_loop8_enter:

        call        _aesni_encrypt8

        subq        $128, %rdx
        jnc         .Lecb_enc_loop8         # another full batch is available

        movups      %xmm2, (%rsi)           # flush the final full batch
        movq        %r11, %rcx              # restore key/count for the tail
        movups      %xmm3, 16(%rsi)
        movl        %r10d, %eax
        movups      %xmm4, 32(%rsi)
        movups      %xmm5, 48(%rsi)
        movups      %xmm6, 64(%rsi)
        movups      %xmm7, 80(%rsi)
        movups      %xmm8, 96(%rsi)
        movups      %xmm9, 112(%rsi)
        leaq        128(%rsi), %rsi
        addq        $128, %rdx              # rdx = leftover bytes (0..112)
        jz          .Lecb_ret

.Lecb_enc_tail:
        movups      (%rdi), %xmm2           # loads between cmp and jcc keep flags
        cmpq        $32, %rdx
        jb          .Lecb_enc_one
        movups      16(%rdi), %xmm3
        je          .Lecb_enc_two
        movups      32(%rdi), %xmm4
        cmpq        $64, %rdx
        jb          .Lecb_enc_three
        movups      48(%rdi), %xmm5
        je          .Lecb_enc_four
        movups      64(%rdi), %xmm6
        cmpq        $96, %rdx
        jb          .Lecb_enc_five
        movups      80(%rdi), %xmm7
        je          .Lecb_enc_six
        movdqu      96(%rdi), %xmm8         # seven blocks left
        xorps       %xmm9, %xmm9            # eighth lane is a dummy
        call        _aesni_encrypt8
        movups      %xmm2, (%rsi)
        movups      %xmm3, 16(%rsi)
        movups      %xmm4, 32(%rsi)
        movups      %xmm5, 48(%rsi)
        movups      %xmm6, 64(%rsi)
        movups      %xmm7, 80(%rsi)
        movups      %xmm8, 96(%rsi)
        jmp         .Lecb_ret
.p2align	4
.Lecb_enc_one:
        movups      (%rcx), %xmm0           # inline single-block encrypt
        movups      16(%rcx), %xmm1
        leaq        32(%rcx), %rcx
        xorps       %xmm0, %xmm2
.Loop_enc1_3:
        aesenc      %xmm1, %xmm2
        decl        %eax                    # ZF consumed by the jnz below
        movups      (%rcx), %xmm1
        leaq        16(%rcx), %rcx
        jnz         .Loop_enc1_3
        aesenclast  %xmm1, %xmm2
        movups      %xmm2, (%rsi)
        jmp         .Lecb_ret
.p2align	4
.Lecb_enc_two:
        call        _aesni_encrypt2
        movups      %xmm2, (%rsi)
        movups      %xmm3, 16(%rsi)
        jmp         .Lecb_ret
.p2align	4
.Lecb_enc_three:
        call        _aesni_encrypt3
        movups      %xmm2, (%rsi)
        movups      %xmm3, 16(%rsi)
        movups      %xmm4, 32(%rsi)
        jmp         .Lecb_ret
.p2align	4
.Lecb_enc_four:
        call        _aesni_encrypt4
        movups      %xmm2, (%rsi)
        movups      %xmm3, 16(%rsi)
        movups      %xmm4, 32(%rsi)
        movups      %xmm5, 48(%rsi)
        jmp         .Lecb_ret
.p2align	4
.Lecb_enc_five:
        xorps       %xmm7, %xmm7            # sixth lane is a dummy
        call        _aesni_encrypt6
        movups      %xmm2, (%rsi)
        movups      %xmm3, 16(%rsi)
        movups      %xmm4, 32(%rsi)
        movups      %xmm5, 48(%rsi)
        movups      %xmm6, 64(%rsi)
        jmp         .Lecb_ret
.p2align	4
.Lecb_enc_six:
        call        _aesni_encrypt6
        movups      %xmm2, (%rsi)
        movups      %xmm3, 16(%rsi)
        movups      %xmm4, 32(%rsi)
        movups      %xmm5, 48(%rsi)
        movups      %xmm6, 64(%rsi)
        movups      %xmm7, 80(%rsi)
        jmp         .Lecb_ret

.p2align	4
.Lecb_decrypt:
        cmpq        $128, %rdx
        jb          .Lecb_dec_tail

        movdqu      (%rdi), %xmm2           # prime the first eight blocks
        movdqu      16(%rdi), %xmm3
        movdqu      32(%rdi), %xmm4
        movdqu      48(%rdi), %xmm5
        movdqu      64(%rdi), %xmm6
        movdqu      80(%rdi), %xmm7
        movdqu      96(%rdi), %xmm8
        movdqu      112(%rdi), %xmm9
        leaq        128(%rdi), %rdi
        subq        $128, %rdx
        jmp         .Lecb_dec_loop8_enter
.p2align	4
.Lecb_dec_loop8:
        movups      %xmm2, (%rsi)           # store previous batch, load next
        movq        %r11, %rcx
        movdqu      (%rdi), %xmm2
        movl        %r10d, %eax
        movups      %xmm3, 16(%rsi)
        movdqu      16(%rdi), %xmm3
        movups      %xmm4, 32(%rsi)
        movdqu      32(%rdi), %xmm4
        movups      %xmm5, 48(%rsi)
        movdqu      48(%rdi), %xmm5
        movups      %xmm6, 64(%rsi)
        movdqu      64(%rdi), %xmm6
        movups      %xmm7, 80(%rsi)
        movdqu      80(%rdi), %xmm7
        movups      %xmm8, 96(%rsi)
        movdqu      96(%rdi), %xmm8
        movups      %xmm9, 112(%rsi)
        leaq        128(%rsi), %rsi
        movdqu      112(%rdi), %xmm9
        leaq        128(%rdi), %rdi
.Lecb_dec_loop8_enter:

        call        _aesni_decrypt8

        movups      (%r11), %xmm0           # reload round key 0
        subq        $128, %rdx
        jnc         .Lecb_dec_loop8         # another full batch is available

        movups      %xmm2, (%rsi)           # flush final batch, wiping as we go
        pxor        %xmm2, %xmm2
        movq        %r11, %rcx              # restore key/count for the tail
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        movl        %r10d, %eax
        movups      %xmm4, 32(%rsi)
        pxor        %xmm4, %xmm4
        movups      %xmm5, 48(%rsi)
        pxor        %xmm5, %xmm5
        movups      %xmm6, 64(%rsi)
        pxor        %xmm6, %xmm6
        movups      %xmm7, 80(%rsi)
        pxor        %xmm7, %xmm7
        movups      %xmm8, 96(%rsi)
        pxor        %xmm8, %xmm8
        movups      %xmm9, 112(%rsi)
        pxor        %xmm9, %xmm9
        leaq        128(%rsi), %rsi
        addq        $128, %rdx              # rdx = leftover bytes (0..112)
        jz          .Lecb_ret

.Lecb_dec_tail:
        movups      (%rdi), %xmm2           # loads between cmp and jcc keep flags
        cmpq        $32, %rdx
        jb          .Lecb_dec_one
        movups      16(%rdi), %xmm3
        je          .Lecb_dec_two
        movups      32(%rdi), %xmm4
        cmpq        $64, %rdx
        jb          .Lecb_dec_three
        movups      48(%rdi), %xmm5
        je          .Lecb_dec_four
        movups      64(%rdi), %xmm6
        cmpq        $96, %rdx
        jb          .Lecb_dec_five
        movups      80(%rdi), %xmm7
        je          .Lecb_dec_six
        movups      96(%rdi), %xmm8         # seven blocks left
        movups      (%rcx), %xmm0
        xorps       %xmm9, %xmm9            # eighth lane is a dummy
        call        _aesni_decrypt8
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        movups      %xmm4, 32(%rsi)
        pxor        %xmm4, %xmm4
        movups      %xmm5, 48(%rsi)
        pxor        %xmm5, %xmm5
        movups      %xmm6, 64(%rsi)
        pxor        %xmm6, %xmm6
        movups      %xmm7, 80(%rsi)
        pxor        %xmm7, %xmm7
        movups      %xmm8, 96(%rsi)
        pxor        %xmm8, %xmm8
        pxor        %xmm9, %xmm9
        jmp         .Lecb_ret
.p2align	4
.Lecb_dec_one:
        movups      (%rcx), %xmm0           # inline single-block decrypt
        movups      16(%rcx), %xmm1
        leaq        32(%rcx), %rcx
        xorps       %xmm0, %xmm2
.Loop_dec1_4:
        aesdec      %xmm1, %xmm2
        decl        %eax                    # ZF consumed by the jnz below
        movups      (%rcx), %xmm1
        leaq        16(%rcx), %rcx
        jnz         .Loop_dec1_4
        aesdeclast  %xmm1, %xmm2
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        jmp         .Lecb_ret
.p2align	4
.Lecb_dec_two:
        call        _aesni_decrypt2
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        jmp         .Lecb_ret
.p2align	4
.Lecb_dec_three:
        call        _aesni_decrypt3
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        movups      %xmm4, 32(%rsi)
        pxor        %xmm4, %xmm4
        jmp         .Lecb_ret
.p2align	4
.Lecb_dec_four:
        call        _aesni_decrypt4
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        movups      %xmm4, 32(%rsi)
        pxor        %xmm4, %xmm4
        movups      %xmm5, 48(%rsi)
        pxor        %xmm5, %xmm5
        jmp         .Lecb_ret
.p2align	4
.Lecb_dec_five:
        xorps       %xmm7, %xmm7            # sixth lane is a dummy
        call        _aesni_decrypt6
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        movups      %xmm4, 32(%rsi)
        pxor        %xmm4, %xmm4
        movups      %xmm5, 48(%rsi)
        pxor        %xmm5, %xmm5
        movups      %xmm6, 64(%rsi)
        pxor        %xmm6, %xmm6
        pxor        %xmm7, %xmm7
        jmp         .Lecb_ret
.p2align	4
.Lecb_dec_six:
        call        _aesni_decrypt6
        movups      %xmm2, (%rsi)
        pxor        %xmm2, %xmm2
        movups      %xmm3, 16(%rsi)
        pxor        %xmm3, %xmm3
        movups      %xmm4, 32(%rsi)
        pxor        %xmm4, %xmm4
        movups      %xmm5, 48(%rsi)
        pxor        %xmm5, %xmm5
        movups      %xmm6, 64(%rsi)
        pxor        %xmm6, %xmm6
        movups      %xmm7, 80(%rsi)
        pxor        %xmm7, %xmm7
                                            # falls through to the common exit

.Lecb_ret:
        xorps       %xmm0, %xmm0            # scrub round-key registers
        pxor        %xmm1, %xmm1
        .byte       0xf3,0xc3               # rep ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
/*
 * aesni_ccm64_encrypt_blocks -- CCM-style encryption: each input block is
 * XORed with an encrypted counter block while a running MAC value is
 * updated with the same input block.
 *
 * In:    %rdi = input, %rsi = output
 *        %rdx = number of 16-byte blocks (the loop body runs before the
 *               count is tested, so at least one block is processed)
 *        %rcx = key schedule (loop count read at offset 240)
 *        %r8  = 16-byte counter block (read only)
 *        %r9  = 16-byte MAC value (read, then overwritten with the result)
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm3,
 *        %xmm6-%xmm9, flags
 *
 * Register roles:
 *        %xmm6 = counter kept in byte-swapped form so paddq can bump it
 *        %xmm7 = .Lbswap_mask, %xmm9 = .Lincrement64 (constants defined
 *                elsewhere in this file)
 *        %xmm2 = counter block being encrypted, %xmm3 = MAC being encrypted
 *        %xmm8 = current input block
 *        %r11  = key base, %rcx = key end, %r10 = 16 - 16*count
 */
.globl	aesni_ccm64_encrypt_blocks
.hidden	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,@function
.p2align	4
aesni_ccm64_encrypt_blocks:
        movl        240(%rcx), %eax
        movdqu      (%r8), %xmm6            # counter block
        movdqa      .Lincrement64(%rip), %xmm9
        movdqa      .Lbswap_mask(%rip), %xmm7

        shll        $4, %eax                # eax = 16 * count
        movl        $16, %r10d
        leaq        0(%rcx), %r11           # r11 = key base
        movdqu      (%r9), %xmm3            # running MAC
        movdqa      %xmm6, %xmm2
        leaq        32(%rcx,%rax,1), %rcx   # rcx = key end
        pshufb      %xmm7, %xmm6            # counter -> byte-swapped form
        subq        %rax, %r10              # r10 = 16 - 16*count
        jmp         .Lccm64_enc_outer
.p2align	4
.Lccm64_enc_outer:
        movups      (%r11), %xmm0           # round key 0
        movq        %r10, %rax
        movups      (%rdi), %xmm8           # input block

        xorps       %xmm0, %xmm2            # whiten counter
        movups      16(%r11), %xmm1
        xorps       %xmm8, %xmm0            # key0 ^ input
        xorps       %xmm0, %xmm3            # MAC ^= input, whitened
        movups      32(%r11), %xmm0

.Lccm64_enc2_loop:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lccm64_enc2_loop
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        paddq       %xmm9, %xmm6            # bump counter
        decq        %rdx                    # ZF survives until the jnz at loop end
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3

        leaq        16(%rdi), %rdi
        xorps       %xmm2, %xmm8            # output = input ^ E(counter)
        movdqa      %xmm6, %xmm2
        movups      %xmm8, (%rsi)
        pshufb      %xmm7, %xmm2            # next counter, original byte order
        leaq        16(%rsi), %rsi
        jnz         .Lccm64_enc_outer

        pxor        %xmm0, %xmm0            # scrub working registers
        pxor        %xmm1, %xmm1
        pxor        %xmm2, %xmm2
        movups      %xmm3, (%r9)            # write back the MAC
        pxor        %xmm3, %xmm3
        pxor        %xmm8, %xmm8
        pxor        %xmm6, %xmm6
        .byte       0xf3,0xc3               # rep ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
/*
 * aesni_ccm64_decrypt_blocks -- CCM-style decryption: each input block is
 * XORed with an encrypted counter block and the resulting output block is
 * folded into a running MAC value.
 *
 * In:    %rdi = input, %rsi = output
 *        %rdx = number of 16-byte blocks (the first block is processed
 *               before the count is tested)
 *        %rcx = key schedule (loop count read at offset 240)
 *        %r8  = 16-byte counter block (read only)
 *        %r9  = 16-byte MAC value (read, then overwritten with the result)
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm3,
 *        %xmm6-%xmm9, flags
 *
 * Flow: the first counter block is encrypted on its own; afterwards each
 * iteration encrypts the next counter block and the MAC side by side. The
 * last MAC update is done alone at .Lccm64_dec_break.
 *
 * Register roles:
 *        %xmm6 = counter kept in byte-swapped form so paddq can bump it
 *        %xmm7 = .Lbswap_mask, %xmm9 = .Lincrement64 (constants defined
 *                elsewhere in this file)
 *        %xmm2 = counter block being encrypted, %xmm3 = MAC being encrypted
 *        %xmm8 = current input, then output, block
 *        %r11  = key base, %rcx = key end, %r10 = 16 - 16*count
 */
.globl	aesni_ccm64_decrypt_blocks
.hidden	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,@function
.p2align	4
aesni_ccm64_decrypt_blocks:
        movl        240(%rcx), %eax
        movups      (%r8), %xmm6            # counter block
        movdqu      (%r9), %xmm3            # running MAC
        movdqa      .Lincrement64(%rip), %xmm9
        movdqa      .Lbswap_mask(%rip), %xmm7

        movaps      %xmm6, %xmm2
        movl        %eax, %r10d             # keep loop count
        movq        %rcx, %r11              # keep key base
        pshufb      %xmm7, %xmm6            # counter -> byte-swapped form
        movups      (%rcx), %xmm0           # encrypt the first counter block
        movups      16(%rcx), %xmm1
        leaq        32(%rcx), %rcx
        xorps       %xmm0, %xmm2
.Loop_enc1_5:
        aesenc      %xmm1, %xmm2
        decl        %eax                    # ZF consumed by the jnz below
        movups      (%rcx), %xmm1
        leaq        16(%rcx), %rcx
        jnz         .Loop_enc1_5
        aesenclast  %xmm1, %xmm2
        shll        $4, %r10d               # r10 = 16 * count
        movl        $16, %eax
        movups      (%rdi), %xmm8           # first input block
        paddq       %xmm9, %xmm6            # bump counter
        leaq        16(%rdi), %rdi
        subq        %r10, %rax              # rax = 16 - 16*count
        leaq        32(%r11,%r10,1), %rcx   # rcx = key end
        movq        %rax, %r10
        jmp         .Lccm64_dec_outer
.p2align	4
.Lccm64_dec_outer:
        xorps       %xmm2, %xmm8            # output = input ^ E(counter)
        movdqa      %xmm6, %xmm2
        movups      %xmm8, (%rsi)
        leaq        16(%rsi), %rsi
        pshufb      %xmm7, %xmm2            # next counter, original byte order

        subq        $1, %rdx
        jz          .Lccm64_dec_break

        movups      (%r11), %xmm0           # round key 0
        movq        %r10, %rax
        movups      16(%r11), %xmm1
        xorps       %xmm0, %xmm8            # output ^ key0
        xorps       %xmm0, %xmm2            # whiten counter
        xorps       %xmm8, %xmm3            # MAC ^= output, whitened
        movups      32(%r11), %xmm0
        jmp         .Lccm64_dec2_loop
.p2align	4
.Lccm64_dec2_loop:
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        movups      (%rcx,%rax,1), %xmm1
        addq        $32, %rax               # ZF consumed by the jnz below
        aesenc      %xmm0, %xmm2
        aesenc      %xmm0, %xmm3
        movups      -16(%rcx,%rax,1), %xmm0
        jnz         .Lccm64_dec2_loop
        movups      (%rdi), %xmm8           # next input block
        paddq       %xmm9, %xmm6            # bump counter
        aesenc      %xmm1, %xmm2
        aesenc      %xmm1, %xmm3
        aesenclast  %xmm0, %xmm2
        aesenclast  %xmm0, %xmm3
        leaq        16(%rdi), %rdi
        jmp         .Lccm64_dec_outer

.p2align	4
.Lccm64_dec_break:

        movl        240(%r11), %eax         # final MAC update, single lane
        movups      (%r11), %xmm0
        movups      16(%r11), %xmm1
        xorps       %xmm0, %xmm8
        leaq        32(%r11), %r11
        xorps       %xmm8, %xmm3
.Loop_enc1_6:
        aesenc      %xmm1, %xmm3
        decl        %eax                    # ZF consumed by the jnz below
        movups      (%r11), %xmm1
        leaq        16(%r11), %r11
        jnz         .Loop_enc1_6
        aesenclast  %xmm1, %xmm3
        pxor        %xmm0, %xmm0            # scrub working registers
        pxor        %xmm1, %xmm1
        pxor        %xmm2, %xmm2
        movups      %xmm3, (%r9)            # write back the MAC
        pxor        %xmm3, %xmm3
        pxor        %xmm8, %xmm8
        pxor        %xmm6, %xmm6
        .byte       0xf3,0xc3               # rep ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1001.globl	aesni_ctr32_encrypt_blocks
1002.hidden aesni_ctr32_encrypt_blocks
1003.type	aesni_ctr32_encrypt_blocks,@function
1004.align	16
1005aesni_ctr32_encrypt_blocks:
1006	cmpq	$1,%rdx
1007	jne	.Lctr32_bulk
1008
1009
1010
1011	movups	(%r8),%xmm2
1012	movups	(%rdi),%xmm3
1013	movl	240(%rcx),%edx
1014	movups	(%rcx),%xmm0
1015	movups	16(%rcx),%xmm1
1016	leaq	32(%rcx),%rcx
1017	xorps	%xmm0,%xmm2
1018.Loop_enc1_7:
1019.byte	102,15,56,220,209
1020	decl	%edx
1021	movups	(%rcx),%xmm1
1022	leaq	16(%rcx),%rcx
1023	jnz	.Loop_enc1_7
1024.byte	102,15,56,221,209
1025	pxor	%xmm0,%xmm0
1026	pxor	%xmm1,%xmm1
1027	xorps	%xmm3,%xmm2
1028	pxor	%xmm3,%xmm3
1029	movups	%xmm2,(%rsi)
1030	xorps	%xmm2,%xmm2
1031	jmp	.Lctr32_epilogue
1032
1033.align	16
1034.Lctr32_bulk:
1035	leaq	(%rsp),%rax
1036	pushq	%rbp
1037	subq	$128,%rsp
1038	andq	$-16,%rsp
1039	leaq	-8(%rax),%rbp
1040
1041
1042
1043
1044	movdqu	(%r8),%xmm2
1045	movdqu	(%rcx),%xmm0
1046	movl	12(%r8),%r8d
1047	pxor	%xmm0,%xmm2
1048	movl	12(%rcx),%r11d
1049	movdqa	%xmm2,0(%rsp)
1050	bswapl	%r8d
1051	movdqa	%xmm2,%xmm3
1052	movdqa	%xmm2,%xmm4
1053	movdqa	%xmm2,%xmm5
1054	movdqa	%xmm2,64(%rsp)
1055	movdqa	%xmm2,80(%rsp)
1056	movdqa	%xmm2,96(%rsp)
1057	movq	%rdx,%r10
1058	movdqa	%xmm2,112(%rsp)
1059
1060	leaq	1(%r8),%rax
1061	leaq	2(%r8),%rdx
1062	bswapl	%eax
1063	bswapl	%edx
1064	xorl	%r11d,%eax
1065	xorl	%r11d,%edx
1066.byte	102,15,58,34,216,3
1067	leaq	3(%r8),%rax
1068	movdqa	%xmm3,16(%rsp)
1069.byte	102,15,58,34,226,3
1070	bswapl	%eax
1071	movq	%r10,%rdx
1072	leaq	4(%r8),%r10
1073	movdqa	%xmm4,32(%rsp)
1074	xorl	%r11d,%eax
1075	bswapl	%r10d
1076.byte	102,15,58,34,232,3
1077	xorl	%r11d,%r10d
1078	movdqa	%xmm5,48(%rsp)
1079	leaq	5(%r8),%r9
1080	movl	%r10d,64+12(%rsp)
1081	bswapl	%r9d
1082	leaq	6(%r8),%r10
1083	movl	240(%rcx),%eax
1084	xorl	%r11d,%r9d
1085	bswapl	%r10d
1086	movl	%r9d,80+12(%rsp)
1087	xorl	%r11d,%r10d
1088	leaq	7(%r8),%r9
1089	movl	%r10d,96+12(%rsp)
1090	bswapl	%r9d
1091	movl	OPENSSL_ia32cap_P+4(%rip),%r10d
1092	xorl	%r11d,%r9d
1093	andl	$71303168,%r10d
1094	movl	%r9d,112+12(%rsp)
1095
1096	movups	16(%rcx),%xmm1
1097
1098	movdqa	64(%rsp),%xmm6
1099	movdqa	80(%rsp),%xmm7
1100
1101	cmpq	$8,%rdx
1102	jb	.Lctr32_tail
1103
1104	subq	$6,%rdx
1105	cmpl	$4194304,%r10d
1106	je	.Lctr32_6x
1107
1108	leaq	128(%rcx),%rcx
1109	subq	$2,%rdx
1110	jmp	.Lctr32_loop8
1111
1112.align	16
1113.Lctr32_6x:
1114	shll	$4,%eax
1115	movl	$48,%r10d
1116	bswapl	%r11d
1117	leaq	32(%rcx,%rax,1),%rcx
1118	subq	%rax,%r10
1119	jmp	.Lctr32_loop6
1120
1121.align	16
1122.Lctr32_loop6:
1123	addl	$6,%r8d
1124	movups	-48(%rcx,%r10,1),%xmm0
1125.byte	102,15,56,220,209
1126	movl	%r8d,%eax
1127	xorl	%r11d,%eax
1128.byte	102,15,56,220,217
1129.byte	0x0f,0x38,0xf1,0x44,0x24,12
1130	leal	1(%r8),%eax
1131.byte	102,15,56,220,225
1132	xorl	%r11d,%eax
1133.byte	0x0f,0x38,0xf1,0x44,0x24,28
1134.byte	102,15,56,220,233
1135	leal	2(%r8),%eax
1136	xorl	%r11d,%eax
1137.byte	102,15,56,220,241
1138.byte	0x0f,0x38,0xf1,0x44,0x24,44
1139	leal	3(%r8),%eax
1140.byte	102,15,56,220,249
1141	movups	-32(%rcx,%r10,1),%xmm1
1142	xorl	%r11d,%eax
1143
1144.byte	102,15,56,220,208
1145.byte	0x0f,0x38,0xf1,0x44,0x24,60
1146	leal	4(%r8),%eax
1147.byte	102,15,56,220,216
1148	xorl	%r11d,%eax
1149.byte	0x0f,0x38,0xf1,0x44,0x24,76
1150.byte	102,15,56,220,224
1151	leal	5(%r8),%eax
1152	xorl	%r11d,%eax
1153.byte	102,15,56,220,232
1154.byte	0x0f,0x38,0xf1,0x44,0x24,92
1155	movq	%r10,%rax
1156.byte	102,15,56,220,240
1157.byte	102,15,56,220,248
1158	movups	-16(%rcx,%r10,1),%xmm0
1159
1160	call	.Lenc_loop6
1161
1162	movdqu	(%rdi),%xmm8
1163	movdqu	16(%rdi),%xmm9
1164	movdqu	32(%rdi),%xmm10
1165	movdqu	48(%rdi),%xmm11
1166	movdqu	64(%rdi),%xmm12
1167	movdqu	80(%rdi),%xmm13
1168	leaq	96(%rdi),%rdi
1169	movups	-64(%rcx,%r10,1),%xmm1
1170	pxor	%xmm2,%xmm8
1171	movaps	0(%rsp),%xmm2
1172	pxor	%xmm3,%xmm9
1173	movaps	16(%rsp),%xmm3
1174	pxor	%xmm4,%xmm10
1175	movaps	32(%rsp),%xmm4
1176	pxor	%xmm5,%xmm11
1177	movaps	48(%rsp),%xmm5
1178	pxor	%xmm6,%xmm12
1179	movaps	64(%rsp),%xmm6
1180	pxor	%xmm7,%xmm13
1181	movaps	80(%rsp),%xmm7
1182	movdqu	%xmm8,(%rsi)
1183	movdqu	%xmm9,16(%rsi)
1184	movdqu	%xmm10,32(%rsi)
1185	movdqu	%xmm11,48(%rsi)
1186	movdqu	%xmm12,64(%rsi)
1187	movdqu	%xmm13,80(%rsi)
1188	leaq	96(%rsi),%rsi
1189
1190	subq	$6,%rdx
1191	jnc	.Lctr32_loop6
1192
1193	addq	$6,%rdx
1194	jz	.Lctr32_done
1195
1196	leal	-48(%r10),%eax
1197	leaq	-80(%rcx,%r10,1),%rcx
1198	negl	%eax
1199	shrl	$4,%eax
1200	jmp	.Lctr32_tail
1201
1202.align	32
1203.Lctr32_loop8:
1204	addl	$8,%r8d
1205	movdqa	96(%rsp),%xmm8
1206.byte	102,15,56,220,209
1207	movl	%r8d,%r9d
1208	movdqa	112(%rsp),%xmm9
1209.byte	102,15,56,220,217
1210	bswapl	%r9d
1211	movups	32-128(%rcx),%xmm0
1212.byte	102,15,56,220,225
1213	xorl	%r11d,%r9d
1214	nop
1215.byte	102,15,56,220,233
1216	movl	%r9d,0+12(%rsp)
1217	leaq	1(%r8),%r9
1218.byte	102,15,56,220,241
1219.byte	102,15,56,220,249
1220.byte	102,68,15,56,220,193
1221.byte	102,68,15,56,220,201
1222	movups	48-128(%rcx),%xmm1
1223	bswapl	%r9d
1224.byte	102,15,56,220,208
1225.byte	102,15,56,220,216
1226	xorl	%r11d,%r9d
1227.byte	0x66,0x90
1228.byte	102,15,56,220,224
1229.byte	102,15,56,220,232
1230	movl	%r9d,16+12(%rsp)
1231	leaq	2(%r8),%r9
1232.byte	102,15,56,220,240
1233.byte	102,15,56,220,248
1234.byte	102,68,15,56,220,192
1235.byte	102,68,15,56,220,200
1236	movups	64-128(%rcx),%xmm0
1237	bswapl	%r9d
1238.byte	102,15,56,220,209
1239.byte	102,15,56,220,217
1240	xorl	%r11d,%r9d
1241.byte	0x66,0x90
1242.byte	102,15,56,220,225
1243.byte	102,15,56,220,233
1244	movl	%r9d,32+12(%rsp)
1245	leaq	3(%r8),%r9
1246.byte	102,15,56,220,241
1247.byte	102,15,56,220,249
1248.byte	102,68,15,56,220,193
1249.byte	102,68,15,56,220,201
1250	movups	80-128(%rcx),%xmm1
1251	bswapl	%r9d
1252.byte	102,15,56,220,208
1253.byte	102,15,56,220,216
1254	xorl	%r11d,%r9d
1255.byte	0x66,0x90
1256.byte	102,15,56,220,224
1257.byte	102,15,56,220,232
1258	movl	%r9d,48+12(%rsp)
1259	leaq	4(%r8),%r9
1260.byte	102,15,56,220,240
1261.byte	102,15,56,220,248
1262.byte	102,68,15,56,220,192
1263.byte	102,68,15,56,220,200
1264	movups	96-128(%rcx),%xmm0
1265	bswapl	%r9d
1266.byte	102,15,56,220,209
1267.byte	102,15,56,220,217
1268	xorl	%r11d,%r9d
1269.byte	0x66,0x90
1270.byte	102,15,56,220,225
1271.byte	102,15,56,220,233
1272	movl	%r9d,64+12(%rsp)
1273	leaq	5(%r8),%r9
1274.byte	102,15,56,220,241
1275.byte	102,15,56,220,249
1276.byte	102,68,15,56,220,193
1277.byte	102,68,15,56,220,201
1278	movups	112-128(%rcx),%xmm1
1279	bswapl	%r9d
1280.byte	102,15,56,220,208
1281.byte	102,15,56,220,216
1282	xorl	%r11d,%r9d
1283.byte	0x66,0x90
1284.byte	102,15,56,220,224
1285.byte	102,15,56,220,232
1286	movl	%r9d,80+12(%rsp)
1287	leaq	6(%r8),%r9
1288.byte	102,15,56,220,240
1289.byte	102,15,56,220,248
1290.byte	102,68,15,56,220,192
1291.byte	102,68,15,56,220,200
1292	movups	128-128(%rcx),%xmm0
1293	bswapl	%r9d
1294.byte	102,15,56,220,209
1295.byte	102,15,56,220,217
1296	xorl	%r11d,%r9d
1297.byte	0x66,0x90
1298.byte	102,15,56,220,225
1299.byte	102,15,56,220,233
1300	movl	%r9d,96+12(%rsp)
1301	leaq	7(%r8),%r9
1302.byte	102,15,56,220,241
1303.byte	102,15,56,220,249
1304.byte	102,68,15,56,220,193
1305.byte	102,68,15,56,220,201
1306	movups	144-128(%rcx),%xmm1
1307	bswapl	%r9d
1308.byte	102,15,56,220,208
1309.byte	102,15,56,220,216
1310.byte	102,15,56,220,224
1311	xorl	%r11d,%r9d
1312	movdqu	0(%rdi),%xmm10
1313.byte	102,15,56,220,232
1314	movl	%r9d,112+12(%rsp)
1315	cmpl	$11,%eax
1316.byte	102,15,56,220,240
1317.byte	102,15,56,220,248
1318.byte	102,68,15,56,220,192
1319.byte	102,68,15,56,220,200
1320	movups	160-128(%rcx),%xmm0
1321
1322	jb	.Lctr32_enc_done
1323
1324.byte	102,15,56,220,209
1325.byte	102,15,56,220,217
1326.byte	102,15,56,220,225
1327.byte	102,15,56,220,233
1328.byte	102,15,56,220,241
1329.byte	102,15,56,220,249
1330.byte	102,68,15,56,220,193
1331.byte	102,68,15,56,220,201
1332	movups	176-128(%rcx),%xmm1
1333
1334.byte	102,15,56,220,208
1335.byte	102,15,56,220,216
1336.byte	102,15,56,220,224
1337.byte	102,15,56,220,232
1338.byte	102,15,56,220,240
1339.byte	102,15,56,220,248
1340.byte	102,68,15,56,220,192
1341.byte	102,68,15,56,220,200
1342	movups	192-128(%rcx),%xmm0
1343	je	.Lctr32_enc_done
1344
1345.byte	102,15,56,220,209
1346.byte	102,15,56,220,217
1347.byte	102,15,56,220,225
1348.byte	102,15,56,220,233
1349.byte	102,15,56,220,241
1350.byte	102,15,56,220,249
1351.byte	102,68,15,56,220,193
1352.byte	102,68,15,56,220,201
1353	movups	208-128(%rcx),%xmm1
1354
1355.byte	102,15,56,220,208
1356.byte	102,15,56,220,216
1357.byte	102,15,56,220,224
1358.byte	102,15,56,220,232
1359.byte	102,15,56,220,240
1360.byte	102,15,56,220,248
1361.byte	102,68,15,56,220,192
1362.byte	102,68,15,56,220,200
1363	movups	224-128(%rcx),%xmm0
1364	jmp	.Lctr32_enc_done
1365
1366.align	16
1367.Lctr32_enc_done:
1368	movdqu	16(%rdi),%xmm11
1369	pxor	%xmm0,%xmm10
1370	movdqu	32(%rdi),%xmm12
1371	pxor	%xmm0,%xmm11
1372	movdqu	48(%rdi),%xmm13
1373	pxor	%xmm0,%xmm12
1374	movdqu	64(%rdi),%xmm14
1375	pxor	%xmm0,%xmm13
1376	movdqu	80(%rdi),%xmm15
1377	pxor	%xmm0,%xmm14
1378	pxor	%xmm0,%xmm15
1379.byte	102,15,56,220,209
1380.byte	102,15,56,220,217
1381.byte	102,15,56,220,225
1382.byte	102,15,56,220,233
1383.byte	102,15,56,220,241
1384.byte	102,15,56,220,249
1385.byte	102,68,15,56,220,193
1386.byte	102,68,15,56,220,201
1387	movdqu	96(%rdi),%xmm1
1388	leaq	128(%rdi),%rdi
1389
1390.byte	102,65,15,56,221,210
1391	pxor	%xmm0,%xmm1
1392	movdqu	112-128(%rdi),%xmm10
1393.byte	102,65,15,56,221,219
1394	pxor	%xmm0,%xmm10
1395	movdqa	0(%rsp),%xmm11
1396.byte	102,65,15,56,221,228
1397.byte	102,65,15,56,221,237
1398	movdqa	16(%rsp),%xmm12
1399	movdqa	32(%rsp),%xmm13
1400.byte	102,65,15,56,221,246
1401.byte	102,65,15,56,221,255
1402	movdqa	48(%rsp),%xmm14
1403	movdqa	64(%rsp),%xmm15
1404.byte	102,68,15,56,221,193
1405	movdqa	80(%rsp),%xmm0
1406	movups	16-128(%rcx),%xmm1
1407.byte	102,69,15,56,221,202
1408
1409	movups	%xmm2,(%rsi)
1410	movdqa	%xmm11,%xmm2
1411	movups	%xmm3,16(%rsi)
1412	movdqa	%xmm12,%xmm3
1413	movups	%xmm4,32(%rsi)
1414	movdqa	%xmm13,%xmm4
1415	movups	%xmm5,48(%rsi)
1416	movdqa	%xmm14,%xmm5
1417	movups	%xmm6,64(%rsi)
1418	movdqa	%xmm15,%xmm6
1419	movups	%xmm7,80(%rsi)
1420	movdqa	%xmm0,%xmm7
1421	movups	%xmm8,96(%rsi)
1422	movups	%xmm9,112(%rsi)
1423	leaq	128(%rsi),%rsi
1424
1425	subq	$8,%rdx
1426	jnc	.Lctr32_loop8
1427
1428	addq	$8,%rdx
1429	jz	.Lctr32_done
1430	leaq	-128(%rcx),%rcx
1431
1432.Lctr32_tail:
1433
1434
1435	leaq	16(%rcx),%rcx
1436	cmpq	$4,%rdx
1437	jb	.Lctr32_loop3
1438	je	.Lctr32_loop4
1439
1440
1441	shll	$4,%eax
1442	movdqa	96(%rsp),%xmm8
1443	pxor	%xmm9,%xmm9
1444
1445	movups	16(%rcx),%xmm0
1446.byte	102,15,56,220,209
1447.byte	102,15,56,220,217
1448	leaq	32-16(%rcx,%rax,1),%rcx
1449	negq	%rax
1450.byte	102,15,56,220,225
1451	addq	$16,%rax
1452	movups	(%rdi),%xmm10
1453.byte	102,15,56,220,233
1454.byte	102,15,56,220,241
1455	movups	16(%rdi),%xmm11
1456	movups	32(%rdi),%xmm12
1457.byte	102,15,56,220,249
1458.byte	102,68,15,56,220,193
1459
1460	call	.Lenc_loop8_enter
1461
1462	movdqu	48(%rdi),%xmm13
1463	pxor	%xmm10,%xmm2
1464	movdqu	64(%rdi),%xmm10
1465	pxor	%xmm11,%xmm3
1466	movdqu	%xmm2,(%rsi)
1467	pxor	%xmm12,%xmm4
1468	movdqu	%xmm3,16(%rsi)
1469	pxor	%xmm13,%xmm5
1470	movdqu	%xmm4,32(%rsi)
1471	pxor	%xmm10,%xmm6
1472	movdqu	%xmm5,48(%rsi)
1473	movdqu	%xmm6,64(%rsi)
1474	cmpq	$6,%rdx
1475	jb	.Lctr32_done
1476
1477	movups	80(%rdi),%xmm11
1478	xorps	%xmm11,%xmm7
1479	movups	%xmm7,80(%rsi)
1480	je	.Lctr32_done
1481
1482	movups	96(%rdi),%xmm12
1483	xorps	%xmm12,%xmm8
1484	movups	%xmm8,96(%rsi)
1485	jmp	.Lctr32_done
1486
1487.align	32
1488.Lctr32_loop4:
1489.byte	102,15,56,220,209
1490	leaq	16(%rcx),%rcx
1491	decl	%eax
1492.byte	102,15,56,220,217
1493.byte	102,15,56,220,225
1494.byte	102,15,56,220,233
1495	movups	(%rcx),%xmm1
1496	jnz	.Lctr32_loop4
1497.byte	102,15,56,221,209
1498.byte	102,15,56,221,217
1499	movups	(%rdi),%xmm10
1500	movups	16(%rdi),%xmm11
1501.byte	102,15,56,221,225
1502.byte	102,15,56,221,233
1503	movups	32(%rdi),%xmm12
1504	movups	48(%rdi),%xmm13
1505
1506	xorps	%xmm10,%xmm2
1507	movups	%xmm2,(%rsi)
1508	xorps	%xmm11,%xmm3
1509	movups	%xmm3,16(%rsi)
1510	pxor	%xmm12,%xmm4
1511	movdqu	%xmm4,32(%rsi)
1512	pxor	%xmm13,%xmm5
1513	movdqu	%xmm5,48(%rsi)
1514	jmp	.Lctr32_done
1515
1516.align	32
1517.Lctr32_loop3:
1518.byte	102,15,56,220,209
1519	leaq	16(%rcx),%rcx
1520	decl	%eax
1521.byte	102,15,56,220,217
1522.byte	102,15,56,220,225
1523	movups	(%rcx),%xmm1
1524	jnz	.Lctr32_loop3
1525.byte	102,15,56,221,209
1526.byte	102,15,56,221,217
1527.byte	102,15,56,221,225
1528
1529	movups	(%rdi),%xmm10
1530	xorps	%xmm10,%xmm2
1531	movups	%xmm2,(%rsi)
1532	cmpq	$2,%rdx
1533	jb	.Lctr32_done
1534
1535	movups	16(%rdi),%xmm11
1536	xorps	%xmm11,%xmm3
1537	movups	%xmm3,16(%rsi)
1538	je	.Lctr32_done
1539
1540	movups	32(%rdi),%xmm12
1541	xorps	%xmm12,%xmm4
1542	movups	%xmm4,32(%rsi)
1543
1544.Lctr32_done:
1545	xorps	%xmm0,%xmm0
1546	xorl	%r11d,%r11d
1547	pxor	%xmm1,%xmm1
1548	pxor	%xmm2,%xmm2
1549	pxor	%xmm3,%xmm3
1550	pxor	%xmm4,%xmm4
1551	pxor	%xmm5,%xmm5
1552	pxor	%xmm6,%xmm6
1553	pxor	%xmm7,%xmm7
1554	movaps	%xmm0,0(%rsp)
1555	pxor	%xmm8,%xmm8
1556	movaps	%xmm0,16(%rsp)
1557	pxor	%xmm9,%xmm9
1558	movaps	%xmm0,32(%rsp)
1559	pxor	%xmm10,%xmm10
1560	movaps	%xmm0,48(%rsp)
1561	pxor	%xmm11,%xmm11
1562	movaps	%xmm0,64(%rsp)
1563	pxor	%xmm12,%xmm12
1564	movaps	%xmm0,80(%rsp)
1565	pxor	%xmm13,%xmm13
1566	movaps	%xmm0,96(%rsp)
1567	pxor	%xmm14,%xmm14
1568	movaps	%xmm0,112(%rsp)
1569	pxor	%xmm15,%xmm15
1570	leaq	(%rbp),%rsp
1571	popq	%rbp
1572.Lctr32_epilogue:
1573	.byte	0xf3,0xc3
1574.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
/*
 * aesni_xts_encrypt -- XTS-mode encryption built on AES-NI.
 *
 * Register inputs, as the code below uses them:
 *   %rdi  input pointer (data blocks are loaded from it)
 *   %rsi  output pointer (result blocks are stored to it)
 *   %rdx  length in bytes; the part above a multiple of 16 is handled by
 *         the byte-swapping tail at .Lxts_enc_steal
 *   %rcx  key schedule for the data blocks (round count read at offset 240)
 *   %r8   key schedule used only to encrypt the initial tweak
 *   %r9   pointer to the 16-byte initial tweak input
 *
 * NOTE(review): the stealing tail reads and rewrites the previous output
 * block at -16(%rsi), so callers presumably must pass at least 16 bytes
 * -- confirm against the C prototype.
 *
 * Hand-encoded instructions: 102,15,56,220,modrm is aesenc and
 * 102,15,56,221,modrm is aesenclast; 0xf3,0xc3 is rep ret.
 * ModRM 209/217/225/233/241/249 apply %xmm1 to %xmm2..%xmm7 and
 * 208/216/224/232/240/248 apply %xmm0 to %xmm2..%xmm7.
 */
1575.globl	aesni_xts_encrypt
1576.hidden aesni_xts_encrypt
1577.type	aesni_xts_encrypt,@function
1578.align	16
1579aesni_xts_encrypt:
/* Frame: 112 bytes of 16-byte-aligned scratch; %rbp = entry %rsp - 8, the slot holding the saved %rbp. */
1580	leaq	(%rsp),%rax
1581	pushq	%rbp
1582	subq	$112,%rsp
1583	andq	$-16,%rsp
1584	leaq	-8(%rax),%rbp
/* Encrypt the 16 bytes at (%r9) with the %r8 key schedule; the result in %xmm2 is the first tweak. */
1585	movups	(%r9),%xmm2
1586	movl	240(%r8),%eax
1587	movl	240(%rcx),%r10d
1588	movups	(%r8),%xmm0
1589	movups	16(%r8),%xmm1
1590	leaq	32(%r8),%r8
1591	xorps	%xmm0,%xmm2
1592.Loop_enc1_8:
1593.byte	102,15,56,220,209
1594	decl	%eax
1595	movups	(%r8),%xmm1
1596	leaq	16(%r8),%r8
1597	jnz	.Loop_enc1_8
1598.byte	102,15,56,221,209
/* %xmm0 = round key 0 of the data key, %r11 = data key base, %eax = round count, %r10 = round count * 16. */
1599	movups	(%rcx),%xmm0
1600	movq	%rcx,%r11
1601	movl	%r10d,%eax
1602	shll	$4,%r10d
/* %r9 keeps the full byte length (its low 4 bits drive the stealing tail); %rdx is rounded down to whole blocks. */
1603	movq	%rdx,%r9
1604	andq	$-16,%rdx
1605
/* %xmm1 = last round key; after the pxor below it becomes (round key 0 ^ last round key). */
1606	movups	16(%rcx,%r10,1),%xmm1
1607
/*
 * Precompute tweaks: %xmm10..%xmm14 receive five consecutive tweaks, each
 * already XORed with round key 0, and %xmm15 ends as the sixth, plain tweak.
 * Per step: paddq doubles both 64-bit halves of %xmm15, and a sign mask
 * taken from %xmm9 (copies of the top dword of each half), ANDed with
 * .Lxts_magic, is XORed in as the carry/reduction term. paddd shifts %xmm9
 * so the next step sees the next bit.
 * .Lxts_magic is defined outside this block -- presumably the XTS
 * reduction constant; confirm.
 */
1608	movdqa	.Lxts_magic(%rip),%xmm8
1609	movdqa	%xmm2,%xmm15
1610	pshufd	$95,%xmm2,%xmm9
1611	pxor	%xmm0,%xmm1
1612	movdqa	%xmm9,%xmm14
1613	paddd	%xmm9,%xmm9
1614	movdqa	%xmm15,%xmm10
1615	psrad	$31,%xmm14
1616	paddq	%xmm15,%xmm15
1617	pand	%xmm8,%xmm14
1618	pxor	%xmm0,%xmm10
1619	pxor	%xmm14,%xmm15
1620	movdqa	%xmm9,%xmm14
1621	paddd	%xmm9,%xmm9
1622	movdqa	%xmm15,%xmm11
1623	psrad	$31,%xmm14
1624	paddq	%xmm15,%xmm15
1625	pand	%xmm8,%xmm14
1626	pxor	%xmm0,%xmm11
1627	pxor	%xmm14,%xmm15
1628	movdqa	%xmm9,%xmm14
1629	paddd	%xmm9,%xmm9
1630	movdqa	%xmm15,%xmm12
1631	psrad	$31,%xmm14
1632	paddq	%xmm15,%xmm15
1633	pand	%xmm8,%xmm14
1634	pxor	%xmm0,%xmm12
1635	pxor	%xmm14,%xmm15
1636	movdqa	%xmm9,%xmm14
1637	paddd	%xmm9,%xmm9
1638	movdqa	%xmm15,%xmm13
1639	psrad	$31,%xmm14
1640	paddq	%xmm15,%xmm15
1641	pand	%xmm8,%xmm14
1642	pxor	%xmm0,%xmm13
1643	pxor	%xmm14,%xmm15
1644	movdqa	%xmm15,%xmm14
1645	psrad	$31,%xmm9
1646	paddq	%xmm15,%xmm15
1647	pand	%xmm8,%xmm9
1648	pxor	%xmm0,%xmm14
1649	pxor	%xmm9,%xmm15
/* 96(%rsp) = round key 0 ^ last round key, reloaded on every grand-loop pass. */
1650	movaps	%xmm1,96(%rsp)
1651
/* Fewer than six whole blocks: go straight to the short path. */
1652	subq	$96,%rdx
1653	jc	.Lxts_enc_short
1654
/*
 * Six-block loop setup: %rcx = key base + 32 + 16*rounds, and
 * %rax = %r10 = 112 - 16*rounds, a negative byte offset that the inner
 * loop counts up to zero. %r8 now points at .Lxts_magic.
 */
1655	movl	$16+96,%eax
1656	leaq	32(%r11,%r10,1),%rcx
1657	subq	%r10,%rax
1658	movups	16(%r11),%xmm1
1659	movq	%rax,%r10
1660	leaq	.Lxts_magic(%rip),%r8
1661	jmp	.Lxts_enc_grandloop
1662
1663.align	32
/*
 * One pass encrypts six blocks in %xmm2..%xmm7. Input is XORed with
 * (tweak ^ round key 0), which applies the tweak and the first AddRoundKey
 * together. Each tweak is then XORed with 96(%rsp) to give
 * (tweak ^ last round key) and saved at 0..80(%rsp) for the final round.
 * The early AES rounds are interleaved with this bookkeeping.
 */
1664.Lxts_enc_grandloop:
1665	movdqu	0(%rdi),%xmm2
1666	movdqa	%xmm0,%xmm8
1667	movdqu	16(%rdi),%xmm3
1668	pxor	%xmm10,%xmm2
1669	movdqu	32(%rdi),%xmm4
1670	pxor	%xmm11,%xmm3
1671.byte	102,15,56,220,209
1672	movdqu	48(%rdi),%xmm5
1673	pxor	%xmm12,%xmm4
1674.byte	102,15,56,220,217
1675	movdqu	64(%rdi),%xmm6
1676	pxor	%xmm13,%xmm5
1677.byte	102,15,56,220,225
1678	movdqu	80(%rdi),%xmm7
1679	pxor	%xmm15,%xmm8
1680	movdqa	96(%rsp),%xmm9
1681	pxor	%xmm14,%xmm6
1682.byte	102,15,56,220,233
1683	movups	32(%r11),%xmm0
1684	leaq	96(%rdi),%rdi
1685	pxor	%xmm8,%xmm7
1686
1687	pxor	%xmm9,%xmm10
1688.byte	102,15,56,220,241
1689	pxor	%xmm9,%xmm11
1690	movdqa	%xmm10,0(%rsp)
1691.byte	102,15,56,220,249
1692	movups	48(%r11),%xmm1
1693	pxor	%xmm9,%xmm12
1694
1695.byte	102,15,56,220,208
1696	pxor	%xmm9,%xmm13
1697	movdqa	%xmm11,16(%rsp)
1698.byte	102,15,56,220,216
1699	pxor	%xmm9,%xmm14
1700	movdqa	%xmm12,32(%rsp)
1701.byte	102,15,56,220,224
1702.byte	102,15,56,220,232
1703	pxor	%xmm9,%xmm8
1704	movdqa	%xmm14,64(%rsp)
1705.byte	102,15,56,220,240
1706.byte	102,15,56,220,248
1707	movups	64(%r11),%xmm0
1708	movdqa	%xmm8,80(%rsp)
/* Seed %xmm9 from the current tweak for the tweak updates that follow the inner loop. */
1709	pshufd	$95,%xmm15,%xmm9
1710	jmp	.Lxts_enc_loop6
1711.align	32
/*
 * Middle rounds, two per iteration, on all six blocks. The jnz tests the
 * flags set by the addq; the movups after it does not modify flags.
 */
1712.Lxts_enc_loop6:
1713.byte	102,15,56,220,209
1714.byte	102,15,56,220,217
1715.byte	102,15,56,220,225
1716.byte	102,15,56,220,233
1717.byte	102,15,56,220,241
1718.byte	102,15,56,220,249
1719	movups	-64(%rcx,%rax,1),%xmm1
1720	addq	$32,%rax
1721
1722.byte	102,15,56,220,208
1723.byte	102,15,56,220,216
1724.byte	102,15,56,220,224
1725.byte	102,15,56,220,232
1726.byte	102,15,56,220,240
1727.byte	102,15,56,220,248
1728	movups	-80(%rcx,%rax,1),%xmm0
1729	jnz	.Lxts_enc_loop6
1730
/*
 * Remaining rounds, interleaved with computing the tweaks for the next
 * pass: %xmm10..%xmm14 are rebuilt as (round key 0 ^ tweak) and %xmm15
 * keeps advancing. The tweak that was in %xmm13 for this pass
 * (already XORed with the last round key) is parked at 48(%rsp) first.
 */
1731	movdqa	(%r8),%xmm8
1732	movdqa	%xmm9,%xmm14
1733	paddd	%xmm9,%xmm9
1734.byte	102,15,56,220,209
1735	paddq	%xmm15,%xmm15
1736	psrad	$31,%xmm14
1737.byte	102,15,56,220,217
1738	pand	%xmm8,%xmm14
1739	movups	(%r11),%xmm10
1740.byte	102,15,56,220,225
1741.byte	102,15,56,220,233
1742.byte	102,15,56,220,241
1743	pxor	%xmm14,%xmm15
1744	movaps	%xmm10,%xmm11
1745.byte	102,15,56,220,249
1746	movups	-64(%rcx),%xmm1
1747
1748	movdqa	%xmm9,%xmm14
1749.byte	102,15,56,220,208
1750	paddd	%xmm9,%xmm9
1751	pxor	%xmm15,%xmm10
1752.byte	102,15,56,220,216
1753	psrad	$31,%xmm14
1754	paddq	%xmm15,%xmm15
1755.byte	102,15,56,220,224
1756.byte	102,15,56,220,232
1757	pand	%xmm8,%xmm14
1758	movaps	%xmm11,%xmm12
1759.byte	102,15,56,220,240
1760	pxor	%xmm14,%xmm15
1761	movdqa	%xmm9,%xmm14
1762.byte	102,15,56,220,248
1763	movups	-48(%rcx),%xmm0
1764
1765	paddd	%xmm9,%xmm9
1766.byte	102,15,56,220,209
1767	pxor	%xmm15,%xmm11
1768	psrad	$31,%xmm14
1769.byte	102,15,56,220,217
1770	paddq	%xmm15,%xmm15
1771	pand	%xmm8,%xmm14
1772.byte	102,15,56,220,225
1773.byte	102,15,56,220,233
1774	movdqa	%xmm13,48(%rsp)
1775	pxor	%xmm14,%xmm15
1776.byte	102,15,56,220,241
1777	movaps	%xmm12,%xmm13
1778	movdqa	%xmm9,%xmm14
1779.byte	102,15,56,220,249
1780	movups	-32(%rcx),%xmm1
1781
1782	paddd	%xmm9,%xmm9
1783.byte	102,15,56,220,208
1784	pxor	%xmm15,%xmm12
1785	psrad	$31,%xmm14
1786.byte	102,15,56,220,216
1787	paddq	%xmm15,%xmm15
1788	pand	%xmm8,%xmm14
1789.byte	102,15,56,220,224
1790.byte	102,15,56,220,232
1791.byte	102,15,56,220,240
1792	pxor	%xmm14,%xmm15
1793	movaps	%xmm13,%xmm14
1794.byte	102,15,56,220,248
1795
/* %xmm0 is borrowed as a scratch mask here, then reloaded with round key 0 for the next pass. */
1796	movdqa	%xmm9,%xmm0
1797	paddd	%xmm9,%xmm9
1798.byte	102,15,56,220,209
1799	pxor	%xmm15,%xmm13
1800	psrad	$31,%xmm0
1801.byte	102,15,56,220,217
1802	paddq	%xmm15,%xmm15
1803	pand	%xmm8,%xmm0
1804.byte	102,15,56,220,225
1805.byte	102,15,56,220,233
1806	pxor	%xmm0,%xmm15
1807	movups	(%r11),%xmm0
1808.byte	102,15,56,220,241
1809.byte	102,15,56,220,249
1810	movups	16(%r11),%xmm1
1811
/*
 * Final round: aesenclast with memory operands 0,16,...,80(%rsp) for
 * %xmm2..%xmm7. Each slot holds (tweak ^ last round key), so the last
 * AddRoundKey and the output tweak XOR are done in one instruction.
 * %rax is reset from %r10 for the next pass.
 */
1812	pxor	%xmm15,%xmm14
1813.byte	102,15,56,221,84,36,0
1814	psrad	$31,%xmm9
1815	paddq	%xmm15,%xmm15
1816.byte	102,15,56,221,92,36,16
1817.byte	102,15,56,221,100,36,32
1818	pand	%xmm8,%xmm9
1819	movq	%r10,%rax
1820.byte	102,15,56,221,108,36,48
1821.byte	102,15,56,221,116,36,64
1822.byte	102,15,56,221,124,36,80
1823	pxor	%xmm9,%xmm15
1824
/* Store the six result blocks and loop while at least 96 bytes remain. */
1825	leaq	96(%rsi),%rsi
1826	movups	%xmm2,-96(%rsi)
1827	movups	%xmm3,-80(%rsi)
1828	movups	%xmm4,-64(%rsi)
1829	movups	%xmm5,-48(%rsi)
1830	movups	%xmm6,-32(%rsi)
1831	movups	%xmm7,-16(%rsi)
1832	subq	$96,%rdx
1833	jnc	.Lxts_enc_grandloop
1834
/* Undo the loop setup: %eax = (112 - %r10) / 16 = round count, %rcx = key base. */
1835	movl	$16+96,%eax
1836	subl	%r10d,%eax
1837	movq	%r11,%rcx
1838	shrl	$4,%eax
1839
/*
 * Short path for 0..5 remaining whole blocks. %r10d saves the round count.
 * %xmm10..%xmm14 arrive XORed with round key 0 (%xmm0); each pxor below
 * strips that, and only as many tweaks as the chosen branch needs are
 * converted. The je branches reuse the flags of the preceding cmpq
 * because pxor does not modify flags.
 */
1840.Lxts_enc_short:
1841
1842	movl	%eax,%r10d
1843	pxor	%xmm0,%xmm10
1844	addq	$96,%rdx
1845	jz	.Lxts_enc_done
1846
1847	pxor	%xmm0,%xmm11
1848	cmpq	$32,%rdx
1849	jb	.Lxts_enc_one
1850	pxor	%xmm0,%xmm12
1851	je	.Lxts_enc_two
1852
1853	pxor	%xmm0,%xmm13
1854	cmpq	$64,%rdx
1855	jb	.Lxts_enc_three
1856	pxor	%xmm0,%xmm14
1857	je	.Lxts_enc_four
1858
/* Five blocks: %xmm7 is zeroed as a dummy sixth input for _aesni_encrypt6 (defined outside this block). */
1859	movdqu	(%rdi),%xmm2
1860	movdqu	16(%rdi),%xmm3
1861	movdqu	32(%rdi),%xmm4
1862	pxor	%xmm10,%xmm2
1863	movdqu	48(%rdi),%xmm5
1864	pxor	%xmm11,%xmm3
1865	movdqu	64(%rdi),%xmm6
1866	leaq	80(%rdi),%rdi
1867	pxor	%xmm12,%xmm4
1868	pxor	%xmm13,%xmm5
1869	pxor	%xmm14,%xmm6
1870	pxor	%xmm7,%xmm7
1871
1872	call	_aesni_encrypt6
1873
/* Apply the output tweak XOR; %xmm10 becomes the next unused tweak for the stealing tail. */
1874	xorps	%xmm10,%xmm2
1875	movdqa	%xmm15,%xmm10
1876	xorps	%xmm11,%xmm3
1877	xorps	%xmm12,%xmm4
1878	movdqu	%xmm2,(%rsi)
1879	xorps	%xmm13,%xmm5
1880	movdqu	%xmm3,16(%rsi)
1881	xorps	%xmm14,%xmm6
1882	movdqu	%xmm4,32(%rsi)
1883	movdqu	%xmm5,48(%rsi)
1884	movdqu	%xmm6,64(%rsi)
1885	leaq	80(%rsi),%rsi
1886	jmp	.Lxts_enc_done
1887
1888.align	16
/* One block: tweak XOR, inline single-block AES using %rcx/%eax, tweak XOR; %xmm10 = next tweak. */
1889.Lxts_enc_one:
1890	movups	(%rdi),%xmm2
1891	leaq	16(%rdi),%rdi
1892	xorps	%xmm10,%xmm2
1893	movups	(%rcx),%xmm0
1894	movups	16(%rcx),%xmm1
1895	leaq	32(%rcx),%rcx
1896	xorps	%xmm0,%xmm2
1897.Loop_enc1_9:
1898.byte	102,15,56,220,209
1899	decl	%eax
1900	movups	(%rcx),%xmm1
1901	leaq	16(%rcx),%rcx
1902	jnz	.Loop_enc1_9
1903.byte	102,15,56,221,209
1904	xorps	%xmm10,%xmm2
1905	movdqa	%xmm11,%xmm10
1906	movups	%xmm2,(%rsi)
1907	leaq	16(%rsi),%rsi
1908	jmp	.Lxts_enc_done
1909
1910.align	16
/* Two blocks via _aesni_encrypt2 (blocks in %xmm2/%xmm3, key in %rcx, rounds in %eax); %xmm10 = next tweak. */
1911.Lxts_enc_two:
1912	movups	(%rdi),%xmm2
1913	movups	16(%rdi),%xmm3
1914	leaq	32(%rdi),%rdi
1915	xorps	%xmm10,%xmm2
1916	xorps	%xmm11,%xmm3
1917
1918	call	_aesni_encrypt2
1919
1920	xorps	%xmm10,%xmm2
1921	movdqa	%xmm12,%xmm10
1922	xorps	%xmm11,%xmm3
1923	movups	%xmm2,(%rsi)
1924	movups	%xmm3,16(%rsi)
1925	leaq	32(%rsi),%rsi
1926	jmp	.Lxts_enc_done
1927
1928.align	16
/* Three blocks via _aesni_encrypt3 (defined outside this block); %xmm10 = next tweak. */
1929.Lxts_enc_three:
1930	movups	(%rdi),%xmm2
1931	movups	16(%rdi),%xmm3
1932	movups	32(%rdi),%xmm4
1933	leaq	48(%rdi),%rdi
1934	xorps	%xmm10,%xmm2
1935	xorps	%xmm11,%xmm3
1936	xorps	%xmm12,%xmm4
1937
1938	call	_aesni_encrypt3
1939
1940	xorps	%xmm10,%xmm2
1941	movdqa	%xmm13,%xmm10
1942	xorps	%xmm11,%xmm3
1943	xorps	%xmm12,%xmm4
1944	movups	%xmm2,(%rsi)
1945	movups	%xmm3,16(%rsi)
1946	movups	%xmm4,32(%rsi)
1947	leaq	48(%rsi),%rsi
1948	jmp	.Lxts_enc_done
1949
1950.align	16
/* Four blocks via _aesni_encrypt4 (defined outside this block); %xmm10 = next tweak. */
1951.Lxts_enc_four:
1952	movups	(%rdi),%xmm2
1953	movups	16(%rdi),%xmm3
1954	movups	32(%rdi),%xmm4
1955	xorps	%xmm10,%xmm2
1956	movups	48(%rdi),%xmm5
1957	leaq	64(%rdi),%rdi
1958	xorps	%xmm11,%xmm3
1959	xorps	%xmm12,%xmm4
1960	xorps	%xmm13,%xmm5
1961
1962	call	_aesni_encrypt4
1963
1964	pxor	%xmm10,%xmm2
1965	movdqa	%xmm14,%xmm10
1966	pxor	%xmm11,%xmm3
1967	pxor	%xmm12,%xmm4
1968	movdqu	%xmm2,(%rsi)
1969	pxor	%xmm13,%xmm5
1970	movdqu	%xmm3,16(%rsi)
1971	movdqu	%xmm4,32(%rsi)
1972	movdqu	%xmm5,48(%rsi)
1973	leaq	64(%rsi),%rsi
1974	jmp	.Lxts_enc_done
1975
1976.align	16
/* All whole blocks are written. %r9 & 15 is the count of trailing bytes; zero means finished. */
1977.Lxts_enc_done:
1978	andq	$15,%r9
1979	jz	.Lxts_enc_ret
1980	movq	%r9,%rdx
1981
/*
 * Stealing loop, one byte per iteration: the trailing input byte replaces
 * the corresponding byte of the last full output block at -16(%rsi), and
 * the byte it displaces is written out at (%rsi) as the partial final block.
 */
1982.Lxts_enc_steal:
1983	movzbl	(%rdi),%eax
1984	movzbl	-16(%rsi),%ecx
1985	leaq	1(%rdi),%rdi
1986	movb	%al,-16(%rsi)
1987	movb	%cl,0(%rsi)
1988	leaq	1(%rsi),%rsi
1989	subq	$1,%rdx
1990	jnz	.Lxts_enc_steal
1991
/* Rewind %rsi, restore key base and round count, then encrypt the patched block in place with tweak %xmm10. */
1992	subq	%r9,%rsi
1993	movq	%r11,%rcx
1994	movl	%r10d,%eax
1995
1996	movups	-16(%rsi),%xmm2
1997	xorps	%xmm10,%xmm2
1998	movups	(%rcx),%xmm0
1999	movups	16(%rcx),%xmm1
2000	leaq	32(%rcx),%rcx
2001	xorps	%xmm0,%xmm2
2002.Loop_enc1_10:
2003.byte	102,15,56,220,209
2004	decl	%eax
2005	movups	(%rcx),%xmm1
2006	leaq	16(%rcx),%rcx
2007	jnz	.Loop_enc1_10
2008.byte	102,15,56,221,209
2009	xorps	%xmm10,%xmm2
2010	movups	%xmm2,-16(%rsi)
2011
/* Exit: zero %xmm0..%xmm15 and the 112-byte stack scratch so no key or tweak material is left behind, then unwind the frame. */
2012.Lxts_enc_ret:
2013	xorps	%xmm0,%xmm0
2014	pxor	%xmm1,%xmm1
2015	pxor	%xmm2,%xmm2
2016	pxor	%xmm3,%xmm3
2017	pxor	%xmm4,%xmm4
2018	pxor	%xmm5,%xmm5
2019	pxor	%xmm6,%xmm6
2020	pxor	%xmm7,%xmm7
2021	movaps	%xmm0,0(%rsp)
2022	pxor	%xmm8,%xmm8
2023	movaps	%xmm0,16(%rsp)
2024	pxor	%xmm9,%xmm9
2025	movaps	%xmm0,32(%rsp)
2026	pxor	%xmm10,%xmm10
2027	movaps	%xmm0,48(%rsp)
2028	pxor	%xmm11,%xmm11
2029	movaps	%xmm0,64(%rsp)
2030	pxor	%xmm12,%xmm12
2031	movaps	%xmm0,80(%rsp)
2032	pxor	%xmm13,%xmm13
2033	movaps	%xmm0,96(%rsp)
2034	pxor	%xmm14,%xmm14
2035	pxor	%xmm15,%xmm15
/* %rbp points at the saved %rbp slot, so this restores %rsp to just below the return address. */
2036	leaq	(%rbp),%rsp
2037	popq	%rbp
2038.Lxts_enc_epilogue:
2039	.byte	0xf3,0xc3
2040.size	aesni_xts_encrypt,.-aesni_xts_encrypt
/*
 * aesni_xts_decrypt -- XTS-mode decryption built on AES-NI.
 *
 * Register inputs, as the code below uses them:
 *   %rdi  input pointer (data blocks are loaded from it)
 *   %rsi  output pointer (result blocks are stored to it)
 *   %rdx  length in bytes; when it is not a multiple of 16 the last full
 *         block is held back and handled together with the trailing bytes
 *         at .Lxts_dec_done2
 *   %rcx  key schedule for the data blocks (round count read at offset 240)
 *   %r8   key schedule used only to encrypt the initial tweak
 *   %r9   pointer to the 16-byte initial tweak input
 *
 * NOTE(review): with a partial tail the code reads a full block at (%rdi)
 * plus the bytes after it, so callers presumably must pass at least
 * 16 bytes -- confirm against the C prototype.
 *
 * Hand-encoded instructions: 102,15,56,222,modrm is aesdec,
 * 102,15,56,223,modrm is aesdeclast, 102,15,56,220/221 are
 * aesenc/aesenclast (used for the tweak only); 0xf3,0xc3 is rep ret.
 * ModRM 209/217/225/233/241/249 apply %xmm1 to %xmm2..%xmm7 and
 * 208/216/224/232/240/248 apply %xmm0 to %xmm2..%xmm7.
 */
2041.globl	aesni_xts_decrypt
2042.hidden aesni_xts_decrypt
2043.type	aesni_xts_decrypt,@function
2044.align	16
2045aesni_xts_decrypt:
/* Frame: 112 bytes of 16-byte-aligned scratch; %rbp = entry %rsp - 8, the slot holding the saved %rbp. */
2046	leaq	(%rsp),%rax
2047	pushq	%rbp
2048	subq	$112,%rsp
2049	andq	$-16,%rsp
2050	leaq	-8(%rax),%rbp
/* The tweak is encrypted (aesenc/aesenclast), not decrypted, with the %r8 key schedule; result in %xmm2. */
2051	movups	(%r9),%xmm2
2052	movl	240(%r8),%eax
2053	movl	240(%rcx),%r10d
2054	movups	(%r8),%xmm0
2055	movups	16(%r8),%xmm1
2056	leaq	32(%r8),%r8
2057	xorps	%xmm0,%xmm2
2058.Loop_enc1_11:
2059.byte	102,15,56,220,209
2060	decl	%eax
2061	movups	(%r8),%xmm1
2062	leaq	16(%r8),%r8
2063	jnz	.Loop_enc1_11
2064.byte	102,15,56,221,209
/* If the length is not a multiple of 16, subtract 16 so the last full block is left for the stealing tail. */
2065	xorl	%eax,%eax
2066	testq	$15,%rdx
2067	setnz	%al
2068	shlq	$4,%rax
2069	subq	%rax,%rdx
2070
/* %xmm0 = round key 0 of the data key, %r11 = data key base, %eax = round count, %r10 = round count * 16. */
2071	movups	(%rcx),%xmm0
2072	movq	%rcx,%r11
2073	movl	%r10d,%eax
2074	shll	$4,%r10d
/* %r9 keeps the adjusted byte length (its low 4 bits drive the stealing tail); %rdx is rounded down to whole blocks. */
2075	movq	%rdx,%r9
2076	andq	$-16,%rdx
2077
/* %xmm1 = last round key; after the pxor below it becomes (round key 0 ^ last round key). */
2078	movups	16(%rcx,%r10,1),%xmm1
2079
/*
 * Precompute tweaks: %xmm10..%xmm14 receive five consecutive tweaks, each
 * already XORed with round key 0, and %xmm15 ends as the sixth, plain tweak.
 * Per step: paddq doubles both 64-bit halves of %xmm15, and a sign mask
 * taken from %xmm9 (copies of the top dword of each half), ANDed with
 * .Lxts_magic, is XORed in as the carry/reduction term. paddd shifts %xmm9
 * so the next step sees the next bit.
 * .Lxts_magic is defined outside this block -- presumably the XTS
 * reduction constant; confirm.
 */
2080	movdqa	.Lxts_magic(%rip),%xmm8
2081	movdqa	%xmm2,%xmm15
2082	pshufd	$95,%xmm2,%xmm9
2083	pxor	%xmm0,%xmm1
2084	movdqa	%xmm9,%xmm14
2085	paddd	%xmm9,%xmm9
2086	movdqa	%xmm15,%xmm10
2087	psrad	$31,%xmm14
2088	paddq	%xmm15,%xmm15
2089	pand	%xmm8,%xmm14
2090	pxor	%xmm0,%xmm10
2091	pxor	%xmm14,%xmm15
2092	movdqa	%xmm9,%xmm14
2093	paddd	%xmm9,%xmm9
2094	movdqa	%xmm15,%xmm11
2095	psrad	$31,%xmm14
2096	paddq	%xmm15,%xmm15
2097	pand	%xmm8,%xmm14
2098	pxor	%xmm0,%xmm11
2099	pxor	%xmm14,%xmm15
2100	movdqa	%xmm9,%xmm14
2101	paddd	%xmm9,%xmm9
2102	movdqa	%xmm15,%xmm12
2103	psrad	$31,%xmm14
2104	paddq	%xmm15,%xmm15
2105	pand	%xmm8,%xmm14
2106	pxor	%xmm0,%xmm12
2107	pxor	%xmm14,%xmm15
2108	movdqa	%xmm9,%xmm14
2109	paddd	%xmm9,%xmm9
2110	movdqa	%xmm15,%xmm13
2111	psrad	$31,%xmm14
2112	paddq	%xmm15,%xmm15
2113	pand	%xmm8,%xmm14
2114	pxor	%xmm0,%xmm13
2115	pxor	%xmm14,%xmm15
2116	movdqa	%xmm15,%xmm14
2117	psrad	$31,%xmm9
2118	paddq	%xmm15,%xmm15
2119	pand	%xmm8,%xmm9
2120	pxor	%xmm0,%xmm14
2121	pxor	%xmm9,%xmm15
/* 96(%rsp) = round key 0 ^ last round key, reloaded on every grand-loop pass. */
2122	movaps	%xmm1,96(%rsp)
2123
/* Fewer than six whole blocks: go straight to the short path. */
2124	subq	$96,%rdx
2125	jc	.Lxts_dec_short
2126
/*
 * Six-block loop setup: %rcx = key base + 32 + 16*rounds, and
 * %rax = %r10 = 112 - 16*rounds, a negative byte offset that the inner
 * loop counts up to zero. %r8 now points at .Lxts_magic.
 */
2127	movl	$16+96,%eax
2128	leaq	32(%r11,%r10,1),%rcx
2129	subq	%r10,%rax
2130	movups	16(%r11),%xmm1
2131	movq	%rax,%r10
2132	leaq	.Lxts_magic(%rip),%r8
2133	jmp	.Lxts_dec_grandloop
2134
2135.align	32
/*
 * One pass decrypts six blocks in %xmm2..%xmm7. Input is XORed with
 * (tweak ^ round key 0), which applies the tweak and the first AddRoundKey
 * together. Each tweak is then XORed with 96(%rsp) to give
 * (tweak ^ last round key) and saved at 0..80(%rsp) for the final round.
 * The early AES rounds are interleaved with this bookkeeping.
 */
2136.Lxts_dec_grandloop:
2137	movdqu	0(%rdi),%xmm2
2138	movdqa	%xmm0,%xmm8
2139	movdqu	16(%rdi),%xmm3
2140	pxor	%xmm10,%xmm2
2141	movdqu	32(%rdi),%xmm4
2142	pxor	%xmm11,%xmm3
2143.byte	102,15,56,222,209
2144	movdqu	48(%rdi),%xmm5
2145	pxor	%xmm12,%xmm4
2146.byte	102,15,56,222,217
2147	movdqu	64(%rdi),%xmm6
2148	pxor	%xmm13,%xmm5
2149.byte	102,15,56,222,225
2150	movdqu	80(%rdi),%xmm7
2151	pxor	%xmm15,%xmm8
2152	movdqa	96(%rsp),%xmm9
2153	pxor	%xmm14,%xmm6
2154.byte	102,15,56,222,233
2155	movups	32(%r11),%xmm0
2156	leaq	96(%rdi),%rdi
2157	pxor	%xmm8,%xmm7
2158
2159	pxor	%xmm9,%xmm10
2160.byte	102,15,56,222,241
2161	pxor	%xmm9,%xmm11
2162	movdqa	%xmm10,0(%rsp)
2163.byte	102,15,56,222,249
2164	movups	48(%r11),%xmm1
2165	pxor	%xmm9,%xmm12
2166
2167.byte	102,15,56,222,208
2168	pxor	%xmm9,%xmm13
2169	movdqa	%xmm11,16(%rsp)
2170.byte	102,15,56,222,216
2171	pxor	%xmm9,%xmm14
2172	movdqa	%xmm12,32(%rsp)
2173.byte	102,15,56,222,224
2174.byte	102,15,56,222,232
2175	pxor	%xmm9,%xmm8
2176	movdqa	%xmm14,64(%rsp)
2177.byte	102,15,56,222,240
2178.byte	102,15,56,222,248
2179	movups	64(%r11),%xmm0
2180	movdqa	%xmm8,80(%rsp)
/* Seed %xmm9 from the current tweak for the tweak updates that follow the inner loop. */
2181	pshufd	$95,%xmm15,%xmm9
2182	jmp	.Lxts_dec_loop6
2183.align	32
/*
 * Middle rounds, two per iteration, on all six blocks. The jnz tests the
 * flags set by the addq; the movups after it does not modify flags.
 */
2184.Lxts_dec_loop6:
2185.byte	102,15,56,222,209
2186.byte	102,15,56,222,217
2187.byte	102,15,56,222,225
2188.byte	102,15,56,222,233
2189.byte	102,15,56,222,241
2190.byte	102,15,56,222,249
2191	movups	-64(%rcx,%rax,1),%xmm1
2192	addq	$32,%rax
2193
2194.byte	102,15,56,222,208
2195.byte	102,15,56,222,216
2196.byte	102,15,56,222,224
2197.byte	102,15,56,222,232
2198.byte	102,15,56,222,240
2199.byte	102,15,56,222,248
2200	movups	-80(%rcx,%rax,1),%xmm0
2201	jnz	.Lxts_dec_loop6
2202
/*
 * Remaining rounds, interleaved with computing the tweaks for the next
 * pass: %xmm10..%xmm14 are rebuilt as (round key 0 ^ tweak) and %xmm15
 * keeps advancing. The tweak that was in %xmm13 for this pass
 * (already XORed with the last round key) is parked at 48(%rsp) first.
 */
2203	movdqa	(%r8),%xmm8
2204	movdqa	%xmm9,%xmm14
2205	paddd	%xmm9,%xmm9
2206.byte	102,15,56,222,209
2207	paddq	%xmm15,%xmm15
2208	psrad	$31,%xmm14
2209.byte	102,15,56,222,217
2210	pand	%xmm8,%xmm14
2211	movups	(%r11),%xmm10
2212.byte	102,15,56,222,225
2213.byte	102,15,56,222,233
2214.byte	102,15,56,222,241
2215	pxor	%xmm14,%xmm15
2216	movaps	%xmm10,%xmm11
2217.byte	102,15,56,222,249
2218	movups	-64(%rcx),%xmm1
2219
2220	movdqa	%xmm9,%xmm14
2221.byte	102,15,56,222,208
2222	paddd	%xmm9,%xmm9
2223	pxor	%xmm15,%xmm10
2224.byte	102,15,56,222,216
2225	psrad	$31,%xmm14
2226	paddq	%xmm15,%xmm15
2227.byte	102,15,56,222,224
2228.byte	102,15,56,222,232
2229	pand	%xmm8,%xmm14
2230	movaps	%xmm11,%xmm12
2231.byte	102,15,56,222,240
2232	pxor	%xmm14,%xmm15
2233	movdqa	%xmm9,%xmm14
2234.byte	102,15,56,222,248
2235	movups	-48(%rcx),%xmm0
2236
2237	paddd	%xmm9,%xmm9
2238.byte	102,15,56,222,209
2239	pxor	%xmm15,%xmm11
2240	psrad	$31,%xmm14
2241.byte	102,15,56,222,217
2242	paddq	%xmm15,%xmm15
2243	pand	%xmm8,%xmm14
2244.byte	102,15,56,222,225
2245.byte	102,15,56,222,233
2246	movdqa	%xmm13,48(%rsp)
2247	pxor	%xmm14,%xmm15
2248.byte	102,15,56,222,241
2249	movaps	%xmm12,%xmm13
2250	movdqa	%xmm9,%xmm14
2251.byte	102,15,56,222,249
2252	movups	-32(%rcx),%xmm1
2253
2254	paddd	%xmm9,%xmm9
2255.byte	102,15,56,222,208
2256	pxor	%xmm15,%xmm12
2257	psrad	$31,%xmm14
2258.byte	102,15,56,222,216
2259	paddq	%xmm15,%xmm15
2260	pand	%xmm8,%xmm14
2261.byte	102,15,56,222,224
2262.byte	102,15,56,222,232
2263.byte	102,15,56,222,240
2264	pxor	%xmm14,%xmm15
2265	movaps	%xmm13,%xmm14
2266.byte	102,15,56,222,248
2267
/* %xmm0 is borrowed as a scratch mask here, then reloaded with round key 0 for the next pass. */
2268	movdqa	%xmm9,%xmm0
2269	paddd	%xmm9,%xmm9
2270.byte	102,15,56,222,209
2271	pxor	%xmm15,%xmm13
2272	psrad	$31,%xmm0
2273.byte	102,15,56,222,217
2274	paddq	%xmm15,%xmm15
2275	pand	%xmm8,%xmm0
2276.byte	102,15,56,222,225
2277.byte	102,15,56,222,233
2278	pxor	%xmm0,%xmm15
2279	movups	(%r11),%xmm0
2280.byte	102,15,56,222,241
2281.byte	102,15,56,222,249
2282	movups	16(%r11),%xmm1
2283
/*
 * Final round: aesdeclast with memory operands 0,16,...,80(%rsp) for
 * %xmm2..%xmm7. Each slot holds (tweak ^ last round key), so the last
 * AddRoundKey and the output tweak XOR are done in one instruction.
 * %rax is reset from %r10 for the next pass.
 */
2284	pxor	%xmm15,%xmm14
2285.byte	102,15,56,223,84,36,0
2286	psrad	$31,%xmm9
2287	paddq	%xmm15,%xmm15
2288.byte	102,15,56,223,92,36,16
2289.byte	102,15,56,223,100,36,32
2290	pand	%xmm8,%xmm9
2291	movq	%r10,%rax
2292.byte	102,15,56,223,108,36,48
2293.byte	102,15,56,223,116,36,64
2294.byte	102,15,56,223,124,36,80
2295	pxor	%xmm9,%xmm15
2296
/* Store the six result blocks and loop while at least 96 bytes remain. */
2297	leaq	96(%rsi),%rsi
2298	movups	%xmm2,-96(%rsi)
2299	movups	%xmm3,-80(%rsi)
2300	movups	%xmm4,-64(%rsi)
2301	movups	%xmm5,-48(%rsi)
2302	movups	%xmm6,-32(%rsi)
2303	movups	%xmm7,-16(%rsi)
2304	subq	$96,%rdx
2305	jnc	.Lxts_dec_grandloop
2306
/* Undo the loop setup: %eax = (112 - %r10) / 16 = round count, %rcx = key base. */
2307	movl	$16+96,%eax
2308	subl	%r10d,%eax
2309	movq	%r11,%rcx
2310	shrl	$4,%eax
2311
/*
 * Short path for 0..5 remaining whole blocks. %r10d saves the round count.
 * %xmm10..%xmm14 arrive XORed with round key 0 (%xmm0); each pxor below
 * strips that. One more tweak than the block count is converted on each
 * branch because the stealing tail consumes two tweaks (%xmm10 and %xmm11).
 * The je branches reuse the flags of the preceding cmpq because pxor does
 * not modify flags.
 */
2312.Lxts_dec_short:
2313
2314	movl	%eax,%r10d
2315	pxor	%xmm0,%xmm10
2316	pxor	%xmm0,%xmm11
2317	addq	$96,%rdx
2318	jz	.Lxts_dec_done
2319
2320	pxor	%xmm0,%xmm12
2321	cmpq	$32,%rdx
2322	jb	.Lxts_dec_one
2323	pxor	%xmm0,%xmm13
2324	je	.Lxts_dec_two
2325
2326	pxor	%xmm0,%xmm14
2327	cmpq	$64,%rdx
2328	jb	.Lxts_dec_three
2329	je	.Lxts_dec_four
2330
/* Five blocks via _aesni_decrypt6 (defined outside this block); %xmm7 is not initialised here and its result is not stored. */
2331	movdqu	(%rdi),%xmm2
2332	movdqu	16(%rdi),%xmm3
2333	movdqu	32(%rdi),%xmm4
2334	pxor	%xmm10,%xmm2
2335	movdqu	48(%rdi),%xmm5
2336	pxor	%xmm11,%xmm3
2337	movdqu	64(%rdi),%xmm6
2338	leaq	80(%rdi),%rdi
2339	pxor	%xmm12,%xmm4
2340	pxor	%xmm13,%xmm5
2341	pxor	%xmm14,%xmm6
2342
2343	call	_aesni_decrypt6
2344
/*
 * Apply the output tweak XOR and store. Interleaved with the stores,
 * %xmm14 is turned into a per-dword sign mask of %xmm15 (0 > dword) and
 * shuffled into %xmm11 as the carry mask for one more tweak doubling.
 */
2345	xorps	%xmm10,%xmm2
2346	xorps	%xmm11,%xmm3
2347	xorps	%xmm12,%xmm4
2348	movdqu	%xmm2,(%rsi)
2349	xorps	%xmm13,%xmm5
2350	movdqu	%xmm3,16(%rsi)
2351	xorps	%xmm14,%xmm6
2352	movdqu	%xmm4,32(%rsi)
2353	pxor	%xmm14,%xmm14
2354	movdqu	%xmm5,48(%rsi)
2355	pcmpgtd	%xmm15,%xmm14
2356	movdqu	%xmm6,64(%rsi)
2357	leaq	80(%rsi),%rsi
2358	pshufd	$19,%xmm14,%xmm11
2359	andq	$15,%r9
2360	jz	.Lxts_dec_ret
2361
/* Partial tail follows: %xmm10 = current tweak, %xmm11 = the tweak after it (doubled %xmm15 ^ masked constant in %xmm8). */
2362	movdqa	%xmm15,%xmm10
2363	paddq	%xmm15,%xmm15
2364	pand	%xmm8,%xmm11
2365	pxor	%xmm15,%xmm11
2366	jmp	.Lxts_dec_done2
2367
2368.align	16
/* One block: tweak XOR, inline single-block AES decrypt using %rcx/%eax, tweak XOR; %xmm10/%xmm11 = next two tweaks. */
2369.Lxts_dec_one:
2370	movups	(%rdi),%xmm2
2371	leaq	16(%rdi),%rdi
2372	xorps	%xmm10,%xmm2
2373	movups	(%rcx),%xmm0
2374	movups	16(%rcx),%xmm1
2375	leaq	32(%rcx),%rcx
2376	xorps	%xmm0,%xmm2
2377.Loop_dec1_12:
2378.byte	102,15,56,222,209
2379	decl	%eax
2380	movups	(%rcx),%xmm1
2381	leaq	16(%rcx),%rcx
2382	jnz	.Loop_dec1_12
2383.byte	102,15,56,223,209
2384	xorps	%xmm10,%xmm2
2385	movdqa	%xmm11,%xmm10
2386	movups	%xmm2,(%rsi)
2387	movdqa	%xmm12,%xmm11
2388	leaq	16(%rsi),%rsi
2389	jmp	.Lxts_dec_done
2390
2391.align	16
/* Two blocks via _aesni_decrypt2 (defined outside this block); %xmm10/%xmm11 = next two tweaks. */
2392.Lxts_dec_two:
2393	movups	(%rdi),%xmm2
2394	movups	16(%rdi),%xmm3
2395	leaq	32(%rdi),%rdi
2396	xorps	%xmm10,%xmm2
2397	xorps	%xmm11,%xmm3
2398
2399	call	_aesni_decrypt2
2400
2401	xorps	%xmm10,%xmm2
2402	movdqa	%xmm12,%xmm10
2403	xorps	%xmm11,%xmm3
2404	movdqa	%xmm13,%xmm11
2405	movups	%xmm2,(%rsi)
2406	movups	%xmm3,16(%rsi)
2407	leaq	32(%rsi),%rsi
2408	jmp	.Lxts_dec_done
2409
2410.align	16
/* Three blocks via _aesni_decrypt3 (defined outside this block); %xmm10/%xmm11 = next two tweaks. */
2411.Lxts_dec_three:
2412	movups	(%rdi),%xmm2
2413	movups	16(%rdi),%xmm3
2414	movups	32(%rdi),%xmm4
2415	leaq	48(%rdi),%rdi
2416	xorps	%xmm10,%xmm2
2417	xorps	%xmm11,%xmm3
2418	xorps	%xmm12,%xmm4
2419
2420	call	_aesni_decrypt3
2421
2422	xorps	%xmm10,%xmm2
2423	movdqa	%xmm13,%xmm10
2424	xorps	%xmm11,%xmm3
2425	movdqa	%xmm14,%xmm11
2426	xorps	%xmm12,%xmm4
2427	movups	%xmm2,(%rsi)
2428	movups	%xmm3,16(%rsi)
2429	movups	%xmm4,32(%rsi)
2430	leaq	48(%rsi),%rsi
2431	jmp	.Lxts_dec_done
2432
2433.align	16
/* Four blocks via _aesni_decrypt4 (defined outside this block); %xmm10/%xmm11 = next two tweaks (%xmm15 is the plain sixth tweak). */
2434.Lxts_dec_four:
2435	movups	(%rdi),%xmm2
2436	movups	16(%rdi),%xmm3
2437	movups	32(%rdi),%xmm4
2438	xorps	%xmm10,%xmm2
2439	movups	48(%rdi),%xmm5
2440	leaq	64(%rdi),%rdi
2441	xorps	%xmm11,%xmm3
2442	xorps	%xmm12,%xmm4
2443	xorps	%xmm13,%xmm5
2444
2445	call	_aesni_decrypt4
2446
2447	pxor	%xmm10,%xmm2
2448	movdqa	%xmm14,%xmm10
2449	pxor	%xmm11,%xmm3
2450	movdqa	%xmm15,%xmm11
2451	pxor	%xmm12,%xmm4
2452	movdqu	%xmm2,(%rsi)
2453	pxor	%xmm13,%xmm5
2454	movdqu	%xmm3,16(%rsi)
2455	movdqu	%xmm4,32(%rsi)
2456	movdqu	%xmm5,48(%rsi)
2457	leaq	64(%rsi),%rsi
2458	jmp	.Lxts_dec_done
2459
2460.align	16
/* All blocks before the held-back one are written. %r9 & 15 is the count of trailing bytes; zero means finished. */
2461.Lxts_dec_done:
2462	andq	$15,%r9
2463	jz	.Lxts_dec_ret
/*
 * Stealing tail, step 1: decrypt the held-back full block at (%rdi) with
 * the later tweak (%xmm11) and store it at (%rsi). Key base and round
 * count are restored from %r11/%r10d.
 */
2464.Lxts_dec_done2:
2465	movq	%r9,%rdx
2466	movq	%r11,%rcx
2467	movl	%r10d,%eax
2468
2469	movups	(%rdi),%xmm2
2470	xorps	%xmm11,%xmm2
2471	movups	(%rcx),%xmm0
2472	movups	16(%rcx),%xmm1
2473	leaq	32(%rcx),%rcx
2474	xorps	%xmm0,%xmm2
2475.Loop_dec1_13:
2476.byte	102,15,56,222,209
2477	decl	%eax
2478	movups	(%rcx),%xmm1
2479	leaq	16(%rcx),%rcx
2480	jnz	.Loop_dec1_13
2481.byte	102,15,56,223,209
2482	xorps	%xmm11,%xmm2
2483	movups	%xmm2,(%rsi)
2484
/*
 * Step 2, one byte per iteration: the trailing input byte at 16(%rdi)
 * replaces the corresponding byte of the block just written at (%rsi),
 * and the byte it displaces is written at 16(%rsi) as the partial
 * final output block.
 */
2485.Lxts_dec_steal:
2486	movzbl	16(%rdi),%eax
2487	movzbl	(%rsi),%ecx
2488	leaq	1(%rdi),%rdi
2489	movb	%al,(%rsi)
2490	movb	%cl,16(%rsi)
2491	leaq	1(%rsi),%rsi
2492	subq	$1,%rdx
2493	jnz	.Lxts_dec_steal
2494
/* Step 3: rewind %rsi and decrypt the patched block in place with the earlier tweak (%xmm10). */
2495	subq	%r9,%rsi
2496	movq	%r11,%rcx
2497	movl	%r10d,%eax
2498
2499	movups	(%rsi),%xmm2
2500	xorps	%xmm10,%xmm2
2501	movups	(%rcx),%xmm0
2502	movups	16(%rcx),%xmm1
2503	leaq	32(%rcx),%rcx
2504	xorps	%xmm0,%xmm2
2505.Loop_dec1_14:
2506.byte	102,15,56,222,209
2507	decl	%eax
2508	movups	(%rcx),%xmm1
2509	leaq	16(%rcx),%rcx
2510	jnz	.Loop_dec1_14
2511.byte	102,15,56,223,209
2512	xorps	%xmm10,%xmm2
2513	movups	%xmm2,(%rsi)
2514
/* Exit: zero %xmm0..%xmm15 and the 112-byte stack scratch so no key or tweak material is left behind, then unwind the frame. */
2515.Lxts_dec_ret:
2516	xorps	%xmm0,%xmm0
2517	pxor	%xmm1,%xmm1
2518	pxor	%xmm2,%xmm2
2519	pxor	%xmm3,%xmm3
2520	pxor	%xmm4,%xmm4
2521	pxor	%xmm5,%xmm5
2522	pxor	%xmm6,%xmm6
2523	pxor	%xmm7,%xmm7
2524	movaps	%xmm0,0(%rsp)
2525	pxor	%xmm8,%xmm8
2526	movaps	%xmm0,16(%rsp)
2527	pxor	%xmm9,%xmm9
2528	movaps	%xmm0,32(%rsp)
2529	pxor	%xmm10,%xmm10
2530	movaps	%xmm0,48(%rsp)
2531	pxor	%xmm11,%xmm11
2532	movaps	%xmm0,64(%rsp)
2533	pxor	%xmm12,%xmm12
2534	movaps	%xmm0,80(%rsp)
2535	pxor	%xmm13,%xmm13
2536	movaps	%xmm0,96(%rsp)
2537	pxor	%xmm14,%xmm14
2538	pxor	%xmm15,%xmm15
/* %rbp points at the saved %rbp slot, so this restores %rsp to just below the return address. */
2539	leaq	(%rbp),%rsp
2540	popq	%rbp
2541.Lxts_dec_epilogue:
2542	.byte	0xf3,0xc3
2543.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2544.globl	aesni_cbc_encrypt
2545.hidden aesni_cbc_encrypt
2546.type	aesni_cbc_encrypt,@function
2547.align	16
2548aesni_cbc_encrypt:
2549	testq	%rdx,%rdx
2550	jz	.Lcbc_ret
2551
2552	movl	240(%rcx),%r10d
2553	movq	%rcx,%r11
2554	testl	%r9d,%r9d
2555	jz	.Lcbc_decrypt
2556
2557	movups	(%r8),%xmm2
2558	movl	%r10d,%eax
2559	cmpq	$16,%rdx
2560	jb	.Lcbc_enc_tail
2561	subq	$16,%rdx
2562	jmp	.Lcbc_enc_loop
2563.align	16
2564.Lcbc_enc_loop:
2565	movups	(%rdi),%xmm3
2566	leaq	16(%rdi),%rdi
2567
2568	movups	(%rcx),%xmm0
2569	movups	16(%rcx),%xmm1
2570	xorps	%xmm0,%xmm3
2571	leaq	32(%rcx),%rcx
2572	xorps	%xmm3,%xmm2
2573.Loop_enc1_15:
2574.byte	102,15,56,220,209
2575	decl	%eax
2576	movups	(%rcx),%xmm1
2577	leaq	16(%rcx),%rcx
2578	jnz	.Loop_enc1_15
2579.byte	102,15,56,221,209
2580	movl	%r10d,%eax
2581	movq	%r11,%rcx
2582	movups	%xmm2,0(%rsi)
2583	leaq	16(%rsi),%rsi
2584	subq	$16,%rdx
2585	jnc	.Lcbc_enc_loop
2586	addq	$16,%rdx
2587	jnz	.Lcbc_enc_tail
2588	pxor	%xmm0,%xmm0
2589	pxor	%xmm1,%xmm1
2590	movups	%xmm2,(%r8)
2591	pxor	%xmm2,%xmm2
2592	pxor	%xmm3,%xmm3
2593	jmp	.Lcbc_ret
2594
2595.Lcbc_enc_tail:
2596	movq	%rdx,%rcx
2597	xchgq	%rdi,%rsi
2598.long	0x9066A4F3
2599	movl	$16,%ecx
2600	subq	%rdx,%rcx
2601	xorl	%eax,%eax
2602.long	0x9066AAF3
2603	leaq	-16(%rdi),%rdi
2604	movl	%r10d,%eax
2605	movq	%rdi,%rsi
2606	movq	%r11,%rcx
2607	xorq	%rdx,%rdx
2608	jmp	.Lcbc_enc_loop
2609
2610.align	16
2611.Lcbc_decrypt:
2612	cmpq	$16,%rdx
2613	jne	.Lcbc_decrypt_bulk
2614
2615
2616
2617	movdqu	(%rdi),%xmm2
2618	movdqu	(%r8),%xmm3
2619	movdqa	%xmm2,%xmm4
2620	movups	(%rcx),%xmm0
2621	movups	16(%rcx),%xmm1
2622	leaq	32(%rcx),%rcx
2623	xorps	%xmm0,%xmm2
2624.Loop_dec1_16:
2625.byte	102,15,56,222,209
2626	decl	%r10d
2627	movups	(%rcx),%xmm1
2628	leaq	16(%rcx),%rcx
2629	jnz	.Loop_dec1_16
2630.byte	102,15,56,223,209
2631	pxor	%xmm0,%xmm0
2632	pxor	%xmm1,%xmm1
2633	movdqu	%xmm4,(%r8)
2634	xorps	%xmm3,%xmm2
2635	pxor	%xmm3,%xmm3
2636	movups	%xmm2,(%rsi)
2637	pxor	%xmm2,%xmm2
2638	jmp	.Lcbc_ret
2639.align	16
2640.Lcbc_decrypt_bulk:
2641	leaq	(%rsp),%rax
2642	pushq	%rbp
2643	subq	$16,%rsp
2644	andq	$-16,%rsp
2645	leaq	-8(%rax),%rbp
2646	movups	(%r8),%xmm10
2647	movl	%r10d,%eax
2648	cmpq	$80,%rdx
2649	jbe	.Lcbc_dec_tail
2650
2651	movups	(%rcx),%xmm0
2652	movdqu	0(%rdi),%xmm2
2653	movdqu	16(%rdi),%xmm3
2654	movdqa	%xmm2,%xmm11
2655	movdqu	32(%rdi),%xmm4
2656	movdqa	%xmm3,%xmm12
2657	movdqu	48(%rdi),%xmm5
2658	movdqa	%xmm4,%xmm13
2659	movdqu	64(%rdi),%xmm6
2660	movdqa	%xmm5,%xmm14
2661	movdqu	80(%rdi),%xmm7
2662	movdqa	%xmm6,%xmm15
2663	movl	OPENSSL_ia32cap_P+4(%rip),%r9d
2664	cmpq	$112,%rdx
2665	jbe	.Lcbc_dec_six_or_seven
2666
2667	andl	$71303168,%r9d
2668	subq	$80,%rdx
2669	cmpl	$4194304,%r9d
2670	je	.Lcbc_dec_loop6_enter
2671	subq	$32,%rdx
2672	leaq	112(%rcx),%rcx
2673	jmp	.Lcbc_dec_loop8_enter
2674.align	16
2675.Lcbc_dec_loop8:
2676	movups	%xmm9,(%rsi)
2677	leaq	16(%rsi),%rsi
2678.Lcbc_dec_loop8_enter:
2679	movdqu	96(%rdi),%xmm8
2680	pxor	%xmm0,%xmm2
2681	movdqu	112(%rdi),%xmm9
2682	pxor	%xmm0,%xmm3
2683	movups	16-112(%rcx),%xmm1
2684	pxor	%xmm0,%xmm4
2685	xorq	%r11,%r11
2686	cmpq	$112,%rdx
2687	pxor	%xmm0,%xmm5
2688	pxor	%xmm0,%xmm6
2689	pxor	%xmm0,%xmm7
2690	pxor	%xmm0,%xmm8
2691
2692.byte	102,15,56,222,209
2693	pxor	%xmm0,%xmm9
2694	movups	32-112(%rcx),%xmm0
2695.byte	102,15,56,222,217
2696.byte	102,15,56,222,225
2697.byte	102,15,56,222,233
2698.byte	102,15,56,222,241
2699.byte	102,15,56,222,249
2700.byte	102,68,15,56,222,193
2701	setnc	%r11b
2702	shlq	$7,%r11
2703.byte	102,68,15,56,222,201
2704	addq	%rdi,%r11
2705	movups	48-112(%rcx),%xmm1
2706.byte	102,15,56,222,208
2707.byte	102,15,56,222,216
2708.byte	102,15,56,222,224
2709.byte	102,15,56,222,232
2710.byte	102,15,56,222,240
2711.byte	102,15,56,222,248
2712.byte	102,68,15,56,222,192
2713.byte	102,68,15,56,222,200
2714	movups	64-112(%rcx),%xmm0
2715	nop
2716.byte	102,15,56,222,209
2717.byte	102,15,56,222,217
2718.byte	102,15,56,222,225
2719.byte	102,15,56,222,233
2720.byte	102,15,56,222,241
2721.byte	102,15,56,222,249
2722.byte	102,68,15,56,222,193
2723.byte	102,68,15,56,222,201
2724	movups	80-112(%rcx),%xmm1
2725	nop
2726.byte	102,15,56,222,208
2727.byte	102,15,56,222,216
2728.byte	102,15,56,222,224
2729.byte	102,15,56,222,232
2730.byte	102,15,56,222,240
2731.byte	102,15,56,222,248
2732.byte	102,68,15,56,222,192
2733.byte	102,68,15,56,222,200
2734	movups	96-112(%rcx),%xmm0
2735	nop
2736.byte	102,15,56,222,209
2737.byte	102,15,56,222,217
2738.byte	102,15,56,222,225
2739.byte	102,15,56,222,233
2740.byte	102,15,56,222,241
2741.byte	102,15,56,222,249
2742.byte	102,68,15,56,222,193
2743.byte	102,68,15,56,222,201
2744	movups	112-112(%rcx),%xmm1
2745	nop
2746.byte	102,15,56,222,208
2747.byte	102,15,56,222,216
2748.byte	102,15,56,222,224
2749.byte	102,15,56,222,232
2750.byte	102,15,56,222,240
2751.byte	102,15,56,222,248
2752.byte	102,68,15,56,222,192
2753.byte	102,68,15,56,222,200
2754	movups	128-112(%rcx),%xmm0
2755	nop
2756.byte	102,15,56,222,209
2757.byte	102,15,56,222,217
2758.byte	102,15,56,222,225
2759.byte	102,15,56,222,233
2760.byte	102,15,56,222,241
2761.byte	102,15,56,222,249
2762.byte	102,68,15,56,222,193
2763.byte	102,68,15,56,222,201
2764	movups	144-112(%rcx),%xmm1
2765	cmpl	$11,%eax
2766.byte	102,15,56,222,208
2767.byte	102,15,56,222,216
2768.byte	102,15,56,222,224
2769.byte	102,15,56,222,232
2770.byte	102,15,56,222,240
2771.byte	102,15,56,222,248
2772.byte	102,68,15,56,222,192
2773.byte	102,68,15,56,222,200
2774	movups	160-112(%rcx),%xmm0
2775	jb	.Lcbc_dec_done
2776.byte	102,15,56,222,209
2777.byte	102,15,56,222,217
2778.byte	102,15,56,222,225
2779.byte	102,15,56,222,233
2780.byte	102,15,56,222,241
2781.byte	102,15,56,222,249
2782.byte	102,68,15,56,222,193
2783.byte	102,68,15,56,222,201
2784	movups	176-112(%rcx),%xmm1
2785	nop
2786.byte	102,15,56,222,208
2787.byte	102,15,56,222,216
2788.byte	102,15,56,222,224
2789.byte	102,15,56,222,232
2790.byte	102,15,56,222,240
2791.byte	102,15,56,222,248
2792.byte	102,68,15,56,222,192
2793.byte	102,68,15,56,222,200
2794	movups	192-112(%rcx),%xmm0
2795	je	.Lcbc_dec_done
2796.byte	102,15,56,222,209
2797.byte	102,15,56,222,217
2798.byte	102,15,56,222,225
2799.byte	102,15,56,222,233
2800.byte	102,15,56,222,241
2801.byte	102,15,56,222,249
2802.byte	102,68,15,56,222,193
2803.byte	102,68,15,56,222,201
2804	movups	208-112(%rcx),%xmm1
2805	nop
2806.byte	102,15,56,222,208
2807.byte	102,15,56,222,216
2808.byte	102,15,56,222,224
2809.byte	102,15,56,222,232
2810.byte	102,15,56,222,240
2811.byte	102,15,56,222,248
2812.byte	102,68,15,56,222,192
2813.byte	102,68,15,56,222,200
2814	movups	224-112(%rcx),%xmm0
2815	jmp	.Lcbc_dec_done
2816.align	16
2817.Lcbc_dec_done:
2818.byte	102,15,56,222,209
2819.byte	102,15,56,222,217
2820	pxor	%xmm0,%xmm10
2821	pxor	%xmm0,%xmm11
2822.byte	102,15,56,222,225
2823.byte	102,15,56,222,233
2824	pxor	%xmm0,%xmm12
2825	pxor	%xmm0,%xmm13
2826.byte	102,15,56,222,241
2827.byte	102,15,56,222,249
2828	pxor	%xmm0,%xmm14
2829	pxor	%xmm0,%xmm15
2830.byte	102,68,15,56,222,193
2831.byte	102,68,15,56,222,201
2832	movdqu	80(%rdi),%xmm1
2833
2834.byte	102,65,15,56,223,210
2835	movdqu	96(%rdi),%xmm10
2836	pxor	%xmm0,%xmm1
2837.byte	102,65,15,56,223,219
2838	pxor	%xmm0,%xmm10
2839	movdqu	112(%rdi),%xmm0
2840.byte	102,65,15,56,223,228
2841	leaq	128(%rdi),%rdi
2842	movdqu	0(%r11),%xmm11
2843.byte	102,65,15,56,223,237
2844.byte	102,65,15,56,223,246
2845	movdqu	16(%r11),%xmm12
2846	movdqu	32(%r11),%xmm13
2847.byte	102,65,15,56,223,255
2848.byte	102,68,15,56,223,193
2849	movdqu	48(%r11),%xmm14
2850	movdqu	64(%r11),%xmm15
2851.byte	102,69,15,56,223,202
2852	movdqa	%xmm0,%xmm10
2853	movdqu	80(%r11),%xmm1
2854	movups	-112(%rcx),%xmm0
2855
2856	movups	%xmm2,(%rsi)
2857	movdqa	%xmm11,%xmm2
2858	movups	%xmm3,16(%rsi)
2859	movdqa	%xmm12,%xmm3
2860	movups	%xmm4,32(%rsi)
2861	movdqa	%xmm13,%xmm4
2862	movups	%xmm5,48(%rsi)
2863	movdqa	%xmm14,%xmm5
2864	movups	%xmm6,64(%rsi)
2865	movdqa	%xmm15,%xmm6
2866	movups	%xmm7,80(%rsi)
2867	movdqa	%xmm1,%xmm7
2868	movups	%xmm8,96(%rsi)
2869	leaq	112(%rsi),%rsi
2870
2871	subq	$128,%rdx
2872	ja	.Lcbc_dec_loop8
2873
2874	movaps	%xmm9,%xmm2
2875	leaq	-112(%rcx),%rcx
2876	addq	$112,%rdx
2877	jle	.Lcbc_dec_clear_tail_collected
2878	movups	%xmm9,(%rsi)
2879	leaq	16(%rsi),%rsi
2880	cmpq	$80,%rdx
2881	jbe	.Lcbc_dec_tail
2882
2883	movaps	%xmm11,%xmm2
2884.Lcbc_dec_six_or_seven:
2885	cmpq	$96,%rdx
2886	ja	.Lcbc_dec_seven
2887
2888	movaps	%xmm7,%xmm8
2889	call	_aesni_decrypt6
2890	pxor	%xmm10,%xmm2
2891	movaps	%xmm8,%xmm10
2892	pxor	%xmm11,%xmm3
2893	movdqu	%xmm2,(%rsi)
2894	pxor	%xmm12,%xmm4
2895	movdqu	%xmm3,16(%rsi)
2896	pxor	%xmm3,%xmm3
2897	pxor	%xmm13,%xmm5
2898	movdqu	%xmm4,32(%rsi)
2899	pxor	%xmm4,%xmm4
2900	pxor	%xmm14,%xmm6
2901	movdqu	%xmm5,48(%rsi)
2902	pxor	%xmm5,%xmm5
2903	pxor	%xmm15,%xmm7
2904	movdqu	%xmm6,64(%rsi)
2905	pxor	%xmm6,%xmm6
2906	leaq	80(%rsi),%rsi
2907	movdqa	%xmm7,%xmm2
2908	pxor	%xmm7,%xmm7
2909	jmp	.Lcbc_dec_tail_collected
2910
2911.align	16
2912.Lcbc_dec_seven:
2913	movups	96(%rdi),%xmm8
2914	xorps	%xmm9,%xmm9
2915	call	_aesni_decrypt8
2916	movups	80(%rdi),%xmm9
2917	pxor	%xmm10,%xmm2
2918	movups	96(%rdi),%xmm10
2919	pxor	%xmm11,%xmm3
2920	movdqu	%xmm2,(%rsi)
2921	pxor	%xmm12,%xmm4
2922	movdqu	%xmm3,16(%rsi)
2923	pxor	%xmm3,%xmm3
2924	pxor	%xmm13,%xmm5
2925	movdqu	%xmm4,32(%rsi)
2926	pxor	%xmm4,%xmm4
2927	pxor	%xmm14,%xmm6
2928	movdqu	%xmm5,48(%rsi)
2929	pxor	%xmm5,%xmm5
2930	pxor	%xmm15,%xmm7
2931	movdqu	%xmm6,64(%rsi)
2932	pxor	%xmm6,%xmm6
2933	pxor	%xmm9,%xmm8
2934	movdqu	%xmm7,80(%rsi)
2935	pxor	%xmm7,%xmm7
2936	leaq	96(%rsi),%rsi
2937	movdqa	%xmm8,%xmm2
2938	pxor	%xmm8,%xmm8
2939	pxor	%xmm9,%xmm9
2940	jmp	.Lcbc_dec_tail_collected
2941
2942.align	16
2943.Lcbc_dec_loop6:
2944	movups	%xmm7,(%rsi)
2945	leaq	16(%rsi),%rsi
2946	movdqu	0(%rdi),%xmm2
2947	movdqu	16(%rdi),%xmm3
2948	movdqa	%xmm2,%xmm11
2949	movdqu	32(%rdi),%xmm4
2950	movdqa	%xmm3,%xmm12
2951	movdqu	48(%rdi),%xmm5
2952	movdqa	%xmm4,%xmm13
2953	movdqu	64(%rdi),%xmm6
2954	movdqa	%xmm5,%xmm14
2955	movdqu	80(%rdi),%xmm7
2956	movdqa	%xmm6,%xmm15
2957.Lcbc_dec_loop6_enter:
2958	leaq	96(%rdi),%rdi
2959	movdqa	%xmm7,%xmm8
2960
2961	call	_aesni_decrypt6
2962
2963	pxor	%xmm10,%xmm2
2964	movdqa	%xmm8,%xmm10
2965	pxor	%xmm11,%xmm3
2966	movdqu	%xmm2,(%rsi)
2967	pxor	%xmm12,%xmm4
2968	movdqu	%xmm3,16(%rsi)
2969	pxor	%xmm13,%xmm5
2970	movdqu	%xmm4,32(%rsi)
2971	pxor	%xmm14,%xmm6
2972	movq	%r11,%rcx
2973	movdqu	%xmm5,48(%rsi)
2974	pxor	%xmm15,%xmm7
2975	movl	%r10d,%eax
2976	movdqu	%xmm6,64(%rsi)
2977	leaq	80(%rsi),%rsi
2978	subq	$96,%rdx
2979	ja	.Lcbc_dec_loop6
2980
2981	movdqa	%xmm7,%xmm2
2982	addq	$80,%rdx
2983	jle	.Lcbc_dec_clear_tail_collected
2984	movups	%xmm7,(%rsi)
2985	leaq	16(%rsi),%rsi
2986
2987.Lcbc_dec_tail:
2988	movups	(%rdi),%xmm2
2989	subq	$16,%rdx
2990	jbe	.Lcbc_dec_one
2991
2992	movups	16(%rdi),%xmm3
2993	movaps	%xmm2,%xmm11
2994	subq	$16,%rdx
2995	jbe	.Lcbc_dec_two
2996
2997	movups	32(%rdi),%xmm4
2998	movaps	%xmm3,%xmm12
2999	subq	$16,%rdx
3000	jbe	.Lcbc_dec_three
3001
3002	movups	48(%rdi),%xmm5
3003	movaps	%xmm4,%xmm13
3004	subq	$16,%rdx
3005	jbe	.Lcbc_dec_four
3006
3007	movups	64(%rdi),%xmm6
3008	movaps	%xmm5,%xmm14
3009	movaps	%xmm6,%xmm15
3010	xorps	%xmm7,%xmm7
3011	call	_aesni_decrypt6
3012	pxor	%xmm10,%xmm2
3013	movaps	%xmm15,%xmm10
3014	pxor	%xmm11,%xmm3
3015	movdqu	%xmm2,(%rsi)
3016	pxor	%xmm12,%xmm4
3017	movdqu	%xmm3,16(%rsi)
3018	pxor	%xmm3,%xmm3
3019	pxor	%xmm13,%xmm5
3020	movdqu	%xmm4,32(%rsi)
3021	pxor	%xmm4,%xmm4
3022	pxor	%xmm14,%xmm6
3023	movdqu	%xmm5,48(%rsi)
3024	pxor	%xmm5,%xmm5
3025	leaq	64(%rsi),%rsi
3026	movdqa	%xmm6,%xmm2
3027	pxor	%xmm6,%xmm6
3028	pxor	%xmm7,%xmm7
3029	subq	$16,%rdx
3030	jmp	.Lcbc_dec_tail_collected
3031
3032.align	16
3033.Lcbc_dec_one:
3034	movaps	%xmm2,%xmm11
3035	movups	(%rcx),%xmm0
3036	movups	16(%rcx),%xmm1
3037	leaq	32(%rcx),%rcx
3038	xorps	%xmm0,%xmm2
3039.Loop_dec1_17:
3040.byte	102,15,56,222,209
3041	decl	%eax
3042	movups	(%rcx),%xmm1
3043	leaq	16(%rcx),%rcx
3044	jnz	.Loop_dec1_17
3045.byte	102,15,56,223,209
3046	xorps	%xmm10,%xmm2
3047	movaps	%xmm11,%xmm10
3048	jmp	.Lcbc_dec_tail_collected
3049.align	16
3050.Lcbc_dec_two:
3051	movaps	%xmm3,%xmm12
3052	call	_aesni_decrypt2
3053	pxor	%xmm10,%xmm2
3054	movaps	%xmm12,%xmm10
3055	pxor	%xmm11,%xmm3
3056	movdqu	%xmm2,(%rsi)
3057	movdqa	%xmm3,%xmm2
3058	pxor	%xmm3,%xmm3
3059	leaq	16(%rsi),%rsi
3060	jmp	.Lcbc_dec_tail_collected
3061.align	16
3062.Lcbc_dec_three:
3063	movaps	%xmm4,%xmm13
3064	call	_aesni_decrypt3
3065	pxor	%xmm10,%xmm2
3066	movaps	%xmm13,%xmm10
3067	pxor	%xmm11,%xmm3
3068	movdqu	%xmm2,(%rsi)
3069	pxor	%xmm12,%xmm4
3070	movdqu	%xmm3,16(%rsi)
3071	pxor	%xmm3,%xmm3
3072	movdqa	%xmm4,%xmm2
3073	pxor	%xmm4,%xmm4
3074	leaq	32(%rsi),%rsi
3075	jmp	.Lcbc_dec_tail_collected
3076.align	16
3077.Lcbc_dec_four:
3078	movaps	%xmm5,%xmm14
3079	call	_aesni_decrypt4
3080	pxor	%xmm10,%xmm2
3081	movaps	%xmm14,%xmm10
3082	pxor	%xmm11,%xmm3
3083	movdqu	%xmm2,(%rsi)
3084	pxor	%xmm12,%xmm4
3085	movdqu	%xmm3,16(%rsi)
3086	pxor	%xmm3,%xmm3
3087	pxor	%xmm13,%xmm5
3088	movdqu	%xmm4,32(%rsi)
3089	pxor	%xmm4,%xmm4
3090	movdqa	%xmm5,%xmm2
3091	pxor	%xmm5,%xmm5
3092	leaq	48(%rsi),%rsi
3093	jmp	.Lcbc_dec_tail_collected
3094
3095.align	16
3096.Lcbc_dec_clear_tail_collected:
3097	pxor	%xmm3,%xmm3
3098	pxor	%xmm4,%xmm4
3099	pxor	%xmm5,%xmm5
3100	pxor	%xmm6,%xmm6
3101	pxor	%xmm7,%xmm7
3102	pxor	%xmm8,%xmm8
3103	pxor	%xmm9,%xmm9
3104.Lcbc_dec_tail_collected:
3105	movups	%xmm10,(%r8)
3106	andq	$15,%rdx
3107	jnz	.Lcbc_dec_tail_partial
3108	movups	%xmm2,(%rsi)
3109	pxor	%xmm2,%xmm2
3110	jmp	.Lcbc_dec_ret
3111.align	16
3112.Lcbc_dec_tail_partial:
3113	movaps	%xmm2,(%rsp)
3114	pxor	%xmm2,%xmm2
3115	movq	$16,%rcx
3116	movq	%rsi,%rdi
3117	subq	%rdx,%rcx
3118	leaq	(%rsp),%rsi
3119.long	0x9066A4F3
3120	movdqa	%xmm2,(%rsp)
3121
3122.Lcbc_dec_ret:
3123	xorps	%xmm0,%xmm0
3124	pxor	%xmm1,%xmm1
3125	leaq	(%rbp),%rsp
3126	popq	%rbp
3127.Lcbc_ret:
3128	.byte	0xf3,0xc3
3129.size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
3130.globl	aesni_set_decrypt_key
3131.hidden aesni_set_decrypt_key
3132.type	aesni_set_decrypt_key,@function
3133.align	16
/*
 * aesni_set_decrypt_key
 *
 * Builds a decryption key schedule: first runs __aesni_set_encrypt_key with
 * the incoming registers untouched, then converts the resulting schedule in
 * place by reversing the order of the round keys and applying AESIMC to
 * every round key except the first and the last.
 *
 * In:   %rdi, %esi, %rdx - passed straight through to
 *                          __aesni_set_encrypt_key (user key, key bits,
 *                          schedule pointer).
 * Out:  %eax             - whatever __aesni_set_encrypt_key returned; a
 *                          non-zero value skips the conversion entirely.
 * Clobbers: %rsi, %rdi, %rdx, %xmm0, %xmm1 (both zeroed on the success
 *           path), flags, plus everything the callee clobbers.
 */
3134aesni_set_decrypt_key:
/* Hand-encoded "sub $8,%rsp"; undone by "addq $8,%rsp" at .Ldec_key_ret. */
3135.byte	0x48,0x83,0xEC,0x08
3136	call	__aesni_set_encrypt_key
/*
 * On success the callee leaves 9, 11 or 13 in %esi and %rdx unchanged.
 * %esi * 16 + 16 is therefore the byte offset of the last round key.
 * The 32-bit shift also zero-extends into %rsi for the leaq below.
 */
3137	shll	$4,%esi
3138	testl	%eax,%eax
3139	jnz	.Ldec_key_ret
3140	leaq	16(%rdx,%rsi,1),%rdi
3141
/* Swap the first and the last round key as-is (no AESIMC on these two). */
3142	movups	(%rdx),%xmm0
3143	movups	(%rdi),%xmm1
3144	movups	%xmm0,(%rdi)
3145	movups	%xmm1,(%rdx)
3146	leaq	16(%rdx),%rdx
3147	leaq	-16(%rdi),%rdi
3148
/*
 * Walk inwards from both ends: %rdx moves up, %rdi moves down. Each pass
 * loads one key from each side, applies AESIMC to both and stores them
 * swapped. The loop continues while %rdi is still above %rdx (unsigned).
 */
3149.Ldec_key_inverse:
3150	movups	(%rdx),%xmm0
3151	movups	(%rdi),%xmm1
/* aesimc %xmm0,%xmm0 */
3152.byte	102,15,56,219,192
/* aesimc %xmm1,%xmm1 */
3153.byte	102,15,56,219,201
3154	leaq	16(%rdx),%rdx
3155	leaq	-16(%rdi),%rdi
/* Pointers were already advanced, hence the +16 / -16 displacements. */
3156	movups	%xmm0,16(%rdi)
3157	movups	%xmm1,-16(%rdx)
3158	cmpq	%rdx,%rdi
3159	ja	.Ldec_key_inverse
3160
/*
 * For the 9/11/13 values produced by the callee the schedule holds an odd
 * number of keys, so both pointers now meet on the middle key. It is
 * transformed with AESIMC and written back through %rdi.
 */
3161	movups	(%rdx),%xmm0
/* aesimc %xmm0,%xmm0 */
3162.byte	102,15,56,219,192
/* Wipe the registers that held round-key material. */
3163	pxor	%xmm1,%xmm1
3164	movups	%xmm0,(%rdi)
3165	pxor	%xmm0,%xmm0
3166.Ldec_key_ret:
3167	addq	$8,%rsp
/* Hand-encoded "rep ret". */
3168	.byte	0xf3,0xc3
3169.LSEH_end_set_decrypt_key:
3170.size	aesni_set_decrypt_key,.-aesni_set_decrypt_key
3171.globl	aesni_set_encrypt_key
3172.hidden aesni_set_encrypt_key
3173.type	aesni_set_encrypt_key,@function
3174.align	16
/*
 * aesni_set_encrypt_key / __aesni_set_encrypt_key
 *
 * Expands a user AES key into a round-key schedule.
 *
 * In:   %rdi = user key bytes (16 bytes are read at offset 0; 8 more at
 *              offset 16 for 192-bit keys, 16 more for 256-bit keys)
 *       %esi = key length in bits; only 128, 192 and 256 are accepted
 *       %rdx = output schedule. Round keys are written from offset 0 and
 *              the 32-bit value left in %esi (9, 11 or 13) is stored at
 *              offset 240 on every successful path.
 * Out:  %rax = 0 on success, -1 if %rdi or %rdx is NULL, -2 for any other
 *              bit length
 *       %esi = 9 / 11 / 13 on success; %rdx is never modified
 *              (aesni_set_decrypt_key relies on both).
 * Clobbers: %r10, %xmm0-%xmm5 (all six are zeroed before returning), flags.
 *
 * Two implementations exist per key size. The default one uses
 * AESKEYGENASSIST plus the .Lkey_expansion_* subroutines that follow this
 * function; the "_alt" one uses PSHUFB + AESENCLAST with constants from the
 * table at the end of the file. The choice is made from OPENSSL_ia32cap_P.
 */
3175aesni_set_encrypt_key:
3176__aesni_set_encrypt_key:
/* Hand-encoded "sub $8,%rsp"; undone by "addq $8,%rsp" at .Lenc_key_ret. */
3177.byte	0x48,0x83,0xEC,0x08
/* Preload the NULL-argument error code, then validate both pointers. */
3178	movq	$-1,%rax
3179	testq	%rdi,%rdi
3180	jz	.Lenc_key_ret
3181	testq	%rdx,%rdx
3182	jz	.Lenc_key_ret
3183
/*
 * %r10d = second 32-bit word of OPENSSL_ia32cap_P masked with 0x10000800
 * (268437504). The "_alt" paths are taken when the result is exactly
 * 0x10000000 (268435456), i.e. bit 28 set and bit 11 clear.
 * NOTE(review): in OpenSSL's capability vector these bits presumably stand
 * for AVX and XOP - confirm against the ia32cap documentation.
 *
 * %xmm0 = first 16 key bytes, %xmm4 = 0 (scratch for the helpers),
 * %rax  = store cursor, starting at the second schedule slot.
 */
3184	movl	$268437504,%r10d
3185	movups	(%rdi),%xmm0
3186	xorps	%xmm4,%xmm4
3187	andl	OPENSSL_ia32cap_P+4(%rip),%r10d
3188	leaq	16(%rdx),%rax
3189	cmpl	$256,%esi
3190	je	.L14rounds
3191	cmpl	$192,%esi
3192	je	.L12rounds
3193	cmpl	$128,%esi
3194	jne	.Lbad_keybits
3195
/* ---- 128-bit key ------------------------------------------------------ */
3196.L10rounds:
3197	movl	$9,%esi
3198	cmpl	$268435456,%r10d
3199	je	.L10rounds_alt
3200
/*
 * Each ".byte 102,15,58,223,200,N" is "aeskeygenassist $N,%xmm0,%xmm1";
 * N runs through the round constants 1,2,4,...,128,27,54. The first call
 * enters at the _cold label (nothing to store yet); every later call stores
 * the previous key at (%rax) and advances %rax by 16 before computing the
 * next one.
 */
3201	movups	%xmm0,(%rdx)
3202.byte	102,15,58,223,200,1
3203	call	.Lkey_expansion_128_cold
3204.byte	102,15,58,223,200,2
3205	call	.Lkey_expansion_128
3206.byte	102,15,58,223,200,4
3207	call	.Lkey_expansion_128
3208.byte	102,15,58,223,200,8
3209	call	.Lkey_expansion_128
3210.byte	102,15,58,223,200,16
3211	call	.Lkey_expansion_128
3212.byte	102,15,58,223,200,32
3213	call	.Lkey_expansion_128
3214.byte	102,15,58,223,200,64
3215	call	.Lkey_expansion_128
3216.byte	102,15,58,223,200,128
3217	call	.Lkey_expansion_128
3218.byte	102,15,58,223,200,27
3219	call	.Lkey_expansion_128
3220.byte	102,15,58,223,200,54
3221	call	.Lkey_expansion_128
/*
 * %rax is now schedule+160: store the final round key there and the value
 * of %esi at schedule+240 (160 + 80).
 */
3222	movups	%xmm0,(%rax)
3223	movl	%esi,80(%rax)
3224	xorl	%eax,%eax
3225	jmp	.Lenc_key_ret
3226
/*
 * 128-bit key, alternative path. %xmm5 = .Lkey_rotate shuffle mask,
 * %xmm4 = per-dword round constant starting at 1 and doubled every pass,
 * %xmm2 = copy of the previous round key, %r10d = loop counter (8).
 */
3227.align	16
3228.L10rounds_alt:
3229	movdqa	.Lkey_rotate(%rip),%xmm5
3230	movl	$8,%r10d
3231	movdqa	.Lkey_rcon1(%rip),%xmm4
3232	movdqa	%xmm0,%xmm2
3233	movdqu	%xmm0,(%rdx)
3234	jmp	.Loop_key128
3235
3236.align	16
3237.Loop_key128:
/* pshufb %xmm5,%xmm0 */
3238.byte	102,15,56,0,197
/* aesenclast %xmm4,%xmm0 */
3239.byte	102,15,56,221,196
3240	pslld	$1,%xmm4
3241	leaq	16(%rax),%rax
3242
/*
 * Three byte-shift/XOR steps leave in %xmm2 the XOR of the previous key
 * with itself shifted left by 4, 8 and 12 bytes.
 */
3243	movdqa	%xmm2,%xmm3
3244	pslldq	$4,%xmm2
3245	pxor	%xmm2,%xmm3
3246	pslldq	$4,%xmm2
3247	pxor	%xmm2,%xmm3
3248	pslldq	$4,%xmm2
3249	pxor	%xmm3,%xmm2
3250
/* Combine, store the new round key, and keep a copy for the next pass. */
3251	pxor	%xmm2,%xmm0
3252	movdqu	%xmm0,-16(%rax)
3253	movdqa	%xmm0,%xmm2
3254
3255	decl	%r10d
3256	jnz	.Loop_key128
3257
/*
 * After eight doublings the constant no longer fits the byte-wise sequence,
 * so the last two rounds reload it from .Lkey_rcon1b (0x1b) and double it
 * once more for the final round.
 */
3258	movdqa	.Lkey_rcon1b(%rip),%xmm4
3259
/* pshufb %xmm5,%xmm0 ; aesenclast %xmm4,%xmm0 */
3260.byte	102,15,56,0,197
3261.byte	102,15,56,221,196
3262	pslld	$1,%xmm4
3263
3264	movdqa	%xmm2,%xmm3
3265	pslldq	$4,%xmm2
3266	pxor	%xmm2,%xmm3
3267	pslldq	$4,%xmm2
3268	pxor	%xmm2,%xmm3
3269	pslldq	$4,%xmm2
3270	pxor	%xmm3,%xmm2
3271
3272	pxor	%xmm2,%xmm0
3273	movdqu	%xmm0,(%rax)
3274
3275	movdqa	%xmm0,%xmm2
/* pshufb %xmm5,%xmm0 ; aesenclast %xmm4,%xmm0 */
3276.byte	102,15,56,0,197
3277.byte	102,15,56,221,196
3278
3279	movdqa	%xmm2,%xmm3
3280	pslldq	$4,%xmm2
3281	pxor	%xmm2,%xmm3
3282	pslldq	$4,%xmm2
3283	pxor	%xmm2,%xmm3
3284	pslldq	$4,%xmm2
3285	pxor	%xmm3,%xmm2
3286
3287	pxor	%xmm2,%xmm0
3288	movdqu	%xmm0,16(%rax)
3289
/* %rax is schedule+144 here, so 96(%rax) is schedule+240. */
3290	movl	%esi,96(%rax)
3291	xorl	%eax,%eax
3292	jmp	.Lenc_key_ret
3293
/* ---- 192-bit key ------------------------------------------------------ */
3294.align	16
3295.L12rounds:
/* %xmm2 = key bytes 16..23 (upper half of the register is zeroed by movq). */
3296	movq	16(%rdi),%xmm2
3297	movl	$11,%esi
3298	cmpl	$268435456,%r10d
3299	je	.L12rounds_alt
3300
/*
 * Each ".byte 102,15,58,223,202,N" is "aeskeygenassist $N,%xmm2,%xmm1".
 * The 192a/192b helpers alternate because 24 bytes of key material are
 * produced per step while the schedule is stored in 16-byte slots.
 */
3301	movups	%xmm0,(%rdx)
3302.byte	102,15,58,223,202,1
3303	call	.Lkey_expansion_192a_cold
3304.byte	102,15,58,223,202,2
3305	call	.Lkey_expansion_192b
3306.byte	102,15,58,223,202,4
3307	call	.Lkey_expansion_192a
3308.byte	102,15,58,223,202,8
3309	call	.Lkey_expansion_192b
3310.byte	102,15,58,223,202,16
3311	call	.Lkey_expansion_192a
3312.byte	102,15,58,223,202,32
3313	call	.Lkey_expansion_192b
3314.byte	102,15,58,223,202,64
3315	call	.Lkey_expansion_192a
3316.byte	102,15,58,223,202,128
3317	call	.Lkey_expansion_192b
/* %rax is schedule+192: final key there, %esi at schedule+240 (192 + 48). */
3318	movups	%xmm0,(%rax)
3319	movl	%esi,48(%rax)
3320	xorq	%rax,%rax
3321	jmp	.Lenc_key_ret
3322
/*
 * 192-bit key, alternative path. %xmm5 = .Lkey_rotate192 shuffle mask,
 * %xmm4 = round constant (doubled every pass), %r10d = loop counter (8).
 * Every pass emits 24 bytes: 8 from %xmm2 and 16 from %xmm0.
 */
3323.align	16
3324.L12rounds_alt:
3325	movdqa	.Lkey_rotate192(%rip),%xmm5
3326	movdqa	.Lkey_rcon1(%rip),%xmm4
3327	movl	$8,%r10d
3328	movdqu	%xmm0,(%rdx)
3329	jmp	.Loop_key192
3330
3331.align	16
3332.Loop_key192:
/* Store the low 8 bytes of %xmm2 and keep a copy of %xmm2 in %xmm1. */
3333	movq	%xmm2,0(%rax)
3334	movdqa	%xmm2,%xmm1
/* pshufb %xmm5,%xmm2 */
3335.byte	102,15,56,0,213
/* aesenclast %xmm4,%xmm2 */
3336.byte	102,15,56,221,212
3337	pslld	$1,%xmm4
3338	leaq	24(%rax),%rax
3339
/* Same shift/XOR chain as the 128-bit loop, applied to %xmm0. */
3340	movdqa	%xmm0,%xmm3
3341	pslldq	$4,%xmm0
3342	pxor	%xmm0,%xmm3
3343	pslldq	$4,%xmm0
3344	pxor	%xmm0,%xmm3
3345	pslldq	$4,%xmm0
3346	pxor	%xmm3,%xmm0
3347
/*
 * %xmm3 = broadcast of dword 3 of %xmm0, XORed with the saved %xmm2 copy
 * and with that copy shifted left by 4 bytes.
 */
3348	pshufd	$255,%xmm0,%xmm3
3349	pxor	%xmm1,%xmm3
3350	pslldq	$4,%xmm1
3351	pxor	%xmm1,%xmm3
3352
/* Finish both halves and store the 16-byte part behind the 8-byte part. */
3353	pxor	%xmm2,%xmm0
3354	pxor	%xmm3,%xmm2
3355	movdqu	%xmm0,-16(%rax)
3356
3357	decl	%r10d
3358	jnz	.Loop_key192
3359
/* %rax is schedule+208 here, so 32(%rax) is schedule+240. */
3360	movl	%esi,32(%rax)
3361	xorl	%eax,%eax
3362	jmp	.Lenc_key_ret
3363
/* ---- 256-bit key ------------------------------------------------------ */
3364.align	16
3365.L14rounds:
/* %xmm2 = key bytes 16..31; the store cursor skips two slots (schedule+32). */
3366	movups	16(%rdi),%xmm2
3367	movl	$13,%esi
3368	leaq	16(%rax),%rax
3369	cmpl	$268435456,%r10d
3370	je	.L14rounds_alt
3371
/*
 * ".byte 102,15,58,223,202,N" = "aeskeygenassist $N,%xmm2,%xmm1" feeds the
 * 256a helper (updates %xmm0); ".byte 102,15,58,223,200,N" =
 * "aeskeygenassist $N,%xmm0,%xmm1" feeds the 256b helper (updates %xmm2).
 */
3372	movups	%xmm0,(%rdx)
3373	movups	%xmm2,16(%rdx)
3374.byte	102,15,58,223,202,1
3375	call	.Lkey_expansion_256a_cold
3376.byte	102,15,58,223,200,1
3377	call	.Lkey_expansion_256b
3378.byte	102,15,58,223,202,2
3379	call	.Lkey_expansion_256a
3380.byte	102,15,58,223,200,2
3381	call	.Lkey_expansion_256b
3382.byte	102,15,58,223,202,4
3383	call	.Lkey_expansion_256a
3384.byte	102,15,58,223,200,4
3385	call	.Lkey_expansion_256b
3386.byte	102,15,58,223,202,8
3387	call	.Lkey_expansion_256a
3388.byte	102,15,58,223,200,8
3389	call	.Lkey_expansion_256b
3390.byte	102,15,58,223,202,16
3391	call	.Lkey_expansion_256a
3392.byte	102,15,58,223,200,16
3393	call	.Lkey_expansion_256b
3394.byte	102,15,58,223,202,32
3395	call	.Lkey_expansion_256a
3396.byte	102,15,58,223,200,32
3397	call	.Lkey_expansion_256b
3398.byte	102,15,58,223,202,64
3399	call	.Lkey_expansion_256a
/* %rax is schedule+224: final key there, %esi at schedule+240 (224 + 16). */
3400	movups	%xmm0,(%rax)
3401	movl	%esi,16(%rax)
3402	xorq	%rax,%rax
3403	jmp	.Lenc_key_ret
3404
/*
 * 256-bit key, alternative path. %xmm5 = .Lkey_rotate shuffle mask,
 * %xmm4 = round constant, %xmm1 = copy of the odd-numbered key half,
 * %r10d = loop counter (7). Each full pass emits two round keys.
 */
3405.align	16
3406.L14rounds_alt:
3407	movdqa	.Lkey_rotate(%rip),%xmm5
3408	movdqa	.Lkey_rcon1(%rip),%xmm4
3409	movl	$7,%r10d
3410	movdqu	%xmm0,0(%rdx)
3411	movdqa	%xmm2,%xmm1
3412	movdqu	%xmm2,16(%rdx)
3413	jmp	.Loop_key256
3414
3415.align	16
3416.Loop_key256:
/* pshufb %xmm5,%xmm2 ; aesenclast %xmm4,%xmm2 */
3417.byte	102,15,56,0,213
3418.byte	102,15,56,221,212
3419
3420	movdqa	%xmm0,%xmm3
3421	pslldq	$4,%xmm0
3422	pxor	%xmm0,%xmm3
3423	pslldq	$4,%xmm0
3424	pxor	%xmm0,%xmm3
3425	pslldq	$4,%xmm0
3426	pxor	%xmm3,%xmm0
3427	pslld	$1,%xmm4
3428
3429	pxor	%xmm2,%xmm0
3430	movdqu	%xmm0,(%rax)
3431
/* The seventh pass stops here, after storing only the first of the pair. */
3432	decl	%r10d
3433	jz	.Ldone_key256
3434
/*
 * Second key of the pair: broadcast dword 3 of the key just stored, then
 * "aesenclast %xmm3,%xmm2" with %xmm3 = 0 (no shuffle mask and no round
 * constant are applied on this half).
 */
3435	pshufd	$255,%xmm0,%xmm2
3436	pxor	%xmm3,%xmm3
3437.byte	102,15,56,221,211
3438
3439	movdqa	%xmm1,%xmm3
3440	pslldq	$4,%xmm1
3441	pxor	%xmm1,%xmm3
3442	pslldq	$4,%xmm1
3443	pxor	%xmm1,%xmm3
3444	pslldq	$4,%xmm1
3445	pxor	%xmm3,%xmm1
3446
3447	pxor	%xmm1,%xmm2
3448	movdqu	%xmm2,16(%rax)
3449	leaq	32(%rax),%rax
3450	movdqa	%xmm2,%xmm1
3451
3452	jmp	.Loop_key256
3453
/* %rax is schedule+224 here, so 16(%rax) is schedule+240. */
3454.Ldone_key256:
3455	movl	%esi,16(%rax)
3456	xorl	%eax,%eax
3457	jmp	.Lenc_key_ret
3458
/* Unsupported bit length: return -2. */
3459.align	16
3460.Lbad_keybits:
3461	movq	$-2,%rax
/*
 * Common exit for success and error paths: zero every XMM register this
 * function may have loaded with key material, release the 8 bytes of stack
 * and return (".byte 0xf3,0xc3" is a hand-encoded "rep ret").
 */
3462.Lenc_key_ret:
3463	pxor	%xmm0,%xmm0
3464	pxor	%xmm1,%xmm1
3465	pxor	%xmm2,%xmm2
3466	pxor	%xmm3,%xmm3
3467	pxor	%xmm4,%xmm4
3468	pxor	%xmm5,%xmm5
3469	addq	$8,%rsp
3470	.byte	0xf3,0xc3
3471.LSEH_end_set_encrypt_key:
3472
3473.align	16
/*
 * .Lkey_expansion_128 / .Lkey_expansion_128_cold
 *
 * Local subroutine of aesni_set_encrypt_key (128-bit, AESKEYGENASSIST
 * path), reached with "call".
 *
 * In:   %xmm0 = most recently computed round key
 *       %xmm1 = result of the caller's aeskeygenassist on %xmm0
 *       %xmm4 = scratch (zeroed by the caller at function entry and
 *               rewritten here on every call)
 *       %rax  = store cursor
 * Out:  %xmm0 = value the caller stores as the following round key
 *       %rax  = advanced by 16 (normal entry only)
 *
 * The normal entry first stores %xmm0 at (%rax); the _cold entry skips the
 * store for the very first call, where the caller has already written the
 * first round key itself.
 */
3474.Lkey_expansion_128:
3475	movups	%xmm0,(%rax)
3476	leaq	16(%rax),%rax
3477.Lkey_expansion_128_cold:
/* Two shufps/xorps steps fold shifted copies of %xmm0 into itself. */
3478	shufps	$16,%xmm0,%xmm4
3479	xorps	%xmm4,%xmm0
3480	shufps	$140,%xmm0,%xmm4
3481	xorps	%xmm4,%xmm0
/* Broadcast dword 3 of the aeskeygenassist result and mix it in. */
3482	shufps	$255,%xmm1,%xmm1
3483	xorps	%xmm1,%xmm0
/* Hand-encoded "rep ret". */
3484	.byte	0xf3,0xc3
3485
3486.align	16
/*
 * .Lkey_expansion_192a / _192a_cold / _192b_warm / .Lkey_expansion_192b
 *
 * Local subroutines of aesni_set_encrypt_key (192-bit, AESKEYGENASSIST
 * path), reached with "call". The running key state is 24 bytes wide:
 * %xmm0 plus the low 8 bytes of %xmm2.
 *
 * In:   %xmm0, %xmm2 = current key state
 *       %xmm1        = result of the caller's aeskeygenassist on %xmm2
 *       %xmm4        = scratch, %xmm5 = saved %xmm2 from the previous
 *                      192a call (consumed by 192b)
 *       %rax         = store cursor
 * Out:  %xmm0, %xmm2 = next key state; %xmm3 clobbered
 *
 * 192a stores 16 bytes (%xmm0) and advances %rax by 16; its _cold entry
 * skips the store. 192b stores 32 bytes and advances %rax by 32, then
 * shares the state update by jumping to _192b_warm.
 */
3487.Lkey_expansion_192a:
3488	movups	%xmm0,(%rax)
3489	leaq	16(%rax),%rax
3490.Lkey_expansion_192a_cold:
/* Remember the current %xmm2 so the next 192b call can store its low half. */
3491	movaps	%xmm2,%xmm5
3492.Lkey_expansion_192b_warm:
/* Update of %xmm0 interleaved with the update of %xmm2 (via %xmm3). */
3493	shufps	$16,%xmm0,%xmm4
3494	movdqa	%xmm2,%xmm3
3495	xorps	%xmm4,%xmm0
3496	shufps	$140,%xmm0,%xmm4
3497	pslldq	$4,%xmm3
3498	xorps	%xmm4,%xmm0
/* Broadcast dword 1 of the aeskeygenassist result. */
3499	pshufd	$85,%xmm1,%xmm1
3500	pxor	%xmm3,%xmm2
3501	pxor	%xmm1,%xmm0
/* Broadcast dword 3 of the new %xmm0 into the %xmm2 update. */
3502	pshufd	$255,%xmm0,%xmm3
3503	pxor	%xmm3,%xmm2
/* Hand-encoded "rep ret". */
3504	.byte	0xf3,0xc3
3505
3506.align	16
3507.Lkey_expansion_192b:
3508	movaps	%xmm0,%xmm3
/* %xmm5 = (low qword of saved %xmm5, low qword of %xmm0) -> first slot. */
3509	shufps	$68,%xmm0,%xmm5
3510	movups	%xmm5,(%rax)
/* %xmm3 = (high qword of %xmm0, low qword of %xmm2) -> second slot. */
3511	shufps	$78,%xmm2,%xmm3
3512	movups	%xmm3,16(%rax)
3513	leaq	32(%rax),%rax
3514	jmp	.Lkey_expansion_192b_warm
3515
3516.align	16
/*
 * .Lkey_expansion_256a / .Lkey_expansion_256a_cold
 *
 * Local subroutine of aesni_set_encrypt_key (256-bit, AESKEYGENASSIST
 * path), reached with "call".
 *
 * In:   %xmm0 = even-numbered round key to be replaced
 *       %xmm1 = result of the caller's aeskeygenassist on %xmm2
 *       %xmm2 = most recently computed odd-numbered round key
 *       %xmm4 = scratch, %rax = store cursor
 * Out:  %xmm0 = next even-numbered round key
 *       %rax  = advanced by 16 (normal entry only)
 *
 * The normal entry first stores %xmm2 at (%rax); the _cold entry skips the
 * store because the caller has already written the first two round keys.
 */
3517.Lkey_expansion_256a:
3518	movups	%xmm2,(%rax)
3519	leaq	16(%rax),%rax
3520.Lkey_expansion_256a_cold:
3521	shufps	$16,%xmm0,%xmm4
3522	xorps	%xmm4,%xmm0
3523	shufps	$140,%xmm0,%xmm4
3524	xorps	%xmm4,%xmm0
/* Broadcast dword 3 of the aeskeygenassist result and mix it in. */
3525	shufps	$255,%xmm1,%xmm1
3526	xorps	%xmm1,%xmm0
/* Hand-encoded "rep ret". */
3527	.byte	0xf3,0xc3
3528
3529.align	16
/*
 * .Lkey_expansion_256b
 *
 * Local subroutine of aesni_set_encrypt_key (256-bit, AESKEYGENASSIST
 * path), reached with "call". Counterpart of .Lkey_expansion_256a for the
 * odd-numbered round keys.
 *
 * In:   %xmm0 = round key just produced by 256a
 *       %xmm1 = result of the caller's aeskeygenassist on %xmm0
 *       %xmm2 = odd-numbered round key to be replaced
 *       %xmm4 = scratch, %rax = store cursor
 * Out:  %xmm2 = next odd-numbered round key; %rax advanced by 16
 */
3530.Lkey_expansion_256b:
3531	movups	%xmm0,(%rax)
3532	leaq	16(%rax),%rax
3533
3534	shufps	$16,%xmm2,%xmm4
3535	xorps	%xmm4,%xmm2
3536	shufps	$140,%xmm2,%xmm4
3537	xorps	%xmm4,%xmm2
/* Broadcast dword 2 (not dword 3 as in 256a) of the aeskeygenassist result. */
3538	shufps	$170,%xmm1,%xmm1
3539	xorps	%xmm1,%xmm2
/* Hand-encoded "rep ret". */
3540	.byte	0xf3,0xc3
/*
 * The symbol sizes are taken here so that they include the
 * .Lkey_expansion_* subroutines placed after the function's own return.
 */
3541.size	aesni_set_encrypt_key,.-aesni_set_encrypt_key
3542.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
/*
 * Constant pool, 64-byte aligned. Every entry is 16 bytes, so entries can
 * be fetched with aligned 128-bit loads (the key-schedule code uses movdqa
 * on .Lkey_rotate, .Lkey_rotate192, .Lkey_rcon1 and .Lkey_rcon1b).
 */
3543.align	64
/*
 * Byte indices 15..0: used as a shuffle mask this reverses the 16 bytes of
 * a register. NOTE(review): consumers are outside this excerpt - confirm.
 */
3544.Lbswap_mask:
3545.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/*
 * Counter increments. NOTE(review): the names suggest use by the CTR/CCM
 * code earlier in the file; not referenced in this excerpt - confirm.
 */
3546.Lincrement32:
3547.long	6,6,6,0
3548.Lincrement64:
3549.long	1,0,0,0
/* NOTE(review): presumably the XTS tweak reduction constant - confirm. */
3550.Lxts_magic:
3551.long	0x87,0,1,0
3552.Lincrement1:
3553.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
/*
 * PSHUFB mask for the alternative 128/256-bit key schedules: every dword of
 * the result receives source bytes 13,14,15,12.
 */
3554.Lkey_rotate:
3555.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
/*
 * PSHUFB mask for the alternative 192-bit key schedule: every dword of the
 * result receives source bytes 5,6,7,4.
 */
3556.Lkey_rotate192:
3557.long	0x04070605,0x04070605,0x04070605,0x04070605
/* Initial round constant (1 in every dword); doubled with pslld per round. */
3558.Lkey_rcon1:
3559.long	1,1,1,1
/*
 * Round constant 0x1b, reloaded by the alternative 128-bit schedule for its
 * last two rounds (0x1b, then 0x36 after one doubling).
 */
3560.Lkey_rcon1b:
3561.long	0x1b,0x1b,0x1b,0x1b
3562
/*
 * NUL-terminated ASCII identification string:
 * "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>"
 */
3563.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3564.align	64
3565#endif
3566