1#if defined(__x86_64__)
2.text
3.extern	OPENSSL_ia32cap_P
4.hidden OPENSSL_ia32cap_P
5
6chacha20_poly1305_constants:
7
8.align	64
9.chacha20_consts:
10.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
11.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
12.rol8:
13.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
14.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
15.rol16:
16.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
17.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
18.avx2_init:
19.long	0,0,0,0
20.sse_inc:
21.long	1,0,0,0
22.avx2_inc:
23.long	2,0,0,0,2,0,0,0
24.clamp:
25.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
26.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
27.align	16
28.and_masks:
29.byte	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
30.byte	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
31.byte	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
32.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
33.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
34.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
35.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
36.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
37.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
38.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
39.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
40.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
41.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
42.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
43.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
44
45.type	poly_hash_ad_internal,@function
46.align	64
47poly_hash_ad_internal:
48.cfi_startproc
49	xorq	%r10,%r10
50	xorq	%r11,%r11
51	xorq	%r12,%r12
52	cmpq	$13,%r8
53	jne	hash_ad_loop
54poly_fast_tls_ad:
55
56	movq	(%rcx),%r10
57	movq	5(%rcx),%r11
58	shrq	$24,%r11
59	movq	$1,%r12
60	movq	0+0(%rbp),%rax
61	movq	%rax,%r15
62	mulq	%r10
63	movq	%rax,%r13
64	movq	%rdx,%r14
65	movq	0+0(%rbp),%rax
66	mulq	%r11
67	imulq	%r12,%r15
68	addq	%rax,%r14
69	adcq	%rdx,%r15
70	movq	8+0(%rbp),%rax
71	movq	%rax,%r9
72	mulq	%r10
73	addq	%rax,%r14
74	adcq	$0,%rdx
75	movq	%rdx,%r10
76	movq	8+0(%rbp),%rax
77	mulq	%r11
78	addq	%rax,%r15
79	adcq	$0,%rdx
80	imulq	%r12,%r9
81	addq	%r10,%r15
82	adcq	%rdx,%r9
83	movq	%r13,%r10
84	movq	%r14,%r11
85	movq	%r15,%r12
86	andq	$3,%r12
87	movq	%r15,%r13
88	andq	$-4,%r13
89	movq	%r9,%r14
90	shrdq	$2,%r9,%r15
91	shrq	$2,%r9
92	addq	%r13,%r10
93	adcq	%r14,%r11
94	adcq	$0,%r12
95	addq	%r15,%r10
96	adcq	%r9,%r11
97	adcq	$0,%r12
98
99	.byte	0xf3,0xc3
100hash_ad_loop:
101
102	cmpq	$16,%r8
103	jb	hash_ad_tail
104	addq	0(%rcx),%r10
105	adcq	8+0(%rcx),%r11
106	adcq	$1,%r12
107	movq	0+0(%rbp),%rax
108	movq	%rax,%r15
109	mulq	%r10
110	movq	%rax,%r13
111	movq	%rdx,%r14
112	movq	0+0(%rbp),%rax
113	mulq	%r11
114	imulq	%r12,%r15
115	addq	%rax,%r14
116	adcq	%rdx,%r15
117	movq	8+0(%rbp),%rax
118	movq	%rax,%r9
119	mulq	%r10
120	addq	%rax,%r14
121	adcq	$0,%rdx
122	movq	%rdx,%r10
123	movq	8+0(%rbp),%rax
124	mulq	%r11
125	addq	%rax,%r15
126	adcq	$0,%rdx
127	imulq	%r12,%r9
128	addq	%r10,%r15
129	adcq	%rdx,%r9
130	movq	%r13,%r10
131	movq	%r14,%r11
132	movq	%r15,%r12
133	andq	$3,%r12
134	movq	%r15,%r13
135	andq	$-4,%r13
136	movq	%r9,%r14
137	shrdq	$2,%r9,%r15
138	shrq	$2,%r9
139	addq	%r13,%r10
140	adcq	%r14,%r11
141	adcq	$0,%r12
142	addq	%r15,%r10
143	adcq	%r9,%r11
144	adcq	$0,%r12
145
146	leaq	16(%rcx),%rcx
147	subq	$16,%r8
148	jmp	hash_ad_loop
149hash_ad_tail:
150	cmpq	$0,%r8
151	je	1f
152
153	xorq	%r13,%r13
154	xorq	%r14,%r14
155	xorq	%r15,%r15
156	addq	%r8,%rcx
157hash_ad_tail_loop:
158	shldq	$8,%r13,%r14
159	shlq	$8,%r13
160	movzbq	-1(%rcx),%r15
161	xorq	%r15,%r13
162	decq	%rcx
163	decq	%r8
164	jne	hash_ad_tail_loop
165
166	addq	%r13,%r10
167	adcq	%r14,%r11
168	adcq	$1,%r12
169	movq	0+0(%rbp),%rax
170	movq	%rax,%r15
171	mulq	%r10
172	movq	%rax,%r13
173	movq	%rdx,%r14
174	movq	0+0(%rbp),%rax
175	mulq	%r11
176	imulq	%r12,%r15
177	addq	%rax,%r14
178	adcq	%rdx,%r15
179	movq	8+0(%rbp),%rax
180	movq	%rax,%r9
181	mulq	%r10
182	addq	%rax,%r14
183	adcq	$0,%rdx
184	movq	%rdx,%r10
185	movq	8+0(%rbp),%rax
186	mulq	%r11
187	addq	%rax,%r15
188	adcq	$0,%rdx
189	imulq	%r12,%r9
190	addq	%r10,%r15
191	adcq	%rdx,%r9
192	movq	%r13,%r10
193	movq	%r14,%r11
194	movq	%r15,%r12
195	andq	$3,%r12
196	movq	%r15,%r13
197	andq	$-4,%r13
198	movq	%r9,%r14
199	shrdq	$2,%r9,%r15
200	shrq	$2,%r9
201	addq	%r13,%r10
202	adcq	%r14,%r11
203	adcq	$0,%r12
204	addq	%r15,%r10
205	adcq	%r9,%r11
206	adcq	$0,%r12
207
208
2091:
210	.byte	0xf3,0xc3
211.cfi_endproc
212.size	poly_hash_ad_internal, .-poly_hash_ad_internal
213
214.globl	chacha20_poly1305_open
215.hidden chacha20_poly1305_open
216.type	chacha20_poly1305_open,@function
217.align	64
218chacha20_poly1305_open:
219.cfi_startproc
220	pushq	%rbp
221.cfi_adjust_cfa_offset	8
222	pushq	%rbx
223.cfi_adjust_cfa_offset	8
224	pushq	%r12
225.cfi_adjust_cfa_offset	8
226	pushq	%r13
227.cfi_adjust_cfa_offset	8
228	pushq	%r14
229.cfi_adjust_cfa_offset	8
230	pushq	%r15
231.cfi_adjust_cfa_offset	8
232
233
234	pushq	%r9
235.cfi_adjust_cfa_offset	8
236	subq	$288 + 32,%rsp
237.cfi_adjust_cfa_offset	288 + 32
238.cfi_offset	rbp, -16
239.cfi_offset	rbx, -24
240.cfi_offset	r12, -32
241.cfi_offset	r13, -40
242.cfi_offset	r14, -48
243.cfi_offset	r15, -56
244	leaq	32(%rsp),%rbp
245	andq	$-32,%rbp
246	movq	%rdx,8+32(%rbp)
247	movq	%r8,0+32(%rbp)
248	movq	%rdx,%rbx
249
250	movl	OPENSSL_ia32cap_P+8(%rip),%eax
251	andl	$288,%eax
252	xorl	$288,%eax
253	jz	chacha20_poly1305_open_avx2
254
2551:
256	cmpq	$128,%rbx
257	jbe	open_sse_128
258
259	movdqa	.chacha20_consts(%rip),%xmm0
260	movdqu	0(%r9),%xmm4
261	movdqu	16(%r9),%xmm8
262	movdqu	32(%r9),%xmm12
263	movdqa	%xmm12,%xmm7
264
265	movdqa	%xmm4,48(%rbp)
266	movdqa	%xmm8,64(%rbp)
267	movdqa	%xmm12,96(%rbp)
268	movq	$10,%r10
2691:
270	paddd	%xmm4,%xmm0
271	pxor	%xmm0,%xmm12
272	pshufb	.rol16(%rip),%xmm12
273	paddd	%xmm12,%xmm8
274	pxor	%xmm8,%xmm4
275	movdqa	%xmm4,%xmm3
276	pslld	$12,%xmm3
277	psrld	$20,%xmm4
278	pxor	%xmm3,%xmm4
279	paddd	%xmm4,%xmm0
280	pxor	%xmm0,%xmm12
281	pshufb	.rol8(%rip),%xmm12
282	paddd	%xmm12,%xmm8
283	pxor	%xmm8,%xmm4
284	movdqa	%xmm4,%xmm3
285	pslld	$7,%xmm3
286	psrld	$25,%xmm4
287	pxor	%xmm3,%xmm4
288.byte	102,15,58,15,228,4
289.byte	102,69,15,58,15,192,8
290.byte	102,69,15,58,15,228,12
291	paddd	%xmm4,%xmm0
292	pxor	%xmm0,%xmm12
293	pshufb	.rol16(%rip),%xmm12
294	paddd	%xmm12,%xmm8
295	pxor	%xmm8,%xmm4
296	movdqa	%xmm4,%xmm3
297	pslld	$12,%xmm3
298	psrld	$20,%xmm4
299	pxor	%xmm3,%xmm4
300	paddd	%xmm4,%xmm0
301	pxor	%xmm0,%xmm12
302	pshufb	.rol8(%rip),%xmm12
303	paddd	%xmm12,%xmm8
304	pxor	%xmm8,%xmm4
305	movdqa	%xmm4,%xmm3
306	pslld	$7,%xmm3
307	psrld	$25,%xmm4
308	pxor	%xmm3,%xmm4
309.byte	102,15,58,15,228,12
310.byte	102,69,15,58,15,192,8
311.byte	102,69,15,58,15,228,4
312
313	decq	%r10
314	jne	1b
315
316	paddd	.chacha20_consts(%rip),%xmm0
317	paddd	48(%rbp),%xmm4
318
319	pand	.clamp(%rip),%xmm0
320	movdqa	%xmm0,0(%rbp)
321	movdqa	%xmm4,16(%rbp)
322
323	movq	%r8,%r8
324	call	poly_hash_ad_internal
325open_sse_main_loop:
326	cmpq	$256,%rbx
327	jb	2f
328
329	movdqa	.chacha20_consts(%rip),%xmm0
330	movdqa	48(%rbp),%xmm4
331	movdqa	64(%rbp),%xmm8
332	movdqa	%xmm0,%xmm1
333	movdqa	%xmm4,%xmm5
334	movdqa	%xmm8,%xmm9
335	movdqa	%xmm0,%xmm2
336	movdqa	%xmm4,%xmm6
337	movdqa	%xmm8,%xmm10
338	movdqa	%xmm0,%xmm3
339	movdqa	%xmm4,%xmm7
340	movdqa	%xmm8,%xmm11
341	movdqa	96(%rbp),%xmm15
342	paddd	.sse_inc(%rip),%xmm15
343	movdqa	%xmm15,%xmm14
344	paddd	.sse_inc(%rip),%xmm14
345	movdqa	%xmm14,%xmm13
346	paddd	.sse_inc(%rip),%xmm13
347	movdqa	%xmm13,%xmm12
348	paddd	.sse_inc(%rip),%xmm12
349	movdqa	%xmm12,96(%rbp)
350	movdqa	%xmm13,112(%rbp)
351	movdqa	%xmm14,128(%rbp)
352	movdqa	%xmm15,144(%rbp)
353
354
355
356	movq	$4,%rcx
357	movq	%rsi,%r8
3581:
359	movdqa	%xmm8,80(%rbp)
360	movdqa	.rol16(%rip),%xmm8
361	paddd	%xmm7,%xmm3
362	paddd	%xmm6,%xmm2
363	paddd	%xmm5,%xmm1
364	paddd	%xmm4,%xmm0
365	pxor	%xmm3,%xmm15
366	pxor	%xmm2,%xmm14
367	pxor	%xmm1,%xmm13
368	pxor	%xmm0,%xmm12
369.byte	102,69,15,56,0,248
370.byte	102,69,15,56,0,240
371.byte	102,69,15,56,0,232
372.byte	102,69,15,56,0,224
373	movdqa	80(%rbp),%xmm8
374	paddd	%xmm15,%xmm11
375	paddd	%xmm14,%xmm10
376	paddd	%xmm13,%xmm9
377	paddd	%xmm12,%xmm8
378	pxor	%xmm11,%xmm7
379	addq	0(%r8),%r10
380	adcq	8+0(%r8),%r11
381	adcq	$1,%r12
382
383	leaq	16(%r8),%r8
384	pxor	%xmm10,%xmm6
385	pxor	%xmm9,%xmm5
386	pxor	%xmm8,%xmm4
387	movdqa	%xmm8,80(%rbp)
388	movdqa	%xmm7,%xmm8
389	psrld	$20,%xmm8
390	pslld	$32-20,%xmm7
391	pxor	%xmm8,%xmm7
392	movdqa	%xmm6,%xmm8
393	psrld	$20,%xmm8
394	pslld	$32-20,%xmm6
395	pxor	%xmm8,%xmm6
396	movdqa	%xmm5,%xmm8
397	psrld	$20,%xmm8
398	pslld	$32-20,%xmm5
399	pxor	%xmm8,%xmm5
400	movdqa	%xmm4,%xmm8
401	psrld	$20,%xmm8
402	pslld	$32-20,%xmm4
403	pxor	%xmm8,%xmm4
404	movq	0+0(%rbp),%rax
405	movq	%rax,%r15
406	mulq	%r10
407	movq	%rax,%r13
408	movq	%rdx,%r14
409	movq	0+0(%rbp),%rax
410	mulq	%r11
411	imulq	%r12,%r15
412	addq	%rax,%r14
413	adcq	%rdx,%r15
414	movdqa	.rol8(%rip),%xmm8
415	paddd	%xmm7,%xmm3
416	paddd	%xmm6,%xmm2
417	paddd	%xmm5,%xmm1
418	paddd	%xmm4,%xmm0
419	pxor	%xmm3,%xmm15
420	pxor	%xmm2,%xmm14
421	pxor	%xmm1,%xmm13
422	pxor	%xmm0,%xmm12
423.byte	102,69,15,56,0,248
424.byte	102,69,15,56,0,240
425.byte	102,69,15,56,0,232
426.byte	102,69,15,56,0,224
427	movdqa	80(%rbp),%xmm8
428	paddd	%xmm15,%xmm11
429	paddd	%xmm14,%xmm10
430	paddd	%xmm13,%xmm9
431	paddd	%xmm12,%xmm8
432	pxor	%xmm11,%xmm7
433	pxor	%xmm10,%xmm6
434	movq	8+0(%rbp),%rax
435	movq	%rax,%r9
436	mulq	%r10
437	addq	%rax,%r14
438	adcq	$0,%rdx
439	movq	%rdx,%r10
440	movq	8+0(%rbp),%rax
441	mulq	%r11
442	addq	%rax,%r15
443	adcq	$0,%rdx
444	pxor	%xmm9,%xmm5
445	pxor	%xmm8,%xmm4
446	movdqa	%xmm8,80(%rbp)
447	movdqa	%xmm7,%xmm8
448	psrld	$25,%xmm8
449	pslld	$32-25,%xmm7
450	pxor	%xmm8,%xmm7
451	movdqa	%xmm6,%xmm8
452	psrld	$25,%xmm8
453	pslld	$32-25,%xmm6
454	pxor	%xmm8,%xmm6
455	movdqa	%xmm5,%xmm8
456	psrld	$25,%xmm8
457	pslld	$32-25,%xmm5
458	pxor	%xmm8,%xmm5
459	movdqa	%xmm4,%xmm8
460	psrld	$25,%xmm8
461	pslld	$32-25,%xmm4
462	pxor	%xmm8,%xmm4
463	movdqa	80(%rbp),%xmm8
464	imulq	%r12,%r9
465	addq	%r10,%r15
466	adcq	%rdx,%r9
467.byte	102,15,58,15,255,4
468.byte	102,69,15,58,15,219,8
469.byte	102,69,15,58,15,255,12
470.byte	102,15,58,15,246,4
471.byte	102,69,15,58,15,210,8
472.byte	102,69,15,58,15,246,12
473.byte	102,15,58,15,237,4
474.byte	102,69,15,58,15,201,8
475.byte	102,69,15,58,15,237,12
476.byte	102,15,58,15,228,4
477.byte	102,69,15,58,15,192,8
478.byte	102,69,15,58,15,228,12
479	movdqa	%xmm8,80(%rbp)
480	movdqa	.rol16(%rip),%xmm8
481	paddd	%xmm7,%xmm3
482	paddd	%xmm6,%xmm2
483	paddd	%xmm5,%xmm1
484	paddd	%xmm4,%xmm0
485	pxor	%xmm3,%xmm15
486	pxor	%xmm2,%xmm14
487	movq	%r13,%r10
488	movq	%r14,%r11
489	movq	%r15,%r12
490	andq	$3,%r12
491	movq	%r15,%r13
492	andq	$-4,%r13
493	movq	%r9,%r14
494	shrdq	$2,%r9,%r15
495	shrq	$2,%r9
496	addq	%r13,%r10
497	adcq	%r14,%r11
498	adcq	$0,%r12
499	addq	%r15,%r10
500	adcq	%r9,%r11
501	adcq	$0,%r12
502	pxor	%xmm1,%xmm13
503	pxor	%xmm0,%xmm12
504.byte	102,69,15,56,0,248
505.byte	102,69,15,56,0,240
506.byte	102,69,15,56,0,232
507.byte	102,69,15,56,0,224
508	movdqa	80(%rbp),%xmm8
509	paddd	%xmm15,%xmm11
510	paddd	%xmm14,%xmm10
511	paddd	%xmm13,%xmm9
512	paddd	%xmm12,%xmm8
513	pxor	%xmm11,%xmm7
514	pxor	%xmm10,%xmm6
515	pxor	%xmm9,%xmm5
516	pxor	%xmm8,%xmm4
517	movdqa	%xmm8,80(%rbp)
518	movdqa	%xmm7,%xmm8
519	psrld	$20,%xmm8
520	pslld	$32-20,%xmm7
521	pxor	%xmm8,%xmm7
522	movdqa	%xmm6,%xmm8
523	psrld	$20,%xmm8
524	pslld	$32-20,%xmm6
525	pxor	%xmm8,%xmm6
526	movdqa	%xmm5,%xmm8
527	psrld	$20,%xmm8
528	pslld	$32-20,%xmm5
529	pxor	%xmm8,%xmm5
530	movdqa	%xmm4,%xmm8
531	psrld	$20,%xmm8
532	pslld	$32-20,%xmm4
533	pxor	%xmm8,%xmm4
534	movdqa	.rol8(%rip),%xmm8
535	paddd	%xmm7,%xmm3
536	paddd	%xmm6,%xmm2
537	paddd	%xmm5,%xmm1
538	paddd	%xmm4,%xmm0
539	pxor	%xmm3,%xmm15
540	pxor	%xmm2,%xmm14
541	pxor	%xmm1,%xmm13
542	pxor	%xmm0,%xmm12
543.byte	102,69,15,56,0,248
544.byte	102,69,15,56,0,240
545.byte	102,69,15,56,0,232
546.byte	102,69,15,56,0,224
547	movdqa	80(%rbp),%xmm8
548	paddd	%xmm15,%xmm11
549	paddd	%xmm14,%xmm10
550	paddd	%xmm13,%xmm9
551	paddd	%xmm12,%xmm8
552	pxor	%xmm11,%xmm7
553	pxor	%xmm10,%xmm6
554	pxor	%xmm9,%xmm5
555	pxor	%xmm8,%xmm4
556	movdqa	%xmm8,80(%rbp)
557	movdqa	%xmm7,%xmm8
558	psrld	$25,%xmm8
559	pslld	$32-25,%xmm7
560	pxor	%xmm8,%xmm7
561	movdqa	%xmm6,%xmm8
562	psrld	$25,%xmm8
563	pslld	$32-25,%xmm6
564	pxor	%xmm8,%xmm6
565	movdqa	%xmm5,%xmm8
566	psrld	$25,%xmm8
567	pslld	$32-25,%xmm5
568	pxor	%xmm8,%xmm5
569	movdqa	%xmm4,%xmm8
570	psrld	$25,%xmm8
571	pslld	$32-25,%xmm4
572	pxor	%xmm8,%xmm4
573	movdqa	80(%rbp),%xmm8
574.byte	102,15,58,15,255,12
575.byte	102,69,15,58,15,219,8
576.byte	102,69,15,58,15,255,4
577.byte	102,15,58,15,246,12
578.byte	102,69,15,58,15,210,8
579.byte	102,69,15,58,15,246,4
580.byte	102,15,58,15,237,12
581.byte	102,69,15,58,15,201,8
582.byte	102,69,15,58,15,237,4
583.byte	102,15,58,15,228,12
584.byte	102,69,15,58,15,192,8
585.byte	102,69,15,58,15,228,4
586
587	decq	%rcx
588	jge	1b
589	addq	0(%r8),%r10
590	adcq	8+0(%r8),%r11
591	adcq	$1,%r12
592	movq	0+0(%rbp),%rax
593	movq	%rax,%r15
594	mulq	%r10
595	movq	%rax,%r13
596	movq	%rdx,%r14
597	movq	0+0(%rbp),%rax
598	mulq	%r11
599	imulq	%r12,%r15
600	addq	%rax,%r14
601	adcq	%rdx,%r15
602	movq	8+0(%rbp),%rax
603	movq	%rax,%r9
604	mulq	%r10
605	addq	%rax,%r14
606	adcq	$0,%rdx
607	movq	%rdx,%r10
608	movq	8+0(%rbp),%rax
609	mulq	%r11
610	addq	%rax,%r15
611	adcq	$0,%rdx
612	imulq	%r12,%r9
613	addq	%r10,%r15
614	adcq	%rdx,%r9
615	movq	%r13,%r10
616	movq	%r14,%r11
617	movq	%r15,%r12
618	andq	$3,%r12
619	movq	%r15,%r13
620	andq	$-4,%r13
621	movq	%r9,%r14
622	shrdq	$2,%r9,%r15
623	shrq	$2,%r9
624	addq	%r13,%r10
625	adcq	%r14,%r11
626	adcq	$0,%r12
627	addq	%r15,%r10
628	adcq	%r9,%r11
629	adcq	$0,%r12
630
631	leaq	16(%r8),%r8
632	cmpq	$-6,%rcx
633	jg	1b
634	paddd	.chacha20_consts(%rip),%xmm3
635	paddd	48(%rbp),%xmm7
636	paddd	64(%rbp),%xmm11
637	paddd	144(%rbp),%xmm15
638	paddd	.chacha20_consts(%rip),%xmm2
639	paddd	48(%rbp),%xmm6
640	paddd	64(%rbp),%xmm10
641	paddd	128(%rbp),%xmm14
642	paddd	.chacha20_consts(%rip),%xmm1
643	paddd	48(%rbp),%xmm5
644	paddd	64(%rbp),%xmm9
645	paddd	112(%rbp),%xmm13
646	paddd	.chacha20_consts(%rip),%xmm0
647	paddd	48(%rbp),%xmm4
648	paddd	64(%rbp),%xmm8
649	paddd	96(%rbp),%xmm12
650	movdqa	%xmm12,80(%rbp)
651	movdqu	0 + 0(%rsi),%xmm12
652	pxor	%xmm3,%xmm12
653	movdqu	%xmm12,0 + 0(%rdi)
654	movdqu	16 + 0(%rsi),%xmm12
655	pxor	%xmm7,%xmm12
656	movdqu	%xmm12,16 + 0(%rdi)
657	movdqu	32 + 0(%rsi),%xmm12
658	pxor	%xmm11,%xmm12
659	movdqu	%xmm12,32 + 0(%rdi)
660	movdqu	48 + 0(%rsi),%xmm12
661	pxor	%xmm15,%xmm12
662	movdqu	%xmm12,48 + 0(%rdi)
663	movdqu	0 + 64(%rsi),%xmm3
664	movdqu	16 + 64(%rsi),%xmm7
665	movdqu	32 + 64(%rsi),%xmm11
666	movdqu	48 + 64(%rsi),%xmm15
667	pxor	%xmm3,%xmm2
668	pxor	%xmm7,%xmm6
669	pxor	%xmm11,%xmm10
670	pxor	%xmm14,%xmm15
671	movdqu	%xmm2,0 + 64(%rdi)
672	movdqu	%xmm6,16 + 64(%rdi)
673	movdqu	%xmm10,32 + 64(%rdi)
674	movdqu	%xmm15,48 + 64(%rdi)
675	movdqu	0 + 128(%rsi),%xmm3
676	movdqu	16 + 128(%rsi),%xmm7
677	movdqu	32 + 128(%rsi),%xmm11
678	movdqu	48 + 128(%rsi),%xmm15
679	pxor	%xmm3,%xmm1
680	pxor	%xmm7,%xmm5
681	pxor	%xmm11,%xmm9
682	pxor	%xmm13,%xmm15
683	movdqu	%xmm1,0 + 128(%rdi)
684	movdqu	%xmm5,16 + 128(%rdi)
685	movdqu	%xmm9,32 + 128(%rdi)
686	movdqu	%xmm15,48 + 128(%rdi)
687	movdqu	0 + 192(%rsi),%xmm3
688	movdqu	16 + 192(%rsi),%xmm7
689	movdqu	32 + 192(%rsi),%xmm11
690	movdqu	48 + 192(%rsi),%xmm15
691	pxor	%xmm3,%xmm0
692	pxor	%xmm7,%xmm4
693	pxor	%xmm11,%xmm8
694	pxor	80(%rbp),%xmm15
695	movdqu	%xmm0,0 + 192(%rdi)
696	movdqu	%xmm4,16 + 192(%rdi)
697	movdqu	%xmm8,32 + 192(%rdi)
698	movdqu	%xmm15,48 + 192(%rdi)
699
700	leaq	256(%rsi),%rsi
701	leaq	256(%rdi),%rdi
702	subq	$256,%rbx
703	jmp	open_sse_main_loop
7042:
705
706	testq	%rbx,%rbx
707	jz	open_sse_finalize
708	cmpq	$64,%rbx
709	ja	3f
710	movdqa	.chacha20_consts(%rip),%xmm0
711	movdqa	48(%rbp),%xmm4
712	movdqa	64(%rbp),%xmm8
713	movdqa	96(%rbp),%xmm12
714	paddd	.sse_inc(%rip),%xmm12
715	movdqa	%xmm12,96(%rbp)
716
717	xorq	%r8,%r8
718	movq	%rbx,%rcx
719	cmpq	$16,%rcx
720	jb	2f
7211:
722	addq	0(%rsi,%r8), %r10
723	adcq	8+0(%rsi,%r8), %r11
724	adcq	$1,%r12
725	movq	0+0(%rbp),%rax
726	movq	%rax,%r15
727	mulq	%r10
728	movq	%rax,%r13
729	movq	%rdx,%r14
730	movq	0+0(%rbp),%rax
731	mulq	%r11
732	imulq	%r12,%r15
733	addq	%rax,%r14
734	adcq	%rdx,%r15
735	movq	8+0(%rbp),%rax
736	movq	%rax,%r9
737	mulq	%r10
738	addq	%rax,%r14
739	adcq	$0,%rdx
740	movq	%rdx,%r10
741	movq	8+0(%rbp),%rax
742	mulq	%r11
743	addq	%rax,%r15
744	adcq	$0,%rdx
745	imulq	%r12,%r9
746	addq	%r10,%r15
747	adcq	%rdx,%r9
748	movq	%r13,%r10
749	movq	%r14,%r11
750	movq	%r15,%r12
751	andq	$3,%r12
752	movq	%r15,%r13
753	andq	$-4,%r13
754	movq	%r9,%r14
755	shrdq	$2,%r9,%r15
756	shrq	$2,%r9
757	addq	%r13,%r10
758	adcq	%r14,%r11
759	adcq	$0,%r12
760	addq	%r15,%r10
761	adcq	%r9,%r11
762	adcq	$0,%r12
763
764	subq	$16,%rcx
7652:
766	addq	$16,%r8
767	paddd	%xmm4,%xmm0
768	pxor	%xmm0,%xmm12
769	pshufb	.rol16(%rip),%xmm12
770	paddd	%xmm12,%xmm8
771	pxor	%xmm8,%xmm4
772	movdqa	%xmm4,%xmm3
773	pslld	$12,%xmm3
774	psrld	$20,%xmm4
775	pxor	%xmm3,%xmm4
776	paddd	%xmm4,%xmm0
777	pxor	%xmm0,%xmm12
778	pshufb	.rol8(%rip),%xmm12
779	paddd	%xmm12,%xmm8
780	pxor	%xmm8,%xmm4
781	movdqa	%xmm4,%xmm3
782	pslld	$7,%xmm3
783	psrld	$25,%xmm4
784	pxor	%xmm3,%xmm4
785.byte	102,15,58,15,228,4
786.byte	102,69,15,58,15,192,8
787.byte	102,69,15,58,15,228,12
788	paddd	%xmm4,%xmm0
789	pxor	%xmm0,%xmm12
790	pshufb	.rol16(%rip),%xmm12
791	paddd	%xmm12,%xmm8
792	pxor	%xmm8,%xmm4
793	movdqa	%xmm4,%xmm3
794	pslld	$12,%xmm3
795	psrld	$20,%xmm4
796	pxor	%xmm3,%xmm4
797	paddd	%xmm4,%xmm0
798	pxor	%xmm0,%xmm12
799	pshufb	.rol8(%rip),%xmm12
800	paddd	%xmm12,%xmm8
801	pxor	%xmm8,%xmm4
802	movdqa	%xmm4,%xmm3
803	pslld	$7,%xmm3
804	psrld	$25,%xmm4
805	pxor	%xmm3,%xmm4
806.byte	102,15,58,15,228,12
807.byte	102,69,15,58,15,192,8
808.byte	102,69,15,58,15,228,4
809
810	cmpq	$16,%rcx
811	jae	1b
812	cmpq	$160,%r8
813	jne	2b
814	paddd	.chacha20_consts(%rip),%xmm0
815	paddd	48(%rbp),%xmm4
816	paddd	64(%rbp),%xmm8
817	paddd	96(%rbp),%xmm12
818
819	jmp	open_sse_tail_64_dec_loop
8203:
821	cmpq	$128,%rbx
822	ja	3f
823	movdqa	.chacha20_consts(%rip),%xmm0
824	movdqa	48(%rbp),%xmm4
825	movdqa	64(%rbp),%xmm8
826	movdqa	%xmm0,%xmm1
827	movdqa	%xmm4,%xmm5
828	movdqa	%xmm8,%xmm9
829	movdqa	96(%rbp),%xmm13
830	paddd	.sse_inc(%rip),%xmm13
831	movdqa	%xmm13,%xmm12
832	paddd	.sse_inc(%rip),%xmm12
833	movdqa	%xmm12,96(%rbp)
834	movdqa	%xmm13,112(%rbp)
835
836	movq	%rbx,%rcx
837	andq	$-16,%rcx
838	xorq	%r8,%r8
8391:
840	addq	0(%rsi,%r8), %r10
841	adcq	8+0(%rsi,%r8), %r11
842	adcq	$1,%r12
843	movq	0+0(%rbp),%rax
844	movq	%rax,%r15
845	mulq	%r10
846	movq	%rax,%r13
847	movq	%rdx,%r14
848	movq	0+0(%rbp),%rax
849	mulq	%r11
850	imulq	%r12,%r15
851	addq	%rax,%r14
852	adcq	%rdx,%r15
853	movq	8+0(%rbp),%rax
854	movq	%rax,%r9
855	mulq	%r10
856	addq	%rax,%r14
857	adcq	$0,%rdx
858	movq	%rdx,%r10
859	movq	8+0(%rbp),%rax
860	mulq	%r11
861	addq	%rax,%r15
862	adcq	$0,%rdx
863	imulq	%r12,%r9
864	addq	%r10,%r15
865	adcq	%rdx,%r9
866	movq	%r13,%r10
867	movq	%r14,%r11
868	movq	%r15,%r12
869	andq	$3,%r12
870	movq	%r15,%r13
871	andq	$-4,%r13
872	movq	%r9,%r14
873	shrdq	$2,%r9,%r15
874	shrq	$2,%r9
875	addq	%r13,%r10
876	adcq	%r14,%r11
877	adcq	$0,%r12
878	addq	%r15,%r10
879	adcq	%r9,%r11
880	adcq	$0,%r12
881
8822:
883	addq	$16,%r8
884	paddd	%xmm4,%xmm0
885	pxor	%xmm0,%xmm12
886	pshufb	.rol16(%rip),%xmm12
887	paddd	%xmm12,%xmm8
888	pxor	%xmm8,%xmm4
889	movdqa	%xmm4,%xmm3
890	pslld	$12,%xmm3
891	psrld	$20,%xmm4
892	pxor	%xmm3,%xmm4
893	paddd	%xmm4,%xmm0
894	pxor	%xmm0,%xmm12
895	pshufb	.rol8(%rip),%xmm12
896	paddd	%xmm12,%xmm8
897	pxor	%xmm8,%xmm4
898	movdqa	%xmm4,%xmm3
899	pslld	$7,%xmm3
900	psrld	$25,%xmm4
901	pxor	%xmm3,%xmm4
902.byte	102,15,58,15,228,4
903.byte	102,69,15,58,15,192,8
904.byte	102,69,15,58,15,228,12
905	paddd	%xmm5,%xmm1
906	pxor	%xmm1,%xmm13
907	pshufb	.rol16(%rip),%xmm13
908	paddd	%xmm13,%xmm9
909	pxor	%xmm9,%xmm5
910	movdqa	%xmm5,%xmm3
911	pslld	$12,%xmm3
912	psrld	$20,%xmm5
913	pxor	%xmm3,%xmm5
914	paddd	%xmm5,%xmm1
915	pxor	%xmm1,%xmm13
916	pshufb	.rol8(%rip),%xmm13
917	paddd	%xmm13,%xmm9
918	pxor	%xmm9,%xmm5
919	movdqa	%xmm5,%xmm3
920	pslld	$7,%xmm3
921	psrld	$25,%xmm5
922	pxor	%xmm3,%xmm5
923.byte	102,15,58,15,237,4
924.byte	102,69,15,58,15,201,8
925.byte	102,69,15,58,15,237,12
926	paddd	%xmm4,%xmm0
927	pxor	%xmm0,%xmm12
928	pshufb	.rol16(%rip),%xmm12
929	paddd	%xmm12,%xmm8
930	pxor	%xmm8,%xmm4
931	movdqa	%xmm4,%xmm3
932	pslld	$12,%xmm3
933	psrld	$20,%xmm4
934	pxor	%xmm3,%xmm4
935	paddd	%xmm4,%xmm0
936	pxor	%xmm0,%xmm12
937	pshufb	.rol8(%rip),%xmm12
938	paddd	%xmm12,%xmm8
939	pxor	%xmm8,%xmm4
940	movdqa	%xmm4,%xmm3
941	pslld	$7,%xmm3
942	psrld	$25,%xmm4
943	pxor	%xmm3,%xmm4
944.byte	102,15,58,15,228,12
945.byte	102,69,15,58,15,192,8
946.byte	102,69,15,58,15,228,4
947	paddd	%xmm5,%xmm1
948	pxor	%xmm1,%xmm13
949	pshufb	.rol16(%rip),%xmm13
950	paddd	%xmm13,%xmm9
951	pxor	%xmm9,%xmm5
952	movdqa	%xmm5,%xmm3
953	pslld	$12,%xmm3
954	psrld	$20,%xmm5
955	pxor	%xmm3,%xmm5
956	paddd	%xmm5,%xmm1
957	pxor	%xmm1,%xmm13
958	pshufb	.rol8(%rip),%xmm13
959	paddd	%xmm13,%xmm9
960	pxor	%xmm9,%xmm5
961	movdqa	%xmm5,%xmm3
962	pslld	$7,%xmm3
963	psrld	$25,%xmm5
964	pxor	%xmm3,%xmm5
965.byte	102,15,58,15,237,12
966.byte	102,69,15,58,15,201,8
967.byte	102,69,15,58,15,237,4
968
969	cmpq	%rcx,%r8
970	jb	1b
971	cmpq	$160,%r8
972	jne	2b
973	paddd	.chacha20_consts(%rip),%xmm1
974	paddd	48(%rbp),%xmm5
975	paddd	64(%rbp),%xmm9
976	paddd	112(%rbp),%xmm13
977	paddd	.chacha20_consts(%rip),%xmm0
978	paddd	48(%rbp),%xmm4
979	paddd	64(%rbp),%xmm8
980	paddd	96(%rbp),%xmm12
981	movdqu	0 + 0(%rsi),%xmm3
982	movdqu	16 + 0(%rsi),%xmm7
983	movdqu	32 + 0(%rsi),%xmm11
984	movdqu	48 + 0(%rsi),%xmm15
985	pxor	%xmm3,%xmm1
986	pxor	%xmm7,%xmm5
987	pxor	%xmm11,%xmm9
988	pxor	%xmm13,%xmm15
989	movdqu	%xmm1,0 + 0(%rdi)
990	movdqu	%xmm5,16 + 0(%rdi)
991	movdqu	%xmm9,32 + 0(%rdi)
992	movdqu	%xmm15,48 + 0(%rdi)
993
994	subq	$64,%rbx
995	leaq	64(%rsi),%rsi
996	leaq	64(%rdi),%rdi
997	jmp	open_sse_tail_64_dec_loop
9983:
999	cmpq	$192,%rbx
1000	ja	3f
1001	movdqa	.chacha20_consts(%rip),%xmm0
1002	movdqa	48(%rbp),%xmm4
1003	movdqa	64(%rbp),%xmm8
1004	movdqa	%xmm0,%xmm1
1005	movdqa	%xmm4,%xmm5
1006	movdqa	%xmm8,%xmm9
1007	movdqa	%xmm0,%xmm2
1008	movdqa	%xmm4,%xmm6
1009	movdqa	%xmm8,%xmm10
1010	movdqa	96(%rbp),%xmm14
1011	paddd	.sse_inc(%rip),%xmm14
1012	movdqa	%xmm14,%xmm13
1013	paddd	.sse_inc(%rip),%xmm13
1014	movdqa	%xmm13,%xmm12
1015	paddd	.sse_inc(%rip),%xmm12
1016	movdqa	%xmm12,96(%rbp)
1017	movdqa	%xmm13,112(%rbp)
1018	movdqa	%xmm14,128(%rbp)
1019
1020	movq	%rbx,%rcx
1021	movq	$160,%r8
1022	cmpq	$160,%rcx
1023	cmovgq	%r8,%rcx
1024	andq	$-16,%rcx
1025	xorq	%r8,%r8
10261:
1027	addq	0(%rsi,%r8), %r10
1028	adcq	8+0(%rsi,%r8), %r11
1029	adcq	$1,%r12
1030	movq	0+0(%rbp),%rax
1031	movq	%rax,%r15
1032	mulq	%r10
1033	movq	%rax,%r13
1034	movq	%rdx,%r14
1035	movq	0+0(%rbp),%rax
1036	mulq	%r11
1037	imulq	%r12,%r15
1038	addq	%rax,%r14
1039	adcq	%rdx,%r15
1040	movq	8+0(%rbp),%rax
1041	movq	%rax,%r9
1042	mulq	%r10
1043	addq	%rax,%r14
1044	adcq	$0,%rdx
1045	movq	%rdx,%r10
1046	movq	8+0(%rbp),%rax
1047	mulq	%r11
1048	addq	%rax,%r15
1049	adcq	$0,%rdx
1050	imulq	%r12,%r9
1051	addq	%r10,%r15
1052	adcq	%rdx,%r9
1053	movq	%r13,%r10
1054	movq	%r14,%r11
1055	movq	%r15,%r12
1056	andq	$3,%r12
1057	movq	%r15,%r13
1058	andq	$-4,%r13
1059	movq	%r9,%r14
1060	shrdq	$2,%r9,%r15
1061	shrq	$2,%r9
1062	addq	%r13,%r10
1063	adcq	%r14,%r11
1064	adcq	$0,%r12
1065	addq	%r15,%r10
1066	adcq	%r9,%r11
1067	adcq	$0,%r12
1068
10692:
1070	addq	$16,%r8
1071	paddd	%xmm4,%xmm0
1072	pxor	%xmm0,%xmm12
1073	pshufb	.rol16(%rip),%xmm12
1074	paddd	%xmm12,%xmm8
1075	pxor	%xmm8,%xmm4
1076	movdqa	%xmm4,%xmm3
1077	pslld	$12,%xmm3
1078	psrld	$20,%xmm4
1079	pxor	%xmm3,%xmm4
1080	paddd	%xmm4,%xmm0
1081	pxor	%xmm0,%xmm12
1082	pshufb	.rol8(%rip),%xmm12
1083	paddd	%xmm12,%xmm8
1084	pxor	%xmm8,%xmm4
1085	movdqa	%xmm4,%xmm3
1086	pslld	$7,%xmm3
1087	psrld	$25,%xmm4
1088	pxor	%xmm3,%xmm4
1089.byte	102,15,58,15,228,4
1090.byte	102,69,15,58,15,192,8
1091.byte	102,69,15,58,15,228,12
1092	paddd	%xmm5,%xmm1
1093	pxor	%xmm1,%xmm13
1094	pshufb	.rol16(%rip),%xmm13
1095	paddd	%xmm13,%xmm9
1096	pxor	%xmm9,%xmm5
1097	movdqa	%xmm5,%xmm3
1098	pslld	$12,%xmm3
1099	psrld	$20,%xmm5
1100	pxor	%xmm3,%xmm5
1101	paddd	%xmm5,%xmm1
1102	pxor	%xmm1,%xmm13
1103	pshufb	.rol8(%rip),%xmm13
1104	paddd	%xmm13,%xmm9
1105	pxor	%xmm9,%xmm5
1106	movdqa	%xmm5,%xmm3
1107	pslld	$7,%xmm3
1108	psrld	$25,%xmm5
1109	pxor	%xmm3,%xmm5
1110.byte	102,15,58,15,237,4
1111.byte	102,69,15,58,15,201,8
1112.byte	102,69,15,58,15,237,12
1113	paddd	%xmm6,%xmm2
1114	pxor	%xmm2,%xmm14
1115	pshufb	.rol16(%rip),%xmm14
1116	paddd	%xmm14,%xmm10
1117	pxor	%xmm10,%xmm6
1118	movdqa	%xmm6,%xmm3
1119	pslld	$12,%xmm3
1120	psrld	$20,%xmm6
1121	pxor	%xmm3,%xmm6
1122	paddd	%xmm6,%xmm2
1123	pxor	%xmm2,%xmm14
1124	pshufb	.rol8(%rip),%xmm14
1125	paddd	%xmm14,%xmm10
1126	pxor	%xmm10,%xmm6
1127	movdqa	%xmm6,%xmm3
1128	pslld	$7,%xmm3
1129	psrld	$25,%xmm6
1130	pxor	%xmm3,%xmm6
1131.byte	102,15,58,15,246,4
1132.byte	102,69,15,58,15,210,8
1133.byte	102,69,15,58,15,246,12
1134	paddd	%xmm4,%xmm0
1135	pxor	%xmm0,%xmm12
1136	pshufb	.rol16(%rip),%xmm12
1137	paddd	%xmm12,%xmm8
1138	pxor	%xmm8,%xmm4
1139	movdqa	%xmm4,%xmm3
1140	pslld	$12,%xmm3
1141	psrld	$20,%xmm4
1142	pxor	%xmm3,%xmm4
1143	paddd	%xmm4,%xmm0
1144	pxor	%xmm0,%xmm12
1145	pshufb	.rol8(%rip),%xmm12
1146	paddd	%xmm12,%xmm8
1147	pxor	%xmm8,%xmm4
1148	movdqa	%xmm4,%xmm3
1149	pslld	$7,%xmm3
1150	psrld	$25,%xmm4
1151	pxor	%xmm3,%xmm4
1152.byte	102,15,58,15,228,12
1153.byte	102,69,15,58,15,192,8
1154.byte	102,69,15,58,15,228,4
1155	paddd	%xmm5,%xmm1
1156	pxor	%xmm1,%xmm13
1157	pshufb	.rol16(%rip),%xmm13
1158	paddd	%xmm13,%xmm9
1159	pxor	%xmm9,%xmm5
1160	movdqa	%xmm5,%xmm3
1161	pslld	$12,%xmm3
1162	psrld	$20,%xmm5
1163	pxor	%xmm3,%xmm5
1164	paddd	%xmm5,%xmm1
1165	pxor	%xmm1,%xmm13
1166	pshufb	.rol8(%rip),%xmm13
1167	paddd	%xmm13,%xmm9
1168	pxor	%xmm9,%xmm5
1169	movdqa	%xmm5,%xmm3
1170	pslld	$7,%xmm3
1171	psrld	$25,%xmm5
1172	pxor	%xmm3,%xmm5
1173.byte	102,15,58,15,237,12
1174.byte	102,69,15,58,15,201,8
1175.byte	102,69,15,58,15,237,4
1176	paddd	%xmm6,%xmm2
1177	pxor	%xmm2,%xmm14
1178	pshufb	.rol16(%rip),%xmm14
1179	paddd	%xmm14,%xmm10
1180	pxor	%xmm10,%xmm6
1181	movdqa	%xmm6,%xmm3
1182	pslld	$12,%xmm3
1183	psrld	$20,%xmm6
1184	pxor	%xmm3,%xmm6
1185	paddd	%xmm6,%xmm2
1186	pxor	%xmm2,%xmm14
1187	pshufb	.rol8(%rip),%xmm14
1188	paddd	%xmm14,%xmm10
1189	pxor	%xmm10,%xmm6
1190	movdqa	%xmm6,%xmm3
1191	pslld	$7,%xmm3
1192	psrld	$25,%xmm6
1193	pxor	%xmm3,%xmm6
1194.byte	102,15,58,15,246,12
1195.byte	102,69,15,58,15,210,8
1196.byte	102,69,15,58,15,246,4
1197
1198	cmpq	%rcx,%r8
1199	jb	1b
1200	cmpq	$160,%r8
1201	jne	2b
1202	cmpq	$176,%rbx
1203	jb	1f
1204	addq	160(%rsi),%r10
1205	adcq	8+160(%rsi),%r11
1206	adcq	$1,%r12
1207	movq	0+0(%rbp),%rax
1208	movq	%rax,%r15
1209	mulq	%r10
1210	movq	%rax,%r13
1211	movq	%rdx,%r14
1212	movq	0+0(%rbp),%rax
1213	mulq	%r11
1214	imulq	%r12,%r15
1215	addq	%rax,%r14
1216	adcq	%rdx,%r15
1217	movq	8+0(%rbp),%rax
1218	movq	%rax,%r9
1219	mulq	%r10
1220	addq	%rax,%r14
1221	adcq	$0,%rdx
1222	movq	%rdx,%r10
1223	movq	8+0(%rbp),%rax
1224	mulq	%r11
1225	addq	%rax,%r15
1226	adcq	$0,%rdx
1227	imulq	%r12,%r9
1228	addq	%r10,%r15
1229	adcq	%rdx,%r9
1230	movq	%r13,%r10
1231	movq	%r14,%r11
1232	movq	%r15,%r12
1233	andq	$3,%r12
1234	movq	%r15,%r13
1235	andq	$-4,%r13
1236	movq	%r9,%r14
1237	shrdq	$2,%r9,%r15
1238	shrq	$2,%r9
1239	addq	%r13,%r10
1240	adcq	%r14,%r11
1241	adcq	$0,%r12
1242	addq	%r15,%r10
1243	adcq	%r9,%r11
1244	adcq	$0,%r12
1245
1246	cmpq	$192,%rbx
1247	jb	1f
1248	addq	176(%rsi),%r10
1249	adcq	8+176(%rsi),%r11
1250	adcq	$1,%r12
1251	movq	0+0(%rbp),%rax
1252	movq	%rax,%r15
1253	mulq	%r10
1254	movq	%rax,%r13
1255	movq	%rdx,%r14
1256	movq	0+0(%rbp),%rax
1257	mulq	%r11
1258	imulq	%r12,%r15
1259	addq	%rax,%r14
1260	adcq	%rdx,%r15
1261	movq	8+0(%rbp),%rax
1262	movq	%rax,%r9
1263	mulq	%r10
1264	addq	%rax,%r14
1265	adcq	$0,%rdx
1266	movq	%rdx,%r10
1267	movq	8+0(%rbp),%rax
1268	mulq	%r11
1269	addq	%rax,%r15
1270	adcq	$0,%rdx
1271	imulq	%r12,%r9
1272	addq	%r10,%r15
1273	adcq	%rdx,%r9
1274	movq	%r13,%r10
1275	movq	%r14,%r11
1276	movq	%r15,%r12
1277	andq	$3,%r12
1278	movq	%r15,%r13
1279	andq	$-4,%r13
1280	movq	%r9,%r14
1281	shrdq	$2,%r9,%r15
1282	shrq	$2,%r9
1283	addq	%r13,%r10
1284	adcq	%r14,%r11
1285	adcq	$0,%r12
1286	addq	%r15,%r10
1287	adcq	%r9,%r11
1288	adcq	$0,%r12
1289
12901:
1291	paddd	.chacha20_consts(%rip),%xmm2
1292	paddd	48(%rbp),%xmm6
1293	paddd	64(%rbp),%xmm10
1294	paddd	128(%rbp),%xmm14
1295	paddd	.chacha20_consts(%rip),%xmm1
1296	paddd	48(%rbp),%xmm5
1297	paddd	64(%rbp),%xmm9
1298	paddd	112(%rbp),%xmm13
1299	paddd	.chacha20_consts(%rip),%xmm0
1300	paddd	48(%rbp),%xmm4
1301	paddd	64(%rbp),%xmm8
1302	paddd	96(%rbp),%xmm12
1303	movdqu	0 + 0(%rsi),%xmm3
1304	movdqu	16 + 0(%rsi),%xmm7
1305	movdqu	32 + 0(%rsi),%xmm11
1306	movdqu	48 + 0(%rsi),%xmm15
1307	pxor	%xmm3,%xmm2
1308	pxor	%xmm7,%xmm6
1309	pxor	%xmm11,%xmm10
1310	pxor	%xmm14,%xmm15
1311	movdqu	%xmm2,0 + 0(%rdi)
1312	movdqu	%xmm6,16 + 0(%rdi)
1313	movdqu	%xmm10,32 + 0(%rdi)
1314	movdqu	%xmm15,48 + 0(%rdi)
1315	movdqu	0 + 64(%rsi),%xmm3
1316	movdqu	16 + 64(%rsi),%xmm7
1317	movdqu	32 + 64(%rsi),%xmm11
1318	movdqu	48 + 64(%rsi),%xmm15
1319	pxor	%xmm3,%xmm1
1320	pxor	%xmm7,%xmm5
1321	pxor	%xmm11,%xmm9
1322	pxor	%xmm13,%xmm15
1323	movdqu	%xmm1,0 + 64(%rdi)
1324	movdqu	%xmm5,16 + 64(%rdi)
1325	movdqu	%xmm9,32 + 64(%rdi)
1326	movdqu	%xmm15,48 + 64(%rdi)
1327
1328	subq	$128,%rbx
1329	leaq	128(%rsi),%rsi
1330	leaq	128(%rdi),%rdi
1331	jmp	open_sse_tail_64_dec_loop
13323:
1333
1334	movdqa	.chacha20_consts(%rip),%xmm0
1335	movdqa	48(%rbp),%xmm4
1336	movdqa	64(%rbp),%xmm8
1337	movdqa	%xmm0,%xmm1
1338	movdqa	%xmm4,%xmm5
1339	movdqa	%xmm8,%xmm9
1340	movdqa	%xmm0,%xmm2
1341	movdqa	%xmm4,%xmm6
1342	movdqa	%xmm8,%xmm10
1343	movdqa	%xmm0,%xmm3
1344	movdqa	%xmm4,%xmm7
1345	movdqa	%xmm8,%xmm11
1346	movdqa	96(%rbp),%xmm15
1347	paddd	.sse_inc(%rip),%xmm15
1348	movdqa	%xmm15,%xmm14
1349	paddd	.sse_inc(%rip),%xmm14
1350	movdqa	%xmm14,%xmm13
1351	paddd	.sse_inc(%rip),%xmm13
1352	movdqa	%xmm13,%xmm12
1353	paddd	.sse_inc(%rip),%xmm12
1354	movdqa	%xmm12,96(%rbp)
1355	movdqa	%xmm13,112(%rbp)
1356	movdqa	%xmm14,128(%rbp)
1357	movdqa	%xmm15,144(%rbp)
1358
1359	xorq	%r8,%r8
13601:
1361	addq	0(%rsi,%r8), %r10
1362	adcq	8+0(%rsi,%r8), %r11
1363	adcq	$1,%r12
1364	movdqa	%xmm11,80(%rbp)
1365	paddd	%xmm4,%xmm0
1366	pxor	%xmm0,%xmm12
1367	pshufb	.rol16(%rip),%xmm12
1368	paddd	%xmm12,%xmm8
1369	pxor	%xmm8,%xmm4
1370	movdqa	%xmm4,%xmm11
1371	pslld	$12,%xmm11
1372	psrld	$20,%xmm4
1373	pxor	%xmm11,%xmm4
1374	paddd	%xmm4,%xmm0
1375	pxor	%xmm0,%xmm12
1376	pshufb	.rol8(%rip),%xmm12
1377	paddd	%xmm12,%xmm8
1378	pxor	%xmm8,%xmm4
1379	movdqa	%xmm4,%xmm11
1380	pslld	$7,%xmm11
1381	psrld	$25,%xmm4
1382	pxor	%xmm11,%xmm4
1383.byte	102,15,58,15,228,4
1384.byte	102,69,15,58,15,192,8
1385.byte	102,69,15,58,15,228,12
1386	paddd	%xmm5,%xmm1
1387	pxor	%xmm1,%xmm13
1388	pshufb	.rol16(%rip),%xmm13
1389	paddd	%xmm13,%xmm9
1390	pxor	%xmm9,%xmm5
1391	movdqa	%xmm5,%xmm11
1392	pslld	$12,%xmm11
1393	psrld	$20,%xmm5
1394	pxor	%xmm11,%xmm5
1395	paddd	%xmm5,%xmm1
1396	pxor	%xmm1,%xmm13
1397	pshufb	.rol8(%rip),%xmm13
1398	paddd	%xmm13,%xmm9
1399	pxor	%xmm9,%xmm5
1400	movdqa	%xmm5,%xmm11
1401	pslld	$7,%xmm11
1402	psrld	$25,%xmm5
1403	pxor	%xmm11,%xmm5
1404.byte	102,15,58,15,237,4
1405.byte	102,69,15,58,15,201,8
1406.byte	102,69,15,58,15,237,12
1407	paddd	%xmm6,%xmm2
1408	pxor	%xmm2,%xmm14
1409	pshufb	.rol16(%rip),%xmm14
1410	paddd	%xmm14,%xmm10
1411	pxor	%xmm10,%xmm6
1412	movdqa	%xmm6,%xmm11
1413	pslld	$12,%xmm11
1414	psrld	$20,%xmm6
1415	pxor	%xmm11,%xmm6
1416	paddd	%xmm6,%xmm2
1417	pxor	%xmm2,%xmm14
1418	pshufb	.rol8(%rip),%xmm14
1419	paddd	%xmm14,%xmm10
1420	pxor	%xmm10,%xmm6
1421	movdqa	%xmm6,%xmm11
1422	pslld	$7,%xmm11
1423	psrld	$25,%xmm6
1424	pxor	%xmm11,%xmm6
1425.byte	102,15,58,15,246,4
1426.byte	102,69,15,58,15,210,8
1427.byte	102,69,15,58,15,246,12
1428	movdqa	80(%rbp),%xmm11
1429	movq	0+0(%rbp),%rax
1430	movq	%rax,%r15
1431	mulq	%r10
1432	movq	%rax,%r13
1433	movq	%rdx,%r14
1434	movq	0+0(%rbp),%rax
1435	mulq	%r11
1436	imulq	%r12,%r15
1437	addq	%rax,%r14
1438	adcq	%rdx,%r15
1439	movdqa	%xmm9,80(%rbp)
1440	paddd	%xmm7,%xmm3
1441	pxor	%xmm3,%xmm15
1442	pshufb	.rol16(%rip),%xmm15
1443	paddd	%xmm15,%xmm11
1444	pxor	%xmm11,%xmm7
1445	movdqa	%xmm7,%xmm9
1446	pslld	$12,%xmm9
1447	psrld	$20,%xmm7
1448	pxor	%xmm9,%xmm7
1449	paddd	%xmm7,%xmm3
1450	pxor	%xmm3,%xmm15
1451	pshufb	.rol8(%rip),%xmm15
1452	paddd	%xmm15,%xmm11
1453	pxor	%xmm11,%xmm7
1454	movdqa	%xmm7,%xmm9
1455	pslld	$7,%xmm9
1456	psrld	$25,%xmm7
1457	pxor	%xmm9,%xmm7
1458.byte	102,15,58,15,255,4
1459.byte	102,69,15,58,15,219,8
1460.byte	102,69,15,58,15,255,12
1461	movdqa	80(%rbp),%xmm9
1462	movq	8+0(%rbp),%rax
1463	movq	%rax,%r9
1464	mulq	%r10
1465	addq	%rax,%r14
1466	adcq	$0,%rdx
1467	movq	%rdx,%r10
1468	movq	8+0(%rbp),%rax
1469	mulq	%r11
1470	addq	%rax,%r15
1471	adcq	$0,%rdx
1472	movdqa	%xmm11,80(%rbp)
1473	paddd	%xmm4,%xmm0
1474	pxor	%xmm0,%xmm12
1475	pshufb	.rol16(%rip),%xmm12
1476	paddd	%xmm12,%xmm8
1477	pxor	%xmm8,%xmm4
1478	movdqa	%xmm4,%xmm11
1479	pslld	$12,%xmm11
1480	psrld	$20,%xmm4
1481	pxor	%xmm11,%xmm4
1482	paddd	%xmm4,%xmm0
1483	pxor	%xmm0,%xmm12
1484	pshufb	.rol8(%rip),%xmm12
1485	paddd	%xmm12,%xmm8
1486	pxor	%xmm8,%xmm4
1487	movdqa	%xmm4,%xmm11
1488	pslld	$7,%xmm11
1489	psrld	$25,%xmm4
1490	pxor	%xmm11,%xmm4
1491.byte	102,15,58,15,228,12
1492.byte	102,69,15,58,15,192,8
1493.byte	102,69,15,58,15,228,4
1494	paddd	%xmm5,%xmm1
1495	pxor	%xmm1,%xmm13
1496	pshufb	.rol16(%rip),%xmm13
1497	paddd	%xmm13,%xmm9
1498	pxor	%xmm9,%xmm5
1499	movdqa	%xmm5,%xmm11
1500	pslld	$12,%xmm11
1501	psrld	$20,%xmm5
1502	pxor	%xmm11,%xmm5
1503	paddd	%xmm5,%xmm1
1504	pxor	%xmm1,%xmm13
1505	pshufb	.rol8(%rip),%xmm13
1506	paddd	%xmm13,%xmm9
1507	pxor	%xmm9,%xmm5
1508	movdqa	%xmm5,%xmm11
1509	pslld	$7,%xmm11
1510	psrld	$25,%xmm5
1511	pxor	%xmm11,%xmm5
1512.byte	102,15,58,15,237,12
1513.byte	102,69,15,58,15,201,8
1514.byte	102,69,15,58,15,237,4
1515	imulq	%r12,%r9
1516	addq	%r10,%r15
1517	adcq	%rdx,%r9
1518	paddd	%xmm6,%xmm2
1519	pxor	%xmm2,%xmm14
1520	pshufb	.rol16(%rip),%xmm14
1521	paddd	%xmm14,%xmm10
1522	pxor	%xmm10,%xmm6
1523	movdqa	%xmm6,%xmm11
1524	pslld	$12,%xmm11
1525	psrld	$20,%xmm6
1526	pxor	%xmm11,%xmm6
1527	paddd	%xmm6,%xmm2
1528	pxor	%xmm2,%xmm14
1529	pshufb	.rol8(%rip),%xmm14
1530	paddd	%xmm14,%xmm10
1531	pxor	%xmm10,%xmm6
1532	movdqa	%xmm6,%xmm11
1533	pslld	$7,%xmm11
1534	psrld	$25,%xmm6
1535	pxor	%xmm11,%xmm6
1536.byte	102,15,58,15,246,12
1537.byte	102,69,15,58,15,210,8
1538.byte	102,69,15,58,15,246,4
1539	movdqa	80(%rbp),%xmm11
1540	movq	%r13,%r10
1541	movq	%r14,%r11
1542	movq	%r15,%r12
1543	andq	$3,%r12
1544	movq	%r15,%r13
1545	andq	$-4,%r13
1546	movq	%r9,%r14
1547	shrdq	$2,%r9,%r15
1548	shrq	$2,%r9
1549	addq	%r13,%r10
1550	adcq	%r14,%r11
1551	adcq	$0,%r12
1552	addq	%r15,%r10
1553	adcq	%r9,%r11
1554	adcq	$0,%r12
1555	movdqa	%xmm9,80(%rbp)
1556	paddd	%xmm7,%xmm3
1557	pxor	%xmm3,%xmm15
1558	pshufb	.rol16(%rip),%xmm15
1559	paddd	%xmm15,%xmm11
1560	pxor	%xmm11,%xmm7
1561	movdqa	%xmm7,%xmm9
1562	pslld	$12,%xmm9
1563	psrld	$20,%xmm7
1564	pxor	%xmm9,%xmm7
1565	paddd	%xmm7,%xmm3
1566	pxor	%xmm3,%xmm15
1567	pshufb	.rol8(%rip),%xmm15
1568	paddd	%xmm15,%xmm11
1569	pxor	%xmm11,%xmm7
1570	movdqa	%xmm7,%xmm9
1571	pslld	$7,%xmm9
1572	psrld	$25,%xmm7
1573	pxor	%xmm9,%xmm7
1574.byte	102,15,58,15,255,12
1575.byte	102,69,15,58,15,219,8
1576.byte	102,69,15,58,15,255,4
1577	movdqa	80(%rbp),%xmm9
1578
1579	addq	$16,%r8
1580	cmpq	$160,%r8
1581	jb	1b
1582	movq	%rbx,%rcx
1583	andq	$-16,%rcx
15841:
1585	addq	0(%rsi,%r8), %r10
1586	adcq	8+0(%rsi,%r8), %r11
1587	adcq	$1,%r12
1588	movq	0+0(%rbp),%rax
1589	movq	%rax,%r15
1590	mulq	%r10
1591	movq	%rax,%r13
1592	movq	%rdx,%r14
1593	movq	0+0(%rbp),%rax
1594	mulq	%r11
1595	imulq	%r12,%r15
1596	addq	%rax,%r14
1597	adcq	%rdx,%r15
1598	movq	8+0(%rbp),%rax
1599	movq	%rax,%r9
1600	mulq	%r10
1601	addq	%rax,%r14
1602	adcq	$0,%rdx
1603	movq	%rdx,%r10
1604	movq	8+0(%rbp),%rax
1605	mulq	%r11
1606	addq	%rax,%r15
1607	adcq	$0,%rdx
1608	imulq	%r12,%r9
1609	addq	%r10,%r15
1610	adcq	%rdx,%r9
1611	movq	%r13,%r10
1612	movq	%r14,%r11
1613	movq	%r15,%r12
1614	andq	$3,%r12
1615	movq	%r15,%r13
1616	andq	$-4,%r13
1617	movq	%r9,%r14
1618	shrdq	$2,%r9,%r15
1619	shrq	$2,%r9
1620	addq	%r13,%r10
1621	adcq	%r14,%r11
1622	adcq	$0,%r12
1623	addq	%r15,%r10
1624	adcq	%r9,%r11
1625	adcq	$0,%r12
1626
1627	addq	$16,%r8
1628	cmpq	%rcx,%r8
1629	jb	1b
1630	paddd	.chacha20_consts(%rip),%xmm3
1631	paddd	48(%rbp),%xmm7
1632	paddd	64(%rbp),%xmm11
1633	paddd	144(%rbp),%xmm15
1634	paddd	.chacha20_consts(%rip),%xmm2
1635	paddd	48(%rbp),%xmm6
1636	paddd	64(%rbp),%xmm10
1637	paddd	128(%rbp),%xmm14
1638	paddd	.chacha20_consts(%rip),%xmm1
1639	paddd	48(%rbp),%xmm5
1640	paddd	64(%rbp),%xmm9
1641	paddd	112(%rbp),%xmm13
1642	paddd	.chacha20_consts(%rip),%xmm0
1643	paddd	48(%rbp),%xmm4
1644	paddd	64(%rbp),%xmm8
1645	paddd	96(%rbp),%xmm12
1646	movdqa	%xmm12,80(%rbp)
1647	movdqu	0 + 0(%rsi),%xmm12
1648	pxor	%xmm3,%xmm12
1649	movdqu	%xmm12,0 + 0(%rdi)
1650	movdqu	16 + 0(%rsi),%xmm12
1651	pxor	%xmm7,%xmm12
1652	movdqu	%xmm12,16 + 0(%rdi)
1653	movdqu	32 + 0(%rsi),%xmm12
1654	pxor	%xmm11,%xmm12
1655	movdqu	%xmm12,32 + 0(%rdi)
1656	movdqu	48 + 0(%rsi),%xmm12
1657	pxor	%xmm15,%xmm12
1658	movdqu	%xmm12,48 + 0(%rdi)
1659	movdqu	0 + 64(%rsi),%xmm3
1660	movdqu	16 + 64(%rsi),%xmm7
1661	movdqu	32 + 64(%rsi),%xmm11
1662	movdqu	48 + 64(%rsi),%xmm15
1663	pxor	%xmm3,%xmm2
1664	pxor	%xmm7,%xmm6
1665	pxor	%xmm11,%xmm10
1666	pxor	%xmm14,%xmm15
1667	movdqu	%xmm2,0 + 64(%rdi)
1668	movdqu	%xmm6,16 + 64(%rdi)
1669	movdqu	%xmm10,32 + 64(%rdi)
1670	movdqu	%xmm15,48 + 64(%rdi)
1671	movdqu	0 + 128(%rsi),%xmm3
1672	movdqu	16 + 128(%rsi),%xmm7
1673	movdqu	32 + 128(%rsi),%xmm11
1674	movdqu	48 + 128(%rsi),%xmm15
1675	pxor	%xmm3,%xmm1
1676	pxor	%xmm7,%xmm5
1677	pxor	%xmm11,%xmm9
1678	pxor	%xmm13,%xmm15
1679	movdqu	%xmm1,0 + 128(%rdi)
1680	movdqu	%xmm5,16 + 128(%rdi)
1681	movdqu	%xmm9,32 + 128(%rdi)
1682	movdqu	%xmm15,48 + 128(%rdi)
1683
1684	movdqa	80(%rbp),%xmm12
1685	subq	$192,%rbx
1686	leaq	192(%rsi),%rsi
1687	leaq	192(%rdi),%rdi
1688
1689
1690open_sse_tail_64_dec_loop:
1691	cmpq	$16,%rbx
1692	jb	1f
1693	subq	$16,%rbx
1694	movdqu	(%rsi),%xmm3
1695	pxor	%xmm3,%xmm0
1696	movdqu	%xmm0,(%rdi)
1697	leaq	16(%rsi),%rsi
1698	leaq	16(%rdi),%rdi
1699	movdqa	%xmm4,%xmm0
1700	movdqa	%xmm8,%xmm4
1701	movdqa	%xmm12,%xmm8
1702	jmp	open_sse_tail_64_dec_loop
17031:
1704	movdqa	%xmm0,%xmm1
1705
1706
1707open_sse_tail_16:
1708	testq	%rbx,%rbx
1709	jz	open_sse_finalize
1710
1711
1712
1713	pxor	%xmm3,%xmm3
1714	leaq	-1(%rsi,%rbx), %rsi
1715	movq	%rbx,%r8
17162:
1717	pslldq	$1,%xmm3
1718	pinsrb	$0,(%rsi),%xmm3
1719	subq	$1,%rsi
1720	subq	$1,%r8
1721	jnz	2b
1722
17233:
1724.byte	102,73,15,126,221
1725	pextrq	$1,%xmm3,%r14
1726
1727	pxor	%xmm1,%xmm3
1728
1729
17302:
1731	pextrb	$0,%xmm3,(%rdi)
1732	psrldq	$1,%xmm3
1733	addq	$1,%rdi
1734	subq	$1,%rbx
1735	jne	2b
1736
1737	addq	%r13,%r10
1738	adcq	%r14,%r11
1739	adcq	$1,%r12
1740	movq	0+0(%rbp),%rax
1741	movq	%rax,%r15
1742	mulq	%r10
1743	movq	%rax,%r13
1744	movq	%rdx,%r14
1745	movq	0+0(%rbp),%rax
1746	mulq	%r11
1747	imulq	%r12,%r15
1748	addq	%rax,%r14
1749	adcq	%rdx,%r15
1750	movq	8+0(%rbp),%rax
1751	movq	%rax,%r9
1752	mulq	%r10
1753	addq	%rax,%r14
1754	adcq	$0,%rdx
1755	movq	%rdx,%r10
1756	movq	8+0(%rbp),%rax
1757	mulq	%r11
1758	addq	%rax,%r15
1759	adcq	$0,%rdx
1760	imulq	%r12,%r9
1761	addq	%r10,%r15
1762	adcq	%rdx,%r9
1763	movq	%r13,%r10
1764	movq	%r14,%r11
1765	movq	%r15,%r12
1766	andq	$3,%r12
1767	movq	%r15,%r13
1768	andq	$-4,%r13
1769	movq	%r9,%r14
1770	shrdq	$2,%r9,%r15
1771	shrq	$2,%r9
1772	addq	%r13,%r10
1773	adcq	%r14,%r11
1774	adcq	$0,%r12
1775	addq	%r15,%r10
1776	adcq	%r9,%r11
1777	adcq	$0,%r12
1778
1779
1780open_sse_finalize:
1781	addq	32(%rbp),%r10
1782	adcq	8+32(%rbp),%r11
1783	adcq	$1,%r12
1784	movq	0+0(%rbp),%rax
1785	movq	%rax,%r15
1786	mulq	%r10
1787	movq	%rax,%r13
1788	movq	%rdx,%r14
1789	movq	0+0(%rbp),%rax
1790	mulq	%r11
1791	imulq	%r12,%r15
1792	addq	%rax,%r14
1793	adcq	%rdx,%r15
1794	movq	8+0(%rbp),%rax
1795	movq	%rax,%r9
1796	mulq	%r10
1797	addq	%rax,%r14
1798	adcq	$0,%rdx
1799	movq	%rdx,%r10
1800	movq	8+0(%rbp),%rax
1801	mulq	%r11
1802	addq	%rax,%r15
1803	adcq	$0,%rdx
1804	imulq	%r12,%r9
1805	addq	%r10,%r15
1806	adcq	%rdx,%r9
1807	movq	%r13,%r10
1808	movq	%r14,%r11
1809	movq	%r15,%r12
1810	andq	$3,%r12
1811	movq	%r15,%r13
1812	andq	$-4,%r13
1813	movq	%r9,%r14
1814	shrdq	$2,%r9,%r15
1815	shrq	$2,%r9
1816	addq	%r13,%r10
1817	adcq	%r14,%r11
1818	adcq	$0,%r12
1819	addq	%r15,%r10
1820	adcq	%r9,%r11
1821	adcq	$0,%r12
1822
1823
1824	movq	%r10,%r13
1825	movq	%r11,%r14
1826	movq	%r12,%r15
1827	subq	$-5,%r10
1828	sbbq	$-1,%r11
1829	sbbq	$3,%r12
1830	cmovcq	%r13,%r10
1831	cmovcq	%r14,%r11
1832	cmovcq	%r15,%r12
1833
1834	addq	0+16(%rbp),%r10
1835	adcq	8+16(%rbp),%r11
1836
1837	addq	$288 + 32,%rsp
1838.cfi_adjust_cfa_offset	-(288 + 32)
1839	popq	%r9
1840.cfi_adjust_cfa_offset	-8
1841	movq	%r10,(%r9)
1842	movq	%r11,8(%r9)
1843
1844	popq	%r15
1845.cfi_adjust_cfa_offset	-8
1846	popq	%r14
1847.cfi_adjust_cfa_offset	-8
1848	popq	%r13
1849.cfi_adjust_cfa_offset	-8
1850	popq	%r12
1851.cfi_adjust_cfa_offset	-8
1852	popq	%rbx
1853.cfi_adjust_cfa_offset	-8
1854	popq	%rbp
1855.cfi_adjust_cfa_offset	-8
1856	.byte	0xf3,0xc3
1857.cfi_adjust_cfa_offset	(8 * 6) + 288 + 32
1858
1859open_sse_128:
1860	movdqu	.chacha20_consts(%rip),%xmm0
1861	movdqa	%xmm0,%xmm1
1862	movdqa	%xmm0,%xmm2
1863	movdqu	0(%r9),%xmm4
1864	movdqa	%xmm4,%xmm5
1865	movdqa	%xmm4,%xmm6
1866	movdqu	16(%r9),%xmm8
1867	movdqa	%xmm8,%xmm9
1868	movdqa	%xmm8,%xmm10
1869	movdqu	32(%r9),%xmm12
1870	movdqa	%xmm12,%xmm13
1871	paddd	.sse_inc(%rip),%xmm13
1872	movdqa	%xmm13,%xmm14
1873	paddd	.sse_inc(%rip),%xmm14
1874	movdqa	%xmm4,%xmm7
1875	movdqa	%xmm8,%xmm11
1876	movdqa	%xmm13,%xmm15
1877	movq	$10,%r10
18781:
1879	paddd	%xmm4,%xmm0
1880	pxor	%xmm0,%xmm12
1881	pshufb	.rol16(%rip),%xmm12
1882	paddd	%xmm12,%xmm8
1883	pxor	%xmm8,%xmm4
1884	movdqa	%xmm4,%xmm3
1885	pslld	$12,%xmm3
1886	psrld	$20,%xmm4
1887	pxor	%xmm3,%xmm4
1888	paddd	%xmm4,%xmm0
1889	pxor	%xmm0,%xmm12
1890	pshufb	.rol8(%rip),%xmm12
1891	paddd	%xmm12,%xmm8
1892	pxor	%xmm8,%xmm4
1893	movdqa	%xmm4,%xmm3
1894	pslld	$7,%xmm3
1895	psrld	$25,%xmm4
1896	pxor	%xmm3,%xmm4
1897.byte	102,15,58,15,228,4
1898.byte	102,69,15,58,15,192,8
1899.byte	102,69,15,58,15,228,12
1900	paddd	%xmm5,%xmm1
1901	pxor	%xmm1,%xmm13
1902	pshufb	.rol16(%rip),%xmm13
1903	paddd	%xmm13,%xmm9
1904	pxor	%xmm9,%xmm5
1905	movdqa	%xmm5,%xmm3
1906	pslld	$12,%xmm3
1907	psrld	$20,%xmm5
1908	pxor	%xmm3,%xmm5
1909	paddd	%xmm5,%xmm1
1910	pxor	%xmm1,%xmm13
1911	pshufb	.rol8(%rip),%xmm13
1912	paddd	%xmm13,%xmm9
1913	pxor	%xmm9,%xmm5
1914	movdqa	%xmm5,%xmm3
1915	pslld	$7,%xmm3
1916	psrld	$25,%xmm5
1917	pxor	%xmm3,%xmm5
1918.byte	102,15,58,15,237,4
1919.byte	102,69,15,58,15,201,8
1920.byte	102,69,15,58,15,237,12
1921	paddd	%xmm6,%xmm2
1922	pxor	%xmm2,%xmm14
1923	pshufb	.rol16(%rip),%xmm14
1924	paddd	%xmm14,%xmm10
1925	pxor	%xmm10,%xmm6
1926	movdqa	%xmm6,%xmm3
1927	pslld	$12,%xmm3
1928	psrld	$20,%xmm6
1929	pxor	%xmm3,%xmm6
1930	paddd	%xmm6,%xmm2
1931	pxor	%xmm2,%xmm14
1932	pshufb	.rol8(%rip),%xmm14
1933	paddd	%xmm14,%xmm10
1934	pxor	%xmm10,%xmm6
1935	movdqa	%xmm6,%xmm3
1936	pslld	$7,%xmm3
1937	psrld	$25,%xmm6
1938	pxor	%xmm3,%xmm6
1939.byte	102,15,58,15,246,4
1940.byte	102,69,15,58,15,210,8
1941.byte	102,69,15,58,15,246,12
1942	paddd	%xmm4,%xmm0
1943	pxor	%xmm0,%xmm12
1944	pshufb	.rol16(%rip),%xmm12
1945	paddd	%xmm12,%xmm8
1946	pxor	%xmm8,%xmm4
1947	movdqa	%xmm4,%xmm3
1948	pslld	$12,%xmm3
1949	psrld	$20,%xmm4
1950	pxor	%xmm3,%xmm4
1951	paddd	%xmm4,%xmm0
1952	pxor	%xmm0,%xmm12
1953	pshufb	.rol8(%rip),%xmm12
1954	paddd	%xmm12,%xmm8
1955	pxor	%xmm8,%xmm4
1956	movdqa	%xmm4,%xmm3
1957	pslld	$7,%xmm3
1958	psrld	$25,%xmm4
1959	pxor	%xmm3,%xmm4
1960.byte	102,15,58,15,228,12
1961.byte	102,69,15,58,15,192,8
1962.byte	102,69,15,58,15,228,4
1963	paddd	%xmm5,%xmm1
1964	pxor	%xmm1,%xmm13
1965	pshufb	.rol16(%rip),%xmm13
1966	paddd	%xmm13,%xmm9
1967	pxor	%xmm9,%xmm5
1968	movdqa	%xmm5,%xmm3
1969	pslld	$12,%xmm3
1970	psrld	$20,%xmm5
1971	pxor	%xmm3,%xmm5
1972	paddd	%xmm5,%xmm1
1973	pxor	%xmm1,%xmm13
1974	pshufb	.rol8(%rip),%xmm13
1975	paddd	%xmm13,%xmm9
1976	pxor	%xmm9,%xmm5
1977	movdqa	%xmm5,%xmm3
1978	pslld	$7,%xmm3
1979	psrld	$25,%xmm5
1980	pxor	%xmm3,%xmm5
1981.byte	102,15,58,15,237,12
1982.byte	102,69,15,58,15,201,8
1983.byte	102,69,15,58,15,237,4
1984	paddd	%xmm6,%xmm2
1985	pxor	%xmm2,%xmm14
1986	pshufb	.rol16(%rip),%xmm14
1987	paddd	%xmm14,%xmm10
1988	pxor	%xmm10,%xmm6
1989	movdqa	%xmm6,%xmm3
1990	pslld	$12,%xmm3
1991	psrld	$20,%xmm6
1992	pxor	%xmm3,%xmm6
1993	paddd	%xmm6,%xmm2
1994	pxor	%xmm2,%xmm14
1995	pshufb	.rol8(%rip),%xmm14
1996	paddd	%xmm14,%xmm10
1997	pxor	%xmm10,%xmm6
1998	movdqa	%xmm6,%xmm3
1999	pslld	$7,%xmm3
2000	psrld	$25,%xmm6
2001	pxor	%xmm3,%xmm6
2002.byte	102,15,58,15,246,12
2003.byte	102,69,15,58,15,210,8
2004.byte	102,69,15,58,15,246,4
2005
2006	decq	%r10
2007	jnz	1b
2008	paddd	.chacha20_consts(%rip),%xmm0
2009	paddd	.chacha20_consts(%rip),%xmm1
2010	paddd	.chacha20_consts(%rip),%xmm2
2011	paddd	%xmm7,%xmm4
2012	paddd	%xmm7,%xmm5
2013	paddd	%xmm7,%xmm6
2014	paddd	%xmm11,%xmm9
2015	paddd	%xmm11,%xmm10
2016	paddd	%xmm15,%xmm13
2017	paddd	.sse_inc(%rip),%xmm15
2018	paddd	%xmm15,%xmm14
2019
2020	pand	.clamp(%rip),%xmm0
2021	movdqa	%xmm0,0(%rbp)
2022	movdqa	%xmm4,16(%rbp)
2023
2024	movq	%r8,%r8
2025	call	poly_hash_ad_internal
20261:
2027	cmpq	$16,%rbx
2028	jb	open_sse_tail_16
2029	subq	$16,%rbx
2030	addq	0(%rsi),%r10
2031	adcq	8+0(%rsi),%r11
2032	adcq	$1,%r12
2033
2034
2035	movdqu	0(%rsi),%xmm3
2036	pxor	%xmm3,%xmm1
2037	movdqu	%xmm1,0(%rdi)
2038	leaq	16(%rsi),%rsi
2039	leaq	16(%rdi),%rdi
2040	movq	0+0(%rbp),%rax
2041	movq	%rax,%r15
2042	mulq	%r10
2043	movq	%rax,%r13
2044	movq	%rdx,%r14
2045	movq	0+0(%rbp),%rax
2046	mulq	%r11
2047	imulq	%r12,%r15
2048	addq	%rax,%r14
2049	adcq	%rdx,%r15
2050	movq	8+0(%rbp),%rax
2051	movq	%rax,%r9
2052	mulq	%r10
2053	addq	%rax,%r14
2054	adcq	$0,%rdx
2055	movq	%rdx,%r10
2056	movq	8+0(%rbp),%rax
2057	mulq	%r11
2058	addq	%rax,%r15
2059	adcq	$0,%rdx
2060	imulq	%r12,%r9
2061	addq	%r10,%r15
2062	adcq	%rdx,%r9
2063	movq	%r13,%r10
2064	movq	%r14,%r11
2065	movq	%r15,%r12
2066	andq	$3,%r12
2067	movq	%r15,%r13
2068	andq	$-4,%r13
2069	movq	%r9,%r14
2070	shrdq	$2,%r9,%r15
2071	shrq	$2,%r9
2072	addq	%r13,%r10
2073	adcq	%r14,%r11
2074	adcq	$0,%r12
2075	addq	%r15,%r10
2076	adcq	%r9,%r11
2077	adcq	$0,%r12
2078
2079
2080	movdqa	%xmm5,%xmm1
2081	movdqa	%xmm9,%xmm5
2082	movdqa	%xmm13,%xmm9
2083	movdqa	%xmm2,%xmm13
2084	movdqa	%xmm6,%xmm2
2085	movdqa	%xmm10,%xmm6
2086	movdqa	%xmm14,%xmm10
2087	jmp	1b
2088	jmp	open_sse_tail_16
2089.size	chacha20_poly1305_open, .-chacha20_poly1305_open
2090.cfi_endproc
2091
2092
2093
2094
2095.globl	chacha20_poly1305_seal
2096.hidden chacha20_poly1305_seal
2097.type	chacha20_poly1305_seal,@function
2098.align	64
2099chacha20_poly1305_seal:
2100.cfi_startproc
2101	pushq	%rbp
2102.cfi_adjust_cfa_offset	8
2103	pushq	%rbx
2104.cfi_adjust_cfa_offset	8
2105	pushq	%r12
2106.cfi_adjust_cfa_offset	8
2107	pushq	%r13
2108.cfi_adjust_cfa_offset	8
2109	pushq	%r14
2110.cfi_adjust_cfa_offset	8
2111	pushq	%r15
2112.cfi_adjust_cfa_offset	8
2113
2114
2115	pushq	%r9
2116.cfi_adjust_cfa_offset	8
2117	subq	$288 + 32,%rsp
2118.cfi_adjust_cfa_offset	288 + 32
2119.cfi_offset	rbp, -16
2120.cfi_offset	rbx, -24
2121.cfi_offset	r12, -32
2122.cfi_offset	r13, -40
2123.cfi_offset	r14, -48
2124.cfi_offset	r15, -56
2125	leaq	32(%rsp),%rbp
2126	andq	$-32,%rbp
2127	movq	%rdx,8+32(%rbp)
2128	movq	%r8,0+32(%rbp)
2129	movq	%rdx,%rbx
2130
2131	movl	OPENSSL_ia32cap_P+8(%rip),%eax
2132	andl	$288,%eax
2133	xorl	$288,%eax
2134	jz	chacha20_poly1305_seal_avx2
2135
2136	cmpq	$128,%rbx
2137	jbe	seal_sse_128
2138
2139	movdqa	.chacha20_consts(%rip),%xmm0
2140	movdqu	0(%r9),%xmm4
2141	movdqu	16(%r9),%xmm8
2142	movdqu	32(%r9),%xmm12
2143	movdqa	%xmm0,%xmm1
2144	movdqa	%xmm0,%xmm2
2145	movdqa	%xmm0,%xmm3
2146	movdqa	%xmm4,%xmm5
2147	movdqa	%xmm4,%xmm6
2148	movdqa	%xmm4,%xmm7
2149	movdqa	%xmm8,%xmm9
2150	movdqa	%xmm8,%xmm10
2151	movdqa	%xmm8,%xmm11
2152	movdqa	%xmm12,%xmm15
2153	paddd	.sse_inc(%rip),%xmm12
2154	movdqa	%xmm12,%xmm14
2155	paddd	.sse_inc(%rip),%xmm12
2156	movdqa	%xmm12,%xmm13
2157	paddd	.sse_inc(%rip),%xmm12
2158
2159	movdqa	%xmm4,48(%rbp)
2160	movdqa	%xmm8,64(%rbp)
2161	movdqa	%xmm12,96(%rbp)
2162	movdqa	%xmm13,112(%rbp)
2163	movdqa	%xmm14,128(%rbp)
2164	movdqa	%xmm15,144(%rbp)
2165	movq	$10,%r10
21661:
2167	movdqa	%xmm8,80(%rbp)
2168	movdqa	.rol16(%rip),%xmm8
2169	paddd	%xmm7,%xmm3
2170	paddd	%xmm6,%xmm2
2171	paddd	%xmm5,%xmm1
2172	paddd	%xmm4,%xmm0
2173	pxor	%xmm3,%xmm15
2174	pxor	%xmm2,%xmm14
2175	pxor	%xmm1,%xmm13
2176	pxor	%xmm0,%xmm12
2177.byte	102,69,15,56,0,248
2178.byte	102,69,15,56,0,240
2179.byte	102,69,15,56,0,232
2180.byte	102,69,15,56,0,224
2181	movdqa	80(%rbp),%xmm8
2182	paddd	%xmm15,%xmm11
2183	paddd	%xmm14,%xmm10
2184	paddd	%xmm13,%xmm9
2185	paddd	%xmm12,%xmm8
2186	pxor	%xmm11,%xmm7
2187	pxor	%xmm10,%xmm6
2188	pxor	%xmm9,%xmm5
2189	pxor	%xmm8,%xmm4
2190	movdqa	%xmm8,80(%rbp)
2191	movdqa	%xmm7,%xmm8
2192	psrld	$20,%xmm8
2193	pslld	$32-20,%xmm7
2194	pxor	%xmm8,%xmm7
2195	movdqa	%xmm6,%xmm8
2196	psrld	$20,%xmm8
2197	pslld	$32-20,%xmm6
2198	pxor	%xmm8,%xmm6
2199	movdqa	%xmm5,%xmm8
2200	psrld	$20,%xmm8
2201	pslld	$32-20,%xmm5
2202	pxor	%xmm8,%xmm5
2203	movdqa	%xmm4,%xmm8
2204	psrld	$20,%xmm8
2205	pslld	$32-20,%xmm4
2206	pxor	%xmm8,%xmm4
2207	movdqa	.rol8(%rip),%xmm8
2208	paddd	%xmm7,%xmm3
2209	paddd	%xmm6,%xmm2
2210	paddd	%xmm5,%xmm1
2211	paddd	%xmm4,%xmm0
2212	pxor	%xmm3,%xmm15
2213	pxor	%xmm2,%xmm14
2214	pxor	%xmm1,%xmm13
2215	pxor	%xmm0,%xmm12
2216.byte	102,69,15,56,0,248
2217.byte	102,69,15,56,0,240
2218.byte	102,69,15,56,0,232
2219.byte	102,69,15,56,0,224
2220	movdqa	80(%rbp),%xmm8
2221	paddd	%xmm15,%xmm11
2222	paddd	%xmm14,%xmm10
2223	paddd	%xmm13,%xmm9
2224	paddd	%xmm12,%xmm8
2225	pxor	%xmm11,%xmm7
2226	pxor	%xmm10,%xmm6
2227	pxor	%xmm9,%xmm5
2228	pxor	%xmm8,%xmm4
2229	movdqa	%xmm8,80(%rbp)
2230	movdqa	%xmm7,%xmm8
2231	psrld	$25,%xmm8
2232	pslld	$32-25,%xmm7
2233	pxor	%xmm8,%xmm7
2234	movdqa	%xmm6,%xmm8
2235	psrld	$25,%xmm8
2236	pslld	$32-25,%xmm6
2237	pxor	%xmm8,%xmm6
2238	movdqa	%xmm5,%xmm8
2239	psrld	$25,%xmm8
2240	pslld	$32-25,%xmm5
2241	pxor	%xmm8,%xmm5
2242	movdqa	%xmm4,%xmm8
2243	psrld	$25,%xmm8
2244	pslld	$32-25,%xmm4
2245	pxor	%xmm8,%xmm4
2246	movdqa	80(%rbp),%xmm8
2247.byte	102,15,58,15,255,4
2248.byte	102,69,15,58,15,219,8
2249.byte	102,69,15,58,15,255,12
2250.byte	102,15,58,15,246,4
2251.byte	102,69,15,58,15,210,8
2252.byte	102,69,15,58,15,246,12
2253.byte	102,15,58,15,237,4
2254.byte	102,69,15,58,15,201,8
2255.byte	102,69,15,58,15,237,12
2256.byte	102,15,58,15,228,4
2257.byte	102,69,15,58,15,192,8
2258.byte	102,69,15,58,15,228,12
2259	movdqa	%xmm8,80(%rbp)
2260	movdqa	.rol16(%rip),%xmm8
2261	paddd	%xmm7,%xmm3
2262	paddd	%xmm6,%xmm2
2263	paddd	%xmm5,%xmm1
2264	paddd	%xmm4,%xmm0
2265	pxor	%xmm3,%xmm15
2266	pxor	%xmm2,%xmm14
2267	pxor	%xmm1,%xmm13
2268	pxor	%xmm0,%xmm12
2269.byte	102,69,15,56,0,248
2270.byte	102,69,15,56,0,240
2271.byte	102,69,15,56,0,232
2272.byte	102,69,15,56,0,224
2273	movdqa	80(%rbp),%xmm8
2274	paddd	%xmm15,%xmm11
2275	paddd	%xmm14,%xmm10
2276	paddd	%xmm13,%xmm9
2277	paddd	%xmm12,%xmm8
2278	pxor	%xmm11,%xmm7
2279	pxor	%xmm10,%xmm6
2280	pxor	%xmm9,%xmm5
2281	pxor	%xmm8,%xmm4
2282	movdqa	%xmm8,80(%rbp)
2283	movdqa	%xmm7,%xmm8
2284	psrld	$20,%xmm8
2285	pslld	$32-20,%xmm7
2286	pxor	%xmm8,%xmm7
2287	movdqa	%xmm6,%xmm8
2288	psrld	$20,%xmm8
2289	pslld	$32-20,%xmm6
2290	pxor	%xmm8,%xmm6
2291	movdqa	%xmm5,%xmm8
2292	psrld	$20,%xmm8
2293	pslld	$32-20,%xmm5
2294	pxor	%xmm8,%xmm5
2295	movdqa	%xmm4,%xmm8
2296	psrld	$20,%xmm8
2297	pslld	$32-20,%xmm4
2298	pxor	%xmm8,%xmm4
2299	movdqa	.rol8(%rip),%xmm8
2300	paddd	%xmm7,%xmm3
2301	paddd	%xmm6,%xmm2
2302	paddd	%xmm5,%xmm1
2303	paddd	%xmm4,%xmm0
2304	pxor	%xmm3,%xmm15
2305	pxor	%xmm2,%xmm14
2306	pxor	%xmm1,%xmm13
2307	pxor	%xmm0,%xmm12
2308.byte	102,69,15,56,0,248
2309.byte	102,69,15,56,0,240
2310.byte	102,69,15,56,0,232
2311.byte	102,69,15,56,0,224
2312	movdqa	80(%rbp),%xmm8
2313	paddd	%xmm15,%xmm11
2314	paddd	%xmm14,%xmm10
2315	paddd	%xmm13,%xmm9
2316	paddd	%xmm12,%xmm8
2317	pxor	%xmm11,%xmm7
2318	pxor	%xmm10,%xmm6
2319	pxor	%xmm9,%xmm5
2320	pxor	%xmm8,%xmm4
2321	movdqa	%xmm8,80(%rbp)
2322	movdqa	%xmm7,%xmm8
2323	psrld	$25,%xmm8
2324	pslld	$32-25,%xmm7
2325	pxor	%xmm8,%xmm7
2326	movdqa	%xmm6,%xmm8
2327	psrld	$25,%xmm8
2328	pslld	$32-25,%xmm6
2329	pxor	%xmm8,%xmm6
2330	movdqa	%xmm5,%xmm8
2331	psrld	$25,%xmm8
2332	pslld	$32-25,%xmm5
2333	pxor	%xmm8,%xmm5
2334	movdqa	%xmm4,%xmm8
2335	psrld	$25,%xmm8
2336	pslld	$32-25,%xmm4
2337	pxor	%xmm8,%xmm4
2338	movdqa	80(%rbp),%xmm8
2339.byte	102,15,58,15,255,12
2340.byte	102,69,15,58,15,219,8
2341.byte	102,69,15,58,15,255,4
2342.byte	102,15,58,15,246,12
2343.byte	102,69,15,58,15,210,8
2344.byte	102,69,15,58,15,246,4
2345.byte	102,15,58,15,237,12
2346.byte	102,69,15,58,15,201,8
2347.byte	102,69,15,58,15,237,4
2348.byte	102,15,58,15,228,12
2349.byte	102,69,15,58,15,192,8
2350.byte	102,69,15,58,15,228,4
2351
2352	decq	%r10
2353	jnz	1b
2354	paddd	.chacha20_consts(%rip),%xmm3
2355	paddd	48(%rbp),%xmm7
2356	paddd	64(%rbp),%xmm11
2357	paddd	144(%rbp),%xmm15
2358	paddd	.chacha20_consts(%rip),%xmm2
2359	paddd	48(%rbp),%xmm6
2360	paddd	64(%rbp),%xmm10
2361	paddd	128(%rbp),%xmm14
2362	paddd	.chacha20_consts(%rip),%xmm1
2363	paddd	48(%rbp),%xmm5
2364	paddd	64(%rbp),%xmm9
2365	paddd	112(%rbp),%xmm13
2366	paddd	.chacha20_consts(%rip),%xmm0
2367	paddd	48(%rbp),%xmm4
2368	paddd	64(%rbp),%xmm8
2369	paddd	96(%rbp),%xmm12
2370
2371
2372	pand	.clamp(%rip),%xmm3
2373	movdqa	%xmm3,0(%rbp)
2374	movdqa	%xmm7,16(%rbp)
2375
2376	movq	%r8,%r8
2377	call	poly_hash_ad_internal
2378	movdqu	0 + 0(%rsi),%xmm3
2379	movdqu	16 + 0(%rsi),%xmm7
2380	movdqu	32 + 0(%rsi),%xmm11
2381	movdqu	48 + 0(%rsi),%xmm15
2382	pxor	%xmm3,%xmm2
2383	pxor	%xmm7,%xmm6
2384	pxor	%xmm11,%xmm10
2385	pxor	%xmm14,%xmm15
2386	movdqu	%xmm2,0 + 0(%rdi)
2387	movdqu	%xmm6,16 + 0(%rdi)
2388	movdqu	%xmm10,32 + 0(%rdi)
2389	movdqu	%xmm15,48 + 0(%rdi)
2390	movdqu	0 + 64(%rsi),%xmm3
2391	movdqu	16 + 64(%rsi),%xmm7
2392	movdqu	32 + 64(%rsi),%xmm11
2393	movdqu	48 + 64(%rsi),%xmm15
2394	pxor	%xmm3,%xmm1
2395	pxor	%xmm7,%xmm5
2396	pxor	%xmm11,%xmm9
2397	pxor	%xmm13,%xmm15
2398	movdqu	%xmm1,0 + 64(%rdi)
2399	movdqu	%xmm5,16 + 64(%rdi)
2400	movdqu	%xmm9,32 + 64(%rdi)
2401	movdqu	%xmm15,48 + 64(%rdi)
2402
2403	cmpq	$192,%rbx
2404	ja	1f
2405	movq	$128,%rcx
2406	subq	$128,%rbx
2407	leaq	128(%rsi),%rsi
2408	jmp	seal_sse_128_seal_hash
24091:
2410	movdqu	0 + 128(%rsi),%xmm3
2411	movdqu	16 + 128(%rsi),%xmm7
2412	movdqu	32 + 128(%rsi),%xmm11
2413	movdqu	48 + 128(%rsi),%xmm15
2414	pxor	%xmm3,%xmm0
2415	pxor	%xmm7,%xmm4
2416	pxor	%xmm11,%xmm8
2417	pxor	%xmm12,%xmm15
2418	movdqu	%xmm0,0 + 128(%rdi)
2419	movdqu	%xmm4,16 + 128(%rdi)
2420	movdqu	%xmm8,32 + 128(%rdi)
2421	movdqu	%xmm15,48 + 128(%rdi)
2422
2423	movq	$192,%rcx
2424	subq	$192,%rbx
2425	leaq	192(%rsi),%rsi
2426	movq	$2,%rcx
2427	movq	$8,%r8
2428	cmpq	$64,%rbx
2429	jbe	seal_sse_tail_64
2430	cmpq	$128,%rbx
2431	jbe	seal_sse_tail_128
2432	cmpq	$192,%rbx
2433	jbe	seal_sse_tail_192
2434
24351:
2436	movdqa	.chacha20_consts(%rip),%xmm0
2437	movdqa	48(%rbp),%xmm4
2438	movdqa	64(%rbp),%xmm8
2439	movdqa	%xmm0,%xmm1
2440	movdqa	%xmm4,%xmm5
2441	movdqa	%xmm8,%xmm9
2442	movdqa	%xmm0,%xmm2
2443	movdqa	%xmm4,%xmm6
2444	movdqa	%xmm8,%xmm10
2445	movdqa	%xmm0,%xmm3
2446	movdqa	%xmm4,%xmm7
2447	movdqa	%xmm8,%xmm11
2448	movdqa	96(%rbp),%xmm15
2449	paddd	.sse_inc(%rip),%xmm15
2450	movdqa	%xmm15,%xmm14
2451	paddd	.sse_inc(%rip),%xmm14
2452	movdqa	%xmm14,%xmm13
2453	paddd	.sse_inc(%rip),%xmm13
2454	movdqa	%xmm13,%xmm12
2455	paddd	.sse_inc(%rip),%xmm12
2456	movdqa	%xmm12,96(%rbp)
2457	movdqa	%xmm13,112(%rbp)
2458	movdqa	%xmm14,128(%rbp)
2459	movdqa	%xmm15,144(%rbp)
2460
24612:
2462	movdqa	%xmm8,80(%rbp)
2463	movdqa	.rol16(%rip),%xmm8
2464	paddd	%xmm7,%xmm3
2465	paddd	%xmm6,%xmm2
2466	paddd	%xmm5,%xmm1
2467	paddd	%xmm4,%xmm0
2468	pxor	%xmm3,%xmm15
2469	pxor	%xmm2,%xmm14
2470	pxor	%xmm1,%xmm13
2471	pxor	%xmm0,%xmm12
2472.byte	102,69,15,56,0,248
2473.byte	102,69,15,56,0,240
2474.byte	102,69,15,56,0,232
2475.byte	102,69,15,56,0,224
2476	movdqa	80(%rbp),%xmm8
2477	paddd	%xmm15,%xmm11
2478	paddd	%xmm14,%xmm10
2479	paddd	%xmm13,%xmm9
2480	paddd	%xmm12,%xmm8
2481	pxor	%xmm11,%xmm7
2482	addq	0(%rdi),%r10
2483	adcq	8+0(%rdi),%r11
2484	adcq	$1,%r12
2485	pxor	%xmm10,%xmm6
2486	pxor	%xmm9,%xmm5
2487	pxor	%xmm8,%xmm4
2488	movdqa	%xmm8,80(%rbp)
2489	movdqa	%xmm7,%xmm8
2490	psrld	$20,%xmm8
2491	pslld	$32-20,%xmm7
2492	pxor	%xmm8,%xmm7
2493	movdqa	%xmm6,%xmm8
2494	psrld	$20,%xmm8
2495	pslld	$32-20,%xmm6
2496	pxor	%xmm8,%xmm6
2497	movdqa	%xmm5,%xmm8
2498	psrld	$20,%xmm8
2499	pslld	$32-20,%xmm5
2500	pxor	%xmm8,%xmm5
2501	movdqa	%xmm4,%xmm8
2502	psrld	$20,%xmm8
2503	pslld	$32-20,%xmm4
2504	pxor	%xmm8,%xmm4
2505	movq	0+0(%rbp),%rax
2506	movq	%rax,%r15
2507	mulq	%r10
2508	movq	%rax,%r13
2509	movq	%rdx,%r14
2510	movq	0+0(%rbp),%rax
2511	mulq	%r11
2512	imulq	%r12,%r15
2513	addq	%rax,%r14
2514	adcq	%rdx,%r15
2515	movdqa	.rol8(%rip),%xmm8
2516	paddd	%xmm7,%xmm3
2517	paddd	%xmm6,%xmm2
2518	paddd	%xmm5,%xmm1
2519	paddd	%xmm4,%xmm0
2520	pxor	%xmm3,%xmm15
2521	pxor	%xmm2,%xmm14
2522	pxor	%xmm1,%xmm13
2523	pxor	%xmm0,%xmm12
2524.byte	102,69,15,56,0,248
2525.byte	102,69,15,56,0,240
2526.byte	102,69,15,56,0,232
2527.byte	102,69,15,56,0,224
2528	movdqa	80(%rbp),%xmm8
2529	paddd	%xmm15,%xmm11
2530	paddd	%xmm14,%xmm10
2531	paddd	%xmm13,%xmm9
2532	paddd	%xmm12,%xmm8
2533	pxor	%xmm11,%xmm7
2534	pxor	%xmm10,%xmm6
2535	movq	8+0(%rbp),%rax
2536	movq	%rax,%r9
2537	mulq	%r10
2538	addq	%rax,%r14
2539	adcq	$0,%rdx
2540	movq	%rdx,%r10
2541	movq	8+0(%rbp),%rax
2542	mulq	%r11
2543	addq	%rax,%r15
2544	adcq	$0,%rdx
2545	pxor	%xmm9,%xmm5
2546	pxor	%xmm8,%xmm4
2547	movdqa	%xmm8,80(%rbp)
2548	movdqa	%xmm7,%xmm8
2549	psrld	$25,%xmm8
2550	pslld	$32-25,%xmm7
2551	pxor	%xmm8,%xmm7
2552	movdqa	%xmm6,%xmm8
2553	psrld	$25,%xmm8
2554	pslld	$32-25,%xmm6
2555	pxor	%xmm8,%xmm6
2556	movdqa	%xmm5,%xmm8
2557	psrld	$25,%xmm8
2558	pslld	$32-25,%xmm5
2559	pxor	%xmm8,%xmm5
2560	movdqa	%xmm4,%xmm8
2561	psrld	$25,%xmm8
2562	pslld	$32-25,%xmm4
2563	pxor	%xmm8,%xmm4
2564	movdqa	80(%rbp),%xmm8
2565	imulq	%r12,%r9
2566	addq	%r10,%r15
2567	adcq	%rdx,%r9
2568.byte	102,15,58,15,255,4
2569.byte	102,69,15,58,15,219,8
2570.byte	102,69,15,58,15,255,12
2571.byte	102,15,58,15,246,4
2572.byte	102,69,15,58,15,210,8
2573.byte	102,69,15,58,15,246,12
2574.byte	102,15,58,15,237,4
2575.byte	102,69,15,58,15,201,8
2576.byte	102,69,15,58,15,237,12
2577.byte	102,15,58,15,228,4
2578.byte	102,69,15,58,15,192,8
2579.byte	102,69,15,58,15,228,12
2580	movdqa	%xmm8,80(%rbp)
2581	movdqa	.rol16(%rip),%xmm8
2582	paddd	%xmm7,%xmm3
2583	paddd	%xmm6,%xmm2
2584	paddd	%xmm5,%xmm1
2585	paddd	%xmm4,%xmm0
2586	pxor	%xmm3,%xmm15
2587	pxor	%xmm2,%xmm14
2588	movq	%r13,%r10
2589	movq	%r14,%r11
2590	movq	%r15,%r12
2591	andq	$3,%r12
2592	movq	%r15,%r13
2593	andq	$-4,%r13
2594	movq	%r9,%r14
2595	shrdq	$2,%r9,%r15
2596	shrq	$2,%r9
2597	addq	%r13,%r10
2598	adcq	%r14,%r11
2599	adcq	$0,%r12
2600	addq	%r15,%r10
2601	adcq	%r9,%r11
2602	adcq	$0,%r12
2603	pxor	%xmm1,%xmm13
2604	pxor	%xmm0,%xmm12
2605.byte	102,69,15,56,0,248
2606.byte	102,69,15,56,0,240
2607.byte	102,69,15,56,0,232
2608.byte	102,69,15,56,0,224
2609	movdqa	80(%rbp),%xmm8
2610	paddd	%xmm15,%xmm11
2611	paddd	%xmm14,%xmm10
2612	paddd	%xmm13,%xmm9
2613	paddd	%xmm12,%xmm8
2614	pxor	%xmm11,%xmm7
2615	pxor	%xmm10,%xmm6
2616	pxor	%xmm9,%xmm5
2617	pxor	%xmm8,%xmm4
2618	movdqa	%xmm8,80(%rbp)
2619	movdqa	%xmm7,%xmm8
2620	psrld	$20,%xmm8
2621	pslld	$32-20,%xmm7
2622	pxor	%xmm8,%xmm7
2623	movdqa	%xmm6,%xmm8
2624	psrld	$20,%xmm8
2625	pslld	$32-20,%xmm6
2626	pxor	%xmm8,%xmm6
2627	movdqa	%xmm5,%xmm8
2628	psrld	$20,%xmm8
2629	pslld	$32-20,%xmm5
2630	pxor	%xmm8,%xmm5
2631	movdqa	%xmm4,%xmm8
2632	psrld	$20,%xmm8
2633	pslld	$32-20,%xmm4
2634	pxor	%xmm8,%xmm4
2635	movdqa	.rol8(%rip),%xmm8
2636	paddd	%xmm7,%xmm3
2637	paddd	%xmm6,%xmm2
2638	paddd	%xmm5,%xmm1
2639	paddd	%xmm4,%xmm0
2640	pxor	%xmm3,%xmm15
2641	pxor	%xmm2,%xmm14
2642	pxor	%xmm1,%xmm13
2643	pxor	%xmm0,%xmm12
2644.byte	102,69,15,56,0,248
2645.byte	102,69,15,56,0,240
2646.byte	102,69,15,56,0,232
2647.byte	102,69,15,56,0,224
2648	movdqa	80(%rbp),%xmm8
2649	paddd	%xmm15,%xmm11
2650	paddd	%xmm14,%xmm10
2651	paddd	%xmm13,%xmm9
2652	paddd	%xmm12,%xmm8
2653	pxor	%xmm11,%xmm7
2654	pxor	%xmm10,%xmm6
2655	pxor	%xmm9,%xmm5
2656	pxor	%xmm8,%xmm4
2657	movdqa	%xmm8,80(%rbp)
2658	movdqa	%xmm7,%xmm8
2659	psrld	$25,%xmm8
2660	pslld	$32-25,%xmm7
2661	pxor	%xmm8,%xmm7
2662	movdqa	%xmm6,%xmm8
2663	psrld	$25,%xmm8
2664	pslld	$32-25,%xmm6
2665	pxor	%xmm8,%xmm6
2666	movdqa	%xmm5,%xmm8
2667	psrld	$25,%xmm8
2668	pslld	$32-25,%xmm5
2669	pxor	%xmm8,%xmm5
2670	movdqa	%xmm4,%xmm8
2671	psrld	$25,%xmm8
2672	pslld	$32-25,%xmm4
2673	pxor	%xmm8,%xmm4
2674	movdqa	80(%rbp),%xmm8
2675.byte	102,15,58,15,255,12
2676.byte	102,69,15,58,15,219,8
2677.byte	102,69,15,58,15,255,4
2678.byte	102,15,58,15,246,12
2679.byte	102,69,15,58,15,210,8
2680.byte	102,69,15,58,15,246,4
2681.byte	102,15,58,15,237,12
2682.byte	102,69,15,58,15,201,8
2683.byte	102,69,15,58,15,237,4
2684.byte	102,15,58,15,228,12
2685.byte	102,69,15,58,15,192,8
2686.byte	102,69,15,58,15,228,4
2687
2688	leaq	16(%rdi),%rdi
2689	decq	%r8
2690	jge	2b
2691	addq	0(%rdi),%r10
2692	adcq	8+0(%rdi),%r11
2693	adcq	$1,%r12
2694	movq	0+0(%rbp),%rax
2695	movq	%rax,%r15
2696	mulq	%r10
2697	movq	%rax,%r13
2698	movq	%rdx,%r14
2699	movq	0+0(%rbp),%rax
2700	mulq	%r11
2701	imulq	%r12,%r15
2702	addq	%rax,%r14
2703	adcq	%rdx,%r15
2704	movq	8+0(%rbp),%rax
2705	movq	%rax,%r9
2706	mulq	%r10
2707	addq	%rax,%r14
2708	adcq	$0,%rdx
2709	movq	%rdx,%r10
2710	movq	8+0(%rbp),%rax
2711	mulq	%r11
2712	addq	%rax,%r15
2713	adcq	$0,%rdx
2714	imulq	%r12,%r9
2715	addq	%r10,%r15
2716	adcq	%rdx,%r9
2717	movq	%r13,%r10
2718	movq	%r14,%r11
2719	movq	%r15,%r12
2720	andq	$3,%r12
2721	movq	%r15,%r13
2722	andq	$-4,%r13
2723	movq	%r9,%r14
2724	shrdq	$2,%r9,%r15
2725	shrq	$2,%r9
2726	addq	%r13,%r10
2727	adcq	%r14,%r11
2728	adcq	$0,%r12
2729	addq	%r15,%r10
2730	adcq	%r9,%r11
2731	adcq	$0,%r12
2732
2733	leaq	16(%rdi),%rdi
2734	decq	%rcx
2735	jg	2b
2736	paddd	.chacha20_consts(%rip),%xmm3
2737	paddd	48(%rbp),%xmm7
2738	paddd	64(%rbp),%xmm11
2739	paddd	144(%rbp),%xmm15
2740	paddd	.chacha20_consts(%rip),%xmm2
2741	paddd	48(%rbp),%xmm6
2742	paddd	64(%rbp),%xmm10
2743	paddd	128(%rbp),%xmm14
2744	paddd	.chacha20_consts(%rip),%xmm1
2745	paddd	48(%rbp),%xmm5
2746	paddd	64(%rbp),%xmm9
2747	paddd	112(%rbp),%xmm13
2748	paddd	.chacha20_consts(%rip),%xmm0
2749	paddd	48(%rbp),%xmm4
2750	paddd	64(%rbp),%xmm8
2751	paddd	96(%rbp),%xmm12
2752
2753	movdqa	%xmm14,80(%rbp)
2754	movdqa	%xmm14,80(%rbp)
2755	movdqu	0 + 0(%rsi),%xmm14
2756	pxor	%xmm3,%xmm14
2757	movdqu	%xmm14,0 + 0(%rdi)
2758	movdqu	16 + 0(%rsi),%xmm14
2759	pxor	%xmm7,%xmm14
2760	movdqu	%xmm14,16 + 0(%rdi)
2761	movdqu	32 + 0(%rsi),%xmm14
2762	pxor	%xmm11,%xmm14
2763	movdqu	%xmm14,32 + 0(%rdi)
2764	movdqu	48 + 0(%rsi),%xmm14
2765	pxor	%xmm15,%xmm14
2766	movdqu	%xmm14,48 + 0(%rdi)
2767
2768	movdqa	80(%rbp),%xmm14
2769	movdqu	0 + 64(%rsi),%xmm3
2770	movdqu	16 + 64(%rsi),%xmm7
2771	movdqu	32 + 64(%rsi),%xmm11
2772	movdqu	48 + 64(%rsi),%xmm15
2773	pxor	%xmm3,%xmm2
2774	pxor	%xmm7,%xmm6
2775	pxor	%xmm11,%xmm10
2776	pxor	%xmm14,%xmm15
2777	movdqu	%xmm2,0 + 64(%rdi)
2778	movdqu	%xmm6,16 + 64(%rdi)
2779	movdqu	%xmm10,32 + 64(%rdi)
2780	movdqu	%xmm15,48 + 64(%rdi)
2781	movdqu	0 + 128(%rsi),%xmm3
2782	movdqu	16 + 128(%rsi),%xmm7
2783	movdqu	32 + 128(%rsi),%xmm11
2784	movdqu	48 + 128(%rsi),%xmm15
2785	pxor	%xmm3,%xmm1
2786	pxor	%xmm7,%xmm5
2787	pxor	%xmm11,%xmm9
2788	pxor	%xmm13,%xmm15
2789	movdqu	%xmm1,0 + 128(%rdi)
2790	movdqu	%xmm5,16 + 128(%rdi)
2791	movdqu	%xmm9,32 + 128(%rdi)
2792	movdqu	%xmm15,48 + 128(%rdi)
2793
2794	cmpq	$256,%rbx
2795	ja	3f
2796
2797	movq	$192,%rcx
2798	subq	$192,%rbx
2799	leaq	192(%rsi),%rsi
2800	jmp	seal_sse_128_seal_hash
28013:
2802	movdqu	0 + 192(%rsi),%xmm3
2803	movdqu	16 + 192(%rsi),%xmm7
2804	movdqu	32 + 192(%rsi),%xmm11
2805	movdqu	48 + 192(%rsi),%xmm15
2806	pxor	%xmm3,%xmm0
2807	pxor	%xmm7,%xmm4
2808	pxor	%xmm11,%xmm8
2809	pxor	%xmm12,%xmm15
2810	movdqu	%xmm0,0 + 192(%rdi)
2811	movdqu	%xmm4,16 + 192(%rdi)
2812	movdqu	%xmm8,32 + 192(%rdi)
2813	movdqu	%xmm15,48 + 192(%rdi)
2814
2815	leaq	256(%rsi),%rsi
2816	subq	$256,%rbx
2817	movq	$6,%rcx
2818	movq	$4,%r8
2819	cmpq	$192,%rbx
2820	jg	1b
2821	movq	%rbx,%rcx
2822	testq	%rbx,%rbx
2823	je	seal_sse_128_seal_hash
2824	movq	$6,%rcx
2825	cmpq	$64,%rbx
2826	jg	3f
2827
2828seal_sse_tail_64:
2829	movdqa	.chacha20_consts(%rip),%xmm0
2830	movdqa	48(%rbp),%xmm4
2831	movdqa	64(%rbp),%xmm8
2832	movdqa	96(%rbp),%xmm12
2833	paddd	.sse_inc(%rip),%xmm12
2834	movdqa	%xmm12,96(%rbp)
2835
28361:
2837	addq	0(%rdi),%r10
2838	adcq	8+0(%rdi),%r11
2839	adcq	$1,%r12
2840	movq	0+0(%rbp),%rax
2841	movq	%rax,%r15
2842	mulq	%r10
2843	movq	%rax,%r13
2844	movq	%rdx,%r14
2845	movq	0+0(%rbp),%rax
2846	mulq	%r11
2847	imulq	%r12,%r15
2848	addq	%rax,%r14
2849	adcq	%rdx,%r15
2850	movq	8+0(%rbp),%rax
2851	movq	%rax,%r9
2852	mulq	%r10
2853	addq	%rax,%r14
2854	adcq	$0,%rdx
2855	movq	%rdx,%r10
2856	movq	8+0(%rbp),%rax
2857	mulq	%r11
2858	addq	%rax,%r15
2859	adcq	$0,%rdx
2860	imulq	%r12,%r9
2861	addq	%r10,%r15
2862	adcq	%rdx,%r9
2863	movq	%r13,%r10
2864	movq	%r14,%r11
2865	movq	%r15,%r12
2866	andq	$3,%r12
2867	movq	%r15,%r13
2868	andq	$-4,%r13
2869	movq	%r9,%r14
2870	shrdq	$2,%r9,%r15
2871	shrq	$2,%r9
2872	addq	%r13,%r10
2873	adcq	%r14,%r11
2874	adcq	$0,%r12
2875	addq	%r15,%r10
2876	adcq	%r9,%r11
2877	adcq	$0,%r12
2878
2879	leaq	16(%rdi),%rdi
28802:
2881	paddd	%xmm4,%xmm0
2882	pxor	%xmm0,%xmm12
2883	pshufb	.rol16(%rip),%xmm12
2884	paddd	%xmm12,%xmm8
2885	pxor	%xmm8,%xmm4
2886	movdqa	%xmm4,%xmm3
2887	pslld	$12,%xmm3
2888	psrld	$20,%xmm4
2889	pxor	%xmm3,%xmm4
2890	paddd	%xmm4,%xmm0
2891	pxor	%xmm0,%xmm12
2892	pshufb	.rol8(%rip),%xmm12
2893	paddd	%xmm12,%xmm8
2894	pxor	%xmm8,%xmm4
2895	movdqa	%xmm4,%xmm3
2896	pslld	$7,%xmm3
2897	psrld	$25,%xmm4
2898	pxor	%xmm3,%xmm4
2899.byte	102,15,58,15,228,4
2900.byte	102,69,15,58,15,192,8
2901.byte	102,69,15,58,15,228,12
2902	paddd	%xmm4,%xmm0
2903	pxor	%xmm0,%xmm12
2904	pshufb	.rol16(%rip),%xmm12
2905	paddd	%xmm12,%xmm8
2906	pxor	%xmm8,%xmm4
2907	movdqa	%xmm4,%xmm3
2908	pslld	$12,%xmm3
2909	psrld	$20,%xmm4
2910	pxor	%xmm3,%xmm4
2911	paddd	%xmm4,%xmm0
2912	pxor	%xmm0,%xmm12
2913	pshufb	.rol8(%rip),%xmm12
2914	paddd	%xmm12,%xmm8
2915	pxor	%xmm8,%xmm4
2916	movdqa	%xmm4,%xmm3
2917	pslld	$7,%xmm3
2918	psrld	$25,%xmm4
2919	pxor	%xmm3,%xmm4
2920.byte	102,15,58,15,228,12
2921.byte	102,69,15,58,15,192,8
2922.byte	102,69,15,58,15,228,4
2923	addq	0(%rdi),%r10
2924	adcq	8+0(%rdi),%r11
2925	adcq	$1,%r12
2926	movq	0+0(%rbp),%rax
2927	movq	%rax,%r15
2928	mulq	%r10
2929	movq	%rax,%r13
2930	movq	%rdx,%r14
2931	movq	0+0(%rbp),%rax
2932	mulq	%r11
2933	imulq	%r12,%r15
2934	addq	%rax,%r14
2935	adcq	%rdx,%r15
2936	movq	8+0(%rbp),%rax
2937	movq	%rax,%r9
2938	mulq	%r10
2939	addq	%rax,%r14
2940	adcq	$0,%rdx
2941	movq	%rdx,%r10
2942	movq	8+0(%rbp),%rax
2943	mulq	%r11
2944	addq	%rax,%r15
2945	adcq	$0,%rdx
2946	imulq	%r12,%r9
2947	addq	%r10,%r15
2948	adcq	%rdx,%r9
2949	movq	%r13,%r10
2950	movq	%r14,%r11
2951	movq	%r15,%r12
2952	andq	$3,%r12
2953	movq	%r15,%r13
2954	andq	$-4,%r13
2955	movq	%r9,%r14
2956	shrdq	$2,%r9,%r15
2957	shrq	$2,%r9
2958	addq	%r13,%r10
2959	adcq	%r14,%r11
2960	adcq	$0,%r12
2961	addq	%r15,%r10
2962	adcq	%r9,%r11
2963	adcq	$0,%r12
2964
2965	leaq	16(%rdi),%rdi
2966	decq	%rcx
2967	jg	1b
2968	decq	%r8
2969	jge	2b
2970	paddd	.chacha20_consts(%rip),%xmm0
2971	paddd	48(%rbp),%xmm4
2972	paddd	64(%rbp),%xmm8
2973	paddd	96(%rbp),%xmm12
2974
2975	jmp	seal_sse_128_seal
29763:
2977	cmpq	$128,%rbx
2978	jg	3f
2979
2980seal_sse_tail_128:
2981	movdqa	.chacha20_consts(%rip),%xmm0
2982	movdqa	48(%rbp),%xmm4
2983	movdqa	64(%rbp),%xmm8
2984	movdqa	%xmm0,%xmm1
2985	movdqa	%xmm4,%xmm5
2986	movdqa	%xmm8,%xmm9
2987	movdqa	96(%rbp),%xmm13
2988	paddd	.sse_inc(%rip),%xmm13
2989	movdqa	%xmm13,%xmm12
2990	paddd	.sse_inc(%rip),%xmm12
2991	movdqa	%xmm12,96(%rbp)
2992	movdqa	%xmm13,112(%rbp)
2993
29941:
2995	addq	0(%rdi),%r10
2996	adcq	8+0(%rdi),%r11
2997	adcq	$1,%r12
2998	movq	0+0(%rbp),%rax
2999	movq	%rax,%r15
3000	mulq	%r10
3001	movq	%rax,%r13
3002	movq	%rdx,%r14
3003	movq	0+0(%rbp),%rax
3004	mulq	%r11
3005	imulq	%r12,%r15
3006	addq	%rax,%r14
3007	adcq	%rdx,%r15
3008	movq	8+0(%rbp),%rax
3009	movq	%rax,%r9
3010	mulq	%r10
3011	addq	%rax,%r14
3012	adcq	$0,%rdx
3013	movq	%rdx,%r10
3014	movq	8+0(%rbp),%rax
3015	mulq	%r11
3016	addq	%rax,%r15
3017	adcq	$0,%rdx
3018	imulq	%r12,%r9
3019	addq	%r10,%r15
3020	adcq	%rdx,%r9
3021	movq	%r13,%r10
3022	movq	%r14,%r11
3023	movq	%r15,%r12
3024	andq	$3,%r12
3025	movq	%r15,%r13
3026	andq	$-4,%r13
3027	movq	%r9,%r14
3028	shrdq	$2,%r9,%r15
3029	shrq	$2,%r9
3030	addq	%r13,%r10
3031	adcq	%r14,%r11
3032	adcq	$0,%r12
3033	addq	%r15,%r10
3034	adcq	%r9,%r11
3035	adcq	$0,%r12
3036
3037	leaq	16(%rdi),%rdi
30382:
3039	paddd	%xmm4,%xmm0
3040	pxor	%xmm0,%xmm12
3041	pshufb	.rol16(%rip),%xmm12
3042	paddd	%xmm12,%xmm8
3043	pxor	%xmm8,%xmm4
3044	movdqa	%xmm4,%xmm3
3045	pslld	$12,%xmm3
3046	psrld	$20,%xmm4
3047	pxor	%xmm3,%xmm4
3048	paddd	%xmm4,%xmm0
3049	pxor	%xmm0,%xmm12
3050	pshufb	.rol8(%rip),%xmm12
3051	paddd	%xmm12,%xmm8
3052	pxor	%xmm8,%xmm4
3053	movdqa	%xmm4,%xmm3
3054	pslld	$7,%xmm3
3055	psrld	$25,%xmm4
3056	pxor	%xmm3,%xmm4
3057.byte	102,15,58,15,228,4
3058.byte	102,69,15,58,15,192,8
3059.byte	102,69,15,58,15,228,12
3060	paddd	%xmm5,%xmm1
3061	pxor	%xmm1,%xmm13
3062	pshufb	.rol16(%rip),%xmm13
3063	paddd	%xmm13,%xmm9
3064	pxor	%xmm9,%xmm5
3065	movdqa	%xmm5,%xmm3
3066	pslld	$12,%xmm3
3067	psrld	$20,%xmm5
3068	pxor	%xmm3,%xmm5
3069	paddd	%xmm5,%xmm1
3070	pxor	%xmm1,%xmm13
3071	pshufb	.rol8(%rip),%xmm13
3072	paddd	%xmm13,%xmm9
3073	pxor	%xmm9,%xmm5
3074	movdqa	%xmm5,%xmm3
3075	pslld	$7,%xmm3
3076	psrld	$25,%xmm5
3077	pxor	%xmm3,%xmm5
3078.byte	102,15,58,15,237,4
3079.byte	102,69,15,58,15,201,8
3080.byte	102,69,15,58,15,237,12
3081	addq	0(%rdi),%r10
3082	adcq	8+0(%rdi),%r11
3083	adcq	$1,%r12
3084	movq	0+0(%rbp),%rax
3085	movq	%rax,%r15
3086	mulq	%r10
3087	movq	%rax,%r13
3088	movq	%rdx,%r14
3089	movq	0+0(%rbp),%rax
3090	mulq	%r11
3091	imulq	%r12,%r15
3092	addq	%rax,%r14
3093	adcq	%rdx,%r15
3094	movq	8+0(%rbp),%rax
3095	movq	%rax,%r9
3096	mulq	%r10
3097	addq	%rax,%r14
3098	adcq	$0,%rdx
3099	movq	%rdx,%r10
3100	movq	8+0(%rbp),%rax
3101	mulq	%r11
3102	addq	%rax,%r15
3103	adcq	$0,%rdx
3104	imulq	%r12,%r9
3105	addq	%r10,%r15
3106	adcq	%rdx,%r9
3107	movq	%r13,%r10
3108	movq	%r14,%r11
3109	movq	%r15,%r12
3110	andq	$3,%r12
3111	movq	%r15,%r13
3112	andq	$-4,%r13
3113	movq	%r9,%r14
3114	shrdq	$2,%r9,%r15
3115	shrq	$2,%r9
3116	addq	%r13,%r10
3117	adcq	%r14,%r11
3118	adcq	$0,%r12
3119	addq	%r15,%r10
3120	adcq	%r9,%r11
3121	adcq	$0,%r12
3122	paddd	%xmm4,%xmm0
3123	pxor	%xmm0,%xmm12
3124	pshufb	.rol16(%rip),%xmm12
3125	paddd	%xmm12,%xmm8
3126	pxor	%xmm8,%xmm4
3127	movdqa	%xmm4,%xmm3
3128	pslld	$12,%xmm3
3129	psrld	$20,%xmm4
3130	pxor	%xmm3,%xmm4
3131	paddd	%xmm4,%xmm0
3132	pxor	%xmm0,%xmm12
3133	pshufb	.rol8(%rip),%xmm12
3134	paddd	%xmm12,%xmm8
3135	pxor	%xmm8,%xmm4
3136	movdqa	%xmm4,%xmm3
3137	pslld	$7,%xmm3
3138	psrld	$25,%xmm4
3139	pxor	%xmm3,%xmm4
3140.byte	102,15,58,15,228,12
3141.byte	102,69,15,58,15,192,8
3142.byte	102,69,15,58,15,228,4
3143	paddd	%xmm5,%xmm1
3144	pxor	%xmm1,%xmm13
3145	pshufb	.rol16(%rip),%xmm13
3146	paddd	%xmm13,%xmm9
3147	pxor	%xmm9,%xmm5
3148	movdqa	%xmm5,%xmm3
3149	pslld	$12,%xmm3
3150	psrld	$20,%xmm5
3151	pxor	%xmm3,%xmm5
3152	paddd	%xmm5,%xmm1
3153	pxor	%xmm1,%xmm13
3154	pshufb	.rol8(%rip),%xmm13
3155	paddd	%xmm13,%xmm9
3156	pxor	%xmm9,%xmm5
3157	movdqa	%xmm5,%xmm3
3158	pslld	$7,%xmm3
3159	psrld	$25,%xmm5
3160	pxor	%xmm3,%xmm5
3161.byte	102,15,58,15,237,12
3162.byte	102,69,15,58,15,201,8
3163.byte	102,69,15,58,15,237,4
3164
3165	leaq	16(%rdi),%rdi
3166	decq	%rcx
3167	jg	1b
3168	decq	%r8
3169	jge	2b
3170	paddd	.chacha20_consts(%rip),%xmm1
3171	paddd	48(%rbp),%xmm5
3172	paddd	64(%rbp),%xmm9
3173	paddd	112(%rbp),%xmm13
3174	paddd	.chacha20_consts(%rip),%xmm0
3175	paddd	48(%rbp),%xmm4
3176	paddd	64(%rbp),%xmm8
3177	paddd	96(%rbp),%xmm12
3178	movdqu	0 + 0(%rsi),%xmm3
3179	movdqu	16 + 0(%rsi),%xmm7
3180	movdqu	32 + 0(%rsi),%xmm11
3181	movdqu	48 + 0(%rsi),%xmm15
3182	pxor	%xmm3,%xmm1
3183	pxor	%xmm7,%xmm5
3184	pxor	%xmm11,%xmm9
3185	pxor	%xmm13,%xmm15
3186	movdqu	%xmm1,0 + 0(%rdi)
3187	movdqu	%xmm5,16 + 0(%rdi)
3188	movdqu	%xmm9,32 + 0(%rdi)
3189	movdqu	%xmm15,48 + 0(%rdi)
3190
3191	movq	$64,%rcx
3192	subq	$64,%rbx
3193	leaq	64(%rsi),%rsi
3194	jmp	seal_sse_128_seal_hash
31953:
3196
3197seal_sse_tail_192:
3198	movdqa	.chacha20_consts(%rip),%xmm0
3199	movdqa	48(%rbp),%xmm4
3200	movdqa	64(%rbp),%xmm8
3201	movdqa	%xmm0,%xmm1
3202	movdqa	%xmm4,%xmm5
3203	movdqa	%xmm8,%xmm9
3204	movdqa	%xmm0,%xmm2
3205	movdqa	%xmm4,%xmm6
3206	movdqa	%xmm8,%xmm10
3207	movdqa	96(%rbp),%xmm14
3208	paddd	.sse_inc(%rip),%xmm14
3209	movdqa	%xmm14,%xmm13
3210	paddd	.sse_inc(%rip),%xmm13
3211	movdqa	%xmm13,%xmm12
3212	paddd	.sse_inc(%rip),%xmm12
3213	movdqa	%xmm12,96(%rbp)
3214	movdqa	%xmm13,112(%rbp)
3215	movdqa	%xmm14,128(%rbp)
3216
32171:
3218	addq	0(%rdi),%r10
3219	adcq	8+0(%rdi),%r11
3220	adcq	$1,%r12
3221	movq	0+0(%rbp),%rax
3222	movq	%rax,%r15
3223	mulq	%r10
3224	movq	%rax,%r13
3225	movq	%rdx,%r14
3226	movq	0+0(%rbp),%rax
3227	mulq	%r11
3228	imulq	%r12,%r15
3229	addq	%rax,%r14
3230	adcq	%rdx,%r15
3231	movq	8+0(%rbp),%rax
3232	movq	%rax,%r9
3233	mulq	%r10
3234	addq	%rax,%r14
3235	adcq	$0,%rdx
3236	movq	%rdx,%r10
3237	movq	8+0(%rbp),%rax
3238	mulq	%r11
3239	addq	%rax,%r15
3240	adcq	$0,%rdx
3241	imulq	%r12,%r9
3242	addq	%r10,%r15
3243	adcq	%rdx,%r9
3244	movq	%r13,%r10
3245	movq	%r14,%r11
3246	movq	%r15,%r12
3247	andq	$3,%r12
3248	movq	%r15,%r13
3249	andq	$-4,%r13
3250	movq	%r9,%r14
3251	shrdq	$2,%r9,%r15
3252	shrq	$2,%r9
3253	addq	%r13,%r10
3254	adcq	%r14,%r11
3255	adcq	$0,%r12
3256	addq	%r15,%r10
3257	adcq	%r9,%r11
3258	adcq	$0,%r12
3259
3260	leaq	16(%rdi),%rdi
32612:
3262	paddd	%xmm4,%xmm0
3263	pxor	%xmm0,%xmm12
3264	pshufb	.rol16(%rip),%xmm12
3265	paddd	%xmm12,%xmm8
3266	pxor	%xmm8,%xmm4
3267	movdqa	%xmm4,%xmm3
3268	pslld	$12,%xmm3
3269	psrld	$20,%xmm4
3270	pxor	%xmm3,%xmm4
3271	paddd	%xmm4,%xmm0
3272	pxor	%xmm0,%xmm12
3273	pshufb	.rol8(%rip),%xmm12
3274	paddd	%xmm12,%xmm8
3275	pxor	%xmm8,%xmm4
3276	movdqa	%xmm4,%xmm3
3277	pslld	$7,%xmm3
3278	psrld	$25,%xmm4
3279	pxor	%xmm3,%xmm4
3280.byte	102,15,58,15,228,4
3281.byte	102,69,15,58,15,192,8
3282.byte	102,69,15,58,15,228,12
3283	paddd	%xmm5,%xmm1
3284	pxor	%xmm1,%xmm13
3285	pshufb	.rol16(%rip),%xmm13
3286	paddd	%xmm13,%xmm9
3287	pxor	%xmm9,%xmm5
3288	movdqa	%xmm5,%xmm3
3289	pslld	$12,%xmm3
3290	psrld	$20,%xmm5
3291	pxor	%xmm3,%xmm5
3292	paddd	%xmm5,%xmm1
3293	pxor	%xmm1,%xmm13
3294	pshufb	.rol8(%rip),%xmm13
3295	paddd	%xmm13,%xmm9
3296	pxor	%xmm9,%xmm5
3297	movdqa	%xmm5,%xmm3
3298	pslld	$7,%xmm3
3299	psrld	$25,%xmm5
3300	pxor	%xmm3,%xmm5
3301.byte	102,15,58,15,237,4
3302.byte	102,69,15,58,15,201,8
3303.byte	102,69,15,58,15,237,12
3304	paddd	%xmm6,%xmm2
3305	pxor	%xmm2,%xmm14
3306	pshufb	.rol16(%rip),%xmm14
3307	paddd	%xmm14,%xmm10
3308	pxor	%xmm10,%xmm6
3309	movdqa	%xmm6,%xmm3
3310	pslld	$12,%xmm3
3311	psrld	$20,%xmm6
3312	pxor	%xmm3,%xmm6
3313	paddd	%xmm6,%xmm2
3314	pxor	%xmm2,%xmm14
3315	pshufb	.rol8(%rip),%xmm14
3316	paddd	%xmm14,%xmm10
3317	pxor	%xmm10,%xmm6
3318	movdqa	%xmm6,%xmm3
3319	pslld	$7,%xmm3
3320	psrld	$25,%xmm6
3321	pxor	%xmm3,%xmm6
3322.byte	102,15,58,15,246,4
3323.byte	102,69,15,58,15,210,8
3324.byte	102,69,15,58,15,246,12
3325	addq	0(%rdi),%r10
3326	adcq	8+0(%rdi),%r11
3327	adcq	$1,%r12
3328	movq	0+0(%rbp),%rax
3329	movq	%rax,%r15
3330	mulq	%r10
3331	movq	%rax,%r13
3332	movq	%rdx,%r14
3333	movq	0+0(%rbp),%rax
3334	mulq	%r11
3335	imulq	%r12,%r15
3336	addq	%rax,%r14
3337	adcq	%rdx,%r15
3338	movq	8+0(%rbp),%rax
3339	movq	%rax,%r9
3340	mulq	%r10
3341	addq	%rax,%r14
3342	adcq	$0,%rdx
3343	movq	%rdx,%r10
3344	movq	8+0(%rbp),%rax
3345	mulq	%r11
3346	addq	%rax,%r15
3347	adcq	$0,%rdx
3348	imulq	%r12,%r9
3349	addq	%r10,%r15
3350	adcq	%rdx,%r9
3351	movq	%r13,%r10
3352	movq	%r14,%r11
3353	movq	%r15,%r12
3354	andq	$3,%r12
3355	movq	%r15,%r13
3356	andq	$-4,%r13
3357	movq	%r9,%r14
3358	shrdq	$2,%r9,%r15
3359	shrq	$2,%r9
3360	addq	%r13,%r10
3361	adcq	%r14,%r11
3362	adcq	$0,%r12
3363	addq	%r15,%r10
3364	adcq	%r9,%r11
3365	adcq	$0,%r12
3366	paddd	%xmm4,%xmm0
3367	pxor	%xmm0,%xmm12
3368	pshufb	.rol16(%rip),%xmm12
3369	paddd	%xmm12,%xmm8
3370	pxor	%xmm8,%xmm4
3371	movdqa	%xmm4,%xmm3
3372	pslld	$12,%xmm3
3373	psrld	$20,%xmm4
3374	pxor	%xmm3,%xmm4
3375	paddd	%xmm4,%xmm0
3376	pxor	%xmm0,%xmm12
3377	pshufb	.rol8(%rip),%xmm12
3378	paddd	%xmm12,%xmm8
3379	pxor	%xmm8,%xmm4
3380	movdqa	%xmm4,%xmm3
3381	pslld	$7,%xmm3
3382	psrld	$25,%xmm4
3383	pxor	%xmm3,%xmm4
3384.byte	102,15,58,15,228,12
3385.byte	102,69,15,58,15,192,8
3386.byte	102,69,15,58,15,228,4
3387	paddd	%xmm5,%xmm1
3388	pxor	%xmm1,%xmm13
3389	pshufb	.rol16(%rip),%xmm13
3390	paddd	%xmm13,%xmm9
3391	pxor	%xmm9,%xmm5
3392	movdqa	%xmm5,%xmm3
3393	pslld	$12,%xmm3
3394	psrld	$20,%xmm5
3395	pxor	%xmm3,%xmm5
3396	paddd	%xmm5,%xmm1
3397	pxor	%xmm1,%xmm13
3398	pshufb	.rol8(%rip),%xmm13
3399	paddd	%xmm13,%xmm9
3400	pxor	%xmm9,%xmm5
3401	movdqa	%xmm5,%xmm3
3402	pslld	$7,%xmm3
3403	psrld	$25,%xmm5
3404	pxor	%xmm3,%xmm5
3405.byte	102,15,58,15,237,12
3406.byte	102,69,15,58,15,201,8
3407.byte	102,69,15,58,15,237,4
3408	paddd	%xmm6,%xmm2
3409	pxor	%xmm2,%xmm14
3410	pshufb	.rol16(%rip),%xmm14
3411	paddd	%xmm14,%xmm10
3412	pxor	%xmm10,%xmm6
3413	movdqa	%xmm6,%xmm3
3414	pslld	$12,%xmm3
3415	psrld	$20,%xmm6
3416	pxor	%xmm3,%xmm6
3417	paddd	%xmm6,%xmm2
3418	pxor	%xmm2,%xmm14
3419	pshufb	.rol8(%rip),%xmm14
3420	paddd	%xmm14,%xmm10
3421	pxor	%xmm10,%xmm6
3422	movdqa	%xmm6,%xmm3
3423	pslld	$7,%xmm3
3424	psrld	$25,%xmm6
3425	pxor	%xmm3,%xmm6
3426.byte	102,15,58,15,246,12
3427.byte	102,69,15,58,15,210,8
3428.byte	102,69,15,58,15,246,4
3429
3430	leaq	16(%rdi),%rdi
3431	decq	%rcx
3432	jg	1b
3433	decq	%r8
3434	jge	2b
3435	paddd	.chacha20_consts(%rip),%xmm2
3436	paddd	48(%rbp),%xmm6
3437	paddd	64(%rbp),%xmm10
3438	paddd	128(%rbp),%xmm14
3439	paddd	.chacha20_consts(%rip),%xmm1
3440	paddd	48(%rbp),%xmm5
3441	paddd	64(%rbp),%xmm9
3442	paddd	112(%rbp),%xmm13
3443	paddd	.chacha20_consts(%rip),%xmm0
3444	paddd	48(%rbp),%xmm4
3445	paddd	64(%rbp),%xmm8
3446	paddd	96(%rbp),%xmm12
3447	movdqu	0 + 0(%rsi),%xmm3
3448	movdqu	16 + 0(%rsi),%xmm7
3449	movdqu	32 + 0(%rsi),%xmm11
3450	movdqu	48 + 0(%rsi),%xmm15
3451	pxor	%xmm3,%xmm2
3452	pxor	%xmm7,%xmm6
3453	pxor	%xmm11,%xmm10
3454	pxor	%xmm14,%xmm15
3455	movdqu	%xmm2,0 + 0(%rdi)
3456	movdqu	%xmm6,16 + 0(%rdi)
3457	movdqu	%xmm10,32 + 0(%rdi)
3458	movdqu	%xmm15,48 + 0(%rdi)
3459	movdqu	0 + 64(%rsi),%xmm3
3460	movdqu	16 + 64(%rsi),%xmm7
3461	movdqu	32 + 64(%rsi),%xmm11
3462	movdqu	48 + 64(%rsi),%xmm15
3463	pxor	%xmm3,%xmm1
3464	pxor	%xmm7,%xmm5
3465	pxor	%xmm11,%xmm9
3466	pxor	%xmm13,%xmm15
3467	movdqu	%xmm1,0 + 64(%rdi)
3468	movdqu	%xmm5,16 + 64(%rdi)
3469	movdqu	%xmm9,32 + 64(%rdi)
3470	movdqu	%xmm15,48 + 64(%rdi)
3471
3472	movq	$128,%rcx
3473	subq	$128,%rbx
3474	leaq	128(%rsi),%rsi
3475
3476seal_sse_128_seal_hash:
3477	cmpq	$16,%rcx
3478	jb	seal_sse_128_seal
3479	addq	0(%rdi),%r10
3480	adcq	8+0(%rdi),%r11
3481	adcq	$1,%r12
3482	movq	0+0(%rbp),%rax
3483	movq	%rax,%r15
3484	mulq	%r10
3485	movq	%rax,%r13
3486	movq	%rdx,%r14
3487	movq	0+0(%rbp),%rax
3488	mulq	%r11
3489	imulq	%r12,%r15
3490	addq	%rax,%r14
3491	adcq	%rdx,%r15
3492	movq	8+0(%rbp),%rax
3493	movq	%rax,%r9
3494	mulq	%r10
3495	addq	%rax,%r14
3496	adcq	$0,%rdx
3497	movq	%rdx,%r10
3498	movq	8+0(%rbp),%rax
3499	mulq	%r11
3500	addq	%rax,%r15
3501	adcq	$0,%rdx
3502	imulq	%r12,%r9
3503	addq	%r10,%r15
3504	adcq	%rdx,%r9
3505	movq	%r13,%r10
3506	movq	%r14,%r11
3507	movq	%r15,%r12
3508	andq	$3,%r12
3509	movq	%r15,%r13
3510	andq	$-4,%r13
3511	movq	%r9,%r14
3512	shrdq	$2,%r9,%r15
3513	shrq	$2,%r9
3514	addq	%r13,%r10
3515	adcq	%r14,%r11
3516	adcq	$0,%r12
3517	addq	%r15,%r10
3518	adcq	%r9,%r11
3519	adcq	$0,%r12
3520
3521	subq	$16,%rcx
3522	leaq	16(%rdi),%rdi
3523	jmp	seal_sse_128_seal_hash
3524
3525seal_sse_128_seal:
3526	cmpq	$16,%rbx
3527	jb	seal_sse_tail_16
3528	subq	$16,%rbx
3529
3530	movdqu	0(%rsi),%xmm3
3531	pxor	%xmm3,%xmm0
3532	movdqu	%xmm0,0(%rdi)
3533
3534	addq	0(%rdi),%r10
3535	adcq	8(%rdi),%r11
3536	adcq	$1,%r12
3537	leaq	16(%rsi),%rsi
3538	leaq	16(%rdi),%rdi
3539	movq	0+0(%rbp),%rax
3540	movq	%rax,%r15
3541	mulq	%r10
3542	movq	%rax,%r13
3543	movq	%rdx,%r14
3544	movq	0+0(%rbp),%rax
3545	mulq	%r11
3546	imulq	%r12,%r15
3547	addq	%rax,%r14
3548	adcq	%rdx,%r15
3549	movq	8+0(%rbp),%rax
3550	movq	%rax,%r9
3551	mulq	%r10
3552	addq	%rax,%r14
3553	adcq	$0,%rdx
3554	movq	%rdx,%r10
3555	movq	8+0(%rbp),%rax
3556	mulq	%r11
3557	addq	%rax,%r15
3558	adcq	$0,%rdx
3559	imulq	%r12,%r9
3560	addq	%r10,%r15
3561	adcq	%rdx,%r9
3562	movq	%r13,%r10
3563	movq	%r14,%r11
3564	movq	%r15,%r12
3565	andq	$3,%r12
3566	movq	%r15,%r13
3567	andq	$-4,%r13
3568	movq	%r9,%r14
3569	shrdq	$2,%r9,%r15
3570	shrq	$2,%r9
3571	addq	%r13,%r10
3572	adcq	%r14,%r11
3573	adcq	$0,%r12
3574	addq	%r15,%r10
3575	adcq	%r9,%r11
3576	adcq	$0,%r12
3577
3578
3579	movdqa	%xmm4,%xmm0
3580	movdqa	%xmm8,%xmm4
3581	movdqa	%xmm12,%xmm8
3582	movdqa	%xmm1,%xmm12
3583	movdqa	%xmm5,%xmm1
3584	movdqa	%xmm9,%xmm5
3585	movdqa	%xmm13,%xmm9
3586	jmp	seal_sse_128_seal
3587
3588seal_sse_tail_16:
3589	testq	%rbx,%rbx
3590	jz	seal_sse_finalize
3591
3592	movq	%rbx,%r8
3593	shlq	$4,%r8
3594	leaq	.and_masks(%rip),%r13
3595	movq	%rbx,%rcx
3596	leaq	-1(%rsi,%rbx), %rsi
3597	pxor	%xmm15,%xmm15
35981:
3599	pslldq	$1,%xmm15
3600	pinsrb	$0,(%rsi),%xmm15
3601	leaq	-1(%rsi),%rsi
3602	decq	%rcx
3603	jne	1b
3604
3605
3606	pxor	%xmm0,%xmm15
3607
3608
3609	movq	%rbx,%rcx
3610	movdqu	%xmm15,%xmm0
36112:
3612	pextrb	$0,%xmm0,(%rdi)
3613	psrldq	$1,%xmm0
3614	addq	$1,%rdi
3615	subq	$1,%rcx
3616	jnz	2b
3617
3618	pand	-16(%r13,%r8), %xmm15
3619.byte	102,77,15,126,253
3620	pextrq	$1,%xmm15,%r14
3621	addq	%r13,%r10
3622	adcq	%r14,%r11
3623	adcq	$1,%r12
3624	movq	0+0(%rbp),%rax
3625	movq	%rax,%r15
3626	mulq	%r10
3627	movq	%rax,%r13
3628	movq	%rdx,%r14
3629	movq	0+0(%rbp),%rax
3630	mulq	%r11
3631	imulq	%r12,%r15
3632	addq	%rax,%r14
3633	adcq	%rdx,%r15
3634	movq	8+0(%rbp),%rax
3635	movq	%rax,%r9
3636	mulq	%r10
3637	addq	%rax,%r14
3638	adcq	$0,%rdx
3639	movq	%rdx,%r10
3640	movq	8+0(%rbp),%rax
3641	mulq	%r11
3642	addq	%rax,%r15
3643	adcq	$0,%rdx
3644	imulq	%r12,%r9
3645	addq	%r10,%r15
3646	adcq	%rdx,%r9
3647	movq	%r13,%r10
3648	movq	%r14,%r11
3649	movq	%r15,%r12
3650	andq	$3,%r12
3651	movq	%r15,%r13
3652	andq	$-4,%r13
3653	movq	%r9,%r14
3654	shrdq	$2,%r9,%r15
3655	shrq	$2,%r9
3656	addq	%r13,%r10
3657	adcq	%r14,%r11
3658	adcq	$0,%r12
3659	addq	%r15,%r10
3660	adcq	%r9,%r11
3661	adcq	$0,%r12
3662
3663seal_sse_finalize:
3664	addq	32(%rbp),%r10
3665	adcq	8+32(%rbp),%r11
3666	adcq	$1,%r12
3667	movq	0+0(%rbp),%rax
3668	movq	%rax,%r15
3669	mulq	%r10
3670	movq	%rax,%r13
3671	movq	%rdx,%r14
3672	movq	0+0(%rbp),%rax
3673	mulq	%r11
3674	imulq	%r12,%r15
3675	addq	%rax,%r14
3676	adcq	%rdx,%r15
3677	movq	8+0(%rbp),%rax
3678	movq	%rax,%r9
3679	mulq	%r10
3680	addq	%rax,%r14
3681	adcq	$0,%rdx
3682	movq	%rdx,%r10
3683	movq	8+0(%rbp),%rax
3684	mulq	%r11
3685	addq	%rax,%r15
3686	adcq	$0,%rdx
3687	imulq	%r12,%r9
3688	addq	%r10,%r15
3689	adcq	%rdx,%r9
3690	movq	%r13,%r10
3691	movq	%r14,%r11
3692	movq	%r15,%r12
3693	andq	$3,%r12
3694	movq	%r15,%r13
3695	andq	$-4,%r13
3696	movq	%r9,%r14
3697	shrdq	$2,%r9,%r15
3698	shrq	$2,%r9
3699	addq	%r13,%r10
3700	adcq	%r14,%r11
3701	adcq	$0,%r12
3702	addq	%r15,%r10
3703	adcq	%r9,%r11
3704	adcq	$0,%r12
3705
3706
3707	movq	%r10,%r13
3708	movq	%r11,%r14
3709	movq	%r12,%r15
3710	subq	$-5,%r10
3711	sbbq	$-1,%r11
3712	sbbq	$3,%r12
3713	cmovcq	%r13,%r10
3714	cmovcq	%r14,%r11
3715	cmovcq	%r15,%r12
3716
3717	addq	0+16(%rbp),%r10
3718	adcq	8+16(%rbp),%r11
3719
3720	addq	$288 + 32,%rsp
3721.cfi_adjust_cfa_offset	-(288 + 32)
3722	popq	%r9
3723.cfi_adjust_cfa_offset	-8
3724	movq	%r10,0(%r9)
3725	movq	%r11,8(%r9)
3726
3727	popq	%r15
3728.cfi_adjust_cfa_offset	-8
3729	popq	%r14
3730.cfi_adjust_cfa_offset	-8
3731	popq	%r13
3732.cfi_adjust_cfa_offset	-8
3733	popq	%r12
3734.cfi_adjust_cfa_offset	-8
3735	popq	%rbx
3736.cfi_adjust_cfa_offset	-8
3737	popq	%rbp
3738.cfi_adjust_cfa_offset	-8
3739	.byte	0xf3,0xc3
3740.cfi_adjust_cfa_offset	(8 * 6) + 288 + 32
3741
3742seal_sse_128:
3743	movdqu	.chacha20_consts(%rip),%xmm0
3744	movdqa	%xmm0,%xmm1
3745	movdqa	%xmm0,%xmm2
3746	movdqu	0(%r9),%xmm4
3747	movdqa	%xmm4,%xmm5
3748	movdqa	%xmm4,%xmm6
3749	movdqu	16(%r9),%xmm8
3750	movdqa	%xmm8,%xmm9
3751	movdqa	%xmm8,%xmm10
3752	movdqu	32(%r9),%xmm14
3753	movdqa	%xmm14,%xmm12
3754	paddd	.sse_inc(%rip),%xmm12
3755	movdqa	%xmm12,%xmm13
3756	paddd	.sse_inc(%rip),%xmm13
3757	movdqa	%xmm4,%xmm7
3758	movdqa	%xmm8,%xmm11
3759	movdqa	%xmm12,%xmm15
3760	movq	$10,%r10
37611:
3762	paddd	%xmm4,%xmm0
3763	pxor	%xmm0,%xmm12
3764	pshufb	.rol16(%rip),%xmm12
3765	paddd	%xmm12,%xmm8
3766	pxor	%xmm8,%xmm4
3767	movdqa	%xmm4,%xmm3
3768	pslld	$12,%xmm3
3769	psrld	$20,%xmm4
3770	pxor	%xmm3,%xmm4
3771	paddd	%xmm4,%xmm0
3772	pxor	%xmm0,%xmm12
3773	pshufb	.rol8(%rip),%xmm12
3774	paddd	%xmm12,%xmm8
3775	pxor	%xmm8,%xmm4
3776	movdqa	%xmm4,%xmm3
3777	pslld	$7,%xmm3
3778	psrld	$25,%xmm4
3779	pxor	%xmm3,%xmm4
3780.byte	102,15,58,15,228,4
3781.byte	102,69,15,58,15,192,8
3782.byte	102,69,15,58,15,228,12
3783	paddd	%xmm5,%xmm1
3784	pxor	%xmm1,%xmm13
3785	pshufb	.rol16(%rip),%xmm13
3786	paddd	%xmm13,%xmm9
3787	pxor	%xmm9,%xmm5
3788	movdqa	%xmm5,%xmm3
3789	pslld	$12,%xmm3
3790	psrld	$20,%xmm5
3791	pxor	%xmm3,%xmm5
3792	paddd	%xmm5,%xmm1
3793	pxor	%xmm1,%xmm13
3794	pshufb	.rol8(%rip),%xmm13
3795	paddd	%xmm13,%xmm9
3796	pxor	%xmm9,%xmm5
3797	movdqa	%xmm5,%xmm3
3798	pslld	$7,%xmm3
3799	psrld	$25,%xmm5
3800	pxor	%xmm3,%xmm5
3801.byte	102,15,58,15,237,4
3802.byte	102,69,15,58,15,201,8
3803.byte	102,69,15,58,15,237,12
3804	paddd	%xmm6,%xmm2
3805	pxor	%xmm2,%xmm14
3806	pshufb	.rol16(%rip),%xmm14
3807	paddd	%xmm14,%xmm10
3808	pxor	%xmm10,%xmm6
3809	movdqa	%xmm6,%xmm3
3810	pslld	$12,%xmm3
3811	psrld	$20,%xmm6
3812	pxor	%xmm3,%xmm6
3813	paddd	%xmm6,%xmm2
3814	pxor	%xmm2,%xmm14
3815	pshufb	.rol8(%rip),%xmm14
3816	paddd	%xmm14,%xmm10
3817	pxor	%xmm10,%xmm6
3818	movdqa	%xmm6,%xmm3
3819	pslld	$7,%xmm3
3820	psrld	$25,%xmm6
3821	pxor	%xmm3,%xmm6
3822.byte	102,15,58,15,246,4
3823.byte	102,69,15,58,15,210,8
3824.byte	102,69,15,58,15,246,12
3825	paddd	%xmm4,%xmm0
3826	pxor	%xmm0,%xmm12
3827	pshufb	.rol16(%rip),%xmm12
3828	paddd	%xmm12,%xmm8
3829	pxor	%xmm8,%xmm4
3830	movdqa	%xmm4,%xmm3
3831	pslld	$12,%xmm3
3832	psrld	$20,%xmm4
3833	pxor	%xmm3,%xmm4
3834	paddd	%xmm4,%xmm0
3835	pxor	%xmm0,%xmm12
3836	pshufb	.rol8(%rip),%xmm12
3837	paddd	%xmm12,%xmm8
3838	pxor	%xmm8,%xmm4
3839	movdqa	%xmm4,%xmm3
3840	pslld	$7,%xmm3
3841	psrld	$25,%xmm4
3842	pxor	%xmm3,%xmm4
3843.byte	102,15,58,15,228,12
3844.byte	102,69,15,58,15,192,8
3845.byte	102,69,15,58,15,228,4
3846	paddd	%xmm5,%xmm1
3847	pxor	%xmm1,%xmm13
3848	pshufb	.rol16(%rip),%xmm13
3849	paddd	%xmm13,%xmm9
3850	pxor	%xmm9,%xmm5
3851	movdqa	%xmm5,%xmm3
3852	pslld	$12,%xmm3
3853	psrld	$20,%xmm5
3854	pxor	%xmm3,%xmm5
3855	paddd	%xmm5,%xmm1
3856	pxor	%xmm1,%xmm13
3857	pshufb	.rol8(%rip),%xmm13
3858	paddd	%xmm13,%xmm9
3859	pxor	%xmm9,%xmm5
3860	movdqa	%xmm5,%xmm3
3861	pslld	$7,%xmm3
3862	psrld	$25,%xmm5
3863	pxor	%xmm3,%xmm5
3864.byte	102,15,58,15,237,12
3865.byte	102,69,15,58,15,201,8
3866.byte	102,69,15,58,15,237,4
3867	paddd	%xmm6,%xmm2
3868	pxor	%xmm2,%xmm14
3869	pshufb	.rol16(%rip),%xmm14
3870	paddd	%xmm14,%xmm10
3871	pxor	%xmm10,%xmm6
3872	movdqa	%xmm6,%xmm3
3873	pslld	$12,%xmm3
3874	psrld	$20,%xmm6
3875	pxor	%xmm3,%xmm6
3876	paddd	%xmm6,%xmm2
3877	pxor	%xmm2,%xmm14
3878	pshufb	.rol8(%rip),%xmm14
3879	paddd	%xmm14,%xmm10
3880	pxor	%xmm10,%xmm6
3881	movdqa	%xmm6,%xmm3
3882	pslld	$7,%xmm3
3883	psrld	$25,%xmm6
3884	pxor	%xmm3,%xmm6
3885.byte	102,15,58,15,246,12
3886.byte	102,69,15,58,15,210,8
3887.byte	102,69,15,58,15,246,4
3888
3889	decq	%r10
3890	jnz	1b
3891	paddd	.chacha20_consts(%rip),%xmm0
3892	paddd	.chacha20_consts(%rip),%xmm1
3893	paddd	.chacha20_consts(%rip),%xmm2
3894	paddd	%xmm7,%xmm4
3895	paddd	%xmm7,%xmm5
3896	paddd	%xmm7,%xmm6
3897	paddd	%xmm11,%xmm8
3898	paddd	%xmm11,%xmm9
3899	paddd	%xmm15,%xmm12
3900	paddd	.sse_inc(%rip),%xmm15
3901	paddd	%xmm15,%xmm13
3902
3903	pand	.clamp(%rip),%xmm2
3904	movdqa	%xmm2,0(%rbp)
3905	movdqa	%xmm6,16(%rbp)
3906
3907	movq	%r8,%r8
3908	call	poly_hash_ad_internal
3909	jmp	seal_sse_128_seal
3910.size	chacha20_poly1305_seal, .-chacha20_poly1305_seal
3911
3912
3913.type	chacha20_poly1305_open_avx2,@function
3914.align	64
3915chacha20_poly1305_open_avx2:
3916	vzeroupper
3917	vmovdqa	.chacha20_consts(%rip),%ymm0
3918	vbroadcasti128	0(%r9),%ymm4
3919	vbroadcasti128	16(%r9),%ymm8
3920	vbroadcasti128	32(%r9),%ymm12
3921	vpaddd	.avx2_init(%rip),%ymm12,%ymm12
3922	cmpq	$192,%rbx
3923	jbe	open_avx2_192
3924	cmpq	$320,%rbx
3925	jbe	open_avx2_320
3926
3927	vmovdqa	%ymm4,64(%rbp)
3928	vmovdqa	%ymm8,96(%rbp)
3929	vmovdqa	%ymm12,160(%rbp)
3930	movq	$10,%r10
39311:
3932	vpaddd	%ymm4,%ymm0,%ymm0
3933	vpxor	%ymm0,%ymm12,%ymm12
3934	vpshufb	.rol16(%rip),%ymm12,%ymm12
3935	vpaddd	%ymm12,%ymm8,%ymm8
3936	vpxor	%ymm8,%ymm4,%ymm4
3937	vpsrld	$20,%ymm4,%ymm3
3938	vpslld	$12,%ymm4,%ymm4
3939	vpxor	%ymm3,%ymm4,%ymm4
3940	vpaddd	%ymm4,%ymm0,%ymm0
3941	vpxor	%ymm0,%ymm12,%ymm12
3942	vpshufb	.rol8(%rip),%ymm12,%ymm12
3943	vpaddd	%ymm12,%ymm8,%ymm8
3944	vpxor	%ymm8,%ymm4,%ymm4
3945	vpslld	$7,%ymm4,%ymm3
3946	vpsrld	$25,%ymm4,%ymm4
3947	vpxor	%ymm3,%ymm4,%ymm4
3948	vpalignr	$12,%ymm12,%ymm12,%ymm12
3949	vpalignr	$8,%ymm8,%ymm8,%ymm8
3950	vpalignr	$4,%ymm4,%ymm4,%ymm4
3951	vpaddd	%ymm4,%ymm0,%ymm0
3952	vpxor	%ymm0,%ymm12,%ymm12
3953	vpshufb	.rol16(%rip),%ymm12,%ymm12
3954	vpaddd	%ymm12,%ymm8,%ymm8
3955	vpxor	%ymm8,%ymm4,%ymm4
3956	vpsrld	$20,%ymm4,%ymm3
3957	vpslld	$12,%ymm4,%ymm4
3958	vpxor	%ymm3,%ymm4,%ymm4
3959	vpaddd	%ymm4,%ymm0,%ymm0
3960	vpxor	%ymm0,%ymm12,%ymm12
3961	vpshufb	.rol8(%rip),%ymm12,%ymm12
3962	vpaddd	%ymm12,%ymm8,%ymm8
3963	vpxor	%ymm8,%ymm4,%ymm4
3964	vpslld	$7,%ymm4,%ymm3
3965	vpsrld	$25,%ymm4,%ymm4
3966	vpxor	%ymm3,%ymm4,%ymm4
3967	vpalignr	$4,%ymm12,%ymm12,%ymm12
3968	vpalignr	$8,%ymm8,%ymm8,%ymm8
3969	vpalignr	$12,%ymm4,%ymm4,%ymm4
3970
3971	decq	%r10
3972	jne	1b
3973	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
3974	vpaddd	64(%rbp),%ymm4,%ymm4
3975	vpaddd	96(%rbp),%ymm8,%ymm8
3976	vpaddd	160(%rbp),%ymm12,%ymm12
3977
3978	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
3979
3980	vpand	.clamp(%rip),%ymm3,%ymm3
3981	vmovdqa	%ymm3,0(%rbp)
3982
3983	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
3984	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
3985
3986	movq	%r8,%r8
3987	call	poly_hash_ad_internal
3988	xorq	%rcx,%rcx
3989
39901:
3991	addq	0(%rsi,%rcx), %r10
3992	adcq	8+0(%rsi,%rcx), %r11
3993	adcq	$1,%r12
3994	movq	0+0(%rbp),%rax
3995	movq	%rax,%r15
3996	mulq	%r10
3997	movq	%rax,%r13
3998	movq	%rdx,%r14
3999	movq	0+0(%rbp),%rax
4000	mulq	%r11
4001	imulq	%r12,%r15
4002	addq	%rax,%r14
4003	adcq	%rdx,%r15
4004	movq	8+0(%rbp),%rax
4005	movq	%rax,%r9
4006	mulq	%r10
4007	addq	%rax,%r14
4008	adcq	$0,%rdx
4009	movq	%rdx,%r10
4010	movq	8+0(%rbp),%rax
4011	mulq	%r11
4012	addq	%rax,%r15
4013	adcq	$0,%rdx
4014	imulq	%r12,%r9
4015	addq	%r10,%r15
4016	adcq	%rdx,%r9
4017	movq	%r13,%r10
4018	movq	%r14,%r11
4019	movq	%r15,%r12
4020	andq	$3,%r12
4021	movq	%r15,%r13
4022	andq	$-4,%r13
4023	movq	%r9,%r14
4024	shrdq	$2,%r9,%r15
4025	shrq	$2,%r9
4026	addq	%r13,%r10
4027	adcq	%r14,%r11
4028	adcq	$0,%r12
4029	addq	%r15,%r10
4030	adcq	%r9,%r11
4031	adcq	$0,%r12
4032
4033	addq	$16,%rcx
4034	cmpq	$64,%rcx
4035	jne	1b
4036
4037	vpxor	0(%rsi),%ymm0,%ymm0
4038	vpxor	32(%rsi),%ymm4,%ymm4
4039	vmovdqu	%ymm0,0(%rdi)
4040	vmovdqu	%ymm4,32(%rdi)
4041	leaq	64(%rsi),%rsi
4042	leaq	64(%rdi),%rdi
4043	subq	$64,%rbx
40441:
4045
4046	cmpq	$512,%rbx
4047	jb	3f
4048	vmovdqa	.chacha20_consts(%rip),%ymm0
4049	vmovdqa	64(%rbp),%ymm4
4050	vmovdqa	96(%rbp),%ymm8
4051	vmovdqa	%ymm0,%ymm1
4052	vmovdqa	%ymm4,%ymm5
4053	vmovdqa	%ymm8,%ymm9
4054	vmovdqa	%ymm0,%ymm2
4055	vmovdqa	%ymm4,%ymm6
4056	vmovdqa	%ymm8,%ymm10
4057	vmovdqa	%ymm0,%ymm3
4058	vmovdqa	%ymm4,%ymm7
4059	vmovdqa	%ymm8,%ymm11
4060	vmovdqa	.avx2_inc(%rip),%ymm12
4061	vpaddd	160(%rbp),%ymm12,%ymm15
4062	vpaddd	%ymm15,%ymm12,%ymm14
4063	vpaddd	%ymm14,%ymm12,%ymm13
4064	vpaddd	%ymm13,%ymm12,%ymm12
4065	vmovdqa	%ymm15,256(%rbp)
4066	vmovdqa	%ymm14,224(%rbp)
4067	vmovdqa	%ymm13,192(%rbp)
4068	vmovdqa	%ymm12,160(%rbp)
4069
4070	xorq	%rcx,%rcx
40712:
4072	addq	0*8(%rsi,%rcx), %r10
4073	adcq	8+0*8(%rsi,%rcx), %r11
4074	adcq	$1,%r12
4075	vmovdqa	%ymm8,128(%rbp)
4076	vmovdqa	.rol16(%rip),%ymm8
4077	vpaddd	%ymm7,%ymm3,%ymm3
4078	vpaddd	%ymm6,%ymm2,%ymm2
4079	vpaddd	%ymm5,%ymm1,%ymm1
4080	vpaddd	%ymm4,%ymm0,%ymm0
4081	vpxor	%ymm3,%ymm15,%ymm15
4082	vpxor	%ymm2,%ymm14,%ymm14
4083	vpxor	%ymm1,%ymm13,%ymm13
4084	vpxor	%ymm0,%ymm12,%ymm12
4085	movq	0+0(%rbp),%rdx
4086	movq	%rdx,%r15
4087	mulxq	%r10,%r13,%r14
4088	mulxq	%r11,%rax,%rdx
4089	imulq	%r12,%r15
4090	addq	%rax,%r14
4091	adcq	%rdx,%r15
4092	vpshufb	%ymm8,%ymm15,%ymm15
4093	vpshufb	%ymm8,%ymm14,%ymm14
4094	vpshufb	%ymm8,%ymm13,%ymm13
4095	vpshufb	%ymm8,%ymm12,%ymm12
4096	vmovdqa	128(%rbp),%ymm8
4097	vpaddd	%ymm15,%ymm11,%ymm11
4098	vpaddd	%ymm14,%ymm10,%ymm10
4099	vpaddd	%ymm13,%ymm9,%ymm9
4100	vpaddd	%ymm12,%ymm8,%ymm8
4101	movq	8+0(%rbp),%rdx
4102	mulxq	%r10,%r10,%rax
4103	addq	%r10,%r14
4104	mulxq	%r11,%r11,%r9
4105	adcq	%r11,%r15
4106	adcq	$0,%r9
4107	imulq	%r12,%rdx
4108	vpxor	%ymm11,%ymm7,%ymm7
4109	vpxor	%ymm10,%ymm6,%ymm6
4110	vpxor	%ymm9,%ymm5,%ymm5
4111	vpxor	%ymm8,%ymm4,%ymm4
4112	vmovdqa	%ymm8,128(%rbp)
4113	vpsrld	$20,%ymm7,%ymm8
4114	vpslld	$32-20,%ymm7,%ymm7
4115	vpxor	%ymm8,%ymm7,%ymm7
4116	vpsrld	$20,%ymm6,%ymm8
4117	vpslld	$32-20,%ymm6,%ymm6
4118	vpxor	%ymm8,%ymm6,%ymm6
4119	vpsrld	$20,%ymm5,%ymm8
4120	addq	%rax,%r15
4121	adcq	%rdx,%r9
4122	vpslld	$32-20,%ymm5,%ymm5
4123	vpxor	%ymm8,%ymm5,%ymm5
4124	vpsrld	$20,%ymm4,%ymm8
4125	vpslld	$32-20,%ymm4,%ymm4
4126	vpxor	%ymm8,%ymm4,%ymm4
4127	vmovdqa	.rol8(%rip),%ymm8
4128	vpaddd	%ymm7,%ymm3,%ymm3
4129	vpaddd	%ymm6,%ymm2,%ymm2
4130	vpaddd	%ymm5,%ymm1,%ymm1
4131	vpaddd	%ymm4,%ymm0,%ymm0
4132	movq	%r13,%r10
4133	movq	%r14,%r11
4134	movq	%r15,%r12
4135	andq	$3,%r12
4136	movq	%r15,%r13
4137	andq	$-4,%r13
4138	movq	%r9,%r14
4139	shrdq	$2,%r9,%r15
4140	shrq	$2,%r9
4141	addq	%r13,%r10
4142	adcq	%r14,%r11
4143	adcq	$0,%r12
4144	addq	%r15,%r10
4145	adcq	%r9,%r11
4146	adcq	$0,%r12
4147	vpxor	%ymm3,%ymm15,%ymm15
4148	vpxor	%ymm2,%ymm14,%ymm14
4149	vpxor	%ymm1,%ymm13,%ymm13
4150	vpxor	%ymm0,%ymm12,%ymm12
4151	vpshufb	%ymm8,%ymm15,%ymm15
4152	vpshufb	%ymm8,%ymm14,%ymm14
4153	vpshufb	%ymm8,%ymm13,%ymm13
4154	vpshufb	%ymm8,%ymm12,%ymm12
4155	vmovdqa	128(%rbp),%ymm8
4156	addq	2*8(%rsi,%rcx), %r10
4157	adcq	8+2*8(%rsi,%rcx), %r11
4158	adcq	$1,%r12
4159	vpaddd	%ymm15,%ymm11,%ymm11
4160	vpaddd	%ymm14,%ymm10,%ymm10
4161	vpaddd	%ymm13,%ymm9,%ymm9
4162	vpaddd	%ymm12,%ymm8,%ymm8
4163	vpxor	%ymm11,%ymm7,%ymm7
4164	vpxor	%ymm10,%ymm6,%ymm6
4165	vpxor	%ymm9,%ymm5,%ymm5
4166	vpxor	%ymm8,%ymm4,%ymm4
4167	movq	0+0(%rbp),%rdx
4168	movq	%rdx,%r15
4169	mulxq	%r10,%r13,%r14
4170	mulxq	%r11,%rax,%rdx
4171	imulq	%r12,%r15
4172	addq	%rax,%r14
4173	adcq	%rdx,%r15
4174	vmovdqa	%ymm8,128(%rbp)
4175	vpsrld	$25,%ymm7,%ymm8
4176	vpslld	$32-25,%ymm7,%ymm7
4177	vpxor	%ymm8,%ymm7,%ymm7
4178	vpsrld	$25,%ymm6,%ymm8
4179	vpslld	$32-25,%ymm6,%ymm6
4180	vpxor	%ymm8,%ymm6,%ymm6
4181	vpsrld	$25,%ymm5,%ymm8
4182	vpslld	$32-25,%ymm5,%ymm5
4183	vpxor	%ymm8,%ymm5,%ymm5
4184	vpsrld	$25,%ymm4,%ymm8
4185	vpslld	$32-25,%ymm4,%ymm4
4186	vpxor	%ymm8,%ymm4,%ymm4
4187	vmovdqa	128(%rbp),%ymm8
4188	vpalignr	$4,%ymm7,%ymm7,%ymm7
4189	vpalignr	$8,%ymm11,%ymm11,%ymm11
4190	vpalignr	$12,%ymm15,%ymm15,%ymm15
4191	vpalignr	$4,%ymm6,%ymm6,%ymm6
4192	movq	8+0(%rbp),%rdx
4193	mulxq	%r10,%r10,%rax
4194	addq	%r10,%r14
4195	mulxq	%r11,%r11,%r9
4196	adcq	%r11,%r15
4197	adcq	$0,%r9
4198	imulq	%r12,%rdx
4199	vpalignr	$8,%ymm10,%ymm10,%ymm10
4200	vpalignr	$12,%ymm14,%ymm14,%ymm14
4201	vpalignr	$4,%ymm5,%ymm5,%ymm5
4202	vpalignr	$8,%ymm9,%ymm9,%ymm9
4203	vpalignr	$12,%ymm13,%ymm13,%ymm13
4204	vpalignr	$4,%ymm4,%ymm4,%ymm4
4205	vpalignr	$8,%ymm8,%ymm8,%ymm8
4206	vpalignr	$12,%ymm12,%ymm12,%ymm12
4207	vmovdqa	%ymm8,128(%rbp)
4208	vmovdqa	.rol16(%rip),%ymm8
4209	vpaddd	%ymm7,%ymm3,%ymm3
4210	vpaddd	%ymm6,%ymm2,%ymm2
4211	vpaddd	%ymm5,%ymm1,%ymm1
4212	vpaddd	%ymm4,%ymm0,%ymm0
4213	vpxor	%ymm3,%ymm15,%ymm15
4214	vpxor	%ymm2,%ymm14,%ymm14
4215	vpxor	%ymm1,%ymm13,%ymm13
4216	vpxor	%ymm0,%ymm12,%ymm12
4217	addq	%rax,%r15
4218	adcq	%rdx,%r9
4219	vpshufb	%ymm8,%ymm15,%ymm15
4220	vpshufb	%ymm8,%ymm14,%ymm14
4221	vpshufb	%ymm8,%ymm13,%ymm13
4222	vpshufb	%ymm8,%ymm12,%ymm12
4223	vmovdqa	128(%rbp),%ymm8
4224	vpaddd	%ymm15,%ymm11,%ymm11
4225	vpaddd	%ymm14,%ymm10,%ymm10
4226	vpaddd	%ymm13,%ymm9,%ymm9
4227	vpaddd	%ymm12,%ymm8,%ymm8
4228	movq	%r13,%r10
4229	movq	%r14,%r11
4230	movq	%r15,%r12
4231	andq	$3,%r12
4232	movq	%r15,%r13
4233	andq	$-4,%r13
4234	movq	%r9,%r14
4235	shrdq	$2,%r9,%r15
4236	shrq	$2,%r9
4237	addq	%r13,%r10
4238	adcq	%r14,%r11
4239	adcq	$0,%r12
4240	addq	%r15,%r10
4241	adcq	%r9,%r11
4242	adcq	$0,%r12
4243	vpxor	%ymm11,%ymm7,%ymm7
4244	vpxor	%ymm10,%ymm6,%ymm6
4245	vpxor	%ymm9,%ymm5,%ymm5
4246	vpxor	%ymm8,%ymm4,%ymm4
4247	vmovdqa	%ymm8,128(%rbp)
4248	vpsrld	$20,%ymm7,%ymm8
4249	vpslld	$32-20,%ymm7,%ymm7
4250	vpxor	%ymm8,%ymm7,%ymm7
4251	addq	4*8(%rsi,%rcx), %r10
4252	adcq	8+4*8(%rsi,%rcx), %r11
4253	adcq	$1,%r12
4254
4255	leaq	48(%rcx),%rcx
4256	vpsrld	$20,%ymm6,%ymm8
4257	vpslld	$32-20,%ymm6,%ymm6
4258	vpxor	%ymm8,%ymm6,%ymm6
4259	vpsrld	$20,%ymm5,%ymm8
4260	vpslld	$32-20,%ymm5,%ymm5
4261	vpxor	%ymm8,%ymm5,%ymm5
4262	vpsrld	$20,%ymm4,%ymm8
4263	vpslld	$32-20,%ymm4,%ymm4
4264	vpxor	%ymm8,%ymm4,%ymm4
4265	vmovdqa	.rol8(%rip),%ymm8
4266	vpaddd	%ymm7,%ymm3,%ymm3
4267	vpaddd	%ymm6,%ymm2,%ymm2
4268	vpaddd	%ymm5,%ymm1,%ymm1
4269	vpaddd	%ymm4,%ymm0,%ymm0
4270	vpxor	%ymm3,%ymm15,%ymm15
4271	vpxor	%ymm2,%ymm14,%ymm14
4272	vpxor	%ymm1,%ymm13,%ymm13
4273	vpxor	%ymm0,%ymm12,%ymm12
4274	movq	0+0(%rbp),%rdx
4275	movq	%rdx,%r15
4276	mulxq	%r10,%r13,%r14
4277	mulxq	%r11,%rax,%rdx
4278	imulq	%r12,%r15
4279	addq	%rax,%r14
4280	adcq	%rdx,%r15
4281	vpshufb	%ymm8,%ymm15,%ymm15
4282	vpshufb	%ymm8,%ymm14,%ymm14
4283	vpshufb	%ymm8,%ymm13,%ymm13
4284	vpshufb	%ymm8,%ymm12,%ymm12
4285	vmovdqa	128(%rbp),%ymm8
4286	vpaddd	%ymm15,%ymm11,%ymm11
4287	vpaddd	%ymm14,%ymm10,%ymm10
4288	vpaddd	%ymm13,%ymm9,%ymm9
4289	movq	8+0(%rbp),%rdx
4290	mulxq	%r10,%r10,%rax
4291	addq	%r10,%r14
4292	mulxq	%r11,%r11,%r9
4293	adcq	%r11,%r15
4294	adcq	$0,%r9
4295	imulq	%r12,%rdx
4296	vpaddd	%ymm12,%ymm8,%ymm8
4297	vpxor	%ymm11,%ymm7,%ymm7
4298	vpxor	%ymm10,%ymm6,%ymm6
4299	vpxor	%ymm9,%ymm5,%ymm5
4300	vpxor	%ymm8,%ymm4,%ymm4
4301	vmovdqa	%ymm8,128(%rbp)
4302	vpsrld	$25,%ymm7,%ymm8
4303	vpslld	$32-25,%ymm7,%ymm7
4304	addq	%rax,%r15
4305	adcq	%rdx,%r9
4306	vpxor	%ymm8,%ymm7,%ymm7
4307	vpsrld	$25,%ymm6,%ymm8
4308	vpslld	$32-25,%ymm6,%ymm6
4309	vpxor	%ymm8,%ymm6,%ymm6
4310	vpsrld	$25,%ymm5,%ymm8
4311	vpslld	$32-25,%ymm5,%ymm5
4312	vpxor	%ymm8,%ymm5,%ymm5
4313	vpsrld	$25,%ymm4,%ymm8
4314	vpslld	$32-25,%ymm4,%ymm4
4315	vpxor	%ymm8,%ymm4,%ymm4
4316	vmovdqa	128(%rbp),%ymm8
4317	vpalignr	$12,%ymm7,%ymm7,%ymm7
4318	vpalignr	$8,%ymm11,%ymm11,%ymm11
4319	vpalignr	$4,%ymm15,%ymm15,%ymm15
4320	vpalignr	$12,%ymm6,%ymm6,%ymm6
4321	vpalignr	$8,%ymm10,%ymm10,%ymm10
4322	vpalignr	$4,%ymm14,%ymm14,%ymm14
4323	vpalignr	$12,%ymm5,%ymm5,%ymm5
4324	movq	%r13,%r10
4325	movq	%r14,%r11
4326	movq	%r15,%r12
4327	andq	$3,%r12
4328	movq	%r15,%r13
4329	andq	$-4,%r13
4330	movq	%r9,%r14
4331	shrdq	$2,%r9,%r15
4332	shrq	$2,%r9
4333	addq	%r13,%r10
4334	adcq	%r14,%r11
4335	adcq	$0,%r12
4336	addq	%r15,%r10
4337	adcq	%r9,%r11
4338	adcq	$0,%r12
4339	vpalignr	$8,%ymm9,%ymm9,%ymm9
4340	vpalignr	$4,%ymm13,%ymm13,%ymm13
4341	vpalignr	$12,%ymm4,%ymm4,%ymm4
4342	vpalignr	$8,%ymm8,%ymm8,%ymm8
4343	vpalignr	$4,%ymm12,%ymm12,%ymm12
4344
4345	cmpq	$60*8,%rcx
4346	jne	2b
4347	vpaddd	.chacha20_consts(%rip),%ymm3,%ymm3
4348	vpaddd	64(%rbp),%ymm7,%ymm7
4349	vpaddd	96(%rbp),%ymm11,%ymm11
4350	vpaddd	256(%rbp),%ymm15,%ymm15
4351	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
4352	vpaddd	64(%rbp),%ymm6,%ymm6
4353	vpaddd	96(%rbp),%ymm10,%ymm10
4354	vpaddd	224(%rbp),%ymm14,%ymm14
4355	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
4356	vpaddd	64(%rbp),%ymm5,%ymm5
4357	vpaddd	96(%rbp),%ymm9,%ymm9
4358	vpaddd	192(%rbp),%ymm13,%ymm13
4359	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
4360	vpaddd	64(%rbp),%ymm4,%ymm4
4361	vpaddd	96(%rbp),%ymm8,%ymm8
4362	vpaddd	160(%rbp),%ymm12,%ymm12
4363
4364	vmovdqa	%ymm0,128(%rbp)
4365	addq	60*8(%rsi),%r10
4366	adcq	8+60*8(%rsi),%r11
4367	adcq	$1,%r12
4368	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
4369	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
4370	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
4371	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
4372	vpxor	0+0(%rsi),%ymm0,%ymm0
4373	vpxor	32+0(%rsi),%ymm3,%ymm3
4374	vpxor	64+0(%rsi),%ymm7,%ymm7
4375	vpxor	96+0(%rsi),%ymm11,%ymm11
4376	vmovdqu	%ymm0,0+0(%rdi)
4377	vmovdqu	%ymm3,32+0(%rdi)
4378	vmovdqu	%ymm7,64+0(%rdi)
4379	vmovdqu	%ymm11,96+0(%rdi)
4380
4381	vmovdqa	128(%rbp),%ymm0
4382	movq	0+0(%rbp),%rax
4383	movq	%rax,%r15
4384	mulq	%r10
4385	movq	%rax,%r13
4386	movq	%rdx,%r14
4387	movq	0+0(%rbp),%rax
4388	mulq	%r11
4389	imulq	%r12,%r15
4390	addq	%rax,%r14
4391	adcq	%rdx,%r15
4392	movq	8+0(%rbp),%rax
4393	movq	%rax,%r9
4394	mulq	%r10
4395	addq	%rax,%r14
4396	adcq	$0,%rdx
4397	movq	%rdx,%r10
4398	movq	8+0(%rbp),%rax
4399	mulq	%r11
4400	addq	%rax,%r15
4401	adcq	$0,%rdx
4402	imulq	%r12,%r9
4403	addq	%r10,%r15
4404	adcq	%rdx,%r9
4405	movq	%r13,%r10
4406	movq	%r14,%r11
4407	movq	%r15,%r12
4408	andq	$3,%r12
4409	movq	%r15,%r13
4410	andq	$-4,%r13
4411	movq	%r9,%r14
4412	shrdq	$2,%r9,%r15
4413	shrq	$2,%r9
4414	addq	%r13,%r10
4415	adcq	%r14,%r11
4416	adcq	$0,%r12
4417	addq	%r15,%r10
4418	adcq	%r9,%r11
4419	adcq	$0,%r12
4420	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
4421	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
4422	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
4423	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
4424	vpxor	0+128(%rsi),%ymm3,%ymm3
4425	vpxor	32+128(%rsi),%ymm2,%ymm2
4426	vpxor	64+128(%rsi),%ymm6,%ymm6
4427	vpxor	96+128(%rsi),%ymm10,%ymm10
4428	vmovdqu	%ymm3,0+128(%rdi)
4429	vmovdqu	%ymm2,32+128(%rdi)
4430	vmovdqu	%ymm6,64+128(%rdi)
4431	vmovdqu	%ymm10,96+128(%rdi)
4432	addq	60*8+16(%rsi),%r10
4433	adcq	8+60*8+16(%rsi),%r11
4434	adcq	$1,%r12
4435	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
4436	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
4437	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
4438	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
4439	vpxor	0+256(%rsi),%ymm3,%ymm3
4440	vpxor	32+256(%rsi),%ymm1,%ymm1
4441	vpxor	64+256(%rsi),%ymm5,%ymm5
4442	vpxor	96+256(%rsi),%ymm9,%ymm9
4443	vmovdqu	%ymm3,0+256(%rdi)
4444	vmovdqu	%ymm1,32+256(%rdi)
4445	vmovdqu	%ymm5,64+256(%rdi)
4446	vmovdqu	%ymm9,96+256(%rdi)
4447	movq	0+0(%rbp),%rax
4448	movq	%rax,%r15
4449	mulq	%r10
4450	movq	%rax,%r13
4451	movq	%rdx,%r14
4452	movq	0+0(%rbp),%rax
4453	mulq	%r11
4454	imulq	%r12,%r15
4455	addq	%rax,%r14
4456	adcq	%rdx,%r15
4457	movq	8+0(%rbp),%rax
4458	movq	%rax,%r9
4459	mulq	%r10
4460	addq	%rax,%r14
4461	adcq	$0,%rdx
4462	movq	%rdx,%r10
4463	movq	8+0(%rbp),%rax
4464	mulq	%r11
4465	addq	%rax,%r15
4466	adcq	$0,%rdx
4467	imulq	%r12,%r9
4468	addq	%r10,%r15
4469	adcq	%rdx,%r9
4470	movq	%r13,%r10
4471	movq	%r14,%r11
4472	movq	%r15,%r12
4473	andq	$3,%r12
4474	movq	%r15,%r13
4475	andq	$-4,%r13
4476	movq	%r9,%r14
4477	shrdq	$2,%r9,%r15
4478	shrq	$2,%r9
4479	addq	%r13,%r10
4480	adcq	%r14,%r11
4481	adcq	$0,%r12
4482	addq	%r15,%r10
4483	adcq	%r9,%r11
4484	adcq	$0,%r12
4485	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
4486	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
4487	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
4488	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
4489	vpxor	0+384(%rsi),%ymm3,%ymm3
4490	vpxor	32+384(%rsi),%ymm0,%ymm0
4491	vpxor	64+384(%rsi),%ymm4,%ymm4
4492	vpxor	96+384(%rsi),%ymm8,%ymm8
4493	vmovdqu	%ymm3,0+384(%rdi)
4494	vmovdqu	%ymm0,32+384(%rdi)
4495	vmovdqu	%ymm4,64+384(%rdi)
4496	vmovdqu	%ymm8,96+384(%rdi)
4497
4498	leaq	512(%rsi),%rsi
4499	leaq	512(%rdi),%rdi
4500	subq	$512,%rbx
4501	jmp	1b
45023:
4503	testq	%rbx,%rbx
4504	vzeroupper
4505	je	open_sse_finalize
45063:
4507	cmpq	$128,%rbx
4508	ja	3f
4509	vmovdqa	.chacha20_consts(%rip),%ymm0
4510	vmovdqa	64(%rbp),%ymm4
4511	vmovdqa	96(%rbp),%ymm8
4512	vmovdqa	.avx2_inc(%rip),%ymm12
4513	vpaddd	160(%rbp),%ymm12,%ymm12
4514	vmovdqa	%ymm12,160(%rbp)
4515
4516	xorq	%r8,%r8
4517	movq	%rbx,%rcx
4518	andq	$-16,%rcx
4519	testq	%rcx,%rcx
4520	je	2f
45211:
4522	addq	0*8(%rsi,%r8), %r10
4523	adcq	8+0*8(%rsi,%r8), %r11
4524	adcq	$1,%r12
4525	movq	0+0(%rbp),%rax
4526	movq	%rax,%r15
4527	mulq	%r10
4528	movq	%rax,%r13
4529	movq	%rdx,%r14
4530	movq	0+0(%rbp),%rax
4531	mulq	%r11
4532	imulq	%r12,%r15
4533	addq	%rax,%r14
4534	adcq	%rdx,%r15
4535	movq	8+0(%rbp),%rax
4536	movq	%rax,%r9
4537	mulq	%r10
4538	addq	%rax,%r14
4539	adcq	$0,%rdx
4540	movq	%rdx,%r10
4541	movq	8+0(%rbp),%rax
4542	mulq	%r11
4543	addq	%rax,%r15
4544	adcq	$0,%rdx
4545	imulq	%r12,%r9
4546	addq	%r10,%r15
4547	adcq	%rdx,%r9
4548	movq	%r13,%r10
4549	movq	%r14,%r11
4550	movq	%r15,%r12
4551	andq	$3,%r12
4552	movq	%r15,%r13
4553	andq	$-4,%r13
4554	movq	%r9,%r14
4555	shrdq	$2,%r9,%r15
4556	shrq	$2,%r9
4557	addq	%r13,%r10
4558	adcq	%r14,%r11
4559	adcq	$0,%r12
4560	addq	%r15,%r10
4561	adcq	%r9,%r11
4562	adcq	$0,%r12
4563
45642:
4565	addq	$16,%r8
4566	vpaddd	%ymm4,%ymm0,%ymm0
4567	vpxor	%ymm0,%ymm12,%ymm12
4568	vpshufb	.rol16(%rip),%ymm12,%ymm12
4569	vpaddd	%ymm12,%ymm8,%ymm8
4570	vpxor	%ymm8,%ymm4,%ymm4
4571	vpsrld	$20,%ymm4,%ymm3
4572	vpslld	$12,%ymm4,%ymm4
4573	vpxor	%ymm3,%ymm4,%ymm4
4574	vpaddd	%ymm4,%ymm0,%ymm0
4575	vpxor	%ymm0,%ymm12,%ymm12
4576	vpshufb	.rol8(%rip),%ymm12,%ymm12
4577	vpaddd	%ymm12,%ymm8,%ymm8
4578	vpxor	%ymm8,%ymm4,%ymm4
4579	vpslld	$7,%ymm4,%ymm3
4580	vpsrld	$25,%ymm4,%ymm4
4581	vpxor	%ymm3,%ymm4,%ymm4
4582	vpalignr	$12,%ymm12,%ymm12,%ymm12
4583	vpalignr	$8,%ymm8,%ymm8,%ymm8
4584	vpalignr	$4,%ymm4,%ymm4,%ymm4
4585	vpaddd	%ymm4,%ymm0,%ymm0
4586	vpxor	%ymm0,%ymm12,%ymm12
4587	vpshufb	.rol16(%rip),%ymm12,%ymm12
4588	vpaddd	%ymm12,%ymm8,%ymm8
4589	vpxor	%ymm8,%ymm4,%ymm4
4590	vpsrld	$20,%ymm4,%ymm3
4591	vpslld	$12,%ymm4,%ymm4
4592	vpxor	%ymm3,%ymm4,%ymm4
4593	vpaddd	%ymm4,%ymm0,%ymm0
4594	vpxor	%ymm0,%ymm12,%ymm12
4595	vpshufb	.rol8(%rip),%ymm12,%ymm12
4596	vpaddd	%ymm12,%ymm8,%ymm8
4597	vpxor	%ymm8,%ymm4,%ymm4
4598	vpslld	$7,%ymm4,%ymm3
4599	vpsrld	$25,%ymm4,%ymm4
4600	vpxor	%ymm3,%ymm4,%ymm4
4601	vpalignr	$4,%ymm12,%ymm12,%ymm12
4602	vpalignr	$8,%ymm8,%ymm8,%ymm8
4603	vpalignr	$12,%ymm4,%ymm4,%ymm4
4604
4605	cmpq	%rcx,%r8
4606	jb	1b
4607	cmpq	$160,%r8
4608	jne	2b
4609	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
4610	vpaddd	64(%rbp),%ymm4,%ymm4
4611	vpaddd	96(%rbp),%ymm8,%ymm8
4612	vpaddd	160(%rbp),%ymm12,%ymm12
4613	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
4614	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
4615	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
4616	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
4617	vmovdqa	%ymm3,%ymm8
4618
4619	jmp	open_avx2_tail_loop
46203:
4621	cmpq	$256,%rbx
4622	ja	3f
4623	vmovdqa	.chacha20_consts(%rip),%ymm0
4624	vmovdqa	64(%rbp),%ymm4
4625	vmovdqa	96(%rbp),%ymm8
4626	vmovdqa	%ymm0,%ymm1
4627	vmovdqa	%ymm4,%ymm5
4628	vmovdqa	%ymm8,%ymm9
4629	vmovdqa	.avx2_inc(%rip),%ymm12
4630	vpaddd	160(%rbp),%ymm12,%ymm13
4631	vpaddd	%ymm13,%ymm12,%ymm12
4632	vmovdqa	%ymm12,160(%rbp)
4633	vmovdqa	%ymm13,192(%rbp)
4634
4635	movq	%rbx,128(%rbp)
4636	movq	%rbx,%rcx
4637	subq	$128,%rcx
4638	shrq	$4,%rcx
4639	movq	$10,%r8
4640	cmpq	$10,%rcx
4641	cmovgq	%r8,%rcx
4642	movq	%rsi,%rbx
4643	xorq	%r8,%r8
46441:
4645	addq	0(%rbx),%r10
4646	adcq	8+0(%rbx),%r11
4647	adcq	$1,%r12
4648	movq	0+0(%rbp),%rdx
4649	movq	%rdx,%r15
4650	mulxq	%r10,%r13,%r14
4651	mulxq	%r11,%rax,%rdx
4652	imulq	%r12,%r15
4653	addq	%rax,%r14
4654	adcq	%rdx,%r15
4655	movq	8+0(%rbp),%rdx
4656	mulxq	%r10,%r10,%rax
4657	addq	%r10,%r14
4658	mulxq	%r11,%r11,%r9
4659	adcq	%r11,%r15
4660	adcq	$0,%r9
4661	imulq	%r12,%rdx
4662	addq	%rax,%r15
4663	adcq	%rdx,%r9
4664	movq	%r13,%r10
4665	movq	%r14,%r11
4666	movq	%r15,%r12
4667	andq	$3,%r12
4668	movq	%r15,%r13
4669	andq	$-4,%r13
4670	movq	%r9,%r14
4671	shrdq	$2,%r9,%r15
4672	shrq	$2,%r9
4673	addq	%r13,%r10
4674	adcq	%r14,%r11
4675	adcq	$0,%r12
4676	addq	%r15,%r10
4677	adcq	%r9,%r11
4678	adcq	$0,%r12
4679
4680	leaq	16(%rbx),%rbx
46812:
4682	vpaddd	%ymm4,%ymm0,%ymm0
4683	vpxor	%ymm0,%ymm12,%ymm12
4684	vpshufb	.rol16(%rip),%ymm12,%ymm12
4685	vpaddd	%ymm12,%ymm8,%ymm8
4686	vpxor	%ymm8,%ymm4,%ymm4
4687	vpsrld	$20,%ymm4,%ymm3
4688	vpslld	$12,%ymm4,%ymm4
4689	vpxor	%ymm3,%ymm4,%ymm4
4690	vpaddd	%ymm4,%ymm0,%ymm0
4691	vpxor	%ymm0,%ymm12,%ymm12
4692	vpshufb	.rol8(%rip),%ymm12,%ymm12
4693	vpaddd	%ymm12,%ymm8,%ymm8
4694	vpxor	%ymm8,%ymm4,%ymm4
4695	vpslld	$7,%ymm4,%ymm3
4696	vpsrld	$25,%ymm4,%ymm4
4697	vpxor	%ymm3,%ymm4,%ymm4
4698	vpalignr	$12,%ymm12,%ymm12,%ymm12
4699	vpalignr	$8,%ymm8,%ymm8,%ymm8
4700	vpalignr	$4,%ymm4,%ymm4,%ymm4
4701	vpaddd	%ymm5,%ymm1,%ymm1
4702	vpxor	%ymm1,%ymm13,%ymm13
4703	vpshufb	.rol16(%rip),%ymm13,%ymm13
4704	vpaddd	%ymm13,%ymm9,%ymm9
4705	vpxor	%ymm9,%ymm5,%ymm5
4706	vpsrld	$20,%ymm5,%ymm3
4707	vpslld	$12,%ymm5,%ymm5
4708	vpxor	%ymm3,%ymm5,%ymm5
4709	vpaddd	%ymm5,%ymm1,%ymm1
4710	vpxor	%ymm1,%ymm13,%ymm13
4711	vpshufb	.rol8(%rip),%ymm13,%ymm13
4712	vpaddd	%ymm13,%ymm9,%ymm9
4713	vpxor	%ymm9,%ymm5,%ymm5
4714	vpslld	$7,%ymm5,%ymm3
4715	vpsrld	$25,%ymm5,%ymm5
4716	vpxor	%ymm3,%ymm5,%ymm5
4717	vpalignr	$12,%ymm13,%ymm13,%ymm13
4718	vpalignr	$8,%ymm9,%ymm9,%ymm9
4719	vpalignr	$4,%ymm5,%ymm5,%ymm5
4720
4721	incq	%r8
4722	vpaddd	%ymm4,%ymm0,%ymm0
4723	vpxor	%ymm0,%ymm12,%ymm12
4724	vpshufb	.rol16(%rip),%ymm12,%ymm12
4725	vpaddd	%ymm12,%ymm8,%ymm8
4726	vpxor	%ymm8,%ymm4,%ymm4
4727	vpsrld	$20,%ymm4,%ymm3
4728	vpslld	$12,%ymm4,%ymm4
4729	vpxor	%ymm3,%ymm4,%ymm4
4730	vpaddd	%ymm4,%ymm0,%ymm0
4731	vpxor	%ymm0,%ymm12,%ymm12
4732	vpshufb	.rol8(%rip),%ymm12,%ymm12
4733	vpaddd	%ymm12,%ymm8,%ymm8
4734	vpxor	%ymm8,%ymm4,%ymm4
4735	vpslld	$7,%ymm4,%ymm3
4736	vpsrld	$25,%ymm4,%ymm4
4737	vpxor	%ymm3,%ymm4,%ymm4
4738	vpalignr	$4,%ymm12,%ymm12,%ymm12
4739	vpalignr	$8,%ymm8,%ymm8,%ymm8
4740	vpalignr	$12,%ymm4,%ymm4,%ymm4
4741	vpaddd	%ymm5,%ymm1,%ymm1
4742	vpxor	%ymm1,%ymm13,%ymm13
4743	vpshufb	.rol16(%rip),%ymm13,%ymm13
4744	vpaddd	%ymm13,%ymm9,%ymm9
4745	vpxor	%ymm9,%ymm5,%ymm5
4746	vpsrld	$20,%ymm5,%ymm3
4747	vpslld	$12,%ymm5,%ymm5
4748	vpxor	%ymm3,%ymm5,%ymm5
4749	vpaddd	%ymm5,%ymm1,%ymm1
4750	vpxor	%ymm1,%ymm13,%ymm13
4751	vpshufb	.rol8(%rip),%ymm13,%ymm13
4752	vpaddd	%ymm13,%ymm9,%ymm9
4753	vpxor	%ymm9,%ymm5,%ymm5
4754	vpslld	$7,%ymm5,%ymm3
4755	vpsrld	$25,%ymm5,%ymm5
4756	vpxor	%ymm3,%ymm5,%ymm5
4757	vpalignr	$4,%ymm13,%ymm13,%ymm13
4758	vpalignr	$8,%ymm9,%ymm9,%ymm9
4759	vpalignr	$12,%ymm5,%ymm5,%ymm5
4760	vpaddd	%ymm6,%ymm2,%ymm2
4761	vpxor	%ymm2,%ymm14,%ymm14
4762	vpshufb	.rol16(%rip),%ymm14,%ymm14
4763	vpaddd	%ymm14,%ymm10,%ymm10
4764	vpxor	%ymm10,%ymm6,%ymm6
4765	vpsrld	$20,%ymm6,%ymm3
4766	vpslld	$12,%ymm6,%ymm6
4767	vpxor	%ymm3,%ymm6,%ymm6
4768	vpaddd	%ymm6,%ymm2,%ymm2
4769	vpxor	%ymm2,%ymm14,%ymm14
4770	vpshufb	.rol8(%rip),%ymm14,%ymm14
4771	vpaddd	%ymm14,%ymm10,%ymm10
4772	vpxor	%ymm10,%ymm6,%ymm6
4773	vpslld	$7,%ymm6,%ymm3
4774	vpsrld	$25,%ymm6,%ymm6
4775	vpxor	%ymm3,%ymm6,%ymm6
4776	vpalignr	$4,%ymm14,%ymm14,%ymm14
4777	vpalignr	$8,%ymm10,%ymm10,%ymm10
4778	vpalignr	$12,%ymm6,%ymm6,%ymm6
4779
4780	cmpq	%rcx,%r8
4781	jb	1b
4782	cmpq	$10,%r8
4783	jne	2b
4784	movq	%rbx,%r8
4785	subq	%rsi,%rbx
4786	movq	%rbx,%rcx
4787	movq	128(%rbp),%rbx
47881:
4789	addq	$16,%rcx
4790	cmpq	%rbx,%rcx
4791	jg	1f
4792	addq	0(%r8),%r10
4793	adcq	8+0(%r8),%r11
4794	adcq	$1,%r12
4795	movq	0+0(%rbp),%rdx
4796	movq	%rdx,%r15
4797	mulxq	%r10,%r13,%r14
4798	mulxq	%r11,%rax,%rdx
4799	imulq	%r12,%r15
4800	addq	%rax,%r14
4801	adcq	%rdx,%r15
4802	movq	8+0(%rbp),%rdx
4803	mulxq	%r10,%r10,%rax
4804	addq	%r10,%r14
4805	mulxq	%r11,%r11,%r9
4806	adcq	%r11,%r15
4807	adcq	$0,%r9
4808	imulq	%r12,%rdx
4809	addq	%rax,%r15
4810	adcq	%rdx,%r9
4811	movq	%r13,%r10
4812	movq	%r14,%r11
4813	movq	%r15,%r12
4814	andq	$3,%r12
4815	movq	%r15,%r13
4816	andq	$-4,%r13
4817	movq	%r9,%r14
4818	shrdq	$2,%r9,%r15
4819	shrq	$2,%r9
4820	addq	%r13,%r10
4821	adcq	%r14,%r11
4822	adcq	$0,%r12
4823	addq	%r15,%r10
4824	adcq	%r9,%r11
4825	adcq	$0,%r12
4826
4827	leaq	16(%r8),%r8
4828	jmp	1b
48291:
4830	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
4831	vpaddd	64(%rbp),%ymm5,%ymm5
4832	vpaddd	96(%rbp),%ymm9,%ymm9
4833	vpaddd	192(%rbp),%ymm13,%ymm13
4834	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
4835	vpaddd	64(%rbp),%ymm4,%ymm4
4836	vpaddd	96(%rbp),%ymm8,%ymm8
4837	vpaddd	160(%rbp),%ymm12,%ymm12
4838	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
4839	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
4840	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
4841	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
4842	vpxor	0+0(%rsi),%ymm3,%ymm3
4843	vpxor	32+0(%rsi),%ymm1,%ymm1
4844	vpxor	64+0(%rsi),%ymm5,%ymm5
4845	vpxor	96+0(%rsi),%ymm9,%ymm9
4846	vmovdqu	%ymm3,0+0(%rdi)
4847	vmovdqu	%ymm1,32+0(%rdi)
4848	vmovdqu	%ymm5,64+0(%rdi)
4849	vmovdqu	%ymm9,96+0(%rdi)
4850	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
4851	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
4852	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
4853	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
4854	vmovdqa	%ymm3,%ymm8
4855
4856	leaq	128(%rsi),%rsi
4857	leaq	128(%rdi),%rdi
4858	subq	$128,%rbx
4859	jmp	open_avx2_tail_loop
48603:
4861	cmpq	$384,%rbx
4862	ja	3f
4863	vmovdqa	.chacha20_consts(%rip),%ymm0
4864	vmovdqa	64(%rbp),%ymm4
4865	vmovdqa	96(%rbp),%ymm8
4866	vmovdqa	%ymm0,%ymm1
4867	vmovdqa	%ymm4,%ymm5
4868	vmovdqa	%ymm8,%ymm9
4869	vmovdqa	%ymm0,%ymm2
4870	vmovdqa	%ymm4,%ymm6
4871	vmovdqa	%ymm8,%ymm10
4872	vmovdqa	.avx2_inc(%rip),%ymm12
4873	vpaddd	160(%rbp),%ymm12,%ymm14
4874	vpaddd	%ymm14,%ymm12,%ymm13
4875	vpaddd	%ymm13,%ymm12,%ymm12
4876	vmovdqa	%ymm12,160(%rbp)
4877	vmovdqa	%ymm13,192(%rbp)
4878	vmovdqa	%ymm14,224(%rbp)
4879
4880	movq	%rbx,128(%rbp)
4881	movq	%rbx,%rcx
4882	subq	$256,%rcx
4883	shrq	$4,%rcx
4884	addq	$6,%rcx
4885	movq	$10,%r8
4886	cmpq	$10,%rcx
4887	cmovgq	%r8,%rcx
4888	movq	%rsi,%rbx
4889	xorq	%r8,%r8
48901:
4891	addq	0(%rbx),%r10
4892	adcq	8+0(%rbx),%r11
4893	adcq	$1,%r12
4894	movq	0+0(%rbp),%rdx
4895	movq	%rdx,%r15
4896	mulxq	%r10,%r13,%r14
4897	mulxq	%r11,%rax,%rdx
4898	imulq	%r12,%r15
4899	addq	%rax,%r14
4900	adcq	%rdx,%r15
4901	movq	8+0(%rbp),%rdx
4902	mulxq	%r10,%r10,%rax
4903	addq	%r10,%r14
4904	mulxq	%r11,%r11,%r9
4905	adcq	%r11,%r15
4906	adcq	$0,%r9
4907	imulq	%r12,%rdx
4908	addq	%rax,%r15
4909	adcq	%rdx,%r9
4910	movq	%r13,%r10
4911	movq	%r14,%r11
4912	movq	%r15,%r12
4913	andq	$3,%r12
4914	movq	%r15,%r13
4915	andq	$-4,%r13
4916	movq	%r9,%r14
4917	shrdq	$2,%r9,%r15
4918	shrq	$2,%r9
4919	addq	%r13,%r10
4920	adcq	%r14,%r11
4921	adcq	$0,%r12
4922	addq	%r15,%r10
4923	adcq	%r9,%r11
4924	adcq	$0,%r12
4925
4926	leaq	16(%rbx),%rbx
49272:
4928	vpaddd	%ymm6,%ymm2,%ymm2
4929	vpxor	%ymm2,%ymm14,%ymm14
4930	vpshufb	.rol16(%rip),%ymm14,%ymm14
4931	vpaddd	%ymm14,%ymm10,%ymm10
4932	vpxor	%ymm10,%ymm6,%ymm6
4933	vpsrld	$20,%ymm6,%ymm3
4934	vpslld	$12,%ymm6,%ymm6
4935	vpxor	%ymm3,%ymm6,%ymm6
4936	vpaddd	%ymm6,%ymm2,%ymm2
4937	vpxor	%ymm2,%ymm14,%ymm14
4938	vpshufb	.rol8(%rip),%ymm14,%ymm14
4939	vpaddd	%ymm14,%ymm10,%ymm10
4940	vpxor	%ymm10,%ymm6,%ymm6
4941	vpslld	$7,%ymm6,%ymm3
4942	vpsrld	$25,%ymm6,%ymm6
4943	vpxor	%ymm3,%ymm6,%ymm6
4944	vpalignr	$12,%ymm14,%ymm14,%ymm14
4945	vpalignr	$8,%ymm10,%ymm10,%ymm10
4946	vpalignr	$4,%ymm6,%ymm6,%ymm6
4947	vpaddd	%ymm5,%ymm1,%ymm1
4948	vpxor	%ymm1,%ymm13,%ymm13
4949	vpshufb	.rol16(%rip),%ymm13,%ymm13
4950	vpaddd	%ymm13,%ymm9,%ymm9
4951	vpxor	%ymm9,%ymm5,%ymm5
4952	vpsrld	$20,%ymm5,%ymm3
4953	vpslld	$12,%ymm5,%ymm5
4954	vpxor	%ymm3,%ymm5,%ymm5
4955	vpaddd	%ymm5,%ymm1,%ymm1
4956	vpxor	%ymm1,%ymm13,%ymm13
4957	vpshufb	.rol8(%rip),%ymm13,%ymm13
4958	vpaddd	%ymm13,%ymm9,%ymm9
4959	vpxor	%ymm9,%ymm5,%ymm5
4960	vpslld	$7,%ymm5,%ymm3
4961	vpsrld	$25,%ymm5,%ymm5
4962	vpxor	%ymm3,%ymm5,%ymm5
4963	vpalignr	$12,%ymm13,%ymm13,%ymm13
4964	vpalignr	$8,%ymm9,%ymm9,%ymm9
4965	vpalignr	$4,%ymm5,%ymm5,%ymm5
4966	vpaddd	%ymm4,%ymm0,%ymm0
4967	vpxor	%ymm0,%ymm12,%ymm12
4968	vpshufb	.rol16(%rip),%ymm12,%ymm12
4969	vpaddd	%ymm12,%ymm8,%ymm8
4970	vpxor	%ymm8,%ymm4,%ymm4
4971	vpsrld	$20,%ymm4,%ymm3
4972	vpslld	$12,%ymm4,%ymm4
4973	vpxor	%ymm3,%ymm4,%ymm4
4974	vpaddd	%ymm4,%ymm0,%ymm0
4975	vpxor	%ymm0,%ymm12,%ymm12
4976	vpshufb	.rol8(%rip),%ymm12,%ymm12
4977	vpaddd	%ymm12,%ymm8,%ymm8
4978	vpxor	%ymm8,%ymm4,%ymm4
4979	vpslld	$7,%ymm4,%ymm3
4980	vpsrld	$25,%ymm4,%ymm4
4981	vpxor	%ymm3,%ymm4,%ymm4
4982	vpalignr	$12,%ymm12,%ymm12,%ymm12
4983	vpalignr	$8,%ymm8,%ymm8,%ymm8
4984	vpalignr	$4,%ymm4,%ymm4,%ymm4
4985	addq	0(%rbx),%r10
4986	adcq	8+0(%rbx),%r11
4987	adcq	$1,%r12
4988	movq	0+0(%rbp),%rax
4989	movq	%rax,%r15
4990	mulq	%r10
4991	movq	%rax,%r13
4992	movq	%rdx,%r14
4993	movq	0+0(%rbp),%rax
4994	mulq	%r11
4995	imulq	%r12,%r15
4996	addq	%rax,%r14
4997	adcq	%rdx,%r15
4998	movq	8+0(%rbp),%rax
4999	movq	%rax,%r9
5000	mulq	%r10
5001	addq	%rax,%r14
5002	adcq	$0,%rdx
5003	movq	%rdx,%r10
5004	movq	8+0(%rbp),%rax
5005	mulq	%r11
5006	addq	%rax,%r15
5007	adcq	$0,%rdx
5008	imulq	%r12,%r9
5009	addq	%r10,%r15
5010	adcq	%rdx,%r9
5011	movq	%r13,%r10
5012	movq	%r14,%r11
5013	movq	%r15,%r12
5014	andq	$3,%r12
5015	movq	%r15,%r13
5016	andq	$-4,%r13
5017	movq	%r9,%r14
5018	shrdq	$2,%r9,%r15
5019	shrq	$2,%r9
5020	addq	%r13,%r10
5021	adcq	%r14,%r11
5022	adcq	$0,%r12
5023	addq	%r15,%r10
5024	adcq	%r9,%r11
5025	adcq	$0,%r12
5026
5027	leaq	16(%rbx),%rbx
5028	incq	%r8
5029	vpaddd	%ymm6,%ymm2,%ymm2
5030	vpxor	%ymm2,%ymm14,%ymm14
5031	vpshufb	.rol16(%rip),%ymm14,%ymm14
5032	vpaddd	%ymm14,%ymm10,%ymm10
5033	vpxor	%ymm10,%ymm6,%ymm6
5034	vpsrld	$20,%ymm6,%ymm3
5035	vpslld	$12,%ymm6,%ymm6
5036	vpxor	%ymm3,%ymm6,%ymm6
5037	vpaddd	%ymm6,%ymm2,%ymm2
5038	vpxor	%ymm2,%ymm14,%ymm14
5039	vpshufb	.rol8(%rip),%ymm14,%ymm14
5040	vpaddd	%ymm14,%ymm10,%ymm10
5041	vpxor	%ymm10,%ymm6,%ymm6
5042	vpslld	$7,%ymm6,%ymm3
5043	vpsrld	$25,%ymm6,%ymm6
5044	vpxor	%ymm3,%ymm6,%ymm6
5045	vpalignr	$4,%ymm14,%ymm14,%ymm14
5046	vpalignr	$8,%ymm10,%ymm10,%ymm10
5047	vpalignr	$12,%ymm6,%ymm6,%ymm6
5048	vpaddd	%ymm5,%ymm1,%ymm1
5049	vpxor	%ymm1,%ymm13,%ymm13
5050	vpshufb	.rol16(%rip),%ymm13,%ymm13
5051	vpaddd	%ymm13,%ymm9,%ymm9
5052	vpxor	%ymm9,%ymm5,%ymm5
5053	vpsrld	$20,%ymm5,%ymm3
5054	vpslld	$12,%ymm5,%ymm5
5055	vpxor	%ymm3,%ymm5,%ymm5
5056	vpaddd	%ymm5,%ymm1,%ymm1
5057	vpxor	%ymm1,%ymm13,%ymm13
5058	vpshufb	.rol8(%rip),%ymm13,%ymm13
5059	vpaddd	%ymm13,%ymm9,%ymm9
5060	vpxor	%ymm9,%ymm5,%ymm5
5061	vpslld	$7,%ymm5,%ymm3
5062	vpsrld	$25,%ymm5,%ymm5
5063	vpxor	%ymm3,%ymm5,%ymm5
5064	vpalignr	$4,%ymm13,%ymm13,%ymm13
5065	vpalignr	$8,%ymm9,%ymm9,%ymm9
5066	vpalignr	$12,%ymm5,%ymm5,%ymm5
5067	vpaddd	%ymm4,%ymm0,%ymm0
5068	vpxor	%ymm0,%ymm12,%ymm12
5069	vpshufb	.rol16(%rip),%ymm12,%ymm12
5070	vpaddd	%ymm12,%ymm8,%ymm8
5071	vpxor	%ymm8,%ymm4,%ymm4
5072	vpsrld	$20,%ymm4,%ymm3
5073	vpslld	$12,%ymm4,%ymm4
5074	vpxor	%ymm3,%ymm4,%ymm4
5075	vpaddd	%ymm4,%ymm0,%ymm0
5076	vpxor	%ymm0,%ymm12,%ymm12
5077	vpshufb	.rol8(%rip),%ymm12,%ymm12
5078	vpaddd	%ymm12,%ymm8,%ymm8
5079	vpxor	%ymm8,%ymm4,%ymm4
5080	vpslld	$7,%ymm4,%ymm3
5081	vpsrld	$25,%ymm4,%ymm4
5082	vpxor	%ymm3,%ymm4,%ymm4
5083	vpalignr	$4,%ymm12,%ymm12,%ymm12
5084	vpalignr	$8,%ymm8,%ymm8,%ymm8
5085	vpalignr	$12,%ymm4,%ymm4,%ymm4
5086
5087	cmpq	%rcx,%r8
5088	jb	1b
5089	cmpq	$10,%r8
5090	jne	2b
5091	movq	%rbx,%r8
5092	subq	%rsi,%rbx
5093	movq	%rbx,%rcx
5094	movq	128(%rbp),%rbx
50951:
5096	addq	$16,%rcx
5097	cmpq	%rbx,%rcx
5098	jg	1f
5099	addq	0(%r8),%r10
5100	adcq	8+0(%r8),%r11
5101	adcq	$1,%r12
5102	movq	0+0(%rbp),%rdx
5103	movq	%rdx,%r15
5104	mulxq	%r10,%r13,%r14
5105	mulxq	%r11,%rax,%rdx
5106	imulq	%r12,%r15
5107	addq	%rax,%r14
5108	adcq	%rdx,%r15
5109	movq	8+0(%rbp),%rdx
5110	mulxq	%r10,%r10,%rax
5111	addq	%r10,%r14
5112	mulxq	%r11,%r11,%r9
5113	adcq	%r11,%r15
5114	adcq	$0,%r9
5115	imulq	%r12,%rdx
5116	addq	%rax,%r15
5117	adcq	%rdx,%r9
5118	movq	%r13,%r10
5119	movq	%r14,%r11
5120	movq	%r15,%r12
5121	andq	$3,%r12
5122	movq	%r15,%r13
5123	andq	$-4,%r13
5124	movq	%r9,%r14
5125	shrdq	$2,%r9,%r15
5126	shrq	$2,%r9
5127	addq	%r13,%r10
5128	adcq	%r14,%r11
5129	adcq	$0,%r12
5130	addq	%r15,%r10
5131	adcq	%r9,%r11
5132	adcq	$0,%r12
5133
5134	leaq	16(%r8),%r8
5135	jmp	1b
51361:
5137	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
5138	vpaddd	64(%rbp),%ymm6,%ymm6
5139	vpaddd	96(%rbp),%ymm10,%ymm10
5140	vpaddd	224(%rbp),%ymm14,%ymm14
5141	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
5142	vpaddd	64(%rbp),%ymm5,%ymm5
5143	vpaddd	96(%rbp),%ymm9,%ymm9
5144	vpaddd	192(%rbp),%ymm13,%ymm13
5145	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
5146	vpaddd	64(%rbp),%ymm4,%ymm4
5147	vpaddd	96(%rbp),%ymm8,%ymm8
5148	vpaddd	160(%rbp),%ymm12,%ymm12
5149	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
5150	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
5151	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
5152	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
5153	vpxor	0+0(%rsi),%ymm3,%ymm3
5154	vpxor	32+0(%rsi),%ymm2,%ymm2
5155	vpxor	64+0(%rsi),%ymm6,%ymm6
5156	vpxor	96+0(%rsi),%ymm10,%ymm10
5157	vmovdqu	%ymm3,0+0(%rdi)
5158	vmovdqu	%ymm2,32+0(%rdi)
5159	vmovdqu	%ymm6,64+0(%rdi)
5160	vmovdqu	%ymm10,96+0(%rdi)
5161	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
5162	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
5163	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
5164	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
5165	vpxor	0+128(%rsi),%ymm3,%ymm3
5166	vpxor	32+128(%rsi),%ymm1,%ymm1
5167	vpxor	64+128(%rsi),%ymm5,%ymm5
5168	vpxor	96+128(%rsi),%ymm9,%ymm9
5169	vmovdqu	%ymm3,0+128(%rdi)
5170	vmovdqu	%ymm1,32+128(%rdi)
5171	vmovdqu	%ymm5,64+128(%rdi)
5172	vmovdqu	%ymm9,96+128(%rdi)
5173	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
5174	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
5175	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
5176	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
5177	vmovdqa	%ymm3,%ymm8
5178
5179	leaq	256(%rsi),%rsi
5180	leaq	256(%rdi),%rdi
5181	subq	$256,%rbx
5182	jmp	open_avx2_tail_loop
51833:
5184	vmovdqa	.chacha20_consts(%rip),%ymm0
5185	vmovdqa	64(%rbp),%ymm4
5186	vmovdqa	96(%rbp),%ymm8
5187	vmovdqa	%ymm0,%ymm1
5188	vmovdqa	%ymm4,%ymm5
5189	vmovdqa	%ymm8,%ymm9
5190	vmovdqa	%ymm0,%ymm2
5191	vmovdqa	%ymm4,%ymm6
5192	vmovdqa	%ymm8,%ymm10
5193	vmovdqa	%ymm0,%ymm3
5194	vmovdqa	%ymm4,%ymm7
5195	vmovdqa	%ymm8,%ymm11
5196	vmovdqa	.avx2_inc(%rip),%ymm12
5197	vpaddd	160(%rbp),%ymm12,%ymm15
5198	vpaddd	%ymm15,%ymm12,%ymm14
5199	vpaddd	%ymm14,%ymm12,%ymm13
5200	vpaddd	%ymm13,%ymm12,%ymm12
5201	vmovdqa	%ymm15,256(%rbp)
5202	vmovdqa	%ymm14,224(%rbp)
5203	vmovdqa	%ymm13,192(%rbp)
5204	vmovdqa	%ymm12,160(%rbp)
5205
5206	xorq	%rcx,%rcx
5207	movq	%rsi,%r8
52081:
5209	addq	0(%r8),%r10
5210	adcq	8+0(%r8),%r11
5211	adcq	$1,%r12
5212	movq	0+0(%rbp),%rax
5213	movq	%rax,%r15
5214	mulq	%r10
5215	movq	%rax,%r13
5216	movq	%rdx,%r14
5217	movq	0+0(%rbp),%rax
5218	mulq	%r11
5219	imulq	%r12,%r15
5220	addq	%rax,%r14
5221	adcq	%rdx,%r15
5222	movq	8+0(%rbp),%rax
5223	movq	%rax,%r9
5224	mulq	%r10
5225	addq	%rax,%r14
5226	adcq	$0,%rdx
5227	movq	%rdx,%r10
5228	movq	8+0(%rbp),%rax
5229	mulq	%r11
5230	addq	%rax,%r15
5231	adcq	$0,%rdx
5232	imulq	%r12,%r9
5233	addq	%r10,%r15
5234	adcq	%rdx,%r9
5235	movq	%r13,%r10
5236	movq	%r14,%r11
5237	movq	%r15,%r12
5238	andq	$3,%r12
5239	movq	%r15,%r13
5240	andq	$-4,%r13
5241	movq	%r9,%r14
5242	shrdq	$2,%r9,%r15
5243	shrq	$2,%r9
5244	addq	%r13,%r10
5245	adcq	%r14,%r11
5246	adcq	$0,%r12
5247	addq	%r15,%r10
5248	adcq	%r9,%r11
5249	adcq	$0,%r12
5250
5251	leaq	16(%r8),%r8
52522:
5253	vmovdqa	%ymm8,128(%rbp)
5254	vmovdqa	.rol16(%rip),%ymm8
5255	vpaddd	%ymm7,%ymm3,%ymm3
5256	vpaddd	%ymm6,%ymm2,%ymm2
5257	vpaddd	%ymm5,%ymm1,%ymm1
5258	vpaddd	%ymm4,%ymm0,%ymm0
5259	vpxor	%ymm3,%ymm15,%ymm15
5260	vpxor	%ymm2,%ymm14,%ymm14
5261	vpxor	%ymm1,%ymm13,%ymm13
5262	vpxor	%ymm0,%ymm12,%ymm12
5263	vpshufb	%ymm8,%ymm15,%ymm15
5264	vpshufb	%ymm8,%ymm14,%ymm14
5265	vpshufb	%ymm8,%ymm13,%ymm13
5266	vpshufb	%ymm8,%ymm12,%ymm12
5267	vmovdqa	128(%rbp),%ymm8
5268	vpaddd	%ymm15,%ymm11,%ymm11
5269	vpaddd	%ymm14,%ymm10,%ymm10
5270	vpaddd	%ymm13,%ymm9,%ymm9
5271	vpaddd	%ymm12,%ymm8,%ymm8
5272	vpxor	%ymm11,%ymm7,%ymm7
5273	vpxor	%ymm10,%ymm6,%ymm6
5274	vpxor	%ymm9,%ymm5,%ymm5
5275	vpxor	%ymm8,%ymm4,%ymm4
5276	vmovdqa	%ymm8,128(%rbp)
5277	vpsrld	$20,%ymm7,%ymm8
5278	vpslld	$32-20,%ymm7,%ymm7
5279	vpxor	%ymm8,%ymm7,%ymm7
5280	vpsrld	$20,%ymm6,%ymm8
5281	vpslld	$32-20,%ymm6,%ymm6
5282	vpxor	%ymm8,%ymm6,%ymm6
5283	vpsrld	$20,%ymm5,%ymm8
5284	vpslld	$32-20,%ymm5,%ymm5
5285	vpxor	%ymm8,%ymm5,%ymm5
5286	vpsrld	$20,%ymm4,%ymm8
5287	vpslld	$32-20,%ymm4,%ymm4
5288	vpxor	%ymm8,%ymm4,%ymm4
5289	vmovdqa	.rol8(%rip),%ymm8
5290	addq	0(%r8),%r10
5291	adcq	8+0(%r8),%r11
5292	adcq	$1,%r12
5293	movq	0+0(%rbp),%rdx
5294	movq	%rdx,%r15
5295	mulxq	%r10,%r13,%r14
5296	mulxq	%r11,%rax,%rdx
5297	imulq	%r12,%r15
5298	addq	%rax,%r14
5299	adcq	%rdx,%r15
5300	movq	8+0(%rbp),%rdx
5301	mulxq	%r10,%r10,%rax
5302	addq	%r10,%r14
5303	mulxq	%r11,%r11,%r9
5304	adcq	%r11,%r15
5305	adcq	$0,%r9
5306	imulq	%r12,%rdx
5307	addq	%rax,%r15
5308	adcq	%rdx,%r9
5309	movq	%r13,%r10
5310	movq	%r14,%r11
5311	movq	%r15,%r12
5312	andq	$3,%r12
5313	movq	%r15,%r13
5314	andq	$-4,%r13
5315	movq	%r9,%r14
5316	shrdq	$2,%r9,%r15
5317	shrq	$2,%r9
5318	addq	%r13,%r10
5319	adcq	%r14,%r11
5320	adcq	$0,%r12
5321	addq	%r15,%r10
5322	adcq	%r9,%r11
5323	adcq	$0,%r12
5324	vpaddd	%ymm7,%ymm3,%ymm3
5325	vpaddd	%ymm6,%ymm2,%ymm2
5326	vpaddd	%ymm5,%ymm1,%ymm1
5327	vpaddd	%ymm4,%ymm0,%ymm0
5328	vpxor	%ymm3,%ymm15,%ymm15
5329	vpxor	%ymm2,%ymm14,%ymm14
5330	vpxor	%ymm1,%ymm13,%ymm13
5331	vpxor	%ymm0,%ymm12,%ymm12
5332	vpshufb	%ymm8,%ymm15,%ymm15
5333	vpshufb	%ymm8,%ymm14,%ymm14
5334	vpshufb	%ymm8,%ymm13,%ymm13
5335	vpshufb	%ymm8,%ymm12,%ymm12
5336	vmovdqa	128(%rbp),%ymm8
5337	vpaddd	%ymm15,%ymm11,%ymm11
5338	vpaddd	%ymm14,%ymm10,%ymm10
5339	vpaddd	%ymm13,%ymm9,%ymm9
5340	vpaddd	%ymm12,%ymm8,%ymm8
5341	vpxor	%ymm11,%ymm7,%ymm7
5342	vpxor	%ymm10,%ymm6,%ymm6
5343	vpxor	%ymm9,%ymm5,%ymm5
5344	vpxor	%ymm8,%ymm4,%ymm4
5345	vmovdqa	%ymm8,128(%rbp)
5346	vpsrld	$25,%ymm7,%ymm8
5347	vpslld	$32-25,%ymm7,%ymm7
5348	vpxor	%ymm8,%ymm7,%ymm7
5349	vpsrld	$25,%ymm6,%ymm8
5350	vpslld	$32-25,%ymm6,%ymm6
5351	vpxor	%ymm8,%ymm6,%ymm6
5352	vpsrld	$25,%ymm5,%ymm8
5353	vpslld	$32-25,%ymm5,%ymm5
5354	vpxor	%ymm8,%ymm5,%ymm5
5355	vpsrld	$25,%ymm4,%ymm8
5356	vpslld	$32-25,%ymm4,%ymm4
5357	vpxor	%ymm8,%ymm4,%ymm4
5358	vmovdqa	128(%rbp),%ymm8
5359	vpalignr	$4,%ymm7,%ymm7,%ymm7
5360	vpalignr	$8,%ymm11,%ymm11,%ymm11
5361	vpalignr	$12,%ymm15,%ymm15,%ymm15
5362	vpalignr	$4,%ymm6,%ymm6,%ymm6
5363	vpalignr	$8,%ymm10,%ymm10,%ymm10
5364	vpalignr	$12,%ymm14,%ymm14,%ymm14
5365	vpalignr	$4,%ymm5,%ymm5,%ymm5
5366	vpalignr	$8,%ymm9,%ymm9,%ymm9
5367	vpalignr	$12,%ymm13,%ymm13,%ymm13
5368	vpalignr	$4,%ymm4,%ymm4,%ymm4
5369	vpalignr	$8,%ymm8,%ymm8,%ymm8
5370	vpalignr	$12,%ymm12,%ymm12,%ymm12
5371	vmovdqa	%ymm8,128(%rbp)
5372	addq	16(%r8),%r10
5373	adcq	8+16(%r8),%r11
5374	adcq	$1,%r12
5375	movq	0+0(%rbp),%rdx
5376	movq	%rdx,%r15
5377	mulxq	%r10,%r13,%r14
5378	mulxq	%r11,%rax,%rdx
5379	imulq	%r12,%r15
5380	addq	%rax,%r14
5381	adcq	%rdx,%r15
5382	movq	8+0(%rbp),%rdx
5383	mulxq	%r10,%r10,%rax
5384	addq	%r10,%r14
5385	mulxq	%r11,%r11,%r9
5386	adcq	%r11,%r15
5387	adcq	$0,%r9
5388	imulq	%r12,%rdx
5389	addq	%rax,%r15
5390	adcq	%rdx,%r9
5391	movq	%r13,%r10
5392	movq	%r14,%r11
5393	movq	%r15,%r12
5394	andq	$3,%r12
5395	movq	%r15,%r13
5396	andq	$-4,%r13
5397	movq	%r9,%r14
5398	shrdq	$2,%r9,%r15
5399	shrq	$2,%r9
5400	addq	%r13,%r10
5401	adcq	%r14,%r11
5402	adcq	$0,%r12
5403	addq	%r15,%r10
5404	adcq	%r9,%r11
5405	adcq	$0,%r12
5406
5407	leaq	32(%r8),%r8
5408	vmovdqa	.rol16(%rip),%ymm8
5409	vpaddd	%ymm7,%ymm3,%ymm3
5410	vpaddd	%ymm6,%ymm2,%ymm2
5411	vpaddd	%ymm5,%ymm1,%ymm1
5412	vpaddd	%ymm4,%ymm0,%ymm0
5413	vpxor	%ymm3,%ymm15,%ymm15
5414	vpxor	%ymm2,%ymm14,%ymm14
5415	vpxor	%ymm1,%ymm13,%ymm13
5416	vpxor	%ymm0,%ymm12,%ymm12
5417	vpshufb	%ymm8,%ymm15,%ymm15
5418	vpshufb	%ymm8,%ymm14,%ymm14
5419	vpshufb	%ymm8,%ymm13,%ymm13
5420	vpshufb	%ymm8,%ymm12,%ymm12
5421	vmovdqa	128(%rbp),%ymm8
5422	vpaddd	%ymm15,%ymm11,%ymm11
5423	vpaddd	%ymm14,%ymm10,%ymm10
5424	vpaddd	%ymm13,%ymm9,%ymm9
5425	vpaddd	%ymm12,%ymm8,%ymm8
5426	vpxor	%ymm11,%ymm7,%ymm7
5427	vpxor	%ymm10,%ymm6,%ymm6
5428	vpxor	%ymm9,%ymm5,%ymm5
5429	vpxor	%ymm8,%ymm4,%ymm4
5430	vmovdqa	%ymm8,128(%rbp)
5431	vpsrld	$20,%ymm7,%ymm8
5432	vpslld	$32-20,%ymm7,%ymm7
5433	vpxor	%ymm8,%ymm7,%ymm7
5434	vpsrld	$20,%ymm6,%ymm8
5435	vpslld	$32-20,%ymm6,%ymm6
5436	vpxor	%ymm8,%ymm6,%ymm6
5437	vpsrld	$20,%ymm5,%ymm8
5438	vpslld	$32-20,%ymm5,%ymm5
5439	vpxor	%ymm8,%ymm5,%ymm5
5440	vpsrld	$20,%ymm4,%ymm8
5441	vpslld	$32-20,%ymm4,%ymm4
5442	vpxor	%ymm8,%ymm4,%ymm4
5443	vmovdqa	.rol8(%rip),%ymm8
5444	vpaddd	%ymm7,%ymm3,%ymm3
5445	vpaddd	%ymm6,%ymm2,%ymm2
5446	vpaddd	%ymm5,%ymm1,%ymm1
5447	vpaddd	%ymm4,%ymm0,%ymm0
5448	vpxor	%ymm3,%ymm15,%ymm15
5449	vpxor	%ymm2,%ymm14,%ymm14
5450	vpxor	%ymm1,%ymm13,%ymm13
5451	vpxor	%ymm0,%ymm12,%ymm12
5452	vpshufb	%ymm8,%ymm15,%ymm15
5453	vpshufb	%ymm8,%ymm14,%ymm14
5454	vpshufb	%ymm8,%ymm13,%ymm13
5455	vpshufb	%ymm8,%ymm12,%ymm12
5456	vmovdqa	128(%rbp),%ymm8
5457	vpaddd	%ymm15,%ymm11,%ymm11
5458	vpaddd	%ymm14,%ymm10,%ymm10
5459	vpaddd	%ymm13,%ymm9,%ymm9
5460	vpaddd	%ymm12,%ymm8,%ymm8
5461	vpxor	%ymm11,%ymm7,%ymm7
5462	vpxor	%ymm10,%ymm6,%ymm6
5463	vpxor	%ymm9,%ymm5,%ymm5
5464	vpxor	%ymm8,%ymm4,%ymm4
5465	vmovdqa	%ymm8,128(%rbp)
5466	vpsrld	$25,%ymm7,%ymm8
5467	vpslld	$32-25,%ymm7,%ymm7
5468	vpxor	%ymm8,%ymm7,%ymm7
5469	vpsrld	$25,%ymm6,%ymm8
5470	vpslld	$32-25,%ymm6,%ymm6
5471	vpxor	%ymm8,%ymm6,%ymm6
5472	vpsrld	$25,%ymm5,%ymm8
5473	vpslld	$32-25,%ymm5,%ymm5
5474	vpxor	%ymm8,%ymm5,%ymm5
5475	vpsrld	$25,%ymm4,%ymm8
5476	vpslld	$32-25,%ymm4,%ymm4
5477	vpxor	%ymm8,%ymm4,%ymm4
5478	vmovdqa	128(%rbp),%ymm8
5479	vpalignr	$12,%ymm7,%ymm7,%ymm7
5480	vpalignr	$8,%ymm11,%ymm11,%ymm11
5481	vpalignr	$4,%ymm15,%ymm15,%ymm15
5482	vpalignr	$12,%ymm6,%ymm6,%ymm6
5483	vpalignr	$8,%ymm10,%ymm10,%ymm10
5484	vpalignr	$4,%ymm14,%ymm14,%ymm14
5485	vpalignr	$12,%ymm5,%ymm5,%ymm5
5486	vpalignr	$8,%ymm9,%ymm9,%ymm9
5487	vpalignr	$4,%ymm13,%ymm13,%ymm13
5488	vpalignr	$12,%ymm4,%ymm4,%ymm4
5489	vpalignr	$8,%ymm8,%ymm8,%ymm8
5490	vpalignr	$4,%ymm12,%ymm12,%ymm12
5491
5492	incq	%rcx
5493	cmpq	$4,%rcx
5494	jl	1b
5495	cmpq	$10,%rcx
5496	jne	2b
5497	movq	%rbx,%rcx
5498	subq	$384,%rcx
5499	andq	$-16,%rcx
55001:
5501	testq	%rcx,%rcx
5502	je	1f
5503	addq	0(%r8),%r10
5504	adcq	8+0(%r8),%r11
5505	adcq	$1,%r12
5506	movq	0+0(%rbp),%rdx
5507	movq	%rdx,%r15
5508	mulxq	%r10,%r13,%r14
5509	mulxq	%r11,%rax,%rdx
5510	imulq	%r12,%r15
5511	addq	%rax,%r14
5512	adcq	%rdx,%r15
5513	movq	8+0(%rbp),%rdx
5514	mulxq	%r10,%r10,%rax
5515	addq	%r10,%r14
5516	mulxq	%r11,%r11,%r9
5517	adcq	%r11,%r15
5518	adcq	$0,%r9
5519	imulq	%r12,%rdx
5520	addq	%rax,%r15
5521	adcq	%rdx,%r9
5522	movq	%r13,%r10
5523	movq	%r14,%r11
5524	movq	%r15,%r12
5525	andq	$3,%r12
5526	movq	%r15,%r13
5527	andq	$-4,%r13
5528	movq	%r9,%r14
5529	shrdq	$2,%r9,%r15
5530	shrq	$2,%r9
5531	addq	%r13,%r10
5532	adcq	%r14,%r11
5533	adcq	$0,%r12
5534	addq	%r15,%r10
5535	adcq	%r9,%r11
5536	adcq	$0,%r12
5537
5538	leaq	16(%r8),%r8
5539	subq	$16,%rcx
5540	jmp	1b
55411:
5542	vpaddd	.chacha20_consts(%rip),%ymm3,%ymm3
5543	vpaddd	64(%rbp),%ymm7,%ymm7
5544	vpaddd	96(%rbp),%ymm11,%ymm11
5545	vpaddd	256(%rbp),%ymm15,%ymm15
5546	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
5547	vpaddd	64(%rbp),%ymm6,%ymm6
5548	vpaddd	96(%rbp),%ymm10,%ymm10
5549	vpaddd	224(%rbp),%ymm14,%ymm14
5550	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
5551	vpaddd	64(%rbp),%ymm5,%ymm5
5552	vpaddd	96(%rbp),%ymm9,%ymm9
5553	vpaddd	192(%rbp),%ymm13,%ymm13
5554	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
5555	vpaddd	64(%rbp),%ymm4,%ymm4
5556	vpaddd	96(%rbp),%ymm8,%ymm8
5557	vpaddd	160(%rbp),%ymm12,%ymm12
5558
5559	vmovdqa	%ymm0,128(%rbp)
5560	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
5561	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
5562	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
5563	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
5564	vpxor	0+0(%rsi),%ymm0,%ymm0
5565	vpxor	32+0(%rsi),%ymm3,%ymm3
5566	vpxor	64+0(%rsi),%ymm7,%ymm7
5567	vpxor	96+0(%rsi),%ymm11,%ymm11
5568	vmovdqu	%ymm0,0+0(%rdi)
5569	vmovdqu	%ymm3,32+0(%rdi)
5570	vmovdqu	%ymm7,64+0(%rdi)
5571	vmovdqu	%ymm11,96+0(%rdi)
5572
5573	vmovdqa	128(%rbp),%ymm0
5574	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
5575	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
5576	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
5577	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
5578	vpxor	0+128(%rsi),%ymm3,%ymm3
5579	vpxor	32+128(%rsi),%ymm2,%ymm2
5580	vpxor	64+128(%rsi),%ymm6,%ymm6
5581	vpxor	96+128(%rsi),%ymm10,%ymm10
5582	vmovdqu	%ymm3,0+128(%rdi)
5583	vmovdqu	%ymm2,32+128(%rdi)
5584	vmovdqu	%ymm6,64+128(%rdi)
5585	vmovdqu	%ymm10,96+128(%rdi)
5586	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
5587	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
5588	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
5589	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
5590	vpxor	0+256(%rsi),%ymm3,%ymm3
5591	vpxor	32+256(%rsi),%ymm1,%ymm1
5592	vpxor	64+256(%rsi),%ymm5,%ymm5
5593	vpxor	96+256(%rsi),%ymm9,%ymm9
5594	vmovdqu	%ymm3,0+256(%rdi)
5595	vmovdqu	%ymm1,32+256(%rdi)
5596	vmovdqu	%ymm5,64+256(%rdi)
5597	vmovdqu	%ymm9,96+256(%rdi)
5598	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
5599	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
5600	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
5601	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
5602	vmovdqa	%ymm3,%ymm8
5603
5604	leaq	384(%rsi),%rsi
5605	leaq	384(%rdi),%rdi
5606	subq	$384,%rbx
5607open_avx2_tail_loop:
5608	cmpq	$32,%rbx
5609	jb	open_avx2_tail
5610	subq	$32,%rbx
5611	vpxor	(%rsi),%ymm0,%ymm0
5612	vmovdqu	%ymm0,(%rdi)
5613	leaq	32(%rsi),%rsi
5614	leaq	32(%rdi),%rdi
5615	vmovdqa	%ymm4,%ymm0
5616	vmovdqa	%ymm8,%ymm4
5617	vmovdqa	%ymm12,%ymm8
5618	jmp	open_avx2_tail_loop
5619open_avx2_tail:
5620	cmpq	$16,%rbx
5621	vmovdqa	%xmm0,%xmm1
5622	jb	1f
5623	subq	$16,%rbx
5624
5625	vpxor	(%rsi),%xmm0,%xmm1
5626	vmovdqu	%xmm1,(%rdi)
5627	leaq	16(%rsi),%rsi
5628	leaq	16(%rdi),%rdi
5629	vperm2i128	$0x11,%ymm0,%ymm0,%ymm0
5630	vmovdqa	%xmm0,%xmm1
56311:
5632	vzeroupper
5633	jmp	open_sse_tail_16
5634
5635open_avx2_192:
5636	vmovdqa	%ymm0,%ymm1
5637	vmovdqa	%ymm0,%ymm2
5638	vmovdqa	%ymm4,%ymm5
5639	vmovdqa	%ymm4,%ymm6
5640	vmovdqa	%ymm8,%ymm9
5641	vmovdqa	%ymm8,%ymm10
5642	vpaddd	.avx2_inc(%rip),%ymm12,%ymm13
5643	vmovdqa	%ymm12,%ymm11
5644	vmovdqa	%ymm13,%ymm15
5645	movq	$10,%r10
56461:
5647	vpaddd	%ymm4,%ymm0,%ymm0
5648	vpxor	%ymm0,%ymm12,%ymm12
5649	vpshufb	.rol16(%rip),%ymm12,%ymm12
5650	vpaddd	%ymm12,%ymm8,%ymm8
5651	vpxor	%ymm8,%ymm4,%ymm4
5652	vpsrld	$20,%ymm4,%ymm3
5653	vpslld	$12,%ymm4,%ymm4
5654	vpxor	%ymm3,%ymm4,%ymm4
5655	vpaddd	%ymm4,%ymm0,%ymm0
5656	vpxor	%ymm0,%ymm12,%ymm12
5657	vpshufb	.rol8(%rip),%ymm12,%ymm12
5658	vpaddd	%ymm12,%ymm8,%ymm8
5659	vpxor	%ymm8,%ymm4,%ymm4
5660	vpslld	$7,%ymm4,%ymm3
5661	vpsrld	$25,%ymm4,%ymm4
5662	vpxor	%ymm3,%ymm4,%ymm4
5663	vpalignr	$12,%ymm12,%ymm12,%ymm12
5664	vpalignr	$8,%ymm8,%ymm8,%ymm8
5665	vpalignr	$4,%ymm4,%ymm4,%ymm4
5666	vpaddd	%ymm5,%ymm1,%ymm1
5667	vpxor	%ymm1,%ymm13,%ymm13
5668	vpshufb	.rol16(%rip),%ymm13,%ymm13
5669	vpaddd	%ymm13,%ymm9,%ymm9
5670	vpxor	%ymm9,%ymm5,%ymm5
5671	vpsrld	$20,%ymm5,%ymm3
5672	vpslld	$12,%ymm5,%ymm5
5673	vpxor	%ymm3,%ymm5,%ymm5
5674	vpaddd	%ymm5,%ymm1,%ymm1
5675	vpxor	%ymm1,%ymm13,%ymm13
5676	vpshufb	.rol8(%rip),%ymm13,%ymm13
5677	vpaddd	%ymm13,%ymm9,%ymm9
5678	vpxor	%ymm9,%ymm5,%ymm5
5679	vpslld	$7,%ymm5,%ymm3
5680	vpsrld	$25,%ymm5,%ymm5
5681	vpxor	%ymm3,%ymm5,%ymm5
5682	vpalignr	$12,%ymm13,%ymm13,%ymm13
5683	vpalignr	$8,%ymm9,%ymm9,%ymm9
5684	vpalignr	$4,%ymm5,%ymm5,%ymm5
5685	vpaddd	%ymm4,%ymm0,%ymm0
5686	vpxor	%ymm0,%ymm12,%ymm12
5687	vpshufb	.rol16(%rip),%ymm12,%ymm12
5688	vpaddd	%ymm12,%ymm8,%ymm8
5689	vpxor	%ymm8,%ymm4,%ymm4
5690	vpsrld	$20,%ymm4,%ymm3
5691	vpslld	$12,%ymm4,%ymm4
5692	vpxor	%ymm3,%ymm4,%ymm4
5693	vpaddd	%ymm4,%ymm0,%ymm0
5694	vpxor	%ymm0,%ymm12,%ymm12
5695	vpshufb	.rol8(%rip),%ymm12,%ymm12
5696	vpaddd	%ymm12,%ymm8,%ymm8
5697	vpxor	%ymm8,%ymm4,%ymm4
5698	vpslld	$7,%ymm4,%ymm3
5699	vpsrld	$25,%ymm4,%ymm4
5700	vpxor	%ymm3,%ymm4,%ymm4
5701	vpalignr	$4,%ymm12,%ymm12,%ymm12
5702	vpalignr	$8,%ymm8,%ymm8,%ymm8
5703	vpalignr	$12,%ymm4,%ymm4,%ymm4
5704	vpaddd	%ymm5,%ymm1,%ymm1
5705	vpxor	%ymm1,%ymm13,%ymm13
5706	vpshufb	.rol16(%rip),%ymm13,%ymm13
5707	vpaddd	%ymm13,%ymm9,%ymm9
5708	vpxor	%ymm9,%ymm5,%ymm5
5709	vpsrld	$20,%ymm5,%ymm3
5710	vpslld	$12,%ymm5,%ymm5
5711	vpxor	%ymm3,%ymm5,%ymm5
5712	vpaddd	%ymm5,%ymm1,%ymm1
5713	vpxor	%ymm1,%ymm13,%ymm13
5714	vpshufb	.rol8(%rip),%ymm13,%ymm13
5715	vpaddd	%ymm13,%ymm9,%ymm9
5716	vpxor	%ymm9,%ymm5,%ymm5
5717	vpslld	$7,%ymm5,%ymm3
5718	vpsrld	$25,%ymm5,%ymm5
5719	vpxor	%ymm3,%ymm5,%ymm5
5720	vpalignr	$4,%ymm13,%ymm13,%ymm13
5721	vpalignr	$8,%ymm9,%ymm9,%ymm9
5722	vpalignr	$12,%ymm5,%ymm5,%ymm5
5723
5724	decq	%r10
5725	jne	1b
5726	vpaddd	%ymm2,%ymm0,%ymm0
5727	vpaddd	%ymm2,%ymm1,%ymm1
5728	vpaddd	%ymm6,%ymm4,%ymm4
5729	vpaddd	%ymm6,%ymm5,%ymm5
5730	vpaddd	%ymm10,%ymm8,%ymm8
5731	vpaddd	%ymm10,%ymm9,%ymm9
5732	vpaddd	%ymm11,%ymm12,%ymm12
5733	vpaddd	%ymm15,%ymm13,%ymm13
5734	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
5735
5736	vpand	.clamp(%rip),%ymm3,%ymm3
5737	vmovdqa	%ymm3,0(%rbp)
5738
5739	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
5740	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
5741	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
5742	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
5743	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
5744	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
5745open_avx2_short:
5746	movq	%r8,%r8
5747	call	poly_hash_ad_internal
5748open_avx2_hash_and_xor_loop:
5749	cmpq	$32,%rbx
5750	jb	open_avx2_short_tail_32
5751	subq	$32,%rbx
5752	addq	0(%rsi),%r10
5753	adcq	8+0(%rsi),%r11
5754	adcq	$1,%r12
5755	movq	0+0(%rbp),%rax
5756	movq	%rax,%r15
5757	mulq	%r10
5758	movq	%rax,%r13
5759	movq	%rdx,%r14
5760	movq	0+0(%rbp),%rax
5761	mulq	%r11
5762	imulq	%r12,%r15
5763	addq	%rax,%r14
5764	adcq	%rdx,%r15
5765	movq	8+0(%rbp),%rax
5766	movq	%rax,%r9
5767	mulq	%r10
5768	addq	%rax,%r14
5769	adcq	$0,%rdx
5770	movq	%rdx,%r10
5771	movq	8+0(%rbp),%rax
5772	mulq	%r11
5773	addq	%rax,%r15
5774	adcq	$0,%rdx
5775	imulq	%r12,%r9
5776	addq	%r10,%r15
5777	adcq	%rdx,%r9
5778	movq	%r13,%r10
5779	movq	%r14,%r11
5780	movq	%r15,%r12
5781	andq	$3,%r12
5782	movq	%r15,%r13
5783	andq	$-4,%r13
5784	movq	%r9,%r14
5785	shrdq	$2,%r9,%r15
5786	shrq	$2,%r9
5787	addq	%r13,%r10
5788	adcq	%r14,%r11
5789	adcq	$0,%r12
5790	addq	%r15,%r10
5791	adcq	%r9,%r11
5792	adcq	$0,%r12
5793	addq	16(%rsi),%r10
5794	adcq	8+16(%rsi),%r11
5795	adcq	$1,%r12
5796	movq	0+0(%rbp),%rax
5797	movq	%rax,%r15
5798	mulq	%r10
5799	movq	%rax,%r13
5800	movq	%rdx,%r14
5801	movq	0+0(%rbp),%rax
5802	mulq	%r11
5803	imulq	%r12,%r15
5804	addq	%rax,%r14
5805	adcq	%rdx,%r15
5806	movq	8+0(%rbp),%rax
5807	movq	%rax,%r9
5808	mulq	%r10
5809	addq	%rax,%r14
5810	adcq	$0,%rdx
5811	movq	%rdx,%r10
5812	movq	8+0(%rbp),%rax
5813	mulq	%r11
5814	addq	%rax,%r15
5815	adcq	$0,%rdx
5816	imulq	%r12,%r9
5817	addq	%r10,%r15
5818	adcq	%rdx,%r9
5819	movq	%r13,%r10
5820	movq	%r14,%r11
5821	movq	%r15,%r12
5822	andq	$3,%r12
5823	movq	%r15,%r13
5824	andq	$-4,%r13
5825	movq	%r9,%r14
5826	shrdq	$2,%r9,%r15
5827	shrq	$2,%r9
5828	addq	%r13,%r10
5829	adcq	%r14,%r11
5830	adcq	$0,%r12
5831	addq	%r15,%r10
5832	adcq	%r9,%r11
5833	adcq	$0,%r12
5834
5835
5836	vpxor	(%rsi),%ymm0,%ymm0
5837	vmovdqu	%ymm0,(%rdi)
5838	leaq	32(%rsi),%rsi
5839	leaq	32(%rdi),%rdi
5840
5841	vmovdqa	%ymm4,%ymm0
5842	vmovdqa	%ymm8,%ymm4
5843	vmovdqa	%ymm12,%ymm8
5844	vmovdqa	%ymm1,%ymm12
5845	vmovdqa	%ymm5,%ymm1
5846	vmovdqa	%ymm9,%ymm5
5847	vmovdqa	%ymm13,%ymm9
5848	vmovdqa	%ymm2,%ymm13
5849	vmovdqa	%ymm6,%ymm2
5850	jmp	open_avx2_hash_and_xor_loop
5851open_avx2_short_tail_32:
5852	cmpq	$16,%rbx
5853	vmovdqa	%xmm0,%xmm1
5854	jb	1f
5855	subq	$16,%rbx
5856	addq	0(%rsi),%r10
5857	adcq	8+0(%rsi),%r11
5858	adcq	$1,%r12
5859	movq	0+0(%rbp),%rax
5860	movq	%rax,%r15
5861	mulq	%r10
5862	movq	%rax,%r13
5863	movq	%rdx,%r14
5864	movq	0+0(%rbp),%rax
5865	mulq	%r11
5866	imulq	%r12,%r15
5867	addq	%rax,%r14
5868	adcq	%rdx,%r15
5869	movq	8+0(%rbp),%rax
5870	movq	%rax,%r9
5871	mulq	%r10
5872	addq	%rax,%r14
5873	adcq	$0,%rdx
5874	movq	%rdx,%r10
5875	movq	8+0(%rbp),%rax
5876	mulq	%r11
5877	addq	%rax,%r15
5878	adcq	$0,%rdx
5879	imulq	%r12,%r9
5880	addq	%r10,%r15
5881	adcq	%rdx,%r9
5882	movq	%r13,%r10
5883	movq	%r14,%r11
5884	movq	%r15,%r12
5885	andq	$3,%r12
5886	movq	%r15,%r13
5887	andq	$-4,%r13
5888	movq	%r9,%r14
5889	shrdq	$2,%r9,%r15
5890	shrq	$2,%r9
5891	addq	%r13,%r10
5892	adcq	%r14,%r11
5893	adcq	$0,%r12
5894	addq	%r15,%r10
5895	adcq	%r9,%r11
5896	adcq	$0,%r12
5897
5898	vpxor	(%rsi),%xmm0,%xmm3
5899	vmovdqu	%xmm3,(%rdi)
5900	leaq	16(%rsi),%rsi
5901	leaq	16(%rdi),%rdi
5902	vextracti128	$1,%ymm0,%xmm1
59031:
5904	vzeroupper
5905	jmp	open_sse_tail_16
5906
5907open_avx2_320:
5908	vmovdqa	%ymm0,%ymm1
5909	vmovdqa	%ymm0,%ymm2
5910	vmovdqa	%ymm4,%ymm5
5911	vmovdqa	%ymm4,%ymm6
5912	vmovdqa	%ymm8,%ymm9
5913	vmovdqa	%ymm8,%ymm10
5914	vpaddd	.avx2_inc(%rip),%ymm12,%ymm13
5915	vpaddd	.avx2_inc(%rip),%ymm13,%ymm14
5916	vmovdqa	%ymm4,%ymm7
5917	vmovdqa	%ymm8,%ymm11
5918	vmovdqa	%ymm12,160(%rbp)
5919	vmovdqa	%ymm13,192(%rbp)
5920	vmovdqa	%ymm14,224(%rbp)
5921	movq	$10,%r10
59221:
5923	vpaddd	%ymm4,%ymm0,%ymm0
5924	vpxor	%ymm0,%ymm12,%ymm12
5925	vpshufb	.rol16(%rip),%ymm12,%ymm12
5926	vpaddd	%ymm12,%ymm8,%ymm8
5927	vpxor	%ymm8,%ymm4,%ymm4
5928	vpsrld	$20,%ymm4,%ymm3
5929	vpslld	$12,%ymm4,%ymm4
5930	vpxor	%ymm3,%ymm4,%ymm4
5931	vpaddd	%ymm4,%ymm0,%ymm0
5932	vpxor	%ymm0,%ymm12,%ymm12
5933	vpshufb	.rol8(%rip),%ymm12,%ymm12
5934	vpaddd	%ymm12,%ymm8,%ymm8
5935	vpxor	%ymm8,%ymm4,%ymm4
5936	vpslld	$7,%ymm4,%ymm3
5937	vpsrld	$25,%ymm4,%ymm4
5938	vpxor	%ymm3,%ymm4,%ymm4
5939	vpalignr	$12,%ymm12,%ymm12,%ymm12
5940	vpalignr	$8,%ymm8,%ymm8,%ymm8
5941	vpalignr	$4,%ymm4,%ymm4,%ymm4
5942	vpaddd	%ymm5,%ymm1,%ymm1
5943	vpxor	%ymm1,%ymm13,%ymm13
5944	vpshufb	.rol16(%rip),%ymm13,%ymm13
5945	vpaddd	%ymm13,%ymm9,%ymm9
5946	vpxor	%ymm9,%ymm5,%ymm5
5947	vpsrld	$20,%ymm5,%ymm3
5948	vpslld	$12,%ymm5,%ymm5
5949	vpxor	%ymm3,%ymm5,%ymm5
5950	vpaddd	%ymm5,%ymm1,%ymm1
5951	vpxor	%ymm1,%ymm13,%ymm13
5952	vpshufb	.rol8(%rip),%ymm13,%ymm13
5953	vpaddd	%ymm13,%ymm9,%ymm9
5954	vpxor	%ymm9,%ymm5,%ymm5
5955	vpslld	$7,%ymm5,%ymm3
5956	vpsrld	$25,%ymm5,%ymm5
5957	vpxor	%ymm3,%ymm5,%ymm5
5958	vpalignr	$12,%ymm13,%ymm13,%ymm13
5959	vpalignr	$8,%ymm9,%ymm9,%ymm9
5960	vpalignr	$4,%ymm5,%ymm5,%ymm5
5961	vpaddd	%ymm6,%ymm2,%ymm2
5962	vpxor	%ymm2,%ymm14,%ymm14
5963	vpshufb	.rol16(%rip),%ymm14,%ymm14
5964	vpaddd	%ymm14,%ymm10,%ymm10
5965	vpxor	%ymm10,%ymm6,%ymm6
5966	vpsrld	$20,%ymm6,%ymm3
5967	vpslld	$12,%ymm6,%ymm6
5968	vpxor	%ymm3,%ymm6,%ymm6
5969	vpaddd	%ymm6,%ymm2,%ymm2
5970	vpxor	%ymm2,%ymm14,%ymm14
5971	vpshufb	.rol8(%rip),%ymm14,%ymm14
5972	vpaddd	%ymm14,%ymm10,%ymm10
5973	vpxor	%ymm10,%ymm6,%ymm6
5974	vpslld	$7,%ymm6,%ymm3
5975	vpsrld	$25,%ymm6,%ymm6
5976	vpxor	%ymm3,%ymm6,%ymm6
5977	vpalignr	$12,%ymm14,%ymm14,%ymm14
5978	vpalignr	$8,%ymm10,%ymm10,%ymm10
5979	vpalignr	$4,%ymm6,%ymm6,%ymm6
5980	vpaddd	%ymm4,%ymm0,%ymm0
5981	vpxor	%ymm0,%ymm12,%ymm12
5982	vpshufb	.rol16(%rip),%ymm12,%ymm12
5983	vpaddd	%ymm12,%ymm8,%ymm8
5984	vpxor	%ymm8,%ymm4,%ymm4
5985	vpsrld	$20,%ymm4,%ymm3
5986	vpslld	$12,%ymm4,%ymm4
5987	vpxor	%ymm3,%ymm4,%ymm4
5988	vpaddd	%ymm4,%ymm0,%ymm0
5989	vpxor	%ymm0,%ymm12,%ymm12
5990	vpshufb	.rol8(%rip),%ymm12,%ymm12
5991	vpaddd	%ymm12,%ymm8,%ymm8
5992	vpxor	%ymm8,%ymm4,%ymm4
5993	vpslld	$7,%ymm4,%ymm3
5994	vpsrld	$25,%ymm4,%ymm4
5995	vpxor	%ymm3,%ymm4,%ymm4
5996	vpalignr	$4,%ymm12,%ymm12,%ymm12
5997	vpalignr	$8,%ymm8,%ymm8,%ymm8
5998	vpalignr	$12,%ymm4,%ymm4,%ymm4
5999	vpaddd	%ymm5,%ymm1,%ymm1
6000	vpxor	%ymm1,%ymm13,%ymm13
6001	vpshufb	.rol16(%rip),%ymm13,%ymm13
6002	vpaddd	%ymm13,%ymm9,%ymm9
6003	vpxor	%ymm9,%ymm5,%ymm5
6004	vpsrld	$20,%ymm5,%ymm3
6005	vpslld	$12,%ymm5,%ymm5
6006	vpxor	%ymm3,%ymm5,%ymm5
6007	vpaddd	%ymm5,%ymm1,%ymm1
6008	vpxor	%ymm1,%ymm13,%ymm13
6009	vpshufb	.rol8(%rip),%ymm13,%ymm13
6010	vpaddd	%ymm13,%ymm9,%ymm9
6011	vpxor	%ymm9,%ymm5,%ymm5
6012	vpslld	$7,%ymm5,%ymm3
6013	vpsrld	$25,%ymm5,%ymm5
6014	vpxor	%ymm3,%ymm5,%ymm5
6015	vpalignr	$4,%ymm13,%ymm13,%ymm13
6016	vpalignr	$8,%ymm9,%ymm9,%ymm9
6017	vpalignr	$12,%ymm5,%ymm5,%ymm5
6018	vpaddd	%ymm6,%ymm2,%ymm2
6019	vpxor	%ymm2,%ymm14,%ymm14
6020	vpshufb	.rol16(%rip),%ymm14,%ymm14
6021	vpaddd	%ymm14,%ymm10,%ymm10
6022	vpxor	%ymm10,%ymm6,%ymm6
6023	vpsrld	$20,%ymm6,%ymm3
6024	vpslld	$12,%ymm6,%ymm6
6025	vpxor	%ymm3,%ymm6,%ymm6
6026	vpaddd	%ymm6,%ymm2,%ymm2
6027	vpxor	%ymm2,%ymm14,%ymm14
6028	vpshufb	.rol8(%rip),%ymm14,%ymm14
6029	vpaddd	%ymm14,%ymm10,%ymm10
6030	vpxor	%ymm10,%ymm6,%ymm6
6031	vpslld	$7,%ymm6,%ymm3
6032	vpsrld	$25,%ymm6,%ymm6
6033	vpxor	%ymm3,%ymm6,%ymm6
6034	vpalignr	$4,%ymm14,%ymm14,%ymm14
6035	vpalignr	$8,%ymm10,%ymm10,%ymm10
6036	vpalignr	$12,%ymm6,%ymm6,%ymm6
6037
6038	decq	%r10
6039	jne	1b
6040	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
6041	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
6042	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
6043	vpaddd	%ymm7,%ymm4,%ymm4
6044	vpaddd	%ymm7,%ymm5,%ymm5
6045	vpaddd	%ymm7,%ymm6,%ymm6
6046	vpaddd	%ymm11,%ymm8,%ymm8
6047	vpaddd	%ymm11,%ymm9,%ymm9
6048	vpaddd	%ymm11,%ymm10,%ymm10
6049	vpaddd	160(%rbp),%ymm12,%ymm12
6050	vpaddd	192(%rbp),%ymm13,%ymm13
6051	vpaddd	224(%rbp),%ymm14,%ymm14
6052	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
6053
6054	vpand	.clamp(%rip),%ymm3,%ymm3
6055	vmovdqa	%ymm3,0(%rbp)
6056
6057	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
6058	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
6059	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
6060	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
6061	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
6062	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
6063	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
6064	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
6065	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
6066	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
6067	jmp	open_avx2_short
6068.size	chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
6069
6070
6071.type	chacha20_poly1305_seal_avx2,@function
6072.align	64
6073chacha20_poly1305_seal_avx2:
6074	vzeroupper
6075	vmovdqa	.chacha20_consts(%rip),%ymm0
6076	vbroadcasti128	0(%r9),%ymm4
6077	vbroadcasti128	16(%r9),%ymm8
6078	vbroadcasti128	32(%r9),%ymm12
6079	vpaddd	.avx2_init(%rip),%ymm12,%ymm12
6080	cmpq	$192,%rbx
6081	jbe	seal_avx2_192
6082	cmpq	$320,%rbx
6083	jbe	seal_avx2_320
6084	vmovdqa	%ymm0,%ymm1
6085	vmovdqa	%ymm0,%ymm2
6086	vmovdqa	%ymm0,%ymm3
6087	vmovdqa	%ymm4,%ymm5
6088	vmovdqa	%ymm4,%ymm6
6089	vmovdqa	%ymm4,%ymm7
6090	vmovdqa	%ymm4,64(%rbp)
6091	vmovdqa	%ymm8,%ymm9
6092	vmovdqa	%ymm8,%ymm10
6093	vmovdqa	%ymm8,%ymm11
6094	vmovdqa	%ymm8,96(%rbp)
6095	vmovdqa	%ymm12,%ymm15
6096	vpaddd	.avx2_inc(%rip),%ymm15,%ymm14
6097	vpaddd	.avx2_inc(%rip),%ymm14,%ymm13
6098	vpaddd	.avx2_inc(%rip),%ymm13,%ymm12
6099	vmovdqa	%ymm12,160(%rbp)
6100	vmovdqa	%ymm13,192(%rbp)
6101	vmovdqa	%ymm14,224(%rbp)
6102	vmovdqa	%ymm15,256(%rbp)
6103	movq	$10,%r10
61041:
6105	vmovdqa	%ymm8,128(%rbp)
6106	vmovdqa	.rol16(%rip),%ymm8
6107	vpaddd	%ymm7,%ymm3,%ymm3
6108	vpaddd	%ymm6,%ymm2,%ymm2
6109	vpaddd	%ymm5,%ymm1,%ymm1
6110	vpaddd	%ymm4,%ymm0,%ymm0
6111	vpxor	%ymm3,%ymm15,%ymm15
6112	vpxor	%ymm2,%ymm14,%ymm14
6113	vpxor	%ymm1,%ymm13,%ymm13
6114	vpxor	%ymm0,%ymm12,%ymm12
6115	vpshufb	%ymm8,%ymm15,%ymm15
6116	vpshufb	%ymm8,%ymm14,%ymm14
6117	vpshufb	%ymm8,%ymm13,%ymm13
6118	vpshufb	%ymm8,%ymm12,%ymm12
6119	vmovdqa	128(%rbp),%ymm8
6120	vpaddd	%ymm15,%ymm11,%ymm11
6121	vpaddd	%ymm14,%ymm10,%ymm10
6122	vpaddd	%ymm13,%ymm9,%ymm9
6123	vpaddd	%ymm12,%ymm8,%ymm8
6124	vpxor	%ymm11,%ymm7,%ymm7
6125	vpxor	%ymm10,%ymm6,%ymm6
6126	vpxor	%ymm9,%ymm5,%ymm5
6127	vpxor	%ymm8,%ymm4,%ymm4
6128	vmovdqa	%ymm8,128(%rbp)
6129	vpsrld	$20,%ymm7,%ymm8
6130	vpslld	$32-20,%ymm7,%ymm7
6131	vpxor	%ymm8,%ymm7,%ymm7
6132	vpsrld	$20,%ymm6,%ymm8
6133	vpslld	$32-20,%ymm6,%ymm6
6134	vpxor	%ymm8,%ymm6,%ymm6
6135	vpsrld	$20,%ymm5,%ymm8
6136	vpslld	$32-20,%ymm5,%ymm5
6137	vpxor	%ymm8,%ymm5,%ymm5
6138	vpsrld	$20,%ymm4,%ymm8
6139	vpslld	$32-20,%ymm4,%ymm4
6140	vpxor	%ymm8,%ymm4,%ymm4
6141	vmovdqa	.rol8(%rip),%ymm8
6142	vpaddd	%ymm7,%ymm3,%ymm3
6143	vpaddd	%ymm6,%ymm2,%ymm2
6144	vpaddd	%ymm5,%ymm1,%ymm1
6145	vpaddd	%ymm4,%ymm0,%ymm0
6146	vpxor	%ymm3,%ymm15,%ymm15
6147	vpxor	%ymm2,%ymm14,%ymm14
6148	vpxor	%ymm1,%ymm13,%ymm13
6149	vpxor	%ymm0,%ymm12,%ymm12
6150	vpshufb	%ymm8,%ymm15,%ymm15
6151	vpshufb	%ymm8,%ymm14,%ymm14
6152	vpshufb	%ymm8,%ymm13,%ymm13
6153	vpshufb	%ymm8,%ymm12,%ymm12
6154	vmovdqa	128(%rbp),%ymm8
6155	vpaddd	%ymm15,%ymm11,%ymm11
6156	vpaddd	%ymm14,%ymm10,%ymm10
6157	vpaddd	%ymm13,%ymm9,%ymm9
6158	vpaddd	%ymm12,%ymm8,%ymm8
6159	vpxor	%ymm11,%ymm7,%ymm7
6160	vpxor	%ymm10,%ymm6,%ymm6
6161	vpxor	%ymm9,%ymm5,%ymm5
6162	vpxor	%ymm8,%ymm4,%ymm4
6163	vmovdqa	%ymm8,128(%rbp)
6164	vpsrld	$25,%ymm7,%ymm8
6165	vpslld	$32-25,%ymm7,%ymm7
6166	vpxor	%ymm8,%ymm7,%ymm7
6167	vpsrld	$25,%ymm6,%ymm8
6168	vpslld	$32-25,%ymm6,%ymm6
6169	vpxor	%ymm8,%ymm6,%ymm6
6170	vpsrld	$25,%ymm5,%ymm8
6171	vpslld	$32-25,%ymm5,%ymm5
6172	vpxor	%ymm8,%ymm5,%ymm5
6173	vpsrld	$25,%ymm4,%ymm8
6174	vpslld	$32-25,%ymm4,%ymm4
6175	vpxor	%ymm8,%ymm4,%ymm4
6176	vmovdqa	128(%rbp),%ymm8
6177	vpalignr	$4,%ymm7,%ymm7,%ymm7
6178	vpalignr	$8,%ymm11,%ymm11,%ymm11
6179	vpalignr	$12,%ymm15,%ymm15,%ymm15
6180	vpalignr	$4,%ymm6,%ymm6,%ymm6
6181	vpalignr	$8,%ymm10,%ymm10,%ymm10
6182	vpalignr	$12,%ymm14,%ymm14,%ymm14
6183	vpalignr	$4,%ymm5,%ymm5,%ymm5
6184	vpalignr	$8,%ymm9,%ymm9,%ymm9
6185	vpalignr	$12,%ymm13,%ymm13,%ymm13
6186	vpalignr	$4,%ymm4,%ymm4,%ymm4
6187	vpalignr	$8,%ymm8,%ymm8,%ymm8
6188	vpalignr	$12,%ymm12,%ymm12,%ymm12
6189	vmovdqa	%ymm8,128(%rbp)
6190	vmovdqa	.rol16(%rip),%ymm8
6191	vpaddd	%ymm7,%ymm3,%ymm3
6192	vpaddd	%ymm6,%ymm2,%ymm2
6193	vpaddd	%ymm5,%ymm1,%ymm1
6194	vpaddd	%ymm4,%ymm0,%ymm0
6195	vpxor	%ymm3,%ymm15,%ymm15
6196	vpxor	%ymm2,%ymm14,%ymm14
6197	vpxor	%ymm1,%ymm13,%ymm13
6198	vpxor	%ymm0,%ymm12,%ymm12
6199	vpshufb	%ymm8,%ymm15,%ymm15
6200	vpshufb	%ymm8,%ymm14,%ymm14
6201	vpshufb	%ymm8,%ymm13,%ymm13
6202	vpshufb	%ymm8,%ymm12,%ymm12
6203	vmovdqa	128(%rbp),%ymm8
6204	vpaddd	%ymm15,%ymm11,%ymm11
6205	vpaddd	%ymm14,%ymm10,%ymm10
6206	vpaddd	%ymm13,%ymm9,%ymm9
6207	vpaddd	%ymm12,%ymm8,%ymm8
6208	vpxor	%ymm11,%ymm7,%ymm7
6209	vpxor	%ymm10,%ymm6,%ymm6
6210	vpxor	%ymm9,%ymm5,%ymm5
6211	vpxor	%ymm8,%ymm4,%ymm4
6212	vmovdqa	%ymm8,128(%rbp)
6213	vpsrld	$20,%ymm7,%ymm8
6214	vpslld	$32-20,%ymm7,%ymm7
6215	vpxor	%ymm8,%ymm7,%ymm7
6216	vpsrld	$20,%ymm6,%ymm8
6217	vpslld	$32-20,%ymm6,%ymm6
6218	vpxor	%ymm8,%ymm6,%ymm6
6219	vpsrld	$20,%ymm5,%ymm8
6220	vpslld	$32-20,%ymm5,%ymm5
6221	vpxor	%ymm8,%ymm5,%ymm5
6222	vpsrld	$20,%ymm4,%ymm8
6223	vpslld	$32-20,%ymm4,%ymm4
6224	vpxor	%ymm8,%ymm4,%ymm4
6225	vmovdqa	.rol8(%rip),%ymm8
6226	vpaddd	%ymm7,%ymm3,%ymm3
6227	vpaddd	%ymm6,%ymm2,%ymm2
6228	vpaddd	%ymm5,%ymm1,%ymm1
6229	vpaddd	%ymm4,%ymm0,%ymm0
6230	vpxor	%ymm3,%ymm15,%ymm15
6231	vpxor	%ymm2,%ymm14,%ymm14
6232	vpxor	%ymm1,%ymm13,%ymm13
6233	vpxor	%ymm0,%ymm12,%ymm12
6234	vpshufb	%ymm8,%ymm15,%ymm15
6235	vpshufb	%ymm8,%ymm14,%ymm14
6236	vpshufb	%ymm8,%ymm13,%ymm13
6237	vpshufb	%ymm8,%ymm12,%ymm12
6238	vmovdqa	128(%rbp),%ymm8
6239	vpaddd	%ymm15,%ymm11,%ymm11
6240	vpaddd	%ymm14,%ymm10,%ymm10
6241	vpaddd	%ymm13,%ymm9,%ymm9
6242	vpaddd	%ymm12,%ymm8,%ymm8
6243	vpxor	%ymm11,%ymm7,%ymm7
6244	vpxor	%ymm10,%ymm6,%ymm6
6245	vpxor	%ymm9,%ymm5,%ymm5
6246	vpxor	%ymm8,%ymm4,%ymm4
6247	vmovdqa	%ymm8,128(%rbp)
6248	vpsrld	$25,%ymm7,%ymm8
6249	vpslld	$32-25,%ymm7,%ymm7
6250	vpxor	%ymm8,%ymm7,%ymm7
6251	vpsrld	$25,%ymm6,%ymm8
6252	vpslld	$32-25,%ymm6,%ymm6
6253	vpxor	%ymm8,%ymm6,%ymm6
6254	vpsrld	$25,%ymm5,%ymm8
6255	vpslld	$32-25,%ymm5,%ymm5
6256	vpxor	%ymm8,%ymm5,%ymm5
6257	vpsrld	$25,%ymm4,%ymm8
6258	vpslld	$32-25,%ymm4,%ymm4
6259	vpxor	%ymm8,%ymm4,%ymm4
6260	vmovdqa	128(%rbp),%ymm8
6261	vpalignr	$12,%ymm7,%ymm7,%ymm7
6262	vpalignr	$8,%ymm11,%ymm11,%ymm11
6263	vpalignr	$4,%ymm15,%ymm15,%ymm15
6264	vpalignr	$12,%ymm6,%ymm6,%ymm6
6265	vpalignr	$8,%ymm10,%ymm10,%ymm10
6266	vpalignr	$4,%ymm14,%ymm14,%ymm14
6267	vpalignr	$12,%ymm5,%ymm5,%ymm5
6268	vpalignr	$8,%ymm9,%ymm9,%ymm9
6269	vpalignr	$4,%ymm13,%ymm13,%ymm13
6270	vpalignr	$12,%ymm4,%ymm4,%ymm4
6271	vpalignr	$8,%ymm8,%ymm8,%ymm8
6272	vpalignr	$4,%ymm12,%ymm12,%ymm12
6273
6274	decq	%r10
6275	jnz	1b
6276	vpaddd	.chacha20_consts(%rip),%ymm3,%ymm3
6277	vpaddd	64(%rbp),%ymm7,%ymm7
6278	vpaddd	96(%rbp),%ymm11,%ymm11
6279	vpaddd	256(%rbp),%ymm15,%ymm15
6280	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
6281	vpaddd	64(%rbp),%ymm6,%ymm6
6282	vpaddd	96(%rbp),%ymm10,%ymm10
6283	vpaddd	224(%rbp),%ymm14,%ymm14
6284	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
6285	vpaddd	64(%rbp),%ymm5,%ymm5
6286	vpaddd	96(%rbp),%ymm9,%ymm9
6287	vpaddd	192(%rbp),%ymm13,%ymm13
6288	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
6289	vpaddd	64(%rbp),%ymm4,%ymm4
6290	vpaddd	96(%rbp),%ymm8,%ymm8
6291	vpaddd	160(%rbp),%ymm12,%ymm12
6292
6293	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
6294	vperm2i128	$0x02,%ymm3,%ymm7,%ymm15
6295	vperm2i128	$0x13,%ymm3,%ymm7,%ymm3
6296	vpand	.clamp(%rip),%ymm15,%ymm15
6297	vmovdqa	%ymm15,0(%rbp)
6298	movq	%r8,%r8
6299	call	poly_hash_ad_internal
6300
6301	vpxor	0(%rsi),%ymm3,%ymm3
6302	vpxor	32(%rsi),%ymm11,%ymm11
6303	vmovdqu	%ymm3,0(%rdi)
6304	vmovdqu	%ymm11,32(%rdi)
6305	vperm2i128	$0x02,%ymm2,%ymm6,%ymm15
6306	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
6307	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
6308	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
6309	vpxor	0+64(%rsi),%ymm15,%ymm15
6310	vpxor	32+64(%rsi),%ymm2,%ymm2
6311	vpxor	64+64(%rsi),%ymm6,%ymm6
6312	vpxor	96+64(%rsi),%ymm10,%ymm10
6313	vmovdqu	%ymm15,0+64(%rdi)
6314	vmovdqu	%ymm2,32+64(%rdi)
6315	vmovdqu	%ymm6,64+64(%rdi)
6316	vmovdqu	%ymm10,96+64(%rdi)
6317	vperm2i128	$0x02,%ymm1,%ymm5,%ymm15
6318	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
6319	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
6320	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
6321	vpxor	0+192(%rsi),%ymm15,%ymm15
6322	vpxor	32+192(%rsi),%ymm1,%ymm1
6323	vpxor	64+192(%rsi),%ymm5,%ymm5
6324	vpxor	96+192(%rsi),%ymm9,%ymm9
6325	vmovdqu	%ymm15,0+192(%rdi)
6326	vmovdqu	%ymm1,32+192(%rdi)
6327	vmovdqu	%ymm5,64+192(%rdi)
6328	vmovdqu	%ymm9,96+192(%rdi)
6329	vperm2i128	$0x13,%ymm0,%ymm4,%ymm15
6330	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
6331	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
6332	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
6333	vmovdqa	%ymm15,%ymm8
6334
6335	leaq	320(%rsi),%rsi
6336	subq	$320,%rbx
6337	movq	$320,%rcx
6338	cmpq	$128,%rbx
6339	jbe	seal_avx2_hash
6340	vpxor	0(%rsi),%ymm0,%ymm0
6341	vpxor	32(%rsi),%ymm4,%ymm4
6342	vpxor	64(%rsi),%ymm8,%ymm8
6343	vpxor	96(%rsi),%ymm12,%ymm12
6344	vmovdqu	%ymm0,320(%rdi)
6345	vmovdqu	%ymm4,352(%rdi)
6346	vmovdqu	%ymm8,384(%rdi)
6347	vmovdqu	%ymm12,416(%rdi)
6348	leaq	128(%rsi),%rsi
6349	subq	$128,%rbx
6350	movq	$8,%rcx
6351	movq	$2,%r8
6352	cmpq	$128,%rbx
6353	jbe	seal_avx2_tail_128
6354	cmpq	$256,%rbx
6355	jbe	seal_avx2_tail_256
6356	cmpq	$384,%rbx
6357	jbe	seal_avx2_tail_384
6358	cmpq	$512,%rbx
6359	jbe	seal_avx2_tail_512
6360	vmovdqa	.chacha20_consts(%rip),%ymm0
6361	vmovdqa	64(%rbp),%ymm4
6362	vmovdqa	96(%rbp),%ymm8
6363	vmovdqa	%ymm0,%ymm1
6364	vmovdqa	%ymm4,%ymm5
6365	vmovdqa	%ymm8,%ymm9
6366	vmovdqa	%ymm0,%ymm2
6367	vmovdqa	%ymm4,%ymm6
6368	vmovdqa	%ymm8,%ymm10
6369	vmovdqa	%ymm0,%ymm3
6370	vmovdqa	%ymm4,%ymm7
6371	vmovdqa	%ymm8,%ymm11
6372	vmovdqa	.avx2_inc(%rip),%ymm12
6373	vpaddd	160(%rbp),%ymm12,%ymm15
6374	vpaddd	%ymm15,%ymm12,%ymm14
6375	vpaddd	%ymm14,%ymm12,%ymm13
6376	vpaddd	%ymm13,%ymm12,%ymm12
6377	vmovdqa	%ymm15,256(%rbp)
6378	vmovdqa	%ymm14,224(%rbp)
6379	vmovdqa	%ymm13,192(%rbp)
6380	vmovdqa	%ymm12,160(%rbp)
6381	vmovdqa	%ymm8,128(%rbp)
6382	vmovdqa	.rol16(%rip),%ymm8
6383	vpaddd	%ymm7,%ymm3,%ymm3
6384	vpaddd	%ymm6,%ymm2,%ymm2
6385	vpaddd	%ymm5,%ymm1,%ymm1
6386	vpaddd	%ymm4,%ymm0,%ymm0
6387	vpxor	%ymm3,%ymm15,%ymm15
6388	vpxor	%ymm2,%ymm14,%ymm14
6389	vpxor	%ymm1,%ymm13,%ymm13
6390	vpxor	%ymm0,%ymm12,%ymm12
6391	vpshufb	%ymm8,%ymm15,%ymm15
6392	vpshufb	%ymm8,%ymm14,%ymm14
6393	vpshufb	%ymm8,%ymm13,%ymm13
6394	vpshufb	%ymm8,%ymm12,%ymm12
6395	vmovdqa	128(%rbp),%ymm8
6396	vpaddd	%ymm15,%ymm11,%ymm11
6397	vpaddd	%ymm14,%ymm10,%ymm10
6398	vpaddd	%ymm13,%ymm9,%ymm9
6399	vpaddd	%ymm12,%ymm8,%ymm8
6400	vpxor	%ymm11,%ymm7,%ymm7
6401	vpxor	%ymm10,%ymm6,%ymm6
6402	vpxor	%ymm9,%ymm5,%ymm5
6403	vpxor	%ymm8,%ymm4,%ymm4
6404	vmovdqa	%ymm8,128(%rbp)
6405	vpsrld	$20,%ymm7,%ymm8
6406	vpslld	$32-20,%ymm7,%ymm7
6407	vpxor	%ymm8,%ymm7,%ymm7
6408	vpsrld	$20,%ymm6,%ymm8
6409	vpslld	$32-20,%ymm6,%ymm6
6410	vpxor	%ymm8,%ymm6,%ymm6
6411	vpsrld	$20,%ymm5,%ymm8
6412	vpslld	$32-20,%ymm5,%ymm5
6413	vpxor	%ymm8,%ymm5,%ymm5
6414	vpsrld	$20,%ymm4,%ymm8
6415	vpslld	$32-20,%ymm4,%ymm4
6416	vpxor	%ymm8,%ymm4,%ymm4
6417	vmovdqa	.rol8(%rip),%ymm8
6418	vpaddd	%ymm7,%ymm3,%ymm3
6419	vpaddd	%ymm6,%ymm2,%ymm2
6420	vpaddd	%ymm5,%ymm1,%ymm1
6421	vpaddd	%ymm4,%ymm0,%ymm0
6422	vpxor	%ymm3,%ymm15,%ymm15
6423	vpxor	%ymm2,%ymm14,%ymm14
6424	vpxor	%ymm1,%ymm13,%ymm13
6425	vpxor	%ymm0,%ymm12,%ymm12
6426	vpshufb	%ymm8,%ymm15,%ymm15
6427	vpshufb	%ymm8,%ymm14,%ymm14
6428	vpshufb	%ymm8,%ymm13,%ymm13
6429	vpshufb	%ymm8,%ymm12,%ymm12
6430	vmovdqa	128(%rbp),%ymm8
6431	vpaddd	%ymm15,%ymm11,%ymm11
6432	vpaddd	%ymm14,%ymm10,%ymm10
6433	vpaddd	%ymm13,%ymm9,%ymm9
6434	vpaddd	%ymm12,%ymm8,%ymm8
6435	vpxor	%ymm11,%ymm7,%ymm7
6436	vpxor	%ymm10,%ymm6,%ymm6
6437	vpxor	%ymm9,%ymm5,%ymm5
6438	vpxor	%ymm8,%ymm4,%ymm4
6439	vmovdqa	%ymm8,128(%rbp)
6440	vpsrld	$25,%ymm7,%ymm8
6441	vpslld	$32-25,%ymm7,%ymm7
6442	vpxor	%ymm8,%ymm7,%ymm7
6443	vpsrld	$25,%ymm6,%ymm8
6444	vpslld	$32-25,%ymm6,%ymm6
6445	vpxor	%ymm8,%ymm6,%ymm6
6446	vpsrld	$25,%ymm5,%ymm8
6447	vpslld	$32-25,%ymm5,%ymm5
6448	vpxor	%ymm8,%ymm5,%ymm5
6449	vpsrld	$25,%ymm4,%ymm8
6450	vpslld	$32-25,%ymm4,%ymm4
6451	vpxor	%ymm8,%ymm4,%ymm4
6452	vmovdqa	128(%rbp),%ymm8
6453	vpalignr	$4,%ymm7,%ymm7,%ymm7
6454	vpalignr	$8,%ymm11,%ymm11,%ymm11
6455	vpalignr	$12,%ymm15,%ymm15,%ymm15
6456	vpalignr	$4,%ymm6,%ymm6,%ymm6
6457	vpalignr	$8,%ymm10,%ymm10,%ymm10
6458	vpalignr	$12,%ymm14,%ymm14,%ymm14
6459	vpalignr	$4,%ymm5,%ymm5,%ymm5
6460	vpalignr	$8,%ymm9,%ymm9,%ymm9
6461	vpalignr	$12,%ymm13,%ymm13,%ymm13
6462	vpalignr	$4,%ymm4,%ymm4,%ymm4
6463	vpalignr	$8,%ymm8,%ymm8,%ymm8
6464	vpalignr	$12,%ymm12,%ymm12,%ymm12
6465	vmovdqa	%ymm8,128(%rbp)
6466	vmovdqa	.rol16(%rip),%ymm8
6467	vpaddd	%ymm7,%ymm3,%ymm3
6468	vpaddd	%ymm6,%ymm2,%ymm2
6469	vpaddd	%ymm5,%ymm1,%ymm1
6470	vpaddd	%ymm4,%ymm0,%ymm0
6471	vpxor	%ymm3,%ymm15,%ymm15
6472	vpxor	%ymm2,%ymm14,%ymm14
6473	vpxor	%ymm1,%ymm13,%ymm13
6474	vpxor	%ymm0,%ymm12,%ymm12
6475	vpshufb	%ymm8,%ymm15,%ymm15
6476	vpshufb	%ymm8,%ymm14,%ymm14
6477	vpshufb	%ymm8,%ymm13,%ymm13
6478	vpshufb	%ymm8,%ymm12,%ymm12
6479	vmovdqa	128(%rbp),%ymm8
6480	vpaddd	%ymm15,%ymm11,%ymm11
6481	vpaddd	%ymm14,%ymm10,%ymm10
6482	vpaddd	%ymm13,%ymm9,%ymm9
6483	vpaddd	%ymm12,%ymm8,%ymm8
6484	vpxor	%ymm11,%ymm7,%ymm7
6485	vpxor	%ymm10,%ymm6,%ymm6
6486	vpxor	%ymm9,%ymm5,%ymm5
6487	vpxor	%ymm8,%ymm4,%ymm4
6488	vmovdqa	%ymm8,128(%rbp)
6489	vpsrld	$20,%ymm7,%ymm8
6490	vpslld	$32-20,%ymm7,%ymm7
6491	vpxor	%ymm8,%ymm7,%ymm7
6492	vpsrld	$20,%ymm6,%ymm8
6493	vpslld	$32-20,%ymm6,%ymm6
6494	vpxor	%ymm8,%ymm6,%ymm6
6495	vpsrld	$20,%ymm5,%ymm8
6496	vpslld	$32-20,%ymm5,%ymm5
6497	vpxor	%ymm8,%ymm5,%ymm5
6498	vpsrld	$20,%ymm4,%ymm8
6499	vpslld	$32-20,%ymm4,%ymm4
6500	vpxor	%ymm8,%ymm4,%ymm4
6501	vmovdqa	.rol8(%rip),%ymm8
6502	vpaddd	%ymm7,%ymm3,%ymm3
6503	vpaddd	%ymm6,%ymm2,%ymm2
6504	vpaddd	%ymm5,%ymm1,%ymm1
6505	vpaddd	%ymm4,%ymm0,%ymm0
6506	vpxor	%ymm3,%ymm15,%ymm15
6507	vpxor	%ymm2,%ymm14,%ymm14
6508	vpxor	%ymm1,%ymm13,%ymm13
6509	vpxor	%ymm0,%ymm12,%ymm12
6510	vpshufb	%ymm8,%ymm15,%ymm15
6511	vpshufb	%ymm8,%ymm14,%ymm14
6512	vpshufb	%ymm8,%ymm13,%ymm13
6513	vpshufb	%ymm8,%ymm12,%ymm12
6514	vmovdqa	128(%rbp),%ymm8
6515	vpaddd	%ymm15,%ymm11,%ymm11
6516	vpaddd	%ymm14,%ymm10,%ymm10
6517	vpaddd	%ymm13,%ymm9,%ymm9
6518	vpaddd	%ymm12,%ymm8,%ymm8
6519	vpxor	%ymm11,%ymm7,%ymm7
6520	vpxor	%ymm10,%ymm6,%ymm6
6521	vpxor	%ymm9,%ymm5,%ymm5
6522	vpxor	%ymm8,%ymm4,%ymm4
6523	vmovdqa	%ymm8,128(%rbp)
6524	vpsrld	$25,%ymm7,%ymm8
6525	vpslld	$32-25,%ymm7,%ymm7
6526	vpxor	%ymm8,%ymm7,%ymm7
6527	vpsrld	$25,%ymm6,%ymm8
6528	vpslld	$32-25,%ymm6,%ymm6
6529	vpxor	%ymm8,%ymm6,%ymm6
6530	vpsrld	$25,%ymm5,%ymm8
6531	vpslld	$32-25,%ymm5,%ymm5
6532	vpxor	%ymm8,%ymm5,%ymm5
6533	vpsrld	$25,%ymm4,%ymm8
6534	vpslld	$32-25,%ymm4,%ymm4
6535	vpxor	%ymm8,%ymm4,%ymm4
6536	vmovdqa	128(%rbp),%ymm8
6537	vpalignr	$12,%ymm7,%ymm7,%ymm7
6538	vpalignr	$8,%ymm11,%ymm11,%ymm11
6539	vpalignr	$4,%ymm15,%ymm15,%ymm15
6540	vpalignr	$12,%ymm6,%ymm6,%ymm6
6541	vpalignr	$8,%ymm10,%ymm10,%ymm10
6542	vpalignr	$4,%ymm14,%ymm14,%ymm14
6543	vpalignr	$12,%ymm5,%ymm5,%ymm5
6544	vpalignr	$8,%ymm9,%ymm9,%ymm9
6545	vpalignr	$4,%ymm13,%ymm13,%ymm13
6546	vpalignr	$12,%ymm4,%ymm4,%ymm4
6547	vpalignr	$8,%ymm8,%ymm8,%ymm8
6548	vpalignr	$4,%ymm12,%ymm12,%ymm12
6549	vmovdqa	%ymm8,128(%rbp)
6550	vmovdqa	.rol16(%rip),%ymm8
6551	vpaddd	%ymm7,%ymm3,%ymm3
6552	vpaddd	%ymm6,%ymm2,%ymm2
6553	vpaddd	%ymm5,%ymm1,%ymm1
6554	vpaddd	%ymm4,%ymm0,%ymm0
6555	vpxor	%ymm3,%ymm15,%ymm15
6556	vpxor	%ymm2,%ymm14,%ymm14
6557	vpxor	%ymm1,%ymm13,%ymm13
6558	vpxor	%ymm0,%ymm12,%ymm12
6559	vpshufb	%ymm8,%ymm15,%ymm15
6560	vpshufb	%ymm8,%ymm14,%ymm14
6561	vpshufb	%ymm8,%ymm13,%ymm13
6562	vpshufb	%ymm8,%ymm12,%ymm12
6563	vmovdqa	128(%rbp),%ymm8
6564	vpaddd	%ymm15,%ymm11,%ymm11
6565	vpaddd	%ymm14,%ymm10,%ymm10
6566	vpaddd	%ymm13,%ymm9,%ymm9
6567	vpaddd	%ymm12,%ymm8,%ymm8
6568	vpxor	%ymm11,%ymm7,%ymm7
6569	vpxor	%ymm10,%ymm6,%ymm6
6570	vpxor	%ymm9,%ymm5,%ymm5
6571	vpxor	%ymm8,%ymm4,%ymm4
6572	vmovdqa	%ymm8,128(%rbp)
6573	vpsrld	$20,%ymm7,%ymm8
6574	vpslld	$32-20,%ymm7,%ymm7
6575	vpxor	%ymm8,%ymm7,%ymm7
6576	vpsrld	$20,%ymm6,%ymm8
6577	vpslld	$32-20,%ymm6,%ymm6
6578	vpxor	%ymm8,%ymm6,%ymm6
6579	vpsrld	$20,%ymm5,%ymm8
6580	vpslld	$32-20,%ymm5,%ymm5
6581	vpxor	%ymm8,%ymm5,%ymm5
6582	vpsrld	$20,%ymm4,%ymm8
6583	vpslld	$32-20,%ymm4,%ymm4
6584	vpxor	%ymm8,%ymm4,%ymm4
6585	vmovdqa	.rol8(%rip),%ymm8
6586	vpaddd	%ymm7,%ymm3,%ymm3
6587	vpaddd	%ymm6,%ymm2,%ymm2
6588	vpaddd	%ymm5,%ymm1,%ymm1
6589	vpaddd	%ymm4,%ymm0,%ymm0
6590
6591	subq	$16,%rdi
6592	movq	$9,%rcx
6593	jmp	4f
65941:
6595	vmovdqa	.chacha20_consts(%rip),%ymm0
6596	vmovdqa	64(%rbp),%ymm4
6597	vmovdqa	96(%rbp),%ymm8
6598	vmovdqa	%ymm0,%ymm1
6599	vmovdqa	%ymm4,%ymm5
6600	vmovdqa	%ymm8,%ymm9
6601	vmovdqa	%ymm0,%ymm2
6602	vmovdqa	%ymm4,%ymm6
6603	vmovdqa	%ymm8,%ymm10
6604	vmovdqa	%ymm0,%ymm3
6605	vmovdqa	%ymm4,%ymm7
6606	vmovdqa	%ymm8,%ymm11
6607	vmovdqa	.avx2_inc(%rip),%ymm12
6608	vpaddd	160(%rbp),%ymm12,%ymm15
6609	vpaddd	%ymm15,%ymm12,%ymm14
6610	vpaddd	%ymm14,%ymm12,%ymm13
6611	vpaddd	%ymm13,%ymm12,%ymm12
6612	vmovdqa	%ymm15,256(%rbp)
6613	vmovdqa	%ymm14,224(%rbp)
6614	vmovdqa	%ymm13,192(%rbp)
6615	vmovdqa	%ymm12,160(%rbp)
6616
6617	movq	$10,%rcx
66182:
6619	addq	0(%rdi),%r10
6620	adcq	8+0(%rdi),%r11
6621	adcq	$1,%r12
6622	vmovdqa	%ymm8,128(%rbp)
6623	vmovdqa	.rol16(%rip),%ymm8
6624	vpaddd	%ymm7,%ymm3,%ymm3
6625	vpaddd	%ymm6,%ymm2,%ymm2
6626	vpaddd	%ymm5,%ymm1,%ymm1
6627	vpaddd	%ymm4,%ymm0,%ymm0
6628	vpxor	%ymm3,%ymm15,%ymm15
6629	vpxor	%ymm2,%ymm14,%ymm14
6630	vpxor	%ymm1,%ymm13,%ymm13
6631	vpxor	%ymm0,%ymm12,%ymm12
6632	movq	0+0(%rbp),%rdx
6633	movq	%rdx,%r15
6634	mulxq	%r10,%r13,%r14
6635	mulxq	%r11,%rax,%rdx
6636	imulq	%r12,%r15
6637	addq	%rax,%r14
6638	adcq	%rdx,%r15
6639	vpshufb	%ymm8,%ymm15,%ymm15
6640	vpshufb	%ymm8,%ymm14,%ymm14
6641	vpshufb	%ymm8,%ymm13,%ymm13
6642	vpshufb	%ymm8,%ymm12,%ymm12
6643	vmovdqa	128(%rbp),%ymm8
6644	vpaddd	%ymm15,%ymm11,%ymm11
6645	vpaddd	%ymm14,%ymm10,%ymm10
6646	vpaddd	%ymm13,%ymm9,%ymm9
6647	vpaddd	%ymm12,%ymm8,%ymm8
6648	movq	8+0(%rbp),%rdx
6649	mulxq	%r10,%r10,%rax
6650	addq	%r10,%r14
6651	mulxq	%r11,%r11,%r9
6652	adcq	%r11,%r15
6653	adcq	$0,%r9
6654	imulq	%r12,%rdx
6655	vpxor	%ymm11,%ymm7,%ymm7
6656	vpxor	%ymm10,%ymm6,%ymm6
6657	vpxor	%ymm9,%ymm5,%ymm5
6658	vpxor	%ymm8,%ymm4,%ymm4
6659	vmovdqa	%ymm8,128(%rbp)
6660	vpsrld	$20,%ymm7,%ymm8
6661	vpslld	$32-20,%ymm7,%ymm7
6662	vpxor	%ymm8,%ymm7,%ymm7
6663	vpsrld	$20,%ymm6,%ymm8
6664	vpslld	$32-20,%ymm6,%ymm6
6665	vpxor	%ymm8,%ymm6,%ymm6
6666	vpsrld	$20,%ymm5,%ymm8
6667	addq	%rax,%r15
6668	adcq	%rdx,%r9
6669	vpslld	$32-20,%ymm5,%ymm5
6670	vpxor	%ymm8,%ymm5,%ymm5
6671	vpsrld	$20,%ymm4,%ymm8
6672	vpslld	$32-20,%ymm4,%ymm4
6673	vpxor	%ymm8,%ymm4,%ymm4
6674	vmovdqa	.rol8(%rip),%ymm8
6675	vpaddd	%ymm7,%ymm3,%ymm3
6676	vpaddd	%ymm6,%ymm2,%ymm2
6677	vpaddd	%ymm5,%ymm1,%ymm1
6678	vpaddd	%ymm4,%ymm0,%ymm0
6679	movq	%r13,%r10
6680	movq	%r14,%r11
6681	movq	%r15,%r12
6682	andq	$3,%r12
6683	movq	%r15,%r13
6684	andq	$-4,%r13
6685	movq	%r9,%r14
6686	shrdq	$2,%r9,%r15
6687	shrq	$2,%r9
6688	addq	%r13,%r10
6689	adcq	%r14,%r11
6690	adcq	$0,%r12
6691	addq	%r15,%r10
6692	adcq	%r9,%r11
6693	adcq	$0,%r12
6694
66954:
6696	vpxor	%ymm3,%ymm15,%ymm15
6697	vpxor	%ymm2,%ymm14,%ymm14
6698	vpxor	%ymm1,%ymm13,%ymm13
6699	vpxor	%ymm0,%ymm12,%ymm12
6700	vpshufb	%ymm8,%ymm15,%ymm15
6701	vpshufb	%ymm8,%ymm14,%ymm14
6702	vpshufb	%ymm8,%ymm13,%ymm13
6703	vpshufb	%ymm8,%ymm12,%ymm12
6704	vmovdqa	128(%rbp),%ymm8
6705	addq	16(%rdi),%r10
6706	adcq	8+16(%rdi),%r11
6707	adcq	$1,%r12
6708	vpaddd	%ymm15,%ymm11,%ymm11
6709	vpaddd	%ymm14,%ymm10,%ymm10
6710	vpaddd	%ymm13,%ymm9,%ymm9
6711	vpaddd	%ymm12,%ymm8,%ymm8
6712	vpxor	%ymm11,%ymm7,%ymm7
6713	vpxor	%ymm10,%ymm6,%ymm6
6714	vpxor	%ymm9,%ymm5,%ymm5
6715	vpxor	%ymm8,%ymm4,%ymm4
6716	movq	0+0(%rbp),%rdx
6717	movq	%rdx,%r15
6718	mulxq	%r10,%r13,%r14
6719	mulxq	%r11,%rax,%rdx
6720	imulq	%r12,%r15
6721	addq	%rax,%r14
6722	adcq	%rdx,%r15
6723	vmovdqa	%ymm8,128(%rbp)
6724	vpsrld	$25,%ymm7,%ymm8
6725	vpslld	$32-25,%ymm7,%ymm7
6726	vpxor	%ymm8,%ymm7,%ymm7
6727	vpsrld	$25,%ymm6,%ymm8
6728	vpslld	$32-25,%ymm6,%ymm6
6729	vpxor	%ymm8,%ymm6,%ymm6
6730	vpsrld	$25,%ymm5,%ymm8
6731	vpslld	$32-25,%ymm5,%ymm5
6732	vpxor	%ymm8,%ymm5,%ymm5
6733	vpsrld	$25,%ymm4,%ymm8
6734	vpslld	$32-25,%ymm4,%ymm4
6735	vpxor	%ymm8,%ymm4,%ymm4
6736	vmovdqa	128(%rbp),%ymm8
6737	vpalignr	$4,%ymm7,%ymm7,%ymm7
6738	vpalignr	$8,%ymm11,%ymm11,%ymm11
6739	vpalignr	$12,%ymm15,%ymm15,%ymm15
6740	vpalignr	$4,%ymm6,%ymm6,%ymm6
6741	movq	8+0(%rbp),%rdx
6742	mulxq	%r10,%r10,%rax
6743	addq	%r10,%r14
6744	mulxq	%r11,%r11,%r9
6745	adcq	%r11,%r15
6746	adcq	$0,%r9
6747	imulq	%r12,%rdx
6748	vpalignr	$8,%ymm10,%ymm10,%ymm10
6749	vpalignr	$12,%ymm14,%ymm14,%ymm14
6750	vpalignr	$4,%ymm5,%ymm5,%ymm5
6751	vpalignr	$8,%ymm9,%ymm9,%ymm9
6752	vpalignr	$12,%ymm13,%ymm13,%ymm13
6753	vpalignr	$4,%ymm4,%ymm4,%ymm4
6754	vpalignr	$8,%ymm8,%ymm8,%ymm8
6755	vpalignr	$12,%ymm12,%ymm12,%ymm12
6756	vmovdqa	%ymm8,128(%rbp)
6757	vmovdqa	.rol16(%rip),%ymm8
6758	vpaddd	%ymm7,%ymm3,%ymm3
6759	vpaddd	%ymm6,%ymm2,%ymm2
6760	vpaddd	%ymm5,%ymm1,%ymm1
6761	vpaddd	%ymm4,%ymm0,%ymm0
6762	vpxor	%ymm3,%ymm15,%ymm15
6763	vpxor	%ymm2,%ymm14,%ymm14
6764	vpxor	%ymm1,%ymm13,%ymm13
6765	vpxor	%ymm0,%ymm12,%ymm12
6766	addq	%rax,%r15
6767	adcq	%rdx,%r9
6768	vpshufb	%ymm8,%ymm15,%ymm15
6769	vpshufb	%ymm8,%ymm14,%ymm14
6770	vpshufb	%ymm8,%ymm13,%ymm13
6771	vpshufb	%ymm8,%ymm12,%ymm12
6772	vmovdqa	128(%rbp),%ymm8
6773	vpaddd	%ymm15,%ymm11,%ymm11
6774	vpaddd	%ymm14,%ymm10,%ymm10
6775	vpaddd	%ymm13,%ymm9,%ymm9
6776	vpaddd	%ymm12,%ymm8,%ymm8
6777	movq	%r13,%r10
6778	movq	%r14,%r11
6779	movq	%r15,%r12
6780	andq	$3,%r12
6781	movq	%r15,%r13
6782	andq	$-4,%r13
6783	movq	%r9,%r14
6784	shrdq	$2,%r9,%r15
6785	shrq	$2,%r9
6786	addq	%r13,%r10
6787	adcq	%r14,%r11
6788	adcq	$0,%r12
6789	addq	%r15,%r10
6790	adcq	%r9,%r11
6791	adcq	$0,%r12
6792	vpxor	%ymm11,%ymm7,%ymm7
6793	vpxor	%ymm10,%ymm6,%ymm6
6794	vpxor	%ymm9,%ymm5,%ymm5
6795	vpxor	%ymm8,%ymm4,%ymm4
6796	vmovdqa	%ymm8,128(%rbp)
6797	vpsrld	$20,%ymm7,%ymm8
6798	vpslld	$32-20,%ymm7,%ymm7
6799	vpxor	%ymm8,%ymm7,%ymm7
6800	addq	32(%rdi),%r10
6801	adcq	8+32(%rdi),%r11
6802	adcq	$1,%r12
6803
6804	leaq	48(%rdi),%rdi
6805	vpsrld	$20,%ymm6,%ymm8
6806	vpslld	$32-20,%ymm6,%ymm6
6807	vpxor	%ymm8,%ymm6,%ymm6
6808	vpsrld	$20,%ymm5,%ymm8
6809	vpslld	$32-20,%ymm5,%ymm5
6810	vpxor	%ymm8,%ymm5,%ymm5
6811	vpsrld	$20,%ymm4,%ymm8
6812	vpslld	$32-20,%ymm4,%ymm4
6813	vpxor	%ymm8,%ymm4,%ymm4
6814	vmovdqa	.rol8(%rip),%ymm8
6815	vpaddd	%ymm7,%ymm3,%ymm3
6816	vpaddd	%ymm6,%ymm2,%ymm2
6817	vpaddd	%ymm5,%ymm1,%ymm1
6818	vpaddd	%ymm4,%ymm0,%ymm0
6819	vpxor	%ymm3,%ymm15,%ymm15
6820	vpxor	%ymm2,%ymm14,%ymm14
6821	vpxor	%ymm1,%ymm13,%ymm13
6822	vpxor	%ymm0,%ymm12,%ymm12
6823	movq	0+0(%rbp),%rdx
6824	movq	%rdx,%r15
6825	mulxq	%r10,%r13,%r14
6826	mulxq	%r11,%rax,%rdx
6827	imulq	%r12,%r15
6828	addq	%rax,%r14
6829	adcq	%rdx,%r15
6830	vpshufb	%ymm8,%ymm15,%ymm15
6831	vpshufb	%ymm8,%ymm14,%ymm14
6832	vpshufb	%ymm8,%ymm13,%ymm13
6833	vpshufb	%ymm8,%ymm12,%ymm12
6834	vmovdqa	128(%rbp),%ymm8
6835	vpaddd	%ymm15,%ymm11,%ymm11
6836	vpaddd	%ymm14,%ymm10,%ymm10
6837	vpaddd	%ymm13,%ymm9,%ymm9
6838	movq	8+0(%rbp),%rdx
6839	mulxq	%r10,%r10,%rax
6840	addq	%r10,%r14
6841	mulxq	%r11,%r11,%r9
6842	adcq	%r11,%r15
6843	adcq	$0,%r9
6844	imulq	%r12,%rdx
6845	vpaddd	%ymm12,%ymm8,%ymm8
6846	vpxor	%ymm11,%ymm7,%ymm7
6847	vpxor	%ymm10,%ymm6,%ymm6
6848	vpxor	%ymm9,%ymm5,%ymm5
6849	vpxor	%ymm8,%ymm4,%ymm4
6850	vmovdqa	%ymm8,128(%rbp)
6851	vpsrld	$25,%ymm7,%ymm8
6852	vpslld	$32-25,%ymm7,%ymm7
6853	addq	%rax,%r15
6854	adcq	%rdx,%r9
6855	vpxor	%ymm8,%ymm7,%ymm7
6856	vpsrld	$25,%ymm6,%ymm8
6857	vpslld	$32-25,%ymm6,%ymm6
6858	vpxor	%ymm8,%ymm6,%ymm6
6859	vpsrld	$25,%ymm5,%ymm8
6860	vpslld	$32-25,%ymm5,%ymm5
6861	vpxor	%ymm8,%ymm5,%ymm5
6862	vpsrld	$25,%ymm4,%ymm8
6863	vpslld	$32-25,%ymm4,%ymm4
6864	vpxor	%ymm8,%ymm4,%ymm4
6865	vmovdqa	128(%rbp),%ymm8
6866	vpalignr	$12,%ymm7,%ymm7,%ymm7
6867	vpalignr	$8,%ymm11,%ymm11,%ymm11
6868	vpalignr	$4,%ymm15,%ymm15,%ymm15
6869	vpalignr	$12,%ymm6,%ymm6,%ymm6
6870	vpalignr	$8,%ymm10,%ymm10,%ymm10
6871	vpalignr	$4,%ymm14,%ymm14,%ymm14
6872	vpalignr	$12,%ymm5,%ymm5,%ymm5
6873	movq	%r13,%r10
6874	movq	%r14,%r11
6875	movq	%r15,%r12
6876	andq	$3,%r12
6877	movq	%r15,%r13
6878	andq	$-4,%r13
6879	movq	%r9,%r14
6880	shrdq	$2,%r9,%r15
6881	shrq	$2,%r9
6882	addq	%r13,%r10
6883	adcq	%r14,%r11
6884	adcq	$0,%r12
6885	addq	%r15,%r10
6886	adcq	%r9,%r11
6887	adcq	$0,%r12
6888	vpalignr	$8,%ymm9,%ymm9,%ymm9
6889	vpalignr	$4,%ymm13,%ymm13,%ymm13
6890	vpalignr	$12,%ymm4,%ymm4,%ymm4
6891	vpalignr	$8,%ymm8,%ymm8,%ymm8
6892	vpalignr	$4,%ymm12,%ymm12,%ymm12
6893
6894	decq	%rcx
6895	jne	2b
6896	vpaddd	.chacha20_consts(%rip),%ymm3,%ymm3
6897	vpaddd	64(%rbp),%ymm7,%ymm7
6898	vpaddd	96(%rbp),%ymm11,%ymm11
6899	vpaddd	256(%rbp),%ymm15,%ymm15
6900	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
6901	vpaddd	64(%rbp),%ymm6,%ymm6
6902	vpaddd	96(%rbp),%ymm10,%ymm10
6903	vpaddd	224(%rbp),%ymm14,%ymm14
6904	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
6905	vpaddd	64(%rbp),%ymm5,%ymm5
6906	vpaddd	96(%rbp),%ymm9,%ymm9
6907	vpaddd	192(%rbp),%ymm13,%ymm13
6908	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
6909	vpaddd	64(%rbp),%ymm4,%ymm4
6910	vpaddd	96(%rbp),%ymm8,%ymm8
6911	vpaddd	160(%rbp),%ymm12,%ymm12
6912
6913	leaq	32(%rdi),%rdi
6914	vmovdqa	%ymm0,128(%rbp)
6915	addq	-32(%rdi),%r10
6916	adcq	8+-32(%rdi),%r11
6917	adcq	$1,%r12
6918	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
6919	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
6920	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
6921	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
6922	vpxor	0+0(%rsi),%ymm0,%ymm0
6923	vpxor	32+0(%rsi),%ymm3,%ymm3
6924	vpxor	64+0(%rsi),%ymm7,%ymm7
6925	vpxor	96+0(%rsi),%ymm11,%ymm11
6926	vmovdqu	%ymm0,0+0(%rdi)
6927	vmovdqu	%ymm3,32+0(%rdi)
6928	vmovdqu	%ymm7,64+0(%rdi)
6929	vmovdqu	%ymm11,96+0(%rdi)
6930
6931	vmovdqa	128(%rbp),%ymm0
6932	movq	0+0(%rbp),%rax
6933	movq	%rax,%r15
6934	mulq	%r10
6935	movq	%rax,%r13
6936	movq	%rdx,%r14
6937	movq	0+0(%rbp),%rax
6938	mulq	%r11
6939	imulq	%r12,%r15
6940	addq	%rax,%r14
6941	adcq	%rdx,%r15
6942	movq	8+0(%rbp),%rax
6943	movq	%rax,%r9
6944	mulq	%r10
6945	addq	%rax,%r14
6946	adcq	$0,%rdx
6947	movq	%rdx,%r10
6948	movq	8+0(%rbp),%rax
6949	mulq	%r11
6950	addq	%rax,%r15
6951	adcq	$0,%rdx
6952	imulq	%r12,%r9
6953	addq	%r10,%r15
6954	adcq	%rdx,%r9
6955	movq	%r13,%r10
6956	movq	%r14,%r11
6957	movq	%r15,%r12
6958	andq	$3,%r12
6959	movq	%r15,%r13
6960	andq	$-4,%r13
6961	movq	%r9,%r14
6962	shrdq	$2,%r9,%r15
6963	shrq	$2,%r9
6964	addq	%r13,%r10
6965	adcq	%r14,%r11
6966	adcq	$0,%r12
6967	addq	%r15,%r10
6968	adcq	%r9,%r11
6969	adcq	$0,%r12
6970	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
6971	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
6972	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
6973	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
6974	vpxor	0+128(%rsi),%ymm3,%ymm3
6975	vpxor	32+128(%rsi),%ymm2,%ymm2
6976	vpxor	64+128(%rsi),%ymm6,%ymm6
6977	vpxor	96+128(%rsi),%ymm10,%ymm10
6978	vmovdqu	%ymm3,0+128(%rdi)
6979	vmovdqu	%ymm2,32+128(%rdi)
6980	vmovdqu	%ymm6,64+128(%rdi)
6981	vmovdqu	%ymm10,96+128(%rdi)
6982	addq	-16(%rdi),%r10
6983	adcq	8+-16(%rdi),%r11
6984	adcq	$1,%r12
6985	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
6986	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
6987	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
6988	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
6989	vpxor	0+256(%rsi),%ymm3,%ymm3
6990	vpxor	32+256(%rsi),%ymm1,%ymm1
6991	vpxor	64+256(%rsi),%ymm5,%ymm5
6992	vpxor	96+256(%rsi),%ymm9,%ymm9
6993	vmovdqu	%ymm3,0+256(%rdi)
6994	vmovdqu	%ymm1,32+256(%rdi)
6995	vmovdqu	%ymm5,64+256(%rdi)
6996	vmovdqu	%ymm9,96+256(%rdi)
6997	movq	0+0(%rbp),%rax
6998	movq	%rax,%r15
6999	mulq	%r10
7000	movq	%rax,%r13
7001	movq	%rdx,%r14
7002	movq	0+0(%rbp),%rax
7003	mulq	%r11
7004	imulq	%r12,%r15
7005	addq	%rax,%r14
7006	adcq	%rdx,%r15
7007	movq	8+0(%rbp),%rax
7008	movq	%rax,%r9
7009	mulq	%r10
7010	addq	%rax,%r14
7011	adcq	$0,%rdx
7012	movq	%rdx,%r10
7013	movq	8+0(%rbp),%rax
7014	mulq	%r11
7015	addq	%rax,%r15
7016	adcq	$0,%rdx
7017	imulq	%r12,%r9
7018	addq	%r10,%r15
7019	adcq	%rdx,%r9
7020	movq	%r13,%r10
7021	movq	%r14,%r11
7022	movq	%r15,%r12
7023	andq	$3,%r12
7024	movq	%r15,%r13
7025	andq	$-4,%r13
7026	movq	%r9,%r14
7027	shrdq	$2,%r9,%r15
7028	shrq	$2,%r9
7029	addq	%r13,%r10
7030	adcq	%r14,%r11
7031	adcq	$0,%r12
7032	addq	%r15,%r10
7033	adcq	%r9,%r11
7034	adcq	$0,%r12
7035	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
7036	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
7037	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
7038	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
7039	vpxor	0+384(%rsi),%ymm3,%ymm3
7040	vpxor	32+384(%rsi),%ymm0,%ymm0
7041	vpxor	64+384(%rsi),%ymm4,%ymm4
7042	vpxor	96+384(%rsi),%ymm8,%ymm8
7043	vmovdqu	%ymm3,0+384(%rdi)
7044	vmovdqu	%ymm0,32+384(%rdi)
7045	vmovdqu	%ymm4,64+384(%rdi)
7046	vmovdqu	%ymm8,96+384(%rdi)
7047
7048	leaq	512(%rsi),%rsi
7049	subq	$512,%rbx
7050	cmpq	$512,%rbx
7051	jg	1b
7052	addq	0(%rdi),%r10
7053	adcq	8+0(%rdi),%r11
7054	adcq	$1,%r12
7055	movq	0+0(%rbp),%rax
7056	movq	%rax,%r15
7057	mulq	%r10
7058	movq	%rax,%r13
7059	movq	%rdx,%r14
7060	movq	0+0(%rbp),%rax
7061	mulq	%r11
7062	imulq	%r12,%r15
7063	addq	%rax,%r14
7064	adcq	%rdx,%r15
7065	movq	8+0(%rbp),%rax
7066	movq	%rax,%r9
7067	mulq	%r10
7068	addq	%rax,%r14
7069	adcq	$0,%rdx
7070	movq	%rdx,%r10
7071	movq	8+0(%rbp),%rax
7072	mulq	%r11
7073	addq	%rax,%r15
7074	adcq	$0,%rdx
7075	imulq	%r12,%r9
7076	addq	%r10,%r15
7077	adcq	%rdx,%r9
7078	movq	%r13,%r10
7079	movq	%r14,%r11
7080	movq	%r15,%r12
7081	andq	$3,%r12
7082	movq	%r15,%r13
7083	andq	$-4,%r13
7084	movq	%r9,%r14
7085	shrdq	$2,%r9,%r15
7086	shrq	$2,%r9
7087	addq	%r13,%r10
7088	adcq	%r14,%r11
7089	adcq	$0,%r12
7090	addq	%r15,%r10
7091	adcq	%r9,%r11
7092	adcq	$0,%r12
7093	addq	16(%rdi),%r10
7094	adcq	8+16(%rdi),%r11
7095	adcq	$1,%r12
7096	movq	0+0(%rbp),%rax
7097	movq	%rax,%r15
7098	mulq	%r10
7099	movq	%rax,%r13
7100	movq	%rdx,%r14
7101	movq	0+0(%rbp),%rax
7102	mulq	%r11
7103	imulq	%r12,%r15
7104	addq	%rax,%r14
7105	adcq	%rdx,%r15
7106	movq	8+0(%rbp),%rax
7107	movq	%rax,%r9
7108	mulq	%r10
7109	addq	%rax,%r14
7110	adcq	$0,%rdx
7111	movq	%rdx,%r10
7112	movq	8+0(%rbp),%rax
7113	mulq	%r11
7114	addq	%rax,%r15
7115	adcq	$0,%rdx
7116	imulq	%r12,%r9
7117	addq	%r10,%r15
7118	adcq	%rdx,%r9
7119	movq	%r13,%r10
7120	movq	%r14,%r11
7121	movq	%r15,%r12
7122	andq	$3,%r12
7123	movq	%r15,%r13
7124	andq	$-4,%r13
7125	movq	%r9,%r14
7126	shrdq	$2,%r9,%r15
7127	shrq	$2,%r9
7128	addq	%r13,%r10
7129	adcq	%r14,%r11
7130	adcq	$0,%r12
7131	addq	%r15,%r10
7132	adcq	%r9,%r11
7133	adcq	$0,%r12
7134
7135	leaq	32(%rdi),%rdi
7136	movq	$10,%rcx
7137	xorq	%r8,%r8
7138	cmpq	$128,%rbx
7139	ja	3f
7140
7141seal_avx2_tail_128:
7142	vmovdqa	.chacha20_consts(%rip),%ymm0
7143	vmovdqa	64(%rbp),%ymm4
7144	vmovdqa	96(%rbp),%ymm8
7145	vmovdqa	.avx2_inc(%rip),%ymm12
7146	vpaddd	160(%rbp),%ymm12,%ymm12
7147	vmovdqa	%ymm12,160(%rbp)
7148
71491:
7150	addq	0(%rdi),%r10
7151	adcq	8+0(%rdi),%r11
7152	adcq	$1,%r12
7153	movq	0+0(%rbp),%rax
7154	movq	%rax,%r15
7155	mulq	%r10
7156	movq	%rax,%r13
7157	movq	%rdx,%r14
7158	movq	0+0(%rbp),%rax
7159	mulq	%r11
7160	imulq	%r12,%r15
7161	addq	%rax,%r14
7162	adcq	%rdx,%r15
7163	movq	8+0(%rbp),%rax
7164	movq	%rax,%r9
7165	mulq	%r10
7166	addq	%rax,%r14
7167	adcq	$0,%rdx
7168	movq	%rdx,%r10
7169	movq	8+0(%rbp),%rax
7170	mulq	%r11
7171	addq	%rax,%r15
7172	adcq	$0,%rdx
7173	imulq	%r12,%r9
7174	addq	%r10,%r15
7175	adcq	%rdx,%r9
7176	movq	%r13,%r10
7177	movq	%r14,%r11
7178	movq	%r15,%r12
7179	andq	$3,%r12
7180	movq	%r15,%r13
7181	andq	$-4,%r13
7182	movq	%r9,%r14
7183	shrdq	$2,%r9,%r15
7184	shrq	$2,%r9
7185	addq	%r13,%r10
7186	adcq	%r14,%r11
7187	adcq	$0,%r12
7188	addq	%r15,%r10
7189	adcq	%r9,%r11
7190	adcq	$0,%r12
7191
7192	leaq	16(%rdi),%rdi
71932:
7194	vpaddd	%ymm4,%ymm0,%ymm0
7195	vpxor	%ymm0,%ymm12,%ymm12
7196	vpshufb	.rol16(%rip),%ymm12,%ymm12
7197	vpaddd	%ymm12,%ymm8,%ymm8
7198	vpxor	%ymm8,%ymm4,%ymm4
7199	vpsrld	$20,%ymm4,%ymm3
7200	vpslld	$12,%ymm4,%ymm4
7201	vpxor	%ymm3,%ymm4,%ymm4
7202	vpaddd	%ymm4,%ymm0,%ymm0
7203	vpxor	%ymm0,%ymm12,%ymm12
7204	vpshufb	.rol8(%rip),%ymm12,%ymm12
7205	vpaddd	%ymm12,%ymm8,%ymm8
7206	vpxor	%ymm8,%ymm4,%ymm4
7207	vpslld	$7,%ymm4,%ymm3
7208	vpsrld	$25,%ymm4,%ymm4
7209	vpxor	%ymm3,%ymm4,%ymm4
7210	vpalignr	$12,%ymm12,%ymm12,%ymm12
7211	vpalignr	$8,%ymm8,%ymm8,%ymm8
7212	vpalignr	$4,%ymm4,%ymm4,%ymm4
7213	addq	0(%rdi),%r10
7214	adcq	8+0(%rdi),%r11
7215	adcq	$1,%r12
7216	movq	0+0(%rbp),%rax
7217	movq	%rax,%r15
7218	mulq	%r10
7219	movq	%rax,%r13
7220	movq	%rdx,%r14
7221	movq	0+0(%rbp),%rax
7222	mulq	%r11
7223	imulq	%r12,%r15
7224	addq	%rax,%r14
7225	adcq	%rdx,%r15
7226	movq	8+0(%rbp),%rax
7227	movq	%rax,%r9
7228	mulq	%r10
7229	addq	%rax,%r14
7230	adcq	$0,%rdx
7231	movq	%rdx,%r10
7232	movq	8+0(%rbp),%rax
7233	mulq	%r11
7234	addq	%rax,%r15
7235	adcq	$0,%rdx
7236	imulq	%r12,%r9
7237	addq	%r10,%r15
7238	adcq	%rdx,%r9
7239	movq	%r13,%r10
7240	movq	%r14,%r11
7241	movq	%r15,%r12
7242	andq	$3,%r12
7243	movq	%r15,%r13
7244	andq	$-4,%r13
7245	movq	%r9,%r14
7246	shrdq	$2,%r9,%r15
7247	shrq	$2,%r9
7248	addq	%r13,%r10
7249	adcq	%r14,%r11
7250	adcq	$0,%r12
7251	addq	%r15,%r10
7252	adcq	%r9,%r11
7253	adcq	$0,%r12
7254	vpaddd	%ymm4,%ymm0,%ymm0
7255	vpxor	%ymm0,%ymm12,%ymm12
7256	vpshufb	.rol16(%rip),%ymm12,%ymm12
7257	vpaddd	%ymm12,%ymm8,%ymm8
7258	vpxor	%ymm8,%ymm4,%ymm4
7259	vpsrld	$20,%ymm4,%ymm3
7260	vpslld	$12,%ymm4,%ymm4
7261	vpxor	%ymm3,%ymm4,%ymm4
7262	vpaddd	%ymm4,%ymm0,%ymm0
7263	vpxor	%ymm0,%ymm12,%ymm12
7264	vpshufb	.rol8(%rip),%ymm12,%ymm12
7265	vpaddd	%ymm12,%ymm8,%ymm8
7266	vpxor	%ymm8,%ymm4,%ymm4
7267	vpslld	$7,%ymm4,%ymm3
7268	vpsrld	$25,%ymm4,%ymm4
7269	vpxor	%ymm3,%ymm4,%ymm4
7270	vpalignr	$4,%ymm12,%ymm12,%ymm12
7271	vpalignr	$8,%ymm8,%ymm8,%ymm8
7272	vpalignr	$12,%ymm4,%ymm4,%ymm4
7273	addq	16(%rdi),%r10
7274	adcq	8+16(%rdi),%r11
7275	adcq	$1,%r12
7276	movq	0+0(%rbp),%rax
7277	movq	%rax,%r15
7278	mulq	%r10
7279	movq	%rax,%r13
7280	movq	%rdx,%r14
7281	movq	0+0(%rbp),%rax
7282	mulq	%r11
7283	imulq	%r12,%r15
7284	addq	%rax,%r14
7285	adcq	%rdx,%r15
7286	movq	8+0(%rbp),%rax
7287	movq	%rax,%r9
7288	mulq	%r10
7289	addq	%rax,%r14
7290	adcq	$0,%rdx
7291	movq	%rdx,%r10
7292	movq	8+0(%rbp),%rax
7293	mulq	%r11
7294	addq	%rax,%r15
7295	adcq	$0,%rdx
7296	imulq	%r12,%r9
7297	addq	%r10,%r15
7298	adcq	%rdx,%r9
7299	movq	%r13,%r10
7300	movq	%r14,%r11
7301	movq	%r15,%r12
7302	andq	$3,%r12
7303	movq	%r15,%r13
7304	andq	$-4,%r13
7305	movq	%r9,%r14
7306	shrdq	$2,%r9,%r15
7307	shrq	$2,%r9
7308	addq	%r13,%r10
7309	adcq	%r14,%r11
7310	adcq	$0,%r12
7311	addq	%r15,%r10
7312	adcq	%r9,%r11
7313	adcq	$0,%r12
7314
7315	leaq	32(%rdi),%rdi
7316	decq	%rcx
7317	jg	1b
7318	decq	%r8
7319	jge	2b
7320	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
7321	vpaddd	64(%rbp),%ymm4,%ymm4
7322	vpaddd	96(%rbp),%ymm8,%ymm8
7323	vpaddd	160(%rbp),%ymm12,%ymm12
7324	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
7325	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
7326	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
7327	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
7328	vmovdqa	%ymm3,%ymm8
7329
7330	jmp	seal_avx2_short_loop
73313:
7332	cmpq	$256,%rbx
7333	ja	3f
7334
7335seal_avx2_tail_256:
7336	vmovdqa	.chacha20_consts(%rip),%ymm0
7337	vmovdqa	64(%rbp),%ymm4
7338	vmovdqa	96(%rbp),%ymm8
7339	vmovdqa	%ymm0,%ymm1
7340	vmovdqa	%ymm4,%ymm5
7341	vmovdqa	%ymm8,%ymm9
7342	vmovdqa	.avx2_inc(%rip),%ymm12
7343	vpaddd	160(%rbp),%ymm12,%ymm13
7344	vpaddd	%ymm13,%ymm12,%ymm12
7345	vmovdqa	%ymm12,160(%rbp)
7346	vmovdqa	%ymm13,192(%rbp)
7347
73481:
7349	addq	0(%rdi),%r10
7350	adcq	8+0(%rdi),%r11
7351	adcq	$1,%r12
7352	movq	0+0(%rbp),%rax
7353	movq	%rax,%r15
7354	mulq	%r10
7355	movq	%rax,%r13
7356	movq	%rdx,%r14
7357	movq	0+0(%rbp),%rax
7358	mulq	%r11
7359	imulq	%r12,%r15
7360	addq	%rax,%r14
7361	adcq	%rdx,%r15
7362	movq	8+0(%rbp),%rax
7363	movq	%rax,%r9
7364	mulq	%r10
7365	addq	%rax,%r14
7366	adcq	$0,%rdx
7367	movq	%rdx,%r10
7368	movq	8+0(%rbp),%rax
7369	mulq	%r11
7370	addq	%rax,%r15
7371	adcq	$0,%rdx
7372	imulq	%r12,%r9
7373	addq	%r10,%r15
7374	adcq	%rdx,%r9
7375	movq	%r13,%r10
7376	movq	%r14,%r11
7377	movq	%r15,%r12
7378	andq	$3,%r12
7379	movq	%r15,%r13
7380	andq	$-4,%r13
7381	movq	%r9,%r14
7382	shrdq	$2,%r9,%r15
7383	shrq	$2,%r9
7384	addq	%r13,%r10
7385	adcq	%r14,%r11
7386	adcq	$0,%r12
7387	addq	%r15,%r10
7388	adcq	%r9,%r11
7389	adcq	$0,%r12
7390
7391	leaq	16(%rdi),%rdi
73922:
7393	vpaddd	%ymm4,%ymm0,%ymm0
7394	vpxor	%ymm0,%ymm12,%ymm12
7395	vpshufb	.rol16(%rip),%ymm12,%ymm12
7396	vpaddd	%ymm12,%ymm8,%ymm8
7397	vpxor	%ymm8,%ymm4,%ymm4
7398	vpsrld	$20,%ymm4,%ymm3
7399	vpslld	$12,%ymm4,%ymm4
7400	vpxor	%ymm3,%ymm4,%ymm4
7401	vpaddd	%ymm4,%ymm0,%ymm0
7402	vpxor	%ymm0,%ymm12,%ymm12
7403	vpshufb	.rol8(%rip),%ymm12,%ymm12
7404	vpaddd	%ymm12,%ymm8,%ymm8
7405	vpxor	%ymm8,%ymm4,%ymm4
7406	vpslld	$7,%ymm4,%ymm3
7407	vpsrld	$25,%ymm4,%ymm4
7408	vpxor	%ymm3,%ymm4,%ymm4
7409	vpalignr	$12,%ymm12,%ymm12,%ymm12
7410	vpalignr	$8,%ymm8,%ymm8,%ymm8
7411	vpalignr	$4,%ymm4,%ymm4,%ymm4
7412	vpaddd	%ymm5,%ymm1,%ymm1
7413	vpxor	%ymm1,%ymm13,%ymm13
7414	vpshufb	.rol16(%rip),%ymm13,%ymm13
7415	vpaddd	%ymm13,%ymm9,%ymm9
7416	vpxor	%ymm9,%ymm5,%ymm5
7417	vpsrld	$20,%ymm5,%ymm3
7418	vpslld	$12,%ymm5,%ymm5
7419	vpxor	%ymm3,%ymm5,%ymm5
7420	vpaddd	%ymm5,%ymm1,%ymm1
7421	vpxor	%ymm1,%ymm13,%ymm13
7422	vpshufb	.rol8(%rip),%ymm13,%ymm13
7423	vpaddd	%ymm13,%ymm9,%ymm9
7424	vpxor	%ymm9,%ymm5,%ymm5
7425	vpslld	$7,%ymm5,%ymm3
7426	vpsrld	$25,%ymm5,%ymm5
7427	vpxor	%ymm3,%ymm5,%ymm5
7428	vpalignr	$12,%ymm13,%ymm13,%ymm13
7429	vpalignr	$8,%ymm9,%ymm9,%ymm9
7430	vpalignr	$4,%ymm5,%ymm5,%ymm5
7431	addq	0(%rdi),%r10
7432	adcq	8+0(%rdi),%r11
7433	adcq	$1,%r12
7434	movq	0+0(%rbp),%rax
7435	movq	%rax,%r15
7436	mulq	%r10
7437	movq	%rax,%r13
7438	movq	%rdx,%r14
7439	movq	0+0(%rbp),%rax
7440	mulq	%r11
7441	imulq	%r12,%r15
7442	addq	%rax,%r14
7443	adcq	%rdx,%r15
7444	movq	8+0(%rbp),%rax
7445	movq	%rax,%r9
7446	mulq	%r10
7447	addq	%rax,%r14
7448	adcq	$0,%rdx
7449	movq	%rdx,%r10
7450	movq	8+0(%rbp),%rax
7451	mulq	%r11
7452	addq	%rax,%r15
7453	adcq	$0,%rdx
7454	imulq	%r12,%r9
7455	addq	%r10,%r15
7456	adcq	%rdx,%r9
7457	movq	%r13,%r10
7458	movq	%r14,%r11
7459	movq	%r15,%r12
7460	andq	$3,%r12
7461	movq	%r15,%r13
7462	andq	$-4,%r13
7463	movq	%r9,%r14
7464	shrdq	$2,%r9,%r15
7465	shrq	$2,%r9
7466	addq	%r13,%r10
7467	adcq	%r14,%r11
7468	adcq	$0,%r12
7469	addq	%r15,%r10
7470	adcq	%r9,%r11
7471	adcq	$0,%r12
7472	vpaddd	%ymm4,%ymm0,%ymm0
7473	vpxor	%ymm0,%ymm12,%ymm12
7474	vpshufb	.rol16(%rip),%ymm12,%ymm12
7475	vpaddd	%ymm12,%ymm8,%ymm8
7476	vpxor	%ymm8,%ymm4,%ymm4
7477	vpsrld	$20,%ymm4,%ymm3
7478	vpslld	$12,%ymm4,%ymm4
7479	vpxor	%ymm3,%ymm4,%ymm4
7480	vpaddd	%ymm4,%ymm0,%ymm0
7481	vpxor	%ymm0,%ymm12,%ymm12
7482	vpshufb	.rol8(%rip),%ymm12,%ymm12
7483	vpaddd	%ymm12,%ymm8,%ymm8
7484	vpxor	%ymm8,%ymm4,%ymm4
7485	vpslld	$7,%ymm4,%ymm3
7486	vpsrld	$25,%ymm4,%ymm4
7487	vpxor	%ymm3,%ymm4,%ymm4
7488	vpalignr	$4,%ymm12,%ymm12,%ymm12
7489	vpalignr	$8,%ymm8,%ymm8,%ymm8
7490	vpalignr	$12,%ymm4,%ymm4,%ymm4
7491	vpaddd	%ymm5,%ymm1,%ymm1
7492	vpxor	%ymm1,%ymm13,%ymm13
7493	vpshufb	.rol16(%rip),%ymm13,%ymm13
7494	vpaddd	%ymm13,%ymm9,%ymm9
7495	vpxor	%ymm9,%ymm5,%ymm5
7496	vpsrld	$20,%ymm5,%ymm3
7497	vpslld	$12,%ymm5,%ymm5
7498	vpxor	%ymm3,%ymm5,%ymm5
7499	vpaddd	%ymm5,%ymm1,%ymm1
7500	vpxor	%ymm1,%ymm13,%ymm13
7501	vpshufb	.rol8(%rip),%ymm13,%ymm13
7502	vpaddd	%ymm13,%ymm9,%ymm9
7503	vpxor	%ymm9,%ymm5,%ymm5
7504	vpslld	$7,%ymm5,%ymm3
7505	vpsrld	$25,%ymm5,%ymm5
7506	vpxor	%ymm3,%ymm5,%ymm5
7507	vpalignr	$4,%ymm13,%ymm13,%ymm13
7508	vpalignr	$8,%ymm9,%ymm9,%ymm9
7509	vpalignr	$12,%ymm5,%ymm5,%ymm5
7510	addq	16(%rdi),%r10
7511	adcq	8+16(%rdi),%r11
7512	adcq	$1,%r12
7513	movq	0+0(%rbp),%rax
7514	movq	%rax,%r15
7515	mulq	%r10
7516	movq	%rax,%r13
7517	movq	%rdx,%r14
7518	movq	0+0(%rbp),%rax
7519	mulq	%r11
7520	imulq	%r12,%r15
7521	addq	%rax,%r14
7522	adcq	%rdx,%r15
7523	movq	8+0(%rbp),%rax
7524	movq	%rax,%r9
7525	mulq	%r10
7526	addq	%rax,%r14
7527	adcq	$0,%rdx
7528	movq	%rdx,%r10
7529	movq	8+0(%rbp),%rax
7530	mulq	%r11
7531	addq	%rax,%r15
7532	adcq	$0,%rdx
7533	imulq	%r12,%r9
7534	addq	%r10,%r15
7535	adcq	%rdx,%r9
7536	movq	%r13,%r10
7537	movq	%r14,%r11
7538	movq	%r15,%r12
7539	andq	$3,%r12
7540	movq	%r15,%r13
7541	andq	$-4,%r13
7542	movq	%r9,%r14
7543	shrdq	$2,%r9,%r15
7544	shrq	$2,%r9
7545	addq	%r13,%r10
7546	adcq	%r14,%r11
7547	adcq	$0,%r12
7548	addq	%r15,%r10
7549	adcq	%r9,%r11
7550	adcq	$0,%r12
7551
7552	leaq	32(%rdi),%rdi
7553	decq	%rcx
7554	jg	1b
7555	decq	%r8
7556	jge	2b
7557	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
7558	vpaddd	64(%rbp),%ymm5,%ymm5
7559	vpaddd	96(%rbp),%ymm9,%ymm9
7560	vpaddd	192(%rbp),%ymm13,%ymm13
7561	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
7562	vpaddd	64(%rbp),%ymm4,%ymm4
7563	vpaddd	96(%rbp),%ymm8,%ymm8
7564	vpaddd	160(%rbp),%ymm12,%ymm12
7565	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
7566	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
7567	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
7568	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
7569	vpxor	0+0(%rsi),%ymm3,%ymm3
7570	vpxor	32+0(%rsi),%ymm1,%ymm1
7571	vpxor	64+0(%rsi),%ymm5,%ymm5
7572	vpxor	96+0(%rsi),%ymm9,%ymm9
7573	vmovdqu	%ymm3,0+0(%rdi)
7574	vmovdqu	%ymm1,32+0(%rdi)
7575	vmovdqu	%ymm5,64+0(%rdi)
7576	vmovdqu	%ymm9,96+0(%rdi)
7577	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
7578	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
7579	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
7580	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
7581	vmovdqa	%ymm3,%ymm8
7582
7583	movq	$128,%rcx
7584	leaq	128(%rsi),%rsi
7585	subq	$128,%rbx
7586	jmp	seal_avx2_hash
75873:
7588	cmpq	$384,%rbx
7589	ja	seal_avx2_tail_512
7590
7591seal_avx2_tail_384:
7592	vmovdqa	.chacha20_consts(%rip),%ymm0
7593	vmovdqa	64(%rbp),%ymm4
7594	vmovdqa	96(%rbp),%ymm8
7595	vmovdqa	%ymm0,%ymm1
7596	vmovdqa	%ymm4,%ymm5
7597	vmovdqa	%ymm8,%ymm9
7598	vmovdqa	%ymm0,%ymm2
7599	vmovdqa	%ymm4,%ymm6
7600	vmovdqa	%ymm8,%ymm10
7601	vmovdqa	.avx2_inc(%rip),%ymm12
7602	vpaddd	160(%rbp),%ymm12,%ymm14
7603	vpaddd	%ymm14,%ymm12,%ymm13
7604	vpaddd	%ymm13,%ymm12,%ymm12
7605	vmovdqa	%ymm12,160(%rbp)
7606	vmovdqa	%ymm13,192(%rbp)
7607	vmovdqa	%ymm14,224(%rbp)
7608
76091:
7610	addq	0(%rdi),%r10
7611	adcq	8+0(%rdi),%r11
7612	adcq	$1,%r12
7613	movq	0+0(%rbp),%rax
7614	movq	%rax,%r15
7615	mulq	%r10
7616	movq	%rax,%r13
7617	movq	%rdx,%r14
7618	movq	0+0(%rbp),%rax
7619	mulq	%r11
7620	imulq	%r12,%r15
7621	addq	%rax,%r14
7622	adcq	%rdx,%r15
7623	movq	8+0(%rbp),%rax
7624	movq	%rax,%r9
7625	mulq	%r10
7626	addq	%rax,%r14
7627	adcq	$0,%rdx
7628	movq	%rdx,%r10
7629	movq	8+0(%rbp),%rax
7630	mulq	%r11
7631	addq	%rax,%r15
7632	adcq	$0,%rdx
7633	imulq	%r12,%r9
7634	addq	%r10,%r15
7635	adcq	%rdx,%r9
7636	movq	%r13,%r10
7637	movq	%r14,%r11
7638	movq	%r15,%r12
7639	andq	$3,%r12
7640	movq	%r15,%r13
7641	andq	$-4,%r13
7642	movq	%r9,%r14
7643	shrdq	$2,%r9,%r15
7644	shrq	$2,%r9
7645	addq	%r13,%r10
7646	adcq	%r14,%r11
7647	adcq	$0,%r12
7648	addq	%r15,%r10
7649	adcq	%r9,%r11
7650	adcq	$0,%r12
7651
7652	leaq	16(%rdi),%rdi
76532:
7654	vpaddd	%ymm4,%ymm0,%ymm0
7655	vpxor	%ymm0,%ymm12,%ymm12
7656	vpshufb	.rol16(%rip),%ymm12,%ymm12
7657	vpaddd	%ymm12,%ymm8,%ymm8
7658	vpxor	%ymm8,%ymm4,%ymm4
7659	vpsrld	$20,%ymm4,%ymm3
7660	vpslld	$12,%ymm4,%ymm4
7661	vpxor	%ymm3,%ymm4,%ymm4
7662	vpaddd	%ymm4,%ymm0,%ymm0
7663	vpxor	%ymm0,%ymm12,%ymm12
7664	vpshufb	.rol8(%rip),%ymm12,%ymm12
7665	vpaddd	%ymm12,%ymm8,%ymm8
7666	vpxor	%ymm8,%ymm4,%ymm4
7667	vpslld	$7,%ymm4,%ymm3
7668	vpsrld	$25,%ymm4,%ymm4
7669	vpxor	%ymm3,%ymm4,%ymm4
7670	vpalignr	$12,%ymm12,%ymm12,%ymm12
7671	vpalignr	$8,%ymm8,%ymm8,%ymm8
7672	vpalignr	$4,%ymm4,%ymm4,%ymm4
7673	vpaddd	%ymm5,%ymm1,%ymm1
7674	vpxor	%ymm1,%ymm13,%ymm13
7675	vpshufb	.rol16(%rip),%ymm13,%ymm13
7676	vpaddd	%ymm13,%ymm9,%ymm9
7677	vpxor	%ymm9,%ymm5,%ymm5
7678	vpsrld	$20,%ymm5,%ymm3
7679	vpslld	$12,%ymm5,%ymm5
7680	vpxor	%ymm3,%ymm5,%ymm5
7681	vpaddd	%ymm5,%ymm1,%ymm1
7682	vpxor	%ymm1,%ymm13,%ymm13
7683	vpshufb	.rol8(%rip),%ymm13,%ymm13
7684	vpaddd	%ymm13,%ymm9,%ymm9
7685	vpxor	%ymm9,%ymm5,%ymm5
7686	vpslld	$7,%ymm5,%ymm3
7687	vpsrld	$25,%ymm5,%ymm5
7688	vpxor	%ymm3,%ymm5,%ymm5
7689	vpalignr	$12,%ymm13,%ymm13,%ymm13
7690	vpalignr	$8,%ymm9,%ymm9,%ymm9
7691	vpalignr	$4,%ymm5,%ymm5,%ymm5
7692	addq	0(%rdi),%r10
7693	adcq	8+0(%rdi),%r11
7694	adcq	$1,%r12
7695	movq	0+0(%rbp),%rax
7696	movq	%rax,%r15
7697	mulq	%r10
7698	movq	%rax,%r13
7699	movq	%rdx,%r14
7700	movq	0+0(%rbp),%rax
7701	mulq	%r11
7702	imulq	%r12,%r15
7703	addq	%rax,%r14
7704	adcq	%rdx,%r15
7705	movq	8+0(%rbp),%rax
7706	movq	%rax,%r9
7707	mulq	%r10
7708	addq	%rax,%r14
7709	adcq	$0,%rdx
7710	movq	%rdx,%r10
7711	movq	8+0(%rbp),%rax
7712	mulq	%r11
7713	addq	%rax,%r15
7714	adcq	$0,%rdx
7715	imulq	%r12,%r9
7716	addq	%r10,%r15
7717	adcq	%rdx,%r9
7718	movq	%r13,%r10
7719	movq	%r14,%r11
7720	movq	%r15,%r12
7721	andq	$3,%r12
7722	movq	%r15,%r13
7723	andq	$-4,%r13
7724	movq	%r9,%r14
7725	shrdq	$2,%r9,%r15
7726	shrq	$2,%r9
7727	addq	%r13,%r10
7728	adcq	%r14,%r11
7729	adcq	$0,%r12
7730	addq	%r15,%r10
7731	adcq	%r9,%r11
7732	adcq	$0,%r12
7733	vpaddd	%ymm6,%ymm2,%ymm2
7734	vpxor	%ymm2,%ymm14,%ymm14
7735	vpshufb	.rol16(%rip),%ymm14,%ymm14
7736	vpaddd	%ymm14,%ymm10,%ymm10
7737	vpxor	%ymm10,%ymm6,%ymm6
7738	vpsrld	$20,%ymm6,%ymm3
7739	vpslld	$12,%ymm6,%ymm6
7740	vpxor	%ymm3,%ymm6,%ymm6
7741	vpaddd	%ymm6,%ymm2,%ymm2
7742	vpxor	%ymm2,%ymm14,%ymm14
7743	vpshufb	.rol8(%rip),%ymm14,%ymm14
7744	vpaddd	%ymm14,%ymm10,%ymm10
7745	vpxor	%ymm10,%ymm6,%ymm6
7746	vpslld	$7,%ymm6,%ymm3
7747	vpsrld	$25,%ymm6,%ymm6
7748	vpxor	%ymm3,%ymm6,%ymm6
7749	vpalignr	$12,%ymm14,%ymm14,%ymm14
7750	vpalignr	$8,%ymm10,%ymm10,%ymm10
7751	vpalignr	$4,%ymm6,%ymm6,%ymm6
7752	vpaddd	%ymm4,%ymm0,%ymm0
7753	vpxor	%ymm0,%ymm12,%ymm12
7754	vpshufb	.rol16(%rip),%ymm12,%ymm12
7755	vpaddd	%ymm12,%ymm8,%ymm8
7756	vpxor	%ymm8,%ymm4,%ymm4
7757	vpsrld	$20,%ymm4,%ymm3
7758	vpslld	$12,%ymm4,%ymm4
7759	vpxor	%ymm3,%ymm4,%ymm4
7760	vpaddd	%ymm4,%ymm0,%ymm0
7761	vpxor	%ymm0,%ymm12,%ymm12
7762	vpshufb	.rol8(%rip),%ymm12,%ymm12
7763	vpaddd	%ymm12,%ymm8,%ymm8
7764	vpxor	%ymm8,%ymm4,%ymm4
7765	vpslld	$7,%ymm4,%ymm3
7766	vpsrld	$25,%ymm4,%ymm4
7767	vpxor	%ymm3,%ymm4,%ymm4
7768	vpalignr	$4,%ymm12,%ymm12,%ymm12
7769	vpalignr	$8,%ymm8,%ymm8,%ymm8
7770	vpalignr	$12,%ymm4,%ymm4,%ymm4
7771	addq	16(%rdi),%r10
7772	adcq	8+16(%rdi),%r11
7773	adcq	$1,%r12
7774	movq	0+0(%rbp),%rax
7775	movq	%rax,%r15
7776	mulq	%r10
7777	movq	%rax,%r13
7778	movq	%rdx,%r14
7779	movq	0+0(%rbp),%rax
7780	mulq	%r11
7781	imulq	%r12,%r15
7782	addq	%rax,%r14
7783	adcq	%rdx,%r15
7784	movq	8+0(%rbp),%rax
7785	movq	%rax,%r9
7786	mulq	%r10
7787	addq	%rax,%r14
7788	adcq	$0,%rdx
7789	movq	%rdx,%r10
7790	movq	8+0(%rbp),%rax
7791	mulq	%r11
7792	addq	%rax,%r15
7793	adcq	$0,%rdx
7794	imulq	%r12,%r9
7795	addq	%r10,%r15
7796	adcq	%rdx,%r9
7797	movq	%r13,%r10
7798	movq	%r14,%r11
7799	movq	%r15,%r12
7800	andq	$3,%r12
7801	movq	%r15,%r13
7802	andq	$-4,%r13
7803	movq	%r9,%r14
7804	shrdq	$2,%r9,%r15
7805	shrq	$2,%r9
7806	addq	%r13,%r10
7807	adcq	%r14,%r11
7808	adcq	$0,%r12
7809	addq	%r15,%r10
7810	adcq	%r9,%r11
7811	adcq	$0,%r12
7812	vpaddd	%ymm5,%ymm1,%ymm1
7813	vpxor	%ymm1,%ymm13,%ymm13
7814	vpshufb	.rol16(%rip),%ymm13,%ymm13
7815	vpaddd	%ymm13,%ymm9,%ymm9
7816	vpxor	%ymm9,%ymm5,%ymm5
7817	vpsrld	$20,%ymm5,%ymm3
7818	vpslld	$12,%ymm5,%ymm5
7819	vpxor	%ymm3,%ymm5,%ymm5
7820	vpaddd	%ymm5,%ymm1,%ymm1
7821	vpxor	%ymm1,%ymm13,%ymm13
7822	vpshufb	.rol8(%rip),%ymm13,%ymm13
7823	vpaddd	%ymm13,%ymm9,%ymm9
7824	vpxor	%ymm9,%ymm5,%ymm5
7825	vpslld	$7,%ymm5,%ymm3
7826	vpsrld	$25,%ymm5,%ymm5
7827	vpxor	%ymm3,%ymm5,%ymm5
7828	vpalignr	$4,%ymm13,%ymm13,%ymm13
7829	vpalignr	$8,%ymm9,%ymm9,%ymm9
7830	vpalignr	$12,%ymm5,%ymm5,%ymm5
7831	vpaddd	%ymm6,%ymm2,%ymm2
7832	vpxor	%ymm2,%ymm14,%ymm14
7833	vpshufb	.rol16(%rip),%ymm14,%ymm14
7834	vpaddd	%ymm14,%ymm10,%ymm10
7835	vpxor	%ymm10,%ymm6,%ymm6
7836	vpsrld	$20,%ymm6,%ymm3
7837	vpslld	$12,%ymm6,%ymm6
7838	vpxor	%ymm3,%ymm6,%ymm6
7839	vpaddd	%ymm6,%ymm2,%ymm2
7840	vpxor	%ymm2,%ymm14,%ymm14
7841	vpshufb	.rol8(%rip),%ymm14,%ymm14
7842	vpaddd	%ymm14,%ymm10,%ymm10
7843	vpxor	%ymm10,%ymm6,%ymm6
7844	vpslld	$7,%ymm6,%ymm3
7845	vpsrld	$25,%ymm6,%ymm6
7846	vpxor	%ymm3,%ymm6,%ymm6
7847	vpalignr	$4,%ymm14,%ymm14,%ymm14
7848	vpalignr	$8,%ymm10,%ymm10,%ymm10
7849	vpalignr	$12,%ymm6,%ymm6,%ymm6
7850
7851	leaq	32(%rdi),%rdi
7852	decq	%rcx
7853	jg	1b
7854	decq	%r8
7855	jge	2b
7856	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
7857	vpaddd	64(%rbp),%ymm6,%ymm6
7858	vpaddd	96(%rbp),%ymm10,%ymm10
7859	vpaddd	224(%rbp),%ymm14,%ymm14
7860	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
7861	vpaddd	64(%rbp),%ymm5,%ymm5
7862	vpaddd	96(%rbp),%ymm9,%ymm9
7863	vpaddd	192(%rbp),%ymm13,%ymm13
7864	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
7865	vpaddd	64(%rbp),%ymm4,%ymm4
7866	vpaddd	96(%rbp),%ymm8,%ymm8
7867	vpaddd	160(%rbp),%ymm12,%ymm12
7868	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
7869	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
7870	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
7871	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
7872	vpxor	0+0(%rsi),%ymm3,%ymm3
7873	vpxor	32+0(%rsi),%ymm2,%ymm2
7874	vpxor	64+0(%rsi),%ymm6,%ymm6
7875	vpxor	96+0(%rsi),%ymm10,%ymm10
7876	vmovdqu	%ymm3,0+0(%rdi)
7877	vmovdqu	%ymm2,32+0(%rdi)
7878	vmovdqu	%ymm6,64+0(%rdi)
7879	vmovdqu	%ymm10,96+0(%rdi)
7880	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
7881	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
7882	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
7883	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
7884	vpxor	0+128(%rsi),%ymm3,%ymm3
7885	vpxor	32+128(%rsi),%ymm1,%ymm1
7886	vpxor	64+128(%rsi),%ymm5,%ymm5
7887	vpxor	96+128(%rsi),%ymm9,%ymm9
7888	vmovdqu	%ymm3,0+128(%rdi)
7889	vmovdqu	%ymm1,32+128(%rdi)
7890	vmovdqu	%ymm5,64+128(%rdi)
7891	vmovdqu	%ymm9,96+128(%rdi)
7892	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
7893	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
7894	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
7895	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
7896	vmovdqa	%ymm3,%ymm8
7897
7898	movq	$256,%rcx
7899	leaq	256(%rsi),%rsi
7900	subq	$256,%rbx
7901	jmp	seal_avx2_hash
7902
7903seal_avx2_tail_512:
7904	vmovdqa	.chacha20_consts(%rip),%ymm0
7905	vmovdqa	64(%rbp),%ymm4
7906	vmovdqa	96(%rbp),%ymm8
7907	vmovdqa	%ymm0,%ymm1
7908	vmovdqa	%ymm4,%ymm5
7909	vmovdqa	%ymm8,%ymm9
7910	vmovdqa	%ymm0,%ymm2
7911	vmovdqa	%ymm4,%ymm6
7912	vmovdqa	%ymm8,%ymm10
7913	vmovdqa	%ymm0,%ymm3
7914	vmovdqa	%ymm4,%ymm7
7915	vmovdqa	%ymm8,%ymm11
7916	vmovdqa	.avx2_inc(%rip),%ymm12
7917	vpaddd	160(%rbp),%ymm12,%ymm15
7918	vpaddd	%ymm15,%ymm12,%ymm14
7919	vpaddd	%ymm14,%ymm12,%ymm13
7920	vpaddd	%ymm13,%ymm12,%ymm12
7921	vmovdqa	%ymm15,256(%rbp)
7922	vmovdqa	%ymm14,224(%rbp)
7923	vmovdqa	%ymm13,192(%rbp)
7924	vmovdqa	%ymm12,160(%rbp)
7925
79261:
7927	addq	0(%rdi),%r10
7928	adcq	8+0(%rdi),%r11
7929	adcq	$1,%r12
7930	movq	0+0(%rbp),%rdx
7931	movq	%rdx,%r15
7932	mulxq	%r10,%r13,%r14
7933	mulxq	%r11,%rax,%rdx
7934	imulq	%r12,%r15
7935	addq	%rax,%r14
7936	adcq	%rdx,%r15
7937	movq	8+0(%rbp),%rdx
7938	mulxq	%r10,%r10,%rax
7939	addq	%r10,%r14
7940	mulxq	%r11,%r11,%r9
7941	adcq	%r11,%r15
7942	adcq	$0,%r9
7943	imulq	%r12,%rdx
7944	addq	%rax,%r15
7945	adcq	%rdx,%r9
7946	movq	%r13,%r10
7947	movq	%r14,%r11
7948	movq	%r15,%r12
7949	andq	$3,%r12
7950	movq	%r15,%r13
7951	andq	$-4,%r13
7952	movq	%r9,%r14
7953	shrdq	$2,%r9,%r15
7954	shrq	$2,%r9
7955	addq	%r13,%r10
7956	adcq	%r14,%r11
7957	adcq	$0,%r12
7958	addq	%r15,%r10
7959	adcq	%r9,%r11
7960	adcq	$0,%r12
7961
7962	leaq	16(%rdi),%rdi
79632:
7964	vmovdqa	%ymm8,128(%rbp)
7965	vmovdqa	.rol16(%rip),%ymm8
7966	vpaddd	%ymm7,%ymm3,%ymm3
7967	vpaddd	%ymm6,%ymm2,%ymm2
7968	vpaddd	%ymm5,%ymm1,%ymm1
7969	vpaddd	%ymm4,%ymm0,%ymm0
7970	vpxor	%ymm3,%ymm15,%ymm15
7971	vpxor	%ymm2,%ymm14,%ymm14
7972	vpxor	%ymm1,%ymm13,%ymm13
7973	vpxor	%ymm0,%ymm12,%ymm12
7974	vpshufb	%ymm8,%ymm15,%ymm15
7975	vpshufb	%ymm8,%ymm14,%ymm14
7976	vpshufb	%ymm8,%ymm13,%ymm13
7977	vpshufb	%ymm8,%ymm12,%ymm12
7978	vmovdqa	128(%rbp),%ymm8
7979	vpaddd	%ymm15,%ymm11,%ymm11
7980	vpaddd	%ymm14,%ymm10,%ymm10
7981	vpaddd	%ymm13,%ymm9,%ymm9
7982	vpaddd	%ymm12,%ymm8,%ymm8
7983	vpxor	%ymm11,%ymm7,%ymm7
7984	addq	0(%rdi),%r10
7985	adcq	8+0(%rdi),%r11
7986	adcq	$1,%r12
7987	vpxor	%ymm10,%ymm6,%ymm6
7988	vpxor	%ymm9,%ymm5,%ymm5
7989	vpxor	%ymm8,%ymm4,%ymm4
7990	vmovdqa	%ymm8,128(%rbp)
7991	vpsrld	$20,%ymm7,%ymm8
7992	vpslld	$32-20,%ymm7,%ymm7
7993	vpxor	%ymm8,%ymm7,%ymm7
7994	vpsrld	$20,%ymm6,%ymm8
7995	vpslld	$32-20,%ymm6,%ymm6
7996	vpxor	%ymm8,%ymm6,%ymm6
7997	vpsrld	$20,%ymm5,%ymm8
7998	vpslld	$32-20,%ymm5,%ymm5
7999	vpxor	%ymm8,%ymm5,%ymm5
8000	vpsrld	$20,%ymm4,%ymm8
8001	vpslld	$32-20,%ymm4,%ymm4
8002	vpxor	%ymm8,%ymm4,%ymm4
8003	vmovdqa	.rol8(%rip),%ymm8
8004	vpaddd	%ymm7,%ymm3,%ymm3
8005	vpaddd	%ymm6,%ymm2,%ymm2
8006	vpaddd	%ymm5,%ymm1,%ymm1
8007	movq	0+0(%rbp),%rdx
8008	movq	%rdx,%r15
8009	mulxq	%r10,%r13,%r14
8010	mulxq	%r11,%rax,%rdx
8011	imulq	%r12,%r15
8012	addq	%rax,%r14
8013	adcq	%rdx,%r15
8014	vpaddd	%ymm4,%ymm0,%ymm0
8015	vpxor	%ymm3,%ymm15,%ymm15
8016	vpxor	%ymm2,%ymm14,%ymm14
8017	vpxor	%ymm1,%ymm13,%ymm13
8018	vpxor	%ymm0,%ymm12,%ymm12
8019	vpshufb	%ymm8,%ymm15,%ymm15
8020	vpshufb	%ymm8,%ymm14,%ymm14
8021	vpshufb	%ymm8,%ymm13,%ymm13
8022	vpshufb	%ymm8,%ymm12,%ymm12
8023	vmovdqa	128(%rbp),%ymm8
8024	vpaddd	%ymm15,%ymm11,%ymm11
8025	vpaddd	%ymm14,%ymm10,%ymm10
8026	vpaddd	%ymm13,%ymm9,%ymm9
8027	vpaddd	%ymm12,%ymm8,%ymm8
8028	vpxor	%ymm11,%ymm7,%ymm7
8029	vpxor	%ymm10,%ymm6,%ymm6
8030	vpxor	%ymm9,%ymm5,%ymm5
8031	vpxor	%ymm8,%ymm4,%ymm4
8032	vmovdqa	%ymm8,128(%rbp)
8033	vpsrld	$25,%ymm7,%ymm8
8034	movq	8+0(%rbp),%rdx
8035	mulxq	%r10,%r10,%rax
8036	addq	%r10,%r14
8037	mulxq	%r11,%r11,%r9
8038	adcq	%r11,%r15
8039	adcq	$0,%r9
8040	imulq	%r12,%rdx
8041	vpslld	$32-25,%ymm7,%ymm7
8042	vpxor	%ymm8,%ymm7,%ymm7
8043	vpsrld	$25,%ymm6,%ymm8
8044	vpslld	$32-25,%ymm6,%ymm6
8045	vpxor	%ymm8,%ymm6,%ymm6
8046	vpsrld	$25,%ymm5,%ymm8
8047	vpslld	$32-25,%ymm5,%ymm5
8048	vpxor	%ymm8,%ymm5,%ymm5
8049	vpsrld	$25,%ymm4,%ymm8
8050	vpslld	$32-25,%ymm4,%ymm4
8051	vpxor	%ymm8,%ymm4,%ymm4
8052	vmovdqa	128(%rbp),%ymm8
8053	vpalignr	$4,%ymm7,%ymm7,%ymm7
8054	vpalignr	$8,%ymm11,%ymm11,%ymm11
8055	vpalignr	$12,%ymm15,%ymm15,%ymm15
8056	vpalignr	$4,%ymm6,%ymm6,%ymm6
8057	vpalignr	$8,%ymm10,%ymm10,%ymm10
8058	vpalignr	$12,%ymm14,%ymm14,%ymm14
8059	vpalignr	$4,%ymm5,%ymm5,%ymm5
8060	vpalignr	$8,%ymm9,%ymm9,%ymm9
8061	addq	%rax,%r15
8062	adcq	%rdx,%r9
8063	vpalignr	$12,%ymm13,%ymm13,%ymm13
8064	vpalignr	$4,%ymm4,%ymm4,%ymm4
8065	vpalignr	$8,%ymm8,%ymm8,%ymm8
8066	vpalignr	$12,%ymm12,%ymm12,%ymm12
8067	vmovdqa	%ymm8,128(%rbp)
8068	vmovdqa	.rol16(%rip),%ymm8
8069	vpaddd	%ymm7,%ymm3,%ymm3
8070	vpaddd	%ymm6,%ymm2,%ymm2
8071	vpaddd	%ymm5,%ymm1,%ymm1
8072	vpaddd	%ymm4,%ymm0,%ymm0
8073	vpxor	%ymm3,%ymm15,%ymm15
8074	vpxor	%ymm2,%ymm14,%ymm14
8075	vpxor	%ymm1,%ymm13,%ymm13
8076	vpxor	%ymm0,%ymm12,%ymm12
8077	vpshufb	%ymm8,%ymm15,%ymm15
8078	vpshufb	%ymm8,%ymm14,%ymm14
8079	vpshufb	%ymm8,%ymm13,%ymm13
8080	vpshufb	%ymm8,%ymm12,%ymm12
8081	vmovdqa	128(%rbp),%ymm8
8082	vpaddd	%ymm15,%ymm11,%ymm11
8083	movq	%r13,%r10
8084	movq	%r14,%r11
8085	movq	%r15,%r12
8086	andq	$3,%r12
8087	movq	%r15,%r13
8088	andq	$-4,%r13
8089	movq	%r9,%r14
8090	shrdq	$2,%r9,%r15
8091	shrq	$2,%r9
8092	addq	%r13,%r10
8093	adcq	%r14,%r11
8094	adcq	$0,%r12
8095	addq	%r15,%r10
8096	adcq	%r9,%r11
8097	adcq	$0,%r12
8098	vpaddd	%ymm14,%ymm10,%ymm10
8099	vpaddd	%ymm13,%ymm9,%ymm9
8100	vpaddd	%ymm12,%ymm8,%ymm8
8101	vpxor	%ymm11,%ymm7,%ymm7
8102	vpxor	%ymm10,%ymm6,%ymm6
8103	vpxor	%ymm9,%ymm5,%ymm5
8104	vpxor	%ymm8,%ymm4,%ymm4
8105	vmovdqa	%ymm8,128(%rbp)
8106	vpsrld	$20,%ymm7,%ymm8
8107	vpslld	$32-20,%ymm7,%ymm7
8108	vpxor	%ymm8,%ymm7,%ymm7
8109	vpsrld	$20,%ymm6,%ymm8
8110	vpslld	$32-20,%ymm6,%ymm6
8111	vpxor	%ymm8,%ymm6,%ymm6
8112	vpsrld	$20,%ymm5,%ymm8
8113	vpslld	$32-20,%ymm5,%ymm5
8114	vpxor	%ymm8,%ymm5,%ymm5
8115	vpsrld	$20,%ymm4,%ymm8
8116	vpslld	$32-20,%ymm4,%ymm4
8117	vpxor	%ymm8,%ymm4,%ymm4
8118	addq	16(%rdi),%r10
8119	adcq	8+16(%rdi),%r11
8120	adcq	$1,%r12
8121	vmovdqa	.rol8(%rip),%ymm8
8122	vpaddd	%ymm7,%ymm3,%ymm3
8123	vpaddd	%ymm6,%ymm2,%ymm2
8124	vpaddd	%ymm5,%ymm1,%ymm1
8125	vpaddd	%ymm4,%ymm0,%ymm0
8126	vpxor	%ymm3,%ymm15,%ymm15
8127	vpxor	%ymm2,%ymm14,%ymm14
8128	vpxor	%ymm1,%ymm13,%ymm13
8129	vpxor	%ymm0,%ymm12,%ymm12
8130	vpshufb	%ymm8,%ymm15,%ymm15
8131	vpshufb	%ymm8,%ymm14,%ymm14
8132	vpshufb	%ymm8,%ymm13,%ymm13
8133	vpshufb	%ymm8,%ymm12,%ymm12
8134	vmovdqa	128(%rbp),%ymm8
8135	vpaddd	%ymm15,%ymm11,%ymm11
8136	vpaddd	%ymm14,%ymm10,%ymm10
8137	vpaddd	%ymm13,%ymm9,%ymm9
8138	vpaddd	%ymm12,%ymm8,%ymm8
8139	vpxor	%ymm11,%ymm7,%ymm7
8140	vpxor	%ymm10,%ymm6,%ymm6
8141	movq	0+0(%rbp),%rdx
8142	movq	%rdx,%r15
8143	mulxq	%r10,%r13,%r14
8144	mulxq	%r11,%rax,%rdx
8145	imulq	%r12,%r15
8146	addq	%rax,%r14
8147	adcq	%rdx,%r15
8148	vpxor	%ymm9,%ymm5,%ymm5
8149	vpxor	%ymm8,%ymm4,%ymm4
8150	vmovdqa	%ymm8,128(%rbp)
8151	vpsrld	$25,%ymm7,%ymm8
8152	vpslld	$32-25,%ymm7,%ymm7
8153	vpxor	%ymm8,%ymm7,%ymm7
8154	vpsrld	$25,%ymm6,%ymm8
8155	vpslld	$32-25,%ymm6,%ymm6
8156	vpxor	%ymm8,%ymm6,%ymm6
8157	vpsrld	$25,%ymm5,%ymm8
8158	vpslld	$32-25,%ymm5,%ymm5
8159	vpxor	%ymm8,%ymm5,%ymm5
8160	vpsrld	$25,%ymm4,%ymm8
8161	vpslld	$32-25,%ymm4,%ymm4
8162	vpxor	%ymm8,%ymm4,%ymm4
8163	vmovdqa	128(%rbp),%ymm8
8164	vpalignr	$12,%ymm7,%ymm7,%ymm7
8165	vpalignr	$8,%ymm11,%ymm11,%ymm11
8166	vpalignr	$4,%ymm15,%ymm15,%ymm15
8167	vpalignr	$12,%ymm6,%ymm6,%ymm6
8168	movq	8+0(%rbp),%rdx
8169	mulxq	%r10,%r10,%rax
8170	addq	%r10,%r14
8171	mulxq	%r11,%r11,%r9
8172	adcq	%r11,%r15
8173	adcq	$0,%r9
8174	imulq	%r12,%rdx
8175	vpalignr	$8,%ymm10,%ymm10,%ymm10
8176	vpalignr	$4,%ymm14,%ymm14,%ymm14
8177	vpalignr	$12,%ymm5,%ymm5,%ymm5
8178	vpalignr	$8,%ymm9,%ymm9,%ymm9
8179	vpalignr	$4,%ymm13,%ymm13,%ymm13
8180	vpalignr	$12,%ymm4,%ymm4,%ymm4
8181	vpalignr	$8,%ymm8,%ymm8,%ymm8
8182	vpalignr	$4,%ymm12,%ymm12,%ymm12
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195	addq	%rax,%r15
8196	adcq	%rdx,%r9
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217	movq	%r13,%r10
8218	movq	%r14,%r11
8219	movq	%r15,%r12
8220	andq	$3,%r12
8221	movq	%r15,%r13
8222	andq	$-4,%r13
8223	movq	%r9,%r14
8224	shrdq	$2,%r9,%r15
8225	shrq	$2,%r9
8226	addq	%r13,%r10
8227	adcq	%r14,%r11
8228	adcq	$0,%r12
8229	addq	%r15,%r10
8230	adcq	%r9,%r11
8231	adcq	$0,%r12
8232
8233	leaq	32(%rdi),%rdi
8234	decq	%rcx
8235	jg	1b
8236	decq	%r8
8237	jge	2b
8238	vpaddd	.chacha20_consts(%rip),%ymm3,%ymm3
8239	vpaddd	64(%rbp),%ymm7,%ymm7
8240	vpaddd	96(%rbp),%ymm11,%ymm11
8241	vpaddd	256(%rbp),%ymm15,%ymm15
8242	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
8243	vpaddd	64(%rbp),%ymm6,%ymm6
8244	vpaddd	96(%rbp),%ymm10,%ymm10
8245	vpaddd	224(%rbp),%ymm14,%ymm14
8246	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
8247	vpaddd	64(%rbp),%ymm5,%ymm5
8248	vpaddd	96(%rbp),%ymm9,%ymm9
8249	vpaddd	192(%rbp),%ymm13,%ymm13
8250	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
8251	vpaddd	64(%rbp),%ymm4,%ymm4
8252	vpaddd	96(%rbp),%ymm8,%ymm8
8253	vpaddd	160(%rbp),%ymm12,%ymm12
8254
8255	vmovdqa	%ymm0,128(%rbp)
8256	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
8257	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
8258	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
8259	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
8260	vpxor	0+0(%rsi),%ymm0,%ymm0
8261	vpxor	32+0(%rsi),%ymm3,%ymm3
8262	vpxor	64+0(%rsi),%ymm7,%ymm7
8263	vpxor	96+0(%rsi),%ymm11,%ymm11
8264	vmovdqu	%ymm0,0+0(%rdi)
8265	vmovdqu	%ymm3,32+0(%rdi)
8266	vmovdqu	%ymm7,64+0(%rdi)
8267	vmovdqu	%ymm11,96+0(%rdi)
8268
8269	vmovdqa	128(%rbp),%ymm0
8270	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
8271	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
8272	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
8273	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
8274	vpxor	0+128(%rsi),%ymm3,%ymm3
8275	vpxor	32+128(%rsi),%ymm2,%ymm2
8276	vpxor	64+128(%rsi),%ymm6,%ymm6
8277	vpxor	96+128(%rsi),%ymm10,%ymm10
8278	vmovdqu	%ymm3,0+128(%rdi)
8279	vmovdqu	%ymm2,32+128(%rdi)
8280	vmovdqu	%ymm6,64+128(%rdi)
8281	vmovdqu	%ymm10,96+128(%rdi)
8282	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
8283	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
8284	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
8285	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
8286	vpxor	0+256(%rsi),%ymm3,%ymm3
8287	vpxor	32+256(%rsi),%ymm1,%ymm1
8288	vpxor	64+256(%rsi),%ymm5,%ymm5
8289	vpxor	96+256(%rsi),%ymm9,%ymm9
8290	vmovdqu	%ymm3,0+256(%rdi)
8291	vmovdqu	%ymm1,32+256(%rdi)
8292	vmovdqu	%ymm5,64+256(%rdi)
8293	vmovdqu	%ymm9,96+256(%rdi)
8294	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
8295	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
8296	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
8297	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
8298	vmovdqa	%ymm3,%ymm8
8299
8300	movq	$384,%rcx
8301	leaq	384(%rsi),%rsi
8302	subq	$384,%rbx
8303	jmp	seal_avx2_hash
8304
8305seal_avx2_320:
8306	vmovdqa	%ymm0,%ymm1
8307	vmovdqa	%ymm0,%ymm2
8308	vmovdqa	%ymm4,%ymm5
8309	vmovdqa	%ymm4,%ymm6
8310	vmovdqa	%ymm8,%ymm9
8311	vmovdqa	%ymm8,%ymm10
8312	vpaddd	.avx2_inc(%rip),%ymm12,%ymm13
8313	vpaddd	.avx2_inc(%rip),%ymm13,%ymm14
8314	vmovdqa	%ymm4,%ymm7
8315	vmovdqa	%ymm8,%ymm11
8316	vmovdqa	%ymm12,160(%rbp)
8317	vmovdqa	%ymm13,192(%rbp)
8318	vmovdqa	%ymm14,224(%rbp)
8319	movq	$10,%r10
83201:
8321	vpaddd	%ymm4,%ymm0,%ymm0
8322	vpxor	%ymm0,%ymm12,%ymm12
8323	vpshufb	.rol16(%rip),%ymm12,%ymm12
8324	vpaddd	%ymm12,%ymm8,%ymm8
8325	vpxor	%ymm8,%ymm4,%ymm4
8326	vpsrld	$20,%ymm4,%ymm3
8327	vpslld	$12,%ymm4,%ymm4
8328	vpxor	%ymm3,%ymm4,%ymm4
8329	vpaddd	%ymm4,%ymm0,%ymm0
8330	vpxor	%ymm0,%ymm12,%ymm12
8331	vpshufb	.rol8(%rip),%ymm12,%ymm12
8332	vpaddd	%ymm12,%ymm8,%ymm8
8333	vpxor	%ymm8,%ymm4,%ymm4
8334	vpslld	$7,%ymm4,%ymm3
8335	vpsrld	$25,%ymm4,%ymm4
8336	vpxor	%ymm3,%ymm4,%ymm4
8337	vpalignr	$12,%ymm12,%ymm12,%ymm12
8338	vpalignr	$8,%ymm8,%ymm8,%ymm8
8339	vpalignr	$4,%ymm4,%ymm4,%ymm4
8340	vpaddd	%ymm5,%ymm1,%ymm1
8341	vpxor	%ymm1,%ymm13,%ymm13
8342	vpshufb	.rol16(%rip),%ymm13,%ymm13
8343	vpaddd	%ymm13,%ymm9,%ymm9
8344	vpxor	%ymm9,%ymm5,%ymm5
8345	vpsrld	$20,%ymm5,%ymm3
8346	vpslld	$12,%ymm5,%ymm5
8347	vpxor	%ymm3,%ymm5,%ymm5
8348	vpaddd	%ymm5,%ymm1,%ymm1
8349	vpxor	%ymm1,%ymm13,%ymm13
8350	vpshufb	.rol8(%rip),%ymm13,%ymm13
8351	vpaddd	%ymm13,%ymm9,%ymm9
8352	vpxor	%ymm9,%ymm5,%ymm5
8353	vpslld	$7,%ymm5,%ymm3
8354	vpsrld	$25,%ymm5,%ymm5
8355	vpxor	%ymm3,%ymm5,%ymm5
8356	vpalignr	$12,%ymm13,%ymm13,%ymm13
8357	vpalignr	$8,%ymm9,%ymm9,%ymm9
8358	vpalignr	$4,%ymm5,%ymm5,%ymm5
8359	vpaddd	%ymm6,%ymm2,%ymm2
8360	vpxor	%ymm2,%ymm14,%ymm14
8361	vpshufb	.rol16(%rip),%ymm14,%ymm14
8362	vpaddd	%ymm14,%ymm10,%ymm10
8363	vpxor	%ymm10,%ymm6,%ymm6
8364	vpsrld	$20,%ymm6,%ymm3
8365	vpslld	$12,%ymm6,%ymm6
8366	vpxor	%ymm3,%ymm6,%ymm6
8367	vpaddd	%ymm6,%ymm2,%ymm2
8368	vpxor	%ymm2,%ymm14,%ymm14
8369	vpshufb	.rol8(%rip),%ymm14,%ymm14
8370	vpaddd	%ymm14,%ymm10,%ymm10
8371	vpxor	%ymm10,%ymm6,%ymm6
8372	vpslld	$7,%ymm6,%ymm3
8373	vpsrld	$25,%ymm6,%ymm6
8374	vpxor	%ymm3,%ymm6,%ymm6
8375	vpalignr	$12,%ymm14,%ymm14,%ymm14
8376	vpalignr	$8,%ymm10,%ymm10,%ymm10
8377	vpalignr	$4,%ymm6,%ymm6,%ymm6
8378	vpaddd	%ymm4,%ymm0,%ymm0
8379	vpxor	%ymm0,%ymm12,%ymm12
8380	vpshufb	.rol16(%rip),%ymm12,%ymm12
8381	vpaddd	%ymm12,%ymm8,%ymm8
8382	vpxor	%ymm8,%ymm4,%ymm4
8383	vpsrld	$20,%ymm4,%ymm3
8384	vpslld	$12,%ymm4,%ymm4
8385	vpxor	%ymm3,%ymm4,%ymm4
8386	vpaddd	%ymm4,%ymm0,%ymm0
8387	vpxor	%ymm0,%ymm12,%ymm12
8388	vpshufb	.rol8(%rip),%ymm12,%ymm12
8389	vpaddd	%ymm12,%ymm8,%ymm8
8390	vpxor	%ymm8,%ymm4,%ymm4
8391	vpslld	$7,%ymm4,%ymm3
8392	vpsrld	$25,%ymm4,%ymm4
8393	vpxor	%ymm3,%ymm4,%ymm4
8394	vpalignr	$4,%ymm12,%ymm12,%ymm12
8395	vpalignr	$8,%ymm8,%ymm8,%ymm8
8396	vpalignr	$12,%ymm4,%ymm4,%ymm4
8397	vpaddd	%ymm5,%ymm1,%ymm1
8398	vpxor	%ymm1,%ymm13,%ymm13
8399	vpshufb	.rol16(%rip),%ymm13,%ymm13
8400	vpaddd	%ymm13,%ymm9,%ymm9
8401	vpxor	%ymm9,%ymm5,%ymm5
8402	vpsrld	$20,%ymm5,%ymm3
8403	vpslld	$12,%ymm5,%ymm5
8404	vpxor	%ymm3,%ymm5,%ymm5
8405	vpaddd	%ymm5,%ymm1,%ymm1
8406	vpxor	%ymm1,%ymm13,%ymm13
8407	vpshufb	.rol8(%rip),%ymm13,%ymm13
8408	vpaddd	%ymm13,%ymm9,%ymm9
8409	vpxor	%ymm9,%ymm5,%ymm5
8410	vpslld	$7,%ymm5,%ymm3
8411	vpsrld	$25,%ymm5,%ymm5
8412	vpxor	%ymm3,%ymm5,%ymm5
8413	vpalignr	$4,%ymm13,%ymm13,%ymm13
8414	vpalignr	$8,%ymm9,%ymm9,%ymm9
8415	vpalignr	$12,%ymm5,%ymm5,%ymm5
8416	vpaddd	%ymm6,%ymm2,%ymm2
8417	vpxor	%ymm2,%ymm14,%ymm14
8418	vpshufb	.rol16(%rip),%ymm14,%ymm14
8419	vpaddd	%ymm14,%ymm10,%ymm10
8420	vpxor	%ymm10,%ymm6,%ymm6
8421	vpsrld	$20,%ymm6,%ymm3
8422	vpslld	$12,%ymm6,%ymm6
8423	vpxor	%ymm3,%ymm6,%ymm6
8424	vpaddd	%ymm6,%ymm2,%ymm2
8425	vpxor	%ymm2,%ymm14,%ymm14
8426	vpshufb	.rol8(%rip),%ymm14,%ymm14
8427	vpaddd	%ymm14,%ymm10,%ymm10
8428	vpxor	%ymm10,%ymm6,%ymm6
8429	vpslld	$7,%ymm6,%ymm3
8430	vpsrld	$25,%ymm6,%ymm6
8431	vpxor	%ymm3,%ymm6,%ymm6
8432	vpalignr	$4,%ymm14,%ymm14,%ymm14
8433	vpalignr	$8,%ymm10,%ymm10,%ymm10
8434	vpalignr	$12,%ymm6,%ymm6,%ymm6
8435
8436	decq	%r10
8437	jne	1b
8438	vpaddd	.chacha20_consts(%rip),%ymm0,%ymm0
8439	vpaddd	.chacha20_consts(%rip),%ymm1,%ymm1
8440	vpaddd	.chacha20_consts(%rip),%ymm2,%ymm2
8441	vpaddd	%ymm7,%ymm4,%ymm4
8442	vpaddd	%ymm7,%ymm5,%ymm5
8443	vpaddd	%ymm7,%ymm6,%ymm6
8444	vpaddd	%ymm11,%ymm8,%ymm8
8445	vpaddd	%ymm11,%ymm9,%ymm9
8446	vpaddd	%ymm11,%ymm10,%ymm10
8447	vpaddd	160(%rbp),%ymm12,%ymm12
8448	vpaddd	192(%rbp),%ymm13,%ymm13
8449	vpaddd	224(%rbp),%ymm14,%ymm14
8450	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
8451
8452	vpand	.clamp(%rip),%ymm3,%ymm3
8453	vmovdqa	%ymm3,0(%rbp)
8454
8455	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
8456	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
8457	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
8458	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
8459	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
8460	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
8461	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
8462	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
8463	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
8464	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
8465	jmp	seal_avx2_short
8466
8467seal_avx2_192:
8468	vmovdqa	%ymm0,%ymm1
8469	vmovdqa	%ymm0,%ymm2
8470	vmovdqa	%ymm4,%ymm5
8471	vmovdqa	%ymm4,%ymm6
8472	vmovdqa	%ymm8,%ymm9
8473	vmovdqa	%ymm8,%ymm10
8474	vpaddd	.avx2_inc(%rip),%ymm12,%ymm13
8475	vmovdqa	%ymm12,%ymm11
8476	vmovdqa	%ymm13,%ymm15
8477	movq	$10,%r10
84781:
8479	vpaddd	%ymm4,%ymm0,%ymm0
8480	vpxor	%ymm0,%ymm12,%ymm12
8481	vpshufb	.rol16(%rip),%ymm12,%ymm12
8482	vpaddd	%ymm12,%ymm8,%ymm8
8483	vpxor	%ymm8,%ymm4,%ymm4
8484	vpsrld	$20,%ymm4,%ymm3
8485	vpslld	$12,%ymm4,%ymm4
8486	vpxor	%ymm3,%ymm4,%ymm4
8487	vpaddd	%ymm4,%ymm0,%ymm0
8488	vpxor	%ymm0,%ymm12,%ymm12
8489	vpshufb	.rol8(%rip),%ymm12,%ymm12
8490	vpaddd	%ymm12,%ymm8,%ymm8
8491	vpxor	%ymm8,%ymm4,%ymm4
8492	vpslld	$7,%ymm4,%ymm3
8493	vpsrld	$25,%ymm4,%ymm4
8494	vpxor	%ymm3,%ymm4,%ymm4
8495	vpalignr	$12,%ymm12,%ymm12,%ymm12
8496	vpalignr	$8,%ymm8,%ymm8,%ymm8
8497	vpalignr	$4,%ymm4,%ymm4,%ymm4
8498	vpaddd	%ymm5,%ymm1,%ymm1
8499	vpxor	%ymm1,%ymm13,%ymm13
8500	vpshufb	.rol16(%rip),%ymm13,%ymm13
8501	vpaddd	%ymm13,%ymm9,%ymm9
8502	vpxor	%ymm9,%ymm5,%ymm5
8503	vpsrld	$20,%ymm5,%ymm3
8504	vpslld	$12,%ymm5,%ymm5
8505	vpxor	%ymm3,%ymm5,%ymm5
8506	vpaddd	%ymm5,%ymm1,%ymm1
8507	vpxor	%ymm1,%ymm13,%ymm13
8508	vpshufb	.rol8(%rip),%ymm13,%ymm13
8509	vpaddd	%ymm13,%ymm9,%ymm9
8510	vpxor	%ymm9,%ymm5,%ymm5
8511	vpslld	$7,%ymm5,%ymm3
8512	vpsrld	$25,%ymm5,%ymm5
8513	vpxor	%ymm3,%ymm5,%ymm5
8514	vpalignr	$12,%ymm13,%ymm13,%ymm13
8515	vpalignr	$8,%ymm9,%ymm9,%ymm9
8516	vpalignr	$4,%ymm5,%ymm5,%ymm5
8517	vpaddd	%ymm4,%ymm0,%ymm0
8518	vpxor	%ymm0,%ymm12,%ymm12
8519	vpshufb	.rol16(%rip),%ymm12,%ymm12
8520	vpaddd	%ymm12,%ymm8,%ymm8
8521	vpxor	%ymm8,%ymm4,%ymm4
8522	vpsrld	$20,%ymm4,%ymm3
8523	vpslld	$12,%ymm4,%ymm4
8524	vpxor	%ymm3,%ymm4,%ymm4
8525	vpaddd	%ymm4,%ymm0,%ymm0
8526	vpxor	%ymm0,%ymm12,%ymm12
8527	vpshufb	.rol8(%rip),%ymm12,%ymm12
8528	vpaddd	%ymm12,%ymm8,%ymm8
8529	vpxor	%ymm8,%ymm4,%ymm4
8530	vpslld	$7,%ymm4,%ymm3
8531	vpsrld	$25,%ymm4,%ymm4
8532	vpxor	%ymm3,%ymm4,%ymm4
8533	vpalignr	$4,%ymm12,%ymm12,%ymm12
8534	vpalignr	$8,%ymm8,%ymm8,%ymm8
8535	vpalignr	$12,%ymm4,%ymm4,%ymm4
8536	vpaddd	%ymm5,%ymm1,%ymm1
8537	vpxor	%ymm1,%ymm13,%ymm13
8538	vpshufb	.rol16(%rip),%ymm13,%ymm13
8539	vpaddd	%ymm13,%ymm9,%ymm9
8540	vpxor	%ymm9,%ymm5,%ymm5
8541	vpsrld	$20,%ymm5,%ymm3
8542	vpslld	$12,%ymm5,%ymm5
8543	vpxor	%ymm3,%ymm5,%ymm5
8544	vpaddd	%ymm5,%ymm1,%ymm1
8545	vpxor	%ymm1,%ymm13,%ymm13
8546	vpshufb	.rol8(%rip),%ymm13,%ymm13
8547	vpaddd	%ymm13,%ymm9,%ymm9
8548	vpxor	%ymm9,%ymm5,%ymm5
8549	vpslld	$7,%ymm5,%ymm3
8550	vpsrld	$25,%ymm5,%ymm5
8551	vpxor	%ymm3,%ymm5,%ymm5
8552	vpalignr	$4,%ymm13,%ymm13,%ymm13
8553	vpalignr	$8,%ymm9,%ymm9,%ymm9
8554	vpalignr	$12,%ymm5,%ymm5,%ymm5
8555
8556	decq	%r10
8557	jne	1b
8558	vpaddd	%ymm2,%ymm0,%ymm0
8559	vpaddd	%ymm2,%ymm1,%ymm1
8560	vpaddd	%ymm6,%ymm4,%ymm4
8561	vpaddd	%ymm6,%ymm5,%ymm5
8562	vpaddd	%ymm10,%ymm8,%ymm8
8563	vpaddd	%ymm10,%ymm9,%ymm9
8564	vpaddd	%ymm11,%ymm12,%ymm12
8565	vpaddd	%ymm15,%ymm13,%ymm13
8566	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
8567
8568	vpand	.clamp(%rip),%ymm3,%ymm3
8569	vmovdqa	%ymm3,0(%rbp)
8570
8571	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
8572	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
8573	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
8574	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
8575	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
8576	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
8577seal_avx2_short:
8578	movq	%r8,%r8
8579	call	poly_hash_ad_internal
8580	xorq	%rcx,%rcx
8581seal_avx2_hash:
8582	cmpq	$16,%rcx
8583	jb	seal_avx2_short_loop
8584	addq	0(%rdi),%r10
8585	adcq	8+0(%rdi),%r11
8586	adcq	$1,%r12
8587	movq	0+0(%rbp),%rax
8588	movq	%rax,%r15
8589	mulq	%r10
8590	movq	%rax,%r13
8591	movq	%rdx,%r14
8592	movq	0+0(%rbp),%rax
8593	mulq	%r11
8594	imulq	%r12,%r15
8595	addq	%rax,%r14
8596	adcq	%rdx,%r15
8597	movq	8+0(%rbp),%rax
8598	movq	%rax,%r9
8599	mulq	%r10
8600	addq	%rax,%r14
8601	adcq	$0,%rdx
8602	movq	%rdx,%r10
8603	movq	8+0(%rbp),%rax
8604	mulq	%r11
8605	addq	%rax,%r15
8606	adcq	$0,%rdx
8607	imulq	%r12,%r9
8608	addq	%r10,%r15
8609	adcq	%rdx,%r9
8610	movq	%r13,%r10
8611	movq	%r14,%r11
8612	movq	%r15,%r12
8613	andq	$3,%r12
8614	movq	%r15,%r13
8615	andq	$-4,%r13
8616	movq	%r9,%r14
8617	shrdq	$2,%r9,%r15
8618	shrq	$2,%r9
8619	addq	%r13,%r10
8620	adcq	%r14,%r11
8621	adcq	$0,%r12
8622	addq	%r15,%r10
8623	adcq	%r9,%r11
8624	adcq	$0,%r12
8625
8626	subq	$16,%rcx
8627	addq	$16,%rdi
8628	jmp	seal_avx2_hash
8629seal_avx2_short_loop:
8630	cmpq	$32,%rbx
8631	jb	seal_avx2_short_tail
8632	subq	$32,%rbx
8633
8634	vpxor	(%rsi),%ymm0,%ymm0
8635	vmovdqu	%ymm0,(%rdi)
8636	leaq	32(%rsi),%rsi
8637
8638	addq	0(%rdi),%r10
8639	adcq	8+0(%rdi),%r11
8640	adcq	$1,%r12
8641	movq	0+0(%rbp),%rax
8642	movq	%rax,%r15
8643	mulq	%r10
8644	movq	%rax,%r13
8645	movq	%rdx,%r14
8646	movq	0+0(%rbp),%rax
8647	mulq	%r11
8648	imulq	%r12,%r15
8649	addq	%rax,%r14
8650	adcq	%rdx,%r15
8651	movq	8+0(%rbp),%rax
8652	movq	%rax,%r9
8653	mulq	%r10
8654	addq	%rax,%r14
8655	adcq	$0,%rdx
8656	movq	%rdx,%r10
8657	movq	8+0(%rbp),%rax
8658	mulq	%r11
8659	addq	%rax,%r15
8660	adcq	$0,%rdx
8661	imulq	%r12,%r9
8662	addq	%r10,%r15
8663	adcq	%rdx,%r9
8664	movq	%r13,%r10
8665	movq	%r14,%r11
8666	movq	%r15,%r12
8667	andq	$3,%r12
8668	movq	%r15,%r13
8669	andq	$-4,%r13
8670	movq	%r9,%r14
8671	shrdq	$2,%r9,%r15
8672	shrq	$2,%r9
8673	addq	%r13,%r10
8674	adcq	%r14,%r11
8675	adcq	$0,%r12
8676	addq	%r15,%r10
8677	adcq	%r9,%r11
8678	adcq	$0,%r12
8679	addq	16(%rdi),%r10
8680	adcq	8+16(%rdi),%r11
8681	adcq	$1,%r12
8682	movq	0+0(%rbp),%rax
8683	movq	%rax,%r15
8684	mulq	%r10
8685	movq	%rax,%r13
8686	movq	%rdx,%r14
8687	movq	0+0(%rbp),%rax
8688	mulq	%r11
8689	imulq	%r12,%r15
8690	addq	%rax,%r14
8691	adcq	%rdx,%r15
8692	movq	8+0(%rbp),%rax
8693	movq	%rax,%r9
8694	mulq	%r10
8695	addq	%rax,%r14
8696	adcq	$0,%rdx
8697	movq	%rdx,%r10
8698	movq	8+0(%rbp),%rax
8699	mulq	%r11
8700	addq	%rax,%r15
8701	adcq	$0,%rdx
8702	imulq	%r12,%r9
8703	addq	%r10,%r15
8704	adcq	%rdx,%r9
8705	movq	%r13,%r10
8706	movq	%r14,%r11
8707	movq	%r15,%r12
8708	andq	$3,%r12
8709	movq	%r15,%r13
8710	andq	$-4,%r13
8711	movq	%r9,%r14
8712	shrdq	$2,%r9,%r15
8713	shrq	$2,%r9
8714	addq	%r13,%r10
8715	adcq	%r14,%r11
8716	adcq	$0,%r12
8717	addq	%r15,%r10
8718	adcq	%r9,%r11
8719	adcq	$0,%r12
8720
8721	leaq	32(%rdi),%rdi
8722
8723	vmovdqa	%ymm4,%ymm0
8724	vmovdqa	%ymm8,%ymm4
8725	vmovdqa	%ymm12,%ymm8
8726	vmovdqa	%ymm1,%ymm12
8727	vmovdqa	%ymm5,%ymm1
8728	vmovdqa	%ymm9,%ymm5
8729	vmovdqa	%ymm13,%ymm9
8730	vmovdqa	%ymm2,%ymm13
8731	vmovdqa	%ymm6,%ymm2
8732	jmp	seal_avx2_short_loop
8733seal_avx2_short_tail:
8734	cmpq	$16,%rbx
8735	jb	1f
8736	subq	$16,%rbx
8737	vpxor	(%rsi),%xmm0,%xmm3
8738	vmovdqu	%xmm3,(%rdi)
8739	leaq	16(%rsi),%rsi
8740	addq	0(%rdi),%r10
8741	adcq	8+0(%rdi),%r11
8742	adcq	$1,%r12
8743	movq	0+0(%rbp),%rax
8744	movq	%rax,%r15
8745	mulq	%r10
8746	movq	%rax,%r13
8747	movq	%rdx,%r14
8748	movq	0+0(%rbp),%rax
8749	mulq	%r11
8750	imulq	%r12,%r15
8751	addq	%rax,%r14
8752	adcq	%rdx,%r15
8753	movq	8+0(%rbp),%rax
8754	movq	%rax,%r9
8755	mulq	%r10
8756	addq	%rax,%r14
8757	adcq	$0,%rdx
8758	movq	%rdx,%r10
8759	movq	8+0(%rbp),%rax
8760	mulq	%r11
8761	addq	%rax,%r15
8762	adcq	$0,%rdx
8763	imulq	%r12,%r9
8764	addq	%r10,%r15
8765	adcq	%rdx,%r9
8766	movq	%r13,%r10
8767	movq	%r14,%r11
8768	movq	%r15,%r12
8769	andq	$3,%r12
8770	movq	%r15,%r13
8771	andq	$-4,%r13
8772	movq	%r9,%r14
8773	shrdq	$2,%r9,%r15
8774	shrq	$2,%r9
8775	addq	%r13,%r10
8776	adcq	%r14,%r11
8777	adcq	$0,%r12
8778	addq	%r15,%r10
8779	adcq	%r9,%r11
8780	adcq	$0,%r12
8781
8782	leaq	16(%rdi),%rdi
8783	vextracti128	$1,%ymm0,%xmm0
87841:
8785	vzeroupper
8786	jmp	seal_sse_tail_16
8787.cfi_endproc
8788#endif
8789