1#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# November 2014
18#
19# ChaCha20 for x86_64.
20#
21# December 2016
22#
23# Add AVX512F code path.
24#
25# Performance in cycles per byte out of large buffer.
26#
27#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    NxAVX(v)
28#
29# P4		9.48/+99%	-/22.7(ii)	-
30# Core2		7.83/+55%	7.90/8.08	4.35
31# Westmere	7.19/+50%	5.60/6.70	3.00
32# Sandy Bridge	8.31/+42%	5.45/6.76	2.72
33# Ivy Bridge	6.71/+46%	5.40/6.49	2.41
34# Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
35# Skylake[-X]	5.87/+39%	4.70/-		2.31	    1.19[0.57]
36# Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
37# Knights L	11.7/-		-		9.60(iii)   0.80
38# Goldmont	10.6/+17%	5.10/-		3.28
39# Sledgehammer	7.28/+52%	-/14.2(ii)	-
40# Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
41# Ryzen		5.96/+50%	5.19/-		2.40        2.09
42# VIA Nano	10.5/+46%	6.72/8.60	6.05
43#
44# (i)	compared to older gcc 3.x one can observe >2x improvement on
45#	most platforms;
46# (ii)	as it can be seen, SSE2 performance is too low on legacy
47#	processors; NxSSE2 results are naturally better, but not
48#	impressively better than IALU ones, which is why you won't
49#	find SSE2 code below;
50# (iii)	this is not optimal result for Atom because of MSROM
51#	limitations, SSE2 can do better, but gain is considered too
52#	low to justify the [maintenance] effort;
53# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20;
54#
55# Modified from upstream OpenSSL to remove the XOP code.
56
# Command line: [flavour] [output].  When the first argument contains a
# dot it is taken to be the output file name and no flavour is given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected either by an assembler flavour (nasm/masm/mingw64)
# or by a ".asm" output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Highest vector extension to generate code for: with 2 the AVX2 section
# ($avx>1) is emitted while the AVX512F dispatch ($avx>2) is not.
$avx = 2;

# Everything printed to STDOUT is piped through the translator.  The
# translator path is quoted so that a directory name containing spaces
# does not split the command line, and a failure to start the pipe is
# reported instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
72
# input parameter block
# Registers that carry the five arguments (out, inp, len, key, counter)
# of the generated functions; the heredocs below interpolate these names.
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

# Read-only constants placed at the start of .text:
#   .Lone           added to the counter row by the 1x code paths
#   .Linc, .Lfour   per-lane counter offsets and the 4-block stride of
#                   the 4x SSSE3 path
#   .Lrot16/.Lrot24 pshufb masks used for the 16- and 8-bit rotations
#   .Lsigma         the "expand 32-byte k" constant row
# The remaining tables (.Lzero, .Lincy, .Leight, .Lzeroz, .Lfourz,
# .Lincz, .Lsixteen) are not referenced by the scalar/SSSE3 code in this
# part of the file -- presumably they serve the wider AVX paths; confirm
# before removing any of them.
$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.asciz	"expand 32-byte k"
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
111
# Thunk that turns any undefined sub call such as &paddd($dst,$src) into
# one line of assembly appended to $code.  Arguments arrive in 32-bit
# perlasm order (destination first) and are emitted reversed; a final
# argument that is a plain number becomes a "$"-prefixed immediate.
sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
{ (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip package qualifier
  my @operands = reverse @_;			# last argument goes first
    if (@operands && $operands[0]*1 eq $operands[0]) {
	$operands[0] = "\$$operands[0]";	# numeric => immediate
    }
    $code .= "\t$mnemonic\t".join(',',@operands)."\n";
}
118
# 32-bit register for each of the 16 state words used by the scalar
# code path; words 8..11 have no register ("%nox") because the 'c' row
# is kept in @t and on the stack instead.
@x=(qw(%eax %ebx %ecx %edx), map("%r${_}d",8..11),
    ("%nox") x 4,            map("%r${_}d",12..15));
# Two scratch registers that hold the current pair of 'c' words.
@t=qw(%esi %edi);
122
# Build one full round (four quarter-rounds) of the scalar code path.
#
# Arguments are the state-word indices (a,b,c,d) of the first
# quarter-round; the other three are derived by stepping every index to
# the next position within its group of four:
#
#	a   b   c   d
#
#	0   4   8  12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
#	0   5  10  15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
#
# Returns a list of perlasm statements as strings; the caller eval's
# each of them.  'a', 'b' and 'd' words live in the registers named by
# @x, the two 'c' words in flight live in @t and the other two on the
# stack, so the pair is swapped once, in the middle of the round.
# Instructions of the two quarter-rounds processed together are
# deliberately not interleaved more finely than three at a time.
sub ROUND {			# critical path is 24 cycles per round
my @quarter=([@_]);
push(@quarter,[map(($_&~3)+(($_+1)&3),@{$quarter[-1]})]) for (1..3);
my @reg=map("\"$_\"",@x);
my @ctmp=map("\"$_\"",@t);
my @cidx=map($_->[2],@quarter);
my @insns;

	foreach my $half (0,1) {
	    my @pair=@quarter[2*$half,2*$half+1];

	    # second half works on the other pair of 'c's: store the
	    # current pair and load the next one
	    push(@insns,
		"&mov\t(\"4*$cidx[0](%rsp)\",$ctmp[0])",
		"&mov\t(\"4*$cidx[1](%rsp)\",$ctmp[1])",
		"&mov\t($ctmp[0],\"4*$cidx[2](%rsp)\")",
		"&mov\t($ctmp[1],\"4*$cidx[3](%rsp)\")") if ($half);

	    foreach my $rot (16,12,8,7) {
		foreach my $q (0,1) {
		    my ($ia,$ib,undef,$id)=@{$pair[$q]};
		    if ($rot==16 || $rot==8) {	# a+=b; d^=a; d<<<=rot
			push(@insns,
			    "&add\t($reg[$ia],$reg[$ib])",
			    "&xor\t($reg[$id],$reg[$ia])",
			    "&rol\t($reg[$id],$rot)");
		    } else {			# c+=d; b^=c; b<<<=rot
			push(@insns,
			    "&add\t($ctmp[$q],$reg[$id])",
			    "&xor\t($reg[$ib],$ctmp[$q])",
			    "&rol\t($reg[$ib],$rot)");
		    }
		}
	    }
	}

	# same value as the original list literal in either context
	return wantarray ? @insns : $insns[-1];
}
223
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
#
# Emits ChaCha20_ctr32, the global entry point.  It returns at once when
# len is zero, loads 64 bits starting at OPENSSL_ia32cap_P+4 into %r10
# and then dispatches on capability bits before falling into the scalar
# implementation.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
# The AVX512F dispatch is emitted only when $avx>2.
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
___
# Bit 41 of the capability vector (bit 9 of %r10d) selects the SSSE3
# code.  Otherwise six callee-saved registers are pushed and 64+24 bytes
# of stack are reserved: 64 for the state block plus three 8-byte slots
# used to save len, inp and out around the round loop.
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	rbx
	push	%rbp
.cfi_push	rbp
	push	%r12
.cfi_push	r12
	push	%r13
.cfi_push	r13
	push	%r14
.cfi_push	r14
	push	%r15
.cfi_push	r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	`64+24`
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]      # 'expa'
	mov	\$0x3320646e,@x[1]      # 'nd 3'
	mov	\$0x79622d32,@x[2]      # '2-by'
	mov	\$0x6b206574,@x[3]      # 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	# Loop body: one even and one odd round.  ROUND returns perlasm
	# statements as strings; eval'ing each one calls the AUTOLOAD
	# thunk, which appends the instruction to $code.  %ebp was loaded
	# with 10 above, so the loop runs ten times.
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

# After the rounds: store the two 'c' words still held in @t, restore
# len/inp/out, add the input state back in and either XOR a whole
# 64-byte block (.Loop_outer repeats while bytes remain) or, for fewer
# than 64 bytes, spill the key stream to the stack and XOR byte by byte
# (.Loop_tail).  .Ldone reloads the saved registers and releases the
# frame.
$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]      # 'expa'
	add	\$0x3320646e,@x[1]      # 'nd 3'
	add	\$0x79622d32,@x[2]      # '2-by'
	add	\$0x6b206574,@x[3]      # 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
	mov	-48(%rsi),%r15
.cfi_restore	r15
	mov	-40(%rsi),%r14
.cfi_restore	r14
	mov	-32(%rsi),%r13
.cfi_restore	r13
	mov	-24(%rsi),%r12
.cfi_restore	r12
	mov	-16(%rsi),%rbp
.cfi_restore	rbp
	mov	-8(%rsi),%rbx
.cfi_restore	rbx
	lea	(%rsi),%rsp
.cfi_adjust_cfa_offset	`-64-24-48`
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
417
418########################################################################
419# SSSE3 code path that handles shorter lengths
420{
421my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
422
# Emit one round over the four row registers $a,$b,$c,$d of the 1x
# SSSE3 path.  The round consists of two identical halves that differ
# only in the rotation amounts: the 'd' row is rotated with a pshufb
# mask ($rot16, then $rot24) and the 'b' row with a shift pair through
# the scratch register $t (left by 12, then by 7).
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	foreach my $half ([$rot16,12],[$rot24,7]) {
		my ($mask,$lsh)=@$half;

		&paddd	($a,$b);
		&pxor	($d,$a);
		&pshufb	($d,$mask);

		&paddd	($c,$d);
		&pxor	($b,$c);
		&movdqa	($t,$b);
		&psrld	($b,32-$lsh);	# b = (b<<lsh)|(b>>(32-lsh))
		&pslld	($t,$lsh);
		&por	($b,$t);
	}
}
446
# Extra stack beyond the 64-byte state copy.  The Win64 value leaves
# room for the %xmm6/%xmm7 save slots at -0x28(%r9) and -0x18(%r9).
my $xframe = $win64 ? 32+8 : 8;

# ChaCha20_ssse3 has no .globl directive; it is entered through the
# .LChaCha20_ssse3 label from ChaCha20_ctr32.  %r9 keeps the incoming
# stack pointer and serves as frame pointer for the epilogue.
$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	r9
___
# Inputs longer than 128 bytes are handed to the 4x code;
# .Ldo_sse3_after_all is where that code jumps back to when it decides
# to use this path after all.
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
# Win64 only: save %xmm6 and %xmm7 in the frame.
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
# Load sigma, key and counter rows, keep a copy of the input state on
# the stack and set the loop counter to 10.  .Loop_outer_ssse3 is the
# re-entry point for every further 64-byte block: it reloads the state
# and adds .Lone to the stored counter row.
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	# Loop body: two rounds.  After each round the lanes of $c, $b
	# and $d are permuted with pshufd; the second group uses the
	# opposite constants for $b and $d, undoing the first permutation.
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

# Add the saved input state, then XOR a whole 64-byte block with the
# input, or for fewer than 64 bytes spill the key stream to the stack
# and XOR byte by byte in .Loop_tail_ssse3.
$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
# Win64 only: restore the saved %xmm6 and %xmm7.
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
# Release the frame by reloading %rsp from the frame pointer.
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
574}
575
576########################################################################
577# SSSE3 code path that handles longer messages.
578{
579# assign variables to favor Atom front-end
580my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
581    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
582my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
583	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
584
# Build one full round of the 4x SSSE3 path, in which every %xmm
# register holds the same state word of four independent blocks.
#
# Arguments: state-word indices (a,b,c,d) of the first quarter-round;
#            the remaining three are derived by stepping each index to
#            the next position within its group of four.
# Returns:   a list of perlasm statements as strings, to be eval'ed by
#            the caller in order.
#
# The emitted code expects %r10/%r11 to point at the .Lrot16/.Lrot24
# pshufb masks and the register behind $t1 to already hold the rot16
# mask on entry; it leaves the rot16 mask there again on exit.
sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# $xc/$xc_ hold the two 'c' words in flight, $t0/$t1 are scratch
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&paddd		(@x[$a0],@x[$b0])",	# Q1
	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
	"&pxor		(@x[$d0],@x[$a0])",
	 "&pxor		(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t1)",
	 "&pshufb	(@x[$d1],$t1)",

	"&paddd		($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor		(@x[$b0],$xc)",
	 "&pxor		(@x[$b1],$xc_)",
	"&movdqa	($t0,@x[$b0])",
	"&pslld		(@x[$b0],12)",
	"&psrld		($t0,20)",
	 "&movdqa	($t1,@x[$b1])",
	 "&pslld	(@x[$b1],12)",
	"&por		(@x[$b0],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por		(@x[$b1],$t1)",

	"&paddd		(@x[$a0],@x[$b0])",
	 "&paddd	(@x[$a1],@x[$b1])",
	"&pxor		(@x[$d0],@x[$a0])",
	 "&pxor		(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t0)",
	 "&pshufb	(@x[$d1],$t0)",

	"&paddd		($xc,@x[$d0])",
	 "&paddd	($xc_,@x[$d1])",
	"&pxor		(@x[$b0],$xc)",
	 "&pxor		(@x[$b1],$xc_)",
	"&movdqa	($t1,@x[$b0])",
	"&pslld		(@x[$b0],7)",
	"&psrld		($t1,25)",
	 "&movdqa	($t0,@x[$b1])",
	 "&pslld	(@x[$b1],7)",
	"&por		(@x[$b0],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por		(@x[$b1],$t0)",

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd		(@x[$a2],@x[$b2])",	# Q3
	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
	"&pxor		(@x[$d2],@x[$a2])",
	 "&pxor		(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t1)",
	 "&pshufb	(@x[$d3],$t1)",

	"&paddd		($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor		(@x[$b2],$xc)",
	 "&pxor		(@x[$b3],$xc_)",
	"&movdqa	($t0,@x[$b2])",
	"&pslld		(@x[$b2],12)",
	"&psrld		($t0,20)",
	 "&movdqa	($t1,@x[$b3])",
	 "&pslld	(@x[$b3],12)",
	"&por		(@x[$b2],$t0)",
	 "&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	 "&por		(@x[$b3],$t1)",

	"&paddd		(@x[$a2],@x[$b2])",
	 "&paddd	(@x[$a3],@x[$b3])",
	"&pxor		(@x[$d2],@x[$a2])",
	 "&pxor		(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t0)",
	 "&pshufb	(@x[$d3],$t0)",

	"&paddd		($xc,@x[$d2])",
	 "&paddd	($xc_,@x[$d3])",
	"&pxor		(@x[$b2],$xc)",
	 "&pxor		(@x[$b3],$xc_)",
	"&movdqa	($t1,@x[$b2])",
	"&pslld		(@x[$b2],7)",
	"&psrld		($t1,25)",
	 "&movdqa	($t0,@x[$b3])",
	 "&pslld	(@x[$b3],7)",
	"&por		(@x[$b2],$t1)",
	 "&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	 "&por		(@x[$b3],$t0)"
	);
}
706
# Extra stack beyond the 0x140-byte working area.  The Win64 value
# leaves room for the ten %xmm6..%xmm15 save slots at
# -0xa8(%r9)..-0x18(%r9).
my $xframe = $win64 ? 0xa8 : 8;

# ChaCha20_4x has no .globl directive; it is entered through the
# .LChaCha20_4x label.  %r9 keeps the incoming stack pointer as frame
# pointer and the capability word in %r10 is copied to %r11.
$code.=<<___;
.type	ChaCha20_4x,\@function,5
.align	32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
	mov		%rsp,%r9		# frame pointer
.cfi_def_cfa_register	r9
	mov		%r10,%r11
___
# The AVX2 dispatch is emitted only when $avx>1.
$code.=<<___	if ($avx>1);
	shr		\$32,%r10		# OPENSSL_ia32cap_P+8
	test		\$`1<<5`,%r10		# test AVX2
	jnz		.LChaCha20_8x
___
# Inputs of at most 192 bytes go back to the 1x SSSE3 code when the
# capability bits show MOVBE without XSAVE.
$code.=<<___;
	cmp		\$192,$len
	ja		.Lproceed4x

	and		\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp		\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je		.Ldo_sse3_after_all	# to detect Atom

.Lproceed4x:
	sub		\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-11]
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
# Win64 only: save %xmm6..%xmm15 in the frame.
$code.=<<___	if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L4x_body:
___
# Broadcast every 32-bit word of sigma, key and counter block across a
# whole register ("smash by lanes") and store the copies on the stack.
# %rcx is repointed at +0x100 so most offsets fit a short displacement;
# %r10/%r11 are repointed at the pshufb rotation masks.  .Linc is added
# to the counter lanes, and .Loop_outer4x advances them by .Lfour for
# every further group of four blocks.
$code.=<<___;
	movdqa		.Lsigma(%rip),$xa3	# key[0]
	movdqu		($key),$xb3		# key[1]
	movdqu		16($key),$xt3		# key[2]
	movdqu		($counter),$xd3		# key[3]
	lea		0x100(%rsp),%rcx	# size optimization
	lea		.Lrot16(%rip),%r10
	lea		.Lrot24(%rip),%r11

	pshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd		\$0x55,$xa3,$xa1
	movdqa		$xa0,0x40(%rsp)		# ... and offload
	pshufd		\$0xaa,$xa3,$xa2
	movdqa		$xa1,0x50(%rsp)
	pshufd		\$0xff,$xa3,$xa3
	movdqa		$xa2,0x60(%rsp)
	movdqa		$xa3,0x70(%rsp)

	pshufd		\$0x00,$xb3,$xb0
	pshufd		\$0x55,$xb3,$xb1
	movdqa		$xb0,0x80-0x100(%rcx)
	pshufd		\$0xaa,$xb3,$xb2
	movdqa		$xb1,0x90-0x100(%rcx)
	pshufd		\$0xff,$xb3,$xb3
	movdqa		$xb2,0xa0-0x100(%rcx)
	movdqa		$xb3,0xb0-0x100(%rcx)

	pshufd		\$0x00,$xt3,$xt0	# "$xc0"
	pshufd		\$0x55,$xt3,$xt1	# "$xc1"
	movdqa		$xt0,0xc0-0x100(%rcx)
	pshufd		\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa		$xt1,0xd0-0x100(%rcx)
	pshufd		\$0xff,$xt3,$xt3	# "$xc3"
	movdqa		$xt2,0xe0-0x100(%rcx)
	movdqa		$xt3,0xf0-0x100(%rcx)

	pshufd		\$0x00,$xd3,$xd0
	pshufd		\$0x55,$xd3,$xd1
	paddd		.Linc(%rip),$xd0	# don't save counters yet
	pshufd		\$0xaa,$xd3,$xd2
	movdqa		$xd1,0x110-0x100(%rcx)
	pshufd		\$0xff,$xd3,$xd3
	movdqa		$xd2,0x120-0x100(%rcx)
	movdqa		$xd3,0x130-0x100(%rcx)

	jmp		.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa		0x40(%rsp),$xa0		# re-load smashed key
	movdqa		0x50(%rsp),$xa1
	movdqa		0x60(%rsp),$xa2
	movdqa		0x70(%rsp),$xa3
	movdqa		0x80-0x100(%rcx),$xb0
	movdqa		0x90-0x100(%rcx),$xb1
	movdqa		0xa0-0x100(%rcx),$xb2
	movdqa		0xb0-0x100(%rcx),$xb3
	movdqa		0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa		0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa		0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa		0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa		0x100-0x100(%rcx),$xd0
	movdqa		0x110-0x100(%rcx),$xd1
	movdqa		0x120-0x100(%rcx),$xd2
	movdqa		0x130-0x100(%rcx),$xd3
	paddd		.Lfour(%rip),$xd0	# next SIMD counters

.Loop_enter4x:
	movdqa		$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa		$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa		(%r10),$xt3		# .Lrot16(%rip)
	mov		\$10,%eax
	movdqa		$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp		.Loop4x

.align	32
.Loop4x:
___
	# Loop body: one even and one odd round over four blocks at once.
	# Each string returned by SSSE3_lane_ROUND is eval'ed, which
	# appends the instruction to $code; %eax was loaded with 10 above.
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
# Add the saved 'a' rows and transpose them with punpck* so that each
# register holds 16 consecutive key-stream bytes of one block
# ("de-interlace").
$code.=<<___;
	dec		%eax
	jnz		.Loop4x

	paddd		0x40(%rsp),$xa0		# accumulate key material
	paddd		0x50(%rsp),$xa1
	paddd		0x60(%rsp),$xa2
	paddd		0x70(%rsp),$xa3

	movdqa		$xa0,$xt2		# "de-interlace" data
	punpckldq	$xa1,$xa0
	movdqa		$xa2,$xt3
	punpckldq	$xa3,$xa2
	punpckhdq	$xa1,$xt2
	punpckhdq	$xa3,$xt3
	movdqa		$xa0,$xa1
	punpcklqdq	$xa2,$xa0		# "a0"
	movdqa		$xt2,$xa3
	punpcklqdq	$xt3,$xt2		# "a2"
	punpckhqdq	$xa2,$xa1		# "a1"
	punpckhqdq	$xt3,$xa3		# "a3"
___
	# The transposed "a2" ended up in the register named $xt2, so the
	# Perl names are swapped instead of emitting a register move.
	($xa2,$xt2)=($xt2,$xa2);
# Same for the 'b' rows.  The first two transposed 'a' rows are parked
# on the stack, which frees $xa0/$xa1 to receive the "xc2"/"xc3" words
# kept at 0x20/0x30(%rsp).
$code.=<<___;
	paddd		0x80-0x100(%rcx),$xb0
	paddd		0x90-0x100(%rcx),$xb1
	paddd		0xa0-0x100(%rcx),$xb2
	paddd		0xb0-0x100(%rcx),$xb3

	movdqa		$xa0,0x00(%rsp)		# offload $xaN
	movdqa		$xa1,0x10(%rsp)
	movdqa		0x20(%rsp),$xa0		# "xc2"
	movdqa		0x30(%rsp),$xa1		# "xc3"

	movdqa		$xb0,$xt2
	punpckldq	$xb1,$xb0
	movdqa		$xb2,$xt3
	punpckldq	$xb3,$xb2
	punpckhdq	$xb1,$xt2
	punpckhdq	$xb3,$xt3
	movdqa		$xb0,$xb1
	punpcklqdq	$xb2,$xb0		# "b0"
	movdqa		$xt2,$xb3
	punpcklqdq	$xt3,$xt2		# "b2"
	punpckhqdq	$xb2,$xb1		# "b1"
	punpckhqdq	$xt3,$xb3		# "b3"
___
	($xb2,$xt2)=($xt2,$xb2);	# "b2" now lives in the former $xt2
	# 'c' rows: the first two are still in $xt0/$xt1, the other two
	# were just reloaded into $xa0/$xa1.
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	paddd		0xc0-0x100(%rcx),$xc0
	paddd		0xd0-0x100(%rcx),$xc1
	paddd		0xe0-0x100(%rcx),$xc2
	paddd		0xf0-0x100(%rcx),$xc3

	movdqa		$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa		$xa3,0x30(%rsp)

	movdqa		$xc0,$xt2
	punpckldq	$xc1,$xc0
	movdqa		$xc2,$xt3
	punpckldq	$xc3,$xc2
	punpckhdq	$xc1,$xt2
	punpckhdq	$xc3,$xt3
	movdqa		$xc0,$xc1
	punpcklqdq	$xc2,$xc0		# "c0"
	movdqa		$xt2,$xc3
	punpcklqdq	$xt3,$xt2		# "c2"
	punpckhqdq	$xc2,$xc1		# "c1"
	punpckhqdq	$xt3,$xc3		# "c3"
___
	($xc2,$xt2)=($xt2,$xc2);	# "c2" now lives in the former $xt2
	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
# Finally the 'd' rows (counters and nonce).
$code.=<<___;
	paddd		0x100-0x100(%rcx),$xd0
	paddd		0x110-0x100(%rcx),$xd1
	paddd		0x120-0x100(%rcx),$xd2
	paddd		0x130-0x100(%rcx),$xd3

	movdqa		$xd0,$xt2
	punpckldq	$xd1,$xd0
	movdqa		$xd2,$xt3
	punpckldq	$xd3,$xd2
	punpckhdq	$xd1,$xt2
	punpckhdq	$xd3,$xt3
	movdqa		$xd0,$xd1
	punpcklqdq	$xd2,$xd0		# "d0"
	movdqa		$xt2,$xd3
	punpcklqdq	$xt3,$xt2		# "d2"
	punpckhqdq	$xd2,$xd1		# "d1"
	punpckhqdq	$xt3,$xd3		# "d3"
___
	($xd2,$xt2)=($xt2,$xd2);	# "d2" now lives in the former $xt2
# Output stage.  With at least 256 bytes left, all four blocks are XORed
# with the input and written out, and .Loop_outer4x repeats while bytes
# remain.  Otherwise .Ltail4x writes as many whole 64-byte blocks as
# fit, copies the next key-stream block to 0x00..0x3f(%rsp) and lets
# .Loop_tail4x XOR the remaining bytes one at a time.  The 'a' rows are
# read back from the stack, where they were parked during the transpose.
$code.=<<___;
	cmp		\$64*4,$len
	jb		.Ltail4x

	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3

	 movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	 movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	 movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	 movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	lea		0x80($inp),$inp		# size optimization
	pxor		0x10(%rsp),$xt0
	pxor		$xb1,$xt1
	pxor		$xc1,$xt2
	pxor		$xd1,$xt3

	 movdqu		$xt0,0x40($out)
	movdqu		0x00($inp),$xt0
	 movdqu		$xt1,0x50($out)
	movdqu		0x10($inp),$xt1
	 movdqu		$xt2,0x60($out)
	movdqu		0x20($inp),$xt2
	 movdqu		$xt3,0x70($out)
	 lea		0x80($out),$out		# size optimization
	movdqu		0x30($inp),$xt3
	pxor		0x20(%rsp),$xt0
	pxor		$xb2,$xt1
	pxor		$xc2,$xt2
	pxor		$xd2,$xt3

	 movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	 movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	 movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	 movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	lea		0x80($inp),$inp		# inp+=64*4
	pxor		0x30(%rsp),$xt0
	pxor		$xb3,$xt1
	pxor		$xc3,$xt2
	pxor		$xd3,$xt3
	movdqu		$xt0,0x40($out)
	movdqu		$xt1,0x50($out)
	movdqu		$xt2,0x60($out)
	movdqu		$xt3,0x70($out)
	lea		0x80($out),$out		# out+=64*4

	sub		\$64*4,$len
	jnz		.Loop_outer4x

	jmp		.Ldone4x

.Ltail4x:
	cmp		\$192,$len
	jae		.L192_or_more4x
	cmp		\$128,$len
	jae		.L128_or_more4x
	cmp		\$64,$len
	jae		.L64_or_more4x

	#movdqa		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	xor		%r10,%r10
	#movdqa		$xt0,0x00(%rsp)
	movdqa		$xb0,0x10(%rsp)
	movdqa		$xc0,0x20(%rsp)
	movdqa		$xd0,0x30(%rsp)
	jmp		.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaxN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3
	movdqu		$xt0,0x00($out)
	movdqu		$xt1,0x10($out)
	movdqu		$xt2,0x20($out)
	movdqu		$xt3,0x30($out)
	je		.Ldone4x

	movdqa		0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea		0x40($inp),$inp		# inp+=64*1
	xor		%r10,%r10
	movdqa		$xt0,0x00(%rsp)
	movdqa		$xb1,0x10(%rsp)
	lea		0x40($out),$out		# out+=64*1
	movdqa		$xc1,0x20(%rsp)
	sub		\$64,$len		# len-=64*1
	movdqa		$xd1,0x30(%rsp)
	jmp		.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3

	 movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	 movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	 movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	 movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	pxor		0x10(%rsp),$xt0
	pxor		$xb1,$xt1
	pxor		$xc1,$xt2
	pxor		$xd1,$xt3
	movdqu		$xt0,0x40($out)
	movdqu		$xt1,0x50($out)
	movdqu		$xt2,0x60($out)
	movdqu		$xt3,0x70($out)
	je		.Ldone4x

	movdqa		0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea		0x80($inp),$inp		# inp+=64*2
	xor		%r10,%r10
	movdqa		$xt0,0x00(%rsp)
	movdqa		$xb2,0x10(%rsp)
	lea		0x80($out),$out		# out+=64*2
	movdqa		$xc2,0x20(%rsp)
	sub		\$128,$len		# len-=64*2
	movdqa		$xd2,0x30(%rsp)
	jmp		.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu		0x00($inp),$xt0		# xor with input
	movdqu		0x10($inp),$xt1
	movdqu		0x20($inp),$xt2
	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor		$xb0,$xt1
	pxor		$xc0,$xt2
	pxor		$xd0,$xt3

	 movdqu		$xt0,0x00($out)
	movdqu		0x40($inp),$xt0
	 movdqu		$xt1,0x10($out)
	movdqu		0x50($inp),$xt1
	 movdqu		$xt2,0x20($out)
	movdqu		0x60($inp),$xt2
	 movdqu		$xt3,0x30($out)
	movdqu		0x70($inp),$xt3
	lea		0x80($inp),$inp		# size optimization
	pxor		0x10(%rsp),$xt0
	pxor		$xb1,$xt1
	pxor		$xc1,$xt2
	pxor		$xd1,$xt3

	 movdqu		$xt0,0x40($out)
	movdqu		0x00($inp),$xt0
	 movdqu		$xt1,0x50($out)
	movdqu		0x10($inp),$xt1
	 movdqu		$xt2,0x60($out)
	movdqu		0x20($inp),$xt2
	 movdqu		$xt3,0x70($out)
	 lea		0x80($out),$out		# size optimization
	movdqu		0x30($inp),$xt3
	pxor		0x20(%rsp),$xt0
	pxor		$xb2,$xt1
	pxor		$xc2,$xt2
	pxor		$xd2,$xt3
	movdqu		$xt0,0x00($out)
	movdqu		$xt1,0x10($out)
	movdqu		$xt2,0x20($out)
	movdqu		$xt3,0x30($out)
	je		.Ldone4x

	movdqa		0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea		0x40($inp),$inp		# inp+=64*3
	xor		%r10,%r10
	movdqa		$xt0,0x00(%rsp)
	movdqa		$xb3,0x10(%rsp)
	lea		0x40($out),$out		# out+=64*3
	movdqa		$xc3,0x20(%rsp)
	sub		\$192,$len		# len-=64*3
	movdqa		$xd3,0x30(%rsp)

.Loop_tail4x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail4x

.Ldone4x:
___
# Win64 only: restore the saved %xmm6..%xmm15.
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
# Release the frame by reloading %rsp from the frame pointer.
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
___
1162}
1163
1164########################################################################
1165# AVX2 code path
1166if ($avx>1) {
1167my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1168    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1169my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1170	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1171
# Build the instruction list for one ChaCha round of the AVX2 code path.
#
# Arguments: the state-word indices (a,b,c,d) of the first quarter-round.
# The indices for the other three quarter-rounds are derived by moving each
# index to the next slot within its group of four, wrapping around.
#
# Returns: a list of strings, each one a perlasm call such as
# '&vpaddd("%ymm8","%ymm8","%ymm0")'.  The caller evals them in order, so the
# order of the list is the order of the emitted instructions.
#
# Register roles: $xt0/$xt1 (called $xc/$xc_ here) hold the pair of 'c'
# words currently being worked on, $xt2/$xt3 (called $t0/$t1 here) are
# scratch.  Rotations by 12 and 7 are done as shift-left/shift-right/or
# through a scratch register; the other two rotations are done by vpshufb
# with a mask.  $t1 must already contain the mask found at (%r10) when the
# first instruction runs; the list then loads the (%r11) mask into $t0 and
# the (%r10) mask back into $t1, so $t1 holds the (%r10) mask again at the
# end of the round.
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Register names are wrapped in literal double quotes so that they appear
# as string arguments once the generated call is eval'ed.
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	 "&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",
	"&vpsrld	(@x[$b0],@x[$b0],20)",
	"&vpor		(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d1])",
	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
	 "&vpslld	($t1,@x[$b1],12)",
	 "&vpsrld	(@x[$b1],@x[$b1],20)",
	 "&vpor		(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	 "&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor		(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d1])",
	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
	 "&vpslld	($t0,@x[$b1],7)",
	 "&vpsrld	(@x[$b1],@x[$b1],25)",
	 "&vpor		(@x[$b1],$t0,@x[$b1])",

	# The stack slot of 'c' word N lives at 32*(N-8)(%rsp): park the
	# pair used by Q1/Q2 and fetch the pair needed by Q3/Q4.
	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	 "&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	 "&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor		(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d3])",
	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
	 "&vpslld	($t1,@x[$b3],12)",
	 "&vpsrld	(@x[$b3],@x[$b3],20)",
	 "&vpor		(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	 "&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor		(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	 "&vpaddd	($xc_,$xc_,@x[$d3])",
	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
	 "&vpslld	($t0,@x[$b3],7)",
	 "&vpsrld	(@x[$b3],@x[$b3],25)",
	 "&vpor		(@x[$b3],$t0,@x[$b3])"
	);
}
1285
# Extra bytes reserved next to the frame register (%r9) on top of the 0x280
# bytes of scratch.  On Win64 this is 0xa8, which covers the ten 16-byte
# slots at -0xa8(%r9)..-0x18(%r9) used below to save %xmm6-%xmm15; on other
# targets it is 8.
my $xframe = $win64 ? 0xa8 : 8;

# Function header and frame setup: %r9 keeps the incoming %rsp, then
# 0x280+$xframe bytes are reserved and %rsp is rounded down to a multiple of
# 32, which the vmovdqa spills of %ymm registers to the stack rely on.
$code.=<<___;
.type	ChaCha20_8x,\@function,5
.align	32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	r9
	sub		\$0x280+$xframe,%rsp
	and		\$-32,%rsp
___
# Win64 only: stash %xmm6-%xmm15 relative to %r9.  The same slots are read
# back just before the function returns.
$code.=<<___	if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L8x_body:
___
# State setup and outer-loop head.  Each 16-byte row (.Lsigma, the two key
# halves at ($key) and 16($key), and the block at ($counter)) is broadcast
# to both 128-bit lanes, then every 32-bit word is replicated across a whole
# register with vpshufd and the replicated copies are spilled to the stack.
# %rcx and %rax point 0x100 and 0x200 bytes into the frame so the spill
# slots can be addressed with short displacements; %r10/%r11 are pointed at
# .Lrot16/.Lrot24 for the round code.  The first pass jumps straight to
# .Loop_enter8x; later passes come through .Loop_outer8x, which reloads the
# spilled state and adds .Leight to the saved counters.  %eax is then reused
# as the loop counter (10 iterations of .Loop8x).
# NOTE(review): the stack-layout comment in the heredoc says @x[8-12], but
# the area +0x00..+0x7f holds four 32-byte slots, addressed elsewhere as
# 32*(N-8)(%rsp) for N=8..11 - looks like it should read @x[8-11]; confirm.
$code.=<<___;
	vzeroupper

	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x80		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x200	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x280

	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xt3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]
	lea		0x100(%rsp),%rcx	# size optimization
	lea		0x200(%rsp),%rax	# size optimization
	lea		.Lrot16(%rip),%r10
	lea		.Lrot24(%rip),%r11

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vmovdqa		$xa0,0x80-0x100(%rcx)	# ... and offload
	vpshufd		\$0xaa,$xa3,$xa2
	vmovdqa		$xa1,0xa0-0x100(%rcx)
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa		$xa2,0xc0-0x100(%rcx)
	vmovdqa		$xa3,0xe0-0x100(%rcx)

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vmovdqa		$xb0,0x100-0x100(%rcx)
	vpshufd		\$0xaa,$xb3,$xb2
	vmovdqa		$xb1,0x120-0x100(%rcx)
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa		$xb2,0x140-0x100(%rcx)
	vmovdqa		$xb3,0x160-0x100(%rcx)

	vpshufd		\$0x00,$xt3,$xt0	# "xc0"
	vpshufd		\$0x55,$xt3,$xt1	# "xc1"
	vmovdqa		$xt0,0x180-0x200(%rax)
	vpshufd		\$0xaa,$xt3,$xt2	# "xc2"
	vmovdqa		$xt1,0x1a0-0x200(%rax)
	vpshufd		\$0xff,$xt3,$xt3	# "xc3"
	vmovdqa		$xt2,0x1c0-0x200(%rax)
	vmovdqa		$xt3,0x1e0-0x200(%rax)

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd		\$0xaa,$xd3,$xd2
	vmovdqa		$xd1,0x220-0x200(%rax)
	vpshufd		\$0xff,$xd3,$xd3
	vmovdqa		$xd2,0x240-0x200(%rax)
	vmovdqa		$xd3,0x260-0x200(%rax)

	jmp		.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa		0x80-0x100(%rcx),$xa0	# re-load smashed key
	vmovdqa		0xa0-0x100(%rcx),$xa1
	vmovdqa		0xc0-0x100(%rcx),$xa2
	vmovdqa		0xe0-0x100(%rcx),$xa3
	vmovdqa		0x100-0x100(%rcx),$xb0
	vmovdqa		0x120-0x100(%rcx),$xb1
	vmovdqa		0x140-0x100(%rcx),$xb2
	vmovdqa		0x160-0x100(%rcx),$xb3
	vmovdqa		0x180-0x200(%rax),$xt0	# "xc0"
	vmovdqa		0x1a0-0x200(%rax),$xt1	# "xc1"
	vmovdqa		0x1c0-0x200(%rax),$xt2	# "xc2"
	vmovdqa		0x1e0-0x200(%rax),$xt3	# "xc3"
	vmovdqa		0x200-0x200(%rax),$xd0
	vmovdqa		0x220-0x200(%rax),$xd1
	vmovdqa		0x240-0x200(%rax),$xd2
	vmovdqa		0x260-0x200(%rax),$xd3
	vpaddd		.Leight(%rip),$xd0,$xd0	# next SIMD counters

.Loop_enter8x:
	vmovdqa		$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
	vmovdqa		$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
	vbroadcasti128	(%r10),$xt3
	vmovdqa		$xd0,0x200-0x200(%rax)	# save SIMD counters
	mov		\$10,%eax
	jmp		.Loop8x

.align	32
.Loop8x:
___
	# Body of .Loop8x: one "even" round followed by one "odd" round (see
	# the index table inside AVX2_lane_ROUND).  Each returned string is a
	# perlasm call held in $_ and executed by the bare eval.
	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
# Close .Loop8x, restore %rax (it was used as the loop counter) and process
# the 'a' row: add the saved lane-smashed input words back in, then
# "de-interlace" 32-bit words within each 128-bit lane.  The asm comments
# "a0".."a3" mark which register ends up holding which de-interlaced row.
$code.=<<___;
	dec		%eax
	jnz		.Loop8x

	lea		0x200(%rsp),%rax	# size optimization
	vpaddd		0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
	vpaddd		0xa0-0x100(%rcx),$xa1,$xa1
	vpaddd		0xc0-0x100(%rcx),$xa2,$xa2
	vpaddd		0xe0-0x100(%rcx),$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	# Generation-time renaming only (emits no code): rebind the Perl names
	# so that $xa0..$xa3 denote the registers labelled "a0".."a3" above;
	# the register that is left over becomes the new scratch $xt2.
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
# Same accumulate + de-interlace sequence for the 'b' row.
$code.=<<___;
	vpaddd		0x100-0x100(%rcx),$xb0,$xb0
	vpaddd		0x120-0x100(%rcx),$xb1,$xb1
	vpaddd		0x140-0x100(%rcx),$xb2,$xb2
	vpaddd		0x160-0x100(%rcx),$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	# Rebind $xb0..$xb3 to the registers labelled "b0".."b3".
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
# Combine 128-bit halves of matching 'a' and 'b' registers: selector 0x20
# takes the low halves of both sources, 0x31 the high halves.
$code.=<<___;
	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xb0,$xa0,$xb0
	vperm2i128	\$0x20,$xb1,$xa1,$xa0
	vperm2i128	\$0x31,$xb1,$xa1,$xb1
	vperm2i128	\$0x20,$xb2,$xa2,$xa1
	vperm2i128	\$0x31,$xb2,$xa2,$xb2
	vperm2i128	\$0x20,$xb3,$xa3,$xa2
	vperm2i128	\$0x31,$xb3,$xa3,$xb3
___
	# The low-half results landed in $xt3,$xa0,$xa1,$xa2; rename them to
	# $xa0..$xa3 and recycle the old $xa3 as scratch $xt3.
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
	# Names for the 'c' row: two words are still in $xt0/$xt1 after the
	# round loop; the other two will be loaded from 0x40/0x60(%rsp) into
	# the registers currently called $xa0/$xa1, which are spilled first.
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	vmovdqa		$xa0,0x00(%rsp)		# offload $xaN
	vmovdqa		$xa1,0x20(%rsp)
	vmovdqa		0x40(%rsp),$xc2		# $xa0
	vmovdqa		0x60(%rsp),$xc3		# $xa1

	vpaddd		0x180-0x200(%rax),$xc0,$xc0
	vpaddd		0x1a0-0x200(%rax),$xc1,$xc1
	vpaddd		0x1c0-0x200(%rax),$xc2,$xc2
	vpaddd		0x1e0-0x200(%rax),$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	# Rebind $xc0..$xc3 to the registers labelled "c0".."c3".
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
# Same accumulate + de-interlace sequence for the 'd' row.
$code.=<<___;
	vpaddd		0x200-0x200(%rax),$xd0,$xd0
	vpaddd		0x220-0x200(%rax),$xd1,$xd1
	vpaddd		0x240-0x200(%rax),$xd2,$xd2
	vpaddd		0x260-0x200(%rax),$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	# Rebind $xd0..$xd3 to the registers labelled "d0".."d3".
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
# Combine 128-bit halves of matching 'c' and 'd' registers, as was done for
# 'a' and 'b'.
$code.=<<___;
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3
___
	# Low-half results landed in $xt3,$xc0,$xc1,$xc2: rename to $xc0..3.
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	# Exchange the 'b' and 'c' names so that the xor/store code that
	# follows can walk the registers as $xaN,$xbN,$xcN,$xdN for N=0..3.
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
	# $xa0/$xa1 currently live only on the stack (0x00/0x20(%rsp)); name
	# the two free scratch registers as their new homes.
	($xa0,$xa1)=($xt2,$xt3);
# Output stage of ChaCha20_8x.
#
# $xa0/$xa1 are first reloaded from their stack slots.  If at least 64*8
# bytes remain, all sixteen registers are xored with the input and stored in
# the order $xaN,$xbN,$xcN,$xdN (N=0..3), $len is reduced by 512 and the
# code loops to .Loop_outer8x unless $len reached zero.
#
# Otherwise .Ltail8x picks the largest multiple of 64 that still fits
# (448, 384, ... 64, or none), xors that many whole 64-byte blocks directly,
# and leaves through .Ldone8x when nothing is left: the 'je' in each group
# tests the flags left by the 'cmp' that selected the group, as only vector
# instructions are emitted in between.  For a remaining partial block the
# next two keystream registers are parked at 0x00/0x20(%rsp), %r10 is
# zeroed as byte index, and .Loop_tail8x xors one byte per iteration until
# $len reaches zero.  .Ldone8x clears all vector registers with vzeroall.
$code.=<<___;
	vmovdqa		0x00(%rsp),$xa0		# $xaN was offloaded, remember?
	vmovdqa		0x20(%rsp),$xa1

	cmp		\$64*8,$len
	jb		.Ltail8x

	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	lea		0x80($out),$out		# size optimization

	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vpxor		0x40($inp),$xc1,$xc1
	vpxor		0x60($inp),$xd1,$xd1
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa1,0x00($out)
	vmovdqu		$xb1,0x20($out)
	vmovdqu		$xc1,0x40($out)
	vmovdqu		$xd1,0x60($out)
	lea		0x80($out),$out		# size optimization

	vpxor		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vpxor		0x40($inp),$xc2,$xc2
	vpxor		0x60($inp),$xd2,$xd2
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa2,0x00($out)
	vmovdqu		$xb2,0x20($out)
	vmovdqu		$xc2,0x40($out)
	vmovdqu		$xd2,0x60($out)
	lea		0x80($out),$out		# size optimization

	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vpxor		0x40($inp),$xc3,$xc3
	vpxor		0x60($inp),$xd3,$xd3
	lea		0x80($inp),$inp		# size optimization
	vmovdqu		$xa3,0x00($out)
	vmovdqu		$xb3,0x20($out)
	vmovdqu		$xc3,0x40($out)
	vmovdqu		$xd3,0x60($out)
	lea		0x80($out),$out		# size optimization

	sub		\$64*8,$len
	jnz		.Loop_outer8x

	jmp		.Ldone8x

.Ltail8x:
	cmp		\$448,$len
	jae		.L448_or_more8x
	cmp		\$384,$len
	jae		.L384_or_more8x
	cmp		\$320,$len
	jae		.L320_or_more8x
	cmp		\$256,$len
	jae		.L256_or_more8x
	cmp		\$192,$len
	jae		.L192_or_more8x
	cmp		\$128,$len
	jae		.L128_or_more8x
	cmp		\$64,$len
	jae		.L64_or_more8x

	xor		%r10,%r10
	vmovdqa		$xa0,0x00(%rsp)
	vmovdqa		$xb0,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	je		.Ldone8x

	lea		0x40($inp),$inp		# inp+=64*1
	xor		%r10,%r10
	vmovdqa		$xc0,0x00(%rsp)
	lea		0x40($out),$out		# out+=64*1
	sub		\$64,$len		# len-=64*1
	vmovdqa		$xd0,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	je		.Ldone8x

	lea		0x80($inp),$inp		# inp+=64*2
	xor		%r10,%r10
	vmovdqa		$xa1,0x00(%rsp)
	lea		0x80($out),$out		# out+=64*2
	sub		\$128,$len		# len-=64*2
	vmovdqa		$xb1,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	je		.Ldone8x

	lea		0xc0($inp),$inp		# inp+=64*3
	xor		%r10,%r10
	vmovdqa		$xc1,0x00(%rsp)
	lea		0xc0($out),$out		# out+=64*3
	sub		\$192,$len		# len-=64*3
	vmovdqa		$xd1,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	je		.Ldone8x

	lea		0x100($inp),$inp	# inp+=64*4
	xor		%r10,%r10
	vmovdqa		$xa2,0x00(%rsp)
	lea		0x100($out),$out	# out+=64*4
	sub		\$256,$len		# len-=64*4
	vmovdqa		$xb2,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vpxor		0x100($inp),$xa2,$xa2
	vpxor		0x120($inp),$xb2,$xb2
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	vmovdqu		$xa2,0x100($out)
	vmovdqu		$xb2,0x120($out)
	je		.Ldone8x

	lea		0x140($inp),$inp	# inp+=64*5
	xor		%r10,%r10
	vmovdqa		$xc2,0x00(%rsp)
	lea		0x140($out),$out	# out+=64*5
	sub		\$320,$len		# len-=64*5
	vmovdqa		$xd2,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vpxor		0x100($inp),$xa2,$xa2
	vpxor		0x120($inp),$xb2,$xb2
	vpxor		0x140($inp),$xc2,$xc2
	vpxor		0x160($inp),$xd2,$xd2
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	vmovdqu		$xa2,0x100($out)
	vmovdqu		$xb2,0x120($out)
	vmovdqu		$xc2,0x140($out)
	vmovdqu		$xd2,0x160($out)
	je		.Ldone8x

	lea		0x180($inp),$inp	# inp+=64*6
	xor		%r10,%r10
	vmovdqa		$xa3,0x00(%rsp)
	lea		0x180($out),$out	# out+=64*6
	sub		\$384,$len		# len-=64*6
	vmovdqa		$xb3,0x20(%rsp)
	jmp		.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	vpxor		0x80($inp),$xa1,$xa1
	vpxor		0xa0($inp),$xb1,$xb1
	vpxor		0xc0($inp),$xc1,$xc1
	vpxor		0xe0($inp),$xd1,$xd1
	vpxor		0x100($inp),$xa2,$xa2
	vpxor		0x120($inp),$xb2,$xb2
	vpxor		0x140($inp),$xc2,$xc2
	vpxor		0x160($inp),$xd2,$xd2
	vpxor		0x180($inp),$xa3,$xa3
	vpxor		0x1a0($inp),$xb3,$xb3
	vmovdqu		$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	vmovdqu		$xa1,0x80($out)
	vmovdqu		$xb1,0xa0($out)
	vmovdqu		$xc1,0xc0($out)
	vmovdqu		$xd1,0xe0($out)
	vmovdqu		$xa2,0x100($out)
	vmovdqu		$xb2,0x120($out)
	vmovdqu		$xc2,0x140($out)
	vmovdqu		$xd2,0x160($out)
	vmovdqu		$xa3,0x180($out)
	vmovdqu		$xb3,0x1a0($out)
	je		.Ldone8x

	lea		0x1c0($inp),$inp	# inp+=64*7
	xor		%r10,%r10
	vmovdqa		$xc3,0x00(%rsp)
	lea		0x1c0($out),$out	# out+=64*7
	sub		\$448,$len		# len-=64*7
	vmovdqa		$xd3,0x20(%rsp)

.Loop_tail8x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail8x

.Ldone8x:
	vzeroall
___
# Win64 only: reload %xmm6-%xmm15 from the slots below %r9 that the
# prologue of ChaCha20_8x stored them in.
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
# Common epilogue: restore %rsp from the frame register %r9 (which undoes
# both the reservation and the alignment done in the prologue), return, and
# close the function's CFI and size bookkeeping.
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
___
1807}
1808
1809########################################################################
1810# AVX512 code paths
1811if ($avx>2) {
1812# This one handles shorter inputs...
1813
# Register names for ChaCha20_avx512: $a..$d are the working rows in
# %zmm0-3, $a_..$d_ are %zmm16-19 (used below as copies of the initial rows)
# and $fourz is %zmm20 (loaded from .Lfourz).
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
# 128-bit scratch registers %xmm4-7, used when xoring and storing output.
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1816
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
	# Emit one round on the row registers $a,$b,$c,$d.  The round is two
	# passes of the same add/xor/rotate shape; only the rotation counts
	# differ: (16,12) in the first pass, (8,7) in the second.
	foreach my $rot ([16,12],[8,7]) {
		my ($rot_d,$rot_b)=@$rot;

		# a += b; d ^= a; d <<<= $rot_d
		&vpaddd	($a,$a,$b);
		&vpxord	($d,$d,$a);
		&vprold	($d,$d,$rot_d);

		# c += d; b ^= c; b <<<= $rot_b
		&vpaddd	($c,$c,$d);
		&vpxord	($b,$b,$c);
		&vprold	($b,$b,$rot_b);
	}
}
1834
# Extra bytes reserved on top of the 64 bytes of stack scratch.  On Win64
# this is 32+8, covering the two 16-byte slots at -0x28(%r9) and -0x18(%r9)
# used below to save %xmm6/%xmm7; on other targets it is 8.
my $xframe = $win64 ? 32+8 : 8;

# Function header.  %r9 keeps the incoming %rsp.  Inputs longer than 512
# bytes are handed to .LChaCha20_16x before any stack is reserved; otherwise
# 64+$xframe bytes are reserved.  Note that %rsp is only decremented here,
# it is not rounded to any particular alignment.
$code.=<<___;
.type	ChaCha20_avx512,\@function,5
.align	32
ChaCha20_avx512:
.LChaCha20_avx512:
.cfi_startproc
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	r9
	cmp	\$512,$len
	ja	.LChaCha20_16x

	sub	\$64+$xframe,%rsp
___
# Win64 only: save %xmm6/%xmm7, which this function uses as $t2/$t3.
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512_body:
___
# State setup: each 16-byte row is broadcast to all four 128-bit lanes of a
# %zmm register, .Lzeroz is added to the counter row, and the initial rows
# are copied to $a_..$d_ for the final accumulation.  $counter is reused as
# the round-loop counter (10 iterations).  .Loop_outer_avx512 is the
# re-entry point for subsequent batches: it restores the rows from the
# copies and advances the counter row by $fourz.
$code.=<<___;
	vbroadcasti32x4	.Lsigma(%rip),$a
	vbroadcasti32x4	($key),$b
	vbroadcasti32x4	16($key),$c
	vbroadcasti32x4	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd		.Lzeroz(%rip),$d,$d
	vmovdqa32	.Lfourz(%rip),$fourz
	mov		\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512

.align	16
.Loop_outer_avx512:
	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	vmovdqa32	$c_,$c
	vpaddd		$fourz,$d_,$d
	mov		\$10,$counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
___
	# Loop body: a round, then vpshufd rotates the words of rows b, c, d
	# within each 128-bit lane; a second round, then the inverse shuffles
	# (the b and d selectors are swapped) put the rows back in place.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	# $counter counts down from 10; it is zero when the loop falls through.
	&dec		($counter);
	&jnz		(".Loop_avx512");

# Output stage of ChaCha20_avx512.
#
# The saved initial rows $a_..$d_ are added to the permuted rows, after
# which each 128-bit lane of $a..$d holds one 64-byte keystream block.  The
# blocks are consumed lane by lane: 'sub $64,$len' decides whether a whole
# block is still available (xor with input and store, leave through
# .Ldone_avx512 when $len hits zero) or only a partial one (borrow -> tail).
# After the fourth lane the code loops to .Loop_outer_avx512 if input
# remains.
#
# Tail handling parks the current keystream block at 0x00..0x3f(%rsp),
# undoes the last subtraction and xors byte by byte in .Loop_tail_avx512,
# using $counter (zero after the round loop) as the byte index.
#
# After the byte loop the parked keystream is overwritten with the contents
# of $a_.  This store must be the unaligned form (vmovdqu32): the prologue
# only subtracts 64+$xframe from %rsp and never rounds it to a 64-byte
# boundary, so an aligned 64-byte store (vmovdqa32 with a %zmm source) would
# fault whenever the caller's stack does not happen to be suitably placed.
$code.=<<___;
	vpaddd		$a_,$a,$a
	vpaddd		$b_,$b,$b
	vpaddd		$c_,$c,$c
	vpaddd		$d_,$d,$d

	sub		\$64,$len
	jb		.Ltail64_avx512

	vpxor		0x00($inp),%x#$a,$t0	# xor with input
	vpxor		0x10($inp),%x#$b,$t1
	vpxor		0x20($inp),%x#$c,$t2
	vpxor		0x30($inp),%x#$d,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512

	vextracti32x4	\$1,$a,$t0
	vextracti32x4	\$1,$b,$t1
	vextracti32x4	\$1,$c,$t2
	vextracti32x4	\$1,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512

	vextracti32x4	\$2,$a,$t0
	vextracti32x4	\$2,$b,$t1
	vextracti32x4	\$2,$c,$t2
	vextracti32x4	\$2,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512

	vextracti32x4	\$3,$a,$t0
	vextracti32x4	\$3,$b,$t1
	vextracti32x4	\$3,$c,$t2
	vextracti32x4	\$3,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jnz		.Loop_outer_avx512

	jmp		.Ldone_avx512

.align	16
.Ltail64_avx512:
	vmovdqa		%x#$a,0x00(%rsp)
	vmovdqa		%x#$b,0x10(%rsp)
	vmovdqa		%x#$c,0x20(%rsp)
	vmovdqa		%x#$d,0x30(%rsp)
	add		\$64,$len
	jmp		.Loop_tail_avx512

.align	16
.Ltail_avx512:
	vmovdqa		$t0,0x00(%rsp)
	vmovdqa		$t1,0x10(%rsp)
	vmovdqa		$t2,0x20(%rsp)
	vmovdqa		$t3,0x30(%rsp)
	add		\$64,$len

.Loop_tail_avx512:
	movzb		($inp,$counter),%eax
	movzb		(%rsp,$counter),%ecx
	lea		1($counter),$counter
	xor		%ecx,%eax
	mov		%al,-1($out,$counter)
	dec		$len
	jnz		.Loop_tail_avx512

	vmovdqu32	$a_,0x00(%rsp)

.Ldone_avx512:
	vzeroall
___
# Win64 only: reload %xmm6/%xmm7 from the slots the prologue of
# ChaCha20_avx512 saved them in.
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
# Common epilogue: restore %rsp from the frame register %r9, return, and
# close the function's CFI and size bookkeeping.
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lavx512_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512,.-ChaCha20_avx512
___
2030}
2031if ($avx>2) {
2032# This one handles longer inputs...
2033
# Working state for the 16x path: state word N lives in %zmmN, so all four
# rows ('a','b','c','d') are kept in registers %zmm0-15.
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
# @xx maps a state word index (0..15) to its register name.
my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# @key names %zmm16-31, which hold a copy of the initial state words.
my @key=map("%zmm$_",(16..31));
# The scratch registers are not separate registers: $xt0..$xt3 are the same
# %zmm16-19 as @key[0..3], so using them as scratch destroys those four
# copies (the outer loop re-creates them with vpbroadcastd from (%r10)).
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2040
# Build the instruction list for one ChaCha round of the 16x code path.
#
# Arguments: the state-word indices (a,b,c,d) of the first quarter-round.
# Returns: a list of strings, each one a perlasm call such as
# '&vpaddd("%zmm0","%zmm0","%zmm4")', to be eval'ed in order by the caller.
# Every step is emitted for all four quarter-rounds (Q1..Q4) before the
# next step starts; the leading spaces in each string only mark which
# quarter-round it belongs to.
sub AVX512_lane_ROUND {
# Index sets for Q1..Q4: Q1 comes from the caller, each following set moves
# every index to the next slot within its group of four, wrapping around.
my @quad=([@_]);
foreach my $n (1..3) {
	push(@quad,[map(($_&~3)+(($_+1)&3),@{$quad[$n-1]})]);
}
# Register names wrapped in literal double quotes, indexed by state word.
my @reg=map("\"$_\"",@xx);
# Positions of a, b, c, d inside one index set.
my ($ia,$ib,$ic,$id)=(0..3);
my @insns;

	# A round is two passes of add/xor/rotate pairs; only the rotation
	# counts differ between the passes.
	foreach my $rot ([16,12],[8,7]) {
		# Each step is [mnemonic, destination, last operand]; the
		# destination doubles as first source.  For vprold the last
		# operand is the rotation count instead of a position.
		my @steps=(
			["vpaddd",$ia,$ib],
			["vpxord",$id,$ia],
			["vprold",$id,$rot->[0]],
			["vpaddd",$ic,$id],
			["vpxord",$ib,$ic],
			["vprold",$ib,$rot->[1]],
		);
		foreach my $step (@steps) {
			my ($op,$dst,$arg)=@$step;
			foreach my $q (0..3) {
				my $r=$reg[$quad[$q][$dst]];
				my $last=($op eq "vprold") ? $arg
							   : $reg[$quad[$q][$arg]];
				push(@insns,(" " x $q)."&$op\t($r,$r,$last)");
			}
		}
	}

	@insns;
}
2102
2103my $xframe = $win64 ? 0xa8 : 8;
2104
2105$code.=<<___;
2106.type	ChaCha20_16x,\@function,5
2107.align	32
2108ChaCha20_16x:
2109.LChaCha20_16x:
2110.cfi_startproc
2111	mov		%rsp,%r9		# frame register
2112.cfi_def_cfa_register	r9
2113	sub		\$64+$xframe,%rsp
2114	and		\$-64,%rsp
2115___
2116$code.=<<___	if ($win64);
2117	movaps		%xmm6,-0xa8(%r9)
2118	movaps		%xmm7,-0x98(%r9)
2119	movaps		%xmm8,-0x88(%r9)
2120	movaps		%xmm9,-0x78(%r9)
2121	movaps		%xmm10,-0x68(%r9)
2122	movaps		%xmm11,-0x58(%r9)
2123	movaps		%xmm12,-0x48(%r9)
2124	movaps		%xmm13,-0x38(%r9)
2125	movaps		%xmm14,-0x28(%r9)
2126	movaps		%xmm15,-0x18(%r9)
2127.L16x_body:
2128___
2129$code.=<<___;
2130	vzeroupper
2131
2132	lea		.Lsigma(%rip),%r10
2133	vbroadcasti32x4	(%r10),$xa3		# key[0]
2134	vbroadcasti32x4	($key),$xb3		# key[1]
2135	vbroadcasti32x4	16($key),$xc3		# key[2]
2136	vbroadcasti32x4	($counter),$xd3		# key[3]
2137
2138	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
2139	vpshufd		\$0x55,$xa3,$xa1
2140	vpshufd		\$0xaa,$xa3,$xa2
2141	vpshufd		\$0xff,$xa3,$xa3
2142	vmovdqa64	$xa0,@key[0]
2143	vmovdqa64	$xa1,@key[1]
2144	vmovdqa64	$xa2,@key[2]
2145	vmovdqa64	$xa3,@key[3]
2146
2147	vpshufd		\$0x00,$xb3,$xb0
2148	vpshufd		\$0x55,$xb3,$xb1
2149	vpshufd		\$0xaa,$xb3,$xb2
2150	vpshufd		\$0xff,$xb3,$xb3
2151	vmovdqa64	$xb0,@key[4]
2152	vmovdqa64	$xb1,@key[5]
2153	vmovdqa64	$xb2,@key[6]
2154	vmovdqa64	$xb3,@key[7]
2155
2156	vpshufd		\$0x00,$xc3,$xc0
2157	vpshufd		\$0x55,$xc3,$xc1
2158	vpshufd		\$0xaa,$xc3,$xc2
2159	vpshufd		\$0xff,$xc3,$xc3
2160	vmovdqa64	$xc0,@key[8]
2161	vmovdqa64	$xc1,@key[9]
2162	vmovdqa64	$xc2,@key[10]
2163	vmovdqa64	$xc3,@key[11]
2164
2165	vpshufd		\$0x00,$xd3,$xd0
2166	vpshufd		\$0x55,$xd3,$xd1
2167	vpshufd		\$0xaa,$xd3,$xd2
2168	vpshufd		\$0xff,$xd3,$xd3
2169	vpaddd		.Lincz(%rip),$xd0,$xd0	# don't save counters yet
2170	vmovdqa64	$xd0,@key[12]
2171	vmovdqa64	$xd1,@key[13]
2172	vmovdqa64	$xd2,@key[14]
2173	vmovdqa64	$xd3,@key[15]
2174
2175	mov		\$10,%eax
2176	jmp		.Loop16x
2177
2178.align	32
2179.Loop_outer16x:
2180	vpbroadcastd	0(%r10),$xa0		# reload key
2181	vpbroadcastd	4(%r10),$xa1
2182	vpbroadcastd	8(%r10),$xa2
2183	vpbroadcastd	12(%r10),$xa3
2184	vpaddd		.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
2185	vmovdqa64	@key[4],$xb0
2186	vmovdqa64	@key[5],$xb1
2187	vmovdqa64	@key[6],$xb2
2188	vmovdqa64	@key[7],$xb3
2189	vmovdqa64	@key[8],$xc0
2190	vmovdqa64	@key[9],$xc1
2191	vmovdqa64	@key[10],$xc2
2192	vmovdqa64	@key[11],$xc3
2193	vmovdqa64	@key[12],$xd0
2194	vmovdqa64	@key[13],$xd1
2195	vmovdqa64	@key[14],$xd2
2196	vmovdqa64	@key[15],$xd3
2197
2198	vmovdqa64	$xa0,@key[0]
2199	vmovdqa64	$xa1,@key[1]
2200	vmovdqa64	$xa2,@key[2]
2201	vmovdqa64	$xa3,@key[3]
2202
2203	mov		\$10,%eax
2204	jmp		.Loop16x
2205
2206.align	32
2207.Loop16x:
2208___
2209	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2210	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
2211$code.=<<___;
2212	dec		%eax
2213	jnz		.Loop16x
2214
2215	vpaddd		@key[0],$xa0,$xa0	# accumulate key
2216	vpaddd		@key[1],$xa1,$xa1
2217	vpaddd		@key[2],$xa2,$xa2
2218	vpaddd		@key[3],$xa3,$xa3
2219
2220	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
2221	vpunpckldq	$xa3,$xa2,$xt3
2222	vpunpckhdq	$xa1,$xa0,$xa0
2223	vpunpckhdq	$xa3,$xa2,$xa2
2224	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
2225	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
2226	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
2227	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
2228___
2229	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2230$code.=<<___;
2231	vpaddd		@key[4],$xb0,$xb0
2232	vpaddd		@key[5],$xb1,$xb1
2233	vpaddd		@key[6],$xb2,$xb2
2234	vpaddd		@key[7],$xb3,$xb3
2235
2236	vpunpckldq	$xb1,$xb0,$xt2
2237	vpunpckldq	$xb3,$xb2,$xt3
2238	vpunpckhdq	$xb1,$xb0,$xb0
2239	vpunpckhdq	$xb3,$xb2,$xb2
2240	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
2241	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
2242	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
2243	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
2244___
2245	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2246$code.=<<___;
2247	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
2248	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
2249	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
2250	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
2251	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
2252	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
2253	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
2254	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
2255___
2256	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2257$code.=<<___;
2258	vpaddd		@key[8],$xc0,$xc0
2259	vpaddd		@key[9],$xc1,$xc1
2260	vpaddd		@key[10],$xc2,$xc2
2261	vpaddd		@key[11],$xc3,$xc3
2262
2263	vpunpckldq	$xc1,$xc0,$xt2
2264	vpunpckldq	$xc3,$xc2,$xt3
2265	vpunpckhdq	$xc1,$xc0,$xc0
2266	vpunpckhdq	$xc3,$xc2,$xc2
2267	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
2268	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
2269	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
2270	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
2271___
2272	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2273$code.=<<___;
2274	vpaddd		@key[12],$xd0,$xd0
2275	vpaddd		@key[13],$xd1,$xd1
2276	vpaddd		@key[14],$xd2,$xd2
2277	vpaddd		@key[15],$xd3,$xd3
2278
2279	vpunpckldq	$xd1,$xd0,$xt2
2280	vpunpckldq	$xd3,$xd2,$xt3
2281	vpunpckhdq	$xd1,$xd0,$xd0
2282	vpunpckhdq	$xd3,$xd2,$xd2
2283	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
2284	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
2285	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
2286	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
2287___
2288	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2289$code.=<<___;
2290	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
2291	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
2292	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
2293	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
2294	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
2295	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
2296	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
2297	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
2298___
2299	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2300$code.=<<___;
2301	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
2302	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
2303	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
2304	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
2305	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
2306	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
2307	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
2308	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
2309	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
2310	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
2311	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
2312	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
2313	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
2314	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
2315	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
2316	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
2317___
2318	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2319	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2320
2321	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2322	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2323	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2324	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2325$code.=<<___;
2326	cmp		\$64*16,$len
2327	jb		.Ltail16x
2328
2329	vpxord		0x00($inp),$xa0,$xa0	# xor with input
2330	vpxord		0x40($inp),$xb0,$xb0
2331	vpxord		0x80($inp),$xc0,$xc0
2332	vpxord		0xc0($inp),$xd0,$xd0
2333	vmovdqu32	$xa0,0x00($out)
2334	vmovdqu32	$xb0,0x40($out)
2335	vmovdqu32	$xc0,0x80($out)
2336	vmovdqu32	$xd0,0xc0($out)
2337
2338	vpxord		0x100($inp),$xa1,$xa1
2339	vpxord		0x140($inp),$xb1,$xb1
2340	vpxord		0x180($inp),$xc1,$xc1
2341	vpxord		0x1c0($inp),$xd1,$xd1
2342	vmovdqu32	$xa1,0x100($out)
2343	vmovdqu32	$xb1,0x140($out)
2344	vmovdqu32	$xc1,0x180($out)
2345	vmovdqu32	$xd1,0x1c0($out)
2346
2347	vpxord		0x200($inp),$xa2,$xa2
2348	vpxord		0x240($inp),$xb2,$xb2
2349	vpxord		0x280($inp),$xc2,$xc2
2350	vpxord		0x2c0($inp),$xd2,$xd2
2351	vmovdqu32	$xa2,0x200($out)
2352	vmovdqu32	$xb2,0x240($out)
2353	vmovdqu32	$xc2,0x280($out)
2354	vmovdqu32	$xd2,0x2c0($out)
2355
2356	vpxord		0x300($inp),$xa3,$xa3
2357	vpxord		0x340($inp),$xb3,$xb3
2358	vpxord		0x380($inp),$xc3,$xc3
2359	vpxord		0x3c0($inp),$xd3,$xd3
2360	lea		0x400($inp),$inp
2361	vmovdqu32	$xa3,0x300($out)
2362	vmovdqu32	$xb3,0x340($out)
2363	vmovdqu32	$xc3,0x380($out)
2364	vmovdqu32	$xd3,0x3c0($out)
2365	lea		0x400($out),$out
2366
2367	sub		\$64*16,$len
2368	jnz		.Loop_outer16x
2369
2370	jmp		.Ldone16x
2371
2372.align	32
2373.Ltail16x:
2374	xor		%r10,%r10
2375	sub		$inp,$out
2376	cmp		\$64*1,$len
2377	jb		.Less_than_64_16x
2378	vpxord		($inp),$xa0,$xa0	# xor with input
2379	vmovdqu32	$xa0,($out,$inp)
2380	je		.Ldone16x
2381	vmovdqa32	$xb0,$xa0
2382	lea		64($inp),$inp
2383
2384	cmp		\$64*2,$len
2385	jb		.Less_than_64_16x
2386	vpxord		($inp),$xb0,$xb0
2387	vmovdqu32	$xb0,($out,$inp)
2388	je		.Ldone16x
2389	vmovdqa32	$xc0,$xa0
2390	lea		64($inp),$inp
2391
2392	cmp		\$64*3,$len
2393	jb		.Less_than_64_16x
2394	vpxord		($inp),$xc0,$xc0
2395	vmovdqu32	$xc0,($out,$inp)
2396	je		.Ldone16x
2397	vmovdqa32	$xd0,$xa0
2398	lea		64($inp),$inp
2399
2400	cmp		\$64*4,$len
2401	jb		.Less_than_64_16x
2402	vpxord		($inp),$xd0,$xd0
2403	vmovdqu32	$xd0,($out,$inp)
2404	je		.Ldone16x
2405	vmovdqa32	$xa1,$xa0
2406	lea		64($inp),$inp
2407
2408	cmp		\$64*5,$len
2409	jb		.Less_than_64_16x
2410	vpxord		($inp),$xa1,$xa1
2411	vmovdqu32	$xa1,($out,$inp)
2412	je		.Ldone16x
2413	vmovdqa32	$xb1,$xa0
2414	lea		64($inp),$inp
2415
2416	cmp		\$64*6,$len
2417	jb		.Less_than_64_16x
2418	vpxord		($inp),$xb1,$xb1
2419	vmovdqu32	$xb1,($out,$inp)
2420	je		.Ldone16x
2421	vmovdqa32	$xc1,$xa0
2422	lea		64($inp),$inp
2423
2424	cmp		\$64*7,$len
2425	jb		.Less_than_64_16x
2426	vpxord		($inp),$xc1,$xc1
2427	vmovdqu32	$xc1,($out,$inp)
2428	je		.Ldone16x
2429	vmovdqa32	$xd1,$xa0
2430	lea		64($inp),$inp
2431
2432	cmp		\$64*8,$len
2433	jb		.Less_than_64_16x
2434	vpxord		($inp),$xd1,$xd1
2435	vmovdqu32	$xd1,($out,$inp)
2436	je		.Ldone16x
2437	vmovdqa32	$xa2,$xa0
2438	lea		64($inp),$inp
2439
2440	cmp		\$64*9,$len
2441	jb		.Less_than_64_16x
2442	vpxord		($inp),$xa2,$xa2
2443	vmovdqu32	$xa2,($out,$inp)
2444	je		.Ldone16x
2445	vmovdqa32	$xb2,$xa0
2446	lea		64($inp),$inp
2447
2448	cmp		\$64*10,$len
2449	jb		.Less_than_64_16x
2450	vpxord		($inp),$xb2,$xb2
2451	vmovdqu32	$xb2,($out,$inp)
2452	je		.Ldone16x
2453	vmovdqa32	$xc2,$xa0
2454	lea		64($inp),$inp
2455
2456	cmp		\$64*11,$len
2457	jb		.Less_than_64_16x
2458	vpxord		($inp),$xc2,$xc2
2459	vmovdqu32	$xc2,($out,$inp)
2460	je		.Ldone16x
2461	vmovdqa32	$xd2,$xa0
2462	lea		64($inp),$inp
2463
2464	cmp		\$64*12,$len
2465	jb		.Less_than_64_16x
2466	vpxord		($inp),$xd2,$xd2
2467	vmovdqu32	$xd2,($out,$inp)
2468	je		.Ldone16x
2469	vmovdqa32	$xa3,$xa0
2470	lea		64($inp),$inp
2471
2472	cmp		\$64*13,$len
2473	jb		.Less_than_64_16x
2474	vpxord		($inp),$xa3,$xa3
2475	vmovdqu32	$xa3,($out,$inp)
2476	je		.Ldone16x
2477	vmovdqa32	$xb3,$xa0
2478	lea		64($inp),$inp
2479
2480	cmp		\$64*14,$len
2481	jb		.Less_than_64_16x
2482	vpxord		($inp),$xb3,$xb3
2483	vmovdqu32	$xb3,($out,$inp)
2484	je		.Ldone16x
2485	vmovdqa32	$xc3,$xa0
2486	lea		64($inp),$inp
2487
2488	cmp		\$64*15,$len
2489	jb		.Less_than_64_16x
2490	vpxord		($inp),$xc3,$xc3
2491	vmovdqu32	$xc3,($out,$inp)
2492	je		.Ldone16x
2493	vmovdqa32	$xd3,$xa0
2494	lea		64($inp),$inp
2495
2496.Less_than_64_16x:
2497	vmovdqa32	$xa0,0x00(%rsp)
2498	lea		($out,$inp),$out
2499	and		\$63,$len
2500
2501.Loop_tail16x:
2502	movzb		($inp,%r10),%eax
2503	movzb		(%rsp,%r10),%ecx
2504	lea		1(%r10),%r10
2505	xor		%ecx,%eax
2506	mov		%al,-1($out,%r10)
2507	dec		$len
2508	jnz		.Loop_tail16x
2509
2510	vpxord		$xa0,$xa0,$xa0
2511	vmovdqa32	$xa0,0(%rsp)
2512
2513.Ldone16x:
2514	vzeroall
2515___
2516$code.=<<___	if ($win64);
2517	movaps		-0xa8(%r9),%xmm6
2518	movaps		-0x98(%r9),%xmm7
2519	movaps		-0x88(%r9),%xmm8
2520	movaps		-0x78(%r9),%xmm9
2521	movaps		-0x68(%r9),%xmm10
2522	movaps		-0x58(%r9),%xmm11
2523	movaps		-0x48(%r9),%xmm12
2524	movaps		-0x38(%r9),%xmm13
2525	movaps		-0x28(%r9),%xmm14
2526	movaps		-0x18(%r9),%xmm15
2527___
2528$code.=<<___;
2529	lea		(%r9),%rsp
2530.cfi_def_cfa_register	rsp
2531.L16x_epilogue:
2532	ret
2533.cfi_endproc
2534.size	ChaCha20_16x,.-ChaCha20_16x
2535___
2536}
2537
2538# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2539#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2540if ($win64) {
# Win64 only: append structured-exception-handling (SEH) support to $code --
# three handler routines written in assembly, followed by the .pdata/.xdata
# tables that attach a handler to each ChaCha20_* entry point.
#
# Register names of the four handler arguments (see the prototype in the
# comment preceding this block). Only $context and $disp are interpolated
# into the text below; $rec and $frame are not referenced in this block.
2541$rec="%rcx";
2542$frame="%rdx";
2543$context="%r8";
2544$disp="%r9";
2545
# Handlers emitted by the heredoc below. Each one pushes %rsi, %rdi, %rbx,
# %rbp, %r12-%r15 and the flags, reserves 64 bytes of stack, and loads
# context->Rax, context->Rip, disp->ImageBase and disp->HandlerData before
# deciding what to restore:
#
#   se_handler     Compares context->Rip with the fixed labels .Lctr32_body
#                  and .Lno_data. Once Rip is at or past the first label,
#                  %rax is reloaded from context->Rsp; if Rip is also below
#                  the second label, %rax is advanced by 64+24+48 and the six
#                  quadwords just below it are copied into the context slots
#                  for rbx, rbp, r12, r13, r14 and r15.
#                  NOTE: the emitted comment on the %r15 store reads "R14";
#                  the value stored is %r15, at offset 240 (R14 is stored at
#                  232 on the line before), so read it as R15.
#   ssse3_handler  Compares context->Rip with ImageBase+HandlerData[0] and
#                  ImageBase+HandlerData[1]. Once Rip is at or past the first
#                  label, %rax is reloaded from context->R9; if Rip is also
#                  below the second label, 4 quadwords (two 16-byte XMM
#                  registers' worth) starting 0x28 bytes below that address
#                  are copied to context offset 512, which the emitted
#                  comment labels Xmm6.
#   full_handler   Same as ssse3_handler except that it copies 20 quadwords
#                  (ten XMM registers' worth) starting 0xa8 bytes below
#                  context->R9.
#
# All three finish in .Lcommon_seh_tail (located inside se_handler), which
# stores %rax as context->Rsp, reloads Rdi/Rsi from 8(%rax)/16(%rax), copies
# 154 quadwords of *context to disp->ContextRecord, calls RtlVirtualUnwind,
# and returns 1 (ExceptionContinueSearch). The `.long 0xa548f3fc` words are
# the hand-encoded "cld; rep movsq" named in their trailing comments, so the
# counts loaded into %ecx are in quadwords, not bytes.
#
# The heredoc ends with the start of the .pdata table: one begin/end/info
# .rva triple per function.
2546$code.=<<___;
2547.extern	__imp_RtlVirtualUnwind
2548.type	se_handler,\@abi-omnipotent
2549.align	16
2550se_handler:
2551	push	%rsi
2552	push	%rdi
2553	push	%rbx
2554	push	%rbp
2555	push	%r12
2556	push	%r13
2557	push	%r14
2558	push	%r15
2559	pushfq
2560	sub	\$64,%rsp
2561
2562	mov	120($context),%rax	# pull context->Rax
2563	mov	248($context),%rbx	# pull context->Rip
2564
2565	mov	8($disp),%rsi		# disp->ImageBase
2566	mov	56($disp),%r11		# disp->HandlerData
2567
2568	lea	.Lctr32_body(%rip),%r10
2569	cmp	%r10,%rbx		# context->Rip<.Lprologue
2570	jb	.Lcommon_seh_tail
2571
2572	mov	152($context),%rax	# pull context->Rsp
2573
2574	lea	.Lno_data(%rip),%r10	# epilogue label
2575	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
2576	jae	.Lcommon_seh_tail
2577
2578	lea	64+24+48(%rax),%rax
2579
2580	mov	-8(%rax),%rbx
2581	mov	-16(%rax),%rbp
2582	mov	-24(%rax),%r12
2583	mov	-32(%rax),%r13
2584	mov	-40(%rax),%r14
2585	mov	-48(%rax),%r15
2586	mov	%rbx,144($context)	# restore context->Rbx
2587	mov	%rbp,160($context)	# restore context->Rbp
2588	mov	%r12,216($context)	# restore context->R12
2589	mov	%r13,224($context)	# restore context->R13
2590	mov	%r14,232($context)	# restore context->R14
2591	mov	%r15,240($context)	# restore context->R14
2592
2593.Lcommon_seh_tail:
2594	mov	8(%rax),%rdi
2595	mov	16(%rax),%rsi
2596	mov	%rax,152($context)	# restore context->Rsp
2597	mov	%rsi,168($context)	# restore context->Rsi
2598	mov	%rdi,176($context)	# restore context->Rdi
2599
2600	mov	40($disp),%rdi		# disp->ContextRecord
2601	mov	$context,%rsi		# context
2602	mov	\$154,%ecx		# sizeof(CONTEXT)
2603	.long	0xa548f3fc		# cld; rep movsq
2604
2605	mov	$disp,%rsi
2606	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2607	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2608	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2609	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2610	mov	40(%rsi),%r10		# disp->ContextRecord
2611	lea	56(%rsi),%r11		# &disp->HandlerData
2612	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2613	mov	%r10,32(%rsp)		# arg5
2614	mov	%r11,40(%rsp)		# arg6
2615	mov	%r12,48(%rsp)		# arg7
2616	mov	%rcx,56(%rsp)		# arg8, (NULL)
2617	call	*__imp_RtlVirtualUnwind(%rip)
2618
2619	mov	\$1,%eax		# ExceptionContinueSearch
2620	add	\$64,%rsp
2621	popfq
2622	pop	%r15
2623	pop	%r14
2624	pop	%r13
2625	pop	%r12
2626	pop	%rbp
2627	pop	%rbx
2628	pop	%rdi
2629	pop	%rsi
2630	ret
2631.size	se_handler,.-se_handler
2632
2633.type	ssse3_handler,\@abi-omnipotent
2634.align	16
2635ssse3_handler:
2636	push	%rsi
2637	push	%rdi
2638	push	%rbx
2639	push	%rbp
2640	push	%r12
2641	push	%r13
2642	push	%r14
2643	push	%r15
2644	pushfq
2645	sub	\$64,%rsp
2646
2647	mov	120($context),%rax	# pull context->Rax
2648	mov	248($context),%rbx	# pull context->Rip
2649
2650	mov	8($disp),%rsi		# disp->ImageBase
2651	mov	56($disp),%r11		# disp->HandlerData
2652
2653	mov	0(%r11),%r10d		# HandlerData[0]
2654	lea	(%rsi,%r10),%r10	# prologue label
2655	cmp	%r10,%rbx		# context->Rip<prologue label
2656	jb	.Lcommon_seh_tail
2657
2658	mov	192($context),%rax	# pull context->R9
2659
2660	mov	4(%r11),%r10d		# HandlerData[1]
2661	lea	(%rsi,%r10),%r10	# epilogue label
2662	cmp	%r10,%rbx		# context->Rip>=epilogue label
2663	jae	.Lcommon_seh_tail
2664
2665	lea	-0x28(%rax),%rsi
2666	lea	512($context),%rdi	# &context.Xmm6
2667	mov	\$4,%ecx
2668	.long	0xa548f3fc		# cld; rep movsq
2669
2670	jmp	.Lcommon_seh_tail
2671.size	ssse3_handler,.-ssse3_handler
2672
2673.type	full_handler,\@abi-omnipotent
2674.align	16
2675full_handler:
2676	push	%rsi
2677	push	%rdi
2678	push	%rbx
2679	push	%rbp
2680	push	%r12
2681	push	%r13
2682	push	%r14
2683	push	%r15
2684	pushfq
2685	sub	\$64,%rsp
2686
2687	mov	120($context),%rax	# pull context->Rax
2688	mov	248($context),%rbx	# pull context->Rip
2689
2690	mov	8($disp),%rsi		# disp->ImageBase
2691	mov	56($disp),%r11		# disp->HandlerData
2692
2693	mov	0(%r11),%r10d		# HandlerData[0]
2694	lea	(%rsi,%r10),%r10	# prologue label
2695	cmp	%r10,%rbx		# context->Rip<prologue label
2696	jb	.Lcommon_seh_tail
2697
2698	mov	192($context),%rax	# pull context->R9
2699
2700	mov	4(%r11),%r10d		# HandlerData[1]
2701	lea	(%rsi,%r10),%r10	# epilogue label
2702	cmp	%r10,%rbx		# context->Rip>=epilogue label
2703	jae	.Lcommon_seh_tail
2704
2705	lea	-0xa8(%rax),%rsi
2706	lea	512($context),%rdi	# &context.Xmm6
2707	mov	\$20,%ecx
2708	.long	0xa548f3fc		# cld; rep movsq
2709
2710	jmp	.Lcommon_seh_tail
2711.size	full_handler,.-full_handler
2712
2713.section	.pdata
2714.align	4
2715	.rva	.LSEH_begin_ChaCha20_ctr32
2716	.rva	.LSEH_end_ChaCha20_ctr32
2717	.rva	.LSEH_info_ChaCha20_ctr32
2718
2719	.rva	.LSEH_begin_ChaCha20_ssse3
2720	.rva	.LSEH_end_ChaCha20_ssse3
2721	.rva	.LSEH_info_ChaCha20_ssse3
2722
2723	.rva	.LSEH_begin_ChaCha20_4x
2724	.rva	.LSEH_end_ChaCha20_4x
2725	.rva	.LSEH_info_ChaCha20_4x
2726___
# .pdata entry for ChaCha20_8x, emitted only when $avx>1.
2727$code.=<<___ if ($avx>1);
2728	.rva	.LSEH_begin_ChaCha20_8x
2729	.rva	.LSEH_end_ChaCha20_8x
2730	.rva	.LSEH_info_ChaCha20_8x
2731___
# .pdata entries for ChaCha20_avx512 and ChaCha20_16x, emitted only when
# $avx>2.
2732$code.=<<___ if ($avx>2);
2733	.rva	.LSEH_begin_ChaCha20_avx512
2734	.rva	.LSEH_end_ChaCha20_avx512
2735	.rva	.LSEH_info_ChaCha20_avx512
2736
2737	.rva	.LSEH_begin_ChaCha20_16x
2738	.rva	.LSEH_end_ChaCha20_16x
2739	.rva	.LSEH_info_ChaCha20_16x
2740___
# .xdata records referenced by the .LSEH_info_* entries above. Each starts
# with `.byte 9,0,0,0` and the RVA of its handler; the records that use
# ssse3_handler or full_handler add the two label RVAs those handlers read
# as HandlerData[0] and HandlerData[1]. The ChaCha20_ctr32 record has no
# extra data because se_handler compares against fixed labels instead.
# NOTE(review): 9 presumably encodes UNWIND_INFO version 1 plus
# UNW_FLAG_EHANDLER with no unwind codes -- confirm against the x64 unwind
# data documentation.
2741$code.=<<___;
2742.section	.xdata
2743.align	8
2744.LSEH_info_ChaCha20_ctr32:
2745	.byte	9,0,0,0
2746	.rva	se_handler
2747
2748.LSEH_info_ChaCha20_ssse3:
2749	.byte	9,0,0,0
2750	.rva	ssse3_handler
2751	.rva	.Lssse3_body,.Lssse3_epilogue
2752
2753.LSEH_info_ChaCha20_4x:
2754	.byte	9,0,0,0
2755	.rva	full_handler
2756	.rva	.L4x_body,.L4x_epilogue
2757___
# Unwind info for ChaCha20_8x (full_handler); same $avx>1 condition as its
# .pdata entry.
2758$code.=<<___ if ($avx>1);
2759.LSEH_info_ChaCha20_8x:
2760	.byte	9,0,0,0
2761	.rva	full_handler
2762	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
2763___
# Unwind info for ChaCha20_avx512 (ssse3_handler) and ChaCha20_16x
# (full_handler); same $avx>2 condition as their .pdata entries.
2764$code.=<<___ if ($avx>2);
2765.LSEH_info_ChaCha20_avx512:
2766	.byte	9,0,0,0
2767	.rva	ssse3_handler
2768	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
2769
2770.LSEH_info_ChaCha20_16x:
2771	.byte	9,0,0,0
2772	.rva	full_handler
2773	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
2774___
2775}
2776
# Final pass over the assembly text accumulated in $code, one line at a time:
#   1. every `...` span is replaced by the result of eval'ing its contents,
#      so arithmetic written inside backticks is folded at generation time;
#   2. "down-shift": a register written as %x#%y.. or %x#%z.. is rewritten
#      to the plain %x.. form (the "#%y" / "#%z" part is dropped);
#   3. the resulting line is printed to STDOUT.
2777foreach (split("\n",$code)) {
2778	s/\`([^\`]*)\`/eval $1/ge;
2779
2780	s/%x#%[yz]/%x/g;	# "down-shift"
2781
2782	print $_,"\n";
2783}
2784
# Output is buffered, so a write error may only surface when the handle is
# closed; if STDOUT is a pipe, close also reports a failing reader.  Check the
# result so that truncated output cannot be produced with a zero exit status.
# NOTE(review): where STDOUT is (re)opened is not visible in this part of the
# file -- presumably it feeds the perlasm translator; confirm.
2785close STDOUT or die "error closing STDOUT: $!";
2786