#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder
# why gcc [even when armed with inline assembler] fails to generate
# code this fast. The only thing that is cool about this module is
# that the very same instruction sequence is used for both SHA-256
# and SHA-512. In the former case the instructions operate on 32-bit
# operands, in the latter on 64-bit ones. All I had to do was get one
# flavor right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, compared to the IA-64 implementation, which maintains X[16]
# in the register bank[!], sustains close to 4 instructions per CPU
# clock cycle and runs in 1003 cycles, 1275 is a very good result for
# the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be
# to offload the X[16] updates to the SSE unit, but that would
# require a "deeper" loop unroll, which in turn would naturally cause
# a size blow-up, not to mention increased complexity! And once
# again, only *if* it's actually possible to noticeably improve the
# overall ILP, instruction-level parallelism, on the given CPU
# implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a
# perfect performance ratio of 1.5 between the 64- and 32-bit flavors
# [see above], [currently available] EM64T CPUs are far from it. On
# the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions there, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an
# alternative Maj, resulted in >=5% improvement on most CPUs: +20%
# for SHA256 and, unfortunately, -2% for SHA512 on P4 [which nobody
# should care about that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An
# SSSE3 code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is
# faster and should be used instead.] For reference, the
# corresponding estimated upper limit for improvement of SSSE3 SHA256
# is 28%. The fact that higher coefficients are observed on VIA Nano
# and Bulldozer has more to do with specifics of their architecture
# [which is a topic for a separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with the data from the first block in the
# least significant 128-bit halves and the data from the second block
# in the most significant ones. The data is then processed with the
# same SIMD instruction sequence as for AVX, but with %ymm as
# operands. The side effect is an increased stack frame, 448
# additional bytes in SHA256 and 1152 in SHA512, and a 1.2KB code
# size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
#
# (*)	whichever is best applicable;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining
#	integer-only part, body_00_15; reducing the amount of SIMD
#	instructions below a certain limit makes no difference/sense;
#	to conserve space the SHA256 XOP code path is therefore
#	omitted;

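# A hedged, illustration-only helper (not used by this generator): convert a
# cycles-per-byte figure from the table above into approximate throughput,
# mirroring the 64*1000/1005-style arithmetic in the notes above.  The name
# cpb_to_MBps is ours and exists only for this example.
sub cpb_to_MBps {
	my ($cycles_per_byte,$clock_GHz)=@_;
	return $clock_GHz*1000/$cycles_per_byte;	# MB/s, with 1MB = 10^6 bytes
}
# e.g. cpb_to_MBps(12.2,3.5) is ~287 MB/s for scalar SHA-256 on a 3.5GHz Haswell.
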
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="16*$SZ+3*8(%rsp)";
$framesz="16*$SZ+4*8";

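# A sketch (for orientation only) of the scalar code's stack frame as defined
# by the offsets above, shown for SHA-256 ($SZ==4); for SHA-512 the W[] area
# grows to 16*8=128 bytes and the saved slots follow at 128(%rsp):
#
#	  0(%rsp)	X[0..15]	16*$SZ-byte message-schedule ring buffer
#	 64(%rsp)	$_ctx		saved 1st argument (hash state)
#	 72(%rsp)	$_inp		saved 2nd argument (input pointer)
#	 80(%rsp)	$_end		input end pointer derived from the "3rd" argument
#	 88(%rsp)	$_rsp		%rsp as it was after the six pushes; the
#					epilogue restores %r15..%rbx through it
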
166sub ROUND_00_15()
167{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
168  my $STRIDE=$SZ;
169     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
170
171$code.=<<___;
172	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
173	mov	$f,$a2
174
175	xor	$e,$a0
176	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
177	xor	$g,$a2			# f^g
178
179	mov	$T1,`$SZ*($i&0xf)`(%rsp)
180	xor	$a,$a1
181	and	$e,$a2			# (f^g)&e
182
183	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
184	add	$h,$T1			# T1+=h
185	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
186
187	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
188	xor	$e,$a0
189	add	$a2,$T1			# T1+=Ch(e,f,g)
190
191	mov	$a,$a2
192	add	($Tbl),$T1		# T1+=K[round]
193	xor	$a,$a1
194
195	xor	$b,$a2			# a^b, b^c in next round
196	ror	\$$Sigma1[0],$a0	# Sigma1(e)
197	mov	$b,$h
198
199	and	$a2,$a3
200	ror	\$$Sigma0[0],$a1	# Sigma0(a)
201	add	$a0,$T1			# T1+=Sigma1(e)
202
203	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
204	add	$T1,$d			# d+=T1
205	add	$T1,$h			# h+=T1
206
207	lea	$STRIDE($Tbl),$Tbl	# round++
208___
209$code.=<<___ if ($i<15);
210	add	$a1,$h			# h+=Sigma0(a)
211___
212	($a2,$a3) = ($a3,$a2);
213}
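
# A minimal reference model (hedged, illustration only; never called by the
# generator) of the scalar round emitted by ROUND_00_15 above, written out
# for SHA-256, i.e. 32-bit words.  Note that the emitted code uses the
# equivalent forms Ch(e,f,g)=((f^g)&e)^g and Maj(a,b,c)=Ch(a^b,c,b), and that
# $STRIDE skips an extra 16 bytes whenever a row of the K table has been
# consumed [each row is stored twice for the benefit of the SIMD code paths].
# The names ref_rotr32 and ref_sha256_round are ours.
sub ref_rotr32 { my ($x,$n)=@_; return (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub ref_sha256_round {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$K,$W)=@_;	# $K, $W: round constant and message word
	my $S1 = ref_rotr32($e,6)^ref_rotr32($e,11)^ref_rotr32($e,25);	# Sigma1(e)
	my $S0 = ref_rotr32($a,2)^ref_rotr32($a,13)^ref_rotr32($a,22);	# Sigma0(a)
	my $Ch  = ($e&$f)^(~$e&$g);		# == ((f^g)&e)^g
	my $Maj = ($a&$b)^($a&$c)^($b&$c);	# == ((a^b)&(b^c))^b
	my $T1 = ($h+$S1+$Ch+$K+$W)&0xffffffff;
	return (($T1+$S0+$Maj)&0xffffffff,	# new a
		$a,$b,$c,($d+$T1)&0xffffffff,	# b, c, d, new e
		$e,$f,$g);			# f, g, h
}
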
214
215sub ROUND_16_XX()
216{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
217
218$code.=<<___;
219	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
220	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
221
222	mov	$a0,$T1
223	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
224	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
225	mov	$a2,$a1
226	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
227
228	xor	$T1,$a0
229	shr	\$$sigma0[2],$T1
230	ror	\$$sigma0[0],$a0
231	xor	$a1,$a2
232	shr	\$$sigma1[2],$a1
233
234	ror	\$$sigma1[0],$a2
235	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
236	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
237	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
238
239	add	`$SZ*($i&0xf)`(%rsp),$T1
240	mov	$e,$a0
241	add	$a2,$T1
242	mov	$a,$a1
243___
244	&ROUND_00_15(@_);
245}
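
# Likewise, a hedged reference (illustration only, never called) for the
# message-schedule update that ROUND_16_XX performs in place on the 16-word
# ring buffer kept at the bottom of the stack frame; it reuses ref_rotr32
# from above:
#   X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16])
sub ref_sha256_schedule {
	my ($X,$i)=@_;			# $X: ref to 16-element array of 32-bit words
	my $w1  = $X->[($i+1)&15];
	my $w14 = $X->[($i+14)&15];
	my $s0 = ref_rotr32($w1,7)  ^ ref_rotr32($w1,18)  ^ ($w1>>3);	# sigma0
	my $s1 = ref_rotr32($w14,17)^ ref_rotr32($w14,19) ^ ($w14>>10);	# sigma1
	$X->[$i&15] = ($X->[$i&15] + $s0 + $s1 + $X->[($i+9)&15]) & 0xffffffff;
	return $X->[$i&15];
}
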
246
247$code=<<___;
248.text
249
250.extern	OPENSSL_ia32cap_P
251.globl	$func
252.type	$func,\@function,3
253.align	16
254$func:
255___
256$code.=<<___ if ($SZ==4 || $avx);
257	lea	OPENSSL_ia32cap_P(%rip),%r11
258	mov	0(%r11),%r9d
259	mov	4(%r11),%r10d
260	mov	8(%r11),%r11d
261___
262$code.=<<___ if ($SZ==4 && $shaext);
263	test	\$`1<<29`,%r11d		# check for SHA
264	jnz	_shaext_shortcut
265___
266$code.=<<___ if ($avx && $SZ==8);
267	test	\$`1<<11`,%r10d		# check for XOP
268	jnz	.Lxop_shortcut
269___
270$code.=<<___ if ($avx>1);
271	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
272	cmp	\$`1<<8|1<<5|1<<3`,%r11d
273	je	.Lavx2_shortcut
274___
275$code.=<<___ if ($avx);
276	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
277	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
278	or	%r9d,%r10d
279	cmp	\$`1<<28|1<<9|1<<30`,%r10d
280	je	.Lavx_shortcut
281___
282$code.=<<___ if ($SZ==4);
283	test	\$`1<<9`,%r10d
284	jnz	.Lssse3_shortcut
285___
286$code.=<<___;
287	push	%rbx
288	push	%rbp
289	push	%r12
290	push	%r13
291	push	%r14
292	push	%r15
293	mov	%rsp,%r11		# copy %rsp
294	shl	\$4,%rdx		# num*16
295	sub	\$$framesz,%rsp
296	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
297	and	\$-64,%rsp		# align stack frame
298	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
300	mov	%rdx,$_end		# save end pointer, "3rd" arg
301	mov	%r11,$_rsp		# save copy of %rsp
302.Lprologue:
303
304	mov	$SZ*0($ctx),$A
305	mov	$SZ*1($ctx),$B
306	mov	$SZ*2($ctx),$C
307	mov	$SZ*3($ctx),$D
308	mov	$SZ*4($ctx),$E
309	mov	$SZ*5($ctx),$F
310	mov	$SZ*6($ctx),$G
311	mov	$SZ*7($ctx),$H
312	jmp	.Lloop
313
314.align	16
315.Lloop:
316	mov	$B,$a3
317	lea	$TABLE(%rip),$Tbl
318	xor	$C,$a3			# magic
319___
320	for($i=0;$i<16;$i++) {
321		$code.="	mov	$SZ*$i($inp),$T1\n";
322		$code.="	mov	@ROT[4],$a0\n";
323		$code.="	mov	@ROT[0],$a1\n";
324		$code.="	bswap	$T1\n";
325		&ROUND_00_15($i,@ROT);
326		unshift(@ROT,pop(@ROT));
327	}
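	# unshift(@ROT,pop(@ROT)) rotates the register assignment
	# (a,b,c,d,e,f,g,h) -> (h,a,b,c,d,e,f,g) between rounds, so the eight
	# working variables are renamed at generation time rather than being
	# shuffled with mov instructions at run time.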
328$code.=<<___;
329	jmp	.Lrounds_16_xx
330.align	16
331.Lrounds_16_xx:
332___
333	for(;$i<32;$i++) {
334		&ROUND_16_XX($i,@ROT);
335		unshift(@ROT,pop(@ROT));
336	}
337
338$code.=<<___;
339	cmpb	\$0,`$SZ-1`($Tbl)
340	jnz	.Lrounds_16_xx
341
342	mov	$_ctx,$ctx
343	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
344	lea	16*$SZ($inp),$inp
345
346	add	$SZ*0($ctx),$A
347	add	$SZ*1($ctx),$B
348	add	$SZ*2($ctx),$C
349	add	$SZ*3($ctx),$D
350	add	$SZ*4($ctx),$E
351	add	$SZ*5($ctx),$F
352	add	$SZ*6($ctx),$G
353	add	$SZ*7($ctx),$H
354
355	cmp	$_end,$inp
356
357	mov	$A,$SZ*0($ctx)
358	mov	$B,$SZ*1($ctx)
359	mov	$C,$SZ*2($ctx)
360	mov	$D,$SZ*3($ctx)
361	mov	$E,$SZ*4($ctx)
362	mov	$F,$SZ*5($ctx)
363	mov	$G,$SZ*6($ctx)
364	mov	$H,$SZ*7($ctx)
365	jb	.Lloop
366
367	mov	$_rsp,%rsi
368	mov	(%rsi),%r15
369	mov	8(%rsi),%r14
370	mov	16(%rsi),%r13
371	mov	24(%rsi),%r12
372	mov	32(%rsi),%rbp
373	mov	40(%rsi),%rbx
374	lea	48(%rsi),%rsp
375.Lepilogue:
376	ret
377.size	$func,.-$func
378___
379
380if ($SZ==4) {
381$code.=<<___;
382.align	64
383.type	$TABLE,\@object
384$TABLE:
385	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
386	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
387	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
388	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
389	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
390	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
391	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
392	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
393	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
394	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
395	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
396	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
397	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
398	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
399	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
400	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
401	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
402	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
403	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
404	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
405	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
406	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
407	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
408	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
409	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
410	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
411	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
412	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
413	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
414	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
415	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
416	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
417
418	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
419	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
420	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
421	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
422	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
423	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
424	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
425___
426} else {
427$code.=<<___;
428.align	64
429.type	$TABLE,\@object
430$TABLE:
431	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
432	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
433	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
434	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
435	.quad	0x3956c25bf348b538,0x59f111f1b605d019
436	.quad	0x3956c25bf348b538,0x59f111f1b605d019
437	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
438	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
439	.quad	0xd807aa98a3030242,0x12835b0145706fbe
440	.quad	0xd807aa98a3030242,0x12835b0145706fbe
441	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
442	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
443	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
444	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
445	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
446	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
447	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
448	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
449	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
450	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
451	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
452	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
453	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
454	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
455	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
456	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
457	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
458	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
459	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
460	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
461	.quad	0x06ca6351e003826f,0x142929670a0e6e70
462	.quad	0x06ca6351e003826f,0x142929670a0e6e70
463	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
464	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
465	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
466	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
467	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
468	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
469	.quad	0x81c2c92e47edaee6,0x92722c851482353b
470	.quad	0x81c2c92e47edaee6,0x92722c851482353b
471	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
472	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
473	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
474	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
475	.quad	0xd192e819d6ef5218,0xd69906245565a910
476	.quad	0xd192e819d6ef5218,0xd69906245565a910
477	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
478	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
479	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
480	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
481	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
482	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
483	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
484	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
485	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
486	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
487	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
488	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
489	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
490	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
491	.quad	0x90befffa23631e28,0xa4506cebde82bde9
492	.quad	0x90befffa23631e28,0xa4506cebde82bde9
493	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
494	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
495	.quad	0xca273eceea26619c,0xd186b8c721c0c207
496	.quad	0xca273eceea26619c,0xd186b8c721c0c207
497	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
498	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
499	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
500	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
501	.quad	0x113f9804bef90dae,0x1b710b35131c471b
502	.quad	0x113f9804bef90dae,0x1b710b35131c471b
503	.quad	0x28db77f523047d84,0x32caab7b40c72493
504	.quad	0x28db77f523047d84,0x32caab7b40c72493
505	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
506	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
507	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
508	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
509	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
510	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
511
512	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
513	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
514	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
515___
516}
517
518######################################################################
519# SIMD code paths
520#
521if ($SZ==4 && $shaext) {{{
522######################################################################
523# Intel SHA Extensions implementation of SHA256 update function.
524#
525my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
526
527my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
528my @MSG=map("%xmm$_",(3..6));
529
530$code.=<<___;
531.type	sha256_block_data_order_shaext,\@function,3
532.align	64
533sha256_block_data_order_shaext:
534_shaext_shortcut:
535___
536$code.=<<___ if ($win64);
537	lea	`-8-5*16`(%rsp),%rsp
538	movaps	%xmm6,-8-5*16(%rax)
539	movaps	%xmm7,-8-4*16(%rax)
540	movaps	%xmm8,-8-3*16(%rax)
541	movaps	%xmm9,-8-2*16(%rax)
542	movaps	%xmm10,-8-1*16(%rax)
543.Lprologue_shaext:
544___
545$code.=<<___;
546	lea		K256+0x80(%rip),$Tbl
547	movdqu		($ctx),$ABEF		# DCBA
548	movdqu		16($ctx),$CDGH		# HGFE
549	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
550
551	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
552	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
553	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
554	movdqa		$TMP,$BSWAP		# offload
555	palignr		\$8,$CDGH,$ABEF		# ABEF
556	punpcklqdq	$Wi,$CDGH		# CDGH
557	jmp		.Loop_shaext
558
559.align	16
560.Loop_shaext:
561	movdqu		($inp),@MSG[0]
562	movdqu		0x10($inp),@MSG[1]
563	movdqu		0x20($inp),@MSG[2]
564	pshufb		$TMP,@MSG[0]
565	movdqu		0x30($inp),@MSG[3]
566
567	movdqa		0*32-0x80($Tbl),$Wi
568	paddd		@MSG[0],$Wi
569	pshufb		$TMP,@MSG[1]
570	movdqa		$CDGH,$CDGH_SAVE	# offload
571	sha256rnds2	$ABEF,$CDGH		# 0-3
572	pshufd		\$0x0e,$Wi,$Wi
573	nop
574	movdqa		$ABEF,$ABEF_SAVE	# offload
575	sha256rnds2	$CDGH,$ABEF
576
577	movdqa		1*32-0x80($Tbl),$Wi
578	paddd		@MSG[1],$Wi
579	pshufb		$TMP,@MSG[2]
580	sha256rnds2	$ABEF,$CDGH		# 4-7
581	pshufd		\$0x0e,$Wi,$Wi
582	lea		0x40($inp),$inp
583	sha256msg1	@MSG[1],@MSG[0]
584	sha256rnds2	$CDGH,$ABEF
585
586	movdqa		2*32-0x80($Tbl),$Wi
587	paddd		@MSG[2],$Wi
588	pshufb		$TMP,@MSG[3]
589	sha256rnds2	$ABEF,$CDGH		# 8-11
590	pshufd		\$0x0e,$Wi,$Wi
591	movdqa		@MSG[3],$TMP
592	palignr		\$4,@MSG[2],$TMP
593	nop
594	paddd		$TMP,@MSG[0]
595	sha256msg1	@MSG[2],@MSG[1]
596	sha256rnds2	$CDGH,$ABEF
597
598	movdqa		3*32-0x80($Tbl),$Wi
599	paddd		@MSG[3],$Wi
600	sha256msg2	@MSG[3],@MSG[0]
601	sha256rnds2	$ABEF,$CDGH		# 12-15
602	pshufd		\$0x0e,$Wi,$Wi
603	movdqa		@MSG[0],$TMP
604	palignr		\$4,@MSG[3],$TMP
605	nop
606	paddd		$TMP,@MSG[1]
607	sha256msg1	@MSG[3],@MSG[2]
608	sha256rnds2	$CDGH,$ABEF
609___
610for($i=4;$i<16-3;$i++) {
611$code.=<<___;
612	movdqa		$i*32-0x80($Tbl),$Wi
613	paddd		@MSG[0],$Wi
614	sha256msg2	@MSG[0],@MSG[1]
615	sha256rnds2	$ABEF,$CDGH		# 16-19...
616	pshufd		\$0x0e,$Wi,$Wi
617	movdqa		@MSG[1],$TMP
618	palignr		\$4,@MSG[0],$TMP
619	nop
620	paddd		$TMP,@MSG[2]
621	sha256msg1	@MSG[0],@MSG[3]
622	sha256rnds2	$CDGH,$ABEF
623___
624	push(@MSG,shift(@MSG));
625}
626$code.=<<___;
627	movdqa		13*32-0x80($Tbl),$Wi
628	paddd		@MSG[0],$Wi
629	sha256msg2	@MSG[0],@MSG[1]
630	sha256rnds2	$ABEF,$CDGH		# 52-55
631	pshufd		\$0x0e,$Wi,$Wi
632	movdqa		@MSG[1],$TMP
633	palignr		\$4,@MSG[0],$TMP
634	sha256rnds2	$CDGH,$ABEF
635	paddd		$TMP,@MSG[2]
636
637	movdqa		14*32-0x80($Tbl),$Wi
638	paddd		@MSG[1],$Wi
639	sha256rnds2	$ABEF,$CDGH		# 56-59
640	pshufd		\$0x0e,$Wi,$Wi
641	sha256msg2	@MSG[1],@MSG[2]
642	movdqa		$BSWAP,$TMP
643	sha256rnds2	$CDGH,$ABEF
644
645	movdqa		15*32-0x80($Tbl),$Wi
646	paddd		@MSG[2],$Wi
647	nop
648	sha256rnds2	$ABEF,$CDGH		# 60-63
649	pshufd		\$0x0e,$Wi,$Wi
650	dec		$num
651	nop
652	sha256rnds2	$CDGH,$ABEF
653
654	paddd		$CDGH_SAVE,$CDGH
655	paddd		$ABEF_SAVE,$ABEF
656	jnz		.Loop_shaext
657
658	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
659	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
660	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
661	punpckhqdq	$CDGH,$ABEF		# DCBA
662	palignr		\$8,$TMP,$CDGH		# HGFE
663
664	movdqu	$ABEF,($ctx)
665	movdqu	$CDGH,16($ctx)
666___
667$code.=<<___ if ($win64);
668	movaps	-8-5*16(%rax),%xmm6
669	movaps	-8-4*16(%rax),%xmm7
670	movaps	-8-3*16(%rax),%xmm8
671	movaps	-8-2*16(%rax),%xmm9
672	movaps	-8-1*16(%rax),%xmm10
673	mov	%rax,%rsp
674.Lepilogue_shaext:
675___
676$code.=<<___;
677	ret
678.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
679___
680}}}
681{{{
682
683my $a4=$T1;
684my ($a,$b,$c,$d,$e,$f,$g,$h);
685
686sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
687{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
688  my $arg = pop;
689    $arg = "\$$arg" if ($arg*1 eq $arg);
690    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
691}
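# For example, &ror($a0,14) has no Perl definition of its own [until the AVX
# path overrides *ror below], so it lands in AUTOLOAD above and appends
# "\tror\t\$14,%r13d\n" to $code when generating SHA-256: a trailing numeric
# argument becomes an immediate, and the remaining arguments are emitted in
# reverse, i.e. AT&T operand order.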
692
693sub body_00_15 () {
694	(
695	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
696
697	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
698	'&mov	($a,$a1)',
699	'&mov	($a4,$f)',
700
701	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
702	'&xor	($a0,$e)',
703	'&xor	($a4,$g)',			# f^g
704
705	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
706	'&xor	($a1,$a)',
707	'&and	($a4,$e)',			# (f^g)&e
708
709	'&xor	($a0,$e)',
710	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
711	'&mov	($a2,$a)',
712
713	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
714	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
715	'&xor	($a2,$b)',			# a^b, b^c in next round
716
717	'&add	($h,$a4)',			# h+=Ch(e,f,g)
718	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
719	'&and	($a3,$a2)',			# (b^c)&(a^b)
720
721	'&xor	($a1,$a)',
722	'&add	($h,$a0)',			# h+=Sigma1(e)
723	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
724
725	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
726	'&add	($d,$h)',			# d+=h
727	'&add	($h,$a3)',			# h+=Maj(a,b,c)
728
729	'&mov	($a0,$d)',
730	'&add	($a1,$h);'.			# h+=Sigma0(a)
731	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
732	);
733}
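# Each body_00_15() call returns 26 strings, one perlasm call per instruction
# of a single scalar round (4 rounds x 26 = the "104 instructions" noted in
# the SIMD subs below).  The SIMD Xupdate routines eval() these strings a few
# at a time between their own vector instructions, interleaving the integer
# rounds with the message-schedule computation in the emitted code.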
734
735######################################################################
736# SSSE3 code path
737#
738if ($SZ==4) {	# SHA256 only
739my @X = map("%xmm$_",(0..3));
740my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
741
742$code.=<<___;
743.type	${func}_ssse3,\@function,3
744.align	64
745${func}_ssse3:
746.Lssse3_shortcut:
747	push	%rbx
748	push	%rbp
749	push	%r12
750	push	%r13
751	push	%r14
752	push	%r15
753	mov	%rsp,%r11		# copy %rsp
754	shl	\$4,%rdx		# num*16
755	sub	\$`$framesz+$win64*16*4`,%rsp
756	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
757	and	\$-64,%rsp		# align stack frame
758	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
760	mov	%rdx,$_end		# save end pointer, "3rd" arg
761	mov	%r11,$_rsp		# save copy of %rsp
762___
763$code.=<<___ if ($win64);
764	movaps	%xmm6,16*$SZ+32(%rsp)
765	movaps	%xmm7,16*$SZ+48(%rsp)
766	movaps	%xmm8,16*$SZ+64(%rsp)
767	movaps	%xmm9,16*$SZ+80(%rsp)
768___
769$code.=<<___;
770.Lprologue_ssse3:
771
772	mov	$SZ*0($ctx),$A
773	mov	$SZ*1($ctx),$B
774	mov	$SZ*2($ctx),$C
775	mov	$SZ*3($ctx),$D
776	mov	$SZ*4($ctx),$E
777	mov	$SZ*5($ctx),$F
778	mov	$SZ*6($ctx),$G
779	mov	$SZ*7($ctx),$H
780___
781
782$code.=<<___;
783	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
784	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
785	jmp	.Lloop_ssse3
786.align	16
787.Lloop_ssse3:
788	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
789	movdqu	0x00($inp),@X[0]
790	movdqu	0x10($inp),@X[1]
791	movdqu	0x20($inp),@X[2]
792	pshufb	$t3,@X[0]
793	movdqu	0x30($inp),@X[3]
794	lea	$TABLE(%rip),$Tbl
795	pshufb	$t3,@X[1]
796	movdqa	0x00($Tbl),$t0
797	movdqa	0x20($Tbl),$t1
798	pshufb	$t3,@X[2]
799	paddd	@X[0],$t0
800	movdqa	0x40($Tbl),$t2
801	pshufb	$t3,@X[3]
802	movdqa	0x60($Tbl),$t3
803	paddd	@X[1],$t1
804	paddd	@X[2],$t2
805	paddd	@X[3],$t3
806	movdqa	$t0,0x00(%rsp)
807	mov	$A,$a1
808	movdqa	$t1,0x10(%rsp)
809	mov	$B,$a3
810	movdqa	$t2,0x20(%rsp)
811	xor	$C,$a3			# magic
812	movdqa	$t3,0x30(%rsp)
813	mov	$E,$a0
814	jmp	.Lssse3_00_47
815
816.align	16
817.Lssse3_00_47:
818	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
819___
820sub Xupdate_256_SSSE3 () {
821	(
822	'&movdqa	($t0,@X[1]);',
823	'&movdqa	($t3,@X[3])',
824	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
825	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
826	'&movdqa	($t1,$t0)',
827	'&movdqa	($t2,$t0);',
828	'&psrld		($t0,$sigma0[2])',
829	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
830	'&psrld		($t2,$sigma0[0])',
831	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
832	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
833	'&pxor		($t0,$t2)',
834	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
835	'&pxor		($t0,$t1)',
836	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
837	'&pxor		($t0,$t2);',
838	 '&movdqa	($t2,$t3)',
839	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
840	 '&psrld	($t3,$sigma1[2])',
841	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
842	 '&psrlq	($t2,$sigma1[0])',
843	 '&pxor		($t3,$t2);',
844	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
845	 '&pxor		($t3,$t2)',
846	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
847	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
848	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
849	 '&movdqa	($t2,$t3);',
850	 '&psrld	($t3,$sigma1[2])',
851	 '&psrlq	($t2,$sigma1[0])',
852	 '&pxor		($t3,$t2);',
853	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
854	 '&pxor		($t3,$t2);',
855	'&movdqa	($t2,16*2*$j."($Tbl)")',
856	 '&pshufb	($t3,$t5)',
857	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
858	);
859}
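
# A note on the SSSE3 schedule above: SSE has no packed rotate, so each
# rotation in sigma0 is put together from pslld/psrld/pxor.  For sigma1 the
# 32-bit rotations are borrowed from 64-bit psrlq shifts, which only yields
# valid results in two of the four lanes, so sigma1 is applied to X[14..15]
# and then X[16..17] in two passes and the halves are merged back (with
# pshufb here, with pshufd/psrldq/pslldq in the hand-interleaved copy below).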
860
861sub SSSE3_256_00_47 () {
862my $j = shift;
863my $body = shift;
864my @X = @_;
865my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
866
867    if (0) {
868	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
869	    eval;
870	    eval(shift(@insns));
871	    eval(shift(@insns));
872	    eval(shift(@insns));
873	}
874    } else {			# squeeze extra 4% on Westmere and 19% on Atom
875	  eval(shift(@insns));	#@
876	&movdqa		($t0,@X[1]);
877	  eval(shift(@insns));
878	  eval(shift(@insns));
879	&movdqa		($t3,@X[3]);
880	  eval(shift(@insns));	#@
881	  eval(shift(@insns));
882	  eval(shift(@insns));
883	  eval(shift(@insns));	#@
884	  eval(shift(@insns));
885	&palignr	($t0,@X[0],$SZ);	# X[1..4]
886	  eval(shift(@insns));
887	  eval(shift(@insns));
888	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
889	  eval(shift(@insns));
890	  eval(shift(@insns));
891	  eval(shift(@insns));
892	  eval(shift(@insns));	#@
893	&movdqa		($t1,$t0);
894	  eval(shift(@insns));
895	  eval(shift(@insns));
896	&movdqa		($t2,$t0);
897	  eval(shift(@insns));	#@
898	  eval(shift(@insns));
899	&psrld		($t0,$sigma0[2]);
900	  eval(shift(@insns));
901	  eval(shift(@insns));
902	  eval(shift(@insns));
903	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
904	  eval(shift(@insns));	#@
905	  eval(shift(@insns));
906	&psrld		($t2,$sigma0[0]);
907	  eval(shift(@insns));
908	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
910	  eval(shift(@insns));
911	  eval(shift(@insns));	#@
912	&pslld		($t1,8*$SZ-$sigma0[1]);
913	  eval(shift(@insns));
914	  eval(shift(@insns));
915	&pxor		($t0,$t2);
916	  eval(shift(@insns));	#@
917	  eval(shift(@insns));
918	  eval(shift(@insns));
919	  eval(shift(@insns));	#@
920	&psrld		($t2,$sigma0[1]-$sigma0[0]);
921	  eval(shift(@insns));
922	&pxor		($t0,$t1);
923	  eval(shift(@insns));
924	  eval(shift(@insns));
925	&pslld		($t1,$sigma0[1]-$sigma0[0]);
926	  eval(shift(@insns));
927	  eval(shift(@insns));
928	&pxor		($t0,$t2);
929	  eval(shift(@insns));
930	  eval(shift(@insns));	#@
931	 &movdqa	($t2,$t3);
932	  eval(shift(@insns));
933	  eval(shift(@insns));
934	&pxor		($t0,$t1);		# sigma0(X[1..4])
935	  eval(shift(@insns));	#@
936	  eval(shift(@insns));
937	  eval(shift(@insns));
938	 &psrld		($t3,$sigma1[2]);
939	  eval(shift(@insns));
940	  eval(shift(@insns));
941	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
942	  eval(shift(@insns));	#@
943	  eval(shift(@insns));
944	 &psrlq		($t2,$sigma1[0]);
945	  eval(shift(@insns));
946	  eval(shift(@insns));
947	  eval(shift(@insns));
948	 &pxor		($t3,$t2);
949	  eval(shift(@insns));	#@
950	  eval(shift(@insns));
951	  eval(shift(@insns));
952	  eval(shift(@insns));	#@
953	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
954	  eval(shift(@insns));
955	  eval(shift(@insns));
956	 &pxor		($t3,$t2);
957	  eval(shift(@insns));	#@
958	  eval(shift(@insns));
959	  eval(shift(@insns));
960	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
961	 &pshufd	($t3,$t3,0b10000000);
962	  eval(shift(@insns));
963	  eval(shift(@insns));
964	  eval(shift(@insns));
965	 &psrldq	($t3,8);
966	  eval(shift(@insns));
967	  eval(shift(@insns));	#@
968	  eval(shift(@insns));
969	  eval(shift(@insns));
970	  eval(shift(@insns));	#@
971	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
972	  eval(shift(@insns));
973	  eval(shift(@insns));
974	  eval(shift(@insns));
975	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
976	  eval(shift(@insns));
977	  eval(shift(@insns));	#@
978	  eval(shift(@insns));
979	 &movdqa	($t2,$t3);
980	  eval(shift(@insns));
981	  eval(shift(@insns));
982	 &psrld		($t3,$sigma1[2]);
983	  eval(shift(@insns));
984	  eval(shift(@insns));	#@
985	 &psrlq		($t2,$sigma1[0]);
986	  eval(shift(@insns));
987	  eval(shift(@insns));
988	 &pxor		($t3,$t2);
989	  eval(shift(@insns));	#@
990	  eval(shift(@insns));
991	  eval(shift(@insns));
992	  eval(shift(@insns));	#@
993	  eval(shift(@insns));
994	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
995	  eval(shift(@insns));
996	  eval(shift(@insns));
997	  eval(shift(@insns));
998	 &pxor		($t3,$t2);
999	  eval(shift(@insns));
1000	  eval(shift(@insns));
1001	  eval(shift(@insns));	#@
1002	 #&pshufb	($t3,$t5);
1003	 &pshufd	($t3,$t3,0b00001000);
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));
1006	&movdqa		($t2,16*2*$j."($Tbl)");
1007	  eval(shift(@insns));	#@
1008	  eval(shift(@insns));
1009	 &pslldq	($t3,8);
1010	  eval(shift(@insns));
1011	  eval(shift(@insns));
1012	  eval(shift(@insns));
1013	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1014	  eval(shift(@insns));	#@
1015	  eval(shift(@insns));
1016	  eval(shift(@insns));
1017    }
1018	&paddd		($t2,@X[0]);
1019	  foreach (@insns) { eval; }		# remaining instructions
1020	&movdqa		(16*$j."(%rsp)",$t2);
1021}
1022
1023    for ($i=0,$j=0; $j<4; $j++) {
1024	&SSSE3_256_00_47($j,\&body_00_15,@X);
1025	push(@X,shift(@X));			# rotate(@X)
1026    }
1027	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1028	&jne	(".Lssse3_00_47");
1029
1030    for ($i=0; $i<16; ) {
1031	foreach(body_00_15()) { eval; }
1032    }
1033$code.=<<___;
1034	mov	$_ctx,$ctx
1035	mov	$a1,$A
1036
1037	add	$SZ*0($ctx),$A
1038	lea	16*$SZ($inp),$inp
1039	add	$SZ*1($ctx),$B
1040	add	$SZ*2($ctx),$C
1041	add	$SZ*3($ctx),$D
1042	add	$SZ*4($ctx),$E
1043	add	$SZ*5($ctx),$F
1044	add	$SZ*6($ctx),$G
1045	add	$SZ*7($ctx),$H
1046
1047	cmp	$_end,$inp
1048
1049	mov	$A,$SZ*0($ctx)
1050	mov	$B,$SZ*1($ctx)
1051	mov	$C,$SZ*2($ctx)
1052	mov	$D,$SZ*3($ctx)
1053	mov	$E,$SZ*4($ctx)
1054	mov	$F,$SZ*5($ctx)
1055	mov	$G,$SZ*6($ctx)
1056	mov	$H,$SZ*7($ctx)
1057	jb	.Lloop_ssse3
1058
1059	mov	$_rsp,%rsi
1060___
1061$code.=<<___ if ($win64);
1062	movaps	16*$SZ+32(%rsp),%xmm6
1063	movaps	16*$SZ+48(%rsp),%xmm7
1064	movaps	16*$SZ+64(%rsp),%xmm8
1065	movaps	16*$SZ+80(%rsp),%xmm9
1066___
1067$code.=<<___;
1068	mov	(%rsi),%r15
1069	mov	8(%rsi),%r14
1070	mov	16(%rsi),%r13
1071	mov	24(%rsi),%r12
1072	mov	32(%rsi),%rbp
1073	mov	40(%rsi),%rbx
1074	lea	48(%rsi),%rsp
1075.Lepilogue_ssse3:
1076	ret
1077.size	${func}_ssse3,.-${func}_ssse3
1078___
1079}
1080
1081if ($avx) {{
1082######################################################################
1083# XOP code path
1084#
1085if ($SZ==8) {	# SHA512 only
1086$code.=<<___;
1087.type	${func}_xop,\@function,3
1088.align	64
1089${func}_xop:
1090.Lxop_shortcut:
1091	push	%rbx
1092	push	%rbp
1093	push	%r12
1094	push	%r13
1095	push	%r14
1096	push	%r15
1097	mov	%rsp,%r11		# copy %rsp
1098	shl	\$4,%rdx		# num*16
1099	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1100	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1101	and	\$-64,%rsp		# align stack frame
1102	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1104	mov	%rdx,$_end		# save end pointer, "3rd" arg
1105	mov	%r11,$_rsp		# save copy of %rsp
1106___
1107$code.=<<___ if ($win64);
1108	movaps	%xmm6,16*$SZ+32(%rsp)
1109	movaps	%xmm7,16*$SZ+48(%rsp)
1110	movaps	%xmm8,16*$SZ+64(%rsp)
1111	movaps	%xmm9,16*$SZ+80(%rsp)
1112___
1113$code.=<<___ if ($win64 && $SZ>4);
1114	movaps	%xmm10,16*$SZ+96(%rsp)
1115	movaps	%xmm11,16*$SZ+112(%rsp)
1116___
1117$code.=<<___;
1118.Lprologue_xop:
1119
1120	vzeroupper
1121	mov	$SZ*0($ctx),$A
1122	mov	$SZ*1($ctx),$B
1123	mov	$SZ*2($ctx),$C
1124	mov	$SZ*3($ctx),$D
1125	mov	$SZ*4($ctx),$E
1126	mov	$SZ*5($ctx),$F
1127	mov	$SZ*6($ctx),$G
1128	mov	$SZ*7($ctx),$H
1129	jmp	.Lloop_xop
1130___
1131					if ($SZ==4) {	# SHA256
1132    my @X = map("%xmm$_",(0..3));
1133    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1134
1135$code.=<<___;
1136.align	16
1137.Lloop_xop:
1138	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1139	vmovdqu	0x00($inp),@X[0]
1140	vmovdqu	0x10($inp),@X[1]
1141	vmovdqu	0x20($inp),@X[2]
1142	vmovdqu	0x30($inp),@X[3]
1143	vpshufb	$t3,@X[0],@X[0]
1144	lea	$TABLE(%rip),$Tbl
1145	vpshufb	$t3,@X[1],@X[1]
1146	vpshufb	$t3,@X[2],@X[2]
1147	vpaddd	0x00($Tbl),@X[0],$t0
1148	vpshufb	$t3,@X[3],@X[3]
1149	vpaddd	0x20($Tbl),@X[1],$t1
1150	vpaddd	0x40($Tbl),@X[2],$t2
1151	vpaddd	0x60($Tbl),@X[3],$t3
1152	vmovdqa	$t0,0x00(%rsp)
1153	mov	$A,$a1
1154	vmovdqa	$t1,0x10(%rsp)
1155	mov	$B,$a3
1156	vmovdqa	$t2,0x20(%rsp)
1157	xor	$C,$a3			# magic
1158	vmovdqa	$t3,0x30(%rsp)
1159	mov	$E,$a0
1160	jmp	.Lxop_00_47
1161
1162.align	16
1163.Lxop_00_47:
1164	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1165___
1166sub XOP_256_00_47 () {
1167my $j = shift;
1168my $body = shift;
1169my @X = @_;
1170my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1171
1172	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1173	  eval(shift(@insns));
1174	  eval(shift(@insns));
1175	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1176	  eval(shift(@insns));
1177	  eval(shift(@insns));
1178	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1179	  eval(shift(@insns));
1180	  eval(shift(@insns));
1181	&vpsrld		($t0,$t0,$sigma0[2]);
1182	  eval(shift(@insns));
1183	  eval(shift(@insns));
1184	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1185	  eval(shift(@insns));
1186	  eval(shift(@insns));
1187	  eval(shift(@insns));
1188	  eval(shift(@insns));
1189	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1190	  eval(shift(@insns));
1191	  eval(shift(@insns));
1192	&vpxor		($t0,$t0,$t1);
1193	  eval(shift(@insns));
1194	  eval(shift(@insns));
1195	  eval(shift(@insns));
1196	  eval(shift(@insns));
1197	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1198	  eval(shift(@insns));
1199	  eval(shift(@insns));
1200	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1201	  eval(shift(@insns));
1202	  eval(shift(@insns));
1203	 &vpsrld	($t2,@X[3],$sigma1[2]);
1204	  eval(shift(@insns));
1205	  eval(shift(@insns));
1206	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1207	  eval(shift(@insns));
1208	  eval(shift(@insns));
1209	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1210	  eval(shift(@insns));
1211	  eval(shift(@insns));
1212	 &vpxor		($t3,$t3,$t2);
1213	  eval(shift(@insns));
1214	  eval(shift(@insns));
1215	  eval(shift(@insns));
1216	  eval(shift(@insns));
1217	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1218	  eval(shift(@insns));
1219	  eval(shift(@insns));
1220	  eval(shift(@insns));
1221	  eval(shift(@insns));
1222	&vpsrldq	($t3,$t3,8);
1223	  eval(shift(@insns));
1224	  eval(shift(@insns));
1225	  eval(shift(@insns));
1226	  eval(shift(@insns));
1227	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1228	  eval(shift(@insns));
1229	  eval(shift(@insns));
1230	  eval(shift(@insns));
1231	  eval(shift(@insns));
1232	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1233	  eval(shift(@insns));
1234	  eval(shift(@insns));
1235	 &vpsrld	($t2,@X[0],$sigma1[2]);
1236	  eval(shift(@insns));
1237	  eval(shift(@insns));
1238	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1239	  eval(shift(@insns));
1240	  eval(shift(@insns));
1241	 &vpxor		($t3,$t3,$t2);
1242	  eval(shift(@insns));
1243	  eval(shift(@insns));
1244	  eval(shift(@insns));
1245	  eval(shift(@insns));
1246	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1247	  eval(shift(@insns));
1248	  eval(shift(@insns));
1249	  eval(shift(@insns));
1250	  eval(shift(@insns));
1251	&vpslldq	($t3,$t3,8);		# 22 instructions
1252	  eval(shift(@insns));
1253	  eval(shift(@insns));
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1257	  eval(shift(@insns));
1258	  eval(shift(@insns));
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1262	  foreach (@insns) { eval; }		# remaining instructions
1263	&vmovdqa	(16*$j."(%rsp)",$t2);
1264}
1265
1266    for ($i=0,$j=0; $j<4; $j++) {
1267	&XOP_256_00_47($j,\&body_00_15,@X);
1268	push(@X,shift(@X));			# rotate(@X)
1269    }
1270	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1271	&jne	(".Lxop_00_47");
1272
1273    for ($i=0; $i<16; ) {
1274	foreach(body_00_15()) { eval; }
1275    }
1276
1277					} else {	# SHA512
1278    my @X = map("%xmm$_",(0..7));
1279    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1280
1281$code.=<<___;
1282.align	16
1283.Lloop_xop:
1284	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1285	vmovdqu	0x00($inp),@X[0]
1286	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1287	vmovdqu	0x10($inp),@X[1]
1288	vmovdqu	0x20($inp),@X[2]
1289	vpshufb	$t3,@X[0],@X[0]
1290	vmovdqu	0x30($inp),@X[3]
1291	vpshufb	$t3,@X[1],@X[1]
1292	vmovdqu	0x40($inp),@X[4]
1293	vpshufb	$t3,@X[2],@X[2]
1294	vmovdqu	0x50($inp),@X[5]
1295	vpshufb	$t3,@X[3],@X[3]
1296	vmovdqu	0x60($inp),@X[6]
1297	vpshufb	$t3,@X[4],@X[4]
1298	vmovdqu	0x70($inp),@X[7]
1299	vpshufb	$t3,@X[5],@X[5]
1300	vpaddq	-0x80($Tbl),@X[0],$t0
1301	vpshufb	$t3,@X[6],@X[6]
1302	vpaddq	-0x60($Tbl),@X[1],$t1
1303	vpshufb	$t3,@X[7],@X[7]
1304	vpaddq	-0x40($Tbl),@X[2],$t2
1305	vpaddq	-0x20($Tbl),@X[3],$t3
1306	vmovdqa	$t0,0x00(%rsp)
1307	vpaddq	0x00($Tbl),@X[4],$t0
1308	vmovdqa	$t1,0x10(%rsp)
1309	vpaddq	0x20($Tbl),@X[5],$t1
1310	vmovdqa	$t2,0x20(%rsp)
1311	vpaddq	0x40($Tbl),@X[6],$t2
1312	vmovdqa	$t3,0x30(%rsp)
1313	vpaddq	0x60($Tbl),@X[7],$t3
1314	vmovdqa	$t0,0x40(%rsp)
1315	mov	$A,$a1
1316	vmovdqa	$t1,0x50(%rsp)
1317	mov	$B,$a3
1318	vmovdqa	$t2,0x60(%rsp)
1319	xor	$C,$a3			# magic
1320	vmovdqa	$t3,0x70(%rsp)
1321	mov	$E,$a0
1322	jmp	.Lxop_00_47
1323
1324.align	16
1325.Lxop_00_47:
1326	add	\$`16*2*$SZ`,$Tbl
1327___
1328sub XOP_512_00_47 () {
1329my $j = shift;
1330my $body = shift;
1331my @X = @_;
1332my @insns = (&$body,&$body);			# 52 instructions
1333
1334	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1335	  eval(shift(@insns));
1336	  eval(shift(@insns));
1337	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1338	  eval(shift(@insns));
1339	  eval(shift(@insns));
1340	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1341	  eval(shift(@insns));
1342	  eval(shift(@insns));
1343	&vpsrlq		($t0,$t0,$sigma0[2]);
1344	  eval(shift(@insns));
1345	  eval(shift(@insns));
1346	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1347	  eval(shift(@insns));
1348	  eval(shift(@insns));
1349	  eval(shift(@insns));
1350	  eval(shift(@insns));
1351	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1352	  eval(shift(@insns));
1353	  eval(shift(@insns));
1354	&vpxor		($t0,$t0,$t1);
1355	  eval(shift(@insns));
1356	  eval(shift(@insns));
1357	  eval(shift(@insns));
1358	  eval(shift(@insns));
1359	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1360	  eval(shift(@insns));
1361	  eval(shift(@insns));
1362	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1363	  eval(shift(@insns));
1364	  eval(shift(@insns));
1365	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1366	  eval(shift(@insns));
1367	  eval(shift(@insns));
1368	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1369	  eval(shift(@insns));
1370	  eval(shift(@insns));
1371	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1372	  eval(shift(@insns));
1373	  eval(shift(@insns));
1374	 &vpxor		($t3,$t3,$t2);
1375	  eval(shift(@insns));
1376	  eval(shift(@insns));
1377	  eval(shift(@insns));
1378	  eval(shift(@insns));
1379	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1380	  eval(shift(@insns));
1381	  eval(shift(@insns));
1382	  eval(shift(@insns));
1383	  eval(shift(@insns));
1384	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1385	  eval(shift(@insns));
1386	  eval(shift(@insns));
1387	  eval(shift(@insns));
1388	  eval(shift(@insns));
1389	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1390	  foreach (@insns) { eval; }		# remaining instructions
1391	&vmovdqa	(16*$j."(%rsp)",$t2);
1392}
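
# XOP's vprotd/vprotq provide genuine packed rotates, so sigma0/sigma1 are
# computed directly here instead of being emulated with shift/shift/xor pairs
# as in the SSSE3 and AVX paths; that is what makes the XOP Xupdate shorter.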
1393
1394    for ($i=0,$j=0; $j<8; $j++) {
1395	&XOP_512_00_47($j,\&body_00_15,@X);
1396	push(@X,shift(@X));			# rotate(@X)
1397    }
1398	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1399	&jne	(".Lxop_00_47");
1400
1401    for ($i=0; $i<16; ) {
1402	foreach(body_00_15()) { eval; }
1403    }
1404}
1405$code.=<<___;
1406	mov	$_ctx,$ctx
1407	mov	$a1,$A
1408
1409	add	$SZ*0($ctx),$A
1410	lea	16*$SZ($inp),$inp
1411	add	$SZ*1($ctx),$B
1412	add	$SZ*2($ctx),$C
1413	add	$SZ*3($ctx),$D
1414	add	$SZ*4($ctx),$E
1415	add	$SZ*5($ctx),$F
1416	add	$SZ*6($ctx),$G
1417	add	$SZ*7($ctx),$H
1418
1419	cmp	$_end,$inp
1420
1421	mov	$A,$SZ*0($ctx)
1422	mov	$B,$SZ*1($ctx)
1423	mov	$C,$SZ*2($ctx)
1424	mov	$D,$SZ*3($ctx)
1425	mov	$E,$SZ*4($ctx)
1426	mov	$F,$SZ*5($ctx)
1427	mov	$G,$SZ*6($ctx)
1428	mov	$H,$SZ*7($ctx)
1429	jb	.Lloop_xop
1430
1431	mov	$_rsp,%rsi
1432	vzeroupper
1433___
1434$code.=<<___ if ($win64);
1435	movaps	16*$SZ+32(%rsp),%xmm6
1436	movaps	16*$SZ+48(%rsp),%xmm7
1437	movaps	16*$SZ+64(%rsp),%xmm8
1438	movaps	16*$SZ+80(%rsp),%xmm9
1439___
1440$code.=<<___ if ($win64 && $SZ>4);
1441	movaps	16*$SZ+96(%rsp),%xmm10
1442	movaps	16*$SZ+112(%rsp),%xmm11
1443___
1444$code.=<<___;
1445	mov	(%rsi),%r15
1446	mov	8(%rsi),%r14
1447	mov	16(%rsi),%r13
1448	mov	24(%rsi),%r12
1449	mov	32(%rsi),%rbp
1450	mov	40(%rsi),%rbx
1451	lea	48(%rsi),%rsp
1452.Lepilogue_xop:
1453	ret
1454.size	${func}_xop,.-${func}_xop
1455___
1456}
1457######################################################################
1458# AVX+shrd code path
1459#
1460local *ror = sub { &shrd(@_[0],@_) };
1461
1462$code.=<<___;
1463.type	${func}_avx,\@function,3
1464.align	64
1465${func}_avx:
1466.Lavx_shortcut:
1467	push	%rbx
1468	push	%rbp
1469	push	%r12
1470	push	%r13
1471	push	%r14
1472	push	%r15
1473	mov	%rsp,%r11		# copy %rsp
1474	shl	\$4,%rdx		# num*16
1475	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1476	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1477	and	\$-64,%rsp		# align stack frame
1478	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1480	mov	%rdx,$_end		# save end pointer, "3rd" arg
1481	mov	%r11,$_rsp		# save copy of %rsp
1482___
1483$code.=<<___ if ($win64);
1484	movaps	%xmm6,16*$SZ+32(%rsp)
1485	movaps	%xmm7,16*$SZ+48(%rsp)
1486	movaps	%xmm8,16*$SZ+64(%rsp)
1487	movaps	%xmm9,16*$SZ+80(%rsp)
1488___
1489$code.=<<___ if ($win64 && $SZ>4);
1490	movaps	%xmm10,16*$SZ+96(%rsp)
1491	movaps	%xmm11,16*$SZ+112(%rsp)
1492___
1493$code.=<<___;
1494.Lprologue_avx:
1495
1496	vzeroupper
1497	mov	$SZ*0($ctx),$A
1498	mov	$SZ*1($ctx),$B
1499	mov	$SZ*2($ctx),$C
1500	mov	$SZ*3($ctx),$D
1501	mov	$SZ*4($ctx),$E
1502	mov	$SZ*5($ctx),$F
1503	mov	$SZ*6($ctx),$G
1504	mov	$SZ*7($ctx),$H
1505___
1506					if ($SZ==4) {	# SHA256
1507    my @X = map("%xmm$_",(0..3));
1508    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1509
1510$code.=<<___;
1511	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1512	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1513	jmp	.Lloop_avx
1514.align	16
1515.Lloop_avx:
1516	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1517	vmovdqu	0x00($inp),@X[0]
1518	vmovdqu	0x10($inp),@X[1]
1519	vmovdqu	0x20($inp),@X[2]
1520	vmovdqu	0x30($inp),@X[3]
1521	vpshufb	$t3,@X[0],@X[0]
1522	lea	$TABLE(%rip),$Tbl
1523	vpshufb	$t3,@X[1],@X[1]
1524	vpshufb	$t3,@X[2],@X[2]
1525	vpaddd	0x00($Tbl),@X[0],$t0
1526	vpshufb	$t3,@X[3],@X[3]
1527	vpaddd	0x20($Tbl),@X[1],$t1
1528	vpaddd	0x40($Tbl),@X[2],$t2
1529	vpaddd	0x60($Tbl),@X[3],$t3
1530	vmovdqa	$t0,0x00(%rsp)
1531	mov	$A,$a1
1532	vmovdqa	$t1,0x10(%rsp)
1533	mov	$B,$a3
1534	vmovdqa	$t2,0x20(%rsp)
1535	xor	$C,$a3			# magic
1536	vmovdqa	$t3,0x30(%rsp)
1537	mov	$E,$a0
1538	jmp	.Lavx_00_47
1539
1540.align	16
1541.Lavx_00_47:
1542	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1543___
1544sub Xupdate_256_AVX () {
1545	(
1546	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1547	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1548	'&vpsrld	($t2,$t0,$sigma0[0]);',
1549	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1550	'&vpsrld	($t3,$t0,$sigma0[2])',
1551	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1552	'&vpxor		($t0,$t3,$t2)',
1553	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1554	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1555	'&vpxor		($t0,$t0,$t1)',
1556	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1557	'&vpxor		($t0,$t0,$t2)',
1558	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1559	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1560	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1561	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1562	 '&vpxor	($t2,$t2,$t3);',
1563	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1564	 '&vpxor	($t2,$t2,$t3)',
1565	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1566	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1567	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1568	 '&vpsrld	($t2,$t3,$sigma1[2])',
1569	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1570	 '&vpxor	($t2,$t2,$t3);',
1571	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1572	 '&vpxor	($t2,$t2,$t3)',
1573	 '&vpshufb	($t2,$t2,$t5)',
1574	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1575	);
1576}
1577
1578sub AVX_256_00_47 () {
1579my $j = shift;
1580my $body = shift;
1581my @X = @_;
1582my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1583
1584	foreach (Xupdate_256_AVX()) {		# 29 instructions
1585	    eval;
1586	    eval(shift(@insns));
1587	    eval(shift(@insns));
1588	    eval(shift(@insns));
1589	}
1590	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1591	  foreach (@insns) { eval; }		# remaining instructions
1592	&vmovdqa	(16*$j."(%rsp)",$t2);
1593}
1594
1595    for ($i=0,$j=0; $j<4; $j++) {
1596	&AVX_256_00_47($j,\&body_00_15,@X);
1597	push(@X,shift(@X));			# rotate(@X)
1598    }
1599	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1600	&jne	(".Lavx_00_47");
1601
1602    for ($i=0; $i<16; ) {
1603	foreach(body_00_15()) { eval; }
1604    }
1605
1606					} else {	# SHA512
1607    my @X = map("%xmm$_",(0..7));
1608    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1609
1610$code.=<<___;
1611	jmp	.Lloop_avx
1612.align	16
1613.Lloop_avx:
1614	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1615	vmovdqu	0x00($inp),@X[0]
1616	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1617	vmovdqu	0x10($inp),@X[1]
1618	vmovdqu	0x20($inp),@X[2]
1619	vpshufb	$t3,@X[0],@X[0]
1620	vmovdqu	0x30($inp),@X[3]
1621	vpshufb	$t3,@X[1],@X[1]
1622	vmovdqu	0x40($inp),@X[4]
1623	vpshufb	$t3,@X[2],@X[2]
1624	vmovdqu	0x50($inp),@X[5]
1625	vpshufb	$t3,@X[3],@X[3]
1626	vmovdqu	0x60($inp),@X[6]
1627	vpshufb	$t3,@X[4],@X[4]
1628	vmovdqu	0x70($inp),@X[7]
1629	vpshufb	$t3,@X[5],@X[5]
1630	vpaddq	-0x80($Tbl),@X[0],$t0
1631	vpshufb	$t3,@X[6],@X[6]
1632	vpaddq	-0x60($Tbl),@X[1],$t1
1633	vpshufb	$t3,@X[7],@X[7]
1634	vpaddq	-0x40($Tbl),@X[2],$t2
1635	vpaddq	-0x20($Tbl),@X[3],$t3
1636	vmovdqa	$t0,0x00(%rsp)
1637	vpaddq	0x00($Tbl),@X[4],$t0
1638	vmovdqa	$t1,0x10(%rsp)
1639	vpaddq	0x20($Tbl),@X[5],$t1
1640	vmovdqa	$t2,0x20(%rsp)
1641	vpaddq	0x40($Tbl),@X[6],$t2
1642	vmovdqa	$t3,0x30(%rsp)
1643	vpaddq	0x60($Tbl),@X[7],$t3
1644	vmovdqa	$t0,0x40(%rsp)
1645	mov	$A,$a1
1646	vmovdqa	$t1,0x50(%rsp)
1647	mov	$B,$a3
1648	vmovdqa	$t2,0x60(%rsp)
1649	xor	$C,$a3			# magic
1650	vmovdqa	$t3,0x70(%rsp)
1651	mov	$E,$a0
1652	jmp	.Lavx_00_47
1653
1654.align	16
1655.Lavx_00_47:
1656	add	\$`16*2*$SZ`,$Tbl
1657___
1658sub Xupdate_512_AVX () {
1659	(
1660	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1661	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1662	'&vpsrlq	($t2,$t0,$sigma0[0])',
1663	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1664	'&vpsrlq	($t3,$t0,$sigma0[2])',
1665	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1666	 '&vpxor	($t0,$t3,$t2)',
1667	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1668	 '&vpxor	($t0,$t0,$t1)',
1669	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1670	 '&vpxor	($t0,$t0,$t2)',
1671	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1672	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1673	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1674	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1675	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1676	 '&vpxor	($t3,$t3,$t2)',
1677	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1678	 '&vpxor	($t3,$t3,$t1)',
1679	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1680	 '&vpxor	($t3,$t3,$t2)',
1681	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1682	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1683	);
1684}
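
# For SHA-512 every shift in sigma0/sigma1 is a full 64-bit vpsrlq/vpsllq, so
# unlike the 32-bit schedule above no lane trickery is needed: both qword
# lanes stay valid throughout and the update is applied to two message words
# at a time.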
1685
1686sub AVX_512_00_47 () {
1687my $j = shift;
1688my $body = shift;
1689my @X = @_;
1690my @insns = (&$body,&$body);			# 52 instructions
1691
1692	foreach (Xupdate_512_AVX()) {		# 23 instructions
1693	    eval;
1694	    eval(shift(@insns));
1695	    eval(shift(@insns));
1696	}
1697	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1698	  foreach (@insns) { eval; }		# remaining instructions
1699	&vmovdqa	(16*$j."(%rsp)",$t2);
1700}
1701
1702    for ($i=0,$j=0; $j<8; $j++) {
1703	&AVX_512_00_47($j,\&body_00_15,@X);
1704	push(@X,shift(@X));			# rotate(@X)
1705    }
1706	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1707	&jne	(".Lavx_00_47");
1708
1709    for ($i=0; $i<16; ) {
1710	foreach(body_00_15()) { eval; }
1711    }
1712}
1713$code.=<<___;
1714	mov	$_ctx,$ctx
1715	mov	$a1,$A
1716
1717	add	$SZ*0($ctx),$A
1718	lea	16*$SZ($inp),$inp
1719	add	$SZ*1($ctx),$B
1720	add	$SZ*2($ctx),$C
1721	add	$SZ*3($ctx),$D
1722	add	$SZ*4($ctx),$E
1723	add	$SZ*5($ctx),$F
1724	add	$SZ*6($ctx),$G
1725	add	$SZ*7($ctx),$H
1726
1727	cmp	$_end,$inp
1728
1729	mov	$A,$SZ*0($ctx)
1730	mov	$B,$SZ*1($ctx)
1731	mov	$C,$SZ*2($ctx)
1732	mov	$D,$SZ*3($ctx)
1733	mov	$E,$SZ*4($ctx)
1734	mov	$F,$SZ*5($ctx)
1735	mov	$G,$SZ*6($ctx)
1736	mov	$H,$SZ*7($ctx)
1737	jb	.Lloop_avx
1738
1739	mov	$_rsp,%rsi
1740	vzeroupper
1741___
1742$code.=<<___ if ($win64);
1743	movaps	16*$SZ+32(%rsp),%xmm6
1744	movaps	16*$SZ+48(%rsp),%xmm7
1745	movaps	16*$SZ+64(%rsp),%xmm8
1746	movaps	16*$SZ+80(%rsp),%xmm9
1747___
1748$code.=<<___ if ($win64 && $SZ>4);
1749	movaps	16*$SZ+96(%rsp),%xmm10
1750	movaps	16*$SZ+112(%rsp),%xmm11
1751___
1752$code.=<<___;
1753	mov	(%rsi),%r15
1754	mov	8(%rsi),%r14
1755	mov	16(%rsi),%r13
1756	mov	24(%rsi),%r12
1757	mov	32(%rsi),%rbp
1758	mov	40(%rsi),%rbx
1759	lea	48(%rsi),%rsp
1760.Lepilogue_avx:
1761	ret
1762.size	${func}_avx,.-${func}_avx
1763___
1764
1765if ($avx>1) {{
1766######################################################################
1767# AVX2+BMI code path
1768#
1769my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1770my $PUSH8=8*2*$SZ;
1771use integer;
1772
1773sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
1775	(
1776	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1777
1778	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1779	'&and	($a4,$e)',		# f&e
1780	'&rorx	($a0,$e,$Sigma1[2])',
1781	'&rorx	($a2,$e,$Sigma1[1])',
1782
1783	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1784	'&lea	($h,"($h,$a4)")',
1785	'&andn	($a4,$e,$g)',		# ~e&g
1786	'&xor	($a0,$a2)',
1787
1788	'&rorx	($a1,$e,$Sigma1[0])',
1789	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1790	'&xor	($a0,$a1)',		# Sigma1(e)
1791	'&mov	($a2,$a)',
1792
1793	'&rorx	($a4,$a,$Sigma0[2])',
1794	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1795	'&xor	($a2,$b)',		# a^b, b^c in next round
1796	'&rorx	($a1,$a,$Sigma0[1])',
1797
1798	'&rorx	($a0,$a,$Sigma0[0])',
1799	'&lea	($d,"($d,$h)")',	# d+=h
1800	'&and	($a3,$a2)',		# (b^c)&(a^b)
1801	'&xor	($a1,$a4)',
1802
1803	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1804	'&xor	($a1,$a0)',		# Sigma0(a)
1805	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1806	'&mov	($a4,$e)',		# copy of f in future
1807
1808	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1809	);
	# and at the finish one still has to do $a+=$a1
1811}
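
# Notes on the BMI2/AVX2 round above: rorx computes the rotations without
# touching the flags, and andn yields ~e&g directly, so
# Ch(e,f,g)=(e&f)^(~e&g) can be accumulated with lea/add because the two
# terms are bit-disjoint and addition cannot carry between them.  The
# h+=Sigma0(a) term is deferred to the next round's "from the past" lea,
# which is why $a1 must start out as zero and why the final round's Sigma0
# still has to be folded in after the loop ($a+=$a1, see above).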
1812
1813$code.=<<___;
1814.type	${func}_avx2,\@function,3
1815.align	64
1816${func}_avx2:
1817.Lavx2_shortcut:
1818	push	%rbx
1819	push	%rbp
1820	push	%r12
1821	push	%r13
1822	push	%r14
1823	push	%r15
1824	mov	%rsp,%r11		# copy %rsp
1825	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1826	shl	\$4,%rdx		# num*16
1827	and	\$-256*$SZ,%rsp		# align stack frame
1828	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1829	add	\$`2*$SZ*($rounds-8)`,%rsp
1830	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1832	mov	%rdx,$_end		# save end pointer, "3rd" arg
1833	mov	%r11,$_rsp		# save copy of %rsp
1834___
1835$code.=<<___ if ($win64);
1836	movaps	%xmm6,16*$SZ+32(%rsp)
1837	movaps	%xmm7,16*$SZ+48(%rsp)
1838	movaps	%xmm8,16*$SZ+64(%rsp)
1839	movaps	%xmm9,16*$SZ+80(%rsp)
1840___
1841$code.=<<___ if ($win64 && $SZ>4);
1842	movaps	%xmm10,16*$SZ+96(%rsp)
1843	movaps	%xmm11,16*$SZ+112(%rsp)
1844___
1845$code.=<<___;
1846.Lprologue_avx2:
1847
1848	vzeroupper
1849	sub	\$-16*$SZ,$inp		# inp++, size optimization
1850	mov	$SZ*0($ctx),$A
1851	mov	$inp,%r12		# borrow $T1
1852	mov	$SZ*1($ctx),$B
1853	cmp	%rdx,$inp		# $_end
1854	mov	$SZ*2($ctx),$C
1855	cmove	%rsp,%r12		# next block or random data
1856	mov	$SZ*3($ctx),$D
1857	mov	$SZ*4($ctx),$E
1858	mov	$SZ*5($ctx),$F
1859	mov	$SZ*6($ctx),$G
1860	mov	$SZ*7($ctx),$H
1861___
1862					if ($SZ==4) {	# SHA256
1863    my @X = map("%ymm$_",(0..3));
1864    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1865
1866$code.=<<___;
1867	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1868	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1869	jmp	.Loop_avx2
1870.align	16
1871.Loop_avx2:
1872	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1873	vmovdqu	-16*$SZ+0($inp),%xmm0
1874	vmovdqu	-16*$SZ+16($inp),%xmm1
1875	vmovdqu	-16*$SZ+32($inp),%xmm2
1876	vmovdqu	-16*$SZ+48($inp),%xmm3
1877	#mov		$inp,$_inp	# offload $inp
1878	vinserti128	\$1,(%r12),@X[0],@X[0]
1879	vinserti128	\$1,16(%r12),@X[1],@X[1]
1880	vpshufb		$t3,@X[0],@X[0]
1881	vinserti128	\$1,32(%r12),@X[2],@X[2]
1882	vpshufb		$t3,@X[1],@X[1]
1883	vinserti128	\$1,48(%r12),@X[3],@X[3]
1884
1885	lea	$TABLE(%rip),$Tbl
1886	vpshufb	$t3,@X[2],@X[2]
1887	vpaddd	0x00($Tbl),@X[0],$t0
1888	vpshufb	$t3,@X[3],@X[3]
1889	vpaddd	0x20($Tbl),@X[1],$t1
1890	vpaddd	0x40($Tbl),@X[2],$t2
1891	vpaddd	0x60($Tbl),@X[3],$t3
1892	vmovdqa	$t0,0x00(%rsp)
1893	xor	$a1,$a1
1894	vmovdqa	$t1,0x20(%rsp)
1895	lea	-$PUSH8(%rsp),%rsp
1896	mov	$B,$a3
1897	vmovdqa	$t2,0x00(%rsp)
1898	xor	$C,$a3			# magic
1899	vmovdqa	$t3,0x20(%rsp)
1900	mov	$F,$a4
1901	sub	\$-16*2*$SZ,$Tbl	# size optimization
1902	jmp	.Lavx2_00_47
1903
1904.align	16
1905.Lavx2_00_47:
1906___
1907
1908sub AVX2_256_00_47 () {
1909my $j = shift;
1910my $body = shift;
1911my @X = @_;
1912my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1913my $base = "+2*$PUSH8(%rsp)";
1914
1915	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1916	foreach (Xupdate_256_AVX()) {		# 29 instructions
1917	    eval;
1918	    eval(shift(@insns));
1919	    eval(shift(@insns));
1920	    eval(shift(@insns));
1921	}
1922	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1923	  foreach (@insns) { eval; }		# remaining instructions
1924	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1925}
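# Each AVX2_256_00_47 call advances the message schedule by one ymm register
# (4 words per 128-bit lane), interleaving the 29 Xupdate_256_AVX
# instructions with four scalar round bodies, i.e. three scalar ops per
# vector op; the scheduled words plus round constants are then stored back
# onto the stack for the following rounds, and every other call slides %rsp
# down by $PUSH8.  The otherwise-unused $base lexical is presumably picked
# up when the bodyx_00_15 strings are eval'ed.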
1926
1927    for ($i=0,$j=0; $j<4; $j++) {
1928	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1929	push(@X,shift(@X));			# rotate(@X)
1930    }
1931	&lea	($Tbl,16*2*$SZ."($Tbl)");
1932	&cmpb	(($SZ-1)."($Tbl)",0);
1933	&jne	(".Lavx2_00_47");
1934
1935    for ($i=0; $i<16; ) {
1936	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1937	foreach(bodyx_00_15()) { eval; }
1938    }
1939					} else {	# SHA512
1940    my @X = map("%ymm$_",(0..7));
1941    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1942
1943$code.=<<___;
1944	jmp	.Loop_avx2
1945.align	16
1946.Loop_avx2:
1947	vmovdqu	-16*$SZ($inp),%xmm0
1948	vmovdqu	-16*$SZ+16($inp),%xmm1
1949	vmovdqu	-16*$SZ+32($inp),%xmm2
1950	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1951	vmovdqu	-16*$SZ+48($inp),%xmm3
1952	vmovdqu	-16*$SZ+64($inp),%xmm4
1953	vmovdqu	-16*$SZ+80($inp),%xmm5
1954	vmovdqu	-16*$SZ+96($inp),%xmm6
1955	vmovdqu	-16*$SZ+112($inp),%xmm7
1956	#mov	$inp,$_inp	# offload $inp
1957	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1958	vinserti128	\$1,(%r12),@X[0],@X[0]
1959	vinserti128	\$1,16(%r12),@X[1],@X[1]
1960	 vpshufb	$t2,@X[0],@X[0]
1961	vinserti128	\$1,32(%r12),@X[2],@X[2]
1962	 vpshufb	$t2,@X[1],@X[1]
1963	vinserti128	\$1,48(%r12),@X[3],@X[3]
1964	 vpshufb	$t2,@X[2],@X[2]
1965	vinserti128	\$1,64(%r12),@X[4],@X[4]
1966	 vpshufb	$t2,@X[3],@X[3]
1967	vinserti128	\$1,80(%r12),@X[5],@X[5]
1968	 vpshufb	$t2,@X[4],@X[4]
1969	vinserti128	\$1,96(%r12),@X[6],@X[6]
1970	 vpshufb	$t2,@X[5],@X[5]
1971	vinserti128	\$1,112(%r12),@X[7],@X[7]
1972
1973	vpaddq	-0x80($Tbl),@X[0],$t0
1974	vpshufb	$t2,@X[6],@X[6]
1975	vpaddq	-0x60($Tbl),@X[1],$t1
1976	vpshufb	$t2,@X[7],@X[7]
1977	vpaddq	-0x40($Tbl),@X[2],$t2
1978	vpaddq	-0x20($Tbl),@X[3],$t3
1979	vmovdqa	$t0,0x00(%rsp)
1980	vpaddq	0x00($Tbl),@X[4],$t0
1981	vmovdqa	$t1,0x20(%rsp)
1982	vpaddq	0x20($Tbl),@X[5],$t1
1983	vmovdqa	$t2,0x40(%rsp)
1984	vpaddq	0x40($Tbl),@X[6],$t2
1985	vmovdqa	$t3,0x60(%rsp)
1986	lea	-$PUSH8(%rsp),%rsp
1987	vpaddq	0x60($Tbl),@X[7],$t3
1988	vmovdqa	$t0,0x00(%rsp)
1989	xor	$a1,$a1
1990	vmovdqa	$t1,0x20(%rsp)
1991	mov	$B,$a3
1992	vmovdqa	$t2,0x40(%rsp)
1993	xor	$C,$a3			# magic
1994	vmovdqa	$t3,0x60(%rsp)
1995	mov	$F,$a4
1996	add	\$16*2*$SZ,$Tbl
1997	jmp	.Lavx2_00_47
1998
1999.align	16
2000.Lavx2_00_47:
2001___
2002
2003sub AVX2_512_00_47 () {
2004my $j = shift;
2005my $body = shift;
2006my @X = @_;
2007my @insns = (&$body,&$body);			# 48 instructions
2008my $base = "+2*$PUSH8(%rsp)";
2009
2010	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
2011	foreach (Xupdate_512_AVX()) {		# 23 instructions
2012	    eval;
2013	    if ($_ !~ /\;$/) {
2014		eval(shift(@insns));
2015		eval(shift(@insns));
2016		eval(shift(@insns));
2017	    }
2018	}
2019	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2020	  foreach (@insns) { eval; }		# remaining instructions
2021	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2022}
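# Same idea for SHA512, except that only two round bodies are interleaved
# per 23-instruction Xupdate_512_AVX step, no scalar ops are slotted in
# after Xupdate strings that end in ';', and %rsp slides down only every
# fourth call.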
2023
2024    for ($i=0,$j=0; $j<8; $j++) {
2025	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2026	push(@X,shift(@X));			# rotate(@X)
2027    }
2028	&lea	($Tbl,16*2*$SZ."($Tbl)");
2029	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2030	&jne	(".Lavx2_00_47");
2031
2032    for ($i=0; $i<16; ) {
2033	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2034	foreach(bodyx_00_15()) { eval; }
2035    }
2036}
2037$code.=<<___;
2038	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2039	add	$a1,$A
2040	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2041	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2042
2043	add	$SZ*0($ctx),$A
2044	add	$SZ*1($ctx),$B
2045	add	$SZ*2($ctx),$C
2046	add	$SZ*3($ctx),$D
2047	add	$SZ*4($ctx),$E
2048	add	$SZ*5($ctx),$F
2049	add	$SZ*6($ctx),$G
2050	add	$SZ*7($ctx),$H
2051
2052	mov	$A,$SZ*0($ctx)
2053	mov	$B,$SZ*1($ctx)
2054	mov	$C,$SZ*2($ctx)
2055	mov	$D,$SZ*3($ctx)
2056	mov	$E,$SZ*4($ctx)
2057	mov	$F,$SZ*5($ctx)
2058	mov	$G,$SZ*6($ctx)
2059	mov	$H,$SZ*7($ctx)
2060
2061	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2062	je	.Ldone_avx2
2063
2064	xor	$a1,$a1
2065	mov	$B,$a3
2066	xor	$C,$a3			# magic
2067	mov	$F,$a4
2068	jmp	.Lower_avx2
2069.align	16
2070.Lower_avx2:
2071___
2072    for ($i=0; $i<8; ) {
2073	my $base="+16($Tbl)";
2074	foreach(bodyx_00_15()) { eval; }
2075    }
2076$code.=<<___;
2077	lea	-$PUSH8($Tbl),$Tbl
2078	cmp	%rsp,$Tbl
2079	jae	.Lower_avx2
2080
2081	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2082	add	$a1,$A
2083	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2084	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2085
2086	add	$SZ*0($ctx),$A
2087	add	$SZ*1($ctx),$B
2088	add	$SZ*2($ctx),$C
2089	add	$SZ*3($ctx),$D
2090	add	$SZ*4($ctx),$E
2091	add	$SZ*5($ctx),$F
2092	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2093	add	$SZ*6($ctx),$G
2094	mov	$inp,%r12
2095	add	$SZ*7($ctx),$H
2096	cmp	$_end,$inp
2097
2098	mov	$A,$SZ*0($ctx)
2099	cmove	%rsp,%r12		# next block or stale data
2100	mov	$B,$SZ*1($ctx)
2101	mov	$C,$SZ*2($ctx)
2102	mov	$D,$SZ*3($ctx)
2103	mov	$E,$SZ*4($ctx)
2104	mov	$F,$SZ*5($ctx)
2105	mov	$G,$SZ*6($ctx)
2106	mov	$H,$SZ*7($ctx)
2107
2108	jbe	.Loop_avx2
2109	lea	(%rsp),$Tbl
2110
2111.Ldone_avx2:
2112	lea	($Tbl),%rsp
2113	mov	$_rsp,%rsi
2114	vzeroupper
2115___
2116$code.=<<___ if ($win64);
2117	movaps	16*$SZ+32(%rsp),%xmm6
2118	movaps	16*$SZ+48(%rsp),%xmm7
2119	movaps	16*$SZ+64(%rsp),%xmm8
2120	movaps	16*$SZ+80(%rsp),%xmm9
2121___
2122$code.=<<___ if ($win64 && $SZ>4);
2123	movaps	16*$SZ+96(%rsp),%xmm10
2124	movaps	16*$SZ+112(%rsp),%xmm11
2125___
2126$code.=<<___;
2127	mov	(%rsi),%r15
2128	mov	8(%rsi),%r14
2129	mov	16(%rsi),%r13
2130	mov	24(%rsi),%r12
2131	mov	32(%rsi),%rbp
2132	mov	40(%rsi),%rbx
2133	lea	48(%rsi),%rsp
2134.Lepilogue_avx2:
2135	ret
2136.size	${func}_avx2,.-${func}_avx2
2137___
2138}}
2139}}}}}
2140
2141# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2142#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2143if ($win64) {
2144$rec="%rcx";
2145$frame="%rdx";
2146$context="%r8";
2147$disp="%r9";
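# se_handler checks whether context->Rip lies between the prologue and
# epilogue labels supplied in HandlerData[]; if so it recovers the original
# %rsp from the $_rsp slot (recomputing the frame location first for the
# re-aligned AVX2 frame), restores the callee-saved GPRs and, for the SIMD
# code paths, the xmm save area, then hands the rest of the unwind to
# RtlVirtualUnwind.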
2148
2149$code.=<<___;
2150.extern	__imp_RtlVirtualUnwind
2151.type	se_handler,\@abi-omnipotent
2152.align	16
2153se_handler:
2154	push	%rsi
2155	push	%rdi
2156	push	%rbx
2157	push	%rbp
2158	push	%r12
2159	push	%r13
2160	push	%r14
2161	push	%r15
2162	pushfq
2163	sub	\$64,%rsp
2164
2165	mov	120($context),%rax	# pull context->Rax
2166	mov	248($context),%rbx	# pull context->Rip
2167
2168	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
2170
2171	mov	0(%r11),%r10d		# HandlerData[0]
2172	lea	(%rsi,%r10),%r10	# prologue label
2173	cmp	%r10,%rbx		# context->Rip<prologue label
2174	jb	.Lin_prologue
2175
2176	mov	152($context),%rax	# pull context->Rsp
2177
2178	mov	4(%r11),%r10d		# HandlerData[1]
2179	lea	(%rsi,%r10),%r10	# epilogue label
2180	cmp	%r10,%rbx		# context->Rip>=epilogue label
2181	jae	.Lin_prologue
2182___
2183$code.=<<___ if ($avx>1);
2184	lea	.Lavx2_shortcut(%rip),%r10
2185	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2186	jb	.Lnot_in_avx2
2187
2188	and	\$-256*$SZ,%rax
2189	add	\$`2*$SZ*($rounds-8)`,%rax
2190.Lnot_in_avx2:
2191___
2192$code.=<<___;
2193	mov	%rax,%rsi		# put aside Rsp
2194	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2195	lea	48(%rax),%rax
2196
2197	mov	-8(%rax),%rbx
2198	mov	-16(%rax),%rbp
2199	mov	-24(%rax),%r12
2200	mov	-32(%rax),%r13
2201	mov	-40(%rax),%r14
2202	mov	-48(%rax),%r15
2203	mov	%rbx,144($context)	# restore context->Rbx
2204	mov	%rbp,160($context)	# restore context->Rbp
2205	mov	%r12,216($context)	# restore context->R12
2206	mov	%r13,224($context)	# restore context->R13
2207	mov	%r14,232($context)	# restore context->R14
2208	mov	%r15,240($context)	# restore context->R15
2209
2210	lea	.Lepilogue(%rip),%r10
2211	cmp	%r10,%rbx
2212	jb	.Lin_prologue		# non-AVX code
2213
	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6 save area
2215	lea	512($context),%rdi	# &context.Xmm6
2216	mov	\$`$SZ==4?8:12`,%ecx
2217	.long	0xa548f3fc		# cld; rep movsq
2218
2219.Lin_prologue:
2220	mov	8(%rax),%rdi
2221	mov	16(%rax),%rsi
2222	mov	%rax,152($context)	# restore context->Rsp
2223	mov	%rsi,168($context)	# restore context->Rsi
2224	mov	%rdi,176($context)	# restore context->Rdi
2225
2226	mov	40($disp),%rdi		# disp->ContextRecord
2227	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quad-words
2229	.long	0xa548f3fc		# cld; rep movsq
2230
2231	mov	$disp,%rsi
2232	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2233	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2234	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2235	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2236	mov	40(%rsi),%r10		# disp->ContextRecord
2237	lea	56(%rsi),%r11		# &disp->HandlerData
2238	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2239	mov	%r10,32(%rsp)		# arg5
2240	mov	%r11,40(%rsp)		# arg6
2241	mov	%r12,48(%rsp)		# arg7
2242	mov	%rcx,56(%rsp)		# arg8, (NULL)
2243	call	*__imp_RtlVirtualUnwind(%rip)
2244
2245	mov	\$1,%eax		# ExceptionContinueSearch
2246	add	\$64,%rsp
2247	popfq
2248	pop	%r15
2249	pop	%r14
2250	pop	%r13
2251	pop	%r12
2252	pop	%rbp
2253	pop	%rbx
2254	pop	%rdi
2255	pop	%rsi
2256	ret
2257.size	se_handler,.-se_handler
2258___
2259
2260$code.=<<___ if ($SZ==4 && $shaext);
2261.type	shaext_handler,\@abi-omnipotent
2262.align	16
2263shaext_handler:
2264	push	%rsi
2265	push	%rdi
2266	push	%rbx
2267	push	%rbp
2268	push	%r12
2269	push	%r13
2270	push	%r14
2271	push	%r15
2272	pushfq
2273	sub	\$64,%rsp
2274
2275	mov	120($context),%rax	# pull context->Rax
2276	mov	248($context),%rbx	# pull context->Rip
2277
2278	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2280	jb	.Lin_prologue
2281
2282	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2284	jae	.Lin_prologue
2285
2286	lea	-8-5*16(%rax),%rsi
2287	lea	512($context),%rdi	# &context.Xmm6
2288	mov	\$10,%ecx
2289	.long	0xa548f3fc		# cld; rep movsq
2290
2291	jmp	.Lin_prologue
2292.size	shaext_handler,.-shaext_handler
2293___
2294
2295$code.=<<___;
2296.section	.pdata
2297.align	4
2298	.rva	.LSEH_begin_$func
2299	.rva	.LSEH_end_$func
2300	.rva	.LSEH_info_$func
2301___
2302$code.=<<___ if ($SZ==4 && $shaext);
2303	.rva	.LSEH_begin_${func}_shaext
2304	.rva	.LSEH_end_${func}_shaext
2305	.rva	.LSEH_info_${func}_shaext
2306___
2307$code.=<<___ if ($SZ==4);
2308	.rva	.LSEH_begin_${func}_ssse3
2309	.rva	.LSEH_end_${func}_ssse3
2310	.rva	.LSEH_info_${func}_ssse3
2311___
2312$code.=<<___ if ($avx && $SZ==8);
2313	.rva	.LSEH_begin_${func}_xop
2314	.rva	.LSEH_end_${func}_xop
2315	.rva	.LSEH_info_${func}_xop
2316___
2317$code.=<<___ if ($avx);
2318	.rva	.LSEH_begin_${func}_avx
2319	.rva	.LSEH_end_${func}_avx
2320	.rva	.LSEH_info_${func}_avx
2321___
2322$code.=<<___ if ($avx>1);
2323	.rva	.LSEH_begin_${func}_avx2
2324	.rva	.LSEH_end_${func}_avx2
2325	.rva	.LSEH_info_${func}_avx2
2326___
2327$code.=<<___;
2328.section	.xdata
2329.align	8
2330.LSEH_info_$func:
2331	.byte	9,0,0,0
2332	.rva	se_handler
2333	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2334___
2335$code.=<<___ if ($SZ==4 && $shaext);
2336.LSEH_info_${func}_shaext:
2337	.byte	9,0,0,0
2338	.rva	shaext_handler
2339___
2340$code.=<<___ if ($SZ==4);
2341.LSEH_info_${func}_ssse3:
2342	.byte	9,0,0,0
2343	.rva	se_handler
2344	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2345___
2346$code.=<<___ if ($avx && $SZ==8);
2347.LSEH_info_${func}_xop:
2348	.byte	9,0,0,0
2349	.rva	se_handler
2350	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2351___
2352$code.=<<___ if ($avx);
2353.LSEH_info_${func}_avx:
2354	.byte	9,0,0,0
2355	.rva	se_handler
2356	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2357___
2358$code.=<<___ if ($avx>1);
2359.LSEH_info_${func}_avx2:
2360	.byte	9,0,0,0
2361	.rva	se_handler
2362	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2363___
2364}
2365
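# sha256op38 hand-assembles the SHA extension instructions (0f 38 xx opcode
# space) into .byte sequences so the module still builds with assemblers
# that do not know SHA-NI; the regex only accepts %xmm0-%xmm7, so no REX
# prefix is needed.  For example, "sha256rnds2 %xmm0,%xmm1" comes out as
# ".byte 15,56,203,200", i.e. 0f,38,cb,c8.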
sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
2382
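# Final pass over the generated code: backtick-quoted expressions are
# evaluated as Perl to fold constants, and sha256* mnemonics are routed
# through sha256op38 above before the result is printed.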
2383foreach (split("\n",$code)) {
2384	s/\`([^\`]*)\`/eval $1/geo;
2385
2386	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2387
2388	print $_,"\n";
2389}
close STDOUT or die "error closing STDOUT: $!";
2391