1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that it's very
23# same instruction sequence used for both SHA-256 and SHA-512. In
24# former case the instructions operate on 32-bit operands, while in
25# latter - on 64-bit ones. All I had to do is to get one flavor right,
26# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to IA-64 implementation, which maintains
33# X[16] in register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35# issue Opteron pipeline and X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54# unfortunately -2% SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, not on pre-AVX processors. [Obviously with exclusion
63# for VIA Nano, but it has SHA512 instruction that is faster and
64# should be used instead.] For reference, corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from first block to least
74# significant 128-bit halves and data from second to most significant.
75# The data is then processed with same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
88#
89# AMD K8	14.9	-	    -		    9.57    -
90# P4		17.3	-	    -		    30.8    -
91# Core 2	15.6	13.8(+13%)  -		    9.97    -
92# Westmere	14.8	12.3(+19%)  -		    9.58    -
93# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
95# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
96# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
100# Atom		23.0	18.9(+22%)  -		    14.7    -
101# Silvermont	27.4	20.6(+33%)  -               17.5    -
102# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
103# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
104#
105# (*)	whichever best applicable, including SHAEXT;
106# (**)	switch from ror to shrd stands for fair share of improvement;
107# (***)	execution time is fully determined by remaining integer-only
108#	part, body_00_15; reducing the amount of SIMD instructions
109#	below certain limit makes no difference/sense; to conserve
110#	space SHA256 XOP code path is therefore omitted;
111#
112# Modified from upstream OpenSSL to remove the XOP code.
113
114$flavour = shift;
115$output  = shift;
116if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
117
118$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
119
120$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
121( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
122( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
123die "can't locate x86_64-xlate.pl";
124
125# In upstream, this is controlled by shelling out to the compiler to check
126# versions, but BoringSSL is intended to be used with pre-generated perlasm
127# output, so this isn't useful anyway.
128#
129# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
130# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
131# did not tie them together until after $shaext was added.
132$avx = 1;
133
134# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
135# been tested.
136$shaext=0;	### set to zero if compiling for 1.0.1
137$avx=1		if (!$shaext && $avx);
138
139open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
140*STDOUT=*OUT;
141
142if ($output =~ /512/) {
143	$func="sha512_block_data_order";
144	$TABLE="K512";
145	$SZ=8;
146	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
147					"%r8", "%r9", "%r10","%r11");
148	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
149	@Sigma0=(28,34,39);
150	@Sigma1=(14,18,41);
151	@sigma0=(1,  8, 7);
152	@sigma1=(19,61, 6);
153	$rounds=80;
154} else {
155	$func="sha256_block_data_order";
156	$TABLE="K256";
157	$SZ=4;
158	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
159					"%r8d","%r9d","%r10d","%r11d");
160	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
161	@Sigma0=( 2,13,22);
162	@Sigma1=( 6,11,25);
163	@sigma0=( 7,18, 3);
164	@sigma1=(17,19,10);
165	$rounds=64;
166}
167
168$ctx="%rdi";	# 1st arg, zapped by $a3
169$inp="%rsi";	# 2nd arg
170$Tbl="%rbp";
171
172$_ctx="16*$SZ+0*8(%rsp)";
173$_inp="16*$SZ+1*8(%rsp)";
174$_end="16*$SZ+2*8(%rsp)";
175$_rsp="`16*$SZ+3*8`(%rsp)";
176$framesz="16*$SZ+4*8";
177
178
179sub ROUND_00_15()
180{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
181  my $STRIDE=$SZ;
182     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
183
184$code.=<<___;
185	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
186	mov	$f,$a2
187
188	xor	$e,$a0
189	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
190	xor	$g,$a2			# f^g
191
192	mov	$T1,`$SZ*($i&0xf)`(%rsp)
193	xor	$a,$a1
194	and	$e,$a2			# (f^g)&e
195
196	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
197	add	$h,$T1			# T1+=h
198	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
199
200	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
201	xor	$e,$a0
202	add	$a2,$T1			# T1+=Ch(e,f,g)
203
204	mov	$a,$a2
205	add	($Tbl),$T1		# T1+=K[round]
206	xor	$a,$a1
207
208	xor	$b,$a2			# a^b, b^c in next round
209	ror	\$$Sigma1[0],$a0	# Sigma1(e)
210	mov	$b,$h
211
212	and	$a2,$a3
213	ror	\$$Sigma0[0],$a1	# Sigma0(a)
214	add	$a0,$T1			# T1+=Sigma1(e)
215
216	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
217	add	$T1,$d			# d+=T1
218	add	$T1,$h			# h+=T1
219
220	lea	$STRIDE($Tbl),$Tbl	# round++
221___
222$code.=<<___ if ($i<15);
223	add	$a1,$h			# h+=Sigma0(a)
224___
225	($a2,$a3) = ($a3,$a2);
226}
227
228sub ROUND_16_XX()
229{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
230
231$code.=<<___;
232	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
233	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
234
235	mov	$a0,$T1
236	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
237	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
238	mov	$a2,$a1
239	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
240
241	xor	$T1,$a0
242	shr	\$$sigma0[2],$T1
243	ror	\$$sigma0[0],$a0
244	xor	$a1,$a2
245	shr	\$$sigma1[2],$a1
246
247	ror	\$$sigma1[0],$a2
248	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
249	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
250	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
251
252	add	`$SZ*($i&0xf)`(%rsp),$T1
253	mov	$e,$a0
254	add	$a2,$T1
255	mov	$a,$a1
256___
257	&ROUND_00_15(@_);
258}
259
260$code=<<___;
261.text
262
263.extern	OPENSSL_ia32cap_P
264.globl	$func
265.type	$func,\@function,3
266.align	16
267$func:
268.cfi_startproc
269___
270$code.=<<___ if ($SZ==4 || $avx);
271	leaq	OPENSSL_ia32cap_P(%rip),%r11
272	mov	0(%r11),%r9d
273	mov	4(%r11),%r10d
274	mov	8(%r11),%r11d
275___
276$code.=<<___ if ($SZ==4 && $shaext);
277	test	\$`1<<29`,%r11d		# check for SHA
278	jnz	_shaext_shortcut
279___
280    # XOP codepath removed.
281___
282$code.=<<___ if ($avx>1);
283	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
284	cmp	\$`1<<8|1<<5|1<<3`,%r11d
285	je	.Lavx2_shortcut
286___
287$code.=<<___ if ($avx);
288	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
289	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
290	or	%r9d,%r10d
291	cmp	\$`1<<28|1<<9|1<<30`,%r10d
292	je	.Lavx_shortcut
293___
294$code.=<<___ if ($SZ==4);
295	test	\$`1<<9`,%r10d
296	jnz	.Lssse3_shortcut
297___
298$code.=<<___;
299	mov	%rsp,%rax		# copy %rsp
300.cfi_def_cfa_register	%rax
301	push	%rbx
302.cfi_push	%rbx
303	push	%rbp
304.cfi_push	%rbp
305	push	%r12
306.cfi_push	%r12
307	push	%r13
308.cfi_push	%r13
309	push	%r14
310.cfi_push	%r14
311	push	%r15
312.cfi_push	%r15
313	shl	\$4,%rdx		# num*16
314	sub	\$$framesz,%rsp
315	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
316	and	\$-64,%rsp		# align stack frame
317	mov	$ctx,$_ctx		# save ctx, 1st arg
318	mov	$inp,$_inp		# save inp, 2nd arh
319	mov	%rdx,$_end		# save end pointer, "3rd" arg
320	mov	%rax,$_rsp		# save copy of %rsp
321.cfi_cfa_expression	$_rsp,deref,+8
322.Lprologue:
323
324	mov	$SZ*0($ctx),$A
325	mov	$SZ*1($ctx),$B
326	mov	$SZ*2($ctx),$C
327	mov	$SZ*3($ctx),$D
328	mov	$SZ*4($ctx),$E
329	mov	$SZ*5($ctx),$F
330	mov	$SZ*6($ctx),$G
331	mov	$SZ*7($ctx),$H
332	jmp	.Lloop
333
334.align	16
335.Lloop:
336	mov	$B,$a3
337	lea	$TABLE(%rip),$Tbl
338	xor	$C,$a3			# magic
339___
340	for($i=0;$i<16;$i++) {
341		$code.="	mov	$SZ*$i($inp),$T1\n";
342		$code.="	mov	@ROT[4],$a0\n";
343		$code.="	mov	@ROT[0],$a1\n";
344		$code.="	bswap	$T1\n";
345		&ROUND_00_15($i,@ROT);
346		unshift(@ROT,pop(@ROT));
347	}
348$code.=<<___;
349	jmp	.Lrounds_16_xx
350.align	16
351.Lrounds_16_xx:
352___
353	for(;$i<32;$i++) {
354		&ROUND_16_XX($i,@ROT);
355		unshift(@ROT,pop(@ROT));
356	}
357
358$code.=<<___;
359	cmpb	\$0,`$SZ-1`($Tbl)
360	jnz	.Lrounds_16_xx
361
362	mov	$_ctx,$ctx
363	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
364	lea	16*$SZ($inp),$inp
365
366	add	$SZ*0($ctx),$A
367	add	$SZ*1($ctx),$B
368	add	$SZ*2($ctx),$C
369	add	$SZ*3($ctx),$D
370	add	$SZ*4($ctx),$E
371	add	$SZ*5($ctx),$F
372	add	$SZ*6($ctx),$G
373	add	$SZ*7($ctx),$H
374
375	cmp	$_end,$inp
376
377	mov	$A,$SZ*0($ctx)
378	mov	$B,$SZ*1($ctx)
379	mov	$C,$SZ*2($ctx)
380	mov	$D,$SZ*3($ctx)
381	mov	$E,$SZ*4($ctx)
382	mov	$F,$SZ*5($ctx)
383	mov	$G,$SZ*6($ctx)
384	mov	$H,$SZ*7($ctx)
385	jb	.Lloop
386
387	mov	$_rsp,%rsi
388.cfi_def_cfa	%rsi,8
389	mov	-48(%rsi),%r15
390.cfi_restore	%r15
391	mov	-40(%rsi),%r14
392.cfi_restore	%r14
393	mov	-32(%rsi),%r13
394.cfi_restore	%r13
395	mov	-24(%rsi),%r12
396.cfi_restore	%r12
397	mov	-16(%rsi),%rbp
398.cfi_restore	%rbp
399	mov	-8(%rsi),%rbx
400.cfi_restore	%rbx
401	lea	(%rsi),%rsp
402.cfi_def_cfa_register	%rsp
403.Lepilogue:
404	ret
405.cfi_endproc
406.size	$func,.-$func
407___
408
409if ($SZ==4) {
410$code.=<<___;
411.align	64
412.type	$TABLE,\@object
413$TABLE:
414	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
415	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
416	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
417	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
418	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
419	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
420	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
421	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
422	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
423	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
424	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
425	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
426	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
427	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
428	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
429	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
430	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
431	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
432	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
433	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
434	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
435	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
436	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
437	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
438	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
439	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
440	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
441	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
442	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
443	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
444	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
445	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
446
447	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
448	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
449	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
450	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
451	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
452	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
453	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
454___
455} else {
456$code.=<<___;
457.align	64
458.type	$TABLE,\@object
459$TABLE:
460	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
461	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
462	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
463	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
464	.quad	0x3956c25bf348b538,0x59f111f1b605d019
465	.quad	0x3956c25bf348b538,0x59f111f1b605d019
466	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
467	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
468	.quad	0xd807aa98a3030242,0x12835b0145706fbe
469	.quad	0xd807aa98a3030242,0x12835b0145706fbe
470	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
471	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
472	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
473	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
474	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
475	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
476	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
477	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
478	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
479	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
480	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
481	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
482	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
483	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
484	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
485	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
486	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
487	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
488	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
489	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
490	.quad	0x06ca6351e003826f,0x142929670a0e6e70
491	.quad	0x06ca6351e003826f,0x142929670a0e6e70
492	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
493	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
494	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
495	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
496	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
497	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
498	.quad	0x81c2c92e47edaee6,0x92722c851482353b
499	.quad	0x81c2c92e47edaee6,0x92722c851482353b
500	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
501	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
502	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
503	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
504	.quad	0xd192e819d6ef5218,0xd69906245565a910
505	.quad	0xd192e819d6ef5218,0xd69906245565a910
506	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
507	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
508	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
509	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
510	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
511	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
512	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
513	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
514	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
515	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
516	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
517	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
518	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
519	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
520	.quad	0x90befffa23631e28,0xa4506cebde82bde9
521	.quad	0x90befffa23631e28,0xa4506cebde82bde9
522	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
523	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
524	.quad	0xca273eceea26619c,0xd186b8c721c0c207
525	.quad	0xca273eceea26619c,0xd186b8c721c0c207
526	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
527	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
528	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
529	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
530	.quad	0x113f9804bef90dae,0x1b710b35131c471b
531	.quad	0x113f9804bef90dae,0x1b710b35131c471b
532	.quad	0x28db77f523047d84,0x32caab7b40c72493
533	.quad	0x28db77f523047d84,0x32caab7b40c72493
534	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
535	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
536	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
537	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
538	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
539	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
540
541	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
542	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
543	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
544___
545}
546
547######################################################################
548# SIMD code paths
549#
550if ($SZ==4 && $shaext) {{{
551######################################################################
552# Intel SHA Extensions implementation of SHA256 update function.
553#
554my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
555
556my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
557my @MSG=map("%xmm$_",(3..6));
558
559$code.=<<___;
560.type	sha256_block_data_order_shaext,\@function,3
561.align	64
562sha256_block_data_order_shaext:
563_shaext_shortcut:
564___
565$code.=<<___ if ($win64);
566	lea	`-8-5*16`(%rsp),%rsp
567	movaps	%xmm6,-8-5*16(%rax)
568	movaps	%xmm7,-8-4*16(%rax)
569	movaps	%xmm8,-8-3*16(%rax)
570	movaps	%xmm9,-8-2*16(%rax)
571	movaps	%xmm10,-8-1*16(%rax)
572.Lprologue_shaext:
573___
574$code.=<<___;
575	lea		K256+0x80(%rip),$Tbl
576	movdqu		($ctx),$ABEF		# DCBA
577	movdqu		16($ctx),$CDGH		# HGFE
578	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
579
580	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
581	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
582	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
583	movdqa		$TMP,$BSWAP		# offload
584	palignr		\$8,$CDGH,$ABEF		# ABEF
585	punpcklqdq	$Wi,$CDGH		# CDGH
586	jmp		.Loop_shaext
587
588.align	16
589.Loop_shaext:
590	movdqu		($inp),@MSG[0]
591	movdqu		0x10($inp),@MSG[1]
592	movdqu		0x20($inp),@MSG[2]
593	pshufb		$TMP,@MSG[0]
594	movdqu		0x30($inp),@MSG[3]
595
596	movdqa		0*32-0x80($Tbl),$Wi
597	paddd		@MSG[0],$Wi
598	pshufb		$TMP,@MSG[1]
599	movdqa		$CDGH,$CDGH_SAVE	# offload
600	sha256rnds2	$ABEF,$CDGH		# 0-3
601	pshufd		\$0x0e,$Wi,$Wi
602	nop
603	movdqa		$ABEF,$ABEF_SAVE	# offload
604	sha256rnds2	$CDGH,$ABEF
605
606	movdqa		1*32-0x80($Tbl),$Wi
607	paddd		@MSG[1],$Wi
608	pshufb		$TMP,@MSG[2]
609	sha256rnds2	$ABEF,$CDGH		# 4-7
610	pshufd		\$0x0e,$Wi,$Wi
611	lea		0x40($inp),$inp
612	sha256msg1	@MSG[1],@MSG[0]
613	sha256rnds2	$CDGH,$ABEF
614
615	movdqa		2*32-0x80($Tbl),$Wi
616	paddd		@MSG[2],$Wi
617	pshufb		$TMP,@MSG[3]
618	sha256rnds2	$ABEF,$CDGH		# 8-11
619	pshufd		\$0x0e,$Wi,$Wi
620	movdqa		@MSG[3],$TMP
621	palignr		\$4,@MSG[2],$TMP
622	nop
623	paddd		$TMP,@MSG[0]
624	sha256msg1	@MSG[2],@MSG[1]
625	sha256rnds2	$CDGH,$ABEF
626
627	movdqa		3*32-0x80($Tbl),$Wi
628	paddd		@MSG[3],$Wi
629	sha256msg2	@MSG[3],@MSG[0]
630	sha256rnds2	$ABEF,$CDGH		# 12-15
631	pshufd		\$0x0e,$Wi,$Wi
632	movdqa		@MSG[0],$TMP
633	palignr		\$4,@MSG[3],$TMP
634	nop
635	paddd		$TMP,@MSG[1]
636	sha256msg1	@MSG[3],@MSG[2]
637	sha256rnds2	$CDGH,$ABEF
638___
639for($i=4;$i<16-3;$i++) {
640$code.=<<___;
641	movdqa		$i*32-0x80($Tbl),$Wi
642	paddd		@MSG[0],$Wi
643	sha256msg2	@MSG[0],@MSG[1]
644	sha256rnds2	$ABEF,$CDGH		# 16-19...
645	pshufd		\$0x0e,$Wi,$Wi
646	movdqa		@MSG[1],$TMP
647	palignr		\$4,@MSG[0],$TMP
648	nop
649	paddd		$TMP,@MSG[2]
650	sha256msg1	@MSG[0],@MSG[3]
651	sha256rnds2	$CDGH,$ABEF
652___
653	push(@MSG,shift(@MSG));
654}
655$code.=<<___;
656	movdqa		13*32-0x80($Tbl),$Wi
657	paddd		@MSG[0],$Wi
658	sha256msg2	@MSG[0],@MSG[1]
659	sha256rnds2	$ABEF,$CDGH		# 52-55
660	pshufd		\$0x0e,$Wi,$Wi
661	movdqa		@MSG[1],$TMP
662	palignr		\$4,@MSG[0],$TMP
663	sha256rnds2	$CDGH,$ABEF
664	paddd		$TMP,@MSG[2]
665
666	movdqa		14*32-0x80($Tbl),$Wi
667	paddd		@MSG[1],$Wi
668	sha256rnds2	$ABEF,$CDGH		# 56-59
669	pshufd		\$0x0e,$Wi,$Wi
670	sha256msg2	@MSG[1],@MSG[2]
671	movdqa		$BSWAP,$TMP
672	sha256rnds2	$CDGH,$ABEF
673
674	movdqa		15*32-0x80($Tbl),$Wi
675	paddd		@MSG[2],$Wi
676	nop
677	sha256rnds2	$ABEF,$CDGH		# 60-63
678	pshufd		\$0x0e,$Wi,$Wi
679	dec		$num
680	nop
681	sha256rnds2	$CDGH,$ABEF
682
683	paddd		$CDGH_SAVE,$CDGH
684	paddd		$ABEF_SAVE,$ABEF
685	jnz		.Loop_shaext
686
687	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
688	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
689	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
690	punpckhqdq	$CDGH,$ABEF		# DCBA
691	palignr		\$8,$TMP,$CDGH		# HGFE
692
693	movdqu	$ABEF,($ctx)
694	movdqu	$CDGH,16($ctx)
695___
696$code.=<<___ if ($win64);
697	movaps	-8-5*16(%rax),%xmm6
698	movaps	-8-4*16(%rax),%xmm7
699	movaps	-8-3*16(%rax),%xmm8
700	movaps	-8-2*16(%rax),%xmm9
701	movaps	-8-1*16(%rax),%xmm10
702	mov	%rax,%rsp
703.Lepilogue_shaext:
704___
705$code.=<<___;
706	ret
707.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
708___
709}}}
710{{{
711
712my $a4=$T1;
713my ($a,$b,$c,$d,$e,$f,$g,$h);
714
715sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
716{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
717  my $arg = pop;
718    $arg = "\$$arg" if ($arg*1 eq $arg);
719    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
720}
721
722sub body_00_15 () {
723	(
724	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
725
726	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
727	'&mov	($a,$a1)',
728	'&mov	($a4,$f)',
729
730	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
731	'&xor	($a0,$e)',
732	'&xor	($a4,$g)',			# f^g
733
734	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
735	'&xor	($a1,$a)',
736	'&and	($a4,$e)',			# (f^g)&e
737
738	'&xor	($a0,$e)',
739	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
740	'&mov	($a2,$a)',
741
742	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
743	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
744	'&xor	($a2,$b)',			# a^b, b^c in next round
745
746	'&add	($h,$a4)',			# h+=Ch(e,f,g)
747	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
748	'&and	($a3,$a2)',			# (b^c)&(a^b)
749
750	'&xor	($a1,$a)',
751	'&add	($h,$a0)',			# h+=Sigma1(e)
752	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
753
754	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
755	'&add	($d,$h)',			# d+=h
756	'&add	($h,$a3)',			# h+=Maj(a,b,c)
757
758	'&mov	($a0,$d)',
759	'&add	($a1,$h);'.			# h+=Sigma0(a)
760	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
761	);
762}
763
764######################################################################
765# SSSE3 code path
766#
767if ($SZ==4) {	# SHA256 only
768my @X = map("%xmm$_",(0..3));
769my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
770
771$code.=<<___;
772.type	${func}_ssse3,\@function,3
773.align	64
774${func}_ssse3:
775.cfi_startproc
776.Lssse3_shortcut:
777	mov	%rsp,%rax		# copy %rsp
778.cfi_def_cfa_register	%rax
779	push	%rbx
780.cfi_push	%rbx
781	push	%rbp
782.cfi_push	%rbp
783	push	%r12
784.cfi_push	%r12
785	push	%r13
786.cfi_push	%r13
787	push	%r14
788.cfi_push	%r14
789	push	%r15
790.cfi_push	%r15
791	shl	\$4,%rdx		# num*16
792	sub	\$`$framesz+$win64*16*4`,%rsp
793	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
794	and	\$-64,%rsp		# align stack frame
795	mov	$ctx,$_ctx		# save ctx, 1st arg
796	mov	$inp,$_inp		# save inp, 2nd arh
797	mov	%rdx,$_end		# save end pointer, "3rd" arg
798	mov	%rax,$_rsp		# save copy of %rsp
799.cfi_cfa_expression	$_rsp,deref,+8
800___
801$code.=<<___ if ($win64);
802	movaps	%xmm6,16*$SZ+32(%rsp)
803	movaps	%xmm7,16*$SZ+48(%rsp)
804	movaps	%xmm8,16*$SZ+64(%rsp)
805	movaps	%xmm9,16*$SZ+80(%rsp)
806___
807$code.=<<___;
808.Lprologue_ssse3:
809
810	mov	$SZ*0($ctx),$A
811	mov	$SZ*1($ctx),$B
812	mov	$SZ*2($ctx),$C
813	mov	$SZ*3($ctx),$D
814	mov	$SZ*4($ctx),$E
815	mov	$SZ*5($ctx),$F
816	mov	$SZ*6($ctx),$G
817	mov	$SZ*7($ctx),$H
818___
819
820$code.=<<___;
821	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
822	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
823	jmp	.Lloop_ssse3
824.align	16
825.Lloop_ssse3:
826	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
827	movdqu	0x00($inp),@X[0]
828	movdqu	0x10($inp),@X[1]
829	movdqu	0x20($inp),@X[2]
830	pshufb	$t3,@X[0]
831	movdqu	0x30($inp),@X[3]
832	lea	$TABLE(%rip),$Tbl
833	pshufb	$t3,@X[1]
834	movdqa	0x00($Tbl),$t0
835	movdqa	0x20($Tbl),$t1
836	pshufb	$t3,@X[2]
837	paddd	@X[0],$t0
838	movdqa	0x40($Tbl),$t2
839	pshufb	$t3,@X[3]
840	movdqa	0x60($Tbl),$t3
841	paddd	@X[1],$t1
842	paddd	@X[2],$t2
843	paddd	@X[3],$t3
844	movdqa	$t0,0x00(%rsp)
845	mov	$A,$a1
846	movdqa	$t1,0x10(%rsp)
847	mov	$B,$a3
848	movdqa	$t2,0x20(%rsp)
849	xor	$C,$a3			# magic
850	movdqa	$t3,0x30(%rsp)
851	mov	$E,$a0
852	jmp	.Lssse3_00_47
853
854.align	16
855.Lssse3_00_47:
856	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
857___
858sub Xupdate_256_SSSE3 () {
859	(
860	'&movdqa	($t0,@X[1]);',
861	'&movdqa	($t3,@X[3])',
862	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
863	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
864	'&movdqa	($t1,$t0)',
865	'&movdqa	($t2,$t0);',
866	'&psrld		($t0,$sigma0[2])',
867	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
868	'&psrld		($t2,$sigma0[0])',
869	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
870	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
871	'&pxor		($t0,$t2)',
872	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
873	'&pxor		($t0,$t1)',
874	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
875	'&pxor		($t0,$t2);',
876	 '&movdqa	($t2,$t3)',
877	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
878	 '&psrld	($t3,$sigma1[2])',
879	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
880	 '&psrlq	($t2,$sigma1[0])',
881	 '&pxor		($t3,$t2);',
882	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
883	 '&pxor		($t3,$t2)',
884	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
885	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
886	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
887	 '&movdqa	($t2,$t3);',
888	 '&psrld	($t3,$sigma1[2])',
889	 '&psrlq	($t2,$sigma1[0])',
890	 '&pxor		($t3,$t2);',
891	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
892	 '&pxor		($t3,$t2);',
893	'&movdqa	($t2,16*2*$j."($Tbl)")',
894	 '&pshufb	($t3,$t5)',
895	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
896	);
897}
898
899sub SSSE3_256_00_47 () {
900my $j = shift;
901my $body = shift;
902my @X = @_;
903my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
904
905    if (0) {
906	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
907	    eval;
908	    eval(shift(@insns));
909	    eval(shift(@insns));
910	    eval(shift(@insns));
911	}
912    } else {			# squeeze extra 4% on Westmere and 19% on Atom
913	  eval(shift(@insns));	#@
914	&movdqa		($t0,@X[1]);
915	  eval(shift(@insns));
916	  eval(shift(@insns));
917	&movdqa		($t3,@X[3]);
918	  eval(shift(@insns));	#@
919	  eval(shift(@insns));
920	  eval(shift(@insns));
921	  eval(shift(@insns));	#@
922	  eval(shift(@insns));
923	&palignr	($t0,@X[0],$SZ);	# X[1..4]
924	  eval(shift(@insns));
925	  eval(shift(@insns));
926	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
927	  eval(shift(@insns));
928	  eval(shift(@insns));
929	  eval(shift(@insns));
930	  eval(shift(@insns));	#@
931	&movdqa		($t1,$t0);
932	  eval(shift(@insns));
933	  eval(shift(@insns));
934	&movdqa		($t2,$t0);
935	  eval(shift(@insns));	#@
936	  eval(shift(@insns));
937	&psrld		($t0,$sigma0[2]);
938	  eval(shift(@insns));
939	  eval(shift(@insns));
940	  eval(shift(@insns));
941	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
942	  eval(shift(@insns));	#@
943	  eval(shift(@insns));
944	&psrld		($t2,$sigma0[0]);
945	  eval(shift(@insns));
946	  eval(shift(@insns));
947	 &pshufd	($t3,@X[3],0b11111010);	# X[4..15]
948	  eval(shift(@insns));
949	  eval(shift(@insns));	#@
950	&pslld		($t1,8*$SZ-$sigma0[1]);
951	  eval(shift(@insns));
952	  eval(shift(@insns));
953	&pxor		($t0,$t2);
954	  eval(shift(@insns));	#@
955	  eval(shift(@insns));
956	  eval(shift(@insns));
957	  eval(shift(@insns));	#@
958	&psrld		($t2,$sigma0[1]-$sigma0[0]);
959	  eval(shift(@insns));
960	&pxor		($t0,$t1);
961	  eval(shift(@insns));
962	  eval(shift(@insns));
963	&pslld		($t1,$sigma0[1]-$sigma0[0]);
964	  eval(shift(@insns));
965	  eval(shift(@insns));
966	&pxor		($t0,$t2);
967	  eval(shift(@insns));
968	  eval(shift(@insns));	#@
969	 &movdqa	($t2,$t3);
970	  eval(shift(@insns));
971	  eval(shift(@insns));
972	&pxor		($t0,$t1);		# sigma0(X[1..4])
973	  eval(shift(@insns));	#@
974	  eval(shift(@insns));
975	  eval(shift(@insns));
976	 &psrld		($t3,$sigma1[2]);
977	  eval(shift(@insns));
978	  eval(shift(@insns));
979	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
980	  eval(shift(@insns));	#@
981	  eval(shift(@insns));
982	 &psrlq		($t2,$sigma1[0]);
983	  eval(shift(@insns));
984	  eval(shift(@insns));
985	  eval(shift(@insns));
986	 &pxor		($t3,$t2);
987	  eval(shift(@insns));	#@
988	  eval(shift(@insns));
989	  eval(shift(@insns));
990	  eval(shift(@insns));	#@
991	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
992	  eval(shift(@insns));
993	  eval(shift(@insns));
994	 &pxor		($t3,$t2);
995	  eval(shift(@insns));	#@
996	  eval(shift(@insns));
997	  eval(shift(@insns));
998	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
999	 &pshufd	($t3,$t3,0b10000000);
1000	  eval(shift(@insns));
1001	  eval(shift(@insns));
1002	  eval(shift(@insns));
1003	 &psrldq	($t3,8);
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));	#@
1006	  eval(shift(@insns));
1007	  eval(shift(@insns));
1008	  eval(shift(@insns));	#@
1009	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1010	  eval(shift(@insns));
1011	  eval(shift(@insns));
1012	  eval(shift(@insns));
1013	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1014	  eval(shift(@insns));
1015	  eval(shift(@insns));	#@
1016	  eval(shift(@insns));
1017	 &movdqa	($t2,$t3);
1018	  eval(shift(@insns));
1019	  eval(shift(@insns));
1020	 &psrld		($t3,$sigma1[2]);
1021	  eval(shift(@insns));
1022	  eval(shift(@insns));	#@
1023	 &psrlq		($t2,$sigma1[0]);
1024	  eval(shift(@insns));
1025	  eval(shift(@insns));
1026	 &pxor		($t3,$t2);
1027	  eval(shift(@insns));	#@
1028	  eval(shift(@insns));
1029	  eval(shift(@insns));
1030	  eval(shift(@insns));	#@
1031	  eval(shift(@insns));
1032	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1033	  eval(shift(@insns));
1034	  eval(shift(@insns));
1035	  eval(shift(@insns));
1036	 &pxor		($t3,$t2);
1037	  eval(shift(@insns));
1038	  eval(shift(@insns));
1039	  eval(shift(@insns));	#@
1040	 #&pshufb	($t3,$t5);
1041	 &pshufd	($t3,$t3,0b00001000);
1042	  eval(shift(@insns));
1043	  eval(shift(@insns));
1044	&movdqa		($t2,16*2*$j."($Tbl)");
1045	  eval(shift(@insns));	#@
1046	  eval(shift(@insns));
1047	 &pslldq	($t3,8);
1048	  eval(shift(@insns));
1049	  eval(shift(@insns));
1050	  eval(shift(@insns));
1051	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1052	  eval(shift(@insns));	#@
1053	  eval(shift(@insns));
1054	  eval(shift(@insns));
1055    }
1056	&paddd		($t2,@X[0]);
1057	  foreach (@insns) { eval; }		# remaining instructions
1058	&movdqa		(16*$j."(%rsp)",$t2);
1059}
1060
1061    for ($i=0,$j=0; $j<4; $j++) {
1062	&SSSE3_256_00_47($j,\&body_00_15,@X);
1063	push(@X,shift(@X));			# rotate(@X)
1064    }
1065	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1066	&jne	(".Lssse3_00_47");
1067
1068    for ($i=0; $i<16; ) {
1069	foreach(body_00_15()) { eval; }
1070    }
1071$code.=<<___;
1072	mov	$_ctx,$ctx
1073	mov	$a1,$A
1074
1075	add	$SZ*0($ctx),$A
1076	lea	16*$SZ($inp),$inp
1077	add	$SZ*1($ctx),$B
1078	add	$SZ*2($ctx),$C
1079	add	$SZ*3($ctx),$D
1080	add	$SZ*4($ctx),$E
1081	add	$SZ*5($ctx),$F
1082	add	$SZ*6($ctx),$G
1083	add	$SZ*7($ctx),$H
1084
1085	cmp	$_end,$inp
1086
1087	mov	$A,$SZ*0($ctx)
1088	mov	$B,$SZ*1($ctx)
1089	mov	$C,$SZ*2($ctx)
1090	mov	$D,$SZ*3($ctx)
1091	mov	$E,$SZ*4($ctx)
1092	mov	$F,$SZ*5($ctx)
1093	mov	$G,$SZ*6($ctx)
1094	mov	$H,$SZ*7($ctx)
1095	jb	.Lloop_ssse3
1096
1097	mov	$_rsp,%rsi
1098.cfi_def_cfa	%rsi,8
1099___
1100$code.=<<___ if ($win64);
1101	movaps	16*$SZ+32(%rsp),%xmm6
1102	movaps	16*$SZ+48(%rsp),%xmm7
1103	movaps	16*$SZ+64(%rsp),%xmm8
1104	movaps	16*$SZ+80(%rsp),%xmm9
1105___
1106$code.=<<___;
1107	mov	-48(%rsi),%r15
1108.cfi_restore	%r15
1109	mov	-40(%rsi),%r14
1110.cfi_restore	%r14
1111	mov	-32(%rsi),%r13
1112.cfi_restore	%r13
1113	mov	-24(%rsi),%r12
1114.cfi_restore	%r12
1115	mov	-16(%rsi),%rbp
1116.cfi_restore	%rbp
1117	mov	-8(%rsi),%rbx
1118.cfi_restore	%rbx
1119	lea	(%rsi),%rsp
1120.cfi_def_cfa_register	%rsp
1121.Lepilogue_ssse3:
1122	ret
1123.cfi_endproc
1124.size	${func}_ssse3,.-${func}_ssse3
1125___
1126}
1127
1128if ($avx) {{
1129######################################################################
1130# AVX+shrd code path
1131#
1132local *ror = sub { &shrd(@_[0],@_) };
1133
1134$code.=<<___;
1135.type	${func}_avx,\@function,3
1136.align	64
1137${func}_avx:
1138.cfi_startproc
1139.Lavx_shortcut:
1140	mov	%rsp,%rax		# copy %rsp
1141.cfi_def_cfa_register	%rax
1142	push	%rbx
1143.cfi_push	%rbx
1144	push	%rbp
1145.cfi_push	%rbp
1146	push	%r12
1147.cfi_push	%r12
1148	push	%r13
1149.cfi_push	%r13
1150	push	%r14
1151.cfi_push	%r14
1152	push	%r15
1153.cfi_push	%r15
1154	shl	\$4,%rdx		# num*16
1155	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1156	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1157	and	\$-64,%rsp		# align stack frame
1158	mov	$ctx,$_ctx		# save ctx, 1st arg
1159	mov	$inp,$_inp		# save inp, 2nd arh
1160	mov	%rdx,$_end		# save end pointer, "3rd" arg
1161	mov	%rax,$_rsp		# save copy of %rsp
1162.cfi_cfa_expression	$_rsp,deref,+8
1163___
1164$code.=<<___ if ($win64);
1165	movaps	%xmm6,16*$SZ+32(%rsp)
1166	movaps	%xmm7,16*$SZ+48(%rsp)
1167	movaps	%xmm8,16*$SZ+64(%rsp)
1168	movaps	%xmm9,16*$SZ+80(%rsp)
1169___
1170$code.=<<___ if ($win64 && $SZ>4);
1171	movaps	%xmm10,16*$SZ+96(%rsp)
1172	movaps	%xmm11,16*$SZ+112(%rsp)
1173___
1174$code.=<<___;
1175.Lprologue_avx:
1176
1177	vzeroupper
1178	mov	$SZ*0($ctx),$A
1179	mov	$SZ*1($ctx),$B
1180	mov	$SZ*2($ctx),$C
1181	mov	$SZ*3($ctx),$D
1182	mov	$SZ*4($ctx),$E
1183	mov	$SZ*5($ctx),$F
1184	mov	$SZ*6($ctx),$G
1185	mov	$SZ*7($ctx),$H
1186___
1187					if ($SZ==4) {	# SHA256
1188    my @X = map("%xmm$_",(0..3));
1189    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1190
1191$code.=<<___;
1192	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1193	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1194	jmp	.Lloop_avx
1195.align	16
1196.Lloop_avx:
1197	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1198	vmovdqu	0x00($inp),@X[0]
1199	vmovdqu	0x10($inp),@X[1]
1200	vmovdqu	0x20($inp),@X[2]
1201	vmovdqu	0x30($inp),@X[3]
1202	vpshufb	$t3,@X[0],@X[0]
1203	lea	$TABLE(%rip),$Tbl
1204	vpshufb	$t3,@X[1],@X[1]
1205	vpshufb	$t3,@X[2],@X[2]
1206	vpaddd	0x00($Tbl),@X[0],$t0
1207	vpshufb	$t3,@X[3],@X[3]
1208	vpaddd	0x20($Tbl),@X[1],$t1
1209	vpaddd	0x40($Tbl),@X[2],$t2
1210	vpaddd	0x60($Tbl),@X[3],$t3
1211	vmovdqa	$t0,0x00(%rsp)
1212	mov	$A,$a1
1213	vmovdqa	$t1,0x10(%rsp)
1214	mov	$B,$a3
1215	vmovdqa	$t2,0x20(%rsp)
1216	xor	$C,$a3			# magic
1217	vmovdqa	$t3,0x30(%rsp)
1218	mov	$E,$a0
1219	jmp	.Lavx_00_47
1220
1221.align	16
1222.Lavx_00_47:
1223	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1224___
1225sub Xupdate_256_AVX () {
1226	(
1227	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1228	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1229	'&vpsrld	($t2,$t0,$sigma0[0]);',
1230	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1231	'&vpsrld	($t3,$t0,$sigma0[2])',
1232	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1233	'&vpxor		($t0,$t3,$t2)',
1234	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1235	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1236	'&vpxor		($t0,$t0,$t1)',
1237	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1238	'&vpxor		($t0,$t0,$t2)',
1239	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1240	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1241	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1242	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1243	 '&vpxor	($t2,$t2,$t3);',
1244	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1245	 '&vpxor	($t2,$t2,$t3)',
1246	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1247	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1248	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1249	 '&vpsrld	($t2,$t3,$sigma1[2])',
1250	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1251	 '&vpxor	($t2,$t2,$t3);',
1252	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1253	 '&vpxor	($t2,$t2,$t3)',
1254	 '&vpshufb	($t2,$t2,$t5)',
1255	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1256	);
1257}
1258
1259sub AVX_256_00_47 () {
1260my $j = shift;
1261my $body = shift;
1262my @X = @_;
1263my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1264
1265	foreach (Xupdate_256_AVX()) {		# 29 instructions
1266	    eval;
1267	    eval(shift(@insns));
1268	    eval(shift(@insns));
1269	    eval(shift(@insns));
1270	}
1271	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1272	  foreach (@insns) { eval; }		# remaining instructions
1273	&vmovdqa	(16*$j."(%rsp)",$t2);
1274}
1275
1276    for ($i=0,$j=0; $j<4; $j++) {
1277	&AVX_256_00_47($j,\&body_00_15,@X);
1278	push(@X,shift(@X));			# rotate(@X)
1279    }
1280	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1281	&jne	(".Lavx_00_47");
1282
1283    for ($i=0; $i<16; ) {
1284	foreach(body_00_15()) { eval; }
1285    }
1286
1287					} else {	# SHA512
1288    my @X = map("%xmm$_",(0..7));
1289    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1290
1291$code.=<<___;
1292	jmp	.Lloop_avx
1293.align	16
1294.Lloop_avx:
1295	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1296	vmovdqu	0x00($inp),@X[0]
1297	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1298	vmovdqu	0x10($inp),@X[1]
1299	vmovdqu	0x20($inp),@X[2]
1300	vpshufb	$t3,@X[0],@X[0]
1301	vmovdqu	0x30($inp),@X[3]
1302	vpshufb	$t3,@X[1],@X[1]
1303	vmovdqu	0x40($inp),@X[4]
1304	vpshufb	$t3,@X[2],@X[2]
1305	vmovdqu	0x50($inp),@X[5]
1306	vpshufb	$t3,@X[3],@X[3]
1307	vmovdqu	0x60($inp),@X[6]
1308	vpshufb	$t3,@X[4],@X[4]
1309	vmovdqu	0x70($inp),@X[7]
1310	vpshufb	$t3,@X[5],@X[5]
1311	vpaddq	-0x80($Tbl),@X[0],$t0
1312	vpshufb	$t3,@X[6],@X[6]
1313	vpaddq	-0x60($Tbl),@X[1],$t1
1314	vpshufb	$t3,@X[7],@X[7]
1315	vpaddq	-0x40($Tbl),@X[2],$t2
1316	vpaddq	-0x20($Tbl),@X[3],$t3
1317	vmovdqa	$t0,0x00(%rsp)
1318	vpaddq	0x00($Tbl),@X[4],$t0
1319	vmovdqa	$t1,0x10(%rsp)
1320	vpaddq	0x20($Tbl),@X[5],$t1
1321	vmovdqa	$t2,0x20(%rsp)
1322	vpaddq	0x40($Tbl),@X[6],$t2
1323	vmovdqa	$t3,0x30(%rsp)
1324	vpaddq	0x60($Tbl),@X[7],$t3
1325	vmovdqa	$t0,0x40(%rsp)
1326	mov	$A,$a1
1327	vmovdqa	$t1,0x50(%rsp)
1328	mov	$B,$a3
1329	vmovdqa	$t2,0x60(%rsp)
1330	xor	$C,$a3			# magic
1331	vmovdqa	$t3,0x70(%rsp)
1332	mov	$E,$a0
1333	jmp	.Lavx_00_47
1334
1335.align	16
1336.Lavx_00_47:
1337	add	\$`16*2*$SZ`,$Tbl
1338___
1339sub Xupdate_512_AVX () {
1340	(
1341	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1342	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1343	'&vpsrlq	($t2,$t0,$sigma0[0])',
1344	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1345	'&vpsrlq	($t3,$t0,$sigma0[2])',
1346	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1347	 '&vpxor	($t0,$t3,$t2)',
1348	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1349	 '&vpxor	($t0,$t0,$t1)',
1350	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1351	 '&vpxor	($t0,$t0,$t2)',
1352	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1353	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1354	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1355	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1356	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1357	 '&vpxor	($t3,$t3,$t2)',
1358	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1359	 '&vpxor	($t3,$t3,$t1)',
1360	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1361	 '&vpxor	($t3,$t3,$t2)',
1362	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1363	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1364	);
1365}
1366
1367sub AVX_512_00_47 () {
1368my $j = shift;
1369my $body = shift;
1370my @X = @_;
1371my @insns = (&$body,&$body);			# 52 instructions
1372
1373	foreach (Xupdate_512_AVX()) {		# 23 instructions
1374	    eval;
1375	    eval(shift(@insns));
1376	    eval(shift(@insns));
1377	}
1378	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1379	  foreach (@insns) { eval; }		# remaining instructions
1380	&vmovdqa	(16*$j."(%rsp)",$t2);
1381}
1382
1383    for ($i=0,$j=0; $j<8; $j++) {
1384	&AVX_512_00_47($j,\&body_00_15,@X);
1385	push(@X,shift(@X));			# rotate(@X)
1386    }
1387	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1388	&jne	(".Lavx_00_47");
1389
1390    for ($i=0; $i<16; ) {
1391	foreach(body_00_15()) { eval; }
1392    }
1393}
1394$code.=<<___;
1395	mov	$_ctx,$ctx
1396	mov	$a1,$A
1397
1398	add	$SZ*0($ctx),$A
1399	lea	16*$SZ($inp),$inp
1400	add	$SZ*1($ctx),$B
1401	add	$SZ*2($ctx),$C
1402	add	$SZ*3($ctx),$D
1403	add	$SZ*4($ctx),$E
1404	add	$SZ*5($ctx),$F
1405	add	$SZ*6($ctx),$G
1406	add	$SZ*7($ctx),$H
1407
1408	cmp	$_end,$inp
1409
1410	mov	$A,$SZ*0($ctx)
1411	mov	$B,$SZ*1($ctx)
1412	mov	$C,$SZ*2($ctx)
1413	mov	$D,$SZ*3($ctx)
1414	mov	$E,$SZ*4($ctx)
1415	mov	$F,$SZ*5($ctx)
1416	mov	$G,$SZ*6($ctx)
1417	mov	$H,$SZ*7($ctx)
1418	jb	.Lloop_avx
1419
1420	mov	$_rsp,%rsi
1421.cfi_def_cfa	%rsi,8
1422	vzeroupper
1423___
1424$code.=<<___ if ($win64);
1425	movaps	16*$SZ+32(%rsp),%xmm6
1426	movaps	16*$SZ+48(%rsp),%xmm7
1427	movaps	16*$SZ+64(%rsp),%xmm8
1428	movaps	16*$SZ+80(%rsp),%xmm9
1429___
1430$code.=<<___ if ($win64 && $SZ>4);
1431	movaps	16*$SZ+96(%rsp),%xmm10
1432	movaps	16*$SZ+112(%rsp),%xmm11
1433___
1434$code.=<<___;
1435	mov	-48(%rsi),%r15
1436.cfi_restore	%r15
1437	mov	-40(%rsi),%r14
1438.cfi_restore	%r14
1439	mov	-32(%rsi),%r13
1440.cfi_restore	%r13
1441	mov	-24(%rsi),%r12
1442.cfi_restore	%r12
1443	mov	-16(%rsi),%rbp
1444.cfi_restore	%rbp
1445	mov	-8(%rsi),%rbx
1446.cfi_restore	%rbx
1447	lea	(%rsi),%rsp
1448.cfi_def_cfa_register	%rsp
1449.Lepilogue_avx:
1450	ret
1451.cfi_endproc
1452.size	${func}_avx,.-${func}_avx
1453___
1454
1455if ($avx>1) {{
1456######################################################################
1457# AVX2+BMI code path
1458#
1459my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1460my $PUSH8=8*2*$SZ;
1461use integer;
1462
1463sub bodyx_00_15 () {
1464	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1465	(
1466	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1467
1468	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1469	'&and	($a4,$e)',		# f&e
1470	'&rorx	($a0,$e,$Sigma1[2])',
1471	'&rorx	($a2,$e,$Sigma1[1])',
1472
1473	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1474	'&lea	($h,"($h,$a4)")',
1475	'&andn	($a4,$e,$g)',		# ~e&g
1476	'&xor	($a0,$a2)',
1477
1478	'&rorx	($a1,$e,$Sigma1[0])',
1479	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1480	'&xor	($a0,$a1)',		# Sigma1(e)
1481	'&mov	($a2,$a)',
1482
1483	'&rorx	($a4,$a,$Sigma0[2])',
1484	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1485	'&xor	($a2,$b)',		# a^b, b^c in next round
1486	'&rorx	($a1,$a,$Sigma0[1])',
1487
1488	'&rorx	($a0,$a,$Sigma0[0])',
1489	'&lea	($d,"($d,$h)")',	# d+=h
1490	'&and	($a3,$a2)',		# (b^c)&(a^b)
1491	'&xor	($a1,$a4)',
1492
1493	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1494	'&xor	($a1,$a0)',		# Sigma0(a)
1495	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1496	'&mov	($a4,$e)',		# copy of f in future
1497
1498	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1499	);
1500	# and at the finish one has to $a+=$a1
1501}
1502
1503$code.=<<___;
1504.type	${func}_avx2,\@function,3
1505.align	64
1506${func}_avx2:
1507.cfi_startproc
1508.Lavx2_shortcut:
1509	mov	%rsp,%rax		# copy %rsp
1510.cfi_def_cfa_register	%rax
1511	push	%rbx
1512.cfi_push	%rbx
1513	push	%rbp
1514.cfi_push	%rbp
1515	push	%r12
1516.cfi_push	%r12
1517	push	%r13
1518.cfi_push	%r13
1519	push	%r14
1520.cfi_push	%r14
1521	push	%r15
1522.cfi_push	%r15
1523	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1524	shl	\$4,%rdx		# num*16
1525	and	\$-256*$SZ,%rsp		# align stack frame
1526	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1527	add	\$`2*$SZ*($rounds-8)`,%rsp
1528	mov	$ctx,$_ctx		# save ctx, 1st arg
1529	mov	$inp,$_inp		# save inp, 2nd arh
1530	mov	%rdx,$_end		# save end pointer, "3rd" arg
1531	mov	%rax,$_rsp		# save copy of %rsp
1532.cfi_cfa_expression	$_rsp,deref,+8
1533___
1534$code.=<<___ if ($win64);
1535	movaps	%xmm6,16*$SZ+32(%rsp)
1536	movaps	%xmm7,16*$SZ+48(%rsp)
1537	movaps	%xmm8,16*$SZ+64(%rsp)
1538	movaps	%xmm9,16*$SZ+80(%rsp)
1539___
1540$code.=<<___ if ($win64 && $SZ>4);
1541	movaps	%xmm10,16*$SZ+96(%rsp)
1542	movaps	%xmm11,16*$SZ+112(%rsp)
1543___
1544$code.=<<___;
1545.Lprologue_avx2:
1546
1547	vzeroupper
1548	sub	\$-16*$SZ,$inp		# inp++, size optimization
1549	mov	$SZ*0($ctx),$A
1550	mov	$inp,%r12		# borrow $T1
1551	mov	$SZ*1($ctx),$B
1552	cmp	%rdx,$inp		# $_end
1553	mov	$SZ*2($ctx),$C
1554	cmove	%rsp,%r12		# next block or random data
1555	mov	$SZ*3($ctx),$D
1556	mov	$SZ*4($ctx),$E
1557	mov	$SZ*5($ctx),$F
1558	mov	$SZ*6($ctx),$G
1559	mov	$SZ*7($ctx),$H
1560___
1561					if ($SZ==4) {	# SHA256
1562    my @X = map("%ymm$_",(0..3));
1563    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1564
1565$code.=<<___;
1566	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1567	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1568	jmp	.Loop_avx2
1569.align	16
1570.Loop_avx2:
1571	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1572	vmovdqu	-16*$SZ+0($inp),%xmm0
1573	vmovdqu	-16*$SZ+16($inp),%xmm1
1574	vmovdqu	-16*$SZ+32($inp),%xmm2
1575	vmovdqu	-16*$SZ+48($inp),%xmm3
1576	#mov		$inp,$_inp	# offload $inp
1577	vinserti128	\$1,(%r12),@X[0],@X[0]
1578	vinserti128	\$1,16(%r12),@X[1],@X[1]
1579	vpshufb		$t3,@X[0],@X[0]
1580	vinserti128	\$1,32(%r12),@X[2],@X[2]
1581	vpshufb		$t3,@X[1],@X[1]
1582	vinserti128	\$1,48(%r12),@X[3],@X[3]
1583
1584	lea	$TABLE(%rip),$Tbl
1585	vpshufb	$t3,@X[2],@X[2]
1586	vpaddd	0x00($Tbl),@X[0],$t0
1587	vpshufb	$t3,@X[3],@X[3]
1588	vpaddd	0x20($Tbl),@X[1],$t1
1589	vpaddd	0x40($Tbl),@X[2],$t2
1590	vpaddd	0x60($Tbl),@X[3],$t3
1591	vmovdqa	$t0,0x00(%rsp)
1592	xor	$a1,$a1
1593	vmovdqa	$t1,0x20(%rsp)
1594	lea	-$PUSH8(%rsp),%rsp
1595	mov	$B,$a3
1596	vmovdqa	$t2,0x00(%rsp)
1597	xor	$C,$a3			# magic
1598	vmovdqa	$t3,0x20(%rsp)
1599	mov	$F,$a4
1600	sub	\$-16*2*$SZ,$Tbl	# size optimization
1601	jmp	.Lavx2_00_47
1602
1603.align	16
1604.Lavx2_00_47:
1605___
1606
1607sub AVX2_256_00_47 () {
1608my $j = shift;
1609my $body = shift;
1610my @X = @_;
1611my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1612my $base = "+2*$PUSH8(%rsp)";
1613
1614	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1615	foreach (Xupdate_256_AVX()) {		# 29 instructions
1616	    eval;
1617	    eval(shift(@insns));
1618	    eval(shift(@insns));
1619	    eval(shift(@insns));
1620	}
1621	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1622	  foreach (@insns) { eval; }		# remaining instructions
1623	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1624}
1625
1626    for ($i=0,$j=0; $j<4; $j++) {
1627	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1628	push(@X,shift(@X));			# rotate(@X)
1629    }
1630	&lea	($Tbl,16*2*$SZ."($Tbl)");
1631	&cmpb	(($SZ-1)."($Tbl)",0);
1632	&jne	(".Lavx2_00_47");
1633
1634    for ($i=0; $i<16; ) {
1635	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1636	foreach(bodyx_00_15()) { eval; }
1637    }
1638					} else {	# SHA512
1639    my @X = map("%ymm$_",(0..7));
1640    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1641
1642$code.=<<___;
1643	jmp	.Loop_avx2
1644.align	16
1645.Loop_avx2:
1646	vmovdqu	-16*$SZ($inp),%xmm0
1647	vmovdqu	-16*$SZ+16($inp),%xmm1
1648	vmovdqu	-16*$SZ+32($inp),%xmm2
1649	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1650	vmovdqu	-16*$SZ+48($inp),%xmm3
1651	vmovdqu	-16*$SZ+64($inp),%xmm4
1652	vmovdqu	-16*$SZ+80($inp),%xmm5
1653	vmovdqu	-16*$SZ+96($inp),%xmm6
1654	vmovdqu	-16*$SZ+112($inp),%xmm7
1655	#mov	$inp,$_inp	# offload $inp
1656	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1657	vinserti128	\$1,(%r12),@X[0],@X[0]
1658	vinserti128	\$1,16(%r12),@X[1],@X[1]
1659	 vpshufb	$t2,@X[0],@X[0]
1660	vinserti128	\$1,32(%r12),@X[2],@X[2]
1661	 vpshufb	$t2,@X[1],@X[1]
1662	vinserti128	\$1,48(%r12),@X[3],@X[3]
1663	 vpshufb	$t2,@X[2],@X[2]
1664	vinserti128	\$1,64(%r12),@X[4],@X[4]
1665	 vpshufb	$t2,@X[3],@X[3]
1666	vinserti128	\$1,80(%r12),@X[5],@X[5]
1667	 vpshufb	$t2,@X[4],@X[4]
1668	vinserti128	\$1,96(%r12),@X[6],@X[6]
1669	 vpshufb	$t2,@X[5],@X[5]
1670	vinserti128	\$1,112(%r12),@X[7],@X[7]
1671
1672	vpaddq	-0x80($Tbl),@X[0],$t0
1673	vpshufb	$t2,@X[6],@X[6]
1674	vpaddq	-0x60($Tbl),@X[1],$t1
1675	vpshufb	$t2,@X[7],@X[7]
1676	vpaddq	-0x40($Tbl),@X[2],$t2
1677	vpaddq	-0x20($Tbl),@X[3],$t3
1678	vmovdqa	$t0,0x00(%rsp)
1679	vpaddq	0x00($Tbl),@X[4],$t0
1680	vmovdqa	$t1,0x20(%rsp)
1681	vpaddq	0x20($Tbl),@X[5],$t1
1682	vmovdqa	$t2,0x40(%rsp)
1683	vpaddq	0x40($Tbl),@X[6],$t2
1684	vmovdqa	$t3,0x60(%rsp)
1685	lea	-$PUSH8(%rsp),%rsp
1686	vpaddq	0x60($Tbl),@X[7],$t3
1687	vmovdqa	$t0,0x00(%rsp)
1688	xor	$a1,$a1
1689	vmovdqa	$t1,0x20(%rsp)
1690	mov	$B,$a3
1691	vmovdqa	$t2,0x40(%rsp)
1692	xor	$C,$a3			# magic
1693	vmovdqa	$t3,0x60(%rsp)
1694	mov	$F,$a4
1695	add	\$16*2*$SZ,$Tbl
1696	jmp	.Lavx2_00_47
1697
1698.align	16
1699.Lavx2_00_47:
1700___
1701
1702sub AVX2_512_00_47 () {
1703my $j = shift;
1704my $body = shift;
1705my @X = @_;
1706my @insns = (&$body,&$body);			# 48 instructions
1707my $base = "+2*$PUSH8(%rsp)";
1708
1709	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
1710	foreach (Xupdate_512_AVX()) {		# 23 instructions
1711	    eval;
1712	    if ($_ !~ /\;$/) {
1713		eval(shift(@insns));
1714		eval(shift(@insns));
1715		eval(shift(@insns));
1716	    }
1717	}
1718	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1719	  foreach (@insns) { eval; }		# remaining instructions
1720	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1721}
1722
1723    for ($i=0,$j=0; $j<8; $j++) {
1724	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
1725	push(@X,shift(@X));			# rotate(@X)
1726    }
1727	&lea	($Tbl,16*2*$SZ."($Tbl)");
1728	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
1729	&jne	(".Lavx2_00_47");
1730
1731    for ($i=0; $i<16; ) {
1732	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1733	foreach(bodyx_00_15()) { eval; }
1734    }
1735}
1736$code.=<<___;
1737	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
1738	add	$a1,$A
1739	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
1740	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1741
1742	add	$SZ*0($ctx),$A
1743	add	$SZ*1($ctx),$B
1744	add	$SZ*2($ctx),$C
1745	add	$SZ*3($ctx),$D
1746	add	$SZ*4($ctx),$E
1747	add	$SZ*5($ctx),$F
1748	add	$SZ*6($ctx),$G
1749	add	$SZ*7($ctx),$H
1750
1751	mov	$A,$SZ*0($ctx)
1752	mov	$B,$SZ*1($ctx)
1753	mov	$C,$SZ*2($ctx)
1754	mov	$D,$SZ*3($ctx)
1755	mov	$E,$SZ*4($ctx)
1756	mov	$F,$SZ*5($ctx)
1757	mov	$G,$SZ*6($ctx)
1758	mov	$H,$SZ*7($ctx)
1759
1760	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
1761	je	.Ldone_avx2
1762
1763	xor	$a1,$a1
1764	mov	$B,$a3
1765	xor	$C,$a3			# magic
1766	mov	$F,$a4
1767	jmp	.Lower_avx2
1768.align	16
1769.Lower_avx2:
1770___
1771    for ($i=0; $i<8; ) {
1772	my $base="+16($Tbl)";
1773	foreach(bodyx_00_15()) { eval; }
1774    }
1775$code.=<<___;
1776	lea	-$PUSH8($Tbl),$Tbl
1777	cmp	%rsp,$Tbl
1778	jae	.Lower_avx2
1779
1780	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
1781	add	$a1,$A
1782	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
1783	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1784
1785	add	$SZ*0($ctx),$A
1786	add	$SZ*1($ctx),$B
1787	add	$SZ*2($ctx),$C
1788	add	$SZ*3($ctx),$D
1789	add	$SZ*4($ctx),$E
1790	add	$SZ*5($ctx),$F
1791	lea	`2*16*$SZ`($inp),$inp	# inp+=2
1792	add	$SZ*6($ctx),$G
1793	mov	$inp,%r12
1794	add	$SZ*7($ctx),$H
1795	cmp	$_end,$inp
1796
1797	mov	$A,$SZ*0($ctx)
1798	cmove	%rsp,%r12		# next block or stale data
1799	mov	$B,$SZ*1($ctx)
1800	mov	$C,$SZ*2($ctx)
1801	mov	$D,$SZ*3($ctx)
1802	mov	$E,$SZ*4($ctx)
1803	mov	$F,$SZ*5($ctx)
1804	mov	$G,$SZ*6($ctx)
1805	mov	$H,$SZ*7($ctx)
1806
1807	jbe	.Loop_avx2
1808	lea	(%rsp),$Tbl
1809
1810.Ldone_avx2:
1811	lea	($Tbl),%rsp
1812	mov	$_rsp,%rsi
1813.cfi_def_cfa	%rsi,8
1814	vzeroupper
1815___
1816$code.=<<___ if ($win64);
1817	movaps	16*$SZ+32(%rsp),%xmm6
1818	movaps	16*$SZ+48(%rsp),%xmm7
1819	movaps	16*$SZ+64(%rsp),%xmm8
1820	movaps	16*$SZ+80(%rsp),%xmm9
1821___
1822$code.=<<___ if ($win64 && $SZ>4);
1823	movaps	16*$SZ+96(%rsp),%xmm10
1824	movaps	16*$SZ+112(%rsp),%xmm11
1825___
1826$code.=<<___;
1827	mov	-48(%rsi),%r15
1828.cfi_restore	%r15
1829	mov	-40(%rsi),%r14
1830.cfi_restore	%r14
1831	mov	-32(%rsi),%r13
1832.cfi_restore	%r13
1833	mov	-24(%rsi),%r12
1834.cfi_restore	%r12
1835	mov	-16(%rsi),%rbp
1836.cfi_restore	%rbp
1837	mov	-8(%rsi),%rbx
1838.cfi_restore	%rbx
1839	lea	(%rsi),%rsp
1840.cfi_def_cfa_register	%rsp
1841.Lepilogue_avx2:
1842	ret
1843.cfi_endproc
1844.size	${func}_avx2,.-${func}_avx2
1845___
1846}}
1847}}}}}
1848
1849# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1850#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1851if ($win64) {
1852$rec="%rcx";
1853$frame="%rdx";
1854$context="%r8";
1855$disp="%r9";
1856
1857$code.=<<___;
1858.extern	__imp_RtlVirtualUnwind
1859.type	se_handler,\@abi-omnipotent
1860.align	16
1861se_handler:
1862	push	%rsi
1863	push	%rdi
1864	push	%rbx
1865	push	%rbp
1866	push	%r12
1867	push	%r13
1868	push	%r14
1869	push	%r15
1870	pushfq
1871	sub	\$64,%rsp
1872
1873	mov	120($context),%rax	# pull context->Rax
1874	mov	248($context),%rbx	# pull context->Rip
1875
1876	mov	8($disp),%rsi		# disp->ImageBase
1877	mov	56($disp),%r11		# disp->HanderlData
1878
1879	mov	0(%r11),%r10d		# HandlerData[0]
1880	lea	(%rsi,%r10),%r10	# prologue label
1881	cmp	%r10,%rbx		# context->Rip<prologue label
1882	jb	.Lin_prologue
1883
1884	mov	152($context),%rax	# pull context->Rsp
1885
1886	mov	4(%r11),%r10d		# HandlerData[1]
1887	lea	(%rsi,%r10),%r10	# epilogue label
1888	cmp	%r10,%rbx		# context->Rip>=epilogue label
1889	jae	.Lin_prologue
1890___
1891$code.=<<___ if ($avx>1);
1892	lea	.Lavx2_shortcut(%rip),%r10
1893	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1894	jb	.Lnot_in_avx2
1895
1896	and	\$-256*$SZ,%rax
1897	add	\$`2*$SZ*($rounds-8)`,%rax
1898.Lnot_in_avx2:
1899___
1900$code.=<<___;
1901	mov	%rax,%rsi		# put aside Rsp
1902	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
1903
1904	mov	-8(%rax),%rbx
1905	mov	-16(%rax),%rbp
1906	mov	-24(%rax),%r12
1907	mov	-32(%rax),%r13
1908	mov	-40(%rax),%r14
1909	mov	-48(%rax),%r15
1910	mov	%rbx,144($context)	# restore context->Rbx
1911	mov	%rbp,160($context)	# restore context->Rbp
1912	mov	%r12,216($context)	# restore context->R12
1913	mov	%r13,224($context)	# restore context->R13
1914	mov	%r14,232($context)	# restore context->R14
1915	mov	%r15,240($context)	# restore context->R15
1916
1917	lea	.Lepilogue(%rip),%r10
1918	cmp	%r10,%rbx
1919	jb	.Lin_prologue		# non-AVX code
1920
1921	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
1922	lea	512($context),%rdi	# &context.Xmm6
1923	mov	\$`$SZ==4?8:12`,%ecx
1924	.long	0xa548f3fc		# cld; rep movsq
1925
1926.Lin_prologue:
1927	mov	8(%rax),%rdi
1928	mov	16(%rax),%rsi
1929	mov	%rax,152($context)	# restore context->Rsp
1930	mov	%rsi,168($context)	# restore context->Rsi
1931	mov	%rdi,176($context)	# restore context->Rdi
1932
1933	mov	40($disp),%rdi		# disp->ContextRecord
1934	mov	$context,%rsi		# context
1935	mov	\$154,%ecx		# sizeof(CONTEXT)
1936	.long	0xa548f3fc		# cld; rep movsq
1937
1938	mov	$disp,%rsi
1939	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1940	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1941	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1942	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1943	mov	40(%rsi),%r10		# disp->ContextRecord
1944	lea	56(%rsi),%r11		# &disp->HandlerData
1945	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1946	mov	%r10,32(%rsp)		# arg5
1947	mov	%r11,40(%rsp)		# arg6
1948	mov	%r12,48(%rsp)		# arg7
1949	mov	%rcx,56(%rsp)		# arg8, (NULL)
1950	call	*__imp_RtlVirtualUnwind(%rip)
1951
1952	mov	\$1,%eax		# ExceptionContinueSearch
1953	add	\$64,%rsp
1954	popfq
1955	pop	%r15
1956	pop	%r14
1957	pop	%r13
1958	pop	%r12
1959	pop	%rbp
1960	pop	%rbx
1961	pop	%rdi
1962	pop	%rsi
1963	ret
1964.size	se_handler,.-se_handler
1965___
1966
1967$code.=<<___ if ($SZ==4 && $shaext);
1968.type	shaext_handler,\@abi-omnipotent
1969.align	16
1970shaext_handler:
1971	push	%rsi
1972	push	%rdi
1973	push	%rbx
1974	push	%rbp
1975	push	%r12
1976	push	%r13
1977	push	%r14
1978	push	%r15
1979	pushfq
1980	sub	\$64,%rsp
1981
1982	mov	120($context),%rax	# pull context->Rax
1983	mov	248($context),%rbx	# pull context->Rip
1984
1985	lea	.Lprologue_shaext(%rip),%r10
1986	cmp	%r10,%rbx		# context->Rip<.Lprologue
1987	jb	.Lin_prologue
1988
1989	lea	.Lepilogue_shaext(%rip),%r10
1990	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1991	jae	.Lin_prologue
1992
1993	lea	-8-5*16(%rax),%rsi
1994	lea	512($context),%rdi	# &context.Xmm6
1995	mov	\$10,%ecx
1996	.long	0xa548f3fc		# cld; rep movsq
1997
1998	jmp	.Lin_prologue
1999.size	shaext_handler,.-shaext_handler
2000___
2001
2002$code.=<<___;
2003.section	.pdata
2004.align	4
2005	.rva	.LSEH_begin_$func
2006	.rva	.LSEH_end_$func
2007	.rva	.LSEH_info_$func
2008___
2009$code.=<<___ if ($SZ==4 && $shaext);
2010	.rva	.LSEH_begin_${func}_shaext
2011	.rva	.LSEH_end_${func}_shaext
2012	.rva	.LSEH_info_${func}_shaext
2013___
2014$code.=<<___ if ($SZ==4);
2015	.rva	.LSEH_begin_${func}_ssse3
2016	.rva	.LSEH_end_${func}_ssse3
2017	.rva	.LSEH_info_${func}_ssse3
2018___
2019$code.=<<___ if ($avx);
2020	.rva	.LSEH_begin_${func}_avx
2021	.rva	.LSEH_end_${func}_avx
2022	.rva	.LSEH_info_${func}_avx
2023___
2024$code.=<<___ if ($avx>1);
2025	.rva	.LSEH_begin_${func}_avx2
2026	.rva	.LSEH_end_${func}_avx2
2027	.rva	.LSEH_info_${func}_avx2
2028___
2029$code.=<<___;
2030.section	.xdata
2031.align	8
2032.LSEH_info_$func:
2033	.byte	9,0,0,0
2034	.rva	se_handler
2035	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2036___
2037$code.=<<___ if ($SZ==4 && $shaext);
2038.LSEH_info_${func}_shaext:
2039	.byte	9,0,0,0
2040	.rva	shaext_handler
2041___
2042$code.=<<___ if ($SZ==4);
2043.LSEH_info_${func}_ssse3:
2044	.byte	9,0,0,0
2045	.rva	se_handler
2046	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2047___
2048$code.=<<___ if ($avx);
2049.LSEH_info_${func}_avx:
2050	.byte	9,0,0,0
2051	.rva	se_handler
2052	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2053___
2054$code.=<<___ if ($avx>1);
2055.LSEH_info_${func}_avx2:
2056	.byte	9,0,0,0
2057	.rva	se_handler
2058	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2059___
2060}
2061
2062sub sha256op38 {
2063    my $instr = shift;
2064    my %opcodelet = (
2065		"sha256rnds2" => 0xcb,
2066  		"sha256msg1"  => 0xcc,
2067		"sha256msg2"  => 0xcd	);
2068
2069    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2070      my @opcode=(0x0f,0x38);
2071	push @opcode,$opcodelet{$instr};
2072	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2073	return ".byte\t".join(',',@opcode);
2074    } else {
2075	return $instr."\t".@_[0];
2076    }
2077}
2078
2079foreach (split("\n",$code)) {
2080	s/\`([^\`]*)\`/eval $1/geo;
2081
2082	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2083
2084	print $_,"\n";
2085}
2086close STDOUT;
2087