1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# good as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52# May 2013.
53#
54# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
55# and loading pair of consecutive blocks to 256-bit %ymm registers)
56# did not provide impressive performance improvement till a crucial
57# hint regarding the number of Xupdate iterations to pre-compute in
58# advance was provided by Ilya Albrekht of Intel Corp.
59
60# March 2014.
61#
62# Add support for Intel SHA Extensions.
63
64######################################################################
65# Current performance is summarized in following table. Numbers are
66# CPU clock cycles spent to process single byte (less is better).
67#
68#		x86_64		SSSE3		AVX[2]
69# P4		9.05		-
70# Opteron	6.26		-
71# Core2		6.55		6.05/+8%	-
72# Westmere	6.73		5.30/+27%	-
73# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
74# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
75# Haswell	5.45		4.15/+31%	3.57/+53%
76# Bulldozer	9.11		5.95/+53%
77# VIA Nano	9.32		7.15/+30%
78# Atom		10.3		9.17/+12%
79# Silvermont	13.1(*)		9.37/+40%
80#
81# (*)	obviously suboptimal result, nothing was done about it,
82#	because SSSE3 code is compiled unconditionally;
83
84$flavour = shift;
85$output  = shift;
86if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
87
88$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
89
90$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
91( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
92( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
93die "can't locate x86_64-xlate.pl";
94
95# In upstream, this is controlled by shelling out to the compiler to check
96# versions, but BoringSSL is intended to be used with pre-generated perlasm
97# output, so this isn't useful anyway.
98#
99# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
100# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
101# did not tie them together until after $shaext was added.
102$avx = 1;
103
104# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
105# been tested.
106$shaext=0;	### set to zero if compiling for 1.0.1
107$avx=1		if (!$shaext && $avx);
108
109open OUT,"| \"$^X\" $xlate $flavour $output";
110*STDOUT=*OUT;
111
112$ctx="%rdi";	# 1st arg
113$inp="%rsi";	# 2nd arg
114$num="%rdx";	# 3rd arg
115
116# reassign arguments in order to produce more compact code
117$ctx="%r8";
118$inp="%r9";
119$num="%r10";
120
121$t0="%eax";
122$t1="%ebx";
123$t2="%ecx";
124@xi=("%edx","%ebp","%r14d");
125$A="%esi";
126$B="%edi";
127$C="%r11d";
128$D="%r12d";
129$E="%r13d";
130
131@V=($A,$B,$C,$D,$E);
132
133sub BODY_00_19 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $j=$i+1;
136$code.=<<___ if ($i==0);
137	mov	`4*$i`($inp),$xi[0]
138	bswap	$xi[0]
139___
140$code.=<<___ if ($i<15);
141	mov	`4*$j`($inp),$xi[1]
142	mov	$d,$t0
143	mov	$xi[0],`4*$i`(%rsp)
144	mov	$a,$t2
145	bswap	$xi[1]
146	xor	$c,$t0
147	rol	\$5,$t2
148	and	$b,$t0
149	lea	0x5a827999($xi[0],$e),$e
150	add	$t2,$e
151	xor	$d,$t0
152	rol	\$30,$b
153	add	$t0,$e
154___
155$code.=<<___ if ($i>=15);
156	xor	`4*($j%16)`(%rsp),$xi[1]
157	mov	$d,$t0
158	mov	$xi[0],`4*($i%16)`(%rsp)
159	mov	$a,$t2
160	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
161	xor	$c,$t0
162	rol	\$5,$t2
163	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
164	and	$b,$t0
165	lea	0x5a827999($xi[0],$e),$e
166	rol	\$30,$b
167	xor	$d,$t0
168	add	$t2,$e
169	rol	\$1,$xi[1]
170	add	$t0,$e
171___
172push(@xi,shift(@xi));
173}
174
175sub BODY_20_39 {
176my ($i,$a,$b,$c,$d,$e)=@_;
177my $j=$i+1;
178my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
179$code.=<<___ if ($i<79);
180	xor	`4*($j%16)`(%rsp),$xi[1]
181	mov	$b,$t0
182	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
183	mov	$a,$t2
184	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
185	xor	$d,$t0
186	rol	\$5,$t2
187	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
188	lea	$K($xi[0],$e),$e
189	xor	$c,$t0
190	add	$t2,$e
191	rol	\$30,$b
192	add	$t0,$e
193	rol	\$1,$xi[1]
194___
195$code.=<<___ if ($i==79);
196	mov	$b,$t0
197	mov	$a,$t2
198	xor	$d,$t0
199	lea	$K($xi[0],$e),$e
200	rol	\$5,$t2
201	xor	$c,$t0
202	add	$t2,$e
203	rol	\$30,$b
204	add	$t0,$e
205___
206push(@xi,shift(@xi));
207}
208
209sub BODY_40_59 {
210my ($i,$a,$b,$c,$d,$e)=@_;
211my $j=$i+1;
212$code.=<<___;
213	xor	`4*($j%16)`(%rsp),$xi[1]
214	mov	$d,$t0
215	mov	$xi[0],`4*($i%16)`(%rsp)
216	mov	$d,$t1
217	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
218	and	$c,$t0
219	mov	$a,$t2
220	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
221	lea	0x8f1bbcdc($xi[0],$e),$e
222	xor	$c,$t1
223	rol	\$5,$t2
224	add	$t0,$e
225	rol	\$1,$xi[1]
226	and	$b,$t1
227	add	$t2,$e
228	rol	\$30,$b
229	add	$t1,$e
230___
231push(@xi,shift(@xi));
232}
233
234$code.=<<___;
235.text
236.extern	OPENSSL_ia32cap_P
237
238.globl	sha1_block_data_order
239.type	sha1_block_data_order,\@function,3
240.align	16
241sha1_block_data_order:
242	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
243	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
244	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
245	test	\$`1<<9`,%r8d		# check SSSE3 bit
246	jz	.Lialu
247___
248$code.=<<___ if ($shaext);
249	test	\$`1<<29`,%r10d		# check SHA bit
250	jnz	_shaext_shortcut
251___
252$code.=<<___ if ($avx>1);
253	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
254	cmp	\$`1<<3|1<<5|1<<8`,%r10d
255	je	_avx2_shortcut
256___
257$code.=<<___ if ($avx);
258	and	\$`1<<28`,%r8d		# mask AVX bit
259	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
260	or	%r9d,%r8d
261	cmp	\$`1<<28|1<<30`,%r8d
262	je	_avx_shortcut
263___
264$code.=<<___;
265	jmp	_ssse3_shortcut
266
267.align	16
268.Lialu:
269	mov	%rsp,%rax
270	push	%rbx
271	push	%rbp
272	push	%r12
273	push	%r13
274	push	%r14
275	mov	%rdi,$ctx	# reassigned argument
276	sub	\$`8+16*4`,%rsp
277	mov	%rsi,$inp	# reassigned argument
278	and	\$-64,%rsp
279	mov	%rdx,$num	# reassigned argument
280	mov	%rax,`16*4`(%rsp)
281.Lprologue:
282
283	mov	0($ctx),$A
284	mov	4($ctx),$B
285	mov	8($ctx),$C
286	mov	12($ctx),$D
287	mov	16($ctx),$E
288	jmp	.Lloop
289
290.align	16
291.Lloop:
292___
293for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
294for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
295for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
296for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
297$code.=<<___;
298	add	0($ctx),$A
299	add	4($ctx),$B
300	add	8($ctx),$C
301	add	12($ctx),$D
302	add	16($ctx),$E
303	mov	$A,0($ctx)
304	mov	$B,4($ctx)
305	mov	$C,8($ctx)
306	mov	$D,12($ctx)
307	mov	$E,16($ctx)
308
309	sub	\$1,$num
310	lea	`16*4`($inp),$inp
311	jnz	.Lloop
312
313	mov	`16*4`(%rsp),%rsi
314	mov	-40(%rsi),%r14
315	mov	-32(%rsi),%r13
316	mov	-24(%rsi),%r12
317	mov	-16(%rsi),%rbp
318	mov	-8(%rsi),%rbx
319	lea	(%rsi),%rsp
320.Lepilogue:
321	ret
322.size	sha1_block_data_order,.-sha1_block_data_order
323___
324if ($shaext) {{{
325######################################################################
326# Intel SHA Extensions implementation of SHA1 update function.
327#
328my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
329my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
330my @MSG=map("%xmm$_",(4..7));
331
332$code.=<<___;
333.type	sha1_block_data_order_shaext,\@function,3
334.align	32
335sha1_block_data_order_shaext:
336_shaext_shortcut:
337___
338$code.=<<___ if ($win64);
339	lea	`-8-4*16`(%rsp),%rsp
340	movaps	%xmm6,-8-4*16(%rax)
341	movaps	%xmm7,-8-3*16(%rax)
342	movaps	%xmm8,-8-2*16(%rax)
343	movaps	%xmm9,-8-1*16(%rax)
344.Lprologue_shaext:
345___
346$code.=<<___;
347	movdqu	($ctx),$ABCD
348	movd	16($ctx),$E
349	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
350
351	movdqu	($inp),@MSG[0]
352	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
353	movdqu	0x10($inp),@MSG[1]
354	pshufd	\$0b00011011,$E,$E		# flip word order
355	movdqu	0x20($inp),@MSG[2]
356	pshufb	$BSWAP,@MSG[0]
357	movdqu	0x30($inp),@MSG[3]
358	pshufb	$BSWAP,@MSG[1]
359	pshufb	$BSWAP,@MSG[2]
360	movdqa	$E,$E_SAVE			# offload $E
361	pshufb	$BSWAP,@MSG[3]
362	jmp	.Loop_shaext
363
364.align	16
365.Loop_shaext:
366	dec		$num
367	lea		0x40($inp),%rax		# next input block
368	paddd		@MSG[0],$E
369	cmovne		%rax,$inp
370	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
371___
372for($i=0;$i<20-4;$i+=2) {
373$code.=<<___;
374	sha1msg1	@MSG[1],@MSG[0]
375	movdqa		$ABCD,$E_
376	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
377	sha1nexte	@MSG[1],$E_
378	pxor		@MSG[2],@MSG[0]
379	sha1msg1	@MSG[2],@MSG[1]
380	sha1msg2	@MSG[3],@MSG[0]
381
382	movdqa		$ABCD,$E
383	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
384	sha1nexte	@MSG[2],$E
385	pxor		@MSG[3],@MSG[1]
386	sha1msg2	@MSG[0],@MSG[1]
387___
388	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
389}
390$code.=<<___;
391	movdqu		($inp),@MSG[0]
392	movdqa		$ABCD,$E_
393	sha1rnds4	\$3,$E,$ABCD		# 64-67
394	sha1nexte	@MSG[1],$E_
395	movdqu		0x10($inp),@MSG[1]
396	pshufb		$BSWAP,@MSG[0]
397
398	movdqa		$ABCD,$E
399	sha1rnds4	\$3,$E_,$ABCD		# 68-71
400	sha1nexte	@MSG[2],$E
401	movdqu		0x20($inp),@MSG[2]
402	pshufb		$BSWAP,@MSG[1]
403
404	movdqa		$ABCD,$E_
405	sha1rnds4	\$3,$E,$ABCD		# 72-75
406	sha1nexte	@MSG[3],$E_
407	movdqu		0x30($inp),@MSG[3]
408	pshufb		$BSWAP,@MSG[2]
409
410	movdqa		$ABCD,$E
411	sha1rnds4	\$3,$E_,$ABCD		# 76-79
412	sha1nexte	$E_SAVE,$E
413	pshufb		$BSWAP,@MSG[3]
414
415	paddd		$ABCD_SAVE,$ABCD
416	movdqa		$E,$E_SAVE		# offload $E
417
418	jnz		.Loop_shaext
419
420	pshufd	\$0b00011011,$ABCD,$ABCD
421	pshufd	\$0b00011011,$E,$E
422	movdqu	$ABCD,($ctx)
423	movd	$E,16($ctx)
424___
425$code.=<<___ if ($win64);
426	movaps	-8-4*16(%rax),%xmm6
427	movaps	-8-3*16(%rax),%xmm7
428	movaps	-8-2*16(%rax),%xmm8
429	movaps	-8-1*16(%rax),%xmm9
430	mov	%rax,%rsp
431.Lepilogue_shaext:
432___
433$code.=<<___;
434	ret
435.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
436___
437}}}
438{{{
439my $Xi=4;
440my @X=map("%xmm$_",(4..7,0..3));
441my @Tx=map("%xmm$_",(8..10));
442my $Kx="%xmm11";
443my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
444my @T=("%esi","%edi");
445my $j=0;
446my $rx=0;
447my $K_XX_XX="%r11";
448
449my $_rol=sub { &rol(@_) };
450my $_ror=sub { &ror(@_) };
451
452{ my $sn;
453sub align32() {
454  ++$sn;
455$code.=<<___;
456	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
457.align	32
458.Lalign32_$sn:
459___
460}
461}
462
463$code.=<<___;
464.type	sha1_block_data_order_ssse3,\@function,3
465.align	16
466sha1_block_data_order_ssse3:
467_ssse3_shortcut:
468	mov	%rsp,%rax
469	push	%rbx
470	push	%rbp
471	push	%r12
472	push	%r13		# redundant, done to share Win64 SE handler
473	push	%r14
474	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
475___
476$code.=<<___ if ($win64);
477	movaps	%xmm6,-40-6*16(%rax)
478	movaps	%xmm7,-40-5*16(%rax)
479	movaps	%xmm8,-40-4*16(%rax)
480	movaps	%xmm9,-40-3*16(%rax)
481	movaps	%xmm10,-40-2*16(%rax)
482	movaps	%xmm11,-40-1*16(%rax)
483.Lprologue_ssse3:
484___
485$code.=<<___;
486	mov	%rax,%r14	# original %rsp
487	and	\$-64,%rsp
488	mov	%rdi,$ctx	# reassigned argument
489	mov	%rsi,$inp	# reassigned argument
490	mov	%rdx,$num	# reassigned argument
491
492	shl	\$6,$num
493	add	$inp,$num
494	lea	K_XX_XX+64(%rip),$K_XX_XX
495
496	mov	0($ctx),$A		# load context
497	mov	4($ctx),$B
498	mov	8($ctx),$C
499	mov	12($ctx),$D
500	mov	$B,@T[0]		# magic seed
501	mov	16($ctx),$E
502	mov	$C,@T[1]
503	xor	$D,@T[1]
504	and	@T[1],@T[0]
505
506	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
507	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
508	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
509	movdqu	16($inp),@X[-3&7]
510	movdqu	32($inp),@X[-2&7]
511	movdqu	48($inp),@X[-1&7]
512	pshufb	@X[2],@X[-4&7]		# byte swap
513	pshufb	@X[2],@X[-3&7]
514	pshufb	@X[2],@X[-2&7]
515	add	\$64,$inp
516	paddd	@Tx[1],@X[-4&7]		# add K_00_19
517	pshufb	@X[2],@X[-1&7]
518	paddd	@Tx[1],@X[-3&7]
519	paddd	@Tx[1],@X[-2&7]
520	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
521	psubd	@Tx[1],@X[-4&7]		# restore X[]
522	movdqa	@X[-3&7],16(%rsp)
523	psubd	@Tx[1],@X[-3&7]
524	movdqa	@X[-2&7],32(%rsp)
525	psubd	@Tx[1],@X[-2&7]
526	jmp	.Loop_ssse3
527___
528
529sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
530{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
531  my $arg = pop;
532    $arg = "\$$arg" if ($arg*1 eq $arg);
533    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
534}
535
536sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
537{ use integer;
538  my $body = shift;
539  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
540  my ($a,$b,$c,$d,$e);
541
542	 eval(shift(@insns));		# ror
543	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
544	 eval(shift(@insns));
545	&movdqa	(@Tx[0],@X[-1&7]);
546	  &paddd	(@Tx[1],@X[-1&7]);
547	 eval(shift(@insns));
548	 eval(shift(@insns));
549
550	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
551	 eval(shift(@insns));
552	 eval(shift(@insns));		# rol
553	 eval(shift(@insns));
554	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
555	 eval(shift(@insns));
556	 eval(shift(@insns));
557
558	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
559	 eval(shift(@insns));
560	 eval(shift(@insns));		# ror
561	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
562	 eval(shift(@insns));
563	 eval(shift(@insns));
564	 eval(shift(@insns));
565
566	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
567	 eval(shift(@insns));
568	 eval(shift(@insns));		# rol
569	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
570	 eval(shift(@insns));
571	 eval(shift(@insns));
572
573	&movdqa	(@Tx[2],@X[0]);
574	 eval(shift(@insns));
575	 eval(shift(@insns));
576	 eval(shift(@insns));		# ror
577	&movdqa	(@Tx[0],@X[0]);
578	 eval(shift(@insns));
579
580	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
581	&paddd	(@X[0],@X[0]);
582	 eval(shift(@insns));
583	 eval(shift(@insns));
584
585	&psrld	(@Tx[0],31);
586	 eval(shift(@insns));
587	 eval(shift(@insns));		# rol
588	 eval(shift(@insns));
589	&movdqa	(@Tx[1],@Tx[2]);
590	 eval(shift(@insns));
591	 eval(shift(@insns));
592
593	&psrld	(@Tx[2],30);
594	 eval(shift(@insns));
595	 eval(shift(@insns));		# ror
596	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
597	 eval(shift(@insns));
598	 eval(shift(@insns));
599	 eval(shift(@insns));
600
601	&pslld	(@Tx[1],2);
602	&pxor	(@X[0],@Tx[2]);
603	 eval(shift(@insns));
604	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
605	 eval(shift(@insns));		# rol
606	 eval(shift(@insns));
607	 eval(shift(@insns));
608
609	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
610	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
611
612	 foreach (@insns) { eval; }	# remaining instructions [if any]
613
614  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
615		push(@Tx,shift(@Tx));
616}
617
618sub Xupdate_ssse3_32_79()
619{ use integer;
620  my $body = shift;
621  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
622  my ($a,$b,$c,$d,$e);
623
624	 eval(shift(@insns))		if ($Xi==8);
625	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
626	 eval(shift(@insns))		if ($Xi==8);
627	 eval(shift(@insns));		# body_20_39
628	 eval(shift(@insns));
629	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
630	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
631	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
632	 eval(shift(@insns));
633	 eval(shift(@insns));		# rol
634
635	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
636	 eval(shift(@insns));
637	 eval(shift(@insns));
638	if ($Xi%5) {
639	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
640	} else {			# ... or load next one
641	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
642	}
643	 eval(shift(@insns));		# ror
644	  &paddd	(@Tx[1],@X[-1&7]);
645	 eval(shift(@insns));
646
647	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
648	 eval(shift(@insns));		# body_20_39
649	 eval(shift(@insns));
650	 eval(shift(@insns));
651	 eval(shift(@insns));		# rol
652	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
653
654	&movdqa	(@Tx[0],@X[0]);
655	 eval(shift(@insns));
656	 eval(shift(@insns));
657	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
658	 eval(shift(@insns));		# ror
659	 eval(shift(@insns));
660	 eval(shift(@insns));		# body_20_39
661
662	&pslld	(@X[0],2);
663	 eval(shift(@insns));
664	 eval(shift(@insns));
665	&psrld	(@Tx[0],30);
666	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
667	 eval(shift(@insns));
668	 eval(shift(@insns));
669	 eval(shift(@insns));		# ror
670
671	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
672	 eval(shift(@insns));
673	 eval(shift(@insns));		# body_20_39
674	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
675	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
676	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
677	 eval(shift(@insns));
678	 eval(shift(@insns));		# rol
679	 eval(shift(@insns));
680	 eval(shift(@insns));
681	 eval(shift(@insns));		# rol
682	 eval(shift(@insns));
683
684	 foreach (@insns) { eval; }	# remaining instructions
685
686  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
687		push(@Tx,shift(@Tx));
688}
689
690sub Xuplast_ssse3_80()
691{ use integer;
692  my $body = shift;
693  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
694  my ($a,$b,$c,$d,$e);
695
696	 eval(shift(@insns));
697	 eval(shift(@insns));
698	 eval(shift(@insns));
699	 eval(shift(@insns));
700	  &paddd	(@Tx[1],@X[-1&7]);
701	 eval(shift(@insns));
702	 eval(shift(@insns));
703
704	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
705
706	 foreach (@insns) { eval; }		# remaining instructions
707
708	&cmp	($inp,$num);
709	&je	(".Ldone_ssse3");
710
711	unshift(@Tx,pop(@Tx));
712
713	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
714	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
715	&movdqu	(@X[-4&7],"0($inp)");		# load input
716	&movdqu	(@X[-3&7],"16($inp)");
717	&movdqu	(@X[-2&7],"32($inp)");
718	&movdqu	(@X[-1&7],"48($inp)");
719	&pshufb	(@X[-4&7],@X[2]);		# byte swap
720	&add	($inp,64);
721
722  $Xi=0;
723}
724
725sub Xloop_ssse3()
726{ use integer;
727  my $body = shift;
728  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
729  my ($a,$b,$c,$d,$e);
730
731	 eval(shift(@insns));
732	 eval(shift(@insns));
733	 eval(shift(@insns));
734	&pshufb	(@X[($Xi-3)&7],@X[2]);
735	 eval(shift(@insns));
736	 eval(shift(@insns));
737	 eval(shift(@insns));
738	 eval(shift(@insns));
739	&paddd	(@X[($Xi-4)&7],@Tx[1]);
740	 eval(shift(@insns));
741	 eval(shift(@insns));
742	 eval(shift(@insns));
743	 eval(shift(@insns));
744	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
745	 eval(shift(@insns));
746	 eval(shift(@insns));
747	 eval(shift(@insns));
748	 eval(shift(@insns));
749	&psubd	(@X[($Xi-4)&7],@Tx[1]);
750
751	foreach (@insns) { eval; }
752  $Xi++;
753}
754
755sub Xtail_ssse3()
756{ use integer;
757  my $body = shift;
758  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
759  my ($a,$b,$c,$d,$e);
760
761	foreach (@insns) { eval; }
762}
763
764sub body_00_19 () {	# ((c^d)&b)^d
765	# on start @T[0]=(c^d)&b
766	return &body_20_39() if ($rx==19); $rx++;
767	(
768	'($a,$b,$c,$d,$e)=@V;'.
769	'&$_ror	($b,$j?7:2)',	# $b>>>2
770	'&xor	(@T[0],$d)',
771	'&mov	(@T[1],$a)',	# $b for next round
772
773	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
774	'&xor	($b,$c)',	# $c^$d for next round
775
776	'&$_rol	($a,5)',
777	'&add	($e,@T[0])',
778	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
779
780	'&xor	($b,$c)',	# restore $b
781	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
782	);
783}
784
785sub body_20_39 () {	# b^d^c
786	# on entry @T[0]=b^d
787	return &body_40_59() if ($rx==39); $rx++;
788	(
789	'($a,$b,$c,$d,$e)=@V;'.
790	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
791	'&xor	(@T[0],$d)	if($j==19);'.
792	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
793	'&mov	(@T[1],$a)',	# $b for next round
794
795	'&$_rol	($a,5)',
796	'&add	($e,@T[0])',
797	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
798
799	'&$_ror	($b,7)',	# $b>>>2
800	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
801	);
802}
803
804sub body_40_59 () {	# ((b^c)&(c^d))^c
805	# on entry @T[0]=(b^c), (c^=d)
806	$rx++;
807	(
808	'($a,$b,$c,$d,$e)=@V;'.
809	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
810	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
811	'&xor	($c,$d)		if ($j>=40)',	# restore $c
812
813	'&$_ror	($b,7)',	# $b>>>2
814	'&mov	(@T[1],$a)',	# $b for next round
815	'&xor	(@T[0],$c)',
816
817	'&$_rol	($a,5)',
818	'&add	($e,@T[0])',
819	'&xor	(@T[1],$c)	if ($j==59);'.
820	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
821
822	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
823	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
824	);
825}
826$code.=<<___;
827.align	16
828.Loop_ssse3:
829___
830	&Xupdate_ssse3_16_31(\&body_00_19);
831	&Xupdate_ssse3_16_31(\&body_00_19);
832	&Xupdate_ssse3_16_31(\&body_00_19);
833	&Xupdate_ssse3_16_31(\&body_00_19);
834	&Xupdate_ssse3_32_79(\&body_00_19);
835	&Xupdate_ssse3_32_79(\&body_20_39);
836	&Xupdate_ssse3_32_79(\&body_20_39);
837	&Xupdate_ssse3_32_79(\&body_20_39);
838	&Xupdate_ssse3_32_79(\&body_20_39);
839	&Xupdate_ssse3_32_79(\&body_20_39);
840	&Xupdate_ssse3_32_79(\&body_40_59);
841	&Xupdate_ssse3_32_79(\&body_40_59);
842	&Xupdate_ssse3_32_79(\&body_40_59);
843	&Xupdate_ssse3_32_79(\&body_40_59);
844	&Xupdate_ssse3_32_79(\&body_40_59);
845	&Xupdate_ssse3_32_79(\&body_20_39);
846	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
847
848				$saved_j=$j; @saved_V=@V;
849
850	&Xloop_ssse3(\&body_20_39);
851	&Xloop_ssse3(\&body_20_39);
852	&Xloop_ssse3(\&body_20_39);
853
854$code.=<<___;
855	add	0($ctx),$A			# update context
856	add	4($ctx),@T[0]
857	add	8($ctx),$C
858	add	12($ctx),$D
859	mov	$A,0($ctx)
860	add	16($ctx),$E
861	mov	@T[0],4($ctx)
862	mov	@T[0],$B			# magic seed
863	mov	$C,8($ctx)
864	mov	$C,@T[1]
865	mov	$D,12($ctx)
866	xor	$D,@T[1]
867	mov	$E,16($ctx)
868	and	@T[1],@T[0]
869	jmp	.Loop_ssse3
870
871.align	16
872.Ldone_ssse3:
873___
874				$j=$saved_j; @V=@saved_V;
875
876	&Xtail_ssse3(\&body_20_39);
877	&Xtail_ssse3(\&body_20_39);
878	&Xtail_ssse3(\&body_20_39);
879
880$code.=<<___;
881	add	0($ctx),$A			# update context
882	add	4($ctx),@T[0]
883	add	8($ctx),$C
884	mov	$A,0($ctx)
885	add	12($ctx),$D
886	mov	@T[0],4($ctx)
887	add	16($ctx),$E
888	mov	$C,8($ctx)
889	mov	$D,12($ctx)
890	mov	$E,16($ctx)
891___
892$code.=<<___ if ($win64);
893	movaps	-40-6*16(%r14),%xmm6
894	movaps	-40-5*16(%r14),%xmm7
895	movaps	-40-4*16(%r14),%xmm8
896	movaps	-40-3*16(%r14),%xmm9
897	movaps	-40-2*16(%r14),%xmm10
898	movaps	-40-1*16(%r14),%xmm11
899___
900$code.=<<___;
901	lea	(%r14),%rsi
902	mov	-40(%rsi),%r14
903	mov	-32(%rsi),%r13
904	mov	-24(%rsi),%r12
905	mov	-16(%rsi),%rbp
906	mov	-8(%rsi),%rbx
907	lea	(%rsi),%rsp
908.Lepilogue_ssse3:
909	ret
910.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
911___
912
913if ($avx) {
914$Xi=4;				# reset variables
915@X=map("%xmm$_",(4..7,0..3));
916@Tx=map("%xmm$_",(8..10));
917$j=0;
918$rx=0;
919
920my $done_avx_label=".Ldone_avx";
921
922my $_rol=sub { &shld(@_[0],@_) };
923my $_ror=sub { &shrd(@_[0],@_) };
924
925$code.=<<___;
926.type	sha1_block_data_order_avx,\@function,3
927.align	16
928sha1_block_data_order_avx:
929_avx_shortcut:
930	mov	%rsp,%rax
931	push	%rbx
932	push	%rbp
933	push	%r12
934	push	%r13		# redundant, done to share Win64 SE handler
935	push	%r14
936	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
937	vzeroupper
938___
939$code.=<<___ if ($win64);
940	vmovaps	%xmm6,-40-6*16(%rax)
941	vmovaps	%xmm7,-40-5*16(%rax)
942	vmovaps	%xmm8,-40-4*16(%rax)
943	vmovaps	%xmm9,-40-3*16(%rax)
944	vmovaps	%xmm10,-40-2*16(%rax)
945	vmovaps	%xmm11,-40-1*16(%rax)
946.Lprologue_avx:
947___
948$code.=<<___;
949	mov	%rax,%r14	# original %rsp
950	and	\$-64,%rsp
951	mov	%rdi,$ctx	# reassigned argument
952	mov	%rsi,$inp	# reassigned argument
953	mov	%rdx,$num	# reassigned argument
954
955	shl	\$6,$num
956	add	$inp,$num
957	lea	K_XX_XX+64(%rip),$K_XX_XX
958
959	mov	0($ctx),$A		# load context
960	mov	4($ctx),$B
961	mov	8($ctx),$C
962	mov	12($ctx),$D
963	mov	$B,@T[0]		# magic seed
964	mov	16($ctx),$E
965	mov	$C,@T[1]
966	xor	$D,@T[1]
967	and	@T[1],@T[0]
968
969	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
970	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
971	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
972	vmovdqu	16($inp),@X[-3&7]
973	vmovdqu	32($inp),@X[-2&7]
974	vmovdqu	48($inp),@X[-1&7]
975	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
976	add	\$64,$inp
977	vpshufb	@X[2],@X[-3&7],@X[-3&7]
978	vpshufb	@X[2],@X[-2&7],@X[-2&7]
979	vpshufb	@X[2],@X[-1&7],@X[-1&7]
980	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
981	vpaddd	$Kx,@X[-3&7],@X[1]
982	vpaddd	$Kx,@X[-2&7],@X[2]
983	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
984	vmovdqa	@X[1],16(%rsp)
985	vmovdqa	@X[2],32(%rsp)
986	jmp	.Loop_avx
987___
988
989sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
990{ use integer;
991  my $body = shift;
992  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
993  my ($a,$b,$c,$d,$e);
994
995	 eval(shift(@insns));
996	 eval(shift(@insns));
997	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
998	 eval(shift(@insns));
999	 eval(shift(@insns));
1000
1001	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1002	 eval(shift(@insns));
1003	 eval(shift(@insns));
1004	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1005	 eval(shift(@insns));
1006	 eval(shift(@insns));
1007	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1008	 eval(shift(@insns));
1009	 eval(shift(@insns));
1010
1011	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1012	 eval(shift(@insns));
1013	 eval(shift(@insns));
1014	 eval(shift(@insns));
1015	 eval(shift(@insns));
1016
1017	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1018	 eval(shift(@insns));
1019	 eval(shift(@insns));
1020	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1021	 eval(shift(@insns));
1022	 eval(shift(@insns));
1023
1024	&vpsrld	(@Tx[0],@X[0],31);
1025	 eval(shift(@insns));
1026	 eval(shift(@insns));
1027	 eval(shift(@insns));
1028	 eval(shift(@insns));
1029
1030	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1031	&vpaddd	(@X[0],@X[0],@X[0]);
1032	 eval(shift(@insns));
1033	 eval(shift(@insns));
1034	 eval(shift(@insns));
1035	 eval(shift(@insns));
1036
1037	&vpsrld	(@Tx[1],@Tx[2],30);
1038	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1039	 eval(shift(@insns));
1040	 eval(shift(@insns));
1041	 eval(shift(@insns));
1042	 eval(shift(@insns));
1043
1044	&vpslld	(@Tx[2],@Tx[2],2);
1045	&vpxor	(@X[0],@X[0],@Tx[1]);
1046	 eval(shift(@insns));
1047	 eval(shift(@insns));
1048	 eval(shift(@insns));
1049	 eval(shift(@insns));
1050
1051	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1052	 eval(shift(@insns));
1053	 eval(shift(@insns));
1054	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1055	 eval(shift(@insns));
1056	 eval(shift(@insns));
1057
1058
1059	 foreach (@insns) { eval; }	# remaining instructions [if any]
1060
1061  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1062}
1063
1064sub Xupdate_avx_32_79()
1065{ use integer;
1066  my $body = shift;
1067  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
1068  my ($a,$b,$c,$d,$e);
1069
1070	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1071	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1072	 eval(shift(@insns));		# body_20_39
1073	 eval(shift(@insns));
1074	 eval(shift(@insns));
1075	 eval(shift(@insns));		# rol
1076
1077	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1078	 eval(shift(@insns));
1079	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
1080	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1081	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1082	 eval(shift(@insns));		# ror
1083	 eval(shift(@insns));
1084
1085	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1086	 eval(shift(@insns));		# body_20_39
1087	 eval(shift(@insns));
1088	 eval(shift(@insns));
1089	 eval(shift(@insns));		# rol
1090
1091	&vpsrld	(@Tx[0],@X[0],30);
1092	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1093	 eval(shift(@insns));
1094	 eval(shift(@insns));
1095	 eval(shift(@insns));		# ror
1096	 eval(shift(@insns));
1097
1098	&vpslld	(@X[0],@X[0],2);
1099	 eval(shift(@insns));		# body_20_39
1100	 eval(shift(@insns));
1101	 eval(shift(@insns));
1102	 eval(shift(@insns));		# rol
1103	 eval(shift(@insns));
1104	 eval(shift(@insns));
1105	 eval(shift(@insns));		# ror
1106	 eval(shift(@insns));
1107
1108	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1109	 eval(shift(@insns));		# body_20_39
1110	 eval(shift(@insns));
1111	 eval(shift(@insns));
1112	 eval(shift(@insns));		# rol
1113	 eval(shift(@insns));
1114	 eval(shift(@insns));
1115	 eval(shift(@insns));		# rol
1116	 eval(shift(@insns));
1117
1118	 foreach (@insns) { eval; }	# remaining instructions
1119
1120  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1121}
1122
1123sub Xuplast_avx_80()
1124{ use integer;
1125  my $body = shift;
1126  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
1127  my ($a,$b,$c,$d,$e);
1128
1129	 eval(shift(@insns));
1130	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1131	 eval(shift(@insns));
1132	 eval(shift(@insns));
1133	 eval(shift(@insns));
1134	 eval(shift(@insns));
1135
1136	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
1137
1138	 foreach (@insns) { eval; }		# remaining instructions
1139
1140	&cmp	($inp,$num);
1141	&je	($done_avx_label);
1142
1143	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
1144	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
1145	&vmovdqu(@X[-4&7],"0($inp)");		# load input
1146	&vmovdqu(@X[-3&7],"16($inp)");
1147	&vmovdqu(@X[-2&7],"32($inp)");
1148	&vmovdqu(@X[-1&7],"48($inp)");
1149	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1150	&add	($inp,64);
1151
1152  $Xi=0;
1153}
1154
1155sub Xloop_avx()
1156{ use integer;
1157  my $body = shift;
1158  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
1159  my ($a,$b,$c,$d,$e);
1160
1161	 eval(shift(@insns));
1162	 eval(shift(@insns));
1163	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1164	 eval(shift(@insns));
1165	 eval(shift(@insns));
1166	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
1167	 eval(shift(@insns));
1168	 eval(shift(@insns));
1169	 eval(shift(@insns));
1170	 eval(shift(@insns));
1171	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
1172	 eval(shift(@insns));
1173	 eval(shift(@insns));
1174
1175	foreach (@insns) { eval; }
1176  $Xi++;
1177}
1178
1179sub Xtail_avx()
1180{ use integer;
1181  my $body = shift;
1182  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
1183  my ($a,$b,$c,$d,$e);
1184
1185	foreach (@insns) { eval; }
1186}
1187
1188$code.=<<___;
1189.align	16
1190.Loop_avx:
1191___
1192	&Xupdate_avx_16_31(\&body_00_19);
1193	&Xupdate_avx_16_31(\&body_00_19);
1194	&Xupdate_avx_16_31(\&body_00_19);
1195	&Xupdate_avx_16_31(\&body_00_19);
1196	&Xupdate_avx_32_79(\&body_00_19);
1197	&Xupdate_avx_32_79(\&body_20_39);
1198	&Xupdate_avx_32_79(\&body_20_39);
1199	&Xupdate_avx_32_79(\&body_20_39);
1200	&Xupdate_avx_32_79(\&body_20_39);
1201	&Xupdate_avx_32_79(\&body_20_39);
1202	&Xupdate_avx_32_79(\&body_40_59);
1203	&Xupdate_avx_32_79(\&body_40_59);
1204	&Xupdate_avx_32_79(\&body_40_59);
1205	&Xupdate_avx_32_79(\&body_40_59);
1206	&Xupdate_avx_32_79(\&body_40_59);
1207	&Xupdate_avx_32_79(\&body_20_39);
1208	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1209
1210				$saved_j=$j; @saved_V=@V;
1211
1212	&Xloop_avx(\&body_20_39);
1213	&Xloop_avx(\&body_20_39);
1214	&Xloop_avx(\&body_20_39);
1215
1216$code.=<<___;
1217	add	0($ctx),$A			# update context
1218	add	4($ctx),@T[0]
1219	add	8($ctx),$C
1220	add	12($ctx),$D
1221	mov	$A,0($ctx)
1222	add	16($ctx),$E
1223	mov	@T[0],4($ctx)
1224	mov	@T[0],$B			# magic seed
1225	mov	$C,8($ctx)
1226	mov	$C,@T[1]
1227	mov	$D,12($ctx)
1228	xor	$D,@T[1]
1229	mov	$E,16($ctx)
1230	and	@T[1],@T[0]
1231	jmp	.Loop_avx
1232
1233.align	16
1234$done_avx_label:
1235___
1236				$j=$saved_j; @V=@saved_V;
1237
1238	&Xtail_avx(\&body_20_39);
1239	&Xtail_avx(\&body_20_39);
1240	&Xtail_avx(\&body_20_39);
1241
1242$code.=<<___;
1243	vzeroupper
1244
1245	add	0($ctx),$A			# update context
1246	add	4($ctx),@T[0]
1247	add	8($ctx),$C
1248	mov	$A,0($ctx)
1249	add	12($ctx),$D
1250	mov	@T[0],4($ctx)
1251	add	16($ctx),$E
1252	mov	$C,8($ctx)
1253	mov	$D,12($ctx)
1254	mov	$E,16($ctx)
1255___
1256$code.=<<___ if ($win64);
1257	movaps	-40-6*16(%r14),%xmm6
1258	movaps	-40-5*16(%r14),%xmm7
1259	movaps	-40-4*16(%r14),%xmm8
1260	movaps	-40-3*16(%r14),%xmm9
1261	movaps	-40-2*16(%r14),%xmm10
1262	movaps	-40-1*16(%r14),%xmm11
1263___
1264$code.=<<___;
1265	lea	(%r14),%rsi
1266	mov	-40(%rsi),%r14
1267	mov	-32(%rsi),%r13
1268	mov	-24(%rsi),%r12
1269	mov	-16(%rsi),%rbp
1270	mov	-8(%rsi),%rbx
1271	lea	(%rsi),%rsp
1272.Lepilogue_avx:
1273	ret
1274.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1275___
1276
1277if ($avx>1) {
1278use integer;
1279$Xi=4;					# reset variables
1280@X=map("%ymm$_",(4..7,0..3));
1281@Tx=map("%ymm$_",(8..10));
1282$Kx="%ymm11";
1283$j=0;
1284
1285my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
1286my ($a5,$t0)=("%r12d","%edi");
1287
1288my ($A,$F,$B,$C,$D,$E)=@ROTX;
1289my $rx=0;
1290my $frame="%r13";
1291
1292$code.=<<___;
1293.type	sha1_block_data_order_avx2,\@function,3
1294.align	16
1295sha1_block_data_order_avx2:
1296_avx2_shortcut:
1297	mov	%rsp,%rax
1298	push	%rbx
1299	push	%rbp
1300	push	%r12
1301	push	%r13
1302	push	%r14
1303	vzeroupper
1304___
1305$code.=<<___ if ($win64);
1306	lea	-6*16(%rsp),%rsp
1307	vmovaps	%xmm6,-40-6*16(%rax)
1308	vmovaps	%xmm7,-40-5*16(%rax)
1309	vmovaps	%xmm8,-40-4*16(%rax)
1310	vmovaps	%xmm9,-40-3*16(%rax)
1311	vmovaps	%xmm10,-40-2*16(%rax)
1312	vmovaps	%xmm11,-40-1*16(%rax)
1313.Lprologue_avx2:
1314___
1315$code.=<<___;
1316	mov	%rax,%r14		# original %rsp
1317	mov	%rdi,$ctx		# reassigned argument
1318	mov	%rsi,$inp		# reassigned argument
1319	mov	%rdx,$num		# reassigned argument
1320
1321	lea	-640(%rsp),%rsp
1322	shl	\$6,$num
1323	 lea	64($inp),$frame
1324	and	\$-128,%rsp
1325	add	$inp,$num
1326	lea	K_XX_XX+64(%rip),$K_XX_XX
1327
1328	mov	0($ctx),$A		# load context
1329	 cmp	$num,$frame
1330	 cmovae	$inp,$frame		# next or same block
1331	mov	4($ctx),$F
1332	mov	8($ctx),$C
1333	mov	12($ctx),$D
1334	mov	16($ctx),$E
1335	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
1336
1337	vmovdqu		($inp),%xmm0
1338	vmovdqu		16($inp),%xmm1
1339	vmovdqu		32($inp),%xmm2
1340	vmovdqu		48($inp),%xmm3
1341	lea		64($inp),$inp
1342	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
1343	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1344	vpshufb		@X[2],@X[-4&7],@X[-4&7]
1345	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1346	vpshufb		@X[2],@X[-3&7],@X[-3&7]
1347	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1348	vpshufb		@X[2],@X[-2&7],@X[-2&7]
1349	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
1350	vpshufb		@X[2],@X[-1&7],@X[-1&7]
1351
1352	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
1353	vpaddd	$Kx,@X[-3&7],@X[1]
1354	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
1355	vpaddd	$Kx,@X[-2&7],@X[2]
1356	vmovdqu	@X[1],32(%rsp)
1357	vpaddd	$Kx,@X[-1&7],@X[3]
1358	vmovdqu	@X[2],64(%rsp)
1359	vmovdqu	@X[3],96(%rsp)
1360___
1361for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
1362    use integer;
1363
1364	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1365	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1366	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1367	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1368	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1369	&vpsrld	(@Tx[0],@X[0],31);
1370	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1371	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1372	&vpaddd	(@X[0],@X[0],@X[0]);
1373	&vpsrld	(@Tx[1],@Tx[2],30);
1374	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1375	&vpslld	(@Tx[2],@Tx[2],2);
1376	&vpxor	(@X[0],@X[0],@Tx[1]);
1377	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1378	&vpaddd	(@Tx[1],@X[0],$Kx);
1379	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1380
1381	push(@X,shift(@X));	# "rotate" X[]
1382}
1383$code.=<<___;
1384	lea	128(%rsp),$frame
1385	jmp	.Loop_avx2
1386.align	32
1387.Loop_avx2:
1388	rorx	\$2,$F,$B
1389	andn	$D,$F,$t0
1390	and	$C,$F
1391	xor	$t0,$F
1392___
1393sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
1394	# at start $f=(b&c)^(~b&d), $b>>>=2
1395	return &bodyx_20_39() if ($rx==19); $rx++;
1396	(
1397	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1398
1399	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1400	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1401	'&andn	($t0,$a,$c)',			# ~b&d for next round
1402
1403	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
1404	'&rorx	($a5,$a,27)',			# a<<<5
1405	'&rorx	($f,$a,2)',			# b>>>2 for next round
1406	'&and	($a,$b)',			# b&c for next round
1407
1408	'&add	($e,$a5)',			# e+=a<<<5
1409	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
1410
1411	'unshift(@ROTX,pop(@ROTX)); $j++;'
1412	)
1413}
1414
1415sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
1416	# on entry $f=b^c^d, $b>>>=2
1417	return &bodyx_40_59() if ($rx==39); $rx++;
1418	(
1419	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1420
1421	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1422	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1423
1424	'&lea	($e,"($e,$f)")',		# e+=b^c^d
1425	'&rorx	($a5,$a,27)',			# a<<<5
1426	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
1427	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
1428
1429	'&add	($e,$a5)',			# e+=a<<<5
1430	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
1431
1432	'unshift(@ROTX,pop(@ROTX)); $j++;'
1433	)
1434}
1435
1436sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
1437	# on entry $f=((b^c)&(c^d)), $b>>>=2
1438	$rx++;
1439	(
1440	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1441
1442	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1443	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1444	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
1445	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
1446	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
1447
1448	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
1449	'&rorx	($a5,$a,27)',			# a<<<5
1450	'&rorx	($f,$a,2)',			# b>>>2 in next round
1451	'&xor	($a,$b)',			# b^c for next round
1452
1453	'&add	($e,$a5)',			# e+=a<<<5
1454	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
1455	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
1456
1457	'unshift(@ROTX,pop(@ROTX)); $j++;'
1458	)
1459}
1460
1461sub Xupdate_avx2_16_31()		# recall that $Xi starts wtih 4
1462{ use integer;
1463  my $body = shift;
1464  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
1465  my ($a,$b,$c,$d,$e);
1466
1467	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1468	 eval(shift(@insns));
1469	 eval(shift(@insns));
1470	 eval(shift(@insns));
1471	 eval(shift(@insns));
1472
1473	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1474	 eval(shift(@insns));
1475	 eval(shift(@insns));
1476	 eval(shift(@insns));
1477
1478	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1479	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1480	 eval(shift(@insns));
1481	 eval(shift(@insns));
1482
1483	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1484	 eval(shift(@insns));
1485	 eval(shift(@insns));
1486	 eval(shift(@insns));
1487	 eval(shift(@insns));
1488
1489	&vpsrld	(@Tx[0],@X[0],31);
1490	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1491	 eval(shift(@insns));
1492	 eval(shift(@insns));
1493	 eval(shift(@insns));
1494
1495	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1496	&vpaddd	(@X[0],@X[0],@X[0]);
1497	 eval(shift(@insns));
1498	 eval(shift(@insns));
1499
1500	&vpsrld	(@Tx[1],@Tx[2],30);
1501	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1502	 eval(shift(@insns));
1503	 eval(shift(@insns));
1504
1505	&vpslld	(@Tx[2],@Tx[2],2);
1506	&vpxor	(@X[0],@X[0],@Tx[1]);
1507	 eval(shift(@insns));
1508	 eval(shift(@insns));
1509
1510	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1511	 eval(shift(@insns));
1512	 eval(shift(@insns));
1513	 eval(shift(@insns));
1514
1515	&vpaddd	(@Tx[1],@X[0],$Kx);
1516	 eval(shift(@insns));
1517	 eval(shift(@insns));
1518	 eval(shift(@insns));
1519	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1520
1521	 foreach (@insns) { eval; }	# remaining instructions [if any]
1522
1523	$Xi++;
1524	push(@X,shift(@X));	# "rotate" X[]
1525}
1526
1527sub Xupdate_avx2_32_79()
1528{ use integer;
1529  my $body = shift;
1530  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
1531  my ($a,$b,$c,$d,$e);
1532
1533	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1534	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1535	 eval(shift(@insns));
1536	 eval(shift(@insns));
1537
1538	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1539	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1540	 eval(shift(@insns));
1541	 eval(shift(@insns));
1542	 eval(shift(@insns));
1543
1544	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1545	 eval(shift(@insns));
1546	 eval(shift(@insns));
1547	 eval(shift(@insns));
1548
1549	&vpsrld	(@Tx[0],@X[0],30);
1550	&vpslld	(@X[0],@X[0],2);
1551	 eval(shift(@insns));
1552	 eval(shift(@insns));
1553	 eval(shift(@insns));
1554
1555	#&vpslld	(@X[0],@X[0],2);
1556	 eval(shift(@insns));
1557	 eval(shift(@insns));
1558	 eval(shift(@insns));
1559
1560	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1561	 eval(shift(@insns));
1562	 eval(shift(@insns));
1563	 eval(shift(@insns));
1564	 eval(shift(@insns));
1565
1566	&vpaddd	(@Tx[1],@X[0],$Kx);
1567	 eval(shift(@insns));
1568	 eval(shift(@insns));
1569	 eval(shift(@insns));
1570	 eval(shift(@insns));
1571
1572	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1573
1574	 foreach (@insns) { eval; }	# remaining instructions
1575
1576	$Xi++;
1577	push(@X,shift(@X));	# "rotate" X[]
1578}
1579
1580sub Xloop_avx2()
1581{ use integer;
1582  my $body = shift;
1583  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
1584  my ($a,$b,$c,$d,$e);
1585
1586	 foreach (@insns) { eval; }
1587}
1588
1589	&align32();
1590	&Xupdate_avx2_32_79(\&bodyx_00_19);
1591	&Xupdate_avx2_32_79(\&bodyx_00_19);
1592	&Xupdate_avx2_32_79(\&bodyx_00_19);
1593	&Xupdate_avx2_32_79(\&bodyx_00_19);
1594
1595	&Xupdate_avx2_32_79(\&bodyx_20_39);
1596	&Xupdate_avx2_32_79(\&bodyx_20_39);
1597	&Xupdate_avx2_32_79(\&bodyx_20_39);
1598	&Xupdate_avx2_32_79(\&bodyx_20_39);
1599
1600	&align32();
1601	&Xupdate_avx2_32_79(\&bodyx_40_59);
1602	&Xupdate_avx2_32_79(\&bodyx_40_59);
1603	&Xupdate_avx2_32_79(\&bodyx_40_59);
1604	&Xupdate_avx2_32_79(\&bodyx_40_59);
1605
1606	&Xloop_avx2(\&bodyx_20_39);
1607	&Xloop_avx2(\&bodyx_20_39);
1608	&Xloop_avx2(\&bodyx_20_39);
1609	&Xloop_avx2(\&bodyx_20_39);
1610
1611$code.=<<___;
1612	lea	128($inp),$frame
1613	lea	128($inp),%rdi			# borrow $t0
1614	cmp	$num,$frame
1615	cmovae	$inp,$frame			# next or previous block
1616
1617	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1618	add	0($ctx),@ROTX[0]		# update context
1619	add	4($ctx),@ROTX[1]
1620	add	8($ctx),@ROTX[3]
1621	mov	@ROTX[0],0($ctx)
1622	add	12($ctx),@ROTX[4]
1623	mov	@ROTX[1],4($ctx)
1624	 mov	@ROTX[0],$A			# A=d
1625	add	16($ctx),@ROTX[5]
1626	 mov	@ROTX[3],$a5
1627	mov	@ROTX[3],8($ctx)
1628	 mov	@ROTX[4],$D			# D=b
1629	 #xchg	@ROTX[5],$F			# F=c, C=f
1630	mov	@ROTX[4],12($ctx)
1631	 mov	@ROTX[1],$F			# F=e
1632	mov	@ROTX[5],16($ctx)
1633	#mov	$F,16($ctx)
1634	 mov	@ROTX[5],$E			# E=c
1635	 mov	$a5,$C				# C=f
1636	 #xchg	$F,$E				# E=c, F=e
1637
1638	cmp	$num,$inp
1639	je	.Ldone_avx2
1640___
1641
1642$Xi=4;				# reset variables
1643@X=map("%ymm$_",(4..7,0..3));
1644
1645$code.=<<___;
1646	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
1647	cmp	$num,%rdi			# borrowed $t0
1648	ja	.Last_avx2
1649
1650	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
1651	vmovdqu		-48(%rdi),%xmm1
1652	vmovdqu		-32(%rdi),%xmm2
1653	vmovdqu		-16(%rdi),%xmm3
1654	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
1655	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1656	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1657	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1658	jmp	.Last_avx2
1659
1660.align	32
1661.Last_avx2:
1662	lea	128+16(%rsp),$frame
1663	rorx	\$2,$F,$B
1664	andn	$D,$F,$t0
1665	and	$C,$F
1666	xor	$t0,$F
1667	sub	\$-128,$inp
1668___
1669	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
1670
1671	&Xloop_avx2	(\&bodyx_00_19);
1672	&Xloop_avx2	(\&bodyx_00_19);
1673	&Xloop_avx2	(\&bodyx_00_19);
1674	&Xloop_avx2	(\&bodyx_00_19);
1675
1676	&Xloop_avx2	(\&bodyx_20_39);
1677	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
1678	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1679	&Xloop_avx2	(\&bodyx_20_39);
1680	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
1681	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
1682	&Xloop_avx2	(\&bodyx_20_39);
1683	  &vmovdqu	("0(%rsp)",@Tx[0]);
1684	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
1685	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
1686	&Xloop_avx2	(\&bodyx_20_39);
1687	  &vmovdqu	("32(%rsp)",@Tx[1]);
1688	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
1689	  &vpaddd	(@X[2],@X[-2&7],$Kx);
1690
1691	&Xloop_avx2	(\&bodyx_40_59);
1692	&align32	();
1693	  &vmovdqu	("64(%rsp)",@X[2]);
1694	  &vpaddd	(@X[3],@X[-1&7],$Kx);
1695	&Xloop_avx2	(\&bodyx_40_59);
1696	  &vmovdqu	("96(%rsp)",@X[3]);
1697	&Xloop_avx2	(\&bodyx_40_59);
1698	&Xupdate_avx2_16_31(\&bodyx_40_59);
1699
1700	&Xupdate_avx2_16_31(\&bodyx_20_39);
1701	&Xupdate_avx2_16_31(\&bodyx_20_39);
1702	&Xupdate_avx2_16_31(\&bodyx_20_39);
1703	&Xloop_avx2	(\&bodyx_20_39);
1704
1705$code.=<<___;
1706	lea	128(%rsp),$frame
1707
1708	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1709	add	0($ctx),@ROTX[0]		# update context
1710	add	4($ctx),@ROTX[1]
1711	add	8($ctx),@ROTX[3]
1712	mov	@ROTX[0],0($ctx)
1713	add	12($ctx),@ROTX[4]
1714	mov	@ROTX[1],4($ctx)
1715	 mov	@ROTX[0],$A			# A=d
1716	add	16($ctx),@ROTX[5]
1717	 mov	@ROTX[3],$a5
1718	mov	@ROTX[3],8($ctx)
1719	 mov	@ROTX[4],$D			# D=b
1720	 #xchg	@ROTX[5],$F			# F=c, C=f
1721	mov	@ROTX[4],12($ctx)
1722	 mov	@ROTX[1],$F			# F=e
1723	mov	@ROTX[5],16($ctx)
1724	#mov	$F,16($ctx)
1725	 mov	@ROTX[5],$E			# E=c
1726	 mov	$a5,$C				# C=f
1727	 #xchg	$F,$E				# E=c, F=e
1728
1729	cmp	$num,$inp
1730	jbe	.Loop_avx2
1731
1732.Ldone_avx2:
1733	vzeroupper
1734___
1735$code.=<<___ if ($win64);
1736	movaps	-40-6*16(%r14),%xmm6
1737	movaps	-40-5*16(%r14),%xmm7
1738	movaps	-40-4*16(%r14),%xmm8
1739	movaps	-40-3*16(%r14),%xmm9
1740	movaps	-40-2*16(%r14),%xmm10
1741	movaps	-40-1*16(%r14),%xmm11
1742___
1743$code.=<<___;
1744	lea	(%r14),%rsi
1745	mov	-40(%rsi),%r14
1746	mov	-32(%rsi),%r13
1747	mov	-24(%rsi),%r12
1748	mov	-16(%rsi),%rbp
1749	mov	-8(%rsi),%rbx
1750	lea	(%rsi),%rsp
1751.Lepilogue_avx2:
1752	ret
1753.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
1754___
1755}
1756}
1757$code.=<<___;
1758.align	64
1759K_XX_XX:
1760.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1761.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1762.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1763.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1764.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1765.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1766.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1767.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1768.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1769.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1770.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1771___
1772}}}
1773$code.=<<___;
1774.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1775.align	64
1776___
1777
1778# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1779#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1780if ($win64) {
1781$rec="%rcx";
1782$frame="%rdx";
1783$context="%r8";
1784$disp="%r9";
1785
1786$code.=<<___;
1787.extern	__imp_RtlVirtualUnwind
1788.type	se_handler,\@abi-omnipotent
1789.align	16
1790se_handler:
1791	push	%rsi
1792	push	%rdi
1793	push	%rbx
1794	push	%rbp
1795	push	%r12
1796	push	%r13
1797	push	%r14
1798	push	%r15
1799	pushfq
1800	sub	\$64,%rsp
1801
1802	mov	120($context),%rax	# pull context->Rax
1803	mov	248($context),%rbx	# pull context->Rip
1804
1805	lea	.Lprologue(%rip),%r10
1806	cmp	%r10,%rbx		# context->Rip<.Lprologue
1807	jb	.Lcommon_seh_tail
1808
1809	mov	152($context),%rax	# pull context->Rsp
1810
1811	lea	.Lepilogue(%rip),%r10
1812	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1813	jae	.Lcommon_seh_tail
1814
1815	mov	`16*4`(%rax),%rax	# pull saved stack pointer
1816
1817	mov	-8(%rax),%rbx
1818	mov	-16(%rax),%rbp
1819	mov	-24(%rax),%r12
1820	mov	-32(%rax),%r13
1821	mov	-40(%rax),%r14
1822	mov	%rbx,144($context)	# restore context->Rbx
1823	mov	%rbp,160($context)	# restore context->Rbp
1824	mov	%r12,216($context)	# restore context->R12
1825	mov	%r13,224($context)	# restore context->R13
1826	mov	%r14,232($context)	# restore context->R14
1827
1828	jmp	.Lcommon_seh_tail
1829.size	se_handler,.-se_handler
1830___
1831
1832$code.=<<___ if ($shaext);
1833.type	shaext_handler,\@abi-omnipotent
1834.align	16
1835shaext_handler:
1836	push	%rsi
1837	push	%rdi
1838	push	%rbx
1839	push	%rbp
1840	push	%r12
1841	push	%r13
1842	push	%r14
1843	push	%r15
1844	pushfq
1845	sub	\$64,%rsp
1846
1847	mov	120($context),%rax	# pull context->Rax
1848	mov	248($context),%rbx	# pull context->Rip
1849
1850	lea	.Lprologue_shaext(%rip),%r10
1851	cmp	%r10,%rbx		# context->Rip<.Lprologue
1852	jb	.Lcommon_seh_tail
1853
1854	lea	.Lepilogue_shaext(%rip),%r10
1855	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1856	jae	.Lcommon_seh_tail
1857
1858	lea	-8-4*16(%rax),%rsi
1859	lea	512($context),%rdi	# &context.Xmm6
1860	mov	\$8,%ecx
1861	.long	0xa548f3fc		# cld; rep movsq
1862
1863	jmp	.Lcommon_seh_tail
1864.size	shaext_handler,.-shaext_handler
1865___
1866
1867$code.=<<___;
1868.type	ssse3_handler,\@abi-omnipotent
1869.align	16
1870ssse3_handler:
1871	push	%rsi
1872	push	%rdi
1873	push	%rbx
1874	push	%rbp
1875	push	%r12
1876	push	%r13
1877	push	%r14
1878	push	%r15
1879	pushfq
1880	sub	\$64,%rsp
1881
1882	mov	120($context),%rax	# pull context->Rax
1883	mov	248($context),%rbx	# pull context->Rip
1884
1885	mov	8($disp),%rsi		# disp->ImageBase
1886	mov	56($disp),%r11		# disp->HandlerData
1887
1888	mov	0(%r11),%r10d		# HandlerData[0]
1889	lea	(%rsi,%r10),%r10	# prologue label
1890	cmp	%r10,%rbx		# context->Rip<prologue label
1891	jb	.Lcommon_seh_tail
1892
1893	mov	152($context),%rax	# pull context->Rsp
1894
1895	mov	4(%r11),%r10d		# HandlerData[1]
1896	lea	(%rsi,%r10),%r10	# epilogue label
1897	cmp	%r10,%rbx		# context->Rip>=epilogue label
1898	jae	.Lcommon_seh_tail
1899
1900	mov	232($context),%rax	# pull context->R14
1901
1902	lea	-40-6*16(%rax),%rsi
1903	lea	512($context),%rdi	# &context.Xmm6
1904	mov	\$12,%ecx
1905	.long	0xa548f3fc		# cld; rep movsq
1906
1907	mov	-8(%rax),%rbx
1908	mov	-16(%rax),%rbp
1909	mov	-24(%rax),%r12
1910	mov	-32(%rax),%r13
1911	mov	-40(%rax),%r14
1912	mov	%rbx,144($context)	# restore context->Rbx
1913	mov	%rbp,160($context)	# restore context->Rbp
1914	mov	%r12,216($context)	# restore cotnext->R12
1915	mov	%r13,224($context)	# restore cotnext->R13
1916	mov	%r14,232($context)	# restore cotnext->R14
1917
1918.Lcommon_seh_tail:
1919	mov	8(%rax),%rdi
1920	mov	16(%rax),%rsi
1921	mov	%rax,152($context)	# restore context->Rsp
1922	mov	%rsi,168($context)	# restore context->Rsi
1923	mov	%rdi,176($context)	# restore context->Rdi
1924
1925	mov	40($disp),%rdi		# disp->ContextRecord
1926	mov	$context,%rsi		# context
1927	mov	\$154,%ecx		# sizeof(CONTEXT)
1928	.long	0xa548f3fc		# cld; rep movsq
1929
1930	mov	$disp,%rsi
1931	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1932	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1933	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1934	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1935	mov	40(%rsi),%r10		# disp->ContextRecord
1936	lea	56(%rsi),%r11		# &disp->HandlerData
1937	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1938	mov	%r10,32(%rsp)		# arg5
1939	mov	%r11,40(%rsp)		# arg6
1940	mov	%r12,48(%rsp)		# arg7
1941	mov	%rcx,56(%rsp)		# arg8, (NULL)
1942	call	*__imp_RtlVirtualUnwind(%rip)
1943
1944	mov	\$1,%eax		# ExceptionContinueSearch
1945	add	\$64,%rsp
1946	popfq
1947	pop	%r15
1948	pop	%r14
1949	pop	%r13
1950	pop	%r12
1951	pop	%rbp
1952	pop	%rbx
1953	pop	%rdi
1954	pop	%rsi
1955	ret
1956.size	ssse3_handler,.-ssse3_handler
1957
1958.section	.pdata
1959.align	4
1960	.rva	.LSEH_begin_sha1_block_data_order
1961	.rva	.LSEH_end_sha1_block_data_order
1962	.rva	.LSEH_info_sha1_block_data_order
1963___
1964$code.=<<___ if ($shaext);
1965	.rva	.LSEH_begin_sha1_block_data_order_shaext
1966	.rva	.LSEH_end_sha1_block_data_order_shaext
1967	.rva	.LSEH_info_sha1_block_data_order_shaext
1968___
1969$code.=<<___;
1970	.rva	.LSEH_begin_sha1_block_data_order_ssse3
1971	.rva	.LSEH_end_sha1_block_data_order_ssse3
1972	.rva	.LSEH_info_sha1_block_data_order_ssse3
1973___
1974$code.=<<___ if ($avx);
1975	.rva	.LSEH_begin_sha1_block_data_order_avx
1976	.rva	.LSEH_end_sha1_block_data_order_avx
1977	.rva	.LSEH_info_sha1_block_data_order_avx
1978___
1979$code.=<<___ if ($avx>1);
1980	.rva	.LSEH_begin_sha1_block_data_order_avx2
1981	.rva	.LSEH_end_sha1_block_data_order_avx2
1982	.rva	.LSEH_info_sha1_block_data_order_avx2
1983___
1984$code.=<<___;
1985.section	.xdata
1986.align	8
1987.LSEH_info_sha1_block_data_order:
1988	.byte	9,0,0,0
1989	.rva	se_handler
1990___
1991$code.=<<___ if ($shaext);
1992.LSEH_info_sha1_block_data_order_shaext:
1993	.byte	9,0,0,0
1994	.rva	shaext_handler
1995___
1996$code.=<<___;
1997.LSEH_info_sha1_block_data_order_ssse3:
1998	.byte	9,0,0,0
1999	.rva	ssse3_handler
2000	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2001___
2002$code.=<<___ if ($avx);
2003.LSEH_info_sha1_block_data_order_avx:
2004	.byte	9,0,0,0
2005	.rva	ssse3_handler
2006	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2007___
2008$code.=<<___ if ($avx>1);
2009.LSEH_info_sha1_block_data_order_avx2:
2010	.byte	9,0,0,0
2011	.rva	ssse3_handler
2012	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2013___
2014}
2015
2016####################################################################
2017
2018sub sha1rnds4 {
2019    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
2020      my @opcode=(0x0f,0x3a,0xcc);
2021	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
2022	my $c=$1;
2023	push @opcode,$c=~/^0/?oct($c):$c;
2024	return ".byte\t".join(',',@opcode);
2025    } else {
2026	return "sha1rnds4\t".@_[0];
2027    }
2028}
2029
2030sub sha1op38 {
2031    my $instr = shift;
2032    my %opcodelet = (
2033		"sha1nexte" => 0xc8,
2034  		"sha1msg1"  => 0xc9,
2035		"sha1msg2"  => 0xca	);
2036
2037    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
2038      my @opcode=(0x0f,0x38);
2039      my $rex=0;
2040	$rex|=0x04			if ($2>=8);
2041	$rex|=0x01			if ($1>=8);
2042	unshift @opcode,0x40|$rex	if ($rex);
2043	push @opcode,$opcodelet{$instr};
2044	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2045	return ".byte\t".join(',',@opcode);
2046    } else {
2047	return $instr."\t".@_[0];
2048    }
2049}
2050
2051foreach (split("\n",$code)) {
2052	s/\`([^\`]*)\`/eval $1/geo;
2053
2054	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
2055	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
2056
2057	print $_,"\n";
2058}
2059close STDOUT;
2060