#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate code this
# fast. The only thing that is cool about this module is that the very
# same instruction sequence is used for both SHA-256 and SHA-512. In
# the former case the instructions operate on 32-bit operands, while in
# the latter on 64-bit ones. All I had to do was get one flavor right;
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
# there is a way to improve it, *then* the only way would be to try to
# offload the X[16] updates to the SSE unit, but that would require a
# "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, only
# *if* it's actually possible to noticeably improve the overall
# instruction-level parallelism (ILP) on a given CPU implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs apparently are far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are apparently not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in >=5% improvement on most CPUs: +20% for SHA256 and,
# unfortunately, -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths; see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the improvement is
# not estimated to be high enough, noticeably less than 9%, to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and should
# be used instead.] For reference, the corresponding estimated upper
# limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for a
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second block in the most
# significant halves. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effects are an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#                 SHA256   SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8          14.9     -           -               9.57    -
# P4              17.3     -           -               30.8    -
# Core 2          15.6     13.8(+13%)  -               9.97    -
# Westmere        14.8     12.3(+19%)  -               9.58    -
# Sandy Bridge    17.4     14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge      12.6     10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell         12.2     9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake         11.4     9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer       21.1     13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen           11.0     9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano        23.0     16.5(+39%)  -               14.7    -
# Atom            23.0     18.9(+22%)  -               14.7    -
# Silvermont      27.4     20.6(+33%)  -               17.5    -
# Knights L       27.4     21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont        18.9     14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below a certain limit makes no difference/sense; to conserve
#	space the SHA256 XOP code path is therefore omitted;
#
# Modified from upstream OpenSSL to remove the XOP code.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

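# The assembly emitted by ROUND_00_15/ROUND_16_XX below computes the standard
# FIPS 180-4 round primitives. As a reading aid, here is a minimal pure-Perl
# sketch of the 32-bit (SHA-256) versions; the _ref_* names are made up for
# illustration and nothing in this generator calls them. Note that the scalar
# code uses the alternative Maj mentioned above, Maj(a,b,c) = b^((a^b)&(b^c)),
# with b^c carried over from the previous round.
sub _ref_rotr32 { my ($x,$n)=@_; return (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub _ref_Ch     { my ($e,$f,$g)=@_; return $g ^ ($e & ($f ^ $g)); }	# ((f^g)&e)^g
sub _ref_Maj    { my ($a,$b,$c)=@_; return $b ^ (($a ^ $b) & ($b ^ $c)); }
sub _ref_Sigma0 { my ($a)=@_; return _ref_rotr32($a,2)^_ref_rotr32($a,13)^_ref_rotr32($a,22); }
sub _ref_Sigma1 { my ($e)=@_; return _ref_rotr32($e,6)^_ref_rotr32($e,11)^_ref_rotr32($e,25); }
sub _ref_sigma0 { my ($x)=@_; return _ref_rotr32($x,7)^_ref_rotr32($x,18)^($x>>3); }
sub _ref_sigma1 { my ($x)=@_; return _ref_rotr32($x,17)^_ref_rotr32($x,19)^($x>>10); }
# One round then is: T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i],
# d += T1, h = T1 + Sigma0(a) + Maj(a,b,c); the Sigma0(a) addition is
# modulo-scheduled into the following round, as the comments below note.
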
# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";


sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
	# XOP codepath removed.
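	# Worked example for the round macros above (illustrative comments only):
	#
	# - Alternative Maj: ROUND_00_15 keeps b^c from the previous round in $a3,
	#   computes $a2 = a^b, then h = b ^ ((a^b) & (b^c)). Bitwise, if a == b
	#   this yields b (== a), and if a != b it yields c, which is exactly
	#   Maj(a,b,c); the ($a2,$a3) swap at the end passes this round's a^b on
	#   as the next round's b^c.
	#
	# - Message schedule: in ROUND_16_XX the loads at offsets (i+1)&0xf,
	#   (i+14)&0xf, (i+9)&0xf and i&0xf implement
	#       W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
	#   over the 16-entry ring buffer of W values kept on the stack.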
281___ 282$code.=<<___ if ($avx>1); 283 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 284 cmp \$`1<<8|1<<5|1<<3`,%r11d 285 je .Lavx2_shortcut 286___ 287$code.=<<___ if ($avx); 288 and \$`1<<30`,%r9d # mask "Intel CPU" bit 289 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 290 or %r9d,%r10d 291 cmp \$`1<<28|1<<9|1<<30`,%r10d 292 je .Lavx_shortcut 293___ 294$code.=<<___ if ($SZ==4); 295 test \$`1<<9`,%r10d 296 jnz .Lssse3_shortcut 297___ 298$code.=<<___; 299 mov %rsp,%rax # copy %rsp 300.cfi_def_cfa_register %rax 301 push %rbx 302.cfi_push %rbx 303 push %rbp 304.cfi_push %rbp 305 push %r12 306.cfi_push %r12 307 push %r13 308.cfi_push %r13 309 push %r14 310.cfi_push %r14 311 push %r15 312.cfi_push %r15 313 shl \$4,%rdx # num*16 314 sub \$$framesz,%rsp 315 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 316 and \$-64,%rsp # align stack frame 317 mov $ctx,$_ctx # save ctx, 1st arg 318 mov $inp,$_inp # save inp, 2nd arh 319 mov %rdx,$_end # save end pointer, "3rd" arg 320 mov %rax,$_rsp # save copy of %rsp 321.cfi_cfa_expression $_rsp,deref,+8 322.Lprologue: 323 324 mov $SZ*0($ctx),$A 325 mov $SZ*1($ctx),$B 326 mov $SZ*2($ctx),$C 327 mov $SZ*3($ctx),$D 328 mov $SZ*4($ctx),$E 329 mov $SZ*5($ctx),$F 330 mov $SZ*6($ctx),$G 331 mov $SZ*7($ctx),$H 332 jmp .Lloop 333 334.align 16 335.Lloop: 336 mov $B,$a3 337 lea $TABLE(%rip),$Tbl 338 xor $C,$a3 # magic 339___ 340 for($i=0;$i<16;$i++) { 341 $code.=" mov $SZ*$i($inp),$T1\n"; 342 $code.=" mov @ROT[4],$a0\n"; 343 $code.=" mov @ROT[0],$a1\n"; 344 $code.=" bswap $T1\n"; 345 &ROUND_00_15($i,@ROT); 346 unshift(@ROT,pop(@ROT)); 347 } 348$code.=<<___; 349 jmp .Lrounds_16_xx 350.align 16 351.Lrounds_16_xx: 352___ 353 for(;$i<32;$i++) { 354 &ROUND_16_XX($i,@ROT); 355 unshift(@ROT,pop(@ROT)); 356 } 357 358$code.=<<___; 359 cmpb \$0,`$SZ-1`($Tbl) 360 jnz .Lrounds_16_xx 361 362 mov $_ctx,$ctx 363 add $a1,$A # modulo-scheduled h+=Sigma0(a) 364 lea 16*$SZ($inp),$inp 365 366 add $SZ*0($ctx),$A 367 add $SZ*1($ctx),$B 368 add $SZ*2($ctx),$C 369 add $SZ*3($ctx),$D 370 add $SZ*4($ctx),$E 371 add $SZ*5($ctx),$F 372 add $SZ*6($ctx),$G 373 add $SZ*7($ctx),$H 374 375 cmp $_end,$inp 376 377 mov $A,$SZ*0($ctx) 378 mov $B,$SZ*1($ctx) 379 mov $C,$SZ*2($ctx) 380 mov $D,$SZ*3($ctx) 381 mov $E,$SZ*4($ctx) 382 mov $F,$SZ*5($ctx) 383 mov $G,$SZ*6($ctx) 384 mov $H,$SZ*7($ctx) 385 jb .Lloop 386 387 mov $_rsp,%rsi 388.cfi_def_cfa %rsi,8 389 mov -48(%rsi),%r15 390.cfi_restore %r15 391 mov -40(%rsi),%r14 392.cfi_restore %r14 393 mov -32(%rsi),%r13 394.cfi_restore %r13 395 mov -24(%rsi),%r12 396.cfi_restore %r12 397 mov -16(%rsi),%rbp 398.cfi_restore %rbp 399 mov -8(%rsi),%rbx 400.cfi_restore %rbx 401 lea (%rsi),%rsp 402.cfi_def_cfa_register %rsp 403.Lepilogue: 404 ret 405.cfi_endproc 406.size $func,.-$func 407___ 408 409if ($SZ==4) { 410$code.=<<___; 411.align 64 412.type $TABLE,\@object 413$TABLE: 414 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 415 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 416 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 417 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 418 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 419 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 420 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 421 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 422 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 423 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 424 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 425 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 426 .long 
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 427 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 428 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 429 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 430 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 431 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 432 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 433 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 434 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 435 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 436 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 437 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 438 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 439 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 440 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 441 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 442 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 443 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 444 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 445 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 446 447 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 448 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 449 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 450 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 451 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 452 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 453 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 454___ 455} else { 456$code.=<<___; 457.align 64 458.type $TABLE,\@object 459$TABLE: 460 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 461 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 462 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 463 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 464 .quad 0x3956c25bf348b538,0x59f111f1b605d019 465 .quad 0x3956c25bf348b538,0x59f111f1b605d019 466 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 467 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 468 .quad 0xd807aa98a3030242,0x12835b0145706fbe 469 .quad 0xd807aa98a3030242,0x12835b0145706fbe 470 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 471 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 472 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 473 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 474 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 475 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 476 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 477 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 478 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 479 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 480 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 481 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 482 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 484 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 485 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 486 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 487 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 488 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 489 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 490 .quad 0x06ca6351e003826f,0x142929670a0e6e70 491 .quad 0x06ca6351e003826f,0x142929670a0e6e70 492 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 493 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 494 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 495 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 496 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 497 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 498 .quad 0x81c2c92e47edaee6,0x92722c851482353b 499 .quad 0x81c2c92e47edaee6,0x92722c851482353b 500 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 501 .quad 
0xa2bfe8a14cf10364,0xa81a664bbc423001 502 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 503 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 504 .quad 0xd192e819d6ef5218,0xd69906245565a910 505 .quad 0xd192e819d6ef5218,0xd69906245565a910 506 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 507 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 508 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 509 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 510 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 511 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 512 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 513 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 514 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 515 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 516 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 517 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 518 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 519 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 520 .quad 0x90befffa23631e28,0xa4506cebde82bde9 521 .quad 0x90befffa23631e28,0xa4506cebde82bde9 522 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 523 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 524 .quad 0xca273eceea26619c,0xd186b8c721c0c207 525 .quad 0xca273eceea26619c,0xd186b8c721c0c207 526 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 527 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 528 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 529 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 530 .quad 0x113f9804bef90dae,0x1b710b35131c471b 531 .quad 0x113f9804bef90dae,0x1b710b35131c471b 532 .quad 0x28db77f523047d84,0x32caab7b40c72493 533 .quad 0x28db77f523047d84,0x32caab7b40c72493 534 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 535 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 536 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 537 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 538 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 539 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 540 541 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 542 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 543 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 544___ 545} 546 547###################################################################### 548# SIMD code paths 549# 550if ($SZ==4 && $shaext) {{{ 551###################################################################### 552# Intel SHA Extensions implementation of SHA256 update function. 
553# 554my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 555 556my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 557my @MSG=map("%xmm$_",(3..6)); 558 559$code.=<<___; 560.type sha256_block_data_order_shaext,\@function,3 561.align 64 562sha256_block_data_order_shaext: 563_shaext_shortcut: 564___ 565$code.=<<___ if ($win64); 566 lea `-8-5*16`(%rsp),%rsp 567 movaps %xmm6,-8-5*16(%rax) 568 movaps %xmm7,-8-4*16(%rax) 569 movaps %xmm8,-8-3*16(%rax) 570 movaps %xmm9,-8-2*16(%rax) 571 movaps %xmm10,-8-1*16(%rax) 572.Lprologue_shaext: 573___ 574$code.=<<___; 575 lea K256+0x80(%rip),$Tbl 576 movdqu ($ctx),$ABEF # DCBA 577 movdqu 16($ctx),$CDGH # HGFE 578 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 579 580 pshufd \$0x1b,$ABEF,$Wi # ABCD 581 pshufd \$0xb1,$ABEF,$ABEF # CDAB 582 pshufd \$0x1b,$CDGH,$CDGH # EFGH 583 movdqa $TMP,$BSWAP # offload 584 palignr \$8,$CDGH,$ABEF # ABEF 585 punpcklqdq $Wi,$CDGH # CDGH 586 jmp .Loop_shaext 587 588.align 16 589.Loop_shaext: 590 movdqu ($inp),@MSG[0] 591 movdqu 0x10($inp),@MSG[1] 592 movdqu 0x20($inp),@MSG[2] 593 pshufb $TMP,@MSG[0] 594 movdqu 0x30($inp),@MSG[3] 595 596 movdqa 0*32-0x80($Tbl),$Wi 597 paddd @MSG[0],$Wi 598 pshufb $TMP,@MSG[1] 599 movdqa $CDGH,$CDGH_SAVE # offload 600 sha256rnds2 $ABEF,$CDGH # 0-3 601 pshufd \$0x0e,$Wi,$Wi 602 nop 603 movdqa $ABEF,$ABEF_SAVE # offload 604 sha256rnds2 $CDGH,$ABEF 605 606 movdqa 1*32-0x80($Tbl),$Wi 607 paddd @MSG[1],$Wi 608 pshufb $TMP,@MSG[2] 609 sha256rnds2 $ABEF,$CDGH # 4-7 610 pshufd \$0x0e,$Wi,$Wi 611 lea 0x40($inp),$inp 612 sha256msg1 @MSG[1],@MSG[0] 613 sha256rnds2 $CDGH,$ABEF 614 615 movdqa 2*32-0x80($Tbl),$Wi 616 paddd @MSG[2],$Wi 617 pshufb $TMP,@MSG[3] 618 sha256rnds2 $ABEF,$CDGH # 8-11 619 pshufd \$0x0e,$Wi,$Wi 620 movdqa @MSG[3],$TMP 621 palignr \$4,@MSG[2],$TMP 622 nop 623 paddd $TMP,@MSG[0] 624 sha256msg1 @MSG[2],@MSG[1] 625 sha256rnds2 $CDGH,$ABEF 626 627 movdqa 3*32-0x80($Tbl),$Wi 628 paddd @MSG[3],$Wi 629 sha256msg2 @MSG[3],@MSG[0] 630 sha256rnds2 $ABEF,$CDGH # 12-15 631 pshufd \$0x0e,$Wi,$Wi 632 movdqa @MSG[0],$TMP 633 palignr \$4,@MSG[3],$TMP 634 nop 635 paddd $TMP,@MSG[1] 636 sha256msg1 @MSG[3],@MSG[2] 637 sha256rnds2 $CDGH,$ABEF 638___ 639for($i=4;$i<16-3;$i++) { 640$code.=<<___; 641 movdqa $i*32-0x80($Tbl),$Wi 642 paddd @MSG[0],$Wi 643 sha256msg2 @MSG[0],@MSG[1] 644 sha256rnds2 $ABEF,$CDGH # 16-19... 
645 pshufd \$0x0e,$Wi,$Wi 646 movdqa @MSG[1],$TMP 647 palignr \$4,@MSG[0],$TMP 648 nop 649 paddd $TMP,@MSG[2] 650 sha256msg1 @MSG[0],@MSG[3] 651 sha256rnds2 $CDGH,$ABEF 652___ 653 push(@MSG,shift(@MSG)); 654} 655$code.=<<___; 656 movdqa 13*32-0x80($Tbl),$Wi 657 paddd @MSG[0],$Wi 658 sha256msg2 @MSG[0],@MSG[1] 659 sha256rnds2 $ABEF,$CDGH # 52-55 660 pshufd \$0x0e,$Wi,$Wi 661 movdqa @MSG[1],$TMP 662 palignr \$4,@MSG[0],$TMP 663 sha256rnds2 $CDGH,$ABEF 664 paddd $TMP,@MSG[2] 665 666 movdqa 14*32-0x80($Tbl),$Wi 667 paddd @MSG[1],$Wi 668 sha256rnds2 $ABEF,$CDGH # 56-59 669 pshufd \$0x0e,$Wi,$Wi 670 sha256msg2 @MSG[1],@MSG[2] 671 movdqa $BSWAP,$TMP 672 sha256rnds2 $CDGH,$ABEF 673 674 movdqa 15*32-0x80($Tbl),$Wi 675 paddd @MSG[2],$Wi 676 nop 677 sha256rnds2 $ABEF,$CDGH # 60-63 678 pshufd \$0x0e,$Wi,$Wi 679 dec $num 680 nop 681 sha256rnds2 $CDGH,$ABEF 682 683 paddd $CDGH_SAVE,$CDGH 684 paddd $ABEF_SAVE,$ABEF 685 jnz .Loop_shaext 686 687 pshufd \$0xb1,$CDGH,$CDGH # DCHG 688 pshufd \$0x1b,$ABEF,$TMP # FEBA 689 pshufd \$0xb1,$ABEF,$ABEF # BAFE 690 punpckhqdq $CDGH,$ABEF # DCBA 691 palignr \$8,$TMP,$CDGH # HGFE 692 693 movdqu $ABEF,($ctx) 694 movdqu $CDGH,16($ctx) 695___ 696$code.=<<___ if ($win64); 697 movaps -8-5*16(%rax),%xmm6 698 movaps -8-4*16(%rax),%xmm7 699 movaps -8-3*16(%rax),%xmm8 700 movaps -8-2*16(%rax),%xmm9 701 movaps -8-1*16(%rax),%xmm10 702 mov %rax,%rsp 703.Lepilogue_shaext: 704___ 705$code.=<<___; 706 ret 707.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 708___ 709}}} 710{{{ 711 712my $a4=$T1; 713my ($a,$b,$c,$d,$e,$f,$g,$h); 714 715sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 716{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 717 my $arg = pop; 718 $arg = "\$$arg" if ($arg*1 eq $arg); 719 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 720} 721 722sub body_00_15 () { 723 ( 724 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 725 726 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 727 '&mov ($a,$a1)', 728 '&mov ($a4,$f)', 729 730 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 731 '&xor ($a0,$e)', 732 '&xor ($a4,$g)', # f^g 733 734 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 735 '&xor ($a1,$a)', 736 '&and ($a4,$e)', # (f^g)&e 737 738 '&xor ($a0,$e)', 739 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 740 '&mov ($a2,$a)', 741 742 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 743 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 744 '&xor ($a2,$b)', # a^b, b^c in next round 745 746 '&add ($h,$a4)', # h+=Ch(e,f,g) 747 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 748 '&and ($a3,$a2)', # (b^c)&(a^b) 749 750 '&xor ($a1,$a)', 751 '&add ($h,$a0)', # h+=Sigma1(e) 752 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 753 754 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 755 '&add ($d,$h)', # d+=h 756 '&add ($h,$a3)', # h+=Maj(a,b,c) 757 758 '&mov ($a0,$d)', 759 '&add ($a1,$h);'. 
# h+=Sigma0(a) 760 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 761 ); 762} 763 764###################################################################### 765# SSSE3 code path 766# 767if ($SZ==4) { # SHA256 only 768my @X = map("%xmm$_",(0..3)); 769my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 770 771$code.=<<___; 772.type ${func}_ssse3,\@function,3 773.align 64 774${func}_ssse3: 775.cfi_startproc 776.Lssse3_shortcut: 777 mov %rsp,%rax # copy %rsp 778.cfi_def_cfa_register %rax 779 push %rbx 780.cfi_push %rbx 781 push %rbp 782.cfi_push %rbp 783 push %r12 784.cfi_push %r12 785 push %r13 786.cfi_push %r13 787 push %r14 788.cfi_push %r14 789 push %r15 790.cfi_push %r15 791 shl \$4,%rdx # num*16 792 sub \$`$framesz+$win64*16*4`,%rsp 793 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 794 and \$-64,%rsp # align stack frame 795 mov $ctx,$_ctx # save ctx, 1st arg 796 mov $inp,$_inp # save inp, 2nd arh 797 mov %rdx,$_end # save end pointer, "3rd" arg 798 mov %rax,$_rsp # save copy of %rsp 799.cfi_cfa_expression $_rsp,deref,+8 800___ 801$code.=<<___ if ($win64); 802 movaps %xmm6,16*$SZ+32(%rsp) 803 movaps %xmm7,16*$SZ+48(%rsp) 804 movaps %xmm8,16*$SZ+64(%rsp) 805 movaps %xmm9,16*$SZ+80(%rsp) 806___ 807$code.=<<___; 808.Lprologue_ssse3: 809 810 mov $SZ*0($ctx),$A 811 mov $SZ*1($ctx),$B 812 mov $SZ*2($ctx),$C 813 mov $SZ*3($ctx),$D 814 mov $SZ*4($ctx),$E 815 mov $SZ*5($ctx),$F 816 mov $SZ*6($ctx),$G 817 mov $SZ*7($ctx),$H 818___ 819 820$code.=<<___; 821 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 822 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 823 jmp .Lloop_ssse3 824.align 16 825.Lloop_ssse3: 826 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 827 movdqu 0x00($inp),@X[0] 828 movdqu 0x10($inp),@X[1] 829 movdqu 0x20($inp),@X[2] 830 pshufb $t3,@X[0] 831 movdqu 0x30($inp),@X[3] 832 lea $TABLE(%rip),$Tbl 833 pshufb $t3,@X[1] 834 movdqa 0x00($Tbl),$t0 835 movdqa 0x20($Tbl),$t1 836 pshufb $t3,@X[2] 837 paddd @X[0],$t0 838 movdqa 0x40($Tbl),$t2 839 pshufb $t3,@X[3] 840 movdqa 0x60($Tbl),$t3 841 paddd @X[1],$t1 842 paddd @X[2],$t2 843 paddd @X[3],$t3 844 movdqa $t0,0x00(%rsp) 845 mov $A,$a1 846 movdqa $t1,0x10(%rsp) 847 mov $B,$a3 848 movdqa $t2,0x20(%rsp) 849 xor $C,$a3 # magic 850 movdqa $t3,0x30(%rsp) 851 mov $E,$a0 852 jmp .Lssse3_00_47 853 854.align 16 855.Lssse3_00_47: 856 sub \$`-16*2*$SZ`,$Tbl # size optimization 857___ 858sub Xupdate_256_SSSE3 () { 859 ( 860 '&movdqa ($t0,@X[1]);', 861 '&movdqa ($t3,@X[3])', 862 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 863 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 864 '&movdqa ($t1,$t0)', 865 '&movdqa ($t2,$t0);', 866 '&psrld ($t0,$sigma0[2])', 867 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 868 '&psrld ($t2,$sigma0[0])', 869 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 870 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 871 '&pxor ($t0,$t2)', 872 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 873 '&pxor ($t0,$t1)', 874 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
875 '&pxor ($t0,$t2);', 876 '&movdqa ($t2,$t3)', 877 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 878 '&psrld ($t3,$sigma1[2])', 879 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 880 '&psrlq ($t2,$sigma1[0])', 881 '&pxor ($t3,$t2);', 882 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 883 '&pxor ($t3,$t2)', 884 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 885 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 886 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 887 '&movdqa ($t2,$t3);', 888 '&psrld ($t3,$sigma1[2])', 889 '&psrlq ($t2,$sigma1[0])', 890 '&pxor ($t3,$t2);', 891 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 892 '&pxor ($t3,$t2);', 893 '&movdqa ($t2,16*2*$j."($Tbl)")', 894 '&pshufb ($t3,$t5)', 895 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 896 ); 897} 898 899sub SSSE3_256_00_47 () { 900my $j = shift; 901my $body = shift; 902my @X = @_; 903my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 904 905 if (0) { 906 foreach (Xupdate_256_SSSE3()) { # 36 instructions 907 eval; 908 eval(shift(@insns)); 909 eval(shift(@insns)); 910 eval(shift(@insns)); 911 } 912 } else { # squeeze extra 4% on Westmere and 19% on Atom 913 eval(shift(@insns)); #@ 914 &movdqa ($t0,@X[1]); 915 eval(shift(@insns)); 916 eval(shift(@insns)); 917 &movdqa ($t3,@X[3]); 918 eval(shift(@insns)); #@ 919 eval(shift(@insns)); 920 eval(shift(@insns)); 921 eval(shift(@insns)); #@ 922 eval(shift(@insns)); 923 &palignr ($t0,@X[0],$SZ); # X[1..4] 924 eval(shift(@insns)); 925 eval(shift(@insns)); 926 &palignr ($t3,@X[2],$SZ); # X[9..12] 927 eval(shift(@insns)); 928 eval(shift(@insns)); 929 eval(shift(@insns)); 930 eval(shift(@insns)); #@ 931 &movdqa ($t1,$t0); 932 eval(shift(@insns)); 933 eval(shift(@insns)); 934 &movdqa ($t2,$t0); 935 eval(shift(@insns)); #@ 936 eval(shift(@insns)); 937 &psrld ($t0,$sigma0[2]); 938 eval(shift(@insns)); 939 eval(shift(@insns)); 940 eval(shift(@insns)); 941 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 942 eval(shift(@insns)); #@ 943 eval(shift(@insns)); 944 &psrld ($t2,$sigma0[0]); 945 eval(shift(@insns)); 946 eval(shift(@insns)); 947 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 948 eval(shift(@insns)); 949 eval(shift(@insns)); #@ 950 &pslld ($t1,8*$SZ-$sigma0[1]); 951 eval(shift(@insns)); 952 eval(shift(@insns)); 953 &pxor ($t0,$t2); 954 eval(shift(@insns)); #@ 955 eval(shift(@insns)); 956 eval(shift(@insns)); 957 eval(shift(@insns)); #@ 958 &psrld ($t2,$sigma0[1]-$sigma0[0]); 959 eval(shift(@insns)); 960 &pxor ($t0,$t1); 961 eval(shift(@insns)); 962 eval(shift(@insns)); 963 &pslld ($t1,$sigma0[1]-$sigma0[0]); 964 eval(shift(@insns)); 965 eval(shift(@insns)); 966 &pxor ($t0,$t2); 967 eval(shift(@insns)); 968 eval(shift(@insns)); #@ 969 &movdqa ($t2,$t3); 970 eval(shift(@insns)); 971 eval(shift(@insns)); 972 &pxor ($t0,$t1); # sigma0(X[1..4]) 973 eval(shift(@insns)); #@ 974 eval(shift(@insns)); 975 eval(shift(@insns)); 976 &psrld ($t3,$sigma1[2]); 977 eval(shift(@insns)); 978 eval(shift(@insns)); 979 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 980 eval(shift(@insns)); #@ 981 eval(shift(@insns)); 982 &psrlq ($t2,$sigma1[0]); 983 eval(shift(@insns)); 984 eval(shift(@insns)); 985 eval(shift(@insns)); 986 &pxor ($t3,$t2); 987 eval(shift(@insns)); #@ 988 eval(shift(@insns)); 989 eval(shift(@insns)); 990 eval(shift(@insns)); #@ 991 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 992 eval(shift(@insns)); 993 eval(shift(@insns)); 994 &pxor ($t3,$t2); 995 eval(shift(@insns)); #@ 996 eval(shift(@insns)); 997 eval(shift(@insns)); 998 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 999 &pshufd ($t3,$t3,0b10000000); 1000 
eval(shift(@insns)); 1001 eval(shift(@insns)); 1002 eval(shift(@insns)); 1003 &psrldq ($t3,8); 1004 eval(shift(@insns)); 1005 eval(shift(@insns)); #@ 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); 1008 eval(shift(@insns)); #@ 1009 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1010 eval(shift(@insns)); 1011 eval(shift(@insns)); 1012 eval(shift(@insns)); 1013 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 1014 eval(shift(@insns)); 1015 eval(shift(@insns)); #@ 1016 eval(shift(@insns)); 1017 &movdqa ($t2,$t3); 1018 eval(shift(@insns)); 1019 eval(shift(@insns)); 1020 &psrld ($t3,$sigma1[2]); 1021 eval(shift(@insns)); 1022 eval(shift(@insns)); #@ 1023 &psrlq ($t2,$sigma1[0]); 1024 eval(shift(@insns)); 1025 eval(shift(@insns)); 1026 &pxor ($t3,$t2); 1027 eval(shift(@insns)); #@ 1028 eval(shift(@insns)); 1029 eval(shift(@insns)); 1030 eval(shift(@insns)); #@ 1031 eval(shift(@insns)); 1032 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1033 eval(shift(@insns)); 1034 eval(shift(@insns)); 1035 eval(shift(@insns)); 1036 &pxor ($t3,$t2); 1037 eval(shift(@insns)); 1038 eval(shift(@insns)); 1039 eval(shift(@insns)); #@ 1040 #&pshufb ($t3,$t5); 1041 &pshufd ($t3,$t3,0b00001000); 1042 eval(shift(@insns)); 1043 eval(shift(@insns)); 1044 &movdqa ($t2,16*2*$j."($Tbl)"); 1045 eval(shift(@insns)); #@ 1046 eval(shift(@insns)); 1047 &pslldq ($t3,8); 1048 eval(shift(@insns)); 1049 eval(shift(@insns)); 1050 eval(shift(@insns)); 1051 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1052 eval(shift(@insns)); #@ 1053 eval(shift(@insns)); 1054 eval(shift(@insns)); 1055 } 1056 &paddd ($t2,@X[0]); 1057 foreach (@insns) { eval; } # remaining instructions 1058 &movdqa (16*$j."(%rsp)",$t2); 1059} 1060 1061 for ($i=0,$j=0; $j<4; $j++) { 1062 &SSSE3_256_00_47($j,\&body_00_15,@X); 1063 push(@X,shift(@X)); # rotate(@X) 1064 } 1065 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1066 &jne (".Lssse3_00_47"); 1067 1068 for ($i=0; $i<16; ) { 1069 foreach(body_00_15()) { eval; } 1070 } 1071$code.=<<___; 1072 mov $_ctx,$ctx 1073 mov $a1,$A 1074 1075 add $SZ*0($ctx),$A 1076 lea 16*$SZ($inp),$inp 1077 add $SZ*1($ctx),$B 1078 add $SZ*2($ctx),$C 1079 add $SZ*3($ctx),$D 1080 add $SZ*4($ctx),$E 1081 add $SZ*5($ctx),$F 1082 add $SZ*6($ctx),$G 1083 add $SZ*7($ctx),$H 1084 1085 cmp $_end,$inp 1086 1087 mov $A,$SZ*0($ctx) 1088 mov $B,$SZ*1($ctx) 1089 mov $C,$SZ*2($ctx) 1090 mov $D,$SZ*3($ctx) 1091 mov $E,$SZ*4($ctx) 1092 mov $F,$SZ*5($ctx) 1093 mov $G,$SZ*6($ctx) 1094 mov $H,$SZ*7($ctx) 1095 jb .Lloop_ssse3 1096 1097 mov $_rsp,%rsi 1098.cfi_def_cfa %rsi,8 1099___ 1100$code.=<<___ if ($win64); 1101 movaps 16*$SZ+32(%rsp),%xmm6 1102 movaps 16*$SZ+48(%rsp),%xmm7 1103 movaps 16*$SZ+64(%rsp),%xmm8 1104 movaps 16*$SZ+80(%rsp),%xmm9 1105___ 1106$code.=<<___; 1107 mov -48(%rsi),%r15 1108.cfi_restore %r15 1109 mov -40(%rsi),%r14 1110.cfi_restore %r14 1111 mov -32(%rsi),%r13 1112.cfi_restore %r13 1113 mov -24(%rsi),%r12 1114.cfi_restore %r12 1115 mov -16(%rsi),%rbp 1116.cfi_restore %rbp 1117 mov -8(%rsi),%rbx 1118.cfi_restore %rbx 1119 lea (%rsi),%rsp 1120.cfi_def_cfa_register %rsp 1121.Lepilogue_ssse3: 1122 ret 1123.cfi_endproc 1124.size ${func}_ssse3,.-${func}_ssse3 1125___ 1126} 1127 1128if ($avx) {{ 1129###################################################################### 1130# AVX+shrd code path 1131# 1132local *ror = sub { &shrd(@_[0],@_) }; 1133 1134$code.=<<___; 1135.type ${func}_avx,\@function,3 1136.align 64 1137${func}_avx: 1138.cfi_startproc 1139.Lavx_shortcut: 1140 mov %rsp,%rax # copy %rsp 1141.cfi_def_cfa_register %rax 1142 push %rbx 1143.cfi_push %rbx 
1144 push %rbp 1145.cfi_push %rbp 1146 push %r12 1147.cfi_push %r12 1148 push %r13 1149.cfi_push %r13 1150 push %r14 1151.cfi_push %r14 1152 push %r15 1153.cfi_push %r15 1154 shl \$4,%rdx # num*16 1155 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1156 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1157 and \$-64,%rsp # align stack frame 1158 mov $ctx,$_ctx # save ctx, 1st arg 1159 mov $inp,$_inp # save inp, 2nd arh 1160 mov %rdx,$_end # save end pointer, "3rd" arg 1161 mov %rax,$_rsp # save copy of %rsp 1162.cfi_cfa_expression $_rsp,deref,+8 1163___ 1164$code.=<<___ if ($win64); 1165 movaps %xmm6,16*$SZ+32(%rsp) 1166 movaps %xmm7,16*$SZ+48(%rsp) 1167 movaps %xmm8,16*$SZ+64(%rsp) 1168 movaps %xmm9,16*$SZ+80(%rsp) 1169___ 1170$code.=<<___ if ($win64 && $SZ>4); 1171 movaps %xmm10,16*$SZ+96(%rsp) 1172 movaps %xmm11,16*$SZ+112(%rsp) 1173___ 1174$code.=<<___; 1175.Lprologue_avx: 1176 1177 vzeroupper 1178 mov $SZ*0($ctx),$A 1179 mov $SZ*1($ctx),$B 1180 mov $SZ*2($ctx),$C 1181 mov $SZ*3($ctx),$D 1182 mov $SZ*4($ctx),$E 1183 mov $SZ*5($ctx),$F 1184 mov $SZ*6($ctx),$G 1185 mov $SZ*7($ctx),$H 1186___ 1187 if ($SZ==4) { # SHA256 1188 my @X = map("%xmm$_",(0..3)); 1189 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1190 1191$code.=<<___; 1192 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1193 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1194 jmp .Lloop_avx 1195.align 16 1196.Lloop_avx: 1197 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1198 vmovdqu 0x00($inp),@X[0] 1199 vmovdqu 0x10($inp),@X[1] 1200 vmovdqu 0x20($inp),@X[2] 1201 vmovdqu 0x30($inp),@X[3] 1202 vpshufb $t3,@X[0],@X[0] 1203 lea $TABLE(%rip),$Tbl 1204 vpshufb $t3,@X[1],@X[1] 1205 vpshufb $t3,@X[2],@X[2] 1206 vpaddd 0x00($Tbl),@X[0],$t0 1207 vpshufb $t3,@X[3],@X[3] 1208 vpaddd 0x20($Tbl),@X[1],$t1 1209 vpaddd 0x40($Tbl),@X[2],$t2 1210 vpaddd 0x60($Tbl),@X[3],$t3 1211 vmovdqa $t0,0x00(%rsp) 1212 mov $A,$a1 1213 vmovdqa $t1,0x10(%rsp) 1214 mov $B,$a3 1215 vmovdqa $t2,0x20(%rsp) 1216 xor $C,$a3 # magic 1217 vmovdqa $t3,0x30(%rsp) 1218 mov $E,$a0 1219 jmp .Lavx_00_47 1220 1221.align 16 1222.Lavx_00_47: 1223 sub \$`-16*2*$SZ`,$Tbl # size optimization 1224___ 1225sub Xupdate_256_AVX () { 1226 ( 1227 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1228 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1229 '&vpsrld ($t2,$t0,$sigma0[0]);', 1230 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1231 '&vpsrld ($t3,$t0,$sigma0[2])', 1232 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1233 '&vpxor ($t0,$t3,$t2)', 1234 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1235 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1236 '&vpxor ($t0,$t0,$t1)', 1237 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1238 '&vpxor ($t0,$t0,$t2)', 1239 '&vpsrld ($t2,$t3,$sigma1[2]);', 1240 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1241 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1242 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1243 '&vpxor ($t2,$t2,$t3);', 1244 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1245 '&vpxor ($t2,$t2,$t3)', 1246 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1247 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1248 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1249 '&vpsrld ($t2,$t3,$sigma1[2])', 1250 '&vpsrlq ($t3,$t3,$sigma1[0])', 1251 '&vpxor ($t2,$t2,$t3);', 1252 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1253 '&vpxor ($t2,$t2,$t3)', 1254 '&vpshufb ($t2,$t2,$t5)', 1255 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1256 ); 1257} 1258 1259sub AVX_256_00_47 () { 1260my $j = shift; 1261my $body = shift; 1262my @X = @_; 1263my @insns = 
(&$body,&$body,&$body,&$body); # 104 instructions 1264 1265 foreach (Xupdate_256_AVX()) { # 29 instructions 1266 eval; 1267 eval(shift(@insns)); 1268 eval(shift(@insns)); 1269 eval(shift(@insns)); 1270 } 1271 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1272 foreach (@insns) { eval; } # remaining instructions 1273 &vmovdqa (16*$j."(%rsp)",$t2); 1274} 1275 1276 for ($i=0,$j=0; $j<4; $j++) { 1277 &AVX_256_00_47($j,\&body_00_15,@X); 1278 push(@X,shift(@X)); # rotate(@X) 1279 } 1280 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1281 &jne (".Lavx_00_47"); 1282 1283 for ($i=0; $i<16; ) { 1284 foreach(body_00_15()) { eval; } 1285 } 1286 1287 } else { # SHA512 1288 my @X = map("%xmm$_",(0..7)); 1289 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1290 1291$code.=<<___; 1292 jmp .Lloop_avx 1293.align 16 1294.Lloop_avx: 1295 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1296 vmovdqu 0x00($inp),@X[0] 1297 lea $TABLE+0x80(%rip),$Tbl # size optimization 1298 vmovdqu 0x10($inp),@X[1] 1299 vmovdqu 0x20($inp),@X[2] 1300 vpshufb $t3,@X[0],@X[0] 1301 vmovdqu 0x30($inp),@X[3] 1302 vpshufb $t3,@X[1],@X[1] 1303 vmovdqu 0x40($inp),@X[4] 1304 vpshufb $t3,@X[2],@X[2] 1305 vmovdqu 0x50($inp),@X[5] 1306 vpshufb $t3,@X[3],@X[3] 1307 vmovdqu 0x60($inp),@X[6] 1308 vpshufb $t3,@X[4],@X[4] 1309 vmovdqu 0x70($inp),@X[7] 1310 vpshufb $t3,@X[5],@X[5] 1311 vpaddq -0x80($Tbl),@X[0],$t0 1312 vpshufb $t3,@X[6],@X[6] 1313 vpaddq -0x60($Tbl),@X[1],$t1 1314 vpshufb $t3,@X[7],@X[7] 1315 vpaddq -0x40($Tbl),@X[2],$t2 1316 vpaddq -0x20($Tbl),@X[3],$t3 1317 vmovdqa $t0,0x00(%rsp) 1318 vpaddq 0x00($Tbl),@X[4],$t0 1319 vmovdqa $t1,0x10(%rsp) 1320 vpaddq 0x20($Tbl),@X[5],$t1 1321 vmovdqa $t2,0x20(%rsp) 1322 vpaddq 0x40($Tbl),@X[6],$t2 1323 vmovdqa $t3,0x30(%rsp) 1324 vpaddq 0x60($Tbl),@X[7],$t3 1325 vmovdqa $t0,0x40(%rsp) 1326 mov $A,$a1 1327 vmovdqa $t1,0x50(%rsp) 1328 mov $B,$a3 1329 vmovdqa $t2,0x60(%rsp) 1330 xor $C,$a3 # magic 1331 vmovdqa $t3,0x70(%rsp) 1332 mov $E,$a0 1333 jmp .Lavx_00_47 1334 1335.align 16 1336.Lavx_00_47: 1337 add \$`16*2*$SZ`,$Tbl 1338___ 1339sub Xupdate_512_AVX () { 1340 ( 1341 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1342 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1343 '&vpsrlq ($t2,$t0,$sigma0[0])', 1344 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1345 '&vpsrlq ($t3,$t0,$sigma0[2])', 1346 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1347 '&vpxor ($t0,$t3,$t2)', 1348 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1349 '&vpxor ($t0,$t0,$t1)', 1350 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1351 '&vpxor ($t0,$t0,$t2)', 1352 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1353 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1354 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1355 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1356 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1357 '&vpxor ($t3,$t3,$t2)', 1358 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1359 '&vpxor ($t3,$t3,$t1)', 1360 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1361 '&vpxor ($t3,$t3,$t2)', 1362 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1363 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1364 ); 1365} 1366 1367sub AVX_512_00_47 () { 1368my $j = shift; 1369my $body = shift; 1370my @X = @_; 1371my @insns = (&$body,&$body); # 52 instructions 1372 1373 foreach (Xupdate_512_AVX()) { # 23 instructions 1374 eval; 1375 eval(shift(@insns)); 1376 eval(shift(@insns)); 1377 } 1378 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1379 foreach (@insns) { eval; } # remaining instructions 1380 &vmovdqa (16*$j."(%rsp)",$t2); 1381} 1382 1383 for ($i=0,$j=0; $j<8; $j++) { 1384 
&AVX_512_00_47($j,\&body_00_15,@X); 1385 push(@X,shift(@X)); # rotate(@X) 1386 } 1387 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1388 &jne (".Lavx_00_47"); 1389 1390 for ($i=0; $i<16; ) { 1391 foreach(body_00_15()) { eval; } 1392 } 1393} 1394$code.=<<___; 1395 mov $_ctx,$ctx 1396 mov $a1,$A 1397 1398 add $SZ*0($ctx),$A 1399 lea 16*$SZ($inp),$inp 1400 add $SZ*1($ctx),$B 1401 add $SZ*2($ctx),$C 1402 add $SZ*3($ctx),$D 1403 add $SZ*4($ctx),$E 1404 add $SZ*5($ctx),$F 1405 add $SZ*6($ctx),$G 1406 add $SZ*7($ctx),$H 1407 1408 cmp $_end,$inp 1409 1410 mov $A,$SZ*0($ctx) 1411 mov $B,$SZ*1($ctx) 1412 mov $C,$SZ*2($ctx) 1413 mov $D,$SZ*3($ctx) 1414 mov $E,$SZ*4($ctx) 1415 mov $F,$SZ*5($ctx) 1416 mov $G,$SZ*6($ctx) 1417 mov $H,$SZ*7($ctx) 1418 jb .Lloop_avx 1419 1420 mov $_rsp,%rsi 1421.cfi_def_cfa %rsi,8 1422 vzeroupper 1423___ 1424$code.=<<___ if ($win64); 1425 movaps 16*$SZ+32(%rsp),%xmm6 1426 movaps 16*$SZ+48(%rsp),%xmm7 1427 movaps 16*$SZ+64(%rsp),%xmm8 1428 movaps 16*$SZ+80(%rsp),%xmm9 1429___ 1430$code.=<<___ if ($win64 && $SZ>4); 1431 movaps 16*$SZ+96(%rsp),%xmm10 1432 movaps 16*$SZ+112(%rsp),%xmm11 1433___ 1434$code.=<<___; 1435 mov -48(%rsi),%r15 1436.cfi_restore %r15 1437 mov -40(%rsi),%r14 1438.cfi_restore %r14 1439 mov -32(%rsi),%r13 1440.cfi_restore %r13 1441 mov -24(%rsi),%r12 1442.cfi_restore %r12 1443 mov -16(%rsi),%rbp 1444.cfi_restore %rbp 1445 mov -8(%rsi),%rbx 1446.cfi_restore %rbx 1447 lea (%rsi),%rsp 1448.cfi_def_cfa_register %rsp 1449.Lepilogue_avx: 1450 ret 1451.cfi_endproc 1452.size ${func}_avx,.-${func}_avx 1453___ 1454 1455if ($avx>1) {{ 1456###################################################################### 1457# AVX2+BMI code path 1458# 1459my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1460my $PUSH8=8*2*$SZ; 1461use integer; 1462 1463sub bodyx_00_15 () { 1464 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1465 ( 1466 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1467 1468 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1469 '&and ($a4,$e)', # f&e 1470 '&rorx ($a0,$e,$Sigma1[2])', 1471 '&rorx ($a2,$e,$Sigma1[1])', 1472 1473 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1474 '&lea ($h,"($h,$a4)")', 1475 '&andn ($a4,$e,$g)', # ~e&g 1476 '&xor ($a0,$a2)', 1477 1478 '&rorx ($a1,$e,$Sigma1[0])', 1479 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1480 '&xor ($a0,$a1)', # Sigma1(e) 1481 '&mov ($a2,$a)', 1482 1483 '&rorx ($a4,$a,$Sigma0[2])', 1484 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1485 '&xor ($a2,$b)', # a^b, b^c in next round 1486 '&rorx ($a1,$a,$Sigma0[1])', 1487 1488 '&rorx ($a0,$a,$Sigma0[0])', 1489 '&lea ($d,"($d,$h)")', # d+=h 1490 '&and ($a3,$a2)', # (b^c)&(a^b) 1491 '&xor ($a1,$a4)', 1492 1493 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1494 '&xor ($a1,$a0)', # Sigma0(a) 1495 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 1496 '&mov ($a4,$e)', # copy of f in future 1497 1498 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1499 ); 1500 # and at the finish one has to $a+=$a1 1501} 1502 1503$code.=<<___; 1504.type ${func}_avx2,\@function,3 1505.align 64 1506${func}_avx2: 1507.cfi_startproc 1508.Lavx2_shortcut: 1509 mov %rsp,%rax # copy %rsp 1510.cfi_def_cfa_register %rax 1511 push %rbx 1512.cfi_push %rbx 1513 push %rbp 1514.cfi_push %rbp 1515 push %r12 1516.cfi_push %r12 1517 push %r13 1518.cfi_push %r13 1519 push %r14 1520.cfi_push %r14 1521 push %r15 1522.cfi_push %r15 1523 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1524 shl \$4,%rdx # num*16 1525 and \$-256*$SZ,%rsp # align stack frame 1526 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1527 add \$`2*$SZ*($rounds-8)`,%rsp 1528 mov $ctx,$_ctx # save ctx, 1st arg 1529 mov $inp,$_inp # save inp, 2nd arh 1530 mov %rdx,$_end # save end pointer, "3rd" arg 1531 mov %rax,$_rsp # save copy of %rsp 1532.cfi_cfa_expression $_rsp,deref,+8 1533___ 1534$code.=<<___ if ($win64); 1535 movaps %xmm6,16*$SZ+32(%rsp) 1536 movaps %xmm7,16*$SZ+48(%rsp) 1537 movaps %xmm8,16*$SZ+64(%rsp) 1538 movaps %xmm9,16*$SZ+80(%rsp) 1539___ 1540$code.=<<___ if ($win64 && $SZ>4); 1541 movaps %xmm10,16*$SZ+96(%rsp) 1542 movaps %xmm11,16*$SZ+112(%rsp) 1543___ 1544$code.=<<___; 1545.Lprologue_avx2: 1546 1547 vzeroupper 1548 sub \$-16*$SZ,$inp # inp++, size optimization 1549 mov $SZ*0($ctx),$A 1550 mov $inp,%r12 # borrow $T1 1551 mov $SZ*1($ctx),$B 1552 cmp %rdx,$inp # $_end 1553 mov $SZ*2($ctx),$C 1554 cmove %rsp,%r12 # next block or random data 1555 mov $SZ*3($ctx),$D 1556 mov $SZ*4($ctx),$E 1557 mov $SZ*5($ctx),$F 1558 mov $SZ*6($ctx),$G 1559 mov $SZ*7($ctx),$H 1560___ 1561 if ($SZ==4) { # SHA256 1562 my @X = map("%ymm$_",(0..3)); 1563 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1564 1565$code.=<<___; 1566 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1567 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1568 jmp .Loop_avx2 1569.align 16 1570.Loop_avx2: 1571 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1572 vmovdqu -16*$SZ+0($inp),%xmm0 1573 vmovdqu -16*$SZ+16($inp),%xmm1 1574 vmovdqu -16*$SZ+32($inp),%xmm2 1575 vmovdqu -16*$SZ+48($inp),%xmm3 1576 #mov $inp,$_inp # offload $inp 1577 vinserti128 \$1,(%r12),@X[0],@X[0] 1578 vinserti128 \$1,16(%r12),@X[1],@X[1] 1579 vpshufb $t3,@X[0],@X[0] 1580 vinserti128 \$1,32(%r12),@X[2],@X[2] 1581 vpshufb $t3,@X[1],@X[1] 1582 vinserti128 \$1,48(%r12),@X[3],@X[3] 1583 1584 lea $TABLE(%rip),$Tbl 1585 vpshufb $t3,@X[2],@X[2] 1586 vpaddd 0x00($Tbl),@X[0],$t0 1587 vpshufb $t3,@X[3],@X[3] 1588 vpaddd 0x20($Tbl),@X[1],$t1 1589 vpaddd 0x40($Tbl),@X[2],$t2 1590 vpaddd 0x60($Tbl),@X[3],$t3 1591 vmovdqa $t0,0x00(%rsp) 1592 xor $a1,$a1 1593 vmovdqa $t1,0x20(%rsp) 1594 lea -$PUSH8(%rsp),%rsp 1595 mov $B,$a3 1596 vmovdqa $t2,0x00(%rsp) 1597 xor $C,$a3 # magic 1598 vmovdqa $t3,0x20(%rsp) 1599 mov $F,$a4 1600 sub \$-16*2*$SZ,$Tbl # size optimization 1601 jmp .Lavx2_00_47 1602 1603.align 16 1604.Lavx2_00_47: 1605___ 1606 1607sub AVX2_256_00_47 () { 1608my $j = shift; 1609my $body = shift; 1610my @X = @_; 1611my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1612my $base = "+2*$PUSH8(%rsp)"; 1613 1614 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1615 foreach (Xupdate_256_AVX()) { # 29 instructions 1616 eval; 1617 eval(shift(@insns)); 1618 eval(shift(@insns)); 1619 eval(shift(@insns)); 1620 } 1621 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1622 foreach (@insns) { eval; } # remaining instructions 1623 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 
1624} 1625 1626 for ($i=0,$j=0; $j<4; $j++) { 1627 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1628 push(@X,shift(@X)); # rotate(@X) 1629 } 1630 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1631 &cmpb (($SZ-1)."($Tbl)",0); 1632 &jne (".Lavx2_00_47"); 1633 1634 for ($i=0; $i<16; ) { 1635 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1636 foreach(bodyx_00_15()) { eval; } 1637 } 1638 } else { # SHA512 1639 my @X = map("%ymm$_",(0..7)); 1640 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1641 1642$code.=<<___; 1643 jmp .Loop_avx2 1644.align 16 1645.Loop_avx2: 1646 vmovdqu -16*$SZ($inp),%xmm0 1647 vmovdqu -16*$SZ+16($inp),%xmm1 1648 vmovdqu -16*$SZ+32($inp),%xmm2 1649 lea $TABLE+0x80(%rip),$Tbl # size optimization 1650 vmovdqu -16*$SZ+48($inp),%xmm3 1651 vmovdqu -16*$SZ+64($inp),%xmm4 1652 vmovdqu -16*$SZ+80($inp),%xmm5 1653 vmovdqu -16*$SZ+96($inp),%xmm6 1654 vmovdqu -16*$SZ+112($inp),%xmm7 1655 #mov $inp,$_inp # offload $inp 1656 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1657 vinserti128 \$1,(%r12),@X[0],@X[0] 1658 vinserti128 \$1,16(%r12),@X[1],@X[1] 1659 vpshufb $t2,@X[0],@X[0] 1660 vinserti128 \$1,32(%r12),@X[2],@X[2] 1661 vpshufb $t2,@X[1],@X[1] 1662 vinserti128 \$1,48(%r12),@X[3],@X[3] 1663 vpshufb $t2,@X[2],@X[2] 1664 vinserti128 \$1,64(%r12),@X[4],@X[4] 1665 vpshufb $t2,@X[3],@X[3] 1666 vinserti128 \$1,80(%r12),@X[5],@X[5] 1667 vpshufb $t2,@X[4],@X[4] 1668 vinserti128 \$1,96(%r12),@X[6],@X[6] 1669 vpshufb $t2,@X[5],@X[5] 1670 vinserti128 \$1,112(%r12),@X[7],@X[7] 1671 1672 vpaddq -0x80($Tbl),@X[0],$t0 1673 vpshufb $t2,@X[6],@X[6] 1674 vpaddq -0x60($Tbl),@X[1],$t1 1675 vpshufb $t2,@X[7],@X[7] 1676 vpaddq -0x40($Tbl),@X[2],$t2 1677 vpaddq -0x20($Tbl),@X[3],$t3 1678 vmovdqa $t0,0x00(%rsp) 1679 vpaddq 0x00($Tbl),@X[4],$t0 1680 vmovdqa $t1,0x20(%rsp) 1681 vpaddq 0x20($Tbl),@X[5],$t1 1682 vmovdqa $t2,0x40(%rsp) 1683 vpaddq 0x40($Tbl),@X[6],$t2 1684 vmovdqa $t3,0x60(%rsp) 1685 lea -$PUSH8(%rsp),%rsp 1686 vpaddq 0x60($Tbl),@X[7],$t3 1687 vmovdqa $t0,0x00(%rsp) 1688 xor $a1,$a1 1689 vmovdqa $t1,0x20(%rsp) 1690 mov $B,$a3 1691 vmovdqa $t2,0x40(%rsp) 1692 xor $C,$a3 # magic 1693 vmovdqa $t3,0x60(%rsp) 1694 mov $F,$a4 1695 add \$16*2*$SZ,$Tbl 1696 jmp .Lavx2_00_47 1697 1698.align 16 1699.Lavx2_00_47: 1700___ 1701 1702sub AVX2_512_00_47 () { 1703my $j = shift; 1704my $body = shift; 1705my @X = @_; 1706my @insns = (&$body,&$body); # 48 instructions 1707my $base = "+2*$PUSH8(%rsp)"; 1708 1709 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 1710 foreach (Xupdate_512_AVX()) { # 23 instructions 1711 eval; 1712 if ($_ !~ /\;$/) { 1713 eval(shift(@insns)); 1714 eval(shift(@insns)); 1715 eval(shift(@insns)); 1716 } 1717 } 1718 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1719 foreach (@insns) { eval; } # remaining instructions 1720 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1721} 1722 1723 for ($i=0,$j=0; $j<8; $j++) { 1724 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 1725 push(@X,shift(@X)); # rotate(@X) 1726 } 1727 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1728 &cmpb (($SZ-1-0x80)."($Tbl)",0); 1729 &jne (".Lavx2_00_47"); 1730 1731 for ($i=0; $i<16; ) { 1732 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1733 foreach(bodyx_00_15()) { eval; } 1734 } 1735} 1736$code.=<<___; 1737 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 1738 add $a1,$A 1739 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 1740 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1741 1742 add $SZ*0($ctx),$A 1743 add $SZ*1($ctx),$B 1744 add $SZ*2($ctx),$C 1745 add $SZ*3($ctx),$D 1746 add $SZ*4($ctx),$E 1747 add $SZ*5($ctx),$F 1748 add $SZ*6($ctx),$G 1749 add $SZ*7($ctx),$H 1750 1751 mov $A,$SZ*0($ctx) 1752 mov $B,$SZ*1($ctx) 1753 
mov $C,$SZ*2($ctx) 1754 mov $D,$SZ*3($ctx) 1755 mov $E,$SZ*4($ctx) 1756 mov $F,$SZ*5($ctx) 1757 mov $G,$SZ*6($ctx) 1758 mov $H,$SZ*7($ctx) 1759 1760 cmp `$PUSH8+2*8`($Tbl),$inp # $_end 1761 je .Ldone_avx2 1762 1763 xor $a1,$a1 1764 mov $B,$a3 1765 xor $C,$a3 # magic 1766 mov $F,$a4 1767 jmp .Lower_avx2 1768.align 16 1769.Lower_avx2: 1770___ 1771 for ($i=0; $i<8; ) { 1772 my $base="+16($Tbl)"; 1773 foreach(bodyx_00_15()) { eval; } 1774 } 1775$code.=<<___; 1776 lea -$PUSH8($Tbl),$Tbl 1777 cmp %rsp,$Tbl 1778 jae .Lower_avx2 1779 1780 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 1781 add $a1,$A 1782 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 1783 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1784 1785 add $SZ*0($ctx),$A 1786 add $SZ*1($ctx),$B 1787 add $SZ*2($ctx),$C 1788 add $SZ*3($ctx),$D 1789 add $SZ*4($ctx),$E 1790 add $SZ*5($ctx),$F 1791 lea `2*16*$SZ`($inp),$inp # inp+=2 1792 add $SZ*6($ctx),$G 1793 mov $inp,%r12 1794 add $SZ*7($ctx),$H 1795 cmp $_end,$inp 1796 1797 mov $A,$SZ*0($ctx) 1798 cmove %rsp,%r12 # next block or stale data 1799 mov $B,$SZ*1($ctx) 1800 mov $C,$SZ*2($ctx) 1801 mov $D,$SZ*3($ctx) 1802 mov $E,$SZ*4($ctx) 1803 mov $F,$SZ*5($ctx) 1804 mov $G,$SZ*6($ctx) 1805 mov $H,$SZ*7($ctx) 1806 1807 jbe .Loop_avx2 1808 lea (%rsp),$Tbl 1809 1810.Ldone_avx2: 1811 lea ($Tbl),%rsp 1812 mov $_rsp,%rsi 1813.cfi_def_cfa %rsi,8 1814 vzeroupper 1815___ 1816$code.=<<___ if ($win64); 1817 movaps 16*$SZ+32(%rsp),%xmm6 1818 movaps 16*$SZ+48(%rsp),%xmm7 1819 movaps 16*$SZ+64(%rsp),%xmm8 1820 movaps 16*$SZ+80(%rsp),%xmm9 1821___ 1822$code.=<<___ if ($win64 && $SZ>4); 1823 movaps 16*$SZ+96(%rsp),%xmm10 1824 movaps 16*$SZ+112(%rsp),%xmm11 1825___ 1826$code.=<<___; 1827 mov -48(%rsi),%r15 1828.cfi_restore %r15 1829 mov -40(%rsi),%r14 1830.cfi_restore %r14 1831 mov -32(%rsi),%r13 1832.cfi_restore %r13 1833 mov -24(%rsi),%r12 1834.cfi_restore %r12 1835 mov -16(%rsi),%rbp 1836.cfi_restore %rbp 1837 mov -8(%rsi),%rbx 1838.cfi_restore %rbx 1839 lea (%rsi),%rsp 1840.cfi_def_cfa_register %rsp 1841.Lepilogue_avx2: 1842 ret 1843.cfi_endproc 1844.size ${func}_avx2,.-${func}_avx2 1845___ 1846}} 1847}}}}} 1848 1849# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1850# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1851if ($win64) { 1852$rec="%rcx"; 1853$frame="%rdx"; 1854$context="%r8"; 1855$disp="%r9"; 1856 1857$code.=<<___; 1858.extern __imp_RtlVirtualUnwind 1859.type se_handler,\@abi-omnipotent 1860.align 16 1861se_handler: 1862 push %rsi 1863 push %rdi 1864 push %rbx 1865 push %rbp 1866 push %r12 1867 push %r13 1868 push %r14 1869 push %r15 1870 pushfq 1871 sub \$64,%rsp 1872 1873 mov 120($context),%rax # pull context->Rax 1874 mov 248($context),%rbx # pull context->Rip 1875 1876 mov 8($disp),%rsi # disp->ImageBase 1877 mov 56($disp),%r11 # disp->HanderlData 1878 1879 mov 0(%r11),%r10d # HandlerData[0] 1880 lea (%rsi,%r10),%r10 # prologue label 1881 cmp %r10,%rbx # context->Rip<prologue label 1882 jb .Lin_prologue 1883 1884 mov 152($context),%rax # pull context->Rsp 1885 1886 mov 4(%r11),%r10d # HandlerData[1] 1887 lea (%rsi,%r10),%r10 # epilogue label 1888 cmp %r10,%rbx # context->Rip>=epilogue label 1889 jae .Lin_prologue 1890___ 1891$code.=<<___ if ($avx>1); 1892 lea .Lavx2_shortcut(%rip),%r10 1893 cmp %r10,%rbx # context->Rip<avx2_shortcut 1894 jb .Lnot_in_avx2 1895 1896 and \$-256*$SZ,%rax 1897 add \$`2*$SZ*($rounds-8)`,%rax 1898.Lnot_in_avx2: 1899___ 1900$code.=<<___; 1901 mov %rax,%rsi # put aside Rsp 1902 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 1903 1904 mov -8(%rax),%rbx 1905 mov -16(%rax),%rbp 1906 
mov -24(%rax),%r12 1907 mov -32(%rax),%r13 1908 mov -40(%rax),%r14 1909 mov -48(%rax),%r15 1910 mov %rbx,144($context) # restore context->Rbx 1911 mov %rbp,160($context) # restore context->Rbp 1912 mov %r12,216($context) # restore context->R12 1913 mov %r13,224($context) # restore context->R13 1914 mov %r14,232($context) # restore context->R14 1915 mov %r15,240($context) # restore context->R15 1916 1917 lea .Lepilogue(%rip),%r10 1918 cmp %r10,%rbx 1919 jb .Lin_prologue # non-AVX code 1920 1921 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 1922 lea 512($context),%rdi # &context.Xmm6 1923 mov \$`$SZ==4?8:12`,%ecx 1924 .long 0xa548f3fc # cld; rep movsq 1925 1926.Lin_prologue: 1927 mov 8(%rax),%rdi 1928 mov 16(%rax),%rsi 1929 mov %rax,152($context) # restore context->Rsp 1930 mov %rsi,168($context) # restore context->Rsi 1931 mov %rdi,176($context) # restore context->Rdi 1932 1933 mov 40($disp),%rdi # disp->ContextRecord 1934 mov $context,%rsi # context 1935 mov \$154,%ecx # sizeof(CONTEXT) 1936 .long 0xa548f3fc # cld; rep movsq 1937 1938 mov $disp,%rsi 1939 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1940 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1941 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1942 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1943 mov 40(%rsi),%r10 # disp->ContextRecord 1944 lea 56(%rsi),%r11 # &disp->HandlerData 1945 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1946 mov %r10,32(%rsp) # arg5 1947 mov %r11,40(%rsp) # arg6 1948 mov %r12,48(%rsp) # arg7 1949 mov %rcx,56(%rsp) # arg8, (NULL) 1950 call *__imp_RtlVirtualUnwind(%rip) 1951 1952 mov \$1,%eax # ExceptionContinueSearch 1953 add \$64,%rsp 1954 popfq 1955 pop %r15 1956 pop %r14 1957 pop %r13 1958 pop %r12 1959 pop %rbp 1960 pop %rbx 1961 pop %rdi 1962 pop %rsi 1963 ret 1964.size se_handler,.-se_handler 1965___ 1966 1967$code.=<<___ if ($SZ==4 && $shaext); 1968.type shaext_handler,\@abi-omnipotent 1969.align 16 1970shaext_handler: 1971 push %rsi 1972 push %rdi 1973 push %rbx 1974 push %rbp 1975 push %r12 1976 push %r13 1977 push %r14 1978 push %r15 1979 pushfq 1980 sub \$64,%rsp 1981 1982 mov 120($context),%rax # pull context->Rax 1983 mov 248($context),%rbx # pull context->Rip 1984 1985 lea .Lprologue_shaext(%rip),%r10 1986 cmp %r10,%rbx # context->Rip<.Lprologue 1987 jb .Lin_prologue 1988 1989 lea .Lepilogue_shaext(%rip),%r10 1990 cmp %r10,%rbx # context->Rip>=.Lepilogue 1991 jae .Lin_prologue 1992 1993 lea -8-5*16(%rax),%rsi 1994 lea 512($context),%rdi # &context.Xmm6 1995 mov \$10,%ecx 1996 .long 0xa548f3fc # cld; rep movsq 1997 1998 jmp .Lin_prologue 1999.size shaext_handler,.-shaext_handler 2000___ 2001 2002$code.=<<___; 2003.section .pdata 2004.align 4 2005 .rva .LSEH_begin_$func 2006 .rva .LSEH_end_$func 2007 .rva .LSEH_info_$func 2008___ 2009$code.=<<___ if ($SZ==4 && $shaext); 2010 .rva .LSEH_begin_${func}_shaext 2011 .rva .LSEH_end_${func}_shaext 2012 .rva .LSEH_info_${func}_shaext 2013___ 2014$code.=<<___ if ($SZ==4); 2015 .rva .LSEH_begin_${func}_ssse3 2016 .rva .LSEH_end_${func}_ssse3 2017 .rva .LSEH_info_${func}_ssse3 2018___ 2019$code.=<<___ if ($avx); 2020 .rva .LSEH_begin_${func}_avx 2021 .rva .LSEH_end_${func}_avx 2022 .rva .LSEH_info_${func}_avx 2023___ 2024$code.=<<___ if ($avx>1); 2025 .rva .LSEH_begin_${func}_avx2 2026 .rva .LSEH_end_${func}_avx2 2027 .rva .LSEH_info_${func}_avx2 2028___ 2029$code.=<<___; 2030.section .xdata 2031.align 8 2032.LSEH_info_$func: 2033 .byte 9,0,0,0 2034 .rva se_handler 2035 .rva .Lprologue,.Lepilogue # HandlerData[] 2036___ 2037$code.=<<___ if ($SZ==4 && $shaext); 
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
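# Worked example of the encoding above, assuming the AT&T "src,dst" operand
# order used throughout this file: "sha256rnds2 %xmm0,%xmm1" is rewritten by
# the final substitution loop into
#
#	.byte	0x0f,0x38,0xcb,0xc8
#
# where 0xcb is the sha256rnds2 opcode byte and the ModR/M byte is
# 0xc0 | (0 & 7) | ((1 & 7) << 3) == 0xc8. Mnemonics whose operands the regex
# does not match are passed through unchanged for the assembler to encode.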