#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128-byte shared
# table]. The GHASH function also features a so-called "528B" variant
# utilizing an additional 256+16 bytes of per-key storage [+512-byte
# shared table]. Performance results are for the streamed GHASH
# subroutine and are expressed in cycles per processed byte, less is
# better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as Opteron's;

# May 2010
#
# Add a PCLMULQDQ version performing at 2.02 cycles per processed
# byte. See ghash-x86.pl for background information and details about
# coding techniques.
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor there. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# little sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement), but not for Bulldozer, whose logical SIMD
# operations are twice as slow as Intel's, so its critical path is
# longer. A CPU with a higher pclmulqdq issue rate would also benefit
# from a higher aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)

# March 2013
#
# ... the 8x aggregate factor AVX code path uses the reduction
# algorithm suggested by Shay Gueron[1]. Even though contemporary
# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
# code performs sub-optimally on them in comparison to the
# above-mentioned version. But thanks to Ilya Albrekht and Max
# Locktyukhin of Intel Corp. we know that it performs at 0.41 cycles
# per byte on a Haswell processor, and at 0.29 on Broadwell.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

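# All of the pclmulqdq-based paths below build on carry-less (GF(2))
# multiplication. As a rough scalar model of what a single such
# multiplication computes (illustration only; this helper is unused by
# the code generation below and its name is arbitrary):
sub _clmul_scalar_demo {
	my ($a, $b) = @_;
	my $r = 0;
	for (my $i = 0; ($b >> $i) != 0; $i++) {
		# XOR partial products instead of adding them: bits never carry
		$r ^= $a << $i if (($b >> $i) & 1);
	}
	return $r;
}
# e.g. _clmul_scalar_demo(0b11, 0b11) == 0b101 (5), not 9.
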
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable this after testing. $avx goes up to 2.
$avx = 0;

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$do4xaggr=1;

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
			$r =~ s/%[er]([sd]i)/%\1l/	or
			$r =~ s/%[er](bp)/%\1l/		or
			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

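# For example, &LB("%rax") gives "%al" and &LB("%r9") gives "%r9b", while a
# call such as &mov("%r8","8(%rdi)") is caught by AUTOLOAD and appends
# "\tmov\t8(%rdi),%r8\n" to $code, i.e. the last (source) argument comes
# first in AT&T order; a bare numeric argument gets a "$" immediate prefix,
# so &sub($Htbl,-128) becomes "sub $-128,%rsi".
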
{ my $N;
  sub loop() {
  my $inp = shift;

	$N++;
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

	bswap	$Zlo
	bswap	$Zhi
___
}}

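# loop() above emits the body of the "4-bit" multiplication: Xi*H is in
# effect evaluated Horner-style over the 32 nibbles of Xi, low nibble then
# high nibble of each byte, from the last byte of Xi to the first. Each
# step shifts the 128-bit accumulator Z right by 4 bits, folds the four
# bits that fall off back in via the shared .Lrem_4bit table (the reduction
# modulo the GHASH polynomial), and XORs in the Htbl entry - one of the 16
# precomputed multiples of H - selected by the current nibble.
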
$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
	push	%rbx
	push	%rbp		# %rbp and %r12 are pushed exclusively in
	push	%r12		# order to reuse Win64 exception handler...
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
	lea	.Lrem_4bit(%rip),$rem_4bit
___
	&loop	($Xi);
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	mov	16(%rsp),%rbx
	lea	24(%rsp),%rsp
.Lgmult_epilogue:
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___

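# gcm_ghash_4bit below is the "528B" variant mentioned at the top of the
# file: on entry it spills to the stack a copy of the 256-byte Htbl shifted
# right by 4 bits, plus 16 bytes recording the nibbles shifted out of each
# entry, so that every input byte can then be reduced in one step through
# rem_8bit, the 512-byte shared table, instead of two 4-bit steps.
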
# per-function register layout
$inp="%rdx";
$len="%rcx";
$rem_8bit=$rem_4bit;

$code.=<<___;
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,\@function,4
.align	16
gcm_ghash_4bit:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$280,%rsp
.Lghash_prologue:
	mov	$inp,%r14		# reassign couple of args
	mov	$len,%r15
___
{ my $inp="%r14";
  my $dat="%edx";
  my $len="%r15";
  my @nhi=("%ebx","%ecx");
  my @rem=("%r12","%r13");
  my $Hshr4="%rbp";

	&sub	($Htbl,-128);		# size optimization
	&lea	($Hshr4,"16+128(%rsp)");
	{ my @lo =($nlo,$nhi);
	  my @hi =($Zlo,$Zhi);

	  &xor	($dat,$dat);
	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
	    &or		($lo[0],$tmp)			if ($i>1);
	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
	    &shr	($lo[1],4)			if ($i>0 && $i<17);
	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
	    &shr	($hi[1],4)			if ($i>0 && $i<17);
	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
	    &shl	($tmp,60)			if ($i>0 && $i<17);

	    push	(@lo,shift(@lo));
	    push	(@hi,shift(@hi));
	  }
	}
	&add	($Htbl,-128);
	&mov	($Zlo,"8($Xi)");
	&mov	($Zhi,"0($Xi)");
	&add	($len,$inp);		# pointer to the end of data
	&lea	($rem_8bit,".Lrem_8bit(%rip)");
	&jmp	(".Louter_loop");

$code.=".align	16\n.Louter_loop:\n";
	&xor	($Zhi,"($inp)");
	&mov	("%rdx","8($inp)");
	&lea	($inp,"16($inp)");
	&xor	("%rdx",$Zlo);
	&mov	("($Xi)",$Zhi);
	&mov	("8($Xi)","%rdx");
	&shr	("%rdx",32);

	&xor	($nlo,$nlo);
	&rol	($dat,8);
	&mov	(&LB($nlo),&LB($dat));
	&movz	($nhi[0],&LB($dat));
	&shl	(&LB($nlo),4);
	&shr	($nhi[0],4);

	for ($j=11,$i=0;$i<15;$i++) {
	    &rol	($dat,8);
	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);

	    &mov	(&LB($nlo),&LB($dat));
	    &xor	($Zlo,$tmp)				if ($i>0);
	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);

	    &movz	($nhi[1],&LB($dat));
	    &shl	(&LB($nlo),4);
	    &movzb	($rem[0],"(%rsp,$nhi[0])");

	    &shr	($nhi[1],4)				if ($i<14);
	    &and	($nhi[1],0xf0)				if ($i==14);
	    &shl	($rem[1],48)				if ($i>0);
	    &xor	($rem[0],$Zlo);

	    &mov	($tmp,$Zhi);
	    &xor	($Zhi,$rem[1])				if ($i>0);
	    &shr	($Zlo,8);

	    &movz	($rem[0],&LB($rem[0]));
	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
	    &shr	($Zhi,8);

	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
	    &shl	($tmp,56);
	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");

	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
	    unshift	(@rem,pop(@rem));
	}
	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
	&xor	($Zlo,"8($Htbl,$nlo)");
	&xor	($Zhi,"($Htbl,$nlo)");

	&shl	($rem[1],48);
	&xor	($Zlo,$tmp);

	&xor	($Zhi,$rem[1]);
	&movz	($rem[0],&LB($Zlo));
	&shr	($Zlo,4);

	&mov	($tmp,$Zhi);
	&shl	(&LB($rem[0]),4);
	&shr	($Zhi,4);

	&xor	($Zlo,"8($Htbl,$nhi[0])");
	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
	&shl	($tmp,60);

	&xor	($Zhi,"($Htbl,$nhi[0])");
	&xor	($Zlo,$tmp);
	&shl	($rem[0],48);

	&bswap	($Zlo);
	&xor	($Zhi,$rem[0]);

	&bswap	($Zhi);
	&cmp	($inp,$len);
	&jb	(".Louter_loop");
}
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___

######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

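# clmul64x64_T2 below multiplies two 128-bit operands with three pclmulqdq
# instructions using the Karatsuba identity over GF(2):
#
#	(Ah*x^64+Al)*(Bh*x^64+Bl) = Ah*Bh*x^128 + Al*Bl
#		+ [(Ah^Al)*(Bh^Bl) ^ Ah*Bh ^ Al*Bl]*x^64
#
# The caller may pass a precomputed Bh^Bl ("Karatsuba salt") as $HK to skip
# the pshufd/pxor pre-processing.
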
sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}

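# reduction_alg9 above folds the 256-bit product $Xhi:$Xi back to 128 bits
# modulo the GHASH polynomial using only shifts and XORs. The bit-reflected
# form of that polynomial also appears below as .L0x1c2_polynomial, which
# the AVX code path feeds to pclmulqdq directly instead (cf. the reduction
# algorithm credited to Shay Gueron in the header).
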
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

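# After gcm_init_clmul the Htbl layout consumed by the CLMUL paths is:
#	0x00	H		0x30	H^3
#	0x10	H^2		0x40	H^4
#	0x20	salt(H,H^2)	0x50	salt(H^3,H^4)
# where each "salt" packs the xor of the high and low halves of the two
# neighbouring powers for Karatsuba pre-processing.
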
{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

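# gcm_ghash_clmul processes the bulk of the data four blocks at a time,
# using H^1..H^4 and the aggregated formula quoted inside the loop, as long
# as enough input remains and the OPENSSL_ia32cap_P check does not divert
# to .Lskip4x; shorter inputs drop to the two-block path and finally to a
# single .Lodd_tail multiplication.
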
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	mov		OPENSSL_ia32cap_P+4(%rip),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jc		.Ltail4x

	jmp		.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jnc		.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add		\$0x40,$len
	jz		.Ldone
	movdqu		0x20($Htbl),$HK
	sub		\$0x10,$len
	jz		.Lodd_tail
.Lskip4x:
___
}
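
# Note that in the 4x path above the Karatsuba middle terms of all four
# block multiplications are accumulated first and split into high/low
# contributions once, and only then a single reduction is performed; the
# 2x and 1x code below applies the same post-processing at smaller scale.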
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

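# gcm_init_avx extends the key schedule to H^1..H^8 for the 8x-aggregated
# AVX GHASH (when $avx is enabled; otherwise it tail-jumps to the CLMUL
# version). Each 0x30-byte group of Htbl holds an odd power, the following
# even power and their Karatsuba "salt", which is exactly the layout that
# gcm_ghash_avx indexes with the 0xNN-0x40($Htbl) offsets further down.
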
$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

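# reduction_avx above is the same two-phase shift-and-xor reduction as
# reduction_alg9, just with the 57/62/63 shift counts written out instead
# of being factored; gcm_gmult_avx below simply reuses the CLMUL single
# block code, since one multiplication cannot benefit from 8x aggregation.
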
$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
	jmp	.L_gmult_clmul
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}
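
# Constant data shared by all code paths: .Lbswap_mask feeds pshufb/vpshufb
# to byte-reverse each 16-byte block on load and store, .L0x1c2_polynomial
# is the GHASH polynomial in the bit-reflected form used here, .L7_mask
# supports the pshufb-based first reduction phase in the 4x loop, and
# .Lrem_4bit/.Lrem_8bit are the 128-byte and 512-byte reduction tables used
# by the non-CLMUL functions above.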

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.L7_mask_poly:
	.long	7,0,`0xE1<<1`,0
.align	64
.type	.Lrem_4bit,\@object
.Lrem_4bit:
	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
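
# The Win64 unwind data below comes in two flavours: gcm_gmult_4bit and
# gcm_ghash_4bit register se_handler via their .xdata records so the saved
# GPRs can be rebuilt during unwinding, while the CLMUL/AVX entries encode
# their XMM-saving prologues directly as raw unwind codes and need no
# language-specific handler.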

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	24(%rax),%rax		# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;