#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$addx = 1;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
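#
# A hedged reference model of the gather performed below (illustrative
# pseudo-C, not the bn_exp.c implementation): the powers table behind bp
# stores limb j of power k at bp[2^5*j + k], and the secret index is applied
# only through masks, never through an address:
#
#	for (j = 0; j < num; j++) {
#		BN_ULONG w = 0;
#		for (k = 0; k < (1<<5); k++)
#			w |= bp[(j<<5) + k] & (0 - (BN_ULONG)(k == idx));
#		b[j] = w;
#	}
#
# so the memory access pattern is independent of the exponent window.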
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$7,${num}d
	jnz	.Lmul_enter
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	8(%r11),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farthest one
	# can be punished with SEGV. But page walking does good even on
	# other OSes, because it guarantees that a rogue thread hits the
	# guard page before it can do damage to an innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	lea	.Linc(%rip),%r10
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
	$bp="%r12";
	$STRIDE=2**5*8;			# 5 is "window size"
	$N=$STRIDE/4;			# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	#
while it's still in register 205 206 pand `16*($k+1)-128`($bp),%xmm1 207 pand `16*($k+2)-128`($bp),%xmm2 208 movdqa %xmm3,`16*($k+3)+112`(%r10) 209 pand `16*($k+3)-128`($bp),%xmm3 210 por %xmm2,%xmm0 211 por %xmm3,%xmm1 212___ 213for($k=0;$k<$STRIDE/16-4;$k+=4) { 214$code.=<<___; 215 movdqa `16*($k+0)-128`($bp),%xmm4 216 movdqa `16*($k+1)-128`($bp),%xmm5 217 movdqa `16*($k+2)-128`($bp),%xmm2 218 pand `16*($k+0)+112`(%r10),%xmm4 219 movdqa `16*($k+3)-128`($bp),%xmm3 220 pand `16*($k+1)+112`(%r10),%xmm5 221 por %xmm4,%xmm0 222 pand `16*($k+2)+112`(%r10),%xmm2 223 por %xmm5,%xmm1 224 pand `16*($k+3)+112`(%r10),%xmm3 225 por %xmm2,%xmm0 226 por %xmm3,%xmm1 227___ 228} 229$code.=<<___; 230 por %xmm1,%xmm0 231 pshufd \$0x4e,%xmm0,%xmm1 232 por %xmm1,%xmm0 233 lea $STRIDE($bp),$bp 234 movq %xmm0,$m0 # m0=bp[0] 235 236 mov ($n0),$n0 # pull n0[0] value 237 mov ($ap),%rax 238 239 xor $i,$i # i=0 240 xor $j,$j # j=0 241 242 mov $n0,$m1 243 mulq $m0 # ap[0]*bp[0] 244 mov %rax,$lo0 245 mov ($np),%rax 246 247 imulq $lo0,$m1 # "tp[0]"*n0 248 mov %rdx,$hi0 249 250 mulq $m1 # np[0]*m1 251 add %rax,$lo0 # discarded 252 mov 8($ap),%rax 253 adc \$0,%rdx 254 mov %rdx,$hi1 255 256 lea 1($j),$j # j++ 257 jmp .L1st_enter 258 259.align 16 260.L1st: 261 add %rax,$hi1 262 mov ($ap,$j,8),%rax 263 adc \$0,%rdx 264 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 265 mov $lo0,$hi0 266 adc \$0,%rdx 267 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 268 mov %rdx,$hi1 269 270.L1st_enter: 271 mulq $m0 # ap[j]*bp[0] 272 add %rax,$hi0 273 mov ($np,$j,8),%rax 274 adc \$0,%rdx 275 lea 1($j),$j # j++ 276 mov %rdx,$lo0 277 278 mulq $m1 # np[j]*m1 279 cmp $num,$j 280 jne .L1st # note that upon exit $j==$num, so 281 # they can be used interchangeably 282 283 add %rax,$hi1 284 adc \$0,%rdx 285 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 286 adc \$0,%rdx 287 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 288 mov %rdx,$hi1 289 mov $lo0,$hi0 290 291 xor %rdx,%rdx 292 add $hi0,$hi1 293 adc \$0,%rdx 294 mov $hi1,-8(%rsp,$num,8) 295 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 296 297 lea 1($i),$i # i++ 298 jmp .Louter 299.align 16 300.Louter: 301 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 302 and \$-16,%rdx 303 pxor %xmm4,%xmm4 304 pxor %xmm5,%xmm5 305___ 306for($k=0;$k<$STRIDE/16;$k+=4) { 307$code.=<<___; 308 movdqa `16*($k+0)-128`($bp),%xmm0 309 movdqa `16*($k+1)-128`($bp),%xmm1 310 movdqa `16*($k+2)-128`($bp),%xmm2 311 movdqa `16*($k+3)-128`($bp),%xmm3 312 pand `16*($k+0)-128`(%rdx),%xmm0 313 pand `16*($k+1)-128`(%rdx),%xmm1 314 por %xmm0,%xmm4 315 pand `16*($k+2)-128`(%rdx),%xmm2 316 por %xmm1,%xmm5 317 pand `16*($k+3)-128`(%rdx),%xmm3 318 por %xmm2,%xmm4 319 por %xmm3,%xmm5 320___ 321} 322$code.=<<___; 323 por %xmm5,%xmm4 324 pshufd \$0x4e,%xmm4,%xmm0 325 por %xmm4,%xmm0 326 lea $STRIDE($bp),$bp 327 328 mov ($ap),%rax # ap[0] 329 movq %xmm0,$m0 # m0=bp[i] 330 331 xor $j,$j # j=0 332 mov $n0,$m1 333 mov (%rsp),$lo0 334 335 mulq $m0 # ap[0]*bp[i] 336 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 337 mov ($np),%rax 338 adc \$0,%rdx 339 340 imulq $lo0,$m1 # tp[0]*n0 341 mov %rdx,$hi0 342 343 mulq $m1 # np[0]*m1 344 add %rax,$lo0 # discarded 345 mov 8($ap),%rax 346 adc \$0,%rdx 347 mov 8(%rsp),$lo0 # tp[1] 348 mov %rdx,$hi1 349 350 lea 1($j),$j # j++ 351 jmp .Linner_enter 352 353.align 16 354.Linner: 355 add %rax,$hi1 356 mov ($ap,$j,8),%rax 357 adc \$0,%rdx 358 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 359 mov (%rsp,$j,8),$lo0 360 adc \$0,%rdx 361 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 362 mov %rdx,$hi1 363 364.Linner_enter: 365 mulq $m0 # ap[j]*bp[i] 
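	# (Reference note, hedged: together with the mulq above, the lines
	#  below advance one word of the recurrence
	#     {carry, tp[j-1]} = np[j]*m1 + ap[j]*bp[i] + tp[j] + carry;
	#  the result lands one slot down because tp[0] of this pass is
	#  cancelled by the Montgomery reduction and discarded.)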
366 add %rax,$hi0 367 mov ($np,$j,8),%rax 368 adc \$0,%rdx 369 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 370 mov %rdx,$hi0 371 adc \$0,$hi0 372 lea 1($j),$j # j++ 373 374 mulq $m1 # np[j]*m1 375 cmp $num,$j 376 jne .Linner # note that upon exit $j==$num, so 377 # they can be used interchangeably 378 add %rax,$hi1 379 adc \$0,%rdx 380 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 381 mov (%rsp,$num,8),$lo0 382 adc \$0,%rdx 383 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 384 mov %rdx,$hi1 385 386 xor %rdx,%rdx 387 add $hi0,$hi1 388 adc \$0,%rdx 389 add $lo0,$hi1 # pull upmost overflow bit 390 adc \$0,%rdx 391 mov $hi1,-8(%rsp,$num,8) 392 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 393 394 lea 1($i),$i # i++ 395 cmp $num,$i 396 jb .Louter 397 398 xor $i,$i # i=0 and clear CF! 399 mov (%rsp),%rax # tp[0] 400 lea (%rsp),$ap # borrow ap for tp 401 mov $num,$j # j=num 402 jmp .Lsub 403.align 16 404.Lsub: sbb ($np,$i,8),%rax 405 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 406 mov 8($ap,$i,8),%rax # tp[i+1] 407 lea 1($i),$i # i++ 408 dec $j # doesn't affect CF! 409 jnz .Lsub 410 411 sbb \$0,%rax # handle upmost overflow bit 412 mov \$-1,%rbx 413 xor %rax,%rbx 414 xor $i,$i 415 mov $num,$j # j=num 416 417.Lcopy: # conditional copy 418 mov ($rp,$i,8),%rcx 419 mov (%rsp,$i,8),%rdx 420 and %rbx,%rcx 421 and %rax,%rdx 422 mov $i,(%rsp,$i,8) # zap temporary vector 423 or %rcx,%rdx 424 mov %rdx,($rp,$i,8) # rp[i]=tp[i] 425 lea 1($i),$i 426 sub \$1,$j 427 jnz .Lcopy 428 429 mov 8(%rsp,$num,8),%rsi # restore %rsp 430.cfi_def_cfa %rsi,8 431 mov \$1,%rax 432 433 mov -48(%rsi),%r15 434.cfi_restore %r15 435 mov -40(%rsi),%r14 436.cfi_restore %r14 437 mov -32(%rsi),%r13 438.cfi_restore %r13 439 mov -24(%rsi),%r12 440.cfi_restore %r12 441 mov -16(%rsi),%rbp 442.cfi_restore %rbp 443 mov -8(%rsi),%rbx 444.cfi_restore %rbx 445 lea (%rsi),%rsp 446.cfi_def_cfa_register %rsp 447.Lmul_epilogue: 448 ret 449.cfi_endproc 450.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 451___ 452{{{ 453my @A=("%r10","%r11"); 454my @N=("%r13","%rdi"); 455$code.=<<___; 456.type bn_mul4x_mont_gather5,\@function,6 457.align 32 458bn_mul4x_mont_gather5: 459.cfi_startproc 460 .byte 0x67 461 mov %rsp,%rax 462.cfi_def_cfa_register %rax 463.Lmul4x_enter: 464___ 465$code.=<<___ if ($addx); 466 and \$0x80108,%r11d 467 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 468 je .Lmulx4x_enter 469___ 470$code.=<<___; 471 push %rbx 472.cfi_push %rbx 473 push %rbp 474.cfi_push %rbp 475 push %r12 476.cfi_push %r12 477 push %r13 478.cfi_push %r13 479 push %r14 480.cfi_push %r14 481 push %r15 482.cfi_push %r15 483.Lmul4x_prologue: 484 485 .byte 0x67 486 shl \$3,${num}d # convert $num to bytes 487 lea ($num,$num,2),%r10 # 3*$num in bytes 488 neg $num # -$num 489 490 ############################################################## 491 # Ensure that stack frame doesn't alias with $rptr+3*$num 492 # modulo 4096, which covers ret[num], am[num] and n[num] 493 # (see bn_exp.c). This is done to allow memory disambiguation 494 # logic do its magic. [Extra [num] is allocated in order 495 # to align with bn_power5's frame, which is cleansed after 496 # completing exponentiation. Extra 256 bytes is for power mask 497 # calculated from 7th argument, the index.] 
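	#
	# (A hedged reading of the arithmetic that follows, not a quote from
	# bn_exp.c: with %r10 = 3*num*8 and %r11 = (candidate frame base - rp)
	# & 4095, the frame base is lowered by %r11 when %r10 >= %r11, and the
	# .Lmul4xsp_alt adjustment is taken otherwise, so that no store into
	# tp[] shares its low 12 address bits with a load from the
	# ret[]/am[]/n[] vectors at rp.)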
498 # 499 lea -320(%rsp,$num,2),%r11 500 mov %rsp,%rbp 501 sub $rp,%r11 502 and \$4095,%r11 503 cmp %r11,%r10 504 jb .Lmul4xsp_alt 505 sub %r11,%rbp # align with $rp 506 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 507 jmp .Lmul4xsp_done 508 509.align 32 510.Lmul4xsp_alt: 511 lea 4096-320(,$num,2),%r10 512 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 513 sub %r10,%r11 514 mov \$0,%r10 515 cmovc %r10,%r11 516 sub %r11,%rbp 517.Lmul4xsp_done: 518 and \$-64,%rbp 519 mov %rsp,%r11 520 sub %rbp,%r11 521 and \$-4096,%r11 522 lea (%rbp,%r11),%rsp 523 mov (%rsp),%r10 524 cmp %rbp,%rsp 525 ja .Lmul4x_page_walk 526 jmp .Lmul4x_page_walk_done 527 528.Lmul4x_page_walk: 529 lea -4096(%rsp),%rsp 530 mov (%rsp),%r10 531 cmp %rbp,%rsp 532 ja .Lmul4x_page_walk 533.Lmul4x_page_walk_done: 534 535 neg $num 536 537 mov %rax,40(%rsp) 538.cfi_cfa_expression %rsp+40,deref,+8 539.Lmul4x_body: 540 541 call mul4x_internal 542 543 mov 40(%rsp),%rsi # restore %rsp 544.cfi_def_cfa %rsi,8 545 mov \$1,%rax 546 547 mov -48(%rsi),%r15 548.cfi_restore %r15 549 mov -40(%rsi),%r14 550.cfi_restore %r14 551 mov -32(%rsi),%r13 552.cfi_restore %r13 553 mov -24(%rsi),%r12 554.cfi_restore %r12 555 mov -16(%rsi),%rbp 556.cfi_restore %rbp 557 mov -8(%rsi),%rbx 558.cfi_restore %rbx 559 lea (%rsi),%rsp 560.cfi_def_cfa_register %rsp 561.Lmul4x_epilogue: 562 ret 563.cfi_endproc 564.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 565 566.type mul4x_internal,\@abi-omnipotent 567.align 32 568mul4x_internal: 569.cfi_startproc 570 shl \$5,$num # $num was in bytes 571 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 572 lea .Linc(%rip),%rax 573 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 574 shr \$5,$num # restore $num 575___ 576 $bp="%r12"; 577 $STRIDE=2**5*8; # 5 is "window size" 578 $N=$STRIDE/4; # should match cache line size 579 $tp=$i; 580$code.=<<___; 581 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 582 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 583 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 584 lea 128(%rdx),$bp # size optimization 585 586 pshufd \$0,%xmm5,%xmm5 # broadcast index 587 movdqa %xmm1,%xmm4 588 .byte 0x67,0x67 589 movdqa %xmm1,%xmm2 590___ 591######################################################################## 592# calculate mask by comparing 0..31 to index and save result to stack 593# 594$code.=<<___; 595 paddd %xmm0,%xmm1 596 pcmpeqd %xmm5,%xmm0 # compare to 1,0 597 .byte 0x67 598 movdqa %xmm4,%xmm3 599___ 600for($i=0;$i<$STRIDE/16-4;$i+=4) { 601$code.=<<___; 602 paddd %xmm1,%xmm2 603 pcmpeqd %xmm5,%xmm1 # compare to 3,2 604 movdqa %xmm0,`16*($i+0)+112`(%r10) 605 movdqa %xmm4,%xmm0 606 607 paddd %xmm2,%xmm3 608 pcmpeqd %xmm5,%xmm2 # compare to 5,4 609 movdqa %xmm1,`16*($i+1)+112`(%r10) 610 movdqa %xmm4,%xmm1 611 612 paddd %xmm3,%xmm0 613 pcmpeqd %xmm5,%xmm3 # compare to 7,6 614 movdqa %xmm2,`16*($i+2)+112`(%r10) 615 movdqa %xmm4,%xmm2 616 617 paddd %xmm0,%xmm1 618 pcmpeqd %xmm5,%xmm0 619 movdqa %xmm3,`16*($i+3)+112`(%r10) 620 movdqa %xmm4,%xmm3 621___ 622} 623$code.=<<___; # last iteration can be optimized 624 paddd %xmm1,%xmm2 625 pcmpeqd %xmm5,%xmm1 626 movdqa %xmm0,`16*($i+0)+112`(%r10) 627 628 paddd %xmm2,%xmm3 629 .byte 0x67 630 pcmpeqd %xmm5,%xmm2 631 movdqa %xmm1,`16*($i+1)+112`(%r10) 632 633 pcmpeqd %xmm5,%xmm3 634 movdqa %xmm2,`16*($i+2)+112`(%r10) 635 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 636 637 pand `16*($i+1)-128`($bp),%xmm1 638 pand 
`16*($i+2)-128`($bp),%xmm2 639 movdqa %xmm3,`16*($i+3)+112`(%r10) 640 pand `16*($i+3)-128`($bp),%xmm3 641 por %xmm2,%xmm0 642 por %xmm3,%xmm1 643___ 644for($i=0;$i<$STRIDE/16-4;$i+=4) { 645$code.=<<___; 646 movdqa `16*($i+0)-128`($bp),%xmm4 647 movdqa `16*($i+1)-128`($bp),%xmm5 648 movdqa `16*($i+2)-128`($bp),%xmm2 649 pand `16*($i+0)+112`(%r10),%xmm4 650 movdqa `16*($i+3)-128`($bp),%xmm3 651 pand `16*($i+1)+112`(%r10),%xmm5 652 por %xmm4,%xmm0 653 pand `16*($i+2)+112`(%r10),%xmm2 654 por %xmm5,%xmm1 655 pand `16*($i+3)+112`(%r10),%xmm3 656 por %xmm2,%xmm0 657 por %xmm3,%xmm1 658___ 659} 660$code.=<<___; 661 por %xmm1,%xmm0 662 pshufd \$0x4e,%xmm0,%xmm1 663 por %xmm1,%xmm0 664 lea $STRIDE($bp),$bp 665 movq %xmm0,$m0 # m0=bp[0] 666 667 mov %r13,16+8(%rsp) # save end of b[num] 668 mov $rp, 56+8(%rsp) # save $rp 669 670 mov ($n0),$n0 # pull n0[0] value 671 mov ($ap),%rax 672 lea ($ap,$num),$ap # end of a[num] 673 neg $num 674 675 mov $n0,$m1 676 mulq $m0 # ap[0]*bp[0] 677 mov %rax,$A[0] 678 mov ($np),%rax 679 680 imulq $A[0],$m1 # "tp[0]"*n0 681 lea 64+8(%rsp),$tp 682 mov %rdx,$A[1] 683 684 mulq $m1 # np[0]*m1 685 add %rax,$A[0] # discarded 686 mov 8($ap,$num),%rax 687 adc \$0,%rdx 688 mov %rdx,$N[1] 689 690 mulq $m0 691 add %rax,$A[1] 692 mov 8*1($np),%rax 693 adc \$0,%rdx 694 mov %rdx,$A[0] 695 696 mulq $m1 697 add %rax,$N[1] 698 mov 16($ap,$num),%rax 699 adc \$0,%rdx 700 add $A[1],$N[1] 701 lea 4*8($num),$j # j=4 702 lea 8*4($np),$np 703 adc \$0,%rdx 704 mov $N[1],($tp) 705 mov %rdx,$N[0] 706 jmp .L1st4x 707 708.align 32 709.L1st4x: 710 mulq $m0 # ap[j]*bp[0] 711 add %rax,$A[0] 712 mov -8*2($np),%rax 713 lea 32($tp),$tp 714 adc \$0,%rdx 715 mov %rdx,$A[1] 716 717 mulq $m1 # np[j]*m1 718 add %rax,$N[0] 719 mov -8($ap,$j),%rax 720 adc \$0,%rdx 721 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 722 adc \$0,%rdx 723 mov $N[0],-24($tp) # tp[j-1] 724 mov %rdx,$N[1] 725 726 mulq $m0 # ap[j]*bp[0] 727 add %rax,$A[1] 728 mov -8*1($np),%rax 729 adc \$0,%rdx 730 mov %rdx,$A[0] 731 732 mulq $m1 # np[j]*m1 733 add %rax,$N[1] 734 mov ($ap,$j),%rax 735 adc \$0,%rdx 736 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 737 adc \$0,%rdx 738 mov $N[1],-16($tp) # tp[j-1] 739 mov %rdx,$N[0] 740 741 mulq $m0 # ap[j]*bp[0] 742 add %rax,$A[0] 743 mov 8*0($np),%rax 744 adc \$0,%rdx 745 mov %rdx,$A[1] 746 747 mulq $m1 # np[j]*m1 748 add %rax,$N[0] 749 mov 8($ap,$j),%rax 750 adc \$0,%rdx 751 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 752 adc \$0,%rdx 753 mov $N[0],-8($tp) # tp[j-1] 754 mov %rdx,$N[1] 755 756 mulq $m0 # ap[j]*bp[0] 757 add %rax,$A[1] 758 mov 8*1($np),%rax 759 adc \$0,%rdx 760 mov %rdx,$A[0] 761 762 mulq $m1 # np[j]*m1 763 add %rax,$N[1] 764 mov 16($ap,$j),%rax 765 adc \$0,%rdx 766 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 767 lea 8*4($np),$np 768 adc \$0,%rdx 769 mov $N[1],($tp) # tp[j-1] 770 mov %rdx,$N[0] 771 772 add \$32,$j # j+=4 773 jnz .L1st4x 774 775 mulq $m0 # ap[j]*bp[0] 776 add %rax,$A[0] 777 mov -8*2($np),%rax 778 lea 32($tp),$tp 779 adc \$0,%rdx 780 mov %rdx,$A[1] 781 782 mulq $m1 # np[j]*m1 783 add %rax,$N[0] 784 mov -8($ap),%rax 785 adc \$0,%rdx 786 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 787 adc \$0,%rdx 788 mov $N[0],-24($tp) # tp[j-1] 789 mov %rdx,$N[1] 790 791 mulq $m0 # ap[j]*bp[0] 792 add %rax,$A[1] 793 mov -8*1($np),%rax 794 adc \$0,%rdx 795 mov %rdx,$A[0] 796 797 mulq $m1 # np[j]*m1 798 add %rax,$N[1] 799 mov ($ap,$num),%rax # ap[0] 800 adc \$0,%rdx 801 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 802 adc \$0,%rdx 803 mov $N[1],-16($tp) # tp[j-1] 804 mov %rdx,$N[0] 805 806 lea ($np,$num),$np # rewind $np 807 
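	# First pass (bp[0]) ends here: fold the two outstanding carry words
	# ($A[0] and $N[0]) into the top temporary limb and keep the overflow
	# bit in $N[1]; .Louter4x below repeats the walk for bp[1..num-1],
	# this time adding the previous tp[] back in.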
808 xor $N[1],$N[1] 809 add $A[0],$N[0] 810 adc \$0,$N[1] 811 mov $N[0],-8($tp) 812 813 jmp .Louter4x 814 815.align 32 816.Louter4x: 817 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 818 pxor %xmm4,%xmm4 819 pxor %xmm5,%xmm5 820___ 821for($i=0;$i<$STRIDE/16;$i+=4) { 822$code.=<<___; 823 movdqa `16*($i+0)-128`($bp),%xmm0 824 movdqa `16*($i+1)-128`($bp),%xmm1 825 movdqa `16*($i+2)-128`($bp),%xmm2 826 movdqa `16*($i+3)-128`($bp),%xmm3 827 pand `16*($i+0)-128`(%rdx),%xmm0 828 pand `16*($i+1)-128`(%rdx),%xmm1 829 por %xmm0,%xmm4 830 pand `16*($i+2)-128`(%rdx),%xmm2 831 por %xmm1,%xmm5 832 pand `16*($i+3)-128`(%rdx),%xmm3 833 por %xmm2,%xmm4 834 por %xmm3,%xmm5 835___ 836} 837$code.=<<___; 838 por %xmm5,%xmm4 839 pshufd \$0x4e,%xmm4,%xmm0 840 por %xmm4,%xmm0 841 lea $STRIDE($bp),$bp 842 movq %xmm0,$m0 # m0=bp[i] 843 844 mov ($tp,$num),$A[0] 845 mov $n0,$m1 846 mulq $m0 # ap[0]*bp[i] 847 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 848 mov ($np),%rax 849 adc \$0,%rdx 850 851 imulq $A[0],$m1 # tp[0]*n0 852 mov %rdx,$A[1] 853 mov $N[1],($tp) # store upmost overflow bit 854 855 lea ($tp,$num),$tp # rewind $tp 856 857 mulq $m1 # np[0]*m1 858 add %rax,$A[0] # "$N[0]", discarded 859 mov 8($ap,$num),%rax 860 adc \$0,%rdx 861 mov %rdx,$N[1] 862 863 mulq $m0 # ap[j]*bp[i] 864 add %rax,$A[1] 865 mov 8*1($np),%rax 866 adc \$0,%rdx 867 add 8($tp),$A[1] # +tp[1] 868 adc \$0,%rdx 869 mov %rdx,$A[0] 870 871 mulq $m1 # np[j]*m1 872 add %rax,$N[1] 873 mov 16($ap,$num),%rax 874 adc \$0,%rdx 875 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 876 lea 4*8($num),$j # j=4 877 lea 8*4($np),$np 878 adc \$0,%rdx 879 mov %rdx,$N[0] 880 jmp .Linner4x 881 882.align 32 883.Linner4x: 884 mulq $m0 # ap[j]*bp[i] 885 add %rax,$A[0] 886 mov -8*2($np),%rax 887 adc \$0,%rdx 888 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 889 lea 32($tp),$tp 890 adc \$0,%rdx 891 mov %rdx,$A[1] 892 893 mulq $m1 # np[j]*m1 894 add %rax,$N[0] 895 mov -8($ap,$j),%rax 896 adc \$0,%rdx 897 add $A[0],$N[0] 898 adc \$0,%rdx 899 mov $N[1],-32($tp) # tp[j-1] 900 mov %rdx,$N[1] 901 902 mulq $m0 # ap[j]*bp[i] 903 add %rax,$A[1] 904 mov -8*1($np),%rax 905 adc \$0,%rdx 906 add -8($tp),$A[1] 907 adc \$0,%rdx 908 mov %rdx,$A[0] 909 910 mulq $m1 # np[j]*m1 911 add %rax,$N[1] 912 mov ($ap,$j),%rax 913 adc \$0,%rdx 914 add $A[1],$N[1] 915 adc \$0,%rdx 916 mov $N[0],-24($tp) # tp[j-1] 917 mov %rdx,$N[0] 918 919 mulq $m0 # ap[j]*bp[i] 920 add %rax,$A[0] 921 mov 8*0($np),%rax 922 adc \$0,%rdx 923 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 924 adc \$0,%rdx 925 mov %rdx,$A[1] 926 927 mulq $m1 # np[j]*m1 928 add %rax,$N[0] 929 mov 8($ap,$j),%rax 930 adc \$0,%rdx 931 add $A[0],$N[0] 932 adc \$0,%rdx 933 mov $N[1],-16($tp) # tp[j-1] 934 mov %rdx,$N[1] 935 936 mulq $m0 # ap[j]*bp[i] 937 add %rax,$A[1] 938 mov 8*1($np),%rax 939 adc \$0,%rdx 940 add 8($tp),$A[1] 941 adc \$0,%rdx 942 mov %rdx,$A[0] 943 944 mulq $m1 # np[j]*m1 945 add %rax,$N[1] 946 mov 16($ap,$j),%rax 947 adc \$0,%rdx 948 add $A[1],$N[1] 949 lea 8*4($np),$np 950 adc \$0,%rdx 951 mov $N[0],-8($tp) # tp[j-1] 952 mov %rdx,$N[0] 953 954 add \$32,$j # j+=4 955 jnz .Linner4x 956 957 mulq $m0 # ap[j]*bp[i] 958 add %rax,$A[0] 959 mov -8*2($np),%rax 960 adc \$0,%rdx 961 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 962 lea 32($tp),$tp 963 adc \$0,%rdx 964 mov %rdx,$A[1] 965 966 mulq $m1 # np[j]*m1 967 add %rax,$N[0] 968 mov -8($ap),%rax 969 adc \$0,%rdx 970 add $A[0],$N[0] 971 adc \$0,%rdx 972 mov $N[1],-32($tp) # tp[j-1] 973 mov %rdx,$N[1] 974 975 mulq $m0 # ap[j]*bp[i] 976 add %rax,$A[1] 977 mov $m1,%rax 978 mov -8*1($np),$m1 979 
adc \$0,%rdx 980 add -8($tp),$A[1] 981 adc \$0,%rdx 982 mov %rdx,$A[0] 983 984 mulq $m1 # np[j]*m1 985 add %rax,$N[1] 986 mov ($ap,$num),%rax # ap[0] 987 adc \$0,%rdx 988 add $A[1],$N[1] 989 adc \$0,%rdx 990 mov $N[0],-24($tp) # tp[j-1] 991 mov %rdx,$N[0] 992 993 mov $N[1],-16($tp) # tp[j-1] 994 lea ($np,$num),$np # rewind $np 995 996 xor $N[1],$N[1] 997 add $A[0],$N[0] 998 adc \$0,$N[1] 999 add ($tp),$N[0] # pull upmost overflow bit 1000 adc \$0,$N[1] # upmost overflow bit 1001 mov $N[0],-8($tp) 1002 1003 cmp 16+8(%rsp),$bp 1004 jb .Louter4x 1005___ 1006if (1) { 1007$code.=<<___; 1008 xor %rax,%rax 1009 sub $N[0],$m1 # compare top-most words 1010 adc $j,$j # $j is zero 1011 or $j,$N[1] 1012 sub $N[1],%rax # %rax=-$N[1] 1013 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 1014 mov ($np),%r12 1015 lea ($np),%rbp # nptr in .sqr4x_sub 1016 mov %r9,%rcx 1017 sar \$3+2,%rcx 1018 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 1019 dec %r12 # so that after 'not' we get -n[0] 1020 xor %r10,%r10 1021 mov 8*1(%rbp),%r13 1022 mov 8*2(%rbp),%r14 1023 mov 8*3(%rbp),%r15 1024 jmp .Lsqr4x_sub_entry 1025___ 1026} else { 1027my @ri=("%rax",$bp,$m0,$m1); 1028my $rp="%rdx"; 1029$code.=<<___ 1030 xor \$1,$N[1] 1031 lea ($tp,$num),$tp # rewind $tp 1032 sar \$5,$num # cf=0 1033 lea ($np,$N[1],8),$np 1034 mov 56+8(%rsp),$rp # restore $rp 1035 jmp .Lsub4x 1036 1037.align 32 1038.Lsub4x: 1039 .byte 0x66 1040 mov 8*0($tp),@ri[0] 1041 mov 8*1($tp),@ri[1] 1042 .byte 0x66 1043 sbb 16*0($np),@ri[0] 1044 mov 8*2($tp),@ri[2] 1045 sbb 16*1($np),@ri[1] 1046 mov 3*8($tp),@ri[3] 1047 lea 4*8($tp),$tp 1048 sbb 16*2($np),@ri[2] 1049 mov @ri[0],8*0($rp) 1050 sbb 16*3($np),@ri[3] 1051 lea 16*4($np),$np 1052 mov @ri[1],8*1($rp) 1053 mov @ri[2],8*2($rp) 1054 mov @ri[3],8*3($rp) 1055 lea 8*4($rp),$rp 1056 1057 inc $num 1058 jnz .Lsub4x 1059 1060 ret 1061___ 1062} 1063$code.=<<___; 1064.cfi_endproc 1065.size mul4x_internal,.-mul4x_internal 1066___ 1067}}} 1068{{{ 1069###################################################################### 1070# void bn_power5( 1071my $rptr="%rdi"; # BN_ULONG *rptr, 1072my $aptr="%rsi"; # const BN_ULONG *aptr, 1073my $bptr="%rdx"; # const BN_ULONG *table, 1074my $nptr="%rcx"; # const BN_ULONG *nptr, 1075my $n0 ="%r8"; # const BN_ULONG *n0); 1076my $num ="%r9"; # int num, has to be divisible by 8 1077 # int pwr 1078 1079my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1080my @A0=("%r10","%r11"); 1081my @A1=("%r12","%r13"); 1082my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1083 1084$code.=<<___; 1085.globl bn_power5 1086.type bn_power5,\@function,6 1087.align 32 1088bn_power5: 1089.cfi_startproc 1090 mov %rsp,%rax 1091.cfi_def_cfa_register %rax 1092___ 1093$code.=<<___ if ($addx); 1094 leaq OPENSSL_ia32cap_P(%rip),%r11 1095 mov 8(%r11),%r11d 1096 and \$0x80108,%r11d 1097 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1098 je .Lpowerx5_enter 1099___ 1100$code.=<<___; 1101 push %rbx 1102.cfi_push %rbx 1103 push %rbp 1104.cfi_push %rbp 1105 push %r12 1106.cfi_push %r12 1107 push %r13 1108.cfi_push %r13 1109 push %r14 1110.cfi_push %r14 1111 push %r15 1112.cfi_push %r15 1113.Lpower5_prologue: 1114 1115 shl \$3,${num}d # convert $num to bytes 1116 lea ($num,$num,2),%r10d # 3*$num 1117 neg $num 1118 mov ($n0),$n0 # *n0 1119 1120 ############################################################## 1121 # Ensure that stack frame doesn't alias with $rptr+3*$num 1122 # modulo 4096, which covers ret[num], am[num] and n[num] 1123 # (see bn_exp.c). This is done to allow memory disambiguation 1124 # logic do its magic. 
[Extra 256 bytes is for power mask 1125 # calculated from 7th argument, the index.] 1126 # 1127 lea -320(%rsp,$num,2),%r11 1128 mov %rsp,%rbp 1129 sub $rptr,%r11 1130 and \$4095,%r11 1131 cmp %r11,%r10 1132 jb .Lpwr_sp_alt 1133 sub %r11,%rbp # align with $aptr 1134 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1135 jmp .Lpwr_sp_done 1136 1137.align 32 1138.Lpwr_sp_alt: 1139 lea 4096-320(,$num,2),%r10 1140 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1141 sub %r10,%r11 1142 mov \$0,%r10 1143 cmovc %r10,%r11 1144 sub %r11,%rbp 1145.Lpwr_sp_done: 1146 and \$-64,%rbp 1147 mov %rsp,%r11 1148 sub %rbp,%r11 1149 and \$-4096,%r11 1150 lea (%rbp,%r11),%rsp 1151 mov (%rsp),%r10 1152 cmp %rbp,%rsp 1153 ja .Lpwr_page_walk 1154 jmp .Lpwr_page_walk_done 1155 1156.Lpwr_page_walk: 1157 lea -4096(%rsp),%rsp 1158 mov (%rsp),%r10 1159 cmp %rbp,%rsp 1160 ja .Lpwr_page_walk 1161.Lpwr_page_walk_done: 1162 1163 mov $num,%r10 1164 neg $num 1165 1166 ############################################################## 1167 # Stack layout 1168 # 1169 # +0 saved $num, used in reduction section 1170 # +8 &t[2*$num], used in reduction section 1171 # +32 saved *n0 1172 # +40 saved %rsp 1173 # +48 t[2*$num] 1174 # 1175 mov $n0, 32(%rsp) 1176 mov %rax, 40(%rsp) # save original %rsp 1177.cfi_cfa_expression %rsp+40,deref,+8 1178.Lpower5_body: 1179 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1180 movq $nptr,%xmm2 # save $nptr 1181 movq %r10, %xmm3 # -$num, used in sqr8x 1182 movq $bptr,%xmm4 1183 1184 call __bn_sqr8x_internal 1185 call __bn_post4x_internal 1186 call __bn_sqr8x_internal 1187 call __bn_post4x_internal 1188 call __bn_sqr8x_internal 1189 call __bn_post4x_internal 1190 call __bn_sqr8x_internal 1191 call __bn_post4x_internal 1192 call __bn_sqr8x_internal 1193 call __bn_post4x_internal 1194 1195 movq %xmm2,$nptr 1196 movq %xmm4,$bptr 1197 mov $aptr,$rptr 1198 mov 40(%rsp),%rax 1199 lea 32(%rsp),$n0 1200 1201 call mul4x_internal 1202 1203 mov 40(%rsp),%rsi # restore %rsp 1204.cfi_def_cfa %rsi,8 1205 mov \$1,%rax 1206 mov -48(%rsi),%r15 1207.cfi_restore %r15 1208 mov -40(%rsi),%r14 1209.cfi_restore %r14 1210 mov -32(%rsi),%r13 1211.cfi_restore %r13 1212 mov -24(%rsi),%r12 1213.cfi_restore %r12 1214 mov -16(%rsi),%rbp 1215.cfi_restore %rbp 1216 mov -8(%rsi),%rbx 1217.cfi_restore %rbx 1218 lea (%rsi),%rsp 1219.cfi_def_cfa_register %rsp 1220.Lpower5_epilogue: 1221 ret 1222.cfi_endproc 1223.size bn_power5,.-bn_power5 1224 1225.globl bn_sqr8x_internal 1226.hidden bn_sqr8x_internal 1227.type bn_sqr8x_internal,\@abi-omnipotent 1228.align 32 1229bn_sqr8x_internal: 1230__bn_sqr8x_internal: 1231.cfi_startproc 1232 ############################################################## 1233 # Squaring part: 1234 # 1235 # a) multiply-n-add everything but a[i]*a[i]; 1236 # b) shift result of a) by 1 to the left and accumulate 1237 # a[i]*a[i] products; 1238 # 1239 ############################################################## 1240 # a[1]a[0] 1241 # a[2]a[0] 1242 # a[3]a[0] 1243 # a[2]a[1] 1244 # a[4]a[0] 1245 # a[3]a[1] 1246 # a[5]a[0] 1247 # a[4]a[1] 1248 # a[3]a[2] 1249 # a[6]a[0] 1250 # a[5]a[1] 1251 # a[4]a[2] 1252 # a[7]a[0] 1253 # a[6]a[1] 1254 # a[5]a[2] 1255 # a[4]a[3] 1256 # a[7]a[1] 1257 # a[6]a[2] 1258 # a[5]a[3] 1259 # a[7]a[2] 1260 # a[6]a[3] 1261 # a[5]a[4] 1262 # a[7]a[3] 1263 # a[6]a[4] 1264 # a[7]a[4] 1265 # a[6]a[5] 1266 # a[7]a[5] 1267 # a[7]a[6] 1268 # a[1]a[0] 1269 # a[2]a[0] 1270 # a[3]a[0] 1271 # a[4]a[0] 1272 # a[5]a[0] 1273 # a[6]a[0] 1274 # a[7]a[0] 1275 # a[2]a[1] 1276 # a[3]a[1] 
1277 # a[4]a[1] 1278 # a[5]a[1] 1279 # a[6]a[1] 1280 # a[7]a[1] 1281 # a[3]a[2] 1282 # a[4]a[2] 1283 # a[5]a[2] 1284 # a[6]a[2] 1285 # a[7]a[2] 1286 # a[4]a[3] 1287 # a[5]a[3] 1288 # a[6]a[3] 1289 # a[7]a[3] 1290 # a[5]a[4] 1291 # a[6]a[4] 1292 # a[7]a[4] 1293 # a[6]a[5] 1294 # a[7]a[5] 1295 # a[7]a[6] 1296 # a[0]a[0] 1297 # a[1]a[1] 1298 # a[2]a[2] 1299 # a[3]a[3] 1300 # a[4]a[4] 1301 # a[5]a[5] 1302 # a[6]a[6] 1303 # a[7]a[7] 1304 1305 lea 32(%r10),$i # $i=-($num-32) 1306 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1307 1308 mov $num,$j # $j=$num 1309 1310 # comments apply to $num==8 case 1311 mov -32($aptr,$i),$a0 # a[0] 1312 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1313 mov -24($aptr,$i),%rax # a[1] 1314 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1315 mov -16($aptr,$i),$ai # a[2] 1316 mov %rax,$a1 1317 1318 mul $a0 # a[1]*a[0] 1319 mov %rax,$A0[0] # a[1]*a[0] 1320 mov $ai,%rax # a[2] 1321 mov %rdx,$A0[1] 1322 mov $A0[0],-24($tptr,$i) # t[1] 1323 1324 mul $a0 # a[2]*a[0] 1325 add %rax,$A0[1] 1326 mov $ai,%rax 1327 adc \$0,%rdx 1328 mov $A0[1],-16($tptr,$i) # t[2] 1329 mov %rdx,$A0[0] 1330 1331 1332 mov -8($aptr,$i),$ai # a[3] 1333 mul $a1 # a[2]*a[1] 1334 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1335 mov $ai,%rax 1336 mov %rdx,$A1[1] 1337 1338 lea ($i),$j 1339 mul $a0 # a[3]*a[0] 1340 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1341 mov $ai,%rax 1342 mov %rdx,$A0[1] 1343 adc \$0,$A0[1] 1344 add $A1[0],$A0[0] 1345 adc \$0,$A0[1] 1346 mov $A0[0],-8($tptr,$j) # t[3] 1347 jmp .Lsqr4x_1st 1348 1349.align 32 1350.Lsqr4x_1st: 1351 mov ($aptr,$j),$ai # a[4] 1352 mul $a1 # a[3]*a[1] 1353 add %rax,$A1[1] # a[3]*a[1]+t[4] 1354 mov $ai,%rax 1355 mov %rdx,$A1[0] 1356 adc \$0,$A1[0] 1357 1358 mul $a0 # a[4]*a[0] 1359 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1360 mov $ai,%rax # a[3] 1361 mov 8($aptr,$j),$ai # a[5] 1362 mov %rdx,$A0[0] 1363 adc \$0,$A0[0] 1364 add $A1[1],$A0[1] 1365 adc \$0,$A0[0] 1366 1367 1368 mul $a1 # a[4]*a[3] 1369 add %rax,$A1[0] # a[4]*a[3]+t[5] 1370 mov $ai,%rax 1371 mov $A0[1],($tptr,$j) # t[4] 1372 mov %rdx,$A1[1] 1373 adc \$0,$A1[1] 1374 1375 mul $a0 # a[5]*a[2] 1376 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1377 mov $ai,%rax 1378 mov 16($aptr,$j),$ai # a[6] 1379 mov %rdx,$A0[1] 1380 adc \$0,$A0[1] 1381 add $A1[0],$A0[0] 1382 adc \$0,$A0[1] 1383 1384 mul $a1 # a[5]*a[3] 1385 add %rax,$A1[1] # a[5]*a[3]+t[6] 1386 mov $ai,%rax 1387 mov $A0[0],8($tptr,$j) # t[5] 1388 mov %rdx,$A1[0] 1389 adc \$0,$A1[0] 1390 1391 mul $a0 # a[6]*a[2] 1392 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1393 mov $ai,%rax # a[3] 1394 mov 24($aptr,$j),$ai # a[7] 1395 mov %rdx,$A0[0] 1396 adc \$0,$A0[0] 1397 add $A1[1],$A0[1] 1398 adc \$0,$A0[0] 1399 1400 1401 mul $a1 # a[6]*a[5] 1402 add %rax,$A1[0] # a[6]*a[5]+t[7] 1403 mov $ai,%rax 1404 mov $A0[1],16($tptr,$j) # t[6] 1405 mov %rdx,$A1[1] 1406 adc \$0,$A1[1] 1407 lea 32($j),$j 1408 1409 mul $a0 # a[7]*a[4] 1410 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1411 mov $ai,%rax 1412 mov %rdx,$A0[1] 1413 adc \$0,$A0[1] 1414 add $A1[0],$A0[0] 1415 adc \$0,$A0[1] 1416 mov $A0[0],-8($tptr,$j) # t[7] 1417 1418 cmp \$0,$j 1419 jne .Lsqr4x_1st 1420 1421 mul $a1 # a[7]*a[5] 1422 add %rax,$A1[1] 1423 lea 16($i),$i 1424 adc \$0,%rdx 1425 add $A0[1],$A1[1] 1426 adc \$0,%rdx 1427 1428 mov $A1[1],($tptr) # t[8] 1429 mov %rdx,$A1[0] 1430 mov %rdx,8($tptr) # t[9] 1431 jmp .Lsqr4x_outer 1432 1433.align 32 1434.Lsqr4x_outer: # comments apply to $num==6 case 1435 mov -32($aptr,$i),$a0 # a[0] 1436 lea 48+8(%rsp,$num,2),$tptr # 
end of tp[] buffer, &tp[2*$num] 1437 mov -24($aptr,$i),%rax # a[1] 1438 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1439 mov -16($aptr,$i),$ai # a[2] 1440 mov %rax,$a1 1441 1442 mul $a0 # a[1]*a[0] 1443 mov -24($tptr,$i),$A0[0] # t[1] 1444 add %rax,$A0[0] # a[1]*a[0]+t[1] 1445 mov $ai,%rax # a[2] 1446 adc \$0,%rdx 1447 mov $A0[0],-24($tptr,$i) # t[1] 1448 mov %rdx,$A0[1] 1449 1450 mul $a0 # a[2]*a[0] 1451 add %rax,$A0[1] 1452 mov $ai,%rax 1453 adc \$0,%rdx 1454 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1455 mov %rdx,$A0[0] 1456 adc \$0,$A0[0] 1457 mov $A0[1],-16($tptr,$i) # t[2] 1458 1459 xor $A1[0],$A1[0] 1460 1461 mov -8($aptr,$i),$ai # a[3] 1462 mul $a1 # a[2]*a[1] 1463 add %rax,$A1[0] # a[2]*a[1]+t[3] 1464 mov $ai,%rax 1465 adc \$0,%rdx 1466 add -8($tptr,$i),$A1[0] 1467 mov %rdx,$A1[1] 1468 adc \$0,$A1[1] 1469 1470 mul $a0 # a[3]*a[0] 1471 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1472 mov $ai,%rax 1473 adc \$0,%rdx 1474 add $A1[0],$A0[0] 1475 mov %rdx,$A0[1] 1476 adc \$0,$A0[1] 1477 mov $A0[0],-8($tptr,$i) # t[3] 1478 1479 lea ($i),$j 1480 jmp .Lsqr4x_inner 1481 1482.align 32 1483.Lsqr4x_inner: 1484 mov ($aptr,$j),$ai # a[4] 1485 mul $a1 # a[3]*a[1] 1486 add %rax,$A1[1] # a[3]*a[1]+t[4] 1487 mov $ai,%rax 1488 mov %rdx,$A1[0] 1489 adc \$0,$A1[0] 1490 add ($tptr,$j),$A1[1] 1491 adc \$0,$A1[0] 1492 1493 .byte 0x67 1494 mul $a0 # a[4]*a[0] 1495 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1496 mov $ai,%rax # a[3] 1497 mov 8($aptr,$j),$ai # a[5] 1498 mov %rdx,$A0[0] 1499 adc \$0,$A0[0] 1500 add $A1[1],$A0[1] 1501 adc \$0,$A0[0] 1502 1503 mul $a1 # a[4]*a[3] 1504 add %rax,$A1[0] # a[4]*a[3]+t[5] 1505 mov $A0[1],($tptr,$j) # t[4] 1506 mov $ai,%rax 1507 mov %rdx,$A1[1] 1508 adc \$0,$A1[1] 1509 add 8($tptr,$j),$A1[0] 1510 lea 16($j),$j # j++ 1511 adc \$0,$A1[1] 1512 1513 mul $a0 # a[5]*a[2] 1514 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1515 mov $ai,%rax 1516 adc \$0,%rdx 1517 add $A1[0],$A0[0] 1518 mov %rdx,$A0[1] 1519 adc \$0,$A0[1] 1520 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1521 1522 cmp \$0,$j 1523 jne .Lsqr4x_inner 1524 1525 .byte 0x67 1526 mul $a1 # a[5]*a[3] 1527 add %rax,$A1[1] 1528 adc \$0,%rdx 1529 add $A0[1],$A1[1] 1530 adc \$0,%rdx 1531 1532 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1533 mov %rdx,$A1[0] 1534 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1535 1536 add \$16,$i 1537 jnz .Lsqr4x_outer 1538 1539 # comments apply to $num==4 case 1540 mov -32($aptr),$a0 # a[0] 1541 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1542 mov -24($aptr),%rax # a[1] 1543 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1544 mov -16($aptr),$ai # a[2] 1545 mov %rax,$a1 1546 1547 mul $a0 # a[1]*a[0] 1548 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1549 mov $ai,%rax # a[2] 1550 mov %rdx,$A0[1] 1551 adc \$0,$A0[1] 1552 1553 mul $a0 # a[2]*a[0] 1554 add %rax,$A0[1] 1555 mov $ai,%rax 1556 mov $A0[0],-24($tptr) # t[1] 1557 mov %rdx,$A0[0] 1558 adc \$0,$A0[0] 1559 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1560 mov -8($aptr),$ai # a[3] 1561 adc \$0,$A0[0] 1562 1563 mul $a1 # a[2]*a[1] 1564 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1565 mov $ai,%rax 1566 mov $A0[1],-16($tptr) # t[2] 1567 mov %rdx,$A1[1] 1568 adc \$0,$A1[1] 1569 1570 mul $a0 # a[3]*a[0] 1571 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1572 mov $ai,%rax 1573 mov %rdx,$A0[1] 1574 adc \$0,$A0[1] 1575 add $A1[0],$A0[0] 1576 adc \$0,$A0[1] 1577 mov $A0[0],-8($tptr) # t[3] 1578 1579 mul $a1 # a[3]*a[1] 1580 add %rax,$A1[1] 1581 mov -16($aptr),%rax # 
a[2] 1582 adc \$0,%rdx 1583 add $A0[1],$A1[1] 1584 adc \$0,%rdx 1585 1586 mov $A1[1],($tptr) # t[4] 1587 mov %rdx,$A1[0] 1588 mov %rdx,8($tptr) # t[5] 1589 1590 mul $ai # a[2]*a[3] 1591___ 1592{ 1593my ($shift,$carry)=($a0,$a1); 1594my @S=(@A1,$ai,$n0); 1595$code.=<<___; 1596 add \$16,$i 1597 xor $shift,$shift 1598 sub $num,$i # $i=16-$num 1599 xor $carry,$carry 1600 1601 add $A1[0],%rax # t[5] 1602 adc \$0,%rdx 1603 mov %rax,8($tptr) # t[5] 1604 mov %rdx,16($tptr) # t[6] 1605 mov $carry,24($tptr) # t[7] 1606 1607 mov -16($aptr,$i),%rax # a[0] 1608 lea 48+8(%rsp),$tptr 1609 xor $A0[0],$A0[0] # t[0] 1610 mov 8($tptr),$A0[1] # t[1] 1611 1612 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1613 shr \$63,$A0[0] 1614 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1615 shr \$63,$A0[1] 1616 or $A0[0],$S[1] # | t[2*i]>>63 1617 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1618 mov $A0[1],$shift # shift=t[2*i+1]>>63 1619 mul %rax # a[i]*a[i] 1620 neg $carry # mov $carry,cf 1621 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1622 adc %rax,$S[0] 1623 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1624 mov $S[0],($tptr) 1625 adc %rdx,$S[1] 1626 1627 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1628 mov $S[1],8($tptr) 1629 sbb $carry,$carry # mov cf,$carry 1630 shr \$63,$A0[0] 1631 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1632 shr \$63,$A0[1] 1633 or $A0[0],$S[3] # | t[2*i]>>63 1634 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1635 mov $A0[1],$shift # shift=t[2*i+1]>>63 1636 mul %rax # a[i]*a[i] 1637 neg $carry # mov $carry,cf 1638 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1639 adc %rax,$S[2] 1640 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1641 mov $S[2],16($tptr) 1642 adc %rdx,$S[3] 1643 lea 16($i),$i 1644 mov $S[3],24($tptr) 1645 sbb $carry,$carry # mov cf,$carry 1646 lea 64($tptr),$tptr 1647 jmp .Lsqr4x_shift_n_add 1648 1649.align 32 1650.Lsqr4x_shift_n_add: 1651 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1652 shr \$63,$A0[0] 1653 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1654 shr \$63,$A0[1] 1655 or $A0[0],$S[1] # | t[2*i]>>63 1656 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1657 mov $A0[1],$shift # shift=t[2*i+1]>>63 1658 mul %rax # a[i]*a[i] 1659 neg $carry # mov $carry,cf 1660 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1661 adc %rax,$S[0] 1662 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1663 mov $S[0],-32($tptr) 1664 adc %rdx,$S[1] 1665 1666 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1667 mov $S[1],-24($tptr) 1668 sbb $carry,$carry # mov cf,$carry 1669 shr \$63,$A0[0] 1670 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1671 shr \$63,$A0[1] 1672 or $A0[0],$S[3] # | t[2*i]>>63 1673 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1674 mov $A0[1],$shift # shift=t[2*i+1]>>63 1675 mul %rax # a[i]*a[i] 1676 neg $carry # mov $carry,cf 1677 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1678 adc %rax,$S[2] 1679 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1680 mov $S[2],-16($tptr) 1681 adc %rdx,$S[3] 1682 1683 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1684 mov $S[3],-8($tptr) 1685 sbb $carry,$carry # mov cf,$carry 1686 shr \$63,$A0[0] 1687 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1688 shr \$63,$A0[1] 1689 or $A0[0],$S[1] # | t[2*i]>>63 1690 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1691 mov $A0[1],$shift # shift=t[2*i+1]>>63 1692 mul %rax # a[i]*a[i] 1693 neg $carry # mov $carry,cf 1694 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1695 adc %rax,$S[0] 1696 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1697 mov $S[0],0($tptr) 1698 adc %rdx,$S[1] 1699 1700 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1701 mov $S[1],8($tptr) 1702 
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	.byte	0x67
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
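#
# A hedged pseudo-C sketch of the word-by-word reduction implemented below
# (the loop structure and muladd64() helper are illustrative, not lifted
# from crypto/bn):
#
#	for (i = 0; i < num; i++) {
#		m = t[i]*n0;			/* low 64 bits only */
#		carry = 0;
#		for (j = 0; j < num; j++)
#			t[i+j] = muladd64(n[j], m, t[i+j], &carry);
#		t[i+num] += carry;		/* plus the saved top carry */
#	}
#	/* result is t[num..2*num-1], minus n if it does not fit */
#
# except that the assembly consumes eight words of t[] per
# .L8x_reduction_loop pass, keeping them in %r8..%r15 while it walks the
# whole of n[].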
1759{ 1760my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1761 1762$code.=<<___; 1763 movq %xmm2,$nptr 1764__bn_sqr8x_reduction: 1765 xor %rax,%rax 1766 lea ($nptr,$num),%rcx # end of n[] 1767 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1768 mov %rcx,0+8(%rsp) 1769 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1770 mov %rdx,8+8(%rsp) 1771 neg $num 1772 jmp .L8x_reduction_loop 1773 1774.align 32 1775.L8x_reduction_loop: 1776 lea ($tptr,$num),$tptr # start of current t[] window 1777 .byte 0x66 1778 mov 8*0($tptr),$m0 1779 mov 8*1($tptr),%r9 1780 mov 8*2($tptr),%r10 1781 mov 8*3($tptr),%r11 1782 mov 8*4($tptr),%r12 1783 mov 8*5($tptr),%r13 1784 mov 8*6($tptr),%r14 1785 mov 8*7($tptr),%r15 1786 mov %rax,(%rdx) # store top-most carry bit 1787 lea 8*8($tptr),$tptr 1788 1789 .byte 0x67 1790 mov $m0,%r8 1791 imulq 32+8(%rsp),$m0 # n0*a[0] 1792 mov 8*0($nptr),%rax # n[0] 1793 mov \$8,%ecx 1794 jmp .L8x_reduce 1795 1796.align 32 1797.L8x_reduce: 1798 mulq $m0 1799 mov 8*1($nptr),%rax # n[1] 1800 neg %r8 1801 mov %rdx,%r8 1802 adc \$0,%r8 1803 1804 mulq $m0 1805 add %rax,%r9 1806 mov 8*2($nptr),%rax 1807 adc \$0,%rdx 1808 add %r9,%r8 1809 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1810 mov %rdx,%r9 1811 adc \$0,%r9 1812 1813 mulq $m0 1814 add %rax,%r10 1815 mov 8*3($nptr),%rax 1816 adc \$0,%rdx 1817 add %r10,%r9 1818 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1819 mov %rdx,%r10 1820 adc \$0,%r10 1821 1822 mulq $m0 1823 add %rax,%r11 1824 mov 8*4($nptr),%rax 1825 adc \$0,%rdx 1826 imulq %r8,$carry # modulo-scheduled 1827 add %r11,%r10 1828 mov %rdx,%r11 1829 adc \$0,%r11 1830 1831 mulq $m0 1832 add %rax,%r12 1833 mov 8*5($nptr),%rax 1834 adc \$0,%rdx 1835 add %r12,%r11 1836 mov %rdx,%r12 1837 adc \$0,%r12 1838 1839 mulq $m0 1840 add %rax,%r13 1841 mov 8*6($nptr),%rax 1842 adc \$0,%rdx 1843 add %r13,%r12 1844 mov %rdx,%r13 1845 adc \$0,%r13 1846 1847 mulq $m0 1848 add %rax,%r14 1849 mov 8*7($nptr),%rax 1850 adc \$0,%rdx 1851 add %r14,%r13 1852 mov %rdx,%r14 1853 adc \$0,%r14 1854 1855 mulq $m0 1856 mov $carry,$m0 # n0*a[i] 1857 add %rax,%r15 1858 mov 8*0($nptr),%rax # n[0] 1859 adc \$0,%rdx 1860 add %r15,%r14 1861 mov %rdx,%r15 1862 adc \$0,%r15 1863 1864 dec %ecx 1865 jnz .L8x_reduce 1866 1867 lea 8*8($nptr),$nptr 1868 xor %rax,%rax 1869 mov 8+8(%rsp),%rdx # pull end of t[] 1870 cmp 0+8(%rsp),$nptr # end of n[]? 
1871 jae .L8x_no_tail 1872 1873 .byte 0x66 1874 add 8*0($tptr),%r8 1875 adc 8*1($tptr),%r9 1876 adc 8*2($tptr),%r10 1877 adc 8*3($tptr),%r11 1878 adc 8*4($tptr),%r12 1879 adc 8*5($tptr),%r13 1880 adc 8*6($tptr),%r14 1881 adc 8*7($tptr),%r15 1882 sbb $carry,$carry # top carry 1883 1884 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1885 mov \$8,%ecx 1886 mov 8*0($nptr),%rax 1887 jmp .L8x_tail 1888 1889.align 32 1890.L8x_tail: 1891 mulq $m0 1892 add %rax,%r8 1893 mov 8*1($nptr),%rax 1894 mov %r8,($tptr) # save result 1895 mov %rdx,%r8 1896 adc \$0,%r8 1897 1898 mulq $m0 1899 add %rax,%r9 1900 mov 8*2($nptr),%rax 1901 adc \$0,%rdx 1902 add %r9,%r8 1903 lea 8($tptr),$tptr # $tptr++ 1904 mov %rdx,%r9 1905 adc \$0,%r9 1906 1907 mulq $m0 1908 add %rax,%r10 1909 mov 8*3($nptr),%rax 1910 adc \$0,%rdx 1911 add %r10,%r9 1912 mov %rdx,%r10 1913 adc \$0,%r10 1914 1915 mulq $m0 1916 add %rax,%r11 1917 mov 8*4($nptr),%rax 1918 adc \$0,%rdx 1919 add %r11,%r10 1920 mov %rdx,%r11 1921 adc \$0,%r11 1922 1923 mulq $m0 1924 add %rax,%r12 1925 mov 8*5($nptr),%rax 1926 adc \$0,%rdx 1927 add %r12,%r11 1928 mov %rdx,%r12 1929 adc \$0,%r12 1930 1931 mulq $m0 1932 add %rax,%r13 1933 mov 8*6($nptr),%rax 1934 adc \$0,%rdx 1935 add %r13,%r12 1936 mov %rdx,%r13 1937 adc \$0,%r13 1938 1939 mulq $m0 1940 add %rax,%r14 1941 mov 8*7($nptr),%rax 1942 adc \$0,%rdx 1943 add %r14,%r13 1944 mov %rdx,%r14 1945 adc \$0,%r14 1946 1947 mulq $m0 1948 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1949 add %rax,%r15 1950 adc \$0,%rdx 1951 add %r15,%r14 1952 mov 8*0($nptr),%rax # pull n[0] 1953 mov %rdx,%r15 1954 adc \$0,%r15 1955 1956 dec %ecx 1957 jnz .L8x_tail 1958 1959 lea 8*8($nptr),$nptr 1960 mov 8+8(%rsp),%rdx # pull end of t[] 1961 cmp 0+8(%rsp),$nptr # end of n[]? 1962 jae .L8x_tail_done # break out of loop 1963 1964 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1965 neg $carry 1966 mov 8*0($nptr),%rax # pull n[0] 1967 adc 8*0($tptr),%r8 1968 adc 8*1($tptr),%r9 1969 adc 8*2($tptr),%r10 1970 adc 8*3($tptr),%r11 1971 adc 8*4($tptr),%r12 1972 adc 8*5($tptr),%r13 1973 adc 8*6($tptr),%r14 1974 adc 8*7($tptr),%r15 1975 sbb $carry,$carry # top carry 1976 1977 mov \$8,%ecx 1978 jmp .L8x_tail 1979 1980.align 32 1981.L8x_tail_done: 1982 xor %rax,%rax 1983 add (%rdx),%r8 # can this overflow? 1984 adc \$0,%r9 1985 adc \$0,%r10 1986 adc \$0,%r11 1987 adc \$0,%r12 1988 adc \$0,%r13 1989 adc \$0,%r14 1990 adc \$0,%r15 1991 adc \$0,%rax 1992 1993 neg $carry 1994.L8x_no_tail: 1995 adc 8*0($tptr),%r8 1996 adc 8*1($tptr),%r9 1997 adc 8*2($tptr),%r10 1998 adc 8*3($tptr),%r11 1999 adc 8*4($tptr),%r12 2000 adc 8*5($tptr),%r13 2001 adc 8*6($tptr),%r14 2002 adc 8*7($tptr),%r15 2003 adc \$0,%rax # top-most carry 2004 mov -8($nptr),%rcx # np[num-1] 2005 xor $carry,$carry 2006 2007 movq %xmm2,$nptr # restore $nptr 2008 2009 mov %r8,8*0($tptr) # store top 512 bits 2010 mov %r9,8*1($tptr) 2011 movq %xmm3,$num # $num is %r9, can't be moved upwards 2012 mov %r10,8*2($tptr) 2013 mov %r11,8*3($tptr) 2014 mov %r12,8*4($tptr) 2015 mov %r13,8*5($tptr) 2016 mov %r14,8*6($tptr) 2017 mov %r15,8*7($tptr) 2018 lea 8*8($tptr),$tptr 2019 2020 cmp %rdx,$tptr # end of t[]? 
2021 jb .L8x_reduction_loop 2022 ret 2023.cfi_endproc 2024.size bn_sqr8x_internal,.-bn_sqr8x_internal 2025___ 2026} 2027############################################################## 2028# Post-condition, 4x unrolled 2029# 2030{ 2031my ($tptr,$nptr)=("%rbx","%rbp"); 2032$code.=<<___; 2033.type __bn_post4x_internal,\@abi-omnipotent 2034.align 32 2035__bn_post4x_internal: 2036.cfi_startproc 2037 mov 8*0($nptr),%r12 2038 lea (%rdi,$num),$tptr # %rdi was $tptr above 2039 mov $num,%rcx 2040 movq %xmm1,$rptr # restore $rptr 2041 neg %rax 2042 movq %xmm1,$aptr # prepare for back-to-back call 2043 sar \$3+2,%rcx 2044 dec %r12 # so that after 'not' we get -n[0] 2045 xor %r10,%r10 2046 mov 8*1($nptr),%r13 2047 mov 8*2($nptr),%r14 2048 mov 8*3($nptr),%r15 2049 jmp .Lsqr4x_sub_entry 2050 2051.align 16 2052.Lsqr4x_sub: 2053 mov 8*0($nptr),%r12 2054 mov 8*1($nptr),%r13 2055 mov 8*2($nptr),%r14 2056 mov 8*3($nptr),%r15 2057.Lsqr4x_sub_entry: 2058 lea 8*4($nptr),$nptr 2059 not %r12 2060 not %r13 2061 not %r14 2062 not %r15 2063 and %rax,%r12 2064 and %rax,%r13 2065 and %rax,%r14 2066 and %rax,%r15 2067 2068 neg %r10 # mov %r10,%cf 2069 adc 8*0($tptr),%r12 2070 adc 8*1($tptr),%r13 2071 adc 8*2($tptr),%r14 2072 adc 8*3($tptr),%r15 2073 mov %r12,8*0($rptr) 2074 lea 8*4($tptr),$tptr 2075 mov %r13,8*1($rptr) 2076 sbb %r10,%r10 # mov %cf,%r10 2077 mov %r14,8*2($rptr) 2078 mov %r15,8*3($rptr) 2079 lea 8*4($rptr),$rptr 2080 2081 inc %rcx # pass %cf 2082 jnz .Lsqr4x_sub 2083 2084 mov $num,%r10 # prepare for back-to-back call 2085 neg $num # restore $num 2086 ret 2087.cfi_endproc 2088.size __bn_post4x_internal,.-__bn_post4x_internal 2089___ 2090} 2091{ 2092$code.=<<___; 2093.globl bn_from_montgomery 2094.type bn_from_montgomery,\@abi-omnipotent 2095.align 32 2096bn_from_montgomery: 2097.cfi_startproc 2098 testl \$7,`($win64?"48(%rsp)":"%r9d")` 2099 jz bn_from_mont8x 2100 xor %eax,%eax 2101 ret 2102.cfi_endproc 2103.size bn_from_montgomery,.-bn_from_montgomery 2104 2105.type bn_from_mont8x,\@function,6 2106.align 32 2107bn_from_mont8x: 2108.cfi_startproc 2109 .byte 0x67 2110 mov %rsp,%rax 2111.cfi_def_cfa_register %rax 2112 push %rbx 2113.cfi_push %rbx 2114 push %rbp 2115.cfi_push %rbp 2116 push %r12 2117.cfi_push %r12 2118 push %r13 2119.cfi_push %r13 2120 push %r14 2121.cfi_push %r14 2122 push %r15 2123.cfi_push %r15 2124.Lfrom_prologue: 2125 2126 shl \$3,${num}d # convert $num to bytes 2127 lea ($num,$num,2),%r10 # 3*$num in bytes 2128 neg $num 2129 mov ($n0),$n0 # *n0 2130 2131 ############################################################## 2132 # Ensure that stack frame doesn't alias with $rptr+3*$num 2133 # modulo 4096, which covers ret[num], am[num] and n[num] 2134 # (see bn_exp.c). The stack is allocated to aligned with 2135 # bn_power5's frame, and as bn_from_montgomery happens to be 2136 # last operation, we use the opportunity to cleanse it. 
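	#
	# (Context, hedged: bn_from_mont8x computes rp = ap * R^-1 mod np,
	# i.e. it converts out of Montgomery form. .Lmul_by_1 further down
	# copies ap[] into the low half of t[] and zeroes the high half, so
	# the subsequent __bn_sqr8x_reduction performs a plain Montgomery
	# reduction of ap padded with num zero limbs, which amounts to a
	# Montgomery multiplication by 1.)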
2137 # 2138 lea -320(%rsp,$num,2),%r11 2139 mov %rsp,%rbp 2140 sub $rptr,%r11 2141 and \$4095,%r11 2142 cmp %r11,%r10 2143 jb .Lfrom_sp_alt 2144 sub %r11,%rbp # align with $aptr 2145 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2146 jmp .Lfrom_sp_done 2147 2148.align 32 2149.Lfrom_sp_alt: 2150 lea 4096-320(,$num,2),%r10 2151 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2152 sub %r10,%r11 2153 mov \$0,%r10 2154 cmovc %r10,%r11 2155 sub %r11,%rbp 2156.Lfrom_sp_done: 2157 and \$-64,%rbp 2158 mov %rsp,%r11 2159 sub %rbp,%r11 2160 and \$-4096,%r11 2161 lea (%rbp,%r11),%rsp 2162 mov (%rsp),%r10 2163 cmp %rbp,%rsp 2164 ja .Lfrom_page_walk 2165 jmp .Lfrom_page_walk_done 2166 2167.Lfrom_page_walk: 2168 lea -4096(%rsp),%rsp 2169 mov (%rsp),%r10 2170 cmp %rbp,%rsp 2171 ja .Lfrom_page_walk 2172.Lfrom_page_walk_done: 2173 2174 mov $num,%r10 2175 neg $num 2176 2177 ############################################################## 2178 # Stack layout 2179 # 2180 # +0 saved $num, used in reduction section 2181 # +8 &t[2*$num], used in reduction section 2182 # +32 saved *n0 2183 # +40 saved %rsp 2184 # +48 t[2*$num] 2185 # 2186 mov $n0, 32(%rsp) 2187 mov %rax, 40(%rsp) # save original %rsp 2188.cfi_cfa_expression %rsp+40,deref,+8 2189.Lfrom_body: 2190 mov $num,%r11 2191 lea 48(%rsp),%rax 2192 pxor %xmm0,%xmm0 2193 jmp .Lmul_by_1 2194 2195.align 32 2196.Lmul_by_1: 2197 movdqu ($aptr),%xmm1 2198 movdqu 16($aptr),%xmm2 2199 movdqu 32($aptr),%xmm3 2200 movdqa %xmm0,(%rax,$num) 2201 movdqu 48($aptr),%xmm4 2202 movdqa %xmm0,16(%rax,$num) 2203 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 2204 movdqa %xmm1,(%rax) 2205 movdqa %xmm0,32(%rax,$num) 2206 movdqa %xmm2,16(%rax) 2207 movdqa %xmm0,48(%rax,$num) 2208 movdqa %xmm3,32(%rax) 2209 movdqa %xmm4,48(%rax) 2210 lea 64(%rax),%rax 2211 sub \$64,%r11 2212 jnz .Lmul_by_1 2213 2214 movq $rptr,%xmm1 2215 movq $nptr,%xmm2 2216 .byte 0x67 2217 mov $nptr,%rbp 2218 movq %r10, %xmm3 # -num 2219___ 2220$code.=<<___ if ($addx); 2221 leaq OPENSSL_ia32cap_P(%rip),%r11 2222 mov 8(%r11),%r11d 2223 and \$0x80108,%r11d 2224 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 2225 jne .Lfrom_mont_nox 2226 2227 lea (%rax,$num),$rptr 2228 call __bn_sqrx8x_reduction 2229 call __bn_postx4x_internal 2230 2231 pxor %xmm0,%xmm0 2232 lea 48(%rsp),%rax 2233 jmp .Lfrom_mont_zero 2234 2235.align 32 2236.Lfrom_mont_nox: 2237___ 2238$code.=<<___; 2239 call __bn_sqr8x_reduction 2240 call __bn_post4x_internal 2241 2242 pxor %xmm0,%xmm0 2243 lea 48(%rsp),%rax 2244 jmp .Lfrom_mont_zero 2245 2246.align 32 2247.Lfrom_mont_zero: 2248 mov 40(%rsp),%rsi # restore %rsp 2249.cfi_def_cfa %rsi,8 2250 movdqa %xmm0,16*0(%rax) 2251 movdqa %xmm0,16*1(%rax) 2252 movdqa %xmm0,16*2(%rax) 2253 movdqa %xmm0,16*3(%rax) 2254 lea 16*4(%rax),%rax 2255 sub \$32,$num 2256 jnz .Lfrom_mont_zero 2257 2258 mov \$1,%rax 2259 mov -48(%rsi),%r15 2260.cfi_restore %r15 2261 mov -40(%rsi),%r14 2262.cfi_restore %r14 2263 mov -32(%rsi),%r13 2264.cfi_restore %r13 2265 mov -24(%rsi),%r12 2266.cfi_restore %r12 2267 mov -16(%rsi),%rbp 2268.cfi_restore %rbp 2269 mov -8(%rsi),%rbx 2270.cfi_restore %rbx 2271 lea (%rsi),%rsp 2272.cfi_def_cfa_register %rsp 2273.Lfrom_epilogue: 2274 ret 2275.cfi_endproc 2276.size bn_from_mont8x,.-bn_from_mont8x 2277___ 2278} 2279}}} 2280 2281if ($addx) {{{ 2282my $bp="%rdx"; # restore original value 2283 2284$code.=<<___; 2285.type bn_mulx4x_mont_gather5,\@function,6 2286.align 32 2287bn_mulx4x_mont_gather5: 2288.cfi_startproc 2289 mov %rsp,%rax 
2290.cfi_def_cfa_register %rax 2291.Lmulx4x_enter: 2292 push %rbx 2293.cfi_push %rbx 2294 push %rbp 2295.cfi_push %rbp 2296 push %r12 2297.cfi_push %r12 2298 push %r13 2299.cfi_push %r13 2300 push %r14 2301.cfi_push %r14 2302 push %r15 2303.cfi_push %r15 2304.Lmulx4x_prologue: 2305 2306 shl \$3,${num}d # convert $num to bytes 2307 lea ($num,$num,2),%r10 # 3*$num in bytes 2308 neg $num # -$num 2309 mov ($n0),$n0 # *n0 2310 2311 ############################################################## 2312 # Ensure that stack frame doesn't alias with $rptr+3*$num 2313 # modulo 4096, which covers ret[num], am[num] and n[num] 2314 # (see bn_exp.c). This is done to allow memory disambiguation 2315 # logic do its magic. [Extra [num] is allocated in order 2316 # to align with bn_power5's frame, which is cleansed after 2317 # completing exponentiation. Extra 256 bytes is for power mask 2318 # calculated from 7th argument, the index.] 2319 # 2320 lea -320(%rsp,$num,2),%r11 2321 mov %rsp,%rbp 2322 sub $rp,%r11 2323 and \$4095,%r11 2324 cmp %r11,%r10 2325 jb .Lmulx4xsp_alt 2326 sub %r11,%rbp # align with $aptr 2327 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2328 jmp .Lmulx4xsp_done 2329 2330.Lmulx4xsp_alt: 2331 lea 4096-320(,$num,2),%r10 2332 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2333 sub %r10,%r11 2334 mov \$0,%r10 2335 cmovc %r10,%r11 2336 sub %r11,%rbp 2337.Lmulx4xsp_done: 2338 and \$-64,%rbp # ensure alignment 2339 mov %rsp,%r11 2340 sub %rbp,%r11 2341 and \$-4096,%r11 2342 lea (%rbp,%r11),%rsp 2343 mov (%rsp),%r10 2344 cmp %rbp,%rsp 2345 ja .Lmulx4x_page_walk 2346 jmp .Lmulx4x_page_walk_done 2347 2348.Lmulx4x_page_walk: 2349 lea -4096(%rsp),%rsp 2350 mov (%rsp),%r10 2351 cmp %rbp,%rsp 2352 ja .Lmulx4x_page_walk 2353.Lmulx4x_page_walk_done: 2354 2355 ############################################################## 2356 # Stack layout 2357 # +0 -num 2358 # +8 off-loaded &b[i] 2359 # +16 end of b[num] 2360 # +24 inner counter 2361 # +32 saved n0 2362 # +40 saved %rsp 2363 # +48 2364 # +56 saved rp 2365 # +64 tmp[num+1] 2366 # 2367 mov $n0, 32(%rsp) # save *n0 2368 mov %rax,40(%rsp) # save original %rsp 2369.cfi_cfa_expression %rsp+40,deref,+8 2370.Lmulx4x_body: 2371 call mulx4x_internal 2372 2373 mov 40(%rsp),%rsi # restore %rsp 2374.cfi_def_cfa %rsi,8 2375 mov \$1,%rax 2376 2377 mov -48(%rsi),%r15 2378.cfi_restore %r15 2379 mov -40(%rsi),%r14 2380.cfi_restore %r14 2381 mov -32(%rsi),%r13 2382.cfi_restore %r13 2383 mov -24(%rsi),%r12 2384.cfi_restore %r12 2385 mov -16(%rsi),%rbp 2386.cfi_restore %rbp 2387 mov -8(%rsi),%rbx 2388.cfi_restore %rbx 2389 lea (%rsi),%rsp 2390.cfi_def_cfa_register %rsp 2391.Lmulx4x_epilogue: 2392 ret 2393.cfi_endproc 2394.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2395 2396.type mulx4x_internal,\@abi-omnipotent 2397.align 32 2398mulx4x_internal: 2399.cfi_startproc 2400 mov $num,8(%rsp) # save -$num (it was in bytes) 2401 mov $num,%r10 2402 neg $num # restore $num 2403 shl \$5,$num 2404 neg %r10 # restore $num 2405 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2406 shr \$5+5,$num 2407 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2408 sub \$1,$num 2409 lea .Linc(%rip),%rax 2410 mov %r13,16+8(%rsp) # end of b[num] 2411 mov $num,24+8(%rsp) # inner counter 2412 mov $rp, 56+8(%rsp) # save $rp 2413___ 2414my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2415 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2416my $rptr=$bptr; 2417my $STRIDE=2**5*8; # 5 is "window size" 2418my $N=$STRIDE/4; # 
$code.=<<___;
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
	lea	128($bp),$bptr		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	.byte	0x67
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	movdqa	%xmm4,%xmm3
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($i+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;			# last iteration can be optimized
	.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($i+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($i+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($i+2)+112`(%r10)

	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
	pand	`16*($i+1)-128`($bptr),%xmm1
	pand	`16*($i+2)-128`($bptr),%xmm2
	movdqa	%xmm3,`16*($i+3)+112`(%r10)
	pand	`16*($i+3)-128`($bptr),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`($bptr),%xmm4
	movdqa	`16*($i+1)-128`($bptr),%xmm5
	movdqa	`16*($i+2)-128`($bptr),%xmm2
	pand	`16*($i+0)+112`(%r10),%xmm4
	movdqa	`16*($i+3)-128`($bptr),%xmm3
	pand	`16*($i+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($i+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($i+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	pxor	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bptr),$bptr
	movq	%xmm0,%rdx		# bp[0]
	lea	64+8*4+8(%rsp),$tptr

	mov	%rdx,$bi
	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
	add	%rax,%r11
	mulx	2*8($aptr),%rax,%r13	# ...
	adc	%rax,%r12
	adc	\$0,%r13
	mulx	3*8($aptr),%rax,%r14

	mov	$mi,%r15
	imulq	32+8(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0
	mov	$mi,%rdx

	mov	$bptr,8+8(%rsp)		# off-load &b[i]

	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	24+8(%rsp),$bptr	# counter value
	mov	%r10,-8*4($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-8*3($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-8*2($tptr)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	8(%rsp),$num		# load -num
	adc	$zero,%r15		# modulo-scheduled
	lea	($aptr,$num),$aptr	# rewind $aptr
	add	%r15,%r14
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
	pxor	%xmm4,%xmm4
	.byte	0x67,0x67
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`($bptr),%xmm0
	movdqa	`16*($i+1)-128`($bptr),%xmm1
	movdqa	`16*($i+2)-128`($bptr),%xmm2
	pand	`16*($i+0)+256`(%r10),%xmm0
	movdqa	`16*($i+3)-128`($bptr),%xmm3
	pand	`16*($i+1)+256`(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)+256`(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)+256`(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bptr),$bptr
	movq	%xmm0,%rdx		# m0=bp[i]

	mov	$zero,($tptr)		# save top-most carry
	lea	4*8($tptr,$num),$tptr	# rewind $tptr
	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	$zero,$zero		# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi		# +t[0]
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	mulx	3*8($aptr),%rdx,%r14
	adox	-2*8($tptr),%r12
	adcx	%rdx,%r13
	lea	($nptr,$num),$nptr	# rewind $nptr
	lea	4*8($aptr),$aptr
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	adox	$zero,%r14

	mov	$mi,%r15
	imulq	32+8(%rsp),$mi		# "t[0]"*n0

	mov	$mi,%rdx
	xor	$zero,$zero		# cf=0, of=0
	mov	$bptr,8+8(%rsp)		# off-load &b[i]

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	24+8(%rsp),$bptr	# counter value
	mov	%r10,-8*4($tptr)
	adcx	%rax,%r12
	mov	%r11,-8*3($tptr)
	adox	$zero,%r15		# of=0
	mov	%r12,-8*2($tptr)
	lea	4*8($nptr),$nptr
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mov	%r11,-4*8($tptr)
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	lea	4*8($nptr),$nptr
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0+8(%rsp),$num		# load -num
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
	mov	8+8(%rsp),$bptr		# re-load &b[i]
	mov	16+8(%rsp),%r10
	adc	%r15,%r14
	lea	($aptr,$num),$aptr	# rewind $aptr
	adc	$zero,$zero		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	%r10,$bptr
	jb	.Lmulx4x_outer

	mov	-8($nptr),%r10
	mov	$zero,%r8
	mov	($nptr,$num),%r12
	lea	($nptr,$num),%rbp	# rewind $nptr
	mov	$num,%rcx
	lea	($tptr,$num),%rdi	# rewind $tptr
	xor	%eax,%eax
	xor	%r15,%r15
	sub	%r14,%r10		# compare top-most words
	adc	%r15,%r15
	or	%r15,%r8
	sar	\$3+2,%rcx
	sub	%r8,%rax		# %rax=-%r8
	mov	56+8(%rsp),%rdx		# restore rp
	dec	%r12			# so that after 'not' we get -n[0]
	mov	8*1(%rbp),%r13
	xor	%r8,%r8
	mov	8*2(%rbp),%r14
	mov	8*3(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	# common post-condition
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
___
}{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const BN_ULONG *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr);

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

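# The AD*X path below mirrors bn_power5: five back-to-back Montgomery
# squarings of the input, followed by one Montgomery multiplication with
# the table entry selected by the 7th argument, which lets bn_exp.c
# consume the exponent five bits at a time.  A minimal sketch of the
# intent (illustrative only, never executed; mont_sqr and
# mont_mul_gather5 are hypothetical helpers standing in for the
# __bn_sqrx8x_internal/__bn_postx4x_internal and mulx4x_internal calls
# made below):
#
#	for (1..5) { $a = mont_sqr($a, $n, $n0); }
#	$r = mont_mul_gather5($a, $table, $pwr, $n, $n0);
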
$code.=<<___;
.type	bn_powerx5,\@function,6
.align	32
bn_powerx5:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lpowerx5_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num
	mov	($n0),$n0		# *n0

	##############################################################
	# Ensure that stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow memory disambiguation
	# logic to do its magic. [Extra 256 bytes is for power mask
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwrx_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lpwrx_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +16	intermediate carry bit
	# +24	top-most carry bit, used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	$nptr,%xmm2		# save $nptr
	movq	%r10, %xmm3		# -$num
	movq	$bptr,%xmm4
	mov	$n0, 32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	mov	%r10,$num		# -num
	mov	$aptr,$rptr
	movq	%xmm2,$nptr
	movq	%xmm4,$bptr
	mov	40(%rsp),%rax

	call	mulx4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	ret
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5

.globl	bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.type	bn_sqrx8x_internal,\@abi-omnipotent
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
.cfi_startproc
	##################################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##################################################################
	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
	# a[1]a[0]
	# a[2]a[0]
	# a[3]a[0]
	# a[2]a[1]
	# a[3]a[1]
	# a[3]a[2]
	#
	# a[4]a[0]
	# a[5]a[0]
	# a[6]a[0]
	# a[7]a[0]
	# a[4]a[1]
	# a[5]a[1]
	# a[6]a[1]
	# a[7]a[1]
	# a[4]a[2]
	# a[5]a[2]
	# a[6]a[2]
	# a[7]a[2]
	# a[4]a[3]
	# a[5]a[3]
	# a[6]a[3]
	# a[7]a[3]
	#
	# a[5]a[4]
	# a[6]a[4]
	# a[7]a[4]
	# a[6]a[5]
	# a[7]a[5]
	# a[7]a[6]
	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
___
{
my ($zero,$carry)=("%rbp","%rcx");
my $aaptr=$zero;
$code.=<<___;
	lea	48+8(%rsp),$tptr
	lea	($aptr,$num),$aaptr
	mov	$num,0+8(%rsp)		# save $num
	mov	$aaptr,8+8(%rsp)	# save end of $aptr
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
	.byte	0x3e
	movdqa	%xmm0,0*8($tptr)
	movdqa	%xmm0,2*8($tptr)
	movdqa	%xmm0,4*8($tptr)
	movdqa	%xmm0,6*8($tptr)
.Lsqr8x_zero_start:			# aligned at 32
	movdqa	%xmm0,8*8($tptr)
	movdqa	%xmm0,10*8($tptr)
	movdqa	%xmm0,12*8($tptr)
	movdqa	%xmm0,14*8($tptr)
	lea	16*8($tptr),$tptr
	sub	\$64,$num
	jnz	.Lsqrx8x_zero

	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
	#xor	%r9,%r9			# t[1], ex-$num, zero already
	xor	%r10,%r10
	xor	%r11,%r11
	xor	%r12,%r12
	xor	%r13,%r13
	xor	%r14,%r14
	xor	%r15,%r15
	lea	48+8(%rsp),$tptr
	xor	$zero,$zero		# cf=0, of=0
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
	adox	%rax,%r10
	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
	adcx	%r10,%r9
	adox	%rax,%r11
	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx 3*8($aptr),%r10,%rax # ...
	adcx	%r11,%r10
	adox	%rax,%r12
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx 4*8($aptr),%r11,%rax
	adcx	%r12,%r11
	adox	%rax,%r13
	mulx	5*8($aptr),%r12,%rax
	adcx	%r13,%r12
	adox	%rax,%r14
	mulx	6*8($aptr),%r13,%rax
	adcx	%r14,%r13
	adox	%r15,%rax
	mulx	7*8($aptr),%r14,%r15
	mov	1*8($aptr),%rdx		# a[1]
	adcx	%rax,%r14
	adox	$zero,%r15
	adc	8*8($tptr),%r15
	mov	%r8,1*8($tptr)		# t[1]
	mov	%r9,2*8($tptr)		# t[2]
	sbb	$carry,$carry		# mov %cf,$carry
	xor	$zero,$zero		# cf=0, of=0


	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	4*8($aptr),%r10,%rbx	# ...
	adcx	%r11,%r9
	adox	%rax,%r10
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx 5*8($aptr),%r11,%rax
	adcx	%r12,%r10
	adox	%rbx,%r11
	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 6*8($aptr),%r12,%rbx
	adcx	%r13,%r11
	adox	%r14,%r12
	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx 7*8($aptr),%r13,%r14
	mov	2*8($aptr),%rdx		# a[2]
	adcx	%rax,%r12
	adox	%rbx,%r13
	adcx	%r15,%r13
	adox	$zero,%r14		# of=0
	adcx	$zero,%r14		# cf=0

	mov	%r8,3*8($tptr)		# t[3]
	mov	%r9,4*8($tptr)		# t[4]

	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
	adcx	%r10,%r8
	adox	%rbx,%r9
	mulx	5*8($aptr),%r10,%rbx	# ...
	adcx	%r11,%r9
	adox	%rax,%r10
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx 6*8($aptr),%r11,%rax
	adcx	%r12,%r10
	adox	%r13,%r11
	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx 7*8($aptr),%r12,%r13
	.byte	0x3e
	mov	3*8($aptr),%rdx		# a[3]
	adcx	%rbx,%r11
	adox	%rax,%r12
	adcx	%r14,%r12
	mov	%r8,5*8($tptr)		# t[5]
	mov	%r9,6*8($tptr)		# t[6]
	mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
	adox	$zero,%r13		# of=0
	adcx	$zero,%r13		# cf=0

	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
	adcx	%r10,%r8
	adox	%rax,%r9
	mulx	6*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r9
	adox	%r12,%r10
	mulx	7*8($aptr),%r11,%r12
	mov	4*8($aptr),%rdx		# a[4]
	mov	5*8($aptr),%r14		# a[5]
	adcx	%rbx,%r10
	adox	%rax,%r11
	mov	6*8($aptr),%r15		# a[6]
	adcx	%r13,%r11
	adox	$zero,%r12		# of=0
	adcx	$zero,%r12		# cf=0

	mov	%r8,7*8($tptr)		# t[7]
	mov	%r9,8*8($tptr)		# t[8]

	mulx	%r14,%r9,%rax		# a[5]*a[4]
	mov	7*8($aptr),%r8		# a[7]
	adcx	%r10,%r9
	mulx	%r15,%r10,%rbx		# a[6]*a[4]
	adox	%rax,%r10
	adcx	%r11,%r10
	mulx	%r8,%r11,%rax		# a[7]*a[4]
	mov	%r14,%rdx		# a[5]
	adox	%rbx,%r11
	adcx	%r12,%r11
	#adox	$zero,%rax		# of=0
	adcx	$zero,%rax		# cf=0

	mulx	%r15,%r14,%rbx		# a[6]*a[5]
	mulx	%r8,%r12,%r13		# a[7]*a[5]
	mov	%r15,%rdx		# a[6]
	lea	8*8($aptr),$aptr
	adcx	%r14,%r11
	adox	%rbx,%r12
	adcx	%rax,%r12
	adox	$zero,%r13

	.byte	0x67,0x67
	mulx	%r8,%r8,%r14		# a[7]*a[6]
	adcx	%r8,%r13
	adcx	$zero,%r14

	cmp	8+8(%rsp),$aptr
	je	.Lsqrx8x_outer_break

	neg	$carry			# mov $carry,%cf
	mov	\$-8,%rcx
	mov	$zero,%r15
	mov	8*8($tptr),%r8
	adcx	9*8($tptr),%r9		# +=t[9]
	adcx	10*8($tptr),%r10	# ...
	adcx	11*8($tptr),%r11
	adc	12*8($tptr),%r12
	adc	13*8($tptr),%r13
	adc	14*8($tptr),%r14
	adc	15*8($tptr),%r15
	lea	($aptr),$aaptr
	lea	2*64($tptr),$tptr
	sbb	%rax,%rax		# mov %cf,$carry

	mov	-64($aptr),%rdx		# a[0]
	mov	%rax,16+8(%rsp)		# offload $carry
	mov	$tptr,24+8(%rsp)

	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
	xor	%eax,%eax		# cf=0, of=0
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_loop:
	mov	%r8,%rbx
	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
	adcx	%rax,%rbx		# +=t[8]
	adox	%r9,%r8

	mulx	1*8($aaptr),%rax,%r9	# ...
	adcx	%rax,%r8
	adox	%r10,%r9

	mulx	2*8($aaptr),%rax,%r10
	adcx	%rax,%r9
	adox	%r11,%r10

	mulx	3*8($aaptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 4*8($aaptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12

	mulx	5*8($aaptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	6*8($aaptr),%rax,%r14
	mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
	mov	\$0,%ebx
	adcx	%rax,%r13
	adox	%r15,%r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx 7*8($aaptr),%rax,%r15
	mov	8($aptr,%rcx,8),%rdx	# a[i]
	adcx	%rax,%r14
	adox	%rbx,%r15		# %rbx is 0, of=0
	adcx	%rbx,%r15		# cf=0

	.byte	0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_loop

	lea	8*8($aaptr),$aaptr
	mov	\$-8,%rcx
	cmp	8+8(%rsp),$aaptr	# done?
	je	.Lsqrx8x_break

	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
	.byte	0x66
	mov	-64($aptr),%rdx
	adcx	0*8($tptr),%r8
	adcx	1*8($tptr),%r9
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	adc	4*8($tptr),%r12
	adc	5*8($tptr),%r13
	adc	6*8($tptr),%r14
	adc	7*8($tptr),%r15
	lea	8*8($tptr),$tptr
	.byte	0x67
	sbb	%rax,%rax		# mov %cf,%rax
	xor	%ebx,%ebx		# cf=0, of=0
	mov	%rax,16+8(%rsp)		# offload carry
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	xor	$zero,$zero
	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
	adcx	$zero,%r8
	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
	adcx	$zero,%r9
	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
	adc	\$0,%r10
	mov	%r8,0*8($tptr)
	adc	\$0,%r11
	adc	\$0,%r12
	adc	\$0,%r13
	adc	\$0,%r14
	adc	\$0,%r15
	cmp	$carry,$tptr		# cf=0, of=0
	je	.Lsqrx8x_outer_loop

	mov	%r9,1*8($tptr)
	mov	1*8($carry),%r9
	mov	%r10,2*8($tptr)
	mov	2*8($carry),%r10
	mov	%r11,3*8($tptr)
	mov	3*8($carry),%r11
	mov	%r12,4*8($tptr)
	mov	4*8($carry),%r12
	mov	%r13,5*8($tptr)
	mov	5*8($carry),%r13
	mov	%r14,6*8($tptr)
	mov	6*8($carry),%r14
	mov	%r15,7*8($tptr)
	mov	7*8($carry),%r15
	mov	$carry,$tptr
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	mov	%r9,9*8($tptr)		# t[9]
	movq	%xmm3,%rcx		# -$num
	mov	%r10,10*8($tptr)	# ...
	mov	%r11,11*8($tptr)
	mov	%r12,12*8($tptr)
	mov	%r13,13*8($tptr)
	mov	%r14,14*8($tptr)
___
}{
my $i="%rcx";
$code.=<<___;
	lea	48+8(%rsp),$tptr
	mov	($aptr,$i),%rdx		# a[0]

	mov	8($tptr),$A0[1]		# t[1]
	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
	mov	0+8(%rsp),$num		# restore $num
	adox	$A0[1],$A0[1]
	mov	16($tptr),$A1[0]	# t[2]	# prefetch
	mov	24($tptr),$A1[1]	# t[3]	# prefetch
	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned

.align	32
.Lsqrx4x_shift_n_add:
	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov 8($aptr,$i),%rdx	# a[i+1]	# prefetch
	.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov 32($tptr),$A0[0]	# t[2*i+4]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	40($tptr),$A0[1]	# t[2*i+4+1]	# prefetch
	mov	%rax,0($tptr)
	mov	%rbx,8($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
	mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
	mov	%rax,16($tptr)
	mov	%rbx,24($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A1[0],$A1[0]
	adcx	$A0[0],%rax
	mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
	lea	32($i),$i
	mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
	adox	$A1[1],$A1[1]
	adcx	$A0[1],%rbx
	mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
	mov	%rax,32($tptr)
	mov	%rbx,40($tptr)

	mulx	%rdx,%rax,%rbx
	adox	$A0[0],$A0[0]
	adcx	$A1[0],%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
	.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov 0($aptr,$i),%rdx	# a[i+4]	# prefetch
	adox	$A0[1],$A0[1]
	adcx	$A1[1],%rbx
	mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
	mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcx	$A1[1],%rbx
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr		# end of t[] buffer
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
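#
# As a rough reference model for the reduction below (an illustrative
# sketch only, never executed, assuming exact bignum arithmetic; the
# variable names are hypothetical and unrelated to the register aliases
# used in the assembly), one word-by-word Montgomery reduction pass is:
#
#	for (my $i=0; $i<$num; $i++) {
#	    my $m = ($t[$i]*$n0) % $b;		# $b=2^64, $n0=-n^-1 mod 2^64
#	    my $c = 0;
#	    for (my $j=0; $j<$num; $j++) {	# t += m*n, shifted by i words
#		my $v = $t[$i+$j] + $m*$n[$j] + $c;
#		($t[$i+$j], $c) = ($v % $b, int($v/$b));
#	    }
#	    $t[$i+$num] += $c;			# carry propagation, simplified
#	}
#	# t[num..2*num-1] then holds the (almost) reduced value; the 4x
#	# unrolled post-condition further below subtracts n once more if
#	# the result is still not below n.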
{
my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");

$code.=<<___;
	movq	%xmm2,$nptr
__bn_sqrx8x_reduction:
	xor	%eax,%eax		# initial top-most carry bit
	mov	32+8(%rsp),%rbx		# n0
	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
	lea	-8*8($nptr,$num),%rcx	# end of n[]
	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
	mov	%rcx, 0+8(%rsp)		# save end of n[]
	mov	$tptr,8+8(%rsp)		# save end of t[]

	lea	48+8(%rsp),$tptr	# initial t[] window
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	%rdx,%r8
	imulq	%rbx,%rdx		# n0*a[i]
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,24+8(%rsp)		# store top-most carry bit

	lea	8*8($tptr),$tptr
	xor	$carry,$carry		# cf=0,of=0
	mov	\$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	mov	%r8, %rbx
	mulx	8*0($nptr),%rax,%r8	# n[0]
	adcx	%rbx,%rax		# discarded
	adox	%r9,%r8

	mulx	8*1($nptr),%rbx,%r9	# n[1]
	adcx	%rbx,%r8
	adox	%r10,%r9

	mulx	8*2($nptr),%rbx,%r10
	adcx	%rbx,%r9
	adox	%r11,%r10

	mulx	8*3($nptr),%rbx,%r11
	adcx	%rbx,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 8*4($nptr),%rbx,%r12
	mov	%rdx,%rax
	mov	%r8,%rdx
	adcx	%rbx,%r11
	adox	%r13,%r12

	mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
	mov	%rax,%rdx
	mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]

	mulx	8*5($nptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	8*6($nptr),%rax,%r14
	adcx	%rax,%r13
	adox	%r15,%r14

	mulx	8*7($nptr),%rax,%r15
	mov	%rbx,%rdx
	adcx	%rax,%r14
	adox	$carry,%r15		# $carry is 0
	adcx	$carry,%r15		# cf=0

	.byte	0x67,0x67,0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_reduce

	mov	$carry,%rax		# xor %rax,%rax
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_no_tail

	mov	48+8(%rsp),%rdx		# pull n0*a[0]
	add	8*0($tptr),%r8
	lea	8*8($nptr),$nptr
	mov	\$-8,%rcx
	adcx	8*1($tptr),%r9
	adcx	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rax,%rax		# top carry

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	mov	%r8,%rbx
	mulx	8*0($nptr),%rax,%r8
	adcx	%rax,%rbx
	adox	%r9,%r8

	mulx	8*1($nptr),%rax,%r9
	adcx	%rax,%r8
	adox	%r10,%r9

	mulx	8*2($nptr),%rax,%r10
	adcx	%rax,%r9
	adox	%r11,%r10

	mulx	8*3($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11

	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 8*4($nptr),%rax,%r12
	adcx	%rax,%r11
	adox	%r13,%r12

	mulx	8*5($nptr),%rax,%r13
	adcx	%rax,%r12
	adox	%r14,%r13

	mulx	8*6($nptr),%rax,%r14
	adcx	%rax,%r13
	adox	%r15,%r14

	mulx	8*7($nptr),%rax,%r15
	mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
	adcx	%rax,%r14
	adox	$carry,%r15
	mov	%rbx,($tptr,%rcx,8)	# save result
	mov	%r8,%rbx
	adcx	$carry,%r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Lsqrx8x_tail

	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_tail_done	# break out of loop

	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
	mov	48+8(%rsp),%rdx		# pull n0*a[0]
	lea	8*8($nptr),$nptr
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rax,%rax
	sub	\$8,%rcx		# mov \$-8,%rcx

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	xor	%rax,%rax
	add	24+8(%rsp),%r8		# can this overflow?
	adc	\$0,%r9
	adc	\$0,%r10
	adc	\$0,%r11
	adc	\$0,%r12
	adc	\$0,%r13
	adc	\$0,%r14
	adc	\$0,%r15
	adc	\$0,%rax

	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
	adc	8*0($tptr),%r8
	movq	%xmm3,%rcx
	adc	8*1($tptr),%r9
	mov	8*7($nptr),$carry
	movq	%xmm2,$nptr		# restore $nptr
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	adc	\$0,%rax		# top-most carry

	mov	32+8(%rsp),%rbx		# n0
	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"

	mov	%r8,8*0($tptr)		# store top 512 bits
	lea	8*8($tptr),%r8		# borrow %r8
	mov	%r9,8*1($tptr)
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)

	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
	cmp	8+8(%rsp),%r8		# end of t[]?
	jb	.Lsqrx8x_reduction_loop
	ret
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
# Post-condition, 4x unrolled
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
$code.=<<___;
.align	32
.type	__bn_postx4x_internal,\@abi-omnipotent
__bn_postx4x_internal:
.cfi_startproc
	mov	8*0($nptr),%r12
	mov	%rcx,%r10		# -$num
	mov	%rcx,%r9		# -$num
	neg	%rax
	sar	\$3+2,%rcx
	#lea	48+8(%rsp,%r9),$tptr
	movq	%xmm1,$rptr		# restore $rptr
	movq	%xmm1,$aptr		# prepare for back-to-back call
	dec	%r12			# so that after 'not' we get -n[0]
	mov	8*1($nptr),%r13
	xor	%r8,%r8
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	mov	8*0($nptr),%r12
	mov	8*1($nptr),%r13
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
.Lsqrx4x_sub_entry:
	andn	%rax,%r12,%r12
	lea	8*4($nptr),$nptr
	andn	%rax,%r13,%r13
	andn	%rax,%r14,%r14
	andn	%rax,%r15,%r15

	neg	%r8			# mov %r8,%cf
	adc	8*0($tptr),%r12
	adc	8*1($tptr),%r13
	adc	8*2($tptr),%r14
	adc	8*3($tptr),%r15
	mov	%r12,8*0($rptr)
	lea	8*4($tptr),$tptr
	mov	%r13,8*1($rptr)
	sbb	%r8,%r8			# mov %cf,%r8
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx
	jnz	.Lsqrx4x_sub

	neg	%r9			# restore $num

	ret
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
___
}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") :	# Win64 order
			  ("%rdi","%esi","%rdx","%ecx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
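	# bn_scatter5 copies num words from inp into "column" idx of a table
	# that interleaves 32 values: word i of value idx is stored at
	# tbl[idx + 32*i].  bn_gather5 further below reads one value back
	# through SSE2 masks, so the memory access pattern does not depend
	# on idx (the cache-timing countermeasure).  Illustrative sketch
	# only (hypothetical C-like pseudocode, not executed):
	#
	#	for (i = 0; i < num; i++)
	#		tbl[idx + 32*i] = inp[i];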
bn_scatter5:
.cfi_startproc
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	32
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
.cfi_def_cfa_register	%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___ if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
.cfi_def_cfa_register	%rsp
	ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# beginning of body label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	.Lmul_epilogue(%rip),%r10
	cmp	%r10,%rbx
	ja	.Lbody_40

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs

.Lbody_40:
	mov	40(%rax),%rax		# pull saved stack pointer
.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_power5
	.rva	.LSEH_end_bn_power5
	.rva	.LSEH_info_bn_power5

	.rva	.LSEH_begin_bn_from_mont8x
	.rva	.LSEH_end_bn_from_mont8x
	.rva	.LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
	.rva	.LSEH_end_bn_mulx4x_mont_gather5
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;