1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# August 2011. 11# 12# Companion to x86_64-mont.pl that optimizes cache-timing attack 13# countermeasures. The subroutines are produced by replacing bp[i] 14# references in their x86_64-mont.pl counterparts with cache-neutral 15# references to powers table computed in BN_mod_exp_mont_consttime. 16# In addition subroutine that scatters elements of the powers table 17# is implemented, so that scatter-/gathering can be tuned without 18# bn_exp.c modifications. 19 20# August 2013. 21# 22# Add MULX/AD*X code paths and additional interfaces to optimize for 23# branch prediction unit. For input lengths that are multiples of 8 24# the np argument is not just modulus value, but one interleaved 25# with 0. This is to optimize post-condition... 26 27$flavour = shift; 28$output = shift; 29if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 30 31$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 32 33$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 34( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 35( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 36die "can't locate x86_64-xlate.pl"; 37 38open OUT,"| \"$^X\" $xlate $flavour $output"; 39*STDOUT=*OUT; 40 41# In upstream, this is controlled by shelling out to the compiler to check 42# versions, but BoringSSL is intended to be used with pre-generated perlasm 43# output, so this isn't useful anyway. 44# 45# TODO(davidben): Enable this after testing. $addx goes up to 1. 46$addx = 0; 47 48# int bn_mul_mont_gather5( 49$rp="%rdi"; # BN_ULONG *rp, 50$ap="%rsi"; # const BN_ULONG *ap, 51$bp="%rdx"; # const BN_ULONG *bp, 52$np="%rcx"; # const BN_ULONG *np, 53$n0="%r8"; # const BN_ULONG *n0, 54$num="%r9"; # int num, 55 # int idx); # 0 to 2^5-1, "index" in $bp holding 56 # pre-computed powers of a', interlaced 57 # in such manner that b[0] is $bp[idx], 58 # b[1] is [2^5+idx], etc. 59$lo0="%r10"; 60$hi0="%r11"; 61$hi1="%r13"; 62$i="%r14"; 63$j="%r15"; 64$m0="%rbx"; 65$m1="%rbp"; 66 67$code=<<___; 68.text 69 70.extern OPENSSL_ia32cap_P 71 72.globl bn_mul_mont_gather5 73.type bn_mul_mont_gather5,\@function,6 74.align 64 75bn_mul_mont_gather5: 76 test \$7,${num}d 77 jnz .Lmul_enter 78___ 79$code.=<<___ if ($addx); 80 mov OPENSSL_ia32cap_P+8(%rip),%r11d 81___ 82$code.=<<___; 83 jmp .Lmul4x_enter 84 85.align 16 86.Lmul_enter: 87 mov ${num}d,${num}d 88 mov %rsp,%rax 89 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument 90 push %rbx 91 push %rbp 92 push %r12 93 push %r13 94 push %r14 95 push %r15 96___ 97$code.=<<___ if ($win64); 98 lea -0x28(%rsp),%rsp 99 movaps %xmm6,(%rsp) 100 movaps %xmm7,0x10(%rsp) 101___ 102$code.=<<___; 103 lea 2($num),%r11 104 neg %r11 105 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) 106 and \$-1024,%rsp # minimize TLB usage 107 108 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 109.Lmul_body: 110 mov $bp,%r12 # reassign $bp 111___ 112 $bp="%r12"; 113 $STRIDE=2**5*8; # 5 is "window size" 114 $N=$STRIDE/4; # should match cache line size 115$code.=<<___; 116 mov %r10,%r11 117 shr \$`log($N/8)/log(2)`,%r10 118 and \$`$N/8-1`,%r11 119 not %r10 120 lea .Lmagic_masks(%rip),%rax 121 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 122 lea 96($bp,%r11,8),$bp # pointer within 1st cache line 123 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 124 movq 8(%rax,%r10,8),%xmm5 # cache line contains element 125 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 126 movq 24(%rax,%r10,8),%xmm7 127 128 movq `0*$STRIDE/4-96`($bp),%xmm0 129 movq `1*$STRIDE/4-96`($bp),%xmm1 130 pand %xmm4,%xmm0 131 movq `2*$STRIDE/4-96`($bp),%xmm2 132 pand %xmm5,%xmm1 133 movq `3*$STRIDE/4-96`($bp),%xmm3 134 pand %xmm6,%xmm2 135 por %xmm1,%xmm0 136 pand %xmm7,%xmm3 137 por %xmm2,%xmm0 138 lea $STRIDE($bp),$bp 139 por %xmm3,%xmm0 140 141 movq %xmm0,$m0 # m0=bp[0] 142 143 mov ($n0),$n0 # pull n0[0] value 144 mov ($ap),%rax 145 146 xor $i,$i # i=0 147 xor $j,$j # j=0 148 149 movq `0*$STRIDE/4-96`($bp),%xmm0 150 movq `1*$STRIDE/4-96`($bp),%xmm1 151 pand %xmm4,%xmm0 152 movq `2*$STRIDE/4-96`($bp),%xmm2 153 pand %xmm5,%xmm1 154 155 mov $n0,$m1 156 mulq $m0 # ap[0]*bp[0] 157 mov %rax,$lo0 158 mov ($np),%rax 159 160 movq `3*$STRIDE/4-96`($bp),%xmm3 161 pand %xmm6,%xmm2 162 por %xmm1,%xmm0 163 pand %xmm7,%xmm3 164 165 imulq $lo0,$m1 # "tp[0]"*n0 166 mov %rdx,$hi0 167 168 por %xmm2,%xmm0 169 lea $STRIDE($bp),$bp 170 por %xmm3,%xmm0 171 172 mulq $m1 # np[0]*m1 173 add %rax,$lo0 # discarded 174 mov 8($ap),%rax 175 adc \$0,%rdx 176 mov %rdx,$hi1 177 178 lea 1($j),$j # j++ 179 jmp .L1st_enter 180 181.align 16 182.L1st: 183 add %rax,$hi1 184 mov ($ap,$j,8),%rax 185 adc \$0,%rdx 186 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 187 mov $lo0,$hi0 188 adc \$0,%rdx 189 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 190 mov %rdx,$hi1 191 192.L1st_enter: 193 mulq $m0 # ap[j]*bp[0] 194 add %rax,$hi0 195 mov ($np,$j,8),%rax 196 adc \$0,%rdx 197 lea 1($j),$j # j++ 198 mov %rdx,$lo0 199 200 mulq $m1 # np[j]*m1 201 cmp $num,$j 202 jne .L1st 203 204 movq %xmm0,$m0 # bp[1] 205 206 add %rax,$hi1 207 mov ($ap),%rax # ap[0] 208 adc \$0,%rdx 209 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 210 adc \$0,%rdx 211 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 212 mov %rdx,$hi1 213 mov $lo0,$hi0 214 215 xor %rdx,%rdx 216 add $hi0,$hi1 217 adc \$0,%rdx 218 mov $hi1,-8(%rsp,$num,8) 219 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 220 221 lea 1($i),$i # i++ 222 jmp .Louter 223.align 16 224.Louter: 225 xor $j,$j # j=0 226 mov $n0,$m1 227 mov (%rsp),$lo0 228 229 movq `0*$STRIDE/4-96`($bp),%xmm0 230 movq `1*$STRIDE/4-96`($bp),%xmm1 231 pand %xmm4,%xmm0 232 movq `2*$STRIDE/4-96`($bp),%xmm2 233 pand %xmm5,%xmm1 234 235 mulq $m0 # ap[0]*bp[i] 236 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 237 mov ($np),%rax 238 adc \$0,%rdx 239 240 movq `3*$STRIDE/4-96`($bp),%xmm3 241 pand %xmm6,%xmm2 242 por %xmm1,%xmm0 243 pand %xmm7,%xmm3 244 245 imulq $lo0,$m1 # tp[0]*n0 246 mov %rdx,$hi0 247 248 por %xmm2,%xmm0 249 lea $STRIDE($bp),$bp 250 por %xmm3,%xmm0 251 252 mulq $m1 # np[0]*m1 253 add %rax,$lo0 # discarded 254 mov 8($ap),%rax 255 adc \$0,%rdx 256 mov 8(%rsp),$lo0 # tp[1] 257 mov %rdx,$hi1 258 259 lea 1($j),$j # j++ 260 jmp .Linner_enter 261 262.align 16 263.Linner: 264 add %rax,$hi1 265 mov ($ap,$j,8),%rax 266 adc \$0,%rdx 267 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 268 mov (%rsp,$j,8),$lo0 269 adc \$0,%rdx 270 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 271 mov %rdx,$hi1 272 273.Linner_enter: 274 mulq $m0 # ap[j]*bp[i] 275 add %rax,$hi0 276 mov ($np,$j,8),%rax 277 adc \$0,%rdx 278 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 279 mov %rdx,$hi0 280 adc \$0,$hi0 281 lea 1($j),$j # j++ 282 283 mulq $m1 # np[j]*m1 284 cmp $num,$j 285 jne .Linner 286 287 movq %xmm0,$m0 # bp[i+1] 288 289 add %rax,$hi1 290 mov ($ap),%rax # ap[0] 291 adc \$0,%rdx 292 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 293 mov (%rsp,$j,8),$lo0 294 adc \$0,%rdx 295 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 296 mov %rdx,$hi1 297 298 xor %rdx,%rdx 299 add $hi0,$hi1 300 adc \$0,%rdx 301 add $lo0,$hi1 # pull upmost overflow bit 302 adc \$0,%rdx 303 mov $hi1,-8(%rsp,$num,8) 304 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 305 306 lea 1($i),$i # i++ 307 cmp $num,$i 308 jb .Louter 309 310 xor $i,$i # i=0 and clear CF! 311 mov (%rsp),%rax # tp[0] 312 lea (%rsp),$ap # borrow ap for tp 313 mov $num,$j # j=num 314 jmp .Lsub 315.align 16 316.Lsub: sbb ($np,$i,8),%rax 317 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 318 mov 8($ap,$i,8),%rax # tp[i+1] 319 lea 1($i),$i # i++ 320 dec $j # doesnn't affect CF! 321 jnz .Lsub 322 323 sbb \$0,%rax # handle upmost overflow bit 324 xor $i,$i 325 mov $num,$j # j=num 326.align 16 327.Lcopy: # copy or in-place refresh 328 mov (%rsp,$i,8),$ap 329 mov ($rp,$i,8),$np 330 xor $np,$ap # conditional select: 331 and %rax,$ap # ((ap ^ np) & %rax) ^ np 332 xor $np,$ap # ap = borrow?tp:rp 333 mov $i,(%rsp,$i,8) # zap temporary vector 334 mov $ap,($rp,$i,8) # rp[i]=tp[i] 335 lea 1($i),$i 336 sub \$1,$j 337 jnz .Lcopy 338 339 mov 8(%rsp,$num,8),%rsi # restore %rsp 340 mov \$1,%rax 341___ 342$code.=<<___ if ($win64); 343 movaps -88(%rsi),%xmm6 344 movaps -72(%rsi),%xmm7 345___ 346$code.=<<___; 347 mov -48(%rsi),%r15 348 mov -40(%rsi),%r14 349 mov -32(%rsi),%r13 350 mov -24(%rsi),%r12 351 mov -16(%rsi),%rbp 352 mov -8(%rsi),%rbx 353 lea (%rsi),%rsp 354.Lmul_epilogue: 355 ret 356.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 357___ 358{{{ 359my @A=("%r10","%r11"); 360my @N=("%r13","%rdi"); 361$code.=<<___; 362.type bn_mul4x_mont_gather5,\@function,6 363.align 32 364bn_mul4x_mont_gather5: 365.Lmul4x_enter: 366___ 367$code.=<<___ if ($addx); 368 and \$0x80100,%r11d 369 cmp \$0x80100,%r11d 370 je .Lmulx4x_enter 371___ 372$code.=<<___; 373 .byte 0x67 374 mov %rsp,%rax 375 push %rbx 376 push %rbp 377 push %r12 378 push %r13 379 push %r14 380 push %r15 381___ 382$code.=<<___ if ($win64); 383 lea -0x28(%rsp),%rsp 384 movaps %xmm6,(%rsp) 385 movaps %xmm7,0x10(%rsp) 386___ 387$code.=<<___; 388 .byte 0x67 389 mov ${num}d,%r10d 390 shl \$3,${num}d 391 shl \$3+2,%r10d # 4*$num 392 neg $num # -$num 393 394 ############################################################## 395 # ensure that stack frame doesn't alias with $aptr+4*$num 396 # modulo 4096, which covers ret[num], am[num] and n[2*num] 397 # (see bn_exp.c). this is done to allow memory disambiguation 398 # logic do its magic. [excessive frame is allocated in order 399 # to allow bn_from_mont8x to clear it.] 400 # 401 lea -64(%rsp,$num,2),%r11 402 sub $ap,%r11 403 and \$4095,%r11 404 cmp %r11,%r10 405 jb .Lmul4xsp_alt 406 sub %r11,%rsp # align with $ap 407 lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) 408 jmp .Lmul4xsp_done 409 410.align 32 411.Lmul4xsp_alt: 412 lea 4096-64(,$num,2),%r10 413 lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) 414 sub %r10,%r11 415 mov \$0,%r10 416 cmovc %r10,%r11 417 sub %r11,%rsp 418.Lmul4xsp_done: 419 and \$-64,%rsp 420 neg $num 421 422 mov %rax,40(%rsp) 423.Lmul4x_body: 424 425 call mul4x_internal 426 427 mov 40(%rsp),%rsi # restore %rsp 428 mov \$1,%rax 429___ 430$code.=<<___ if ($win64); 431 movaps -88(%rsi),%xmm6 432 movaps -72(%rsi),%xmm7 433___ 434$code.=<<___; 435 mov -48(%rsi),%r15 436 mov -40(%rsi),%r14 437 mov -32(%rsi),%r13 438 mov -24(%rsi),%r12 439 mov -16(%rsi),%rbp 440 mov -8(%rsi),%rbx 441 lea (%rsi),%rsp 442.Lmul4x_epilogue: 443 ret 444.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 445 446.type mul4x_internal,\@abi-omnipotent 447.align 32 448mul4x_internal: 449 shl \$5,$num 450 mov `($win64?56:8)`(%rax),%r10d # load 7th argument 451 lea 256(%rdx,$num),%r13 452 shr \$5,$num # restore $num 453___ 454 $bp="%r12"; 455 $STRIDE=2**5*8; # 5 is "window size" 456 $N=$STRIDE/4; # should match cache line size 457 $tp=$i; 458$code.=<<___; 459 mov %r10,%r11 460 shr \$`log($N/8)/log(2)`,%r10 461 and \$`$N/8-1`,%r11 462 not %r10 463 lea .Lmagic_masks(%rip),%rax 464 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 465 lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line 466 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 467 movq 8(%rax,%r10,8),%xmm5 # cache line contains element 468 add \$7,%r11 469 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 470 movq 24(%rax,%r10,8),%xmm7 471 and \$7,%r11 472 473 movq `0*$STRIDE/4-96`($bp),%xmm0 474 lea $STRIDE($bp),$tp # borrow $tp 475 movq `1*$STRIDE/4-96`($bp),%xmm1 476 pand %xmm4,%xmm0 477 movq `2*$STRIDE/4-96`($bp),%xmm2 478 pand %xmm5,%xmm1 479 movq `3*$STRIDE/4-96`($bp),%xmm3 480 pand %xmm6,%xmm2 481 .byte 0x67 482 por %xmm1,%xmm0 483 movq `0*$STRIDE/4-96`($tp),%xmm1 484 .byte 0x67 485 pand %xmm7,%xmm3 486 .byte 0x67 487 por %xmm2,%xmm0 488 movq `1*$STRIDE/4-96`($tp),%xmm2 489 .byte 0x67 490 pand %xmm4,%xmm1 491 .byte 0x67 492 por %xmm3,%xmm0 493 movq `2*$STRIDE/4-96`($tp),%xmm3 494 495 movq %xmm0,$m0 # m0=bp[0] 496 movq `3*$STRIDE/4-96`($tp),%xmm0 497 mov %r13,16+8(%rsp) # save end of b[num] 498 mov $rp, 56+8(%rsp) # save $rp 499 500 mov ($n0),$n0 # pull n0[0] value 501 mov ($ap),%rax 502 lea ($ap,$num),$ap # end of a[num] 503 neg $num 504 505 mov $n0,$m1 506 mulq $m0 # ap[0]*bp[0] 507 mov %rax,$A[0] 508 mov ($np),%rax 509 510 pand %xmm5,%xmm2 511 pand %xmm6,%xmm3 512 por %xmm2,%xmm1 513 514 imulq $A[0],$m1 # "tp[0]"*n0 515 ############################################################## 516 # $tp is chosen so that writing to top-most element of the 517 # vector occurs just "above" references to powers table, 518 # "above" modulo cache-line size, which effectively precludes 519 # possibility of memory disambiguation logic failure when 520 # accessing the table. 521 # 522 lea 64+8(%rsp,%r11,8),$tp 523 mov %rdx,$A[1] 524 525 pand %xmm7,%xmm0 526 por %xmm3,%xmm1 527 lea 2*$STRIDE($bp),$bp 528 por %xmm1,%xmm0 529 530 mulq $m1 # np[0]*m1 531 add %rax,$A[0] # discarded 532 mov 8($ap,$num),%rax 533 adc \$0,%rdx 534 mov %rdx,$N[1] 535 536 mulq $m0 537 add %rax,$A[1] 538 mov 16*1($np),%rax # interleaved with 0, therefore 16*n 539 adc \$0,%rdx 540 mov %rdx,$A[0] 541 542 mulq $m1 543 add %rax,$N[1] 544 mov 16($ap,$num),%rax 545 adc \$0,%rdx 546 add $A[1],$N[1] 547 lea 4*8($num),$j # j=4 548 lea 16*4($np),$np 549 adc \$0,%rdx 550 mov $N[1],($tp) 551 mov %rdx,$N[0] 552 jmp .L1st4x 553 554.align 32 555.L1st4x: 556 mulq $m0 # ap[j]*bp[0] 557 add %rax,$A[0] 558 mov -16*2($np),%rax 559 lea 32($tp),$tp 560 adc \$0,%rdx 561 mov %rdx,$A[1] 562 563 mulq $m1 # np[j]*m1 564 add %rax,$N[0] 565 mov -8($ap,$j),%rax 566 adc \$0,%rdx 567 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 568 adc \$0,%rdx 569 mov $N[0],-24($tp) # tp[j-1] 570 mov %rdx,$N[1] 571 572 mulq $m0 # ap[j]*bp[0] 573 add %rax,$A[1] 574 mov -16*1($np),%rax 575 adc \$0,%rdx 576 mov %rdx,$A[0] 577 578 mulq $m1 # np[j]*m1 579 add %rax,$N[1] 580 mov ($ap,$j),%rax 581 adc \$0,%rdx 582 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 583 adc \$0,%rdx 584 mov $N[1],-16($tp) # tp[j-1] 585 mov %rdx,$N[0] 586 587 mulq $m0 # ap[j]*bp[0] 588 add %rax,$A[0] 589 mov 16*0($np),%rax 590 adc \$0,%rdx 591 mov %rdx,$A[1] 592 593 mulq $m1 # np[j]*m1 594 add %rax,$N[0] 595 mov 8($ap,$j),%rax 596 adc \$0,%rdx 597 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 598 adc \$0,%rdx 599 mov $N[0],-8($tp) # tp[j-1] 600 mov %rdx,$N[1] 601 602 mulq $m0 # ap[j]*bp[0] 603 add %rax,$A[1] 604 mov 16*1($np),%rax 605 adc \$0,%rdx 606 mov %rdx,$A[0] 607 608 mulq $m1 # np[j]*m1 609 add %rax,$N[1] 610 mov 16($ap,$j),%rax 611 adc \$0,%rdx 612 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 613 lea 16*4($np),$np 614 adc \$0,%rdx 615 mov $N[1],($tp) # tp[j-1] 616 mov %rdx,$N[0] 617 618 add \$32,$j # j+=4 619 jnz .L1st4x 620 621 mulq $m0 # ap[j]*bp[0] 622 add %rax,$A[0] 623 mov -16*2($np),%rax 624 lea 32($tp),$tp 625 adc \$0,%rdx 626 mov %rdx,$A[1] 627 628 mulq $m1 # np[j]*m1 629 add %rax,$N[0] 630 mov -8($ap),%rax 631 adc \$0,%rdx 632 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 633 adc \$0,%rdx 634 mov $N[0],-24($tp) # tp[j-1] 635 mov %rdx,$N[1] 636 637 mulq $m0 # ap[j]*bp[0] 638 add %rax,$A[1] 639 mov -16*1($np),%rax 640 adc \$0,%rdx 641 mov %rdx,$A[0] 642 643 mulq $m1 # np[j]*m1 644 add %rax,$N[1] 645 mov ($ap,$num),%rax # ap[0] 646 adc \$0,%rdx 647 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 648 adc \$0,%rdx 649 mov $N[1],-16($tp) # tp[j-1] 650 mov %rdx,$N[0] 651 652 movq %xmm0,$m0 # bp[1] 653 lea ($np,$num,2),$np # rewind $np 654 655 xor $N[1],$N[1] 656 add $A[0],$N[0] 657 adc \$0,$N[1] 658 mov $N[0],-8($tp) 659 660 jmp .Louter4x 661 662.align 32 663.Louter4x: 664 mov ($tp,$num),$A[0] 665 mov $n0,$m1 666 mulq $m0 # ap[0]*bp[i] 667 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 668 mov ($np),%rax 669 adc \$0,%rdx 670 671 movq `0*$STRIDE/4-96`($bp),%xmm0 672 movq `1*$STRIDE/4-96`($bp),%xmm1 673 pand %xmm4,%xmm0 674 movq `2*$STRIDE/4-96`($bp),%xmm2 675 pand %xmm5,%xmm1 676 movq `3*$STRIDE/4-96`($bp),%xmm3 677 678 imulq $A[0],$m1 # tp[0]*n0 679 .byte 0x67 680 mov %rdx,$A[1] 681 mov $N[1],($tp) # store upmost overflow bit 682 683 pand %xmm6,%xmm2 684 por %xmm1,%xmm0 685 pand %xmm7,%xmm3 686 por %xmm2,%xmm0 687 lea ($tp,$num),$tp # rewind $tp 688 lea $STRIDE($bp),$bp 689 por %xmm3,%xmm0 690 691 mulq $m1 # np[0]*m1 692 add %rax,$A[0] # "$N[0]", discarded 693 mov 8($ap,$num),%rax 694 adc \$0,%rdx 695 mov %rdx,$N[1] 696 697 mulq $m0 # ap[j]*bp[i] 698 add %rax,$A[1] 699 mov 16*1($np),%rax # interleaved with 0, therefore 16*n 700 adc \$0,%rdx 701 add 8($tp),$A[1] # +tp[1] 702 adc \$0,%rdx 703 mov %rdx,$A[0] 704 705 mulq $m1 # np[j]*m1 706 add %rax,$N[1] 707 mov 16($ap,$num),%rax 708 adc \$0,%rdx 709 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 710 lea 4*8($num),$j # j=4 711 lea 16*4($np),$np 712 adc \$0,%rdx 713 mov %rdx,$N[0] 714 jmp .Linner4x 715 716.align 32 717.Linner4x: 718 mulq $m0 # ap[j]*bp[i] 719 add %rax,$A[0] 720 mov -16*2($np),%rax 721 adc \$0,%rdx 722 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 723 lea 32($tp),$tp 724 adc \$0,%rdx 725 mov %rdx,$A[1] 726 727 mulq $m1 # np[j]*m1 728 add %rax,$N[0] 729 mov -8($ap,$j),%rax 730 adc \$0,%rdx 731 add $A[0],$N[0] 732 adc \$0,%rdx 733 mov $N[1],-32($tp) # tp[j-1] 734 mov %rdx,$N[1] 735 736 mulq $m0 # ap[j]*bp[i] 737 add %rax,$A[1] 738 mov -16*1($np),%rax 739 adc \$0,%rdx 740 add -8($tp),$A[1] 741 adc \$0,%rdx 742 mov %rdx,$A[0] 743 744 mulq $m1 # np[j]*m1 745 add %rax,$N[1] 746 mov ($ap,$j),%rax 747 adc \$0,%rdx 748 add $A[1],$N[1] 749 adc \$0,%rdx 750 mov $N[0],-24($tp) # tp[j-1] 751 mov %rdx,$N[0] 752 753 mulq $m0 # ap[j]*bp[i] 754 add %rax,$A[0] 755 mov 16*0($np),%rax 756 adc \$0,%rdx 757 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 758 adc \$0,%rdx 759 mov %rdx,$A[1] 760 761 mulq $m1 # np[j]*m1 762 add %rax,$N[0] 763 mov 8($ap,$j),%rax 764 adc \$0,%rdx 765 add $A[0],$N[0] 766 adc \$0,%rdx 767 mov $N[1],-16($tp) # tp[j-1] 768 mov %rdx,$N[1] 769 770 mulq $m0 # ap[j]*bp[i] 771 add %rax,$A[1] 772 mov 16*1($np),%rax 773 adc \$0,%rdx 774 add 8($tp),$A[1] 775 adc \$0,%rdx 776 mov %rdx,$A[0] 777 778 mulq $m1 # np[j]*m1 779 add %rax,$N[1] 780 mov 16($ap,$j),%rax 781 adc \$0,%rdx 782 add $A[1],$N[1] 783 lea 16*4($np),$np 784 adc \$0,%rdx 785 mov $N[0],-8($tp) # tp[j-1] 786 mov %rdx,$N[0] 787 788 add \$32,$j # j+=4 789 jnz .Linner4x 790 791 mulq $m0 # ap[j]*bp[i] 792 add %rax,$A[0] 793 mov -16*2($np),%rax 794 adc \$0,%rdx 795 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 796 lea 32($tp),$tp 797 adc \$0,%rdx 798 mov %rdx,$A[1] 799 800 mulq $m1 # np[j]*m1 801 add %rax,$N[0] 802 mov -8($ap),%rax 803 adc \$0,%rdx 804 add $A[0],$N[0] 805 adc \$0,%rdx 806 mov $N[1],-32($tp) # tp[j-1] 807 mov %rdx,$N[1] 808 809 mulq $m0 # ap[j]*bp[i] 810 add %rax,$A[1] 811 mov $m1,%rax 812 mov -16*1($np),$m1 813 adc \$0,%rdx 814 add -8($tp),$A[1] 815 adc \$0,%rdx 816 mov %rdx,$A[0] 817 818 mulq $m1 # np[j]*m1 819 add %rax,$N[1] 820 mov ($ap,$num),%rax # ap[0] 821 adc \$0,%rdx 822 add $A[1],$N[1] 823 adc \$0,%rdx 824 mov $N[0],-24($tp) # tp[j-1] 825 mov %rdx,$N[0] 826 827 movq %xmm0,$m0 # bp[i+1] 828 mov $N[1],-16($tp) # tp[j-1] 829 lea ($np,$num,2),$np # rewind $np 830 831 xor $N[1],$N[1] 832 add $A[0],$N[0] 833 adc \$0,$N[1] 834 add ($tp),$N[0] # pull upmost overflow bit 835 adc \$0,$N[1] # upmost overflow bit 836 mov $N[0],-8($tp) 837 838 cmp 16+8(%rsp),$bp 839 jb .Louter4x 840___ 841if (1) { 842$code.=<<___; 843 sub $N[0],$m1 # compare top-most words 844 adc $j,$j # $j is zero 845 or $j,$N[1] 846 xor \$1,$N[1] 847 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 848 lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub 849 mov %r9,%rcx 850 sar \$3+2,%rcx # cf=0 851 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 852 jmp .Lsqr4x_sub 853___ 854} else { 855my @ri=("%rax",$bp,$m0,$m1); 856my $rp="%rdx"; 857$code.=<<___ 858 xor \$1,$N[1] 859 lea ($tp,$num),$tp # rewind $tp 860 sar \$5,$num # cf=0 861 lea ($np,$N[1],8),$np 862 mov 56+8(%rsp),$rp # restore $rp 863 jmp .Lsub4x 864 865.align 32 866.Lsub4x: 867 .byte 0x66 868 mov 8*0($tp),@ri[0] 869 mov 8*1($tp),@ri[1] 870 .byte 0x66 871 sbb 16*0($np),@ri[0] 872 mov 8*2($tp),@ri[2] 873 sbb 16*1($np),@ri[1] 874 mov 3*8($tp),@ri[3] 875 lea 4*8($tp),$tp 876 sbb 16*2($np),@ri[2] 877 mov @ri[0],8*0($rp) 878 sbb 16*3($np),@ri[3] 879 lea 16*4($np),$np 880 mov @ri[1],8*1($rp) 881 mov @ri[2],8*2($rp) 882 mov @ri[3],8*3($rp) 883 lea 8*4($rp),$rp 884 885 inc $num 886 jnz .Lsub4x 887 888 ret 889___ 890} 891$code.=<<___; 892.size mul4x_internal,.-mul4x_internal 893___ 894}}} 895{{{ 896###################################################################### 897# void bn_power5( 898my $rptr="%rdi"; # BN_ULONG *rptr, 899my $aptr="%rsi"; # const BN_ULONG *aptr, 900my $bptr="%rdx"; # const void *table, 901my $nptr="%rcx"; # const BN_ULONG *nptr, 902my $n0 ="%r8"; # const BN_ULONG *n0); 903my $num ="%r9"; # int num, has to be divisible by 8 904 # int pwr 905 906my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 907my @A0=("%r10","%r11"); 908my @A1=("%r12","%r13"); 909my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 910 911$code.=<<___; 912.globl bn_power5 913.type bn_power5,\@function,6 914.align 32 915bn_power5: 916___ 917$code.=<<___ if ($addx); 918 mov OPENSSL_ia32cap_P+8(%rip),%r11d 919 and \$0x80100,%r11d 920 cmp \$0x80100,%r11d 921 je .Lpowerx5_enter 922___ 923$code.=<<___; 924 mov %rsp,%rax 925 push %rbx 926 push %rbp 927 push %r12 928 push %r13 929 push %r14 930 push %r15 931___ 932$code.=<<___ if ($win64); 933 lea -0x28(%rsp),%rsp 934 movaps %xmm6,(%rsp) 935 movaps %xmm7,0x10(%rsp) 936___ 937$code.=<<___; 938 mov ${num}d,%r10d 939 shl \$3,${num}d # convert $num to bytes 940 shl \$3+2,%r10d # 4*$num 941 neg $num 942 mov ($n0),$n0 # *n0 943 944 ############################################################## 945 # ensure that stack frame doesn't alias with $aptr+4*$num 946 # modulo 4096, which covers ret[num], am[num] and n[2*num] 947 # (see bn_exp.c). this is done to allow memory disambiguation 948 # logic do its magic. 949 # 950 lea -64(%rsp,$num,2),%r11 951 sub $aptr,%r11 952 and \$4095,%r11 953 cmp %r11,%r10 954 jb .Lpwr_sp_alt 955 sub %r11,%rsp # align with $aptr 956 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 957 jmp .Lpwr_sp_done 958 959.align 32 960.Lpwr_sp_alt: 961 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num 962 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 963 sub %r10,%r11 964 mov \$0,%r10 965 cmovc %r10,%r11 966 sub %r11,%rsp 967.Lpwr_sp_done: 968 and \$-64,%rsp 969 mov $num,%r10 970 neg $num 971 972 ############################################################## 973 # Stack layout 974 # 975 # +0 saved $num, used in reduction section 976 # +8 &t[2*$num], used in reduction section 977 # +32 saved *n0 978 # +40 saved %rsp 979 # +48 t[2*$num] 980 # 981 mov $n0, 32(%rsp) 982 mov %rax, 40(%rsp) # save original %rsp 983.Lpower5_body: 984 movq $rptr,%xmm1 # save $rptr 985 movq $nptr,%xmm2 # save $nptr 986 movq %r10, %xmm3 # -$num 987 movq $bptr,%xmm4 988 989 call __bn_sqr8x_internal 990 call __bn_sqr8x_internal 991 call __bn_sqr8x_internal 992 call __bn_sqr8x_internal 993 call __bn_sqr8x_internal 994 995 movq %xmm2,$nptr 996 movq %xmm4,$bptr 997 mov $aptr,$rptr 998 mov 40(%rsp),%rax 999 lea 32(%rsp),$n0 1000 1001 call mul4x_internal 1002 1003 mov 40(%rsp),%rsi # restore %rsp 1004 mov \$1,%rax 1005 mov -48(%rsi),%r15 1006 mov -40(%rsi),%r14 1007 mov -32(%rsi),%r13 1008 mov -24(%rsi),%r12 1009 mov -16(%rsi),%rbp 1010 mov -8(%rsi),%rbx 1011 lea (%rsi),%rsp 1012.Lpower5_epilogue: 1013 ret 1014.size bn_power5,.-bn_power5 1015 1016.globl bn_sqr8x_internal 1017.hidden bn_sqr8x_internal 1018.type bn_sqr8x_internal,\@abi-omnipotent 1019.align 32 1020bn_sqr8x_internal: 1021__bn_sqr8x_internal: 1022 ############################################################## 1023 # Squaring part: 1024 # 1025 # a) multiply-n-add everything but a[i]*a[i]; 1026 # b) shift result of a) by 1 to the left and accumulate 1027 # a[i]*a[i] products; 1028 # 1029 ############################################################## 1030 # a[1]a[0] 1031 # a[2]a[0] 1032 # a[3]a[0] 1033 # a[2]a[1] 1034 # a[4]a[0] 1035 # a[3]a[1] 1036 # a[5]a[0] 1037 # a[4]a[1] 1038 # a[3]a[2] 1039 # a[6]a[0] 1040 # a[5]a[1] 1041 # a[4]a[2] 1042 # a[7]a[0] 1043 # a[6]a[1] 1044 # a[5]a[2] 1045 # a[4]a[3] 1046 # a[7]a[1] 1047 # a[6]a[2] 1048 # a[5]a[3] 1049 # a[7]a[2] 1050 # a[6]a[3] 1051 # a[5]a[4] 1052 # a[7]a[3] 1053 # a[6]a[4] 1054 # a[7]a[4] 1055 # a[6]a[5] 1056 # a[7]a[5] 1057 # a[7]a[6] 1058 # a[1]a[0] 1059 # a[2]a[0] 1060 # a[3]a[0] 1061 # a[4]a[0] 1062 # a[5]a[0] 1063 # a[6]a[0] 1064 # a[7]a[0] 1065 # a[2]a[1] 1066 # a[3]a[1] 1067 # a[4]a[1] 1068 # a[5]a[1] 1069 # a[6]a[1] 1070 # a[7]a[1] 1071 # a[3]a[2] 1072 # a[4]a[2] 1073 # a[5]a[2] 1074 # a[6]a[2] 1075 # a[7]a[2] 1076 # a[4]a[3] 1077 # a[5]a[3] 1078 # a[6]a[3] 1079 # a[7]a[3] 1080 # a[5]a[4] 1081 # a[6]a[4] 1082 # a[7]a[4] 1083 # a[6]a[5] 1084 # a[7]a[5] 1085 # a[7]a[6] 1086 # a[0]a[0] 1087 # a[1]a[1] 1088 # a[2]a[2] 1089 # a[3]a[3] 1090 # a[4]a[4] 1091 # a[5]a[5] 1092 # a[6]a[6] 1093 # a[7]a[7] 1094 1095 lea 32(%r10),$i # $i=-($num-32) 1096 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1097 1098 mov $num,$j # $j=$num 1099 1100 # comments apply to $num==8 case 1101 mov -32($aptr,$i),$a0 # a[0] 1102 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1103 mov -24($aptr,$i),%rax # a[1] 1104 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1105 mov -16($aptr,$i),$ai # a[2] 1106 mov %rax,$a1 1107 1108 mul $a0 # a[1]*a[0] 1109 mov %rax,$A0[0] # a[1]*a[0] 1110 mov $ai,%rax # a[2] 1111 mov %rdx,$A0[1] 1112 mov $A0[0],-24($tptr,$i) # t[1] 1113 1114 mul $a0 # a[2]*a[0] 1115 add %rax,$A0[1] 1116 mov $ai,%rax 1117 adc \$0,%rdx 1118 mov $A0[1],-16($tptr,$i) # t[2] 1119 mov %rdx,$A0[0] 1120 1121 1122 mov -8($aptr,$i),$ai # a[3] 1123 mul $a1 # a[2]*a[1] 1124 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1125 mov $ai,%rax 1126 mov %rdx,$A1[1] 1127 1128 lea ($i),$j 1129 mul $a0 # a[3]*a[0] 1130 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1131 mov $ai,%rax 1132 mov %rdx,$A0[1] 1133 adc \$0,$A0[1] 1134 add $A1[0],$A0[0] 1135 adc \$0,$A0[1] 1136 mov $A0[0],-8($tptr,$j) # t[3] 1137 jmp .Lsqr4x_1st 1138 1139.align 32 1140.Lsqr4x_1st: 1141 mov ($aptr,$j),$ai # a[4] 1142 mul $a1 # a[3]*a[1] 1143 add %rax,$A1[1] # a[3]*a[1]+t[4] 1144 mov $ai,%rax 1145 mov %rdx,$A1[0] 1146 adc \$0,$A1[0] 1147 1148 mul $a0 # a[4]*a[0] 1149 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1150 mov $ai,%rax # a[3] 1151 mov 8($aptr,$j),$ai # a[5] 1152 mov %rdx,$A0[0] 1153 adc \$0,$A0[0] 1154 add $A1[1],$A0[1] 1155 adc \$0,$A0[0] 1156 1157 1158 mul $a1 # a[4]*a[3] 1159 add %rax,$A1[0] # a[4]*a[3]+t[5] 1160 mov $ai,%rax 1161 mov $A0[1],($tptr,$j) # t[4] 1162 mov %rdx,$A1[1] 1163 adc \$0,$A1[1] 1164 1165 mul $a0 # a[5]*a[2] 1166 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1167 mov $ai,%rax 1168 mov 16($aptr,$j),$ai # a[6] 1169 mov %rdx,$A0[1] 1170 adc \$0,$A0[1] 1171 add $A1[0],$A0[0] 1172 adc \$0,$A0[1] 1173 1174 mul $a1 # a[5]*a[3] 1175 add %rax,$A1[1] # a[5]*a[3]+t[6] 1176 mov $ai,%rax 1177 mov $A0[0],8($tptr,$j) # t[5] 1178 mov %rdx,$A1[0] 1179 adc \$0,$A1[0] 1180 1181 mul $a0 # a[6]*a[2] 1182 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1183 mov $ai,%rax # a[3] 1184 mov 24($aptr,$j),$ai # a[7] 1185 mov %rdx,$A0[0] 1186 adc \$0,$A0[0] 1187 add $A1[1],$A0[1] 1188 adc \$0,$A0[0] 1189 1190 1191 mul $a1 # a[6]*a[5] 1192 add %rax,$A1[0] # a[6]*a[5]+t[7] 1193 mov $ai,%rax 1194 mov $A0[1],16($tptr,$j) # t[6] 1195 mov %rdx,$A1[1] 1196 adc \$0,$A1[1] 1197 lea 32($j),$j 1198 1199 mul $a0 # a[7]*a[4] 1200 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1201 mov $ai,%rax 1202 mov %rdx,$A0[1] 1203 adc \$0,$A0[1] 1204 add $A1[0],$A0[0] 1205 adc \$0,$A0[1] 1206 mov $A0[0],-8($tptr,$j) # t[7] 1207 1208 cmp \$0,$j 1209 jne .Lsqr4x_1st 1210 1211 mul $a1 # a[7]*a[5] 1212 add %rax,$A1[1] 1213 lea 16($i),$i 1214 adc \$0,%rdx 1215 add $A0[1],$A1[1] 1216 adc \$0,%rdx 1217 1218 mov $A1[1],($tptr) # t[8] 1219 mov %rdx,$A1[0] 1220 mov %rdx,8($tptr) # t[9] 1221 jmp .Lsqr4x_outer 1222 1223.align 32 1224.Lsqr4x_outer: # comments apply to $num==6 case 1225 mov -32($aptr,$i),$a0 # a[0] 1226 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1227 mov -24($aptr,$i),%rax # a[1] 1228 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1229 mov -16($aptr,$i),$ai # a[2] 1230 mov %rax,$a1 1231 1232 mul $a0 # a[1]*a[0] 1233 mov -24($tptr,$i),$A0[0] # t[1] 1234 add %rax,$A0[0] # a[1]*a[0]+t[1] 1235 mov $ai,%rax # a[2] 1236 adc \$0,%rdx 1237 mov $A0[0],-24($tptr,$i) # t[1] 1238 mov %rdx,$A0[1] 1239 1240 mul $a0 # a[2]*a[0] 1241 add %rax,$A0[1] 1242 mov $ai,%rax 1243 adc \$0,%rdx 1244 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1245 mov %rdx,$A0[0] 1246 adc \$0,$A0[0] 1247 mov $A0[1],-16($tptr,$i) # t[2] 1248 1249 xor $A1[0],$A1[0] 1250 1251 mov -8($aptr,$i),$ai # a[3] 1252 mul $a1 # a[2]*a[1] 1253 add %rax,$A1[0] # a[2]*a[1]+t[3] 1254 mov $ai,%rax 1255 adc \$0,%rdx 1256 add -8($tptr,$i),$A1[0] 1257 mov %rdx,$A1[1] 1258 adc \$0,$A1[1] 1259 1260 mul $a0 # a[3]*a[0] 1261 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1262 mov $ai,%rax 1263 adc \$0,%rdx 1264 add $A1[0],$A0[0] 1265 mov %rdx,$A0[1] 1266 adc \$0,$A0[1] 1267 mov $A0[0],-8($tptr,$i) # t[3] 1268 1269 lea ($i),$j 1270 jmp .Lsqr4x_inner 1271 1272.align 32 1273.Lsqr4x_inner: 1274 mov ($aptr,$j),$ai # a[4] 1275 mul $a1 # a[3]*a[1] 1276 add %rax,$A1[1] # a[3]*a[1]+t[4] 1277 mov $ai,%rax 1278 mov %rdx,$A1[0] 1279 adc \$0,$A1[0] 1280 add ($tptr,$j),$A1[1] 1281 adc \$0,$A1[0] 1282 1283 .byte 0x67 1284 mul $a0 # a[4]*a[0] 1285 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1286 mov $ai,%rax # a[3] 1287 mov 8($aptr,$j),$ai # a[5] 1288 mov %rdx,$A0[0] 1289 adc \$0,$A0[0] 1290 add $A1[1],$A0[1] 1291 adc \$0,$A0[0] 1292 1293 mul $a1 # a[4]*a[3] 1294 add %rax,$A1[0] # a[4]*a[3]+t[5] 1295 mov $A0[1],($tptr,$j) # t[4] 1296 mov $ai,%rax 1297 mov %rdx,$A1[1] 1298 adc \$0,$A1[1] 1299 add 8($tptr,$j),$A1[0] 1300 lea 16($j),$j # j++ 1301 adc \$0,$A1[1] 1302 1303 mul $a0 # a[5]*a[2] 1304 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1305 mov $ai,%rax 1306 adc \$0,%rdx 1307 add $A1[0],$A0[0] 1308 mov %rdx,$A0[1] 1309 adc \$0,$A0[1] 1310 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1311 1312 cmp \$0,$j 1313 jne .Lsqr4x_inner 1314 1315 .byte 0x67 1316 mul $a1 # a[5]*a[3] 1317 add %rax,$A1[1] 1318 adc \$0,%rdx 1319 add $A0[1],$A1[1] 1320 adc \$0,%rdx 1321 1322 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1323 mov %rdx,$A1[0] 1324 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1325 1326 add \$16,$i 1327 jnz .Lsqr4x_outer 1328 1329 # comments apply to $num==4 case 1330 mov -32($aptr),$a0 # a[0] 1331 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1332 mov -24($aptr),%rax # a[1] 1333 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1334 mov -16($aptr),$ai # a[2] 1335 mov %rax,$a1 1336 1337 mul $a0 # a[1]*a[0] 1338 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1339 mov $ai,%rax # a[2] 1340 mov %rdx,$A0[1] 1341 adc \$0,$A0[1] 1342 1343 mul $a0 # a[2]*a[0] 1344 add %rax,$A0[1] 1345 mov $ai,%rax 1346 mov $A0[0],-24($tptr) # t[1] 1347 mov %rdx,$A0[0] 1348 adc \$0,$A0[0] 1349 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1350 mov -8($aptr),$ai # a[3] 1351 adc \$0,$A0[0] 1352 1353 mul $a1 # a[2]*a[1] 1354 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1355 mov $ai,%rax 1356 mov $A0[1],-16($tptr) # t[2] 1357 mov %rdx,$A1[1] 1358 adc \$0,$A1[1] 1359 1360 mul $a0 # a[3]*a[0] 1361 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1362 mov $ai,%rax 1363 mov %rdx,$A0[1] 1364 adc \$0,$A0[1] 1365 add $A1[0],$A0[0] 1366 adc \$0,$A0[1] 1367 mov $A0[0],-8($tptr) # t[3] 1368 1369 mul $a1 # a[3]*a[1] 1370 add %rax,$A1[1] 1371 mov -16($aptr),%rax # a[2] 1372 adc \$0,%rdx 1373 add $A0[1],$A1[1] 1374 adc \$0,%rdx 1375 1376 mov $A1[1],($tptr) # t[4] 1377 mov %rdx,$A1[0] 1378 mov %rdx,8($tptr) # t[5] 1379 1380 mul $ai # a[2]*a[3] 1381___ 1382{ 1383my ($shift,$carry)=($a0,$a1); 1384my @S=(@A1,$ai,$n0); 1385$code.=<<___; 1386 add \$16,$i 1387 xor $shift,$shift 1388 sub $num,$i # $i=16-$num 1389 xor $carry,$carry 1390 1391 add $A1[0],%rax # t[5] 1392 adc \$0,%rdx 1393 mov %rax,8($tptr) # t[5] 1394 mov %rdx,16($tptr) # t[6] 1395 mov $carry,24($tptr) # t[7] 1396 1397 mov -16($aptr,$i),%rax # a[0] 1398 lea 48+8(%rsp),$tptr 1399 xor $A0[0],$A0[0] # t[0] 1400 mov 8($tptr),$A0[1] # t[1] 1401 1402 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1403 shr \$63,$A0[0] 1404 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1405 shr \$63,$A0[1] 1406 or $A0[0],$S[1] # | t[2*i]>>63 1407 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1408 mov $A0[1],$shift # shift=t[2*i+1]>>63 1409 mul %rax # a[i]*a[i] 1410 neg $carry # mov $carry,cf 1411 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1412 adc %rax,$S[0] 1413 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1414 mov $S[0],($tptr) 1415 adc %rdx,$S[1] 1416 1417 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1418 mov $S[1],8($tptr) 1419 sbb $carry,$carry # mov cf,$carry 1420 shr \$63,$A0[0] 1421 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1422 shr \$63,$A0[1] 1423 or $A0[0],$S[3] # | t[2*i]>>63 1424 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1425 mov $A0[1],$shift # shift=t[2*i+1]>>63 1426 mul %rax # a[i]*a[i] 1427 neg $carry # mov $carry,cf 1428 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1429 adc %rax,$S[2] 1430 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1431 mov $S[2],16($tptr) 1432 adc %rdx,$S[3] 1433 lea 16($i),$i 1434 mov $S[3],24($tptr) 1435 sbb $carry,$carry # mov cf,$carry 1436 lea 64($tptr),$tptr 1437 jmp .Lsqr4x_shift_n_add 1438 1439.align 32 1440.Lsqr4x_shift_n_add: 1441 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1442 shr \$63,$A0[0] 1443 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1444 shr \$63,$A0[1] 1445 or $A0[0],$S[1] # | t[2*i]>>63 1446 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1447 mov $A0[1],$shift # shift=t[2*i+1]>>63 1448 mul %rax # a[i]*a[i] 1449 neg $carry # mov $carry,cf 1450 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1451 adc %rax,$S[0] 1452 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1453 mov $S[0],-32($tptr) 1454 adc %rdx,$S[1] 1455 1456 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1457 mov $S[1],-24($tptr) 1458 sbb $carry,$carry # mov cf,$carry 1459 shr \$63,$A0[0] 1460 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1461 shr \$63,$A0[1] 1462 or $A0[0],$S[3] # | t[2*i]>>63 1463 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1464 mov $A0[1],$shift # shift=t[2*i+1]>>63 1465 mul %rax # a[i]*a[i] 1466 neg $carry # mov $carry,cf 1467 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1468 adc %rax,$S[2] 1469 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1470 mov $S[2],-16($tptr) 1471 adc %rdx,$S[3] 1472 1473 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1474 mov $S[3],-8($tptr) 1475 sbb $carry,$carry # mov cf,$carry 1476 shr \$63,$A0[0] 1477 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1478 shr \$63,$A0[1] 1479 or $A0[0],$S[1] # | t[2*i]>>63 1480 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1481 mov $A0[1],$shift # shift=t[2*i+1]>>63 1482 mul %rax # a[i]*a[i] 1483 neg $carry # mov $carry,cf 1484 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1485 adc %rax,$S[0] 1486 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1487 mov $S[0],0($tptr) 1488 adc %rdx,$S[1] 1489 1490 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1491 mov $S[1],8($tptr) 1492 sbb $carry,$carry # mov cf,$carry 1493 shr \$63,$A0[0] 1494 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1495 shr \$63,$A0[1] 1496 or $A0[0],$S[3] # | t[2*i]>>63 1497 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1498 mov $A0[1],$shift # shift=t[2*i+1]>>63 1499 mul %rax # a[i]*a[i] 1500 neg $carry # mov $carry,cf 1501 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1502 adc %rax,$S[2] 1503 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1504 mov $S[2],16($tptr) 1505 adc %rdx,$S[3] 1506 mov $S[3],24($tptr) 1507 sbb $carry,$carry # mov cf,$carry 1508 lea 64($tptr),$tptr 1509 add \$32,$i 1510 jnz .Lsqr4x_shift_n_add 1511 1512 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1513 .byte 0x67 1514 shr \$63,$A0[0] 1515 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1516 shr \$63,$A0[1] 1517 or $A0[0],$S[1] # | t[2*i]>>63 1518 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1519 mov $A0[1],$shift # shift=t[2*i+1]>>63 1520 mul %rax # a[i]*a[i] 1521 neg $carry # mov $carry,cf 1522 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1523 adc %rax,$S[0] 1524 mov -8($aptr),%rax # a[i+1] # prefetch 1525 mov $S[0],-32($tptr) 1526 adc %rdx,$S[1] 1527 1528 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1529 mov $S[1],-24($tptr) 1530 sbb $carry,$carry # mov cf,$carry 1531 shr \$63,$A0[0] 1532 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1533 shr \$63,$A0[1] 1534 or $A0[0],$S[3] # | t[2*i]>>63 1535 mul %rax # a[i]*a[i] 1536 neg $carry # mov $carry,cf 1537 adc %rax,$S[2] 1538 adc %rdx,$S[3] 1539 mov $S[2],-16($tptr) 1540 mov $S[3],-8($tptr) 1541___ 1542} 1543###################################################################### 1544# Montgomery reduction part, "word-by-word" algorithm. 1545# 1546# This new path is inspired by multiple submissions from Intel, by 1547# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1548# Vinodh Gopal... 1549{ 1550my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1551 1552$code.=<<___; 1553 movq %xmm2,$nptr 1554sqr8x_reduction: 1555 xor %rax,%rax 1556 lea ($nptr,$num,2),%rcx # end of n[] 1557 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1558 mov %rcx,0+8(%rsp) 1559 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1560 mov %rdx,8+8(%rsp) 1561 neg $num 1562 jmp .L8x_reduction_loop 1563 1564.align 32 1565.L8x_reduction_loop: 1566 lea ($tptr,$num),$tptr # start of current t[] window 1567 .byte 0x66 1568 mov 8*0($tptr),$m0 1569 mov 8*1($tptr),%r9 1570 mov 8*2($tptr),%r10 1571 mov 8*3($tptr),%r11 1572 mov 8*4($tptr),%r12 1573 mov 8*5($tptr),%r13 1574 mov 8*6($tptr),%r14 1575 mov 8*7($tptr),%r15 1576 mov %rax,(%rdx) # store top-most carry bit 1577 lea 8*8($tptr),$tptr 1578 1579 .byte 0x67 1580 mov $m0,%r8 1581 imulq 32+8(%rsp),$m0 # n0*a[0] 1582 mov 16*0($nptr),%rax # n[0] 1583 mov \$8,%ecx 1584 jmp .L8x_reduce 1585 1586.align 32 1587.L8x_reduce: 1588 mulq $m0 1589 mov 16*1($nptr),%rax # n[1] 1590 neg %r8 1591 mov %rdx,%r8 1592 adc \$0,%r8 1593 1594 mulq $m0 1595 add %rax,%r9 1596 mov 16*2($nptr),%rax 1597 adc \$0,%rdx 1598 add %r9,%r8 1599 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1600 mov %rdx,%r9 1601 adc \$0,%r9 1602 1603 mulq $m0 1604 add %rax,%r10 1605 mov 16*3($nptr),%rax 1606 adc \$0,%rdx 1607 add %r10,%r9 1608 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1609 mov %rdx,%r10 1610 adc \$0,%r10 1611 1612 mulq $m0 1613 add %rax,%r11 1614 mov 16*4($nptr),%rax 1615 adc \$0,%rdx 1616 imulq %r8,$carry # modulo-scheduled 1617 add %r11,%r10 1618 mov %rdx,%r11 1619 adc \$0,%r11 1620 1621 mulq $m0 1622 add %rax,%r12 1623 mov 16*5($nptr),%rax 1624 adc \$0,%rdx 1625 add %r12,%r11 1626 mov %rdx,%r12 1627 adc \$0,%r12 1628 1629 mulq $m0 1630 add %rax,%r13 1631 mov 16*6($nptr),%rax 1632 adc \$0,%rdx 1633 add %r13,%r12 1634 mov %rdx,%r13 1635 adc \$0,%r13 1636 1637 mulq $m0 1638 add %rax,%r14 1639 mov 16*7($nptr),%rax 1640 adc \$0,%rdx 1641 add %r14,%r13 1642 mov %rdx,%r14 1643 adc \$0,%r14 1644 1645 mulq $m0 1646 mov $carry,$m0 # n0*a[i] 1647 add %rax,%r15 1648 mov 16*0($nptr),%rax # n[0] 1649 adc \$0,%rdx 1650 add %r15,%r14 1651 mov %rdx,%r15 1652 adc \$0,%r15 1653 1654 dec %ecx 1655 jnz .L8x_reduce 1656 1657 lea 16*8($nptr),$nptr 1658 xor %rax,%rax 1659 mov 8+8(%rsp),%rdx # pull end of t[] 1660 cmp 0+8(%rsp),$nptr # end of n[]? 1661 jae .L8x_no_tail 1662 1663 .byte 0x66 1664 add 8*0($tptr),%r8 1665 adc 8*1($tptr),%r9 1666 adc 8*2($tptr),%r10 1667 adc 8*3($tptr),%r11 1668 adc 8*4($tptr),%r12 1669 adc 8*5($tptr),%r13 1670 adc 8*6($tptr),%r14 1671 adc 8*7($tptr),%r15 1672 sbb $carry,$carry # top carry 1673 1674 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1675 mov \$8,%ecx 1676 mov 16*0($nptr),%rax 1677 jmp .L8x_tail 1678 1679.align 32 1680.L8x_tail: 1681 mulq $m0 1682 add %rax,%r8 1683 mov 16*1($nptr),%rax 1684 mov %r8,($tptr) # save result 1685 mov %rdx,%r8 1686 adc \$0,%r8 1687 1688 mulq $m0 1689 add %rax,%r9 1690 mov 16*2($nptr),%rax 1691 adc \$0,%rdx 1692 add %r9,%r8 1693 lea 8($tptr),$tptr # $tptr++ 1694 mov %rdx,%r9 1695 adc \$0,%r9 1696 1697 mulq $m0 1698 add %rax,%r10 1699 mov 16*3($nptr),%rax 1700 adc \$0,%rdx 1701 add %r10,%r9 1702 mov %rdx,%r10 1703 adc \$0,%r10 1704 1705 mulq $m0 1706 add %rax,%r11 1707 mov 16*4($nptr),%rax 1708 adc \$0,%rdx 1709 add %r11,%r10 1710 mov %rdx,%r11 1711 adc \$0,%r11 1712 1713 mulq $m0 1714 add %rax,%r12 1715 mov 16*5($nptr),%rax 1716 adc \$0,%rdx 1717 add %r12,%r11 1718 mov %rdx,%r12 1719 adc \$0,%r12 1720 1721 mulq $m0 1722 add %rax,%r13 1723 mov 16*6($nptr),%rax 1724 adc \$0,%rdx 1725 add %r13,%r12 1726 mov %rdx,%r13 1727 adc \$0,%r13 1728 1729 mulq $m0 1730 add %rax,%r14 1731 mov 16*7($nptr),%rax 1732 adc \$0,%rdx 1733 add %r14,%r13 1734 mov %rdx,%r14 1735 adc \$0,%r14 1736 1737 mulq $m0 1738 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1739 add %rax,%r15 1740 adc \$0,%rdx 1741 add %r15,%r14 1742 mov 16*0($nptr),%rax # pull n[0] 1743 mov %rdx,%r15 1744 adc \$0,%r15 1745 1746 dec %ecx 1747 jnz .L8x_tail 1748 1749 lea 16*8($nptr),$nptr 1750 mov 8+8(%rsp),%rdx # pull end of t[] 1751 cmp 0+8(%rsp),$nptr # end of n[]? 1752 jae .L8x_tail_done # break out of loop 1753 1754 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1755 neg $carry 1756 mov 8*0($nptr),%rax # pull n[0] 1757 adc 8*0($tptr),%r8 1758 adc 8*1($tptr),%r9 1759 adc 8*2($tptr),%r10 1760 adc 8*3($tptr),%r11 1761 adc 8*4($tptr),%r12 1762 adc 8*5($tptr),%r13 1763 adc 8*6($tptr),%r14 1764 adc 8*7($tptr),%r15 1765 sbb $carry,$carry # top carry 1766 1767 mov \$8,%ecx 1768 jmp .L8x_tail 1769 1770.align 32 1771.L8x_tail_done: 1772 add (%rdx),%r8 # can this overflow? 1773 adc \$0,%r9 1774 adc \$0,%r10 1775 adc \$0,%r11 1776 adc \$0,%r12 1777 adc \$0,%r13 1778 adc \$0,%r14 1779 adc \$0,%r15 # can't overflow, because we 1780 # started with "overhung" part 1781 # of multiplication 1782 xor %rax,%rax 1783 1784 neg $carry 1785.L8x_no_tail: 1786 adc 8*0($tptr),%r8 1787 adc 8*1($tptr),%r9 1788 adc 8*2($tptr),%r10 1789 adc 8*3($tptr),%r11 1790 adc 8*4($tptr),%r12 1791 adc 8*5($tptr),%r13 1792 adc 8*6($tptr),%r14 1793 adc 8*7($tptr),%r15 1794 adc \$0,%rax # top-most carry 1795 mov -16($nptr),%rcx # np[num-1] 1796 xor $carry,$carry 1797 1798 movq %xmm2,$nptr # restore $nptr 1799 1800 mov %r8,8*0($tptr) # store top 512 bits 1801 mov %r9,8*1($tptr) 1802 movq %xmm3,$num # $num is %r9, can't be moved upwards 1803 mov %r10,8*2($tptr) 1804 mov %r11,8*3($tptr) 1805 mov %r12,8*4($tptr) 1806 mov %r13,8*5($tptr) 1807 mov %r14,8*6($tptr) 1808 mov %r15,8*7($tptr) 1809 lea 8*8($tptr),$tptr 1810 1811 cmp %rdx,$tptr # end of t[]? 1812 jb .L8x_reduction_loop 1813___ 1814} 1815############################################################## 1816# Post-condition, 4x unrolled 1817# 1818{ 1819my ($tptr,$nptr)=("%rbx","%rbp"); 1820$code.=<<___; 1821 #xor %rsi,%rsi # %rsi was $carry above 1822 sub %r15,%rcx # compare top-most words 1823 lea (%rdi,$num),$tptr # %rdi was $tptr above 1824 adc %rsi,%rsi 1825 mov $num,%rcx 1826 or %rsi,%rax 1827 movq %xmm1,$rptr # restore $rptr 1828 xor \$1,%rax 1829 movq %xmm1,$aptr # prepare for back-to-back call 1830 lea ($nptr,%rax,8),$nptr 1831 sar \$3+2,%rcx # cf=0 1832 jmp .Lsqr4x_sub 1833 1834.align 32 1835.Lsqr4x_sub: 1836 .byte 0x66 1837 mov 8*0($tptr),%r12 1838 mov 8*1($tptr),%r13 1839 sbb 16*0($nptr),%r12 1840 mov 8*2($tptr),%r14 1841 sbb 16*1($nptr),%r13 1842 mov 8*3($tptr),%r15 1843 lea 8*4($tptr),$tptr 1844 sbb 16*2($nptr),%r14 1845 mov %r12,8*0($rptr) 1846 sbb 16*3($nptr),%r15 1847 lea 16*4($nptr),$nptr 1848 mov %r13,8*1($rptr) 1849 mov %r14,8*2($rptr) 1850 mov %r15,8*3($rptr) 1851 lea 8*4($rptr),$rptr 1852 1853 inc %rcx # pass %cf 1854 jnz .Lsqr4x_sub 1855___ 1856} 1857$code.=<<___; 1858 mov $num,%r10 # prepare for back-to-back call 1859 neg $num # restore $num 1860 ret 1861.size bn_sqr8x_internal,.-bn_sqr8x_internal 1862___ 1863{ 1864$code.=<<___; 1865.globl bn_from_montgomery 1866.type bn_from_montgomery,\@abi-omnipotent 1867.align 32 1868bn_from_montgomery: 1869 testl \$7,`($win64?"48(%rsp)":"%r9d")` 1870 jz bn_from_mont8x 1871 xor %eax,%eax 1872 ret 1873.size bn_from_montgomery,.-bn_from_montgomery 1874 1875.type bn_from_mont8x,\@function,6 1876.align 32 1877bn_from_mont8x: 1878 .byte 0x67 1879 mov %rsp,%rax 1880 push %rbx 1881 push %rbp 1882 push %r12 1883 push %r13 1884 push %r14 1885 push %r15 1886___ 1887$code.=<<___ if ($win64); 1888 lea -0x28(%rsp),%rsp 1889 movaps %xmm6,(%rsp) 1890 movaps %xmm7,0x10(%rsp) 1891___ 1892$code.=<<___; 1893 .byte 0x67 1894 mov ${num}d,%r10d 1895 shl \$3,${num}d # convert $num to bytes 1896 shl \$3+2,%r10d # 4*$num 1897 neg $num 1898 mov ($n0),$n0 # *n0 1899 1900 ############################################################## 1901 # ensure that stack frame doesn't alias with $aptr+4*$num 1902 # modulo 4096, which covers ret[num], am[num] and n[2*num] 1903 # (see bn_exp.c). this is done to allow memory disambiguation 1904 # logic do its magic. 1905 # 1906 lea -64(%rsp,$num,2),%r11 1907 sub $aptr,%r11 1908 and \$4095,%r11 1909 cmp %r11,%r10 1910 jb .Lfrom_sp_alt 1911 sub %r11,%rsp # align with $aptr 1912 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 1913 jmp .Lfrom_sp_done 1914 1915.align 32 1916.Lfrom_sp_alt: 1917 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num 1918 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 1919 sub %r10,%r11 1920 mov \$0,%r10 1921 cmovc %r10,%r11 1922 sub %r11,%rsp 1923.Lfrom_sp_done: 1924 and \$-64,%rsp 1925 mov $num,%r10 1926 neg $num 1927 1928 ############################################################## 1929 # Stack layout 1930 # 1931 # +0 saved $num, used in reduction section 1932 # +8 &t[2*$num], used in reduction section 1933 # +32 saved *n0 1934 # +40 saved %rsp 1935 # +48 t[2*$num] 1936 # 1937 mov $n0, 32(%rsp) 1938 mov %rax, 40(%rsp) # save original %rsp 1939.Lfrom_body: 1940 mov $num,%r11 1941 lea 48(%rsp),%rax 1942 pxor %xmm0,%xmm0 1943 jmp .Lmul_by_1 1944 1945.align 32 1946.Lmul_by_1: 1947 movdqu ($aptr),%xmm1 1948 movdqu 16($aptr),%xmm2 1949 movdqu 32($aptr),%xmm3 1950 movdqa %xmm0,(%rax,$num) 1951 movdqu 48($aptr),%xmm4 1952 movdqa %xmm0,16(%rax,$num) 1953 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr 1954 movdqa %xmm1,(%rax) 1955 movdqa %xmm0,32(%rax,$num) 1956 movdqa %xmm2,16(%rax) 1957 movdqa %xmm0,48(%rax,$num) 1958 movdqa %xmm3,32(%rax) 1959 movdqa %xmm4,48(%rax) 1960 lea 64(%rax),%rax 1961 sub \$64,%r11 1962 jnz .Lmul_by_1 1963 1964 movq $rptr,%xmm1 1965 movq $nptr,%xmm2 1966 .byte 0x67 1967 mov $nptr,%rbp 1968 movq %r10, %xmm3 # -num 1969___ 1970$code.=<<___ if ($addx); 1971 mov OPENSSL_ia32cap_P+8(%rip),%r11d 1972 and \$0x80100,%r11d 1973 cmp \$0x80100,%r11d 1974 jne .Lfrom_mont_nox 1975 1976 lea (%rax,$num),$rptr 1977 call sqrx8x_reduction 1978 1979 pxor %xmm0,%xmm0 1980 lea 48(%rsp),%rax 1981 mov 40(%rsp),%rsi # restore %rsp 1982 jmp .Lfrom_mont_zero 1983 1984.align 32 1985.Lfrom_mont_nox: 1986___ 1987$code.=<<___; 1988 call sqr8x_reduction 1989 1990 pxor %xmm0,%xmm0 1991 lea 48(%rsp),%rax 1992 mov 40(%rsp),%rsi # restore %rsp 1993 jmp .Lfrom_mont_zero 1994 1995.align 32 1996.Lfrom_mont_zero: 1997 movdqa %xmm0,16*0(%rax) 1998 movdqa %xmm0,16*1(%rax) 1999 movdqa %xmm0,16*2(%rax) 2000 movdqa %xmm0,16*3(%rax) 2001 lea 16*4(%rax),%rax 2002 sub \$32,$num 2003 jnz .Lfrom_mont_zero 2004 2005 mov \$1,%rax 2006 mov -48(%rsi),%r15 2007 mov -40(%rsi),%r14 2008 mov -32(%rsi),%r13 2009 mov -24(%rsi),%r12 2010 mov -16(%rsi),%rbp 2011 mov -8(%rsi),%rbx 2012 lea (%rsi),%rsp 2013.Lfrom_epilogue: 2014 ret 2015.size bn_from_mont8x,.-bn_from_mont8x 2016___ 2017} 2018}}} 2019 2020if ($addx) {{{ 2021my $bp="%rdx"; # restore original value 2022 2023$code.=<<___; 2024.type bn_mulx4x_mont_gather5,\@function,6 2025.align 32 2026bn_mulx4x_mont_gather5: 2027.Lmulx4x_enter: 2028 .byte 0x67 2029 mov %rsp,%rax 2030 push %rbx 2031 push %rbp 2032 push %r12 2033 push %r13 2034 push %r14 2035 push %r15 2036___ 2037$code.=<<___ if ($win64); 2038 lea -0x28(%rsp),%rsp 2039 movaps %xmm6,(%rsp) 2040 movaps %xmm7,0x10(%rsp) 2041___ 2042$code.=<<___; 2043 .byte 0x67 2044 mov ${num}d,%r10d 2045 shl \$3,${num}d # convert $num to bytes 2046 shl \$3+2,%r10d # 4*$num 2047 neg $num # -$num 2048 mov ($n0),$n0 # *n0 2049 2050 ############################################################## 2051 # ensure that stack frame doesn't alias with $aptr+4*$num 2052 # modulo 4096, which covers a[num], ret[num] and n[2*num] 2053 # (see bn_exp.c). this is done to allow memory disambiguation 2054 # logic do its magic. [excessive frame is allocated in order 2055 # to allow bn_from_mont8x to clear it.] 2056 # 2057 lea -64(%rsp,$num,2),%r11 2058 sub $ap,%r11 2059 and \$4095,%r11 2060 cmp %r11,%r10 2061 jb .Lmulx4xsp_alt 2062 sub %r11,%rsp # align with $aptr 2063 lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) 2064 jmp .Lmulx4xsp_done 2065 2066.align 32 2067.Lmulx4xsp_alt: 2068 lea 4096-64(,$num,2),%r10 # 4096-frame-$num 2069 lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) 2070 sub %r10,%r11 2071 mov \$0,%r10 2072 cmovc %r10,%r11 2073 sub %r11,%rsp 2074.Lmulx4xsp_done: 2075 and \$-64,%rsp # ensure alignment 2076 ############################################################## 2077 # Stack layout 2078 # +0 -num 2079 # +8 off-loaded &b[i] 2080 # +16 end of b[num] 2081 # +24 inner counter 2082 # +32 saved n0 2083 # +40 saved %rsp 2084 # +48 2085 # +56 saved rp 2086 # +64 tmp[num+1] 2087 # 2088 mov $n0, 32(%rsp) # save *n0 2089 mov %rax,40(%rsp) # save original %rsp 2090.Lmulx4x_body: 2091 call mulx4x_internal 2092 2093 mov 40(%rsp),%rsi # restore %rsp 2094 mov \$1,%rax 2095___ 2096$code.=<<___ if ($win64); 2097 movaps -88(%rsi),%xmm6 2098 movaps -72(%rsi),%xmm7 2099___ 2100$code.=<<___; 2101 mov -48(%rsi),%r15 2102 mov -40(%rsi),%r14 2103 mov -32(%rsi),%r13 2104 mov -24(%rsi),%r12 2105 mov -16(%rsi),%rbp 2106 mov -8(%rsi),%rbx 2107 lea (%rsi),%rsp 2108.Lmulx4x_epilogue: 2109 ret 2110.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2111 2112.type mulx4x_internal,\@abi-omnipotent 2113.align 32 2114mulx4x_internal: 2115 .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num 2116 .byte 0x67 2117 neg $num # restore $num 2118 shl \$5,$num 2119 lea 256($bp,$num),%r13 2120 shr \$5+5,$num 2121 mov `($win64?56:8)`(%rax),%r10d # load 7th argument 2122 sub \$1,$num 2123 mov %r13,16+8(%rsp) # end of b[num] 2124 mov $num,24+8(%rsp) # inner counter 2125 mov $rp, 56+8(%rsp) # save $rp 2126___ 2127my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2128 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2129my $rptr=$bptr; 2130my $STRIDE=2**5*8; # 5 is "window size" 2131my $N=$STRIDE/4; # should match cache line size 2132$code.=<<___; 2133 mov %r10,%r11 2134 shr \$`log($N/8)/log(2)`,%r10 2135 and \$`$N/8-1`,%r11 2136 not %r10 2137 lea .Lmagic_masks(%rip),%rax 2138 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 2139 lea 96($bp,%r11,8),$bptr # pointer within 1st cache line 2140 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 2141 movq 8(%rax,%r10,8),%xmm5 # cache line contains element 2142 add \$7,%r11 2143 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 2144 movq 24(%rax,%r10,8),%xmm7 2145 and \$7,%r11 2146 2147 movq `0*$STRIDE/4-96`($bptr),%xmm0 2148 lea $STRIDE($bptr),$tptr # borrow $tptr 2149 movq `1*$STRIDE/4-96`($bptr),%xmm1 2150 pand %xmm4,%xmm0 2151 movq `2*$STRIDE/4-96`($bptr),%xmm2 2152 pand %xmm5,%xmm1 2153 movq `3*$STRIDE/4-96`($bptr),%xmm3 2154 pand %xmm6,%xmm2 2155 por %xmm1,%xmm0 2156 movq `0*$STRIDE/4-96`($tptr),%xmm1 2157 pand %xmm7,%xmm3 2158 por %xmm2,%xmm0 2159 movq `1*$STRIDE/4-96`($tptr),%xmm2 2160 por %xmm3,%xmm0 2161 .byte 0x67,0x67 2162 pand %xmm4,%xmm1 2163 movq `2*$STRIDE/4-96`($tptr),%xmm3 2164 2165 movq %xmm0,%rdx # bp[0] 2166 movq `3*$STRIDE/4-96`($tptr),%xmm0 2167 lea 2*$STRIDE($bptr),$bptr # next &b[i] 2168 pand %xmm5,%xmm2 2169 .byte 0x67,0x67 2170 pand %xmm6,%xmm3 2171 ############################################################## 2172 # $tptr is chosen so that writing to top-most element of the 2173 # vector occurs just "above" references to powers table, 2174 # "above" modulo cache-line size, which effectively precludes 2175 # possibility of memory disambiguation logic failure when 2176 # accessing the table. 2177 # 2178 lea 64+8*4+8(%rsp,%r11,8),$tptr 2179 2180 mov %rdx,$bi 2181 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2182 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2183 add %rax,%r11 2184 mulx 2*8($aptr),%rax,%r13 # ... 2185 adc %rax,%r12 2186 adc \$0,%r13 2187 mulx 3*8($aptr),%rax,%r14 2188 2189 mov $mi,%r15 2190 imulq 32+8(%rsp),$mi # "t[0]"*n0 2191 xor $zero,$zero # cf=0, of=0 2192 mov $mi,%rdx 2193 2194 por %xmm2,%xmm1 2195 pand %xmm7,%xmm0 2196 por %xmm3,%xmm1 2197 mov $bptr,8+8(%rsp) # off-load &b[i] 2198 por %xmm1,%xmm0 2199 2200 .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr 2201 adcx %rax,%r13 2202 adcx $zero,%r14 # cf=0 2203 2204 mulx 0*16($nptr),%rax,%r10 2205 adcx %rax,%r15 # discarded 2206 adox %r11,%r10 2207 mulx 1*16($nptr),%rax,%r11 2208 adcx %rax,%r10 2209 adox %r12,%r11 2210 mulx 2*16($nptr),%rax,%r12 2211 mov 24+8(%rsp),$bptr # counter value 2212 .byte 0x66 2213 mov %r10,-8*4($tptr) 2214 adcx %rax,%r11 2215 adox %r13,%r12 2216 mulx 3*16($nptr),%rax,%r15 2217 .byte 0x67,0x67 2218 mov $bi,%rdx 2219 mov %r11,-8*3($tptr) 2220 adcx %rax,%r12 2221 adox $zero,%r15 # of=0 2222 .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr 2223 mov %r12,-8*2($tptr) 2224 #jmp .Lmulx4x_1st 2225 2226.align 32 2227.Lmulx4x_1st: 2228 adcx $zero,%r15 # cf=0, modulo-scheduled 2229 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2230 adcx %r14,%r10 2231 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2232 adcx %rax,%r11 2233 mulx 2*8($aptr),%r12,%rax # ... 2234 adcx %r14,%r12 2235 mulx 3*8($aptr),%r13,%r14 2236 .byte 0x67,0x67 2237 mov $mi,%rdx 2238 adcx %rax,%r13 2239 adcx $zero,%r14 # cf=0 2240 lea 4*8($aptr),$aptr 2241 lea 4*8($tptr),$tptr 2242 2243 adox %r15,%r10 2244 mulx 0*16($nptr),%rax,%r15 2245 adcx %rax,%r10 2246 adox %r15,%r11 2247 mulx 1*16($nptr),%rax,%r15 2248 adcx %rax,%r11 2249 adox %r15,%r12 2250 mulx 2*16($nptr),%rax,%r15 2251 mov %r10,-5*8($tptr) 2252 adcx %rax,%r12 2253 mov %r11,-4*8($tptr) 2254 adox %r15,%r13 2255 mulx 3*16($nptr),%rax,%r15 2256 mov $bi,%rdx 2257 mov %r12,-3*8($tptr) 2258 adcx %rax,%r13 2259 adox $zero,%r15 2260 lea 4*16($nptr),$nptr 2261 mov %r13,-2*8($tptr) 2262 2263 dec $bptr # of=0, pass cf 2264 jnz .Lmulx4x_1st 2265 2266 mov 8(%rsp),$num # load -num 2267 movq %xmm0,%rdx # bp[1] 2268 adc $zero,%r15 # modulo-scheduled 2269 lea ($aptr,$num),$aptr # rewind $aptr 2270 add %r15,%r14 2271 mov 8+8(%rsp),$bptr # re-load &b[i] 2272 adc $zero,$zero # top-most carry 2273 mov %r14,-1*8($tptr) 2274 jmp .Lmulx4x_outer 2275 2276.align 32 2277.Lmulx4x_outer: 2278 mov $zero,($tptr) # save top-most carry 2279 lea 4*8($tptr,$num),$tptr # rewind $tptr 2280 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2281 xor $zero,$zero # cf=0, of=0 2282 mov %rdx,$bi 2283 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2284 adox -4*8($tptr),$mi # +t[0] 2285 adcx %r14,%r11 2286 mulx 2*8($aptr),%r15,%r13 # ... 2287 adox -3*8($tptr),%r11 2288 adcx %r15,%r12 2289 mulx 3*8($aptr),%rdx,%r14 2290 adox -2*8($tptr),%r12 2291 adcx %rdx,%r13 2292 lea ($nptr,$num,2),$nptr # rewind $nptr 2293 lea 4*8($aptr),$aptr 2294 adox -1*8($tptr),%r13 2295 adcx $zero,%r14 2296 adox $zero,%r14 2297 2298 .byte 0x67 2299 mov $mi,%r15 2300 imulq 32+8(%rsp),$mi # "t[0]"*n0 2301 2302 movq `0*$STRIDE/4-96`($bptr),%xmm0 2303 .byte 0x67,0x67 2304 mov $mi,%rdx 2305 movq `1*$STRIDE/4-96`($bptr),%xmm1 2306 .byte 0x67 2307 pand %xmm4,%xmm0 2308 movq `2*$STRIDE/4-96`($bptr),%xmm2 2309 .byte 0x67 2310 pand %xmm5,%xmm1 2311 movq `3*$STRIDE/4-96`($bptr),%xmm3 2312 add \$$STRIDE,$bptr # next &b[i] 2313 .byte 0x67 2314 pand %xmm6,%xmm2 2315 por %xmm1,%xmm0 2316 pand %xmm7,%xmm3 2317 xor $zero,$zero # cf=0, of=0 2318 mov $bptr,8+8(%rsp) # off-load &b[i] 2319 2320 mulx 0*16($nptr),%rax,%r10 2321 adcx %rax,%r15 # discarded 2322 adox %r11,%r10 2323 mulx 1*16($nptr),%rax,%r11 2324 adcx %rax,%r10 2325 adox %r12,%r11 2326 mulx 2*16($nptr),%rax,%r12 2327 adcx %rax,%r11 2328 adox %r13,%r12 2329 mulx 3*16($nptr),%rax,%r15 2330 mov $bi,%rdx 2331 por %xmm2,%xmm0 2332 mov 24+8(%rsp),$bptr # counter value 2333 mov %r10,-8*4($tptr) 2334 por %xmm3,%xmm0 2335 adcx %rax,%r12 2336 mov %r11,-8*3($tptr) 2337 adox $zero,%r15 # of=0 2338 mov %r12,-8*2($tptr) 2339 lea 4*16($nptr),$nptr 2340 jmp .Lmulx4x_inner 2341 2342.align 32 2343.Lmulx4x_inner: 2344 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2345 adcx $zero,%r15 # cf=0, modulo-scheduled 2346 adox %r14,%r10 2347 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2348 adcx 0*8($tptr),%r10 2349 adox %rax,%r11 2350 mulx 2*8($aptr),%r12,%rax # ... 2351 adcx 1*8($tptr),%r11 2352 adox %r14,%r12 2353 mulx 3*8($aptr),%r13,%r14 2354 mov $mi,%rdx 2355 adcx 2*8($tptr),%r12 2356 adox %rax,%r13 2357 adcx 3*8($tptr),%r13 2358 adox $zero,%r14 # of=0 2359 lea 4*8($aptr),$aptr 2360 lea 4*8($tptr),$tptr 2361 adcx $zero,%r14 # cf=0 2362 2363 adox %r15,%r10 2364 mulx 0*16($nptr),%rax,%r15 2365 adcx %rax,%r10 2366 adox %r15,%r11 2367 mulx 1*16($nptr),%rax,%r15 2368 adcx %rax,%r11 2369 adox %r15,%r12 2370 mulx 2*16($nptr),%rax,%r15 2371 mov %r10,-5*8($tptr) 2372 adcx %rax,%r12 2373 adox %r15,%r13 2374 mov %r11,-4*8($tptr) 2375 mulx 3*16($nptr),%rax,%r15 2376 mov $bi,%rdx 2377 lea 4*16($nptr),$nptr 2378 mov %r12,-3*8($tptr) 2379 adcx %rax,%r13 2380 adox $zero,%r15 2381 mov %r13,-2*8($tptr) 2382 2383 dec $bptr # of=0, pass cf 2384 jnz .Lmulx4x_inner 2385 2386 mov 0+8(%rsp),$num # load -num 2387 movq %xmm0,%rdx # bp[i+1] 2388 adc $zero,%r15 # modulo-scheduled 2389 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2390 mov 8+8(%rsp),$bptr # re-load &b[i] 2391 mov 16+8(%rsp),%r10 2392 adc %r15,%r14 2393 lea ($aptr,$num),$aptr # rewind $aptr 2394 adc $zero,$zero # top-most carry 2395 mov %r14,-1*8($tptr) 2396 2397 cmp %r10,$bptr 2398 jb .Lmulx4x_outer 2399 2400 mov -16($nptr),%r10 2401 xor %r15,%r15 2402 sub %r14,%r10 # compare top-most words 2403 adc %r15,%r15 2404 or %r15,$zero 2405 xor \$1,$zero 2406 lea ($tptr,$num),%rdi # rewind $tptr 2407 lea ($nptr,$num,2),$nptr # rewind $nptr 2408 .byte 0x67,0x67 2409 sar \$3+2,$num # cf=0 2410 lea ($nptr,$zero,8),%rbp 2411 mov 56+8(%rsp),%rdx # restore rp 2412 mov $num,%rcx 2413 jmp .Lsqrx4x_sub # common post-condition 2414.size mulx4x_internal,.-mulx4x_internal 2415___ 2416}{ 2417###################################################################### 2418# void bn_power5( 2419my $rptr="%rdi"; # BN_ULONG *rptr, 2420my $aptr="%rsi"; # const BN_ULONG *aptr, 2421my $bptr="%rdx"; # const void *table, 2422my $nptr="%rcx"; # const BN_ULONG *nptr, 2423my $n0 ="%r8"; # const BN_ULONG *n0); 2424my $num ="%r9"; # int num, has to be divisible by 8 2425 # int pwr); 2426 2427my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2428my @A0=("%r10","%r11"); 2429my @A1=("%r12","%r13"); 2430my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2431 2432$code.=<<___; 2433.type bn_powerx5,\@function,6 2434.align 32 2435bn_powerx5: 2436.Lpowerx5_enter: 2437 .byte 0x67 2438 mov %rsp,%rax 2439 push %rbx 2440 push %rbp 2441 push %r12 2442 push %r13 2443 push %r14 2444 push %r15 2445___ 2446$code.=<<___ if ($win64); 2447 lea -0x28(%rsp),%rsp 2448 movaps %xmm6,(%rsp) 2449 movaps %xmm7,0x10(%rsp) 2450___ 2451$code.=<<___; 2452 .byte 0x67 2453 mov ${num}d,%r10d 2454 shl \$3,${num}d # convert $num to bytes 2455 shl \$3+2,%r10d # 4*$num 2456 neg $num 2457 mov ($n0),$n0 # *n0 2458 2459 ############################################################## 2460 # ensure that stack frame doesn't alias with $aptr+4*$num 2461 # modulo 4096, which covers ret[num], am[num] and n[2*num] 2462 # (see bn_exp.c). this is done to allow memory disambiguation 2463 # logic do its magic. 2464 # 2465 lea -64(%rsp,$num,2),%r11 2466 sub $aptr,%r11 2467 and \$4095,%r11 2468 cmp %r11,%r10 2469 jb .Lpwrx_sp_alt 2470 sub %r11,%rsp # align with $aptr 2471 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 2472 jmp .Lpwrx_sp_done 2473 2474.align 32 2475.Lpwrx_sp_alt: 2476 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num 2477 lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) 2478 sub %r10,%r11 2479 mov \$0,%r10 2480 cmovc %r10,%r11 2481 sub %r11,%rsp 2482.Lpwrx_sp_done: 2483 and \$-64,%rsp 2484 mov $num,%r10 2485 neg $num 2486 2487 ############################################################## 2488 # Stack layout 2489 # 2490 # +0 saved $num, used in reduction section 2491 # +8 &t[2*$num], used in reduction section 2492 # +16 intermediate carry bit 2493 # +24 top-most carry bit, used in reduction section 2494 # +32 saved *n0 2495 # +40 saved %rsp 2496 # +48 t[2*$num] 2497 # 2498 pxor %xmm0,%xmm0 2499 movq $rptr,%xmm1 # save $rptr 2500 movq $nptr,%xmm2 # save $nptr 2501 movq %r10, %xmm3 # -$num 2502 movq $bptr,%xmm4 2503 mov $n0, 32(%rsp) 2504 mov %rax, 40(%rsp) # save original %rsp 2505.Lpowerx5_body: 2506 2507 call __bn_sqrx8x_internal 2508 call __bn_sqrx8x_internal 2509 call __bn_sqrx8x_internal 2510 call __bn_sqrx8x_internal 2511 call __bn_sqrx8x_internal 2512 2513 mov %r10,$num # -num 2514 mov $aptr,$rptr 2515 movq %xmm2,$nptr 2516 movq %xmm4,$bptr 2517 mov 40(%rsp),%rax 2518 2519 call mulx4x_internal 2520 2521 mov 40(%rsp),%rsi # restore %rsp 2522 mov \$1,%rax 2523___ 2524$code.=<<___ if ($win64); 2525 movaps -88(%rsi),%xmm6 2526 movaps -72(%rsi),%xmm7 2527___ 2528$code.=<<___; 2529 mov -48(%rsi),%r15 2530 mov -40(%rsi),%r14 2531 mov -32(%rsi),%r13 2532 mov -24(%rsi),%r12 2533 mov -16(%rsi),%rbp 2534 mov -8(%rsi),%rbx 2535 lea (%rsi),%rsp 2536.Lpowerx5_epilogue: 2537 ret 2538.size bn_powerx5,.-bn_powerx5 2539 2540.globl bn_sqrx8x_internal 2541.hidden bn_sqrx8x_internal 2542.type bn_sqrx8x_internal,\@abi-omnipotent 2543.align 32 2544bn_sqrx8x_internal: 2545__bn_sqrx8x_internal: 2546 ################################################################## 2547 # Squaring part: 2548 # 2549 # a) multiply-n-add everything but a[i]*a[i]; 2550 # b) shift result of a) by 1 to the left and accumulate 2551 # a[i]*a[i] products; 2552 # 2553 ################################################################## 2554 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2555 # a[1]a[0] 2556 # a[2]a[0] 2557 # a[3]a[0] 2558 # a[2]a[1] 2559 # a[3]a[1] 2560 # a[3]a[2] 2561 # 2562 # a[4]a[0] 2563 # a[5]a[0] 2564 # a[6]a[0] 2565 # a[7]a[0] 2566 # a[4]a[1] 2567 # a[5]a[1] 2568 # a[6]a[1] 2569 # a[7]a[1] 2570 # a[4]a[2] 2571 # a[5]a[2] 2572 # a[6]a[2] 2573 # a[7]a[2] 2574 # a[4]a[3] 2575 # a[5]a[3] 2576 # a[6]a[3] 2577 # a[7]a[3] 2578 # 2579 # a[5]a[4] 2580 # a[6]a[4] 2581 # a[7]a[4] 2582 # a[6]a[5] 2583 # a[7]a[5] 2584 # a[7]a[6] 2585 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2586___ 2587{ 2588my ($zero,$carry)=("%rbp","%rcx"); 2589my $aaptr=$zero; 2590$code.=<<___; 2591 lea 48+8(%rsp),$tptr 2592 lea ($aptr,$num),$aaptr 2593 mov $num,0+8(%rsp) # save $num 2594 mov $aaptr,8+8(%rsp) # save end of $aptr 2595 jmp .Lsqr8x_zero_start 2596 2597.align 32 2598.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2599.Lsqrx8x_zero: 2600 .byte 0x3e 2601 movdqa %xmm0,0*8($tptr) 2602 movdqa %xmm0,2*8($tptr) 2603 movdqa %xmm0,4*8($tptr) 2604 movdqa %xmm0,6*8($tptr) 2605.Lsqr8x_zero_start: # aligned at 32 2606 movdqa %xmm0,8*8($tptr) 2607 movdqa %xmm0,10*8($tptr) 2608 movdqa %xmm0,12*8($tptr) 2609 movdqa %xmm0,14*8($tptr) 2610 lea 16*8($tptr),$tptr 2611 sub \$64,$num 2612 jnz .Lsqrx8x_zero 2613 2614 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2615 #xor %r9,%r9 # t[1], ex-$num, zero already 2616 xor %r10,%r10 2617 xor %r11,%r11 2618 xor %r12,%r12 2619 xor %r13,%r13 2620 xor %r14,%r14 2621 xor %r15,%r15 2622 lea 48+8(%rsp),$tptr 2623 xor $zero,$zero # cf=0, cf=0 2624 jmp .Lsqrx8x_outer_loop 2625 2626.align 32 2627.Lsqrx8x_outer_loop: 2628 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2629 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2630 adox %rax,%r10 2631 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 2632 adcx %r10,%r9 2633 adox %rax,%r11 2634 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 2635 adcx %r11,%r10 2636 adox %rax,%r12 2637 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 2638 adcx %r12,%r11 2639 adox %rax,%r13 2640 mulx 5*8($aptr),%r12,%rax 2641 adcx %r13,%r12 2642 adox %rax,%r14 2643 mulx 6*8($aptr),%r13,%rax 2644 adcx %r14,%r13 2645 adox %r15,%rax 2646 mulx 7*8($aptr),%r14,%r15 2647 mov 1*8($aptr),%rdx # a[1] 2648 adcx %rax,%r14 2649 adox $zero,%r15 2650 adc 8*8($tptr),%r15 2651 mov %r8,1*8($tptr) # t[1] 2652 mov %r9,2*8($tptr) # t[2] 2653 sbb $carry,$carry # mov %cf,$carry 2654 xor $zero,$zero # cf=0, of=0 2655 2656 2657 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 2658 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 2659 adcx %r10,%r8 2660 adox %rbx,%r9 2661 mulx 4*8($aptr),%r10,%rbx # ... 2662 adcx %r11,%r9 2663 adox %rax,%r10 2664 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 2665 adcx %r12,%r10 2666 adox %rbx,%r11 2667 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 2668 adcx %r13,%r11 2669 adox %r14,%r12 2670 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 2671 mov 2*8($aptr),%rdx # a[2] 2672 adcx %rax,%r12 2673 adox %rbx,%r13 2674 adcx %r15,%r13 2675 adox $zero,%r14 # of=0 2676 adcx $zero,%r14 # cf=0 2677 2678 mov %r8,3*8($tptr) # t[3] 2679 mov %r9,4*8($tptr) # t[4] 2680 2681 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 2682 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 2683 adcx %r10,%r8 2684 adox %rbx,%r9 2685 mulx 5*8($aptr),%r10,%rbx # ... 2686 adcx %r11,%r9 2687 adox %rax,%r10 2688 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 2689 adcx %r12,%r10 2690 adox %r13,%r11 2691 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 2692 .byte 0x3e 2693 mov 3*8($aptr),%rdx # a[3] 2694 adcx %rbx,%r11 2695 adox %rax,%r12 2696 adcx %r14,%r12 2697 mov %r8,5*8($tptr) # t[5] 2698 mov %r9,6*8($tptr) # t[6] 2699 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 2700 adox $zero,%r13 # of=0 2701 adcx $zero,%r13 # cf=0 2702 2703 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 2704 adcx %r10,%r8 2705 adox %rax,%r9 2706 mulx 6*8($aptr),%r10,%rax # ... 2707 adcx %r11,%r9 2708 adox %r12,%r10 2709 mulx 7*8($aptr),%r11,%r12 2710 mov 4*8($aptr),%rdx # a[4] 2711 mov 5*8($aptr),%r14 # a[5] 2712 adcx %rbx,%r10 2713 adox %rax,%r11 2714 mov 6*8($aptr),%r15 # a[6] 2715 adcx %r13,%r11 2716 adox $zero,%r12 # of=0 2717 adcx $zero,%r12 # cf=0 2718 2719 mov %r8,7*8($tptr) # t[7] 2720 mov %r9,8*8($tptr) # t[8] 2721 2722 mulx %r14,%r9,%rax # a[5]*a[4] 2723 mov 7*8($aptr),%r8 # a[7] 2724 adcx %r10,%r9 2725 mulx %r15,%r10,%rbx # a[6]*a[4] 2726 adox %rax,%r10 2727 adcx %r11,%r10 2728 mulx %r8,%r11,%rax # a[7]*a[4] 2729 mov %r14,%rdx # a[5] 2730 adox %rbx,%r11 2731 adcx %r12,%r11 2732 #adox $zero,%rax # of=0 2733 adcx $zero,%rax # cf=0 2734 2735 mulx %r15,%r14,%rbx # a[6]*a[5] 2736 mulx %r8,%r12,%r13 # a[7]*a[5] 2737 mov %r15,%rdx # a[6] 2738 lea 8*8($aptr),$aptr 2739 adcx %r14,%r11 2740 adox %rbx,%r12 2741 adcx %rax,%r12 2742 adox $zero,%r13 2743 2744 .byte 0x67,0x67 2745 mulx %r8,%r8,%r14 # a[7]*a[6] 2746 adcx %r8,%r13 2747 adcx $zero,%r14 2748 2749 cmp 8+8(%rsp),$aptr 2750 je .Lsqrx8x_outer_break 2751 2752 neg $carry # mov $carry,%cf 2753 mov \$-8,%rcx 2754 mov $zero,%r15 2755 mov 8*8($tptr),%r8 2756 adcx 9*8($tptr),%r9 # +=t[9] 2757 adcx 10*8($tptr),%r10 # ... 2758 adcx 11*8($tptr),%r11 2759 adc 12*8($tptr),%r12 2760 adc 13*8($tptr),%r13 2761 adc 14*8($tptr),%r14 2762 adc 15*8($tptr),%r15 2763 lea ($aptr),$aaptr 2764 lea 2*64($tptr),$tptr 2765 sbb %rax,%rax # mov %cf,$carry 2766 2767 mov -64($aptr),%rdx # a[0] 2768 mov %rax,16+8(%rsp) # offload $carry 2769 mov $tptr,24+8(%rsp) 2770 2771 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 2772 xor %eax,%eax # cf=0, of=0 2773 jmp .Lsqrx8x_loop 2774 2775.align 32 2776.Lsqrx8x_loop: 2777 mov %r8,%rbx 2778 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 2779 adcx %rax,%rbx # +=t[8] 2780 adox %r9,%r8 2781 2782 mulx 1*8($aaptr),%rax,%r9 # ... 2783 adcx %rax,%r8 2784 adox %r10,%r9 2785 2786 mulx 2*8($aaptr),%rax,%r10 2787 adcx %rax,%r9 2788 adox %r11,%r10 2789 2790 mulx 3*8($aaptr),%rax,%r11 2791 adcx %rax,%r10 2792 adox %r12,%r11 2793 2794 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 2795 adcx %rax,%r11 2796 adox %r13,%r12 2797 2798 mulx 5*8($aaptr),%rax,%r13 2799 adcx %rax,%r12 2800 adox %r14,%r13 2801 2802 mulx 6*8($aaptr),%rax,%r14 2803 mov %rbx,($tptr,%rcx,8) # store t[8+i] 2804 mov \$0,%ebx 2805 adcx %rax,%r13 2806 adox %r15,%r14 2807 2808 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 2809 mov 8($aptr,%rcx,8),%rdx # a[i] 2810 adcx %rax,%r14 2811 adox %rbx,%r15 # %rbx is 0, of=0 2812 adcx %rbx,%r15 # cf=0 2813 2814 .byte 0x67 2815 inc %rcx # of=0 2816 jnz .Lsqrx8x_loop 2817 2818 lea 8*8($aaptr),$aaptr 2819 mov \$-8,%rcx 2820 cmp 8+8(%rsp),$aaptr # done? 2821 je .Lsqrx8x_break 2822 2823 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 2824 .byte 0x66 2825 mov -64($aptr),%rdx 2826 adcx 0*8($tptr),%r8 2827 adcx 1*8($tptr),%r9 2828 adc 2*8($tptr),%r10 2829 adc 3*8($tptr),%r11 2830 adc 4*8($tptr),%r12 2831 adc 5*8($tptr),%r13 2832 adc 6*8($tptr),%r14 2833 adc 7*8($tptr),%r15 2834 lea 8*8($tptr),$tptr 2835 .byte 0x67 2836 sbb %rax,%rax # mov %cf,%rax 2837 xor %ebx,%ebx # cf=0, of=0 2838 mov %rax,16+8(%rsp) # offload carry 2839 jmp .Lsqrx8x_loop 2840 2841.align 32 2842.Lsqrx8x_break: 2843 sub 16+8(%rsp),%r8 # consume last carry 2844 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 2845 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 2846 xor %ebp,%ebp # xor $zero,$zero 2847 mov %r8,0*8($tptr) 2848 cmp $carry,$tptr # cf=0, of=0 2849 je .Lsqrx8x_outer_loop 2850 2851 mov %r9,1*8($tptr) 2852 mov 1*8($carry),%r9 2853 mov %r10,2*8($tptr) 2854 mov 2*8($carry),%r10 2855 mov %r11,3*8($tptr) 2856 mov 3*8($carry),%r11 2857 mov %r12,4*8($tptr) 2858 mov 4*8($carry),%r12 2859 mov %r13,5*8($tptr) 2860 mov 5*8($carry),%r13 2861 mov %r14,6*8($tptr) 2862 mov 6*8($carry),%r14 2863 mov %r15,7*8($tptr) 2864 mov 7*8($carry),%r15 2865 mov $carry,$tptr 2866 jmp .Lsqrx8x_outer_loop 2867 2868.align 32 2869.Lsqrx8x_outer_break: 2870 mov %r9,9*8($tptr) # t[9] 2871 movq %xmm3,%rcx # -$num 2872 mov %r10,10*8($tptr) # ... 2873 mov %r11,11*8($tptr) 2874 mov %r12,12*8($tptr) 2875 mov %r13,13*8($tptr) 2876 mov %r14,14*8($tptr) 2877___ 2878}{ 2879my $i="%rcx"; 2880$code.=<<___; 2881 lea 48+8(%rsp),$tptr 2882 mov ($aptr,$i),%rdx # a[0] 2883 2884 mov 8($tptr),$A0[1] # t[1] 2885 xor $A0[0],$A0[0] # t[0], of=0, cf=0 2886 mov 0+8(%rsp),$num # restore $num 2887 adox $A0[1],$A0[1] 2888 mov 16($tptr),$A1[0] # t[2] # prefetch 2889 mov 24($tptr),$A1[1] # t[3] # prefetch 2890 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 2891 2892.align 32 2893.Lsqrx4x_shift_n_add: 2894 mulx %rdx,%rax,%rbx 2895 adox $A1[0],$A1[0] 2896 adcx $A0[0],%rax 2897 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 2898 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 2899 adox $A1[1],$A1[1] 2900 adcx $A0[1],%rbx 2901 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 2902 mov %rax,0($tptr) 2903 mov %rbx,8($tptr) 2904 2905 mulx %rdx,%rax,%rbx 2906 adox $A0[0],$A0[0] 2907 adcx $A1[0],%rax 2908 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 2909 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 2910 adox $A0[1],$A0[1] 2911 adcx $A1[1],%rbx 2912 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 2913 mov %rax,16($tptr) 2914 mov %rbx,24($tptr) 2915 2916 mulx %rdx,%rax,%rbx 2917 adox $A1[0],$A1[0] 2918 adcx $A0[0],%rax 2919 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 2920 lea 32($i),$i 2921 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 2922 adox $A1[1],$A1[1] 2923 adcx $A0[1],%rbx 2924 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 2925 mov %rax,32($tptr) 2926 mov %rbx,40($tptr) 2927 2928 mulx %rdx,%rax,%rbx 2929 adox $A0[0],$A0[0] 2930 adcx $A1[0],%rax 2931 jrcxz .Lsqrx4x_shift_n_add_break 2932 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 2933 adox $A0[1],$A0[1] 2934 adcx $A1[1],%rbx 2935 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 2936 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 2937 mov %rax,48($tptr) 2938 mov %rbx,56($tptr) 2939 lea 64($tptr),$tptr 2940 nop 2941 jmp .Lsqrx4x_shift_n_add 2942 2943.align 32 2944.Lsqrx4x_shift_n_add_break: 2945 adcx $A1[1],%rbx 2946 mov %rax,48($tptr) 2947 mov %rbx,56($tptr) 2948 lea 64($tptr),$tptr # end of t[] buffer 2949___ 2950} 2951###################################################################### 2952# Montgomery reduction part, "word-by-word" algorithm. 2953# 2954# This new path is inspired by multiple submissions from Intel, by 2955# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 2956# Vinodh Gopal... 2957{ 2958my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 2959 2960$code.=<<___; 2961 movq %xmm2,$nptr 2962sqrx8x_reduction: 2963 xor %eax,%eax # initial top-most carry bit 2964 mov 32+8(%rsp),%rbx # n0 2965 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 2966 lea -128($nptr,$num,2),%rcx # end of n[] 2967 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 2968 mov %rcx, 0+8(%rsp) # save end of n[] 2969 mov $tptr,8+8(%rsp) # save end of t[] 2970 2971 lea 48+8(%rsp),$tptr # initial t[] window 2972 jmp .Lsqrx8x_reduction_loop 2973 2974.align 32 2975.Lsqrx8x_reduction_loop: 2976 mov 8*1($tptr),%r9 2977 mov 8*2($tptr),%r10 2978 mov 8*3($tptr),%r11 2979 mov 8*4($tptr),%r12 2980 mov %rdx,%r8 2981 imulq %rbx,%rdx # n0*a[i] 2982 mov 8*5($tptr),%r13 2983 mov 8*6($tptr),%r14 2984 mov 8*7($tptr),%r15 2985 mov %rax,24+8(%rsp) # store top-most carry bit 2986 2987 lea 8*8($tptr),$tptr 2988 xor $carry,$carry # cf=0,of=0 2989 mov \$-8,%rcx 2990 jmp .Lsqrx8x_reduce 2991 2992.align 32 2993.Lsqrx8x_reduce: 2994 mov %r8, %rbx 2995 mulx 16*0($nptr),%rax,%r8 # n[0] 2996 adcx %rbx,%rax # discarded 2997 adox %r9,%r8 2998 2999 mulx 16*1($nptr),%rbx,%r9 # n[1] 3000 adcx %rbx,%r8 3001 adox %r10,%r9 3002 3003 mulx 16*2($nptr),%rbx,%r10 3004 adcx %rbx,%r9 3005 adox %r11,%r10 3006 3007 mulx 16*3($nptr),%rbx,%r11 3008 adcx %rbx,%r10 3009 adox %r12,%r11 3010 3011 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 3012 mov %rdx,%rax 3013 mov %r8,%rdx 3014 adcx %rbx,%r11 3015 adox %r13,%r12 3016 3017 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3018 mov %rax,%rdx 3019 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3020 3021 mulx 16*5($nptr),%rax,%r13 3022 adcx %rax,%r12 3023 adox %r14,%r13 3024 3025 mulx 16*6($nptr),%rax,%r14 3026 adcx %rax,%r13 3027 adox %r15,%r14 3028 3029 mulx 16*7($nptr),%rax,%r15 3030 mov %rbx,%rdx 3031 adcx %rax,%r14 3032 adox $carry,%r15 # $carry is 0 3033 adcx $carry,%r15 # cf=0 3034 3035 .byte 0x67,0x67,0x67 3036 inc %rcx # of=0 3037 jnz .Lsqrx8x_reduce 3038 3039 mov $carry,%rax # xor %rax,%rax 3040 cmp 0+8(%rsp),$nptr # end of n[]? 3041 jae .Lsqrx8x_no_tail 3042 3043 mov 48+8(%rsp),%rdx # pull n0*a[0] 3044 add 8*0($tptr),%r8 3045 lea 16*8($nptr),$nptr 3046 mov \$-8,%rcx 3047 adcx 8*1($tptr),%r9 3048 adcx 8*2($tptr),%r10 3049 adc 8*3($tptr),%r11 3050 adc 8*4($tptr),%r12 3051 adc 8*5($tptr),%r13 3052 adc 8*6($tptr),%r14 3053 adc 8*7($tptr),%r15 3054 lea 8*8($tptr),$tptr 3055 sbb %rax,%rax # top carry 3056 3057 xor $carry,$carry # of=0, cf=0 3058 mov %rax,16+8(%rsp) 3059 jmp .Lsqrx8x_tail 3060 3061.align 32 3062.Lsqrx8x_tail: 3063 mov %r8,%rbx 3064 mulx 16*0($nptr),%rax,%r8 3065 adcx %rax,%rbx 3066 adox %r9,%r8 3067 3068 mulx 16*1($nptr),%rax,%r9 3069 adcx %rax,%r8 3070 adox %r10,%r9 3071 3072 mulx 16*2($nptr),%rax,%r10 3073 adcx %rax,%r9 3074 adox %r11,%r10 3075 3076 mulx 16*3($nptr),%rax,%r11 3077 adcx %rax,%r10 3078 adox %r12,%r11 3079 3080 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 3081 adcx %rax,%r11 3082 adox %r13,%r12 3083 3084 mulx 16*5($nptr),%rax,%r13 3085 adcx %rax,%r12 3086 adox %r14,%r13 3087 3088 mulx 16*6($nptr),%rax,%r14 3089 adcx %rax,%r13 3090 adox %r15,%r14 3091 3092 mulx 16*7($nptr),%rax,%r15 3093 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3094 adcx %rax,%r14 3095 adox $carry,%r15 3096 mov %rbx,($tptr,%rcx,8) # save result 3097 mov %r8,%rbx 3098 adcx $carry,%r15 # cf=0 3099 3100 inc %rcx # of=0 3101 jnz .Lsqrx8x_tail 3102 3103 cmp 0+8(%rsp),$nptr # end of n[]? 3104 jae .Lsqrx8x_tail_done # break out of loop 3105 3106 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3107 mov 48+8(%rsp),%rdx # pull n0*a[0] 3108 lea 16*8($nptr),$nptr 3109 adc 8*0($tptr),%r8 3110 adc 8*1($tptr),%r9 3111 adc 8*2($tptr),%r10 3112 adc 8*3($tptr),%r11 3113 adc 8*4($tptr),%r12 3114 adc 8*5($tptr),%r13 3115 adc 8*6($tptr),%r14 3116 adc 8*7($tptr),%r15 3117 lea 8*8($tptr),$tptr 3118 sbb %rax,%rax 3119 sub \$8,%rcx # mov \$-8,%rcx 3120 3121 xor $carry,$carry # of=0, cf=0 3122 mov %rax,16+8(%rsp) 3123 jmp .Lsqrx8x_tail 3124 3125.align 32 3126.Lsqrx8x_tail_done: 3127 add 24+8(%rsp),%r8 # can this overflow? 3128 adc \$0,%r9 3129 adc \$0,%r10 3130 adc \$0,%r11 3131 adc \$0,%r12 3132 adc \$0,%r13 3133 adc \$0,%r14 3134 adc \$0,%r15 # can't overflow, because we 3135 # started with "overhung" part 3136 # of multiplication 3137 mov $carry,%rax # xor %rax,%rax 3138 3139 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3140.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3141 adc 8*0($tptr),%r8 3142 movq %xmm3,%rcx 3143 adc 8*1($tptr),%r9 3144 mov 16*7($nptr),$carry 3145 movq %xmm2,$nptr # restore $nptr 3146 adc 8*2($tptr),%r10 3147 adc 8*3($tptr),%r11 3148 adc 8*4($tptr),%r12 3149 adc 8*5($tptr),%r13 3150 adc 8*6($tptr),%r14 3151 adc 8*7($tptr),%r15 3152 adc %rax,%rax # top-most carry 3153 3154 mov 32+8(%rsp),%rbx # n0 3155 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3156 3157 mov %r8,8*0($tptr) # store top 512 bits 3158 lea 8*8($tptr),%r8 # borrow %r8 3159 mov %r9,8*1($tptr) 3160 mov %r10,8*2($tptr) 3161 mov %r11,8*3($tptr) 3162 mov %r12,8*4($tptr) 3163 mov %r13,8*5($tptr) 3164 mov %r14,8*6($tptr) 3165 mov %r15,8*7($tptr) 3166 3167 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3168 cmp 8+8(%rsp),%r8 # end of t[]? 3169 jb .Lsqrx8x_reduction_loop 3170___ 3171} 3172############################################################## 3173# Post-condition, 4x unrolled 3174# 3175{ 3176my ($rptr,$nptr)=("%rdx","%rbp"); 3177my @ri=map("%r$_",(10..13)); 3178my @ni=map("%r$_",(14..15)); 3179$code.=<<___; 3180 xor %ebx,%ebx 3181 sub %r15,%rsi # compare top-most words 3182 adc %rbx,%rbx 3183 mov %rcx,%r10 # -$num 3184 or %rbx,%rax 3185 mov %rcx,%r9 # -$num 3186 xor \$1,%rax 3187 sar \$3+2,%rcx # cf=0 3188 #lea 48+8(%rsp,%r9),$tptr 3189 lea ($nptr,%rax,8),$nptr 3190 movq %xmm1,$rptr # restore $rptr 3191 movq %xmm1,$aptr # prepare for back-to-back call 3192 jmp .Lsqrx4x_sub 3193 3194.align 32 3195.Lsqrx4x_sub: 3196 .byte 0x66 3197 mov 8*0($tptr),%r12 3198 mov 8*1($tptr),%r13 3199 sbb 16*0($nptr),%r12 3200 mov 8*2($tptr),%r14 3201 sbb 16*1($nptr),%r13 3202 mov 8*3($tptr),%r15 3203 lea 8*4($tptr),$tptr 3204 sbb 16*2($nptr),%r14 3205 mov %r12,8*0($rptr) 3206 sbb 16*3($nptr),%r15 3207 lea 16*4($nptr),$nptr 3208 mov %r13,8*1($rptr) 3209 mov %r14,8*2($rptr) 3210 mov %r15,8*3($rptr) 3211 lea 8*4($rptr),$rptr 3212 3213 inc %rcx 3214 jnz .Lsqrx4x_sub 3215___ 3216} 3217$code.=<<___; 3218 neg %r9 # restore $num 3219 3220 ret 3221.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3222___ 3223}}} 3224{ 3225my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3226 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3227my $out=$inp; 3228my $STRIDE=2**5*8; 3229my $N=$STRIDE/4; 3230 3231$code.=<<___; 3232.globl bn_scatter5 3233.type bn_scatter5,\@abi-omnipotent 3234.align 16 3235bn_scatter5: 3236 cmp \$0, $num 3237 jz .Lscatter_epilogue 3238 lea ($tbl,$idx,8),$tbl 3239.Lscatter: 3240 mov ($inp),%rax 3241 lea 8($inp),$inp 3242 mov %rax,($tbl) 3243 lea 32*8($tbl),$tbl 3244 sub \$1,$num 3245 jnz .Lscatter 3246.Lscatter_epilogue: 3247 ret 3248.size bn_scatter5,.-bn_scatter5 3249 3250.globl bn_gather5 3251.type bn_gather5,\@abi-omnipotent 3252.align 16 3253bn_gather5: 3254___ 3255$code.=<<___ if ($win64); 3256.LSEH_begin_bn_gather5: 3257 # I can't trust assembler to use specific encoding:-( 3258 .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp 3259 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 3260 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) 3261___ 3262$code.=<<___; 3263 mov $idx,%r11d 3264 shr \$`log($N/8)/log(2)`,$idx 3265 and \$`$N/8-1`,%r11 3266 not $idx 3267 lea .Lmagic_masks(%rip),%rax 3268 and \$`2**5/($N/8)-1`,$idx # 5 is "window size" 3269 lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line 3270 movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which 3271 movq 8(%rax,$idx,8),%xmm5 # cache line contains element 3272 movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument 3273 movq 24(%rax,$idx,8),%xmm7 3274 jmp .Lgather 3275.align 16 3276.Lgather: 3277 movq `0*$STRIDE/4-128`($tbl),%xmm0 3278 movq `1*$STRIDE/4-128`($tbl),%xmm1 3279 pand %xmm4,%xmm0 3280 movq `2*$STRIDE/4-128`($tbl),%xmm2 3281 pand %xmm5,%xmm1 3282 movq `3*$STRIDE/4-128`($tbl),%xmm3 3283 pand %xmm6,%xmm2 3284 por %xmm1,%xmm0 3285 pand %xmm7,%xmm3 3286 .byte 0x67,0x67 3287 por %xmm2,%xmm0 3288 lea $STRIDE($tbl),$tbl 3289 por %xmm3,%xmm0 3290 3291 movq %xmm0,($out) # m0=bp[0] 3292 lea 8($out),$out 3293 sub \$1,$num 3294 jnz .Lgather 3295___ 3296$code.=<<___ if ($win64); 3297 movaps (%rsp),%xmm6 3298 movaps 0x10(%rsp),%xmm7 3299 lea 0x28(%rsp),%rsp 3300___ 3301$code.=<<___; 3302 ret 3303.LSEH_end_bn_gather5: 3304.size bn_gather5,.-bn_gather5 3305___ 3306} 3307$code.=<<___; 3308.align 64 3309.Lmagic_masks: 3310 .long 0,0, 0,0, 0,0, -1,-1 3311 .long 0,0, 0,0, 0,0, 0,0 3312.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3313___ 3314 3315# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3316# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3317if ($win64) { 3318$rec="%rcx"; 3319$frame="%rdx"; 3320$context="%r8"; 3321$disp="%r9"; 3322 3323$code.=<<___; 3324.extern __imp_RtlVirtualUnwind 3325.type mul_handler,\@abi-omnipotent 3326.align 16 3327mul_handler: 3328 push %rsi 3329 push %rdi 3330 push %rbx 3331 push %rbp 3332 push %r12 3333 push %r13 3334 push %r14 3335 push %r15 3336 pushfq 3337 sub \$64,%rsp 3338 3339 mov 120($context),%rax # pull context->Rax 3340 mov 248($context),%rbx # pull context->Rip 3341 3342 mov 8($disp),%rsi # disp->ImageBase 3343 mov 56($disp),%r11 # disp->HandlerData 3344 3345 mov 0(%r11),%r10d # HandlerData[0] 3346 lea (%rsi,%r10),%r10 # end of prologue label 3347 cmp %r10,%rbx # context->Rip<end of prologue label 3348 jb .Lcommon_seh_tail 3349 3350 mov 152($context),%rax # pull context->Rsp 3351 3352 mov 4(%r11),%r10d # HandlerData[1] 3353 lea (%rsi,%r10),%r10 # epilogue label 3354 cmp %r10,%rbx # context->Rip>=epilogue label 3355 jae .Lcommon_seh_tail 3356 3357 lea .Lmul_epilogue(%rip),%r10 3358 cmp %r10,%rbx 3359 jb .Lbody_40 3360 3361 mov 192($context),%r10 # pull $num 3362 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3363 jmp .Lbody_proceed 3364 3365.Lbody_40: 3366 mov 40(%rax),%rax # pull saved stack pointer 3367.Lbody_proceed: 3368 3369 movaps -88(%rax),%xmm0 3370 movaps -72(%rax),%xmm1 3371 3372 mov -8(%rax),%rbx 3373 mov -16(%rax),%rbp 3374 mov -24(%rax),%r12 3375 mov -32(%rax),%r13 3376 mov -40(%rax),%r14 3377 mov -48(%rax),%r15 3378 mov %rbx,144($context) # restore context->Rbx 3379 mov %rbp,160($context) # restore context->Rbp 3380 mov %r12,216($context) # restore context->R12 3381 mov %r13,224($context) # restore context->R13 3382 mov %r14,232($context) # restore context->R14 3383 mov %r15,240($context) # restore context->R15 3384 movups %xmm0,512($context) # restore context->Xmm6 3385 movups %xmm1,528($context) # restore context->Xmm7 3386 3387.Lcommon_seh_tail: 3388 mov 8(%rax),%rdi 3389 mov 16(%rax),%rsi 3390 mov %rax,152($context) # restore context->Rsp 3391 mov %rsi,168($context) # restore context->Rsi 3392 mov %rdi,176($context) # restore context->Rdi 3393 3394 mov 40($disp),%rdi # disp->ContextRecord 3395 mov $context,%rsi # context 3396 mov \$154,%ecx # sizeof(CONTEXT) 3397 .long 0xa548f3fc # cld; rep movsq 3398 3399 mov $disp,%rsi 3400 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3401 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3402 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3403 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3404 mov 40(%rsi),%r10 # disp->ContextRecord 3405 lea 56(%rsi),%r11 # &disp->HandlerData 3406 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3407 mov %r10,32(%rsp) # arg5 3408 mov %r11,40(%rsp) # arg6 3409 mov %r12,48(%rsp) # arg7 3410 mov %rcx,56(%rsp) # arg8, (NULL) 3411 call *__imp_RtlVirtualUnwind(%rip) 3412 3413 mov \$1,%eax # ExceptionContinueSearch 3414 add \$64,%rsp 3415 popfq 3416 pop %r15 3417 pop %r14 3418 pop %r13 3419 pop %r12 3420 pop %rbp 3421 pop %rbx 3422 pop %rdi 3423 pop %rsi 3424 ret 3425.size mul_handler,.-mul_handler 3426 3427.section .pdata 3428.align 4 3429 .rva .LSEH_begin_bn_mul_mont_gather5 3430 .rva .LSEH_end_bn_mul_mont_gather5 3431 .rva .LSEH_info_bn_mul_mont_gather5 3432 3433 .rva .LSEH_begin_bn_mul4x_mont_gather5 3434 .rva .LSEH_end_bn_mul4x_mont_gather5 3435 .rva .LSEH_info_bn_mul4x_mont_gather5 3436 3437 .rva .LSEH_begin_bn_power5 3438 .rva .LSEH_end_bn_power5 3439 .rva .LSEH_info_bn_power5 3440 3441 .rva .LSEH_begin_bn_from_mont8x 3442 .rva .LSEH_end_bn_from_mont8x 3443 .rva .LSEH_info_bn_from_mont8x 3444___ 3445$code.=<<___ if ($addx); 3446 .rva .LSEH_begin_bn_mulx4x_mont_gather5 3447 .rva .LSEH_end_bn_mulx4x_mont_gather5 3448 .rva .LSEH_info_bn_mulx4x_mont_gather5 3449 3450 .rva .LSEH_begin_bn_powerx5 3451 .rva .LSEH_end_bn_powerx5 3452 .rva .LSEH_info_bn_powerx5 3453___ 3454$code.=<<___; 3455 .rva .LSEH_begin_bn_gather5 3456 .rva .LSEH_end_bn_gather5 3457 .rva .LSEH_info_bn_gather5 3458 3459.section .xdata 3460.align 8 3461.LSEH_info_bn_mul_mont_gather5: 3462 .byte 9,0,0,0 3463 .rva mul_handler 3464 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 3465.align 8 3466.LSEH_info_bn_mul4x_mont_gather5: 3467 .byte 9,0,0,0 3468 .rva mul_handler 3469 .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 3470.align 8 3471.LSEH_info_bn_power5: 3472 .byte 9,0,0,0 3473 .rva mul_handler 3474 .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] 3475.align 8 3476.LSEH_info_bn_from_mont8x: 3477 .byte 9,0,0,0 3478 .rva mul_handler 3479 .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] 3480___ 3481$code.=<<___ if ($addx); 3482.align 8 3483.LSEH_info_bn_mulx4x_mont_gather5: 3484 .byte 9,0,0,0 3485 .rva mul_handler 3486 .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] 3487.align 8 3488.LSEH_info_bn_powerx5: 3489 .byte 9,0,0,0 3490 .rva mul_handler 3491 .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] 3492___ 3493$code.=<<___; 3494.align 8 3495.LSEH_info_bn_gather5: 3496 .byte 0x01,0x0d,0x05,0x00 3497 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 3498 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 3499 .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 3500.align 8 3501___ 3502} 3503 3504$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3505 3506print $code; 3507close STDOUT; 3508