#if defined(__x86_64__)
/*
 * Montgomery multiplication with scatter/gather for x86_64.
 *
 * Syntax : GNU as, AT&T operand order (op src,dst).
 * Symbols: Mach-O style (leading underscore, L$ local labels).
 * ABI    : arguments are taken from rdi, rsi, rdx, rcx, r8, r9 and the first
 *          stack slot; rbx, rbp and r12-r15 are saved and restored by every
 *          entry point that uses them (System V AMD64 convention).
 *
 * Power table layout, as established by _bn_scatter5 and the gather code:
 * limb i of table entry idx is stored at byte offset i*256 + idx*8, i.e.
 * every 256-byte stripe holds limb i of all 32 entries, split into four
 * 64-byte rows of eight entries each.
 *
 * Gather: idx&7 selects the column inside a row, idx>>3 selects the row.
 * One qword is loaded from each of the four rows, ANDed with a mask that is
 * all-ones only for the selected row (see L$magic_masks) and the four
 * results are ORed together.
 * NOTE(review): the low three index bits are still used as an address offset
 * inside each 64-byte row, so the access pattern is not fully independent of
 * the index (compare CVE-2016-0702); confirm whether this matters for callers.
 *
 * Raw .byte sequences used below:
 *   0xf3,0xc3            rep ret
 *   0x66 / 0x67          redundant prefix attached to the next instruction
 *   102,72,15,126,195    movq %xmm0,%rbx
 *   102,72,15,110,207    movq %rdi,%xmm1
 *   102,72,15,110,209    movq %rcx,%xmm2
 *   102,73,15,110,218    movq %r10,%xmm3
 *   102,72,15,110,226    movq %rdx,%xmm4
 *   102,72,15,126,209    movq %xmm2,%rcx
 *   102,72,15,126,226    movq %xmm4,%rdx
 *   102,72,15,126,213    movq %xmm2,%rbp
 *   102,73,15,126,217    movq %xmm3,%r9
 *   102,72,15,126,207    movq %xmm1,%rdi
 *   102,72,15,126,206    movq %xmm1,%rsi
 */
.text

/*
 * _bn_mul_mont_gather5
 *
 * In : rdi = result limbs (written)
 *      rsi = multiplicand limbs (read)
 *      rdx = power table base (second operand is gathered from it)
 *      rcx = modulus limbs (read)
 *      r8  = pointer to one qword used as the per-limb reduction multiplier
 *            (presumably the Montgomery constant n0; confirm with caller)
 *      r9d = number of limbs
 *      8(%rsp) on entry = 32-bit table index
 * Out: rax = 1
 *
 * If the limb count is a multiple of 8 the 4x-unrolled path is taken,
 * otherwise the word-by-word loop below is used.
 */
.globl	_bn_mul_mont_gather5
.private_extern _bn_mul_mont_gather5

.p2align	6
_bn_mul_mont_gather5:
	testl	$7,%r9d
	jnz	L$mul_enter
	jmp	L$mul4x_enter

.p2align	4
L$mul_enter:
	movl	%r9d,%r9d			/* zero-extend the limb count */
	movq	%rsp,%rax			/* rax = caller stack pointer */
	movl	8(%rsp),%r10d			/* r10 = table index */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	/* reserve num+2 qwords of scratch and align it down to 1024 bytes */
	leaq	2(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp
	andq	$-1024,%rsp

	movq	%rax,8(%rsp,%r9,8)		/* scratch[num+1] = caller rsp */
L$mul_body:
	movq	%rdx,%r12
	/* r11 = idx & 7 (column), r10 = ~(idx >> 3) & 3 (mask selector) */
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	L$magic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	/* gather limb 0 of the selected table entry into xmm0 */
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

.byte	102,72,15,126,195			/* movq %xmm0,%rbx */

	movq	(%r8),%r8			/* r8 = reduction multiplier */
	movq	(%rsi),%rax

	xorq	%r14,%r14			/* r14 = outer counter */
	xorq	%r15,%r15			/* r15 = inner counter */

	/* gather of the next limb is interleaved with the first multiply */
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp			/* rbp = low product limb * multiplier */
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$1st

.byte	102,72,15,126,195			/* movq %xmm0,%rbx : next gathered limb */

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)		/* scratch[num] = top carry */

	leaq	1(%r14),%r14
	jmp	L$outer
.p2align	4
L$outer:
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

.byte	102,72,15,126,195			/* movq %xmm0,%rbx */

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

	/* result = scratch - modulus; the xor below also clears CF for sbb */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	L$sub
.p2align	4
L$sub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14			/* lea and dec leave CF untouched */
	decq	%r15
	jnz	L$sub

	sbbq	$0,%rax				/* rax = 0 or all-ones select mask */
	xorq	%r14,%r14
	movq	%r9,%r15
.p2align	4
L$copy:
	/* branch-free select between scratch and the subtracted value */
	movq	(%rsp,%r14,8),%rsi
	movq	(%rdi,%r14,8),%rcx
	xorq	%rcx,%rsi
	andq	%rax,%rsi
	xorq	%rcx,%rsi
	movq	%r14,(%rsp,%r14,8)		/* overwrite the scratch limb */
	movq	%rsi,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

	movq	8(%rsp,%r9,8),%rsi		/* rsi = caller rsp saved in prologue */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mul_epilogue:
	.byte	0xf3,0xc3


/*
 * bn_mul4x_mont_gather5: 4x-unrolled variant, reached from
 * _bn_mul_mont_gather5 when the limb count is a multiple of 8.
 * Same register arguments as _bn_mul_mont_gather5. Returns rax = 1.
 */
.p2align	5
bn_mul4x_mont_gather5:
L$mul4x_enter:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d				/* r9  = num*8  */
	shll	$3+2,%r10d			/* r10 = num*32 */
	negq	%r9

	/*
	 * Carve out 2*num*8+64 bytes of stack. The placement depends on the
	 * distance to the address in rsi modulo 4096 (presumably to avoid 4K
	 * aliasing between the frame and the input; confirm).
	 */
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$mul4xsp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	L$mul4xsp_done

.p2align	5
L$mul4xsp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10				/* mov, not xor: CF must survive for cmovc */
	cmovcq	%r10,%r11
	subq	%r11,%rsp
L$mul4xsp_done:
	andq	$-64,%rsp
	negq	%r9

	movq	%rax,40(%rsp)			/* save caller rsp */
L$mul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mul4x_epilogue:
	.byte	0xf3,0xc3



/*
 * mul4x_internal: 4x-unrolled multiply with gathered second operand.
 *
 * In : rax = caller stack pointer of the public entry (table index is read
 *            from 8(%rax))
 *      rdx = power table base, r9 = num*8
 *      rsi = multiplicand, rcx = modulus (read at a 16-byte stride here)
 *      r8  = pointer to the reduction multiplier, rdi = result
 * Frame slots (offsets include the return address pushed by call):
 *      16+8(%rsp) = end-of-table pointer, 56+8(%rsp) = saved rdi,
 *      64+8(%rsp)... = temporary vector
 * Leaves through L$sqr4x_sub, which performs the final subtraction and
 * returns to the caller of mul4x_internal.
 */
.p2align	5
mul4x_internal:
	shlq	$5,%r9
	movl	8(%rax),%r10d
	leaq	256(%rdx,%r9,1),%r13		/* r13 = table + num*256 + 256 */
	shrq	$5,%r9
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	L$magic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%rdx,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	addq	$7,%r11
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7
	andq	$7,%r11

	movq	-96(%r12),%xmm0
	leaq	256(%r12),%r14
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
.byte	0x67
	por	%xmm1,%xmm0
	movq	-96(%r14),%xmm1
.byte	0x67
	pand	%xmm7,%xmm3
.byte	0x67
	por	%xmm2,%xmm0
	movq	-32(%r14),%xmm2
.byte	0x67
	pand	%xmm4,%xmm1
.byte	0x67
	por	%xmm3,%xmm0
	movq	32(%r14),%xmm3

.byte	102,72,15,126,195			/* movq %xmm0,%rbx */
	movq	96(%r14),%xmm0
	movq	%r13,16+8(%rsp)
	movq	%rdi,56+8(%rsp)

	movq	(%r8),%r8
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi		/* rsi = end of multiplicand */
	negq	%r9				/* r9 = -num*8, used as negative index */

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	pand	%xmm5,%xmm2
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	%r10,%rbp

	leaq	64+8(%rsp,%r11,8),%r14
	movq	%rdx,%r11

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	leaq	512(%r12),%r12
	por	%xmm1,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	L$1st4x

.p2align	5
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$1st4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195			/* movq %xmm0,%rbx */
	leaq	(%rcx,%r9,2),%rcx		/* rewind modulus pointer */

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi				/* rdi = top carry */
	movq	%r13,-8(%r14)

	jmp	L$outer4x

.p2align	5
L$outer4x:
	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3

	imulq	%r10,%rbp
.byte	0x67
	movq	%rdx,%r11
	movq	%rdi,(%r14)

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	(%r14,%r9,1),%r14
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	L$inner4x

.p2align	5
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$inner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-16(%rcx),%rbp			/* rbp = last modulus limb, kept for the compare below */
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195			/* movq %xmm0,%rbx */
	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,2),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12			/* all table stripes consumed? */
	jb	L$outer4x
	/*
	 * rdi becomes 0 when the top carry is set or the top result limb
	 * exceeds the top modulus limb, 1 otherwise; it offsets the modulus
	 * pointer by one qword for the subtraction loop.
	 */
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	xorq	$1,%rdi
	leaq	(%r14,%r9,1),%rbx
	leaq	(%rcx,%rdi,8),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx			/* rcx = -(num/4) */
	movq	56+8(%rsp),%rdi
	jmp	L$sqr4x_sub

/*
 * _bn_power5
 *
 * Register arguments as for _bn_mul_mont_gather5 (rdi result, rsi input,
 * rdx table, rcx modulus, r8 pointer to reduction multiplier, r9d limb
 * count, table index at 8(%rsp)). Runs __bn_sqr8x_internal five times and
 * then mul4x_internal with the gathered table entry. Returns rax = 1.
 * NOTE(review): there is no limb-count check here; the 8x code appears to
 * assume a multiple of 8 -- confirm that callers guarantee it.
 */
.globl	_bn_power5
.private_extern _bn_power5

.p2align	5
_bn_power5:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8

	/* same frame placement scheme as in bn_mul4x_mont_gather5 */
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$pwr_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	L$pwr_sp_done

.p2align	5
L$pwr_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
L$pwr_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)			/* 32(%rsp) = reduction multiplier */
	movq	%rax,40(%rsp)			/* 40(%rsp) = caller rsp */
L$power5_body:
.byte	102,72,15,110,207			/* movq %rdi,%xmm1 : result pointer  */
.byte	102,72,15,110,209			/* movq %rcx,%xmm2 : modulus pointer */
.byte	102,73,15,110,218			/* movq %r10,%xmm3 : -num*8          */
.byte	102,72,15,110,226			/* movq %rdx,%xmm4 : table pointer   */

	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

.byte	102,72,15,126,209			/* movq %xmm2,%rcx */
.byte	102,72,15,126,226			/* movq %xmm4,%rdx */
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8			/* mul4x_internal dereferences r8 */

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$power5_epilogue:
	.byte	0xf3,0xc3


/*
 * _bn_sqr8x_internal / __bn_sqr8x_internal
 *
 * In : rsi = input limbs, r9 = num*8, r10 = -num*8
 *      xmm1 = result pointer, xmm2 = modulus pointer, xmm3 = -num*8
 *      32+8(%rsp) = reduction multiplier
 * The double-width square is built in the frame starting at 48+8(%rsp):
 * first the cross products, then they are doubled and the squares of the
 * individual limbs are added (L$sqr4x_shift_n_add). Execution then falls
 * through into sqr8x_reduction.
 */
.globl	_bn_sqr8x_internal
.private_extern _bn_sqr8x_internal
.private_extern	_bn_sqr8x_internal

.p2align	5
_bn_sqr8x_internal:
__bn_sqr8x_internal:
	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi

	movq	%r9,%rcx

	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	L$sqr4x_1st

.p2align	5
L$sqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	L$sqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	L$sqr4x_outer

.p2align	5
L$sqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	L$sqr4x_inner

.p2align	5
L$sqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	L$sqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	L$sqr4x_outer

	/* last four input limbs */
	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	/*
	 * Shift-and-add phase: each stored limb is doubled (lea x,x*2 plus the
	 * bit shifted out of the previous limb) and the square of the matching
	 * input limb is added. r15 carries CF between groups: sbb saves it as
	 * 0/-1 and neg turns it back into CF. rcx is zero here (the multiply
	 * loops above exit with rcx == 0).
	 */
	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	L$sqr4x_shift_n_add

.p2align	5
L$sqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	L$sqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213			/* movq %xmm2,%rbp : modulus pointer */
/*
 * sqr8x_reduction: reduction of the double-width value held in the frame.
 *
 * In : rbp = modulus (limbs read at a 16-byte stride), r9 = num*8
 *      xmm1 = result pointer, xmm2 = modulus pointer, xmm3 = -num*8
 *      32+8(%rsp) = reduction multiplier
 * Frame slots: 0+8(%rsp) = end of modulus, 8+8(%rsp) = end of the
 * double-width value (the top carry is stored there).
 * Falls into L$sqr4x_sub for the final subtraction and returns from there.
 */
sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%rbp,%r9,2),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	L$8x_reduction_loop

.p2align	5
L$8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)			/* store top-most carry */
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	L$8x_reduce

.p2align	5
L$8x_reduce:
	mulq	%rbx
	movq	16(%rbp),%rax
	negq	%r8				/* CF = (r8 != 0); the low limb itself is discarded */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	/* remember multiplier for the tail */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi			/* next multiplier */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_reduce

	leaq	128(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	L$8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi			/* rsi = -carry */

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	L$8x_tail

.p2align	5
L$8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_tail

	leaq	128(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	L$8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi				/* restore saved carry into CF */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	L$8x_tail

.p2align	5
L$8x_tail_done:
	/*
	 * Fold in the top-most carry stored at (%rdx). This addition can
	 * itself carry out of r15, so rax is cleared first and the carry is
	 * captured in it; clearing rax after the chain would silently drop
	 * that carry (the defect behind CVE-2017-3732).
	 */
	xorq	%rax,%rax
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi				/* restore saved carry into CF */
L$8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax				/* rax = top-most carry */
	movq	-16(%rbp),%rcx			/* rcx = last modulus limb */
	xorq	%rsi,%rsi

.byte	102,72,15,126,213			/* movq %xmm2,%rbp : rewind modulus */

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217			/* movq %xmm3,%r9 : r9 = -num*8 */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	L$8x_reduction_loop

	/*
	 * rax becomes 0 when the top carry is set or the top result limb
	 * exceeds the top modulus limb, 1 otherwise; it offsets the modulus
	 * pointer by one qword for the subtraction loop.
	 */
	subq	%r15,%rcx
	leaq	(%rdi,%r9,1),%rbx
	adcq	%rsi,%rsi
	movq	%r9,%rcx
	orq	%rsi,%rax
.byte	102,72,15,126,207			/* movq %xmm1,%rdi : result pointer */
	xorq	$1,%rax
.byte	102,72,15,126,206			/* movq %xmm1,%rsi */
	leaq	(%rbp,%rax,8),%rbp
	sarq	$3+2,%rcx			/* rcx = -(num/4) */
	jmp	L$sqr4x_sub

/*
 * L$sqr4x_sub: shared final subtraction, four limbs per iteration.
 *
 * In : rbx = value to reduce, rbp = subtrahend (read at a 16-byte stride),
 *      rdi = destination, rcx = negative iteration count, r9 = -num*8.
 * The sbb chain relies on CF being clear on entry: the preceding sar
 * leaves the last bit shifted out in CF, which is zero when num is a
 * multiple of 8 (assumed here; see the callers). inc and lea leave CF
 * alone, so the borrow propagates across iterations.
 * Out: r10 = -num*8, r9 = num*8. Returns to the caller of the internal
 * routine that jumped here.
 */
.p2align	5
L$sqr4x_sub:
.byte	0x66
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	sbbq	0(%rbp),%r12
	movq	16(%rbx),%r14
	sbbq	16(%rbp),%r13
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	32(%rbp),%r14
	movq	%r12,0(%rdi)
	sbbq	48(%rbp),%r15
	leaq	64(%rbp),%rbp
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	L$sqr4x_sub
	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3

/*
 * _bn_from_montgomery
 *
 * In : rdi = result, rsi = input, rcx = modulus, r8 = pointer to the
 *      reduction multiplier, r9d = limb count (rdx is not read here).
 * Out: eax = 0 without doing any work when the limb count is not a
 *      multiple of 8; otherwise rax = 1 after the 8x reduction.
 */
.globl	_bn_from_montgomery
.private_extern _bn_from_montgomery

.p2align	5
_bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3



.p2align	5
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8

	/* same frame placement scheme as in bn_mul4x_mont_gather5 */
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$from_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	L$from_sp_done

.p2align	5
L$from_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
L$from_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
L$from_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	L$mul_by_1

.p2align	5
L$mul_by_1:
	/* copy the input into the low half of the frame, zero the high half */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi (long encoding) */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	L$mul_by_1

.byte	102,72,15,110,207			/* movq %rdi,%xmm1 */
.byte	102,72,15,110,209			/* movq %rcx,%xmm2 */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218			/* movq %r10,%xmm3 */
	call	sqr8x_reduction

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	L$from_mont_zero

.p2align	5
L$from_mont_zero:
	/* wipe the temporary: 64 bytes per iteration, 2*num*8 bytes in total */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	L$from_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$from_epilogue:
	.byte	0xf3,0xc3

/*
 * _bn_scatter5
 *
 * In : rdi = source limbs, esi = limb count, rdx = table base,
 *      rcx = table index.
 * Stores limb i at table + idx*8 + i*256. Does nothing when esi is zero.
 */
.globl	_bn_scatter5
.private_extern _bn_scatter5

.p2align	4
_bn_scatter5:
	cmpl	$0,%esi
	jz	L$scatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx
L$scatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subl	$1,%esi
	jnz	L$scatter
L$scatter_epilogue:
	.byte	0xf3,0xc3


/*
 * _bn_gather5
 *
 * In : rdi = destination limbs, esi = limb count, rdx = table base,
 *      ecx = table index.
 * Reads one entry back out of the table with the masked four-row gather
 * described at the top of the file.
 * NOTE(review): unlike _bn_scatter5 there is no zero-count guard; with
 * esi == 0 the loop would run 2^32 times -- confirm callers never pass 0.
 */
.globl	_bn_gather5
.private_extern _bn_gather5

.p2align	4
_bn_gather5:
	movl	%ecx,%r11d
	shrl	$3,%ecx
	andq	$7,%r11
	notl	%ecx
	leaq	L$magic_masks(%rip),%rax
	andl	$3,%ecx
	leaq	128(%rdx,%r11,8),%rdx
	movq	0(%rax,%rcx,8),%xmm4
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	L$gather
.p2align	4
L$gather:
	movq	-128(%rdx),%xmm0
	movq	-64(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	0(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	64(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
.byte	0x67,0x67
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	L$gather
	.byte	0xf3,0xc3
L$SEH_end_bn_gather5:

/*
 * Eight qwords, only the fourth is all-ones. Loading four consecutive
 * qwords starting at index (~(idx>>3) & 3) yields four masks of which
 * exactly the one for row idx>>3 is all-ones.
 */
.p2align	6
L$magic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0, 0,0
/* "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif