1# This file is generated from a similarly-named Perl script in the BoringSSL 2# source tree. Do not edit by hand. 3 4#if defined(__has_feature) 5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 6#define OPENSSL_NO_ASM 7#endif 8#endif 9 10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) 11.text 12 13 14 15.globl _GFp_bn_mul_mont_gather5 16.private_extern _GFp_bn_mul_mont_gather5 17 18.p2align 6 19_GFp_bn_mul_mont_gather5: 20 21 movl %r9d,%r9d 22 movq %rsp,%rax 23 24 testl $7,%r9d 25 jnz L$mul_enter 26 leaq _GFp_ia32cap_P(%rip),%r11 27 movl 8(%r11),%r11d 28 jmp L$mul4x_enter 29 30.p2align 4 31L$mul_enter: 32 movd 8(%rsp),%xmm5 33 pushq %rbx 34 35 pushq %rbp 36 37 pushq %r12 38 39 pushq %r13 40 41 pushq %r14 42 43 pushq %r15 44 45 46 negq %r9 47 movq %rsp,%r11 48 leaq -280(%rsp,%r9,8),%r10 49 negq %r9 50 andq $-1024,%r10 51 52 53 54 55 56 57 58 59 60 subq %r10,%r11 61 andq $-4096,%r11 62 leaq (%r10,%r11,1),%rsp 63 movq (%rsp),%r11 64 cmpq %r10,%rsp 65 ja L$mul_page_walk 66 jmp L$mul_page_walk_done 67 68L$mul_page_walk: 69 leaq -4096(%rsp),%rsp 70 movq (%rsp),%r11 71 cmpq %r10,%rsp 72 ja L$mul_page_walk 73L$mul_page_walk_done: 74 75 leaq L$inc(%rip),%r10 76 movq %rax,8(%rsp,%r9,8) 77 78L$mul_body: 79 80 leaq 128(%rdx),%r12 81 movdqa 0(%r10),%xmm0 82 movdqa 16(%r10),%xmm1 83 leaq 24-112(%rsp,%r9,8),%r10 84 andq $-16,%r10 85 86 pshufd $0,%xmm5,%xmm5 87 movdqa %xmm1,%xmm4 88 movdqa %xmm1,%xmm2 89 paddd %xmm0,%xmm1 90 pcmpeqd %xmm5,%xmm0 91.byte 0x67 92 movdqa %xmm4,%xmm3 93 paddd %xmm1,%xmm2 94 pcmpeqd %xmm5,%xmm1 95 movdqa %xmm0,112(%r10) 96 movdqa %xmm4,%xmm0 97 98 paddd %xmm2,%xmm3 99 pcmpeqd %xmm5,%xmm2 100 movdqa %xmm1,128(%r10) 101 movdqa %xmm4,%xmm1 102 103 paddd %xmm3,%xmm0 104 pcmpeqd %xmm5,%xmm3 105 movdqa %xmm2,144(%r10) 106 movdqa %xmm4,%xmm2 107 108 paddd %xmm0,%xmm1 109 pcmpeqd %xmm5,%xmm0 110 movdqa %xmm3,160(%r10) 111 movdqa %xmm4,%xmm3 112 paddd %xmm1,%xmm2 113 pcmpeqd %xmm5,%xmm1 114 movdqa %xmm0,176(%r10) 115 movdqa %xmm4,%xmm0 116 117 paddd %xmm2,%xmm3 118 pcmpeqd %xmm5,%xmm2 119 movdqa %xmm1,192(%r10) 120 movdqa %xmm4,%xmm1 121 122 paddd %xmm3,%xmm0 123 pcmpeqd %xmm5,%xmm3 124 movdqa %xmm2,208(%r10) 125 movdqa %xmm4,%xmm2 126 127 paddd %xmm0,%xmm1 128 pcmpeqd %xmm5,%xmm0 129 movdqa %xmm3,224(%r10) 130 movdqa %xmm4,%xmm3 131 paddd %xmm1,%xmm2 132 pcmpeqd %xmm5,%xmm1 133 movdqa %xmm0,240(%r10) 134 movdqa %xmm4,%xmm0 135 136 paddd %xmm2,%xmm3 137 pcmpeqd %xmm5,%xmm2 138 movdqa %xmm1,256(%r10) 139 movdqa %xmm4,%xmm1 140 141 paddd %xmm3,%xmm0 142 pcmpeqd %xmm5,%xmm3 143 movdqa %xmm2,272(%r10) 144 movdqa %xmm4,%xmm2 145 146 paddd %xmm0,%xmm1 147 pcmpeqd %xmm5,%xmm0 148 movdqa %xmm3,288(%r10) 149 movdqa %xmm4,%xmm3 150 paddd %xmm1,%xmm2 151 pcmpeqd %xmm5,%xmm1 152 movdqa %xmm0,304(%r10) 153 154 paddd %xmm2,%xmm3 155.byte 0x67 156 pcmpeqd %xmm5,%xmm2 157 movdqa %xmm1,320(%r10) 158 159 pcmpeqd %xmm5,%xmm3 160 movdqa %xmm2,336(%r10) 161 pand 64(%r12),%xmm0 162 163 pand 80(%r12),%xmm1 164 pand 96(%r12),%xmm2 165 movdqa %xmm3,352(%r10) 166 pand 112(%r12),%xmm3 167 por %xmm2,%xmm0 168 por %xmm3,%xmm1 169 movdqa -128(%r12),%xmm4 170 movdqa -112(%r12),%xmm5 171 movdqa -96(%r12),%xmm2 172 pand 112(%r10),%xmm4 173 movdqa -80(%r12),%xmm3 174 pand 128(%r10),%xmm5 175 por %xmm4,%xmm0 176 pand 144(%r10),%xmm2 177 por %xmm5,%xmm1 178 pand 160(%r10),%xmm3 179 por %xmm2,%xmm0 180 por %xmm3,%xmm1 181 movdqa -64(%r12),%xmm4 182 movdqa -48(%r12),%xmm5 183 movdqa -32(%r12),%xmm2 184 pand 176(%r10),%xmm4 185 movdqa -16(%r12),%xmm3 186 pand 192(%r10),%xmm5 187 por %xmm4,%xmm0 188 pand 
208(%r10),%xmm2 189 por %xmm5,%xmm1 190 pand 224(%r10),%xmm3 191 por %xmm2,%xmm0 192 por %xmm3,%xmm1 193 movdqa 0(%r12),%xmm4 194 movdqa 16(%r12),%xmm5 195 movdqa 32(%r12),%xmm2 196 pand 240(%r10),%xmm4 197 movdqa 48(%r12),%xmm3 198 pand 256(%r10),%xmm5 199 por %xmm4,%xmm0 200 pand 272(%r10),%xmm2 201 por %xmm5,%xmm1 202 pand 288(%r10),%xmm3 203 por %xmm2,%xmm0 204 por %xmm3,%xmm1 205 por %xmm1,%xmm0 206 pshufd $0x4e,%xmm0,%xmm1 207 por %xmm1,%xmm0 208 leaq 256(%r12),%r12 209.byte 102,72,15,126,195 210 211 movq (%r8),%r8 212 movq (%rsi),%rax 213 214 xorq %r14,%r14 215 xorq %r15,%r15 216 217 movq %r8,%rbp 218 mulq %rbx 219 movq %rax,%r10 220 movq (%rcx),%rax 221 222 imulq %r10,%rbp 223 movq %rdx,%r11 224 225 mulq %rbp 226 addq %rax,%r10 227 movq 8(%rsi),%rax 228 adcq $0,%rdx 229 movq %rdx,%r13 230 231 leaq 1(%r15),%r15 232 jmp L$1st_enter 233 234.p2align 4 235L$1st: 236 addq %rax,%r13 237 movq (%rsi,%r15,8),%rax 238 adcq $0,%rdx 239 addq %r11,%r13 240 movq %r10,%r11 241 adcq $0,%rdx 242 movq %r13,-16(%rsp,%r15,8) 243 movq %rdx,%r13 244 245L$1st_enter: 246 mulq %rbx 247 addq %rax,%r11 248 movq (%rcx,%r15,8),%rax 249 adcq $0,%rdx 250 leaq 1(%r15),%r15 251 movq %rdx,%r10 252 253 mulq %rbp 254 cmpq %r9,%r15 255 jne L$1st 256 257 258 addq %rax,%r13 259 adcq $0,%rdx 260 addq %r11,%r13 261 adcq $0,%rdx 262 movq %r13,-16(%rsp,%r9,8) 263 movq %rdx,%r13 264 movq %r10,%r11 265 266 xorq %rdx,%rdx 267 addq %r11,%r13 268 adcq $0,%rdx 269 movq %r13,-8(%rsp,%r9,8) 270 movq %rdx,(%rsp,%r9,8) 271 272 leaq 1(%r14),%r14 273 jmp L$outer 274.p2align 4 275L$outer: 276 leaq 24+128(%rsp,%r9,8),%rdx 277 andq $-16,%rdx 278 pxor %xmm4,%xmm4 279 pxor %xmm5,%xmm5 280 movdqa -128(%r12),%xmm0 281 movdqa -112(%r12),%xmm1 282 movdqa -96(%r12),%xmm2 283 movdqa -80(%r12),%xmm3 284 pand -128(%rdx),%xmm0 285 pand -112(%rdx),%xmm1 286 por %xmm0,%xmm4 287 pand -96(%rdx),%xmm2 288 por %xmm1,%xmm5 289 pand -80(%rdx),%xmm3 290 por %xmm2,%xmm4 291 por %xmm3,%xmm5 292 movdqa -64(%r12),%xmm0 293 movdqa -48(%r12),%xmm1 294 movdqa -32(%r12),%xmm2 295 movdqa -16(%r12),%xmm3 296 pand -64(%rdx),%xmm0 297 pand -48(%rdx),%xmm1 298 por %xmm0,%xmm4 299 pand -32(%rdx),%xmm2 300 por %xmm1,%xmm5 301 pand -16(%rdx),%xmm3 302 por %xmm2,%xmm4 303 por %xmm3,%xmm5 304 movdqa 0(%r12),%xmm0 305 movdqa 16(%r12),%xmm1 306 movdqa 32(%r12),%xmm2 307 movdqa 48(%r12),%xmm3 308 pand 0(%rdx),%xmm0 309 pand 16(%rdx),%xmm1 310 por %xmm0,%xmm4 311 pand 32(%rdx),%xmm2 312 por %xmm1,%xmm5 313 pand 48(%rdx),%xmm3 314 por %xmm2,%xmm4 315 por %xmm3,%xmm5 316 movdqa 64(%r12),%xmm0 317 movdqa 80(%r12),%xmm1 318 movdqa 96(%r12),%xmm2 319 movdqa 112(%r12),%xmm3 320 pand 64(%rdx),%xmm0 321 pand 80(%rdx),%xmm1 322 por %xmm0,%xmm4 323 pand 96(%rdx),%xmm2 324 por %xmm1,%xmm5 325 pand 112(%rdx),%xmm3 326 por %xmm2,%xmm4 327 por %xmm3,%xmm5 328 por %xmm5,%xmm4 329 pshufd $0x4e,%xmm4,%xmm0 330 por %xmm4,%xmm0 331 leaq 256(%r12),%r12 332 333 movq (%rsi),%rax 334.byte 102,72,15,126,195 335 336 xorq %r15,%r15 337 movq %r8,%rbp 338 movq (%rsp),%r10 339 340 mulq %rbx 341 addq %rax,%r10 342 movq (%rcx),%rax 343 adcq $0,%rdx 344 345 imulq %r10,%rbp 346 movq %rdx,%r11 347 348 mulq %rbp 349 addq %rax,%r10 350 movq 8(%rsi),%rax 351 adcq $0,%rdx 352 movq 8(%rsp),%r10 353 movq %rdx,%r13 354 355 leaq 1(%r15),%r15 356 jmp L$inner_enter 357 358.p2align 4 359L$inner: 360 addq %rax,%r13 361 movq (%rsi,%r15,8),%rax 362 adcq $0,%rdx 363 addq %r10,%r13 364 movq (%rsp,%r15,8),%r10 365 adcq $0,%rdx 366 movq %r13,-16(%rsp,%r15,8) 367 movq %rdx,%r13 368 369L$inner_enter: 370 mulq %rbx 371 addq %rax,%r11 
372 movq (%rcx,%r15,8),%rax 373 adcq $0,%rdx 374 addq %r11,%r10 375 movq %rdx,%r11 376 adcq $0,%r11 377 leaq 1(%r15),%r15 378 379 mulq %rbp 380 cmpq %r9,%r15 381 jne L$inner 382 383 addq %rax,%r13 384 adcq $0,%rdx 385 addq %r10,%r13 386 movq (%rsp,%r9,8),%r10 387 adcq $0,%rdx 388 movq %r13,-16(%rsp,%r9,8) 389 movq %rdx,%r13 390 391 xorq %rdx,%rdx 392 addq %r11,%r13 393 adcq $0,%rdx 394 addq %r10,%r13 395 adcq $0,%rdx 396 movq %r13,-8(%rsp,%r9,8) 397 movq %rdx,(%rsp,%r9,8) 398 399 leaq 1(%r14),%r14 400 cmpq %r9,%r14 401 jb L$outer 402 403 xorq %r14,%r14 404 movq (%rsp),%rax 405 leaq (%rsp),%rsi 406 movq %r9,%r15 407 jmp L$sub 408.p2align 4 409L$sub: sbbq (%rcx,%r14,8),%rax 410 movq %rax,(%rdi,%r14,8) 411 movq 8(%rsi,%r14,8),%rax 412 leaq 1(%r14),%r14 413 decq %r15 414 jnz L$sub 415 416 sbbq $0,%rax 417 movq $-1,%rbx 418 xorq %rax,%rbx 419 xorq %r14,%r14 420 movq %r9,%r15 421 422L$copy: 423 movq (%rdi,%r14,8),%rcx 424 movq (%rsp,%r14,8),%rdx 425 andq %rbx,%rcx 426 andq %rax,%rdx 427 movq %r14,(%rsp,%r14,8) 428 orq %rcx,%rdx 429 movq %rdx,(%rdi,%r14,8) 430 leaq 1(%r14),%r14 431 subq $1,%r15 432 jnz L$copy 433 434 movq 8(%rsp,%r9,8),%rsi 435 436 movq $1,%rax 437 438 movq -48(%rsi),%r15 439 440 movq -40(%rsi),%r14 441 442 movq -32(%rsi),%r13 443 444 movq -24(%rsi),%r12 445 446 movq -16(%rsi),%rbp 447 448 movq -8(%rsi),%rbx 449 450 leaq (%rsi),%rsp 451 452L$mul_epilogue: 453 .byte 0xf3,0xc3 454 455 456 457.p2align 5 458bn_mul4x_mont_gather5: 459 460.byte 0x67 461 movq %rsp,%rax 462 463L$mul4x_enter: 464 andl $0x80108,%r11d 465 cmpl $0x80108,%r11d 466 je L$mulx4x_enter 467 pushq %rbx 468 469 pushq %rbp 470 471 pushq %r12 472 473 pushq %r13 474 475 pushq %r14 476 477 pushq %r15 478 479L$mul4x_prologue: 480 481.byte 0x67 482 shll $3,%r9d 483 leaq (%r9,%r9,2),%r10 484 negq %r9 485 486 487 488 489 490 491 492 493 494 495 leaq -320(%rsp,%r9,2),%r11 496 movq %rsp,%rbp 497 subq %rdi,%r11 498 andq $4095,%r11 499 cmpq %r11,%r10 500 jb L$mul4xsp_alt 501 subq %r11,%rbp 502 leaq -320(%rbp,%r9,2),%rbp 503 jmp L$mul4xsp_done 504 505.p2align 5 506L$mul4xsp_alt: 507 leaq 4096-320(,%r9,2),%r10 508 leaq -320(%rbp,%r9,2),%rbp 509 subq %r10,%r11 510 movq $0,%r10 511 cmovcq %r10,%r11 512 subq %r11,%rbp 513L$mul4xsp_done: 514 andq $-64,%rbp 515 movq %rsp,%r11 516 subq %rbp,%r11 517 andq $-4096,%r11 518 leaq (%r11,%rbp,1),%rsp 519 movq (%rsp),%r10 520 cmpq %rbp,%rsp 521 ja L$mul4x_page_walk 522 jmp L$mul4x_page_walk_done 523 524L$mul4x_page_walk: 525 leaq -4096(%rsp),%rsp 526 movq (%rsp),%r10 527 cmpq %rbp,%rsp 528 ja L$mul4x_page_walk 529L$mul4x_page_walk_done: 530 531 negq %r9 532 533 movq %rax,40(%rsp) 534 535L$mul4x_body: 536 537 call mul4x_internal 538 539 movq 40(%rsp),%rsi 540 541 movq $1,%rax 542 543 movq -48(%rsi),%r15 544 545 movq -40(%rsi),%r14 546 547 movq -32(%rsi),%r13 548 549 movq -24(%rsi),%r12 550 551 movq -16(%rsi),%rbp 552 553 movq -8(%rsi),%rbx 554 555 leaq (%rsi),%rsp 556 557L$mul4x_epilogue: 558 .byte 0xf3,0xc3 559 560 561 562 563.p2align 5 564mul4x_internal: 565 566 shlq $5,%r9 567 movd 8(%rax),%xmm5 568 leaq L$inc(%rip),%rax 569 leaq 128(%rdx,%r9,1),%r13 570 shrq $5,%r9 571 movdqa 0(%rax),%xmm0 572 movdqa 16(%rax),%xmm1 573 leaq 88-112(%rsp,%r9,1),%r10 574 leaq 128(%rdx),%r12 575 576 pshufd $0,%xmm5,%xmm5 577 movdqa %xmm1,%xmm4 578.byte 0x67,0x67 579 movdqa %xmm1,%xmm2 580 paddd %xmm0,%xmm1 581 pcmpeqd %xmm5,%xmm0 582.byte 0x67 583 movdqa %xmm4,%xmm3 584 paddd %xmm1,%xmm2 585 pcmpeqd %xmm5,%xmm1 586 movdqa %xmm0,112(%r10) 587 movdqa %xmm4,%xmm0 588 589 paddd %xmm2,%xmm3 590 pcmpeqd %xmm5,%xmm2 
591 movdqa %xmm1,128(%r10) 592 movdqa %xmm4,%xmm1 593 594 paddd %xmm3,%xmm0 595 pcmpeqd %xmm5,%xmm3 596 movdqa %xmm2,144(%r10) 597 movdqa %xmm4,%xmm2 598 599 paddd %xmm0,%xmm1 600 pcmpeqd %xmm5,%xmm0 601 movdqa %xmm3,160(%r10) 602 movdqa %xmm4,%xmm3 603 paddd %xmm1,%xmm2 604 pcmpeqd %xmm5,%xmm1 605 movdqa %xmm0,176(%r10) 606 movdqa %xmm4,%xmm0 607 608 paddd %xmm2,%xmm3 609 pcmpeqd %xmm5,%xmm2 610 movdqa %xmm1,192(%r10) 611 movdqa %xmm4,%xmm1 612 613 paddd %xmm3,%xmm0 614 pcmpeqd %xmm5,%xmm3 615 movdqa %xmm2,208(%r10) 616 movdqa %xmm4,%xmm2 617 618 paddd %xmm0,%xmm1 619 pcmpeqd %xmm5,%xmm0 620 movdqa %xmm3,224(%r10) 621 movdqa %xmm4,%xmm3 622 paddd %xmm1,%xmm2 623 pcmpeqd %xmm5,%xmm1 624 movdqa %xmm0,240(%r10) 625 movdqa %xmm4,%xmm0 626 627 paddd %xmm2,%xmm3 628 pcmpeqd %xmm5,%xmm2 629 movdqa %xmm1,256(%r10) 630 movdqa %xmm4,%xmm1 631 632 paddd %xmm3,%xmm0 633 pcmpeqd %xmm5,%xmm3 634 movdqa %xmm2,272(%r10) 635 movdqa %xmm4,%xmm2 636 637 paddd %xmm0,%xmm1 638 pcmpeqd %xmm5,%xmm0 639 movdqa %xmm3,288(%r10) 640 movdqa %xmm4,%xmm3 641 paddd %xmm1,%xmm2 642 pcmpeqd %xmm5,%xmm1 643 movdqa %xmm0,304(%r10) 644 645 paddd %xmm2,%xmm3 646.byte 0x67 647 pcmpeqd %xmm5,%xmm2 648 movdqa %xmm1,320(%r10) 649 650 pcmpeqd %xmm5,%xmm3 651 movdqa %xmm2,336(%r10) 652 pand 64(%r12),%xmm0 653 654 pand 80(%r12),%xmm1 655 pand 96(%r12),%xmm2 656 movdqa %xmm3,352(%r10) 657 pand 112(%r12),%xmm3 658 por %xmm2,%xmm0 659 por %xmm3,%xmm1 660 movdqa -128(%r12),%xmm4 661 movdqa -112(%r12),%xmm5 662 movdqa -96(%r12),%xmm2 663 pand 112(%r10),%xmm4 664 movdqa -80(%r12),%xmm3 665 pand 128(%r10),%xmm5 666 por %xmm4,%xmm0 667 pand 144(%r10),%xmm2 668 por %xmm5,%xmm1 669 pand 160(%r10),%xmm3 670 por %xmm2,%xmm0 671 por %xmm3,%xmm1 672 movdqa -64(%r12),%xmm4 673 movdqa -48(%r12),%xmm5 674 movdqa -32(%r12),%xmm2 675 pand 176(%r10),%xmm4 676 movdqa -16(%r12),%xmm3 677 pand 192(%r10),%xmm5 678 por %xmm4,%xmm0 679 pand 208(%r10),%xmm2 680 por %xmm5,%xmm1 681 pand 224(%r10),%xmm3 682 por %xmm2,%xmm0 683 por %xmm3,%xmm1 684 movdqa 0(%r12),%xmm4 685 movdqa 16(%r12),%xmm5 686 movdqa 32(%r12),%xmm2 687 pand 240(%r10),%xmm4 688 movdqa 48(%r12),%xmm3 689 pand 256(%r10),%xmm5 690 por %xmm4,%xmm0 691 pand 272(%r10),%xmm2 692 por %xmm5,%xmm1 693 pand 288(%r10),%xmm3 694 por %xmm2,%xmm0 695 por %xmm3,%xmm1 696 por %xmm1,%xmm0 697 pshufd $0x4e,%xmm0,%xmm1 698 por %xmm1,%xmm0 699 leaq 256(%r12),%r12 700.byte 102,72,15,126,195 701 702 movq %r13,16+8(%rsp) 703 movq %rdi,56+8(%rsp) 704 705 movq (%r8),%r8 706 movq (%rsi),%rax 707 leaq (%rsi,%r9,1),%rsi 708 negq %r9 709 710 movq %r8,%rbp 711 mulq %rbx 712 movq %rax,%r10 713 movq (%rcx),%rax 714 715 imulq %r10,%rbp 716 leaq 64+8(%rsp),%r14 717 movq %rdx,%r11 718 719 mulq %rbp 720 addq %rax,%r10 721 movq 8(%rsi,%r9,1),%rax 722 adcq $0,%rdx 723 movq %rdx,%rdi 724 725 mulq %rbx 726 addq %rax,%r11 727 movq 8(%rcx),%rax 728 adcq $0,%rdx 729 movq %rdx,%r10 730 731 mulq %rbp 732 addq %rax,%rdi 733 movq 16(%rsi,%r9,1),%rax 734 adcq $0,%rdx 735 addq %r11,%rdi 736 leaq 32(%r9),%r15 737 leaq 32(%rcx),%rcx 738 adcq $0,%rdx 739 movq %rdi,(%r14) 740 movq %rdx,%r13 741 jmp L$1st4x 742 743.p2align 5 744L$1st4x: 745 mulq %rbx 746 addq %rax,%r10 747 movq -16(%rcx),%rax 748 leaq 32(%r14),%r14 749 adcq $0,%rdx 750 movq %rdx,%r11 751 752 mulq %rbp 753 addq %rax,%r13 754 movq -8(%rsi,%r15,1),%rax 755 adcq $0,%rdx 756 addq %r10,%r13 757 adcq $0,%rdx 758 movq %r13,-24(%r14) 759 movq %rdx,%rdi 760 761 mulq %rbx 762 addq %rax,%r11 763 movq -8(%rcx),%rax 764 adcq $0,%rdx 765 movq %rdx,%r10 766 767 mulq %rbp 768 addq %rax,%rdi 769 
movq (%rsi,%r15,1),%rax 770 adcq $0,%rdx 771 addq %r11,%rdi 772 adcq $0,%rdx 773 movq %rdi,-16(%r14) 774 movq %rdx,%r13 775 776 mulq %rbx 777 addq %rax,%r10 778 movq 0(%rcx),%rax 779 adcq $0,%rdx 780 movq %rdx,%r11 781 782 mulq %rbp 783 addq %rax,%r13 784 movq 8(%rsi,%r15,1),%rax 785 adcq $0,%rdx 786 addq %r10,%r13 787 adcq $0,%rdx 788 movq %r13,-8(%r14) 789 movq %rdx,%rdi 790 791 mulq %rbx 792 addq %rax,%r11 793 movq 8(%rcx),%rax 794 adcq $0,%rdx 795 movq %rdx,%r10 796 797 mulq %rbp 798 addq %rax,%rdi 799 movq 16(%rsi,%r15,1),%rax 800 adcq $0,%rdx 801 addq %r11,%rdi 802 leaq 32(%rcx),%rcx 803 adcq $0,%rdx 804 movq %rdi,(%r14) 805 movq %rdx,%r13 806 807 addq $32,%r15 808 jnz L$1st4x 809 810 mulq %rbx 811 addq %rax,%r10 812 movq -16(%rcx),%rax 813 leaq 32(%r14),%r14 814 adcq $0,%rdx 815 movq %rdx,%r11 816 817 mulq %rbp 818 addq %rax,%r13 819 movq -8(%rsi),%rax 820 adcq $0,%rdx 821 addq %r10,%r13 822 adcq $0,%rdx 823 movq %r13,-24(%r14) 824 movq %rdx,%rdi 825 826 mulq %rbx 827 addq %rax,%r11 828 movq -8(%rcx),%rax 829 adcq $0,%rdx 830 movq %rdx,%r10 831 832 mulq %rbp 833 addq %rax,%rdi 834 movq (%rsi,%r9,1),%rax 835 adcq $0,%rdx 836 addq %r11,%rdi 837 adcq $0,%rdx 838 movq %rdi,-16(%r14) 839 movq %rdx,%r13 840 841 leaq (%rcx,%r9,1),%rcx 842 843 xorq %rdi,%rdi 844 addq %r10,%r13 845 adcq $0,%rdi 846 movq %r13,-8(%r14) 847 848 jmp L$outer4x 849 850.p2align 5 851L$outer4x: 852 leaq 16+128(%r14),%rdx 853 pxor %xmm4,%xmm4 854 pxor %xmm5,%xmm5 855 movdqa -128(%r12),%xmm0 856 movdqa -112(%r12),%xmm1 857 movdqa -96(%r12),%xmm2 858 movdqa -80(%r12),%xmm3 859 pand -128(%rdx),%xmm0 860 pand -112(%rdx),%xmm1 861 por %xmm0,%xmm4 862 pand -96(%rdx),%xmm2 863 por %xmm1,%xmm5 864 pand -80(%rdx),%xmm3 865 por %xmm2,%xmm4 866 por %xmm3,%xmm5 867 movdqa -64(%r12),%xmm0 868 movdqa -48(%r12),%xmm1 869 movdqa -32(%r12),%xmm2 870 movdqa -16(%r12),%xmm3 871 pand -64(%rdx),%xmm0 872 pand -48(%rdx),%xmm1 873 por %xmm0,%xmm4 874 pand -32(%rdx),%xmm2 875 por %xmm1,%xmm5 876 pand -16(%rdx),%xmm3 877 por %xmm2,%xmm4 878 por %xmm3,%xmm5 879 movdqa 0(%r12),%xmm0 880 movdqa 16(%r12),%xmm1 881 movdqa 32(%r12),%xmm2 882 movdqa 48(%r12),%xmm3 883 pand 0(%rdx),%xmm0 884 pand 16(%rdx),%xmm1 885 por %xmm0,%xmm4 886 pand 32(%rdx),%xmm2 887 por %xmm1,%xmm5 888 pand 48(%rdx),%xmm3 889 por %xmm2,%xmm4 890 por %xmm3,%xmm5 891 movdqa 64(%r12),%xmm0 892 movdqa 80(%r12),%xmm1 893 movdqa 96(%r12),%xmm2 894 movdqa 112(%r12),%xmm3 895 pand 64(%rdx),%xmm0 896 pand 80(%rdx),%xmm1 897 por %xmm0,%xmm4 898 pand 96(%rdx),%xmm2 899 por %xmm1,%xmm5 900 pand 112(%rdx),%xmm3 901 por %xmm2,%xmm4 902 por %xmm3,%xmm5 903 por %xmm5,%xmm4 904 pshufd $0x4e,%xmm4,%xmm0 905 por %xmm4,%xmm0 906 leaq 256(%r12),%r12 907.byte 102,72,15,126,195 908 909 movq (%r14,%r9,1),%r10 910 movq %r8,%rbp 911 mulq %rbx 912 addq %rax,%r10 913 movq (%rcx),%rax 914 adcq $0,%rdx 915 916 imulq %r10,%rbp 917 movq %rdx,%r11 918 movq %rdi,(%r14) 919 920 leaq (%r14,%r9,1),%r14 921 922 mulq %rbp 923 addq %rax,%r10 924 movq 8(%rsi,%r9,1),%rax 925 adcq $0,%rdx 926 movq %rdx,%rdi 927 928 mulq %rbx 929 addq %rax,%r11 930 movq 8(%rcx),%rax 931 adcq $0,%rdx 932 addq 8(%r14),%r11 933 adcq $0,%rdx 934 movq %rdx,%r10 935 936 mulq %rbp 937 addq %rax,%rdi 938 movq 16(%rsi,%r9,1),%rax 939 adcq $0,%rdx 940 addq %r11,%rdi 941 leaq 32(%r9),%r15 942 leaq 32(%rcx),%rcx 943 adcq $0,%rdx 944 movq %rdx,%r13 945 jmp L$inner4x 946 947.p2align 5 948L$inner4x: 949 mulq %rbx 950 addq %rax,%r10 951 movq -16(%rcx),%rax 952 adcq $0,%rdx 953 addq 16(%r14),%r10 954 leaq 32(%r14),%r14 955 adcq $0,%rdx 956 movq 
%rdx,%r11 957 958 mulq %rbp 959 addq %rax,%r13 960 movq -8(%rsi,%r15,1),%rax 961 adcq $0,%rdx 962 addq %r10,%r13 963 adcq $0,%rdx 964 movq %rdi,-32(%r14) 965 movq %rdx,%rdi 966 967 mulq %rbx 968 addq %rax,%r11 969 movq -8(%rcx),%rax 970 adcq $0,%rdx 971 addq -8(%r14),%r11 972 adcq $0,%rdx 973 movq %rdx,%r10 974 975 mulq %rbp 976 addq %rax,%rdi 977 movq (%rsi,%r15,1),%rax 978 adcq $0,%rdx 979 addq %r11,%rdi 980 adcq $0,%rdx 981 movq %r13,-24(%r14) 982 movq %rdx,%r13 983 984 mulq %rbx 985 addq %rax,%r10 986 movq 0(%rcx),%rax 987 adcq $0,%rdx 988 addq (%r14),%r10 989 adcq $0,%rdx 990 movq %rdx,%r11 991 992 mulq %rbp 993 addq %rax,%r13 994 movq 8(%rsi,%r15,1),%rax 995 adcq $0,%rdx 996 addq %r10,%r13 997 adcq $0,%rdx 998 movq %rdi,-16(%r14) 999 movq %rdx,%rdi 1000 1001 mulq %rbx 1002 addq %rax,%r11 1003 movq 8(%rcx),%rax 1004 adcq $0,%rdx 1005 addq 8(%r14),%r11 1006 adcq $0,%rdx 1007 movq %rdx,%r10 1008 1009 mulq %rbp 1010 addq %rax,%rdi 1011 movq 16(%rsi,%r15,1),%rax 1012 adcq $0,%rdx 1013 addq %r11,%rdi 1014 leaq 32(%rcx),%rcx 1015 adcq $0,%rdx 1016 movq %r13,-8(%r14) 1017 movq %rdx,%r13 1018 1019 addq $32,%r15 1020 jnz L$inner4x 1021 1022 mulq %rbx 1023 addq %rax,%r10 1024 movq -16(%rcx),%rax 1025 adcq $0,%rdx 1026 addq 16(%r14),%r10 1027 leaq 32(%r14),%r14 1028 adcq $0,%rdx 1029 movq %rdx,%r11 1030 1031 mulq %rbp 1032 addq %rax,%r13 1033 movq -8(%rsi),%rax 1034 adcq $0,%rdx 1035 addq %r10,%r13 1036 adcq $0,%rdx 1037 movq %rdi,-32(%r14) 1038 movq %rdx,%rdi 1039 1040 mulq %rbx 1041 addq %rax,%r11 1042 movq %rbp,%rax 1043 movq -8(%rcx),%rbp 1044 adcq $0,%rdx 1045 addq -8(%r14),%r11 1046 adcq $0,%rdx 1047 movq %rdx,%r10 1048 1049 mulq %rbp 1050 addq %rax,%rdi 1051 movq (%rsi,%r9,1),%rax 1052 adcq $0,%rdx 1053 addq %r11,%rdi 1054 adcq $0,%rdx 1055 movq %r13,-24(%r14) 1056 movq %rdx,%r13 1057 1058 movq %rdi,-16(%r14) 1059 leaq (%rcx,%r9,1),%rcx 1060 1061 xorq %rdi,%rdi 1062 addq %r10,%r13 1063 adcq $0,%rdi 1064 addq (%r14),%r13 1065 adcq $0,%rdi 1066 movq %r13,-8(%r14) 1067 1068 cmpq 16+8(%rsp),%r12 1069 jb L$outer4x 1070 xorq %rax,%rax 1071 subq %r13,%rbp 1072 adcq %r15,%r15 1073 orq %r15,%rdi 1074 subq %rdi,%rax 1075 leaq (%r14,%r9,1),%rbx 1076 movq (%rcx),%r12 1077 leaq (%rcx),%rbp 1078 movq %r9,%rcx 1079 sarq $3+2,%rcx 1080 movq 56+8(%rsp),%rdi 1081 decq %r12 1082 xorq %r10,%r10 1083 movq 8(%rbp),%r13 1084 movq 16(%rbp),%r14 1085 movq 24(%rbp),%r15 1086 jmp L$sqr4x_sub_entry 1087 1088 1089.globl _GFp_bn_power5 1090.private_extern _GFp_bn_power5 1091 1092.p2align 5 1093_GFp_bn_power5: 1094 1095 movq %rsp,%rax 1096 1097 leaq _GFp_ia32cap_P(%rip),%r11 1098 movl 8(%r11),%r11d 1099 andl $0x80108,%r11d 1100 cmpl $0x80108,%r11d 1101 je L$powerx5_enter 1102 pushq %rbx 1103 1104 pushq %rbp 1105 1106 pushq %r12 1107 1108 pushq %r13 1109 1110 pushq %r14 1111 1112 pushq %r15 1113 1114L$power5_prologue: 1115 1116 shll $3,%r9d 1117 leal (%r9,%r9,2),%r10d 1118 negq %r9 1119 movq (%r8),%r8 1120 1121 1122 1123 1124 1125 1126 1127 1128 leaq -320(%rsp,%r9,2),%r11 1129 movq %rsp,%rbp 1130 subq %rdi,%r11 1131 andq $4095,%r11 1132 cmpq %r11,%r10 1133 jb L$pwr_sp_alt 1134 subq %r11,%rbp 1135 leaq -320(%rbp,%r9,2),%rbp 1136 jmp L$pwr_sp_done 1137 1138.p2align 5 1139L$pwr_sp_alt: 1140 leaq 4096-320(,%r9,2),%r10 1141 leaq -320(%rbp,%r9,2),%rbp 1142 subq %r10,%r11 1143 movq $0,%r10 1144 cmovcq %r10,%r11 1145 subq %r11,%rbp 1146L$pwr_sp_done: 1147 andq $-64,%rbp 1148 movq %rsp,%r11 1149 subq %rbp,%r11 1150 andq $-4096,%r11 1151 leaq (%r11,%rbp,1),%rsp 1152 movq (%rsp),%r10 1153 cmpq %rbp,%rsp 1154 ja L$pwr_page_walk 
1155 jmp L$pwr_page_walk_done 1156 1157L$pwr_page_walk: 1158 leaq -4096(%rsp),%rsp 1159 movq (%rsp),%r10 1160 cmpq %rbp,%rsp 1161 ja L$pwr_page_walk 1162L$pwr_page_walk_done: 1163 1164 movq %r9,%r10 1165 negq %r9 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 movq %r8,32(%rsp) 1177 movq %rax,40(%rsp) 1178 1179L$power5_body: 1180.byte 102,72,15,110,207 1181.byte 102,72,15,110,209 1182.byte 102,73,15,110,218 1183.byte 102,72,15,110,226 1184 1185 call __bn_sqr8x_internal 1186 call __bn_post4x_internal 1187 call __bn_sqr8x_internal 1188 call __bn_post4x_internal 1189 call __bn_sqr8x_internal 1190 call __bn_post4x_internal 1191 call __bn_sqr8x_internal 1192 call __bn_post4x_internal 1193 call __bn_sqr8x_internal 1194 call __bn_post4x_internal 1195 1196.byte 102,72,15,126,209 1197.byte 102,72,15,126,226 1198 movq %rsi,%rdi 1199 movq 40(%rsp),%rax 1200 leaq 32(%rsp),%r8 1201 1202 call mul4x_internal 1203 1204 movq 40(%rsp),%rsi 1205 1206 movq $1,%rax 1207 movq -48(%rsi),%r15 1208 1209 movq -40(%rsi),%r14 1210 1211 movq -32(%rsi),%r13 1212 1213 movq -24(%rsi),%r12 1214 1215 movq -16(%rsi),%rbp 1216 1217 movq -8(%rsi),%rbx 1218 1219 leaq (%rsi),%rsp 1220 1221L$power5_epilogue: 1222 .byte 0xf3,0xc3 1223 1224 1225 1226.globl _GFp_bn_sqr8x_internal 1227.private_extern _GFp_bn_sqr8x_internal 1228.private_extern _GFp_bn_sqr8x_internal 1229 1230.p2align 5 1231_GFp_bn_sqr8x_internal: 1232__bn_sqr8x_internal: 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 leaq 32(%r10),%rbp 1308 leaq (%rsi,%r9,1),%rsi 1309 1310 movq %r9,%rcx 1311 1312 1313 movq -32(%rsi,%rbp,1),%r14 1314 leaq 48+8(%rsp,%r9,2),%rdi 1315 movq -24(%rsi,%rbp,1),%rax 1316 leaq -32(%rdi,%rbp,1),%rdi 1317 movq -16(%rsi,%rbp,1),%rbx 1318 movq %rax,%r15 1319 1320 mulq %r14 1321 movq %rax,%r10 1322 movq %rbx,%rax 1323 movq %rdx,%r11 1324 movq %r10,-24(%rdi,%rbp,1) 1325 1326 mulq %r14 1327 addq %rax,%r11 1328 movq %rbx,%rax 1329 adcq $0,%rdx 1330 movq %r11,-16(%rdi,%rbp,1) 1331 movq %rdx,%r10 1332 1333 1334 movq -8(%rsi,%rbp,1),%rbx 1335 mulq %r15 1336 movq %rax,%r12 1337 movq %rbx,%rax 1338 movq %rdx,%r13 1339 1340 leaq (%rbp),%rcx 1341 mulq %r14 1342 addq %rax,%r10 1343 movq %rbx,%rax 1344 movq %rdx,%r11 1345 adcq $0,%r11 1346 addq %r12,%r10 1347 adcq $0,%r11 1348 movq %r10,-8(%rdi,%rcx,1) 1349 jmp L$sqr4x_1st 1350 1351.p2align 5 1352L$sqr4x_1st: 1353 movq (%rsi,%rcx,1),%rbx 1354 mulq %r15 1355 addq %rax,%r13 1356 movq %rbx,%rax 1357 movq %rdx,%r12 1358 adcq $0,%r12 1359 1360 mulq %r14 1361 addq %rax,%r11 1362 movq %rbx,%rax 1363 movq 8(%rsi,%rcx,1),%rbx 1364 movq %rdx,%r10 1365 adcq $0,%r10 1366 addq %r13,%r11 1367 adcq $0,%r10 1368 1369 1370 mulq %r15 1371 addq %rax,%r12 1372 movq %rbx,%rax 1373 movq %r11,(%rdi,%rcx,1) 1374 movq %rdx,%r13 1375 adcq $0,%r13 1376 1377 mulq %r14 1378 addq %rax,%r10 1379 movq %rbx,%rax 1380 movq 16(%rsi,%rcx,1),%rbx 1381 movq %rdx,%r11 1382 adcq $0,%r11 1383 addq %r12,%r10 1384 adcq $0,%r11 1385 1386 mulq %r15 1387 addq %rax,%r13 1388 movq %rbx,%rax 1389 movq %r10,8(%rdi,%rcx,1) 1390 movq %rdx,%r12 1391 adcq $0,%r12 1392 1393 mulq %r14 1394 addq %rax,%r11 1395 movq %rbx,%rax 1396 movq 24(%rsi,%rcx,1),%rbx 1397 movq %rdx,%r10 1398 adcq $0,%r10 1399 addq %r13,%r11 1400 adcq $0,%r10 1401 
1402 1403 mulq %r15 1404 addq %rax,%r12 1405 movq %rbx,%rax 1406 movq %r11,16(%rdi,%rcx,1) 1407 movq %rdx,%r13 1408 adcq $0,%r13 1409 leaq 32(%rcx),%rcx 1410 1411 mulq %r14 1412 addq %rax,%r10 1413 movq %rbx,%rax 1414 movq %rdx,%r11 1415 adcq $0,%r11 1416 addq %r12,%r10 1417 adcq $0,%r11 1418 movq %r10,-8(%rdi,%rcx,1) 1419 1420 cmpq $0,%rcx 1421 jne L$sqr4x_1st 1422 1423 mulq %r15 1424 addq %rax,%r13 1425 leaq 16(%rbp),%rbp 1426 adcq $0,%rdx 1427 addq %r11,%r13 1428 adcq $0,%rdx 1429 1430 movq %r13,(%rdi) 1431 movq %rdx,%r12 1432 movq %rdx,8(%rdi) 1433 jmp L$sqr4x_outer 1434 1435.p2align 5 1436L$sqr4x_outer: 1437 movq -32(%rsi,%rbp,1),%r14 1438 leaq 48+8(%rsp,%r9,2),%rdi 1439 movq -24(%rsi,%rbp,1),%rax 1440 leaq -32(%rdi,%rbp,1),%rdi 1441 movq -16(%rsi,%rbp,1),%rbx 1442 movq %rax,%r15 1443 1444 mulq %r14 1445 movq -24(%rdi,%rbp,1),%r10 1446 addq %rax,%r10 1447 movq %rbx,%rax 1448 adcq $0,%rdx 1449 movq %r10,-24(%rdi,%rbp,1) 1450 movq %rdx,%r11 1451 1452 mulq %r14 1453 addq %rax,%r11 1454 movq %rbx,%rax 1455 adcq $0,%rdx 1456 addq -16(%rdi,%rbp,1),%r11 1457 movq %rdx,%r10 1458 adcq $0,%r10 1459 movq %r11,-16(%rdi,%rbp,1) 1460 1461 xorq %r12,%r12 1462 1463 movq -8(%rsi,%rbp,1),%rbx 1464 mulq %r15 1465 addq %rax,%r12 1466 movq %rbx,%rax 1467 adcq $0,%rdx 1468 addq -8(%rdi,%rbp,1),%r12 1469 movq %rdx,%r13 1470 adcq $0,%r13 1471 1472 mulq %r14 1473 addq %rax,%r10 1474 movq %rbx,%rax 1475 adcq $0,%rdx 1476 addq %r12,%r10 1477 movq %rdx,%r11 1478 adcq $0,%r11 1479 movq %r10,-8(%rdi,%rbp,1) 1480 1481 leaq (%rbp),%rcx 1482 jmp L$sqr4x_inner 1483 1484.p2align 5 1485L$sqr4x_inner: 1486 movq (%rsi,%rcx,1),%rbx 1487 mulq %r15 1488 addq %rax,%r13 1489 movq %rbx,%rax 1490 movq %rdx,%r12 1491 adcq $0,%r12 1492 addq (%rdi,%rcx,1),%r13 1493 adcq $0,%r12 1494 1495.byte 0x67 1496 mulq %r14 1497 addq %rax,%r11 1498 movq %rbx,%rax 1499 movq 8(%rsi,%rcx,1),%rbx 1500 movq %rdx,%r10 1501 adcq $0,%r10 1502 addq %r13,%r11 1503 adcq $0,%r10 1504 1505 mulq %r15 1506 addq %rax,%r12 1507 movq %r11,(%rdi,%rcx,1) 1508 movq %rbx,%rax 1509 movq %rdx,%r13 1510 adcq $0,%r13 1511 addq 8(%rdi,%rcx,1),%r12 1512 leaq 16(%rcx),%rcx 1513 adcq $0,%r13 1514 1515 mulq %r14 1516 addq %rax,%r10 1517 movq %rbx,%rax 1518 adcq $0,%rdx 1519 addq %r12,%r10 1520 movq %rdx,%r11 1521 adcq $0,%r11 1522 movq %r10,-8(%rdi,%rcx,1) 1523 1524 cmpq $0,%rcx 1525 jne L$sqr4x_inner 1526 1527.byte 0x67 1528 mulq %r15 1529 addq %rax,%r13 1530 adcq $0,%rdx 1531 addq %r11,%r13 1532 adcq $0,%rdx 1533 1534 movq %r13,(%rdi) 1535 movq %rdx,%r12 1536 movq %rdx,8(%rdi) 1537 1538 addq $16,%rbp 1539 jnz L$sqr4x_outer 1540 1541 1542 movq -32(%rsi),%r14 1543 leaq 48+8(%rsp,%r9,2),%rdi 1544 movq -24(%rsi),%rax 1545 leaq -32(%rdi,%rbp,1),%rdi 1546 movq -16(%rsi),%rbx 1547 movq %rax,%r15 1548 1549 mulq %r14 1550 addq %rax,%r10 1551 movq %rbx,%rax 1552 movq %rdx,%r11 1553 adcq $0,%r11 1554 1555 mulq %r14 1556 addq %rax,%r11 1557 movq %rbx,%rax 1558 movq %r10,-24(%rdi) 1559 movq %rdx,%r10 1560 adcq $0,%r10 1561 addq %r13,%r11 1562 movq -8(%rsi),%rbx 1563 adcq $0,%r10 1564 1565 mulq %r15 1566 addq %rax,%r12 1567 movq %rbx,%rax 1568 movq %r11,-16(%rdi) 1569 movq %rdx,%r13 1570 adcq $0,%r13 1571 1572 mulq %r14 1573 addq %rax,%r10 1574 movq %rbx,%rax 1575 movq %rdx,%r11 1576 adcq $0,%r11 1577 addq %r12,%r10 1578 adcq $0,%r11 1579 movq %r10,-8(%rdi) 1580 1581 mulq %r15 1582 addq %rax,%r13 1583 movq -16(%rsi),%rax 1584 adcq $0,%rdx 1585 addq %r11,%r13 1586 adcq $0,%rdx 1587 1588 movq %r13,(%rdi) 1589 movq %rdx,%r12 1590 movq %rdx,8(%rdi) 1591 1592 mulq %rbx 1593 addq $16,%rbp 
1594 xorq %r14,%r14 1595 subq %r9,%rbp 1596 xorq %r15,%r15 1597 1598 addq %r12,%rax 1599 adcq $0,%rdx 1600 movq %rax,8(%rdi) 1601 movq %rdx,16(%rdi) 1602 movq %r15,24(%rdi) 1603 1604 movq -16(%rsi,%rbp,1),%rax 1605 leaq 48+8(%rsp),%rdi 1606 xorq %r10,%r10 1607 movq 8(%rdi),%r11 1608 1609 leaq (%r14,%r10,2),%r12 1610 shrq $63,%r10 1611 leaq (%rcx,%r11,2),%r13 1612 shrq $63,%r11 1613 orq %r10,%r13 1614 movq 16(%rdi),%r10 1615 movq %r11,%r14 1616 mulq %rax 1617 negq %r15 1618 movq 24(%rdi),%r11 1619 adcq %rax,%r12 1620 movq -8(%rsi,%rbp,1),%rax 1621 movq %r12,(%rdi) 1622 adcq %rdx,%r13 1623 1624 leaq (%r14,%r10,2),%rbx 1625 movq %r13,8(%rdi) 1626 sbbq %r15,%r15 1627 shrq $63,%r10 1628 leaq (%rcx,%r11,2),%r8 1629 shrq $63,%r11 1630 orq %r10,%r8 1631 movq 32(%rdi),%r10 1632 movq %r11,%r14 1633 mulq %rax 1634 negq %r15 1635 movq 40(%rdi),%r11 1636 adcq %rax,%rbx 1637 movq 0(%rsi,%rbp,1),%rax 1638 movq %rbx,16(%rdi) 1639 adcq %rdx,%r8 1640 leaq 16(%rbp),%rbp 1641 movq %r8,24(%rdi) 1642 sbbq %r15,%r15 1643 leaq 64(%rdi),%rdi 1644 jmp L$sqr4x_shift_n_add 1645 1646.p2align 5 1647L$sqr4x_shift_n_add: 1648 leaq (%r14,%r10,2),%r12 1649 shrq $63,%r10 1650 leaq (%rcx,%r11,2),%r13 1651 shrq $63,%r11 1652 orq %r10,%r13 1653 movq -16(%rdi),%r10 1654 movq %r11,%r14 1655 mulq %rax 1656 negq %r15 1657 movq -8(%rdi),%r11 1658 adcq %rax,%r12 1659 movq -8(%rsi,%rbp,1),%rax 1660 movq %r12,-32(%rdi) 1661 adcq %rdx,%r13 1662 1663 leaq (%r14,%r10,2),%rbx 1664 movq %r13,-24(%rdi) 1665 sbbq %r15,%r15 1666 shrq $63,%r10 1667 leaq (%rcx,%r11,2),%r8 1668 shrq $63,%r11 1669 orq %r10,%r8 1670 movq 0(%rdi),%r10 1671 movq %r11,%r14 1672 mulq %rax 1673 negq %r15 1674 movq 8(%rdi),%r11 1675 adcq %rax,%rbx 1676 movq 0(%rsi,%rbp,1),%rax 1677 movq %rbx,-16(%rdi) 1678 adcq %rdx,%r8 1679 1680 leaq (%r14,%r10,2),%r12 1681 movq %r8,-8(%rdi) 1682 sbbq %r15,%r15 1683 shrq $63,%r10 1684 leaq (%rcx,%r11,2),%r13 1685 shrq $63,%r11 1686 orq %r10,%r13 1687 movq 16(%rdi),%r10 1688 movq %r11,%r14 1689 mulq %rax 1690 negq %r15 1691 movq 24(%rdi),%r11 1692 adcq %rax,%r12 1693 movq 8(%rsi,%rbp,1),%rax 1694 movq %r12,0(%rdi) 1695 adcq %rdx,%r13 1696 1697 leaq (%r14,%r10,2),%rbx 1698 movq %r13,8(%rdi) 1699 sbbq %r15,%r15 1700 shrq $63,%r10 1701 leaq (%rcx,%r11,2),%r8 1702 shrq $63,%r11 1703 orq %r10,%r8 1704 movq 32(%rdi),%r10 1705 movq %r11,%r14 1706 mulq %rax 1707 negq %r15 1708 movq 40(%rdi),%r11 1709 adcq %rax,%rbx 1710 movq 16(%rsi,%rbp,1),%rax 1711 movq %rbx,16(%rdi) 1712 adcq %rdx,%r8 1713 movq %r8,24(%rdi) 1714 sbbq %r15,%r15 1715 leaq 64(%rdi),%rdi 1716 addq $32,%rbp 1717 jnz L$sqr4x_shift_n_add 1718 1719 leaq (%r14,%r10,2),%r12 1720.byte 0x67 1721 shrq $63,%r10 1722 leaq (%rcx,%r11,2),%r13 1723 shrq $63,%r11 1724 orq %r10,%r13 1725 movq -16(%rdi),%r10 1726 movq %r11,%r14 1727 mulq %rax 1728 negq %r15 1729 movq -8(%rdi),%r11 1730 adcq %rax,%r12 1731 movq -8(%rsi),%rax 1732 movq %r12,-32(%rdi) 1733 adcq %rdx,%r13 1734 1735 leaq (%r14,%r10,2),%rbx 1736 movq %r13,-24(%rdi) 1737 sbbq %r15,%r15 1738 shrq $63,%r10 1739 leaq (%rcx,%r11,2),%r8 1740 shrq $63,%r11 1741 orq %r10,%r8 1742 mulq %rax 1743 negq %r15 1744 adcq %rax,%rbx 1745 adcq %rdx,%r8 1746 movq %rbx,-16(%rdi) 1747 movq %r8,-8(%rdi) 1748.byte 102,72,15,126,213 1749__bn_sqr8x_reduction: 1750 xorq %rax,%rax 1751 leaq (%r9,%rbp,1),%rcx 1752 leaq 48+8(%rsp,%r9,2),%rdx 1753 movq %rcx,0+8(%rsp) 1754 leaq 48+8(%rsp,%r9,1),%rdi 1755 movq %rdx,8+8(%rsp) 1756 negq %r9 1757 jmp L$8x_reduction_loop 1758 1759.p2align 5 1760L$8x_reduction_loop: 1761 leaq (%rdi,%r9,1),%rdi 1762.byte 0x66 1763 movq 
0(%rdi),%rbx 1764 movq 8(%rdi),%r9 1765 movq 16(%rdi),%r10 1766 movq 24(%rdi),%r11 1767 movq 32(%rdi),%r12 1768 movq 40(%rdi),%r13 1769 movq 48(%rdi),%r14 1770 movq 56(%rdi),%r15 1771 movq %rax,(%rdx) 1772 leaq 64(%rdi),%rdi 1773 1774.byte 0x67 1775 movq %rbx,%r8 1776 imulq 32+8(%rsp),%rbx 1777 movq 0(%rbp),%rax 1778 movl $8,%ecx 1779 jmp L$8x_reduce 1780 1781.p2align 5 1782L$8x_reduce: 1783 mulq %rbx 1784 movq 8(%rbp),%rax 1785 negq %r8 1786 movq %rdx,%r8 1787 adcq $0,%r8 1788 1789 mulq %rbx 1790 addq %rax,%r9 1791 movq 16(%rbp),%rax 1792 adcq $0,%rdx 1793 addq %r9,%r8 1794 movq %rbx,48-8+8(%rsp,%rcx,8) 1795 movq %rdx,%r9 1796 adcq $0,%r9 1797 1798 mulq %rbx 1799 addq %rax,%r10 1800 movq 24(%rbp),%rax 1801 adcq $0,%rdx 1802 addq %r10,%r9 1803 movq 32+8(%rsp),%rsi 1804 movq %rdx,%r10 1805 adcq $0,%r10 1806 1807 mulq %rbx 1808 addq %rax,%r11 1809 movq 32(%rbp),%rax 1810 adcq $0,%rdx 1811 imulq %r8,%rsi 1812 addq %r11,%r10 1813 movq %rdx,%r11 1814 adcq $0,%r11 1815 1816 mulq %rbx 1817 addq %rax,%r12 1818 movq 40(%rbp),%rax 1819 adcq $0,%rdx 1820 addq %r12,%r11 1821 movq %rdx,%r12 1822 adcq $0,%r12 1823 1824 mulq %rbx 1825 addq %rax,%r13 1826 movq 48(%rbp),%rax 1827 adcq $0,%rdx 1828 addq %r13,%r12 1829 movq %rdx,%r13 1830 adcq $0,%r13 1831 1832 mulq %rbx 1833 addq %rax,%r14 1834 movq 56(%rbp),%rax 1835 adcq $0,%rdx 1836 addq %r14,%r13 1837 movq %rdx,%r14 1838 adcq $0,%r14 1839 1840 mulq %rbx 1841 movq %rsi,%rbx 1842 addq %rax,%r15 1843 movq 0(%rbp),%rax 1844 adcq $0,%rdx 1845 addq %r15,%r14 1846 movq %rdx,%r15 1847 adcq $0,%r15 1848 1849 decl %ecx 1850 jnz L$8x_reduce 1851 1852 leaq 64(%rbp),%rbp 1853 xorq %rax,%rax 1854 movq 8+8(%rsp),%rdx 1855 cmpq 0+8(%rsp),%rbp 1856 jae L$8x_no_tail 1857 1858.byte 0x66 1859 addq 0(%rdi),%r8 1860 adcq 8(%rdi),%r9 1861 adcq 16(%rdi),%r10 1862 adcq 24(%rdi),%r11 1863 adcq 32(%rdi),%r12 1864 adcq 40(%rdi),%r13 1865 adcq 48(%rdi),%r14 1866 adcq 56(%rdi),%r15 1867 sbbq %rsi,%rsi 1868 1869 movq 48+56+8(%rsp),%rbx 1870 movl $8,%ecx 1871 movq 0(%rbp),%rax 1872 jmp L$8x_tail 1873 1874.p2align 5 1875L$8x_tail: 1876 mulq %rbx 1877 addq %rax,%r8 1878 movq 8(%rbp),%rax 1879 movq %r8,(%rdi) 1880 movq %rdx,%r8 1881 adcq $0,%r8 1882 1883 mulq %rbx 1884 addq %rax,%r9 1885 movq 16(%rbp),%rax 1886 adcq $0,%rdx 1887 addq %r9,%r8 1888 leaq 8(%rdi),%rdi 1889 movq %rdx,%r9 1890 adcq $0,%r9 1891 1892 mulq %rbx 1893 addq %rax,%r10 1894 movq 24(%rbp),%rax 1895 adcq $0,%rdx 1896 addq %r10,%r9 1897 movq %rdx,%r10 1898 adcq $0,%r10 1899 1900 mulq %rbx 1901 addq %rax,%r11 1902 movq 32(%rbp),%rax 1903 adcq $0,%rdx 1904 addq %r11,%r10 1905 movq %rdx,%r11 1906 adcq $0,%r11 1907 1908 mulq %rbx 1909 addq %rax,%r12 1910 movq 40(%rbp),%rax 1911 adcq $0,%rdx 1912 addq %r12,%r11 1913 movq %rdx,%r12 1914 adcq $0,%r12 1915 1916 mulq %rbx 1917 addq %rax,%r13 1918 movq 48(%rbp),%rax 1919 adcq $0,%rdx 1920 addq %r13,%r12 1921 movq %rdx,%r13 1922 adcq $0,%r13 1923 1924 mulq %rbx 1925 addq %rax,%r14 1926 movq 56(%rbp),%rax 1927 adcq $0,%rdx 1928 addq %r14,%r13 1929 movq %rdx,%r14 1930 adcq $0,%r14 1931 1932 mulq %rbx 1933 movq 48-16+8(%rsp,%rcx,8),%rbx 1934 addq %rax,%r15 1935 adcq $0,%rdx 1936 addq %r15,%r14 1937 movq 0(%rbp),%rax 1938 movq %rdx,%r15 1939 adcq $0,%r15 1940 1941 decl %ecx 1942 jnz L$8x_tail 1943 1944 leaq 64(%rbp),%rbp 1945 movq 8+8(%rsp),%rdx 1946 cmpq 0+8(%rsp),%rbp 1947 jae L$8x_tail_done 1948 1949 movq 48+56+8(%rsp),%rbx 1950 negq %rsi 1951 movq 0(%rbp),%rax 1952 adcq 0(%rdi),%r8 1953 adcq 8(%rdi),%r9 1954 adcq 16(%rdi),%r10 1955 adcq 24(%rdi),%r11 1956 adcq 32(%rdi),%r12 1957 
adcq 40(%rdi),%r13 1958 adcq 48(%rdi),%r14 1959 adcq 56(%rdi),%r15 1960 sbbq %rsi,%rsi 1961 1962 movl $8,%ecx 1963 jmp L$8x_tail 1964 1965.p2align 5 1966L$8x_tail_done: 1967 xorq %rax,%rax 1968 addq (%rdx),%r8 1969 adcq $0,%r9 1970 adcq $0,%r10 1971 adcq $0,%r11 1972 adcq $0,%r12 1973 adcq $0,%r13 1974 adcq $0,%r14 1975 adcq $0,%r15 1976 adcq $0,%rax 1977 1978 negq %rsi 1979L$8x_no_tail: 1980 adcq 0(%rdi),%r8 1981 adcq 8(%rdi),%r9 1982 adcq 16(%rdi),%r10 1983 adcq 24(%rdi),%r11 1984 adcq 32(%rdi),%r12 1985 adcq 40(%rdi),%r13 1986 adcq 48(%rdi),%r14 1987 adcq 56(%rdi),%r15 1988 adcq $0,%rax 1989 movq -8(%rbp),%rcx 1990 xorq %rsi,%rsi 1991 1992.byte 102,72,15,126,213 1993 1994 movq %r8,0(%rdi) 1995 movq %r9,8(%rdi) 1996.byte 102,73,15,126,217 1997 movq %r10,16(%rdi) 1998 movq %r11,24(%rdi) 1999 movq %r12,32(%rdi) 2000 movq %r13,40(%rdi) 2001 movq %r14,48(%rdi) 2002 movq %r15,56(%rdi) 2003 leaq 64(%rdi),%rdi 2004 2005 cmpq %rdx,%rdi 2006 jb L$8x_reduction_loop 2007 .byte 0xf3,0xc3 2008 2009 2010 2011.p2align 5 2012__bn_post4x_internal: 2013 2014 movq 0(%rbp),%r12 2015 leaq (%rdi,%r9,1),%rbx 2016 movq %r9,%rcx 2017.byte 102,72,15,126,207 2018 negq %rax 2019.byte 102,72,15,126,206 2020 sarq $3+2,%rcx 2021 decq %r12 2022 xorq %r10,%r10 2023 movq 8(%rbp),%r13 2024 movq 16(%rbp),%r14 2025 movq 24(%rbp),%r15 2026 jmp L$sqr4x_sub_entry 2027 2028.p2align 4 2029L$sqr4x_sub: 2030 movq 0(%rbp),%r12 2031 movq 8(%rbp),%r13 2032 movq 16(%rbp),%r14 2033 movq 24(%rbp),%r15 2034L$sqr4x_sub_entry: 2035 leaq 32(%rbp),%rbp 2036 notq %r12 2037 notq %r13 2038 notq %r14 2039 notq %r15 2040 andq %rax,%r12 2041 andq %rax,%r13 2042 andq %rax,%r14 2043 andq %rax,%r15 2044 2045 negq %r10 2046 adcq 0(%rbx),%r12 2047 adcq 8(%rbx),%r13 2048 adcq 16(%rbx),%r14 2049 adcq 24(%rbx),%r15 2050 movq %r12,0(%rdi) 2051 leaq 32(%rbx),%rbx 2052 movq %r13,8(%rdi) 2053 sbbq %r10,%r10 2054 movq %r14,16(%rdi) 2055 movq %r15,24(%rdi) 2056 leaq 32(%rdi),%rdi 2057 2058 incq %rcx 2059 jnz L$sqr4x_sub 2060 2061 movq %r9,%r10 2062 negq %r9 2063 .byte 0xf3,0xc3 2064 2065 2066.globl _GFp_bn_from_montgomery 2067.private_extern _GFp_bn_from_montgomery 2068 2069.p2align 5 2070_GFp_bn_from_montgomery: 2071 2072 testl $7,%r9d 2073 jz bn_from_mont8x 2074 xorl %eax,%eax 2075 .byte 0xf3,0xc3 2076 2077 2078 2079 2080.p2align 5 2081bn_from_mont8x: 2082 2083.byte 0x67 2084 movq %rsp,%rax 2085 2086 pushq %rbx 2087 2088 pushq %rbp 2089 2090 pushq %r12 2091 2092 pushq %r13 2093 2094 pushq %r14 2095 2096 pushq %r15 2097 2098L$from_prologue: 2099 2100 shll $3,%r9d 2101 leaq (%r9,%r9,2),%r10 2102 negq %r9 2103 movq (%r8),%r8 2104 2105 2106 2107 2108 2109 2110 2111 2112 leaq -320(%rsp,%r9,2),%r11 2113 movq %rsp,%rbp 2114 subq %rdi,%r11 2115 andq $4095,%r11 2116 cmpq %r11,%r10 2117 jb L$from_sp_alt 2118 subq %r11,%rbp 2119 leaq -320(%rbp,%r9,2),%rbp 2120 jmp L$from_sp_done 2121 2122.p2align 5 2123L$from_sp_alt: 2124 leaq 4096-320(,%r9,2),%r10 2125 leaq -320(%rbp,%r9,2),%rbp 2126 subq %r10,%r11 2127 movq $0,%r10 2128 cmovcq %r10,%r11 2129 subq %r11,%rbp 2130L$from_sp_done: 2131 andq $-64,%rbp 2132 movq %rsp,%r11 2133 subq %rbp,%r11 2134 andq $-4096,%r11 2135 leaq (%r11,%rbp,1),%rsp 2136 movq (%rsp),%r10 2137 cmpq %rbp,%rsp 2138 ja L$from_page_walk 2139 jmp L$from_page_walk_done 2140 2141L$from_page_walk: 2142 leaq -4096(%rsp),%rsp 2143 movq (%rsp),%r10 2144 cmpq %rbp,%rsp 2145 ja L$from_page_walk 2146L$from_page_walk_done: 2147 2148 movq %r9,%r10 2149 negq %r9 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 movq %r8,32(%rsp) 2161 movq %rax,40(%rsp) 2162 
2163L$from_body: 2164 movq %r9,%r11 2165 leaq 48(%rsp),%rax 2166 pxor %xmm0,%xmm0 2167 jmp L$mul_by_1 2168 2169.p2align 5 2170L$mul_by_1: 2171 movdqu (%rsi),%xmm1 2172 movdqu 16(%rsi),%xmm2 2173 movdqu 32(%rsi),%xmm3 2174 movdqa %xmm0,(%rax,%r9,1) 2175 movdqu 48(%rsi),%xmm4 2176 movdqa %xmm0,16(%rax,%r9,1) 2177.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 2178 movdqa %xmm1,(%rax) 2179 movdqa %xmm0,32(%rax,%r9,1) 2180 movdqa %xmm2,16(%rax) 2181 movdqa %xmm0,48(%rax,%r9,1) 2182 movdqa %xmm3,32(%rax) 2183 movdqa %xmm4,48(%rax) 2184 leaq 64(%rax),%rax 2185 subq $64,%r11 2186 jnz L$mul_by_1 2187 2188.byte 102,72,15,110,207 2189.byte 102,72,15,110,209 2190.byte 0x67 2191 movq %rcx,%rbp 2192.byte 102,73,15,110,218 2193 leaq _GFp_ia32cap_P(%rip),%r11 2194 movl 8(%r11),%r11d 2195 andl $0x80108,%r11d 2196 cmpl $0x80108,%r11d 2197 jne L$from_mont_nox 2198 2199 leaq (%rax,%r9,1),%rdi 2200 call __bn_sqrx8x_reduction 2201 call __bn_postx4x_internal 2202 2203 pxor %xmm0,%xmm0 2204 leaq 48(%rsp),%rax 2205 jmp L$from_mont_zero 2206 2207.p2align 5 2208L$from_mont_nox: 2209 call __bn_sqr8x_reduction 2210 call __bn_post4x_internal 2211 2212 pxor %xmm0,%xmm0 2213 leaq 48(%rsp),%rax 2214 jmp L$from_mont_zero 2215 2216.p2align 5 2217L$from_mont_zero: 2218 movq 40(%rsp),%rsi 2219 2220 movdqa %xmm0,0(%rax) 2221 movdqa %xmm0,16(%rax) 2222 movdqa %xmm0,32(%rax) 2223 movdqa %xmm0,48(%rax) 2224 leaq 64(%rax),%rax 2225 subq $32,%r9 2226 jnz L$from_mont_zero 2227 2228 movq $1,%rax 2229 movq -48(%rsi),%r15 2230 2231 movq -40(%rsi),%r14 2232 2233 movq -32(%rsi),%r13 2234 2235 movq -24(%rsi),%r12 2236 2237 movq -16(%rsi),%rbp 2238 2239 movq -8(%rsi),%rbx 2240 2241 leaq (%rsi),%rsp 2242 2243L$from_epilogue: 2244 .byte 0xf3,0xc3 2245 2246 2247 2248.p2align 5 2249bn_mulx4x_mont_gather5: 2250 2251 movq %rsp,%rax 2252 2253L$mulx4x_enter: 2254 pushq %rbx 2255 2256 pushq %rbp 2257 2258 pushq %r12 2259 2260 pushq %r13 2261 2262 pushq %r14 2263 2264 pushq %r15 2265 2266L$mulx4x_prologue: 2267 2268 shll $3,%r9d 2269 leaq (%r9,%r9,2),%r10 2270 negq %r9 2271 movq (%r8),%r8 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 leaq -320(%rsp,%r9,2),%r11 2283 movq %rsp,%rbp 2284 subq %rdi,%r11 2285 andq $4095,%r11 2286 cmpq %r11,%r10 2287 jb L$mulx4xsp_alt 2288 subq %r11,%rbp 2289 leaq -320(%rbp,%r9,2),%rbp 2290 jmp L$mulx4xsp_done 2291 2292L$mulx4xsp_alt: 2293 leaq 4096-320(,%r9,2),%r10 2294 leaq -320(%rbp,%r9,2),%rbp 2295 subq %r10,%r11 2296 movq $0,%r10 2297 cmovcq %r10,%r11 2298 subq %r11,%rbp 2299L$mulx4xsp_done: 2300 andq $-64,%rbp 2301 movq %rsp,%r11 2302 subq %rbp,%r11 2303 andq $-4096,%r11 2304 leaq (%r11,%rbp,1),%rsp 2305 movq (%rsp),%r10 2306 cmpq %rbp,%rsp 2307 ja L$mulx4x_page_walk 2308 jmp L$mulx4x_page_walk_done 2309 2310L$mulx4x_page_walk: 2311 leaq -4096(%rsp),%rsp 2312 movq (%rsp),%r10 2313 cmpq %rbp,%rsp 2314 ja L$mulx4x_page_walk 2315L$mulx4x_page_walk_done: 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 movq %r8,32(%rsp) 2330 movq %rax,40(%rsp) 2331 2332L$mulx4x_body: 2333 call mulx4x_internal 2334 2335 movq 40(%rsp),%rsi 2336 2337 movq $1,%rax 2338 2339 movq -48(%rsi),%r15 2340 2341 movq -40(%rsi),%r14 2342 2343 movq -32(%rsi),%r13 2344 2345 movq -24(%rsi),%r12 2346 2347 movq -16(%rsi),%rbp 2348 2349 movq -8(%rsi),%rbx 2350 2351 leaq (%rsi),%rsp 2352 2353L$mulx4x_epilogue: 2354 .byte 0xf3,0xc3 2355 2356 2357 2358 2359.p2align 5 2360mulx4x_internal: 2361 2362 movq %r9,8(%rsp) 2363 movq %r9,%r10 2364 negq %r9 2365 shlq $5,%r9 2366 negq %r10 2367 leaq 128(%rdx,%r9,1),%r13 2368 shrq $5+5,%r9 2369 
movd 8(%rax),%xmm5 2370 subq $1,%r9 2371 leaq L$inc(%rip),%rax 2372 movq %r13,16+8(%rsp) 2373 movq %r9,24+8(%rsp) 2374 movq %rdi,56+8(%rsp) 2375 movdqa 0(%rax),%xmm0 2376 movdqa 16(%rax),%xmm1 2377 leaq 88-112(%rsp,%r10,1),%r10 2378 leaq 128(%rdx),%rdi 2379 2380 pshufd $0,%xmm5,%xmm5 2381 movdqa %xmm1,%xmm4 2382.byte 0x67 2383 movdqa %xmm1,%xmm2 2384.byte 0x67 2385 paddd %xmm0,%xmm1 2386 pcmpeqd %xmm5,%xmm0 2387 movdqa %xmm4,%xmm3 2388 paddd %xmm1,%xmm2 2389 pcmpeqd %xmm5,%xmm1 2390 movdqa %xmm0,112(%r10) 2391 movdqa %xmm4,%xmm0 2392 2393 paddd %xmm2,%xmm3 2394 pcmpeqd %xmm5,%xmm2 2395 movdqa %xmm1,128(%r10) 2396 movdqa %xmm4,%xmm1 2397 2398 paddd %xmm3,%xmm0 2399 pcmpeqd %xmm5,%xmm3 2400 movdqa %xmm2,144(%r10) 2401 movdqa %xmm4,%xmm2 2402 2403 paddd %xmm0,%xmm1 2404 pcmpeqd %xmm5,%xmm0 2405 movdqa %xmm3,160(%r10) 2406 movdqa %xmm4,%xmm3 2407 paddd %xmm1,%xmm2 2408 pcmpeqd %xmm5,%xmm1 2409 movdqa %xmm0,176(%r10) 2410 movdqa %xmm4,%xmm0 2411 2412 paddd %xmm2,%xmm3 2413 pcmpeqd %xmm5,%xmm2 2414 movdqa %xmm1,192(%r10) 2415 movdqa %xmm4,%xmm1 2416 2417 paddd %xmm3,%xmm0 2418 pcmpeqd %xmm5,%xmm3 2419 movdqa %xmm2,208(%r10) 2420 movdqa %xmm4,%xmm2 2421 2422 paddd %xmm0,%xmm1 2423 pcmpeqd %xmm5,%xmm0 2424 movdqa %xmm3,224(%r10) 2425 movdqa %xmm4,%xmm3 2426 paddd %xmm1,%xmm2 2427 pcmpeqd %xmm5,%xmm1 2428 movdqa %xmm0,240(%r10) 2429 movdqa %xmm4,%xmm0 2430 2431 paddd %xmm2,%xmm3 2432 pcmpeqd %xmm5,%xmm2 2433 movdqa %xmm1,256(%r10) 2434 movdqa %xmm4,%xmm1 2435 2436 paddd %xmm3,%xmm0 2437 pcmpeqd %xmm5,%xmm3 2438 movdqa %xmm2,272(%r10) 2439 movdqa %xmm4,%xmm2 2440 2441 paddd %xmm0,%xmm1 2442 pcmpeqd %xmm5,%xmm0 2443 movdqa %xmm3,288(%r10) 2444 movdqa %xmm4,%xmm3 2445.byte 0x67 2446 paddd %xmm1,%xmm2 2447 pcmpeqd %xmm5,%xmm1 2448 movdqa %xmm0,304(%r10) 2449 2450 paddd %xmm2,%xmm3 2451 pcmpeqd %xmm5,%xmm2 2452 movdqa %xmm1,320(%r10) 2453 2454 pcmpeqd %xmm5,%xmm3 2455 movdqa %xmm2,336(%r10) 2456 2457 pand 64(%rdi),%xmm0 2458 pand 80(%rdi),%xmm1 2459 pand 96(%rdi),%xmm2 2460 movdqa %xmm3,352(%r10) 2461 pand 112(%rdi),%xmm3 2462 por %xmm2,%xmm0 2463 por %xmm3,%xmm1 2464 movdqa -128(%rdi),%xmm4 2465 movdqa -112(%rdi),%xmm5 2466 movdqa -96(%rdi),%xmm2 2467 pand 112(%r10),%xmm4 2468 movdqa -80(%rdi),%xmm3 2469 pand 128(%r10),%xmm5 2470 por %xmm4,%xmm0 2471 pand 144(%r10),%xmm2 2472 por %xmm5,%xmm1 2473 pand 160(%r10),%xmm3 2474 por %xmm2,%xmm0 2475 por %xmm3,%xmm1 2476 movdqa -64(%rdi),%xmm4 2477 movdqa -48(%rdi),%xmm5 2478 movdqa -32(%rdi),%xmm2 2479 pand 176(%r10),%xmm4 2480 movdqa -16(%rdi),%xmm3 2481 pand 192(%r10),%xmm5 2482 por %xmm4,%xmm0 2483 pand 208(%r10),%xmm2 2484 por %xmm5,%xmm1 2485 pand 224(%r10),%xmm3 2486 por %xmm2,%xmm0 2487 por %xmm3,%xmm1 2488 movdqa 0(%rdi),%xmm4 2489 movdqa 16(%rdi),%xmm5 2490 movdqa 32(%rdi),%xmm2 2491 pand 240(%r10),%xmm4 2492 movdqa 48(%rdi),%xmm3 2493 pand 256(%r10),%xmm5 2494 por %xmm4,%xmm0 2495 pand 272(%r10),%xmm2 2496 por %xmm5,%xmm1 2497 pand 288(%r10),%xmm3 2498 por %xmm2,%xmm0 2499 por %xmm3,%xmm1 2500 pxor %xmm1,%xmm0 2501 pshufd $0x4e,%xmm0,%xmm1 2502 por %xmm1,%xmm0 2503 leaq 256(%rdi),%rdi 2504.byte 102,72,15,126,194 2505 leaq 64+32+8(%rsp),%rbx 2506 2507 movq %rdx,%r9 2508 mulxq 0(%rsi),%r8,%rax 2509 mulxq 8(%rsi),%r11,%r12 2510 addq %rax,%r11 2511 mulxq 16(%rsi),%rax,%r13 2512 adcq %rax,%r12 2513 adcq $0,%r13 2514 mulxq 24(%rsi),%rax,%r14 2515 2516 movq %r8,%r15 2517 imulq 32+8(%rsp),%r8 2518 xorq %rbp,%rbp 2519 movq %r8,%rdx 2520 2521 movq %rdi,8+8(%rsp) 2522 2523 leaq 32(%rsi),%rsi 2524 adcxq %rax,%r13 2525 adcxq %rbp,%r14 2526 2527 mulxq 
0(%rcx),%rax,%r10 2528 adcxq %rax,%r15 2529 adoxq %r11,%r10 2530 mulxq 8(%rcx),%rax,%r11 2531 adcxq %rax,%r10 2532 adoxq %r12,%r11 2533 mulxq 16(%rcx),%rax,%r12 2534 movq 24+8(%rsp),%rdi 2535 movq %r10,-32(%rbx) 2536 adcxq %rax,%r11 2537 adoxq %r13,%r12 2538 mulxq 24(%rcx),%rax,%r15 2539 movq %r9,%rdx 2540 movq %r11,-24(%rbx) 2541 adcxq %rax,%r12 2542 adoxq %rbp,%r15 2543 leaq 32(%rcx),%rcx 2544 movq %r12,-16(%rbx) 2545 jmp L$mulx4x_1st 2546 2547.p2align 5 2548L$mulx4x_1st: 2549 adcxq %rbp,%r15 2550 mulxq 0(%rsi),%r10,%rax 2551 adcxq %r14,%r10 2552 mulxq 8(%rsi),%r11,%r14 2553 adcxq %rax,%r11 2554 mulxq 16(%rsi),%r12,%rax 2555 adcxq %r14,%r12 2556 mulxq 24(%rsi),%r13,%r14 2557.byte 0x67,0x67 2558 movq %r8,%rdx 2559 adcxq %rax,%r13 2560 adcxq %rbp,%r14 2561 leaq 32(%rsi),%rsi 2562 leaq 32(%rbx),%rbx 2563 2564 adoxq %r15,%r10 2565 mulxq 0(%rcx),%rax,%r15 2566 adcxq %rax,%r10 2567 adoxq %r15,%r11 2568 mulxq 8(%rcx),%rax,%r15 2569 adcxq %rax,%r11 2570 adoxq %r15,%r12 2571 mulxq 16(%rcx),%rax,%r15 2572 movq %r10,-40(%rbx) 2573 adcxq %rax,%r12 2574 movq %r11,-32(%rbx) 2575 adoxq %r15,%r13 2576 mulxq 24(%rcx),%rax,%r15 2577 movq %r9,%rdx 2578 movq %r12,-24(%rbx) 2579 adcxq %rax,%r13 2580 adoxq %rbp,%r15 2581 leaq 32(%rcx),%rcx 2582 movq %r13,-16(%rbx) 2583 2584 decq %rdi 2585 jnz L$mulx4x_1st 2586 2587 movq 8(%rsp),%rax 2588 adcq %rbp,%r15 2589 leaq (%rsi,%rax,1),%rsi 2590 addq %r15,%r14 2591 movq 8+8(%rsp),%rdi 2592 adcq %rbp,%rbp 2593 movq %r14,-8(%rbx) 2594 jmp L$mulx4x_outer 2595 2596.p2align 5 2597L$mulx4x_outer: 2598 leaq 16-256(%rbx),%r10 2599 pxor %xmm4,%xmm4 2600.byte 0x67,0x67 2601 pxor %xmm5,%xmm5 2602 movdqa -128(%rdi),%xmm0 2603 movdqa -112(%rdi),%xmm1 2604 movdqa -96(%rdi),%xmm2 2605 pand 256(%r10),%xmm0 2606 movdqa -80(%rdi),%xmm3 2607 pand 272(%r10),%xmm1 2608 por %xmm0,%xmm4 2609 pand 288(%r10),%xmm2 2610 por %xmm1,%xmm5 2611 pand 304(%r10),%xmm3 2612 por %xmm2,%xmm4 2613 por %xmm3,%xmm5 2614 movdqa -64(%rdi),%xmm0 2615 movdqa -48(%rdi),%xmm1 2616 movdqa -32(%rdi),%xmm2 2617 pand 320(%r10),%xmm0 2618 movdqa -16(%rdi),%xmm3 2619 pand 336(%r10),%xmm1 2620 por %xmm0,%xmm4 2621 pand 352(%r10),%xmm2 2622 por %xmm1,%xmm5 2623 pand 368(%r10),%xmm3 2624 por %xmm2,%xmm4 2625 por %xmm3,%xmm5 2626 movdqa 0(%rdi),%xmm0 2627 movdqa 16(%rdi),%xmm1 2628 movdqa 32(%rdi),%xmm2 2629 pand 384(%r10),%xmm0 2630 movdqa 48(%rdi),%xmm3 2631 pand 400(%r10),%xmm1 2632 por %xmm0,%xmm4 2633 pand 416(%r10),%xmm2 2634 por %xmm1,%xmm5 2635 pand 432(%r10),%xmm3 2636 por %xmm2,%xmm4 2637 por %xmm3,%xmm5 2638 movdqa 64(%rdi),%xmm0 2639 movdqa 80(%rdi),%xmm1 2640 movdqa 96(%rdi),%xmm2 2641 pand 448(%r10),%xmm0 2642 movdqa 112(%rdi),%xmm3 2643 pand 464(%r10),%xmm1 2644 por %xmm0,%xmm4 2645 pand 480(%r10),%xmm2 2646 por %xmm1,%xmm5 2647 pand 496(%r10),%xmm3 2648 por %xmm2,%xmm4 2649 por %xmm3,%xmm5 2650 por %xmm5,%xmm4 2651 pshufd $0x4e,%xmm4,%xmm0 2652 por %xmm4,%xmm0 2653 leaq 256(%rdi),%rdi 2654.byte 102,72,15,126,194 2655 2656 movq %rbp,(%rbx) 2657 leaq 32(%rbx,%rax,1),%rbx 2658 mulxq 0(%rsi),%r8,%r11 2659 xorq %rbp,%rbp 2660 movq %rdx,%r9 2661 mulxq 8(%rsi),%r14,%r12 2662 adoxq -32(%rbx),%r8 2663 adcxq %r14,%r11 2664 mulxq 16(%rsi),%r15,%r13 2665 adoxq -24(%rbx),%r11 2666 adcxq %r15,%r12 2667 mulxq 24(%rsi),%rdx,%r14 2668 adoxq -16(%rbx),%r12 2669 adcxq %rdx,%r13 2670 leaq (%rcx,%rax,1),%rcx 2671 leaq 32(%rsi),%rsi 2672 adoxq -8(%rbx),%r13 2673 adcxq %rbp,%r14 2674 adoxq %rbp,%r14 2675 2676 movq %r8,%r15 2677 imulq 32+8(%rsp),%r8 2678 2679 movq %r8,%rdx 2680 xorq %rbp,%rbp 2681 movq %rdi,8+8(%rsp) 2682 2683 
mulxq 0(%rcx),%rax,%r10 2684 adcxq %rax,%r15 2685 adoxq %r11,%r10 2686 mulxq 8(%rcx),%rax,%r11 2687 adcxq %rax,%r10 2688 adoxq %r12,%r11 2689 mulxq 16(%rcx),%rax,%r12 2690 adcxq %rax,%r11 2691 adoxq %r13,%r12 2692 mulxq 24(%rcx),%rax,%r15 2693 movq %r9,%rdx 2694 movq 24+8(%rsp),%rdi 2695 movq %r10,-32(%rbx) 2696 adcxq %rax,%r12 2697 movq %r11,-24(%rbx) 2698 adoxq %rbp,%r15 2699 movq %r12,-16(%rbx) 2700 leaq 32(%rcx),%rcx 2701 jmp L$mulx4x_inner 2702 2703.p2align 5 2704L$mulx4x_inner: 2705 mulxq 0(%rsi),%r10,%rax 2706 adcxq %rbp,%r15 2707 adoxq %r14,%r10 2708 mulxq 8(%rsi),%r11,%r14 2709 adcxq 0(%rbx),%r10 2710 adoxq %rax,%r11 2711 mulxq 16(%rsi),%r12,%rax 2712 adcxq 8(%rbx),%r11 2713 adoxq %r14,%r12 2714 mulxq 24(%rsi),%r13,%r14 2715 movq %r8,%rdx 2716 adcxq 16(%rbx),%r12 2717 adoxq %rax,%r13 2718 adcxq 24(%rbx),%r13 2719 adoxq %rbp,%r14 2720 leaq 32(%rsi),%rsi 2721 leaq 32(%rbx),%rbx 2722 adcxq %rbp,%r14 2723 2724 adoxq %r15,%r10 2725 mulxq 0(%rcx),%rax,%r15 2726 adcxq %rax,%r10 2727 adoxq %r15,%r11 2728 mulxq 8(%rcx),%rax,%r15 2729 adcxq %rax,%r11 2730 adoxq %r15,%r12 2731 mulxq 16(%rcx),%rax,%r15 2732 movq %r10,-40(%rbx) 2733 adcxq %rax,%r12 2734 adoxq %r15,%r13 2735 movq %r11,-32(%rbx) 2736 mulxq 24(%rcx),%rax,%r15 2737 movq %r9,%rdx 2738 leaq 32(%rcx),%rcx 2739 movq %r12,-24(%rbx) 2740 adcxq %rax,%r13 2741 adoxq %rbp,%r15 2742 movq %r13,-16(%rbx) 2743 2744 decq %rdi 2745 jnz L$mulx4x_inner 2746 2747 movq 0+8(%rsp),%rax 2748 adcq %rbp,%r15 2749 subq 0(%rbx),%rdi 2750 movq 8+8(%rsp),%rdi 2751 movq 16+8(%rsp),%r10 2752 adcq %r15,%r14 2753 leaq (%rsi,%rax,1),%rsi 2754 adcq %rbp,%rbp 2755 movq %r14,-8(%rbx) 2756 2757 cmpq %r10,%rdi 2758 jb L$mulx4x_outer 2759 2760 movq -8(%rcx),%r10 2761 movq %rbp,%r8 2762 movq (%rcx,%rax,1),%r12 2763 leaq (%rcx,%rax,1),%rbp 2764 movq %rax,%rcx 2765 leaq (%rbx,%rax,1),%rdi 2766 xorl %eax,%eax 2767 xorq %r15,%r15 2768 subq %r14,%r10 2769 adcq %r15,%r15 2770 orq %r15,%r8 2771 sarq $3+2,%rcx 2772 subq %r8,%rax 2773 movq 56+8(%rsp),%rdx 2774 decq %r12 2775 movq 8(%rbp),%r13 2776 xorq %r8,%r8 2777 movq 16(%rbp),%r14 2778 movq 24(%rbp),%r15 2779 jmp L$sqrx4x_sub_entry 2780 2781 2782 2783.p2align 5 2784bn_powerx5: 2785 2786 movq %rsp,%rax 2787 2788L$powerx5_enter: 2789 pushq %rbx 2790 2791 pushq %rbp 2792 2793 pushq %r12 2794 2795 pushq %r13 2796 2797 pushq %r14 2798 2799 pushq %r15 2800 2801L$powerx5_prologue: 2802 2803 shll $3,%r9d 2804 leaq (%r9,%r9,2),%r10 2805 negq %r9 2806 movq (%r8),%r8 2807 2808 2809 2810 2811 2812 2813 2814 2815 leaq -320(%rsp,%r9,2),%r11 2816 movq %rsp,%rbp 2817 subq %rdi,%r11 2818 andq $4095,%r11 2819 cmpq %r11,%r10 2820 jb L$pwrx_sp_alt 2821 subq %r11,%rbp 2822 leaq -320(%rbp,%r9,2),%rbp 2823 jmp L$pwrx_sp_done 2824 2825.p2align 5 2826L$pwrx_sp_alt: 2827 leaq 4096-320(,%r9,2),%r10 2828 leaq -320(%rbp,%r9,2),%rbp 2829 subq %r10,%r11 2830 movq $0,%r10 2831 cmovcq %r10,%r11 2832 subq %r11,%rbp 2833L$pwrx_sp_done: 2834 andq $-64,%rbp 2835 movq %rsp,%r11 2836 subq %rbp,%r11 2837 andq $-4096,%r11 2838 leaq (%r11,%rbp,1),%rsp 2839 movq (%rsp),%r10 2840 cmpq %rbp,%rsp 2841 ja L$pwrx_page_walk 2842 jmp L$pwrx_page_walk_done 2843 2844L$pwrx_page_walk: 2845 leaq -4096(%rsp),%rsp 2846 movq (%rsp),%r10 2847 cmpq %rbp,%rsp 2848 ja L$pwrx_page_walk 2849L$pwrx_page_walk_done: 2850 2851 movq %r9,%r10 2852 negq %r9 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 pxor %xmm0,%xmm0 2866.byte 102,72,15,110,207 2867.byte 102,72,15,110,209 2868.byte 102,73,15,110,218 2869.byte 102,72,15,110,226 2870 movq %r8,32(%rsp) 2871 movq %rax,40(%rsp) 
2872 2873L$powerx5_body: 2874 2875 call __bn_sqrx8x_internal 2876 call __bn_postx4x_internal 2877 call __bn_sqrx8x_internal 2878 call __bn_postx4x_internal 2879 call __bn_sqrx8x_internal 2880 call __bn_postx4x_internal 2881 call __bn_sqrx8x_internal 2882 call __bn_postx4x_internal 2883 call __bn_sqrx8x_internal 2884 call __bn_postx4x_internal 2885 2886 movq %r10,%r9 2887 movq %rsi,%rdi 2888.byte 102,72,15,126,209 2889.byte 102,72,15,126,226 2890 movq 40(%rsp),%rax 2891 2892 call mulx4x_internal 2893 2894 movq 40(%rsp),%rsi 2895 2896 movq $1,%rax 2897 2898 movq -48(%rsi),%r15 2899 2900 movq -40(%rsi),%r14 2901 2902 movq -32(%rsi),%r13 2903 2904 movq -24(%rsi),%r12 2905 2906 movq -16(%rsi),%rbp 2907 2908 movq -8(%rsi),%rbx 2909 2910 leaq (%rsi),%rsp 2911 2912L$powerx5_epilogue: 2913 .byte 0xf3,0xc3 2914 2915 2916 2917.globl _GFp_bn_sqrx8x_internal 2918.private_extern _GFp_bn_sqrx8x_internal 2919 2920.p2align 5 2921_GFp_bn_sqrx8x_internal: 2922__bn_sqrx8x_internal: 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 leaq 48+8(%rsp),%rdi 2965 leaq (%rsi,%r9,1),%rbp 2966 movq %r9,0+8(%rsp) 2967 movq %rbp,8+8(%rsp) 2968 jmp L$sqr8x_zero_start 2969 2970.p2align 5 2971.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2972L$sqrx8x_zero: 2973.byte 0x3e 2974 movdqa %xmm0,0(%rdi) 2975 movdqa %xmm0,16(%rdi) 2976 movdqa %xmm0,32(%rdi) 2977 movdqa %xmm0,48(%rdi) 2978L$sqr8x_zero_start: 2979 movdqa %xmm0,64(%rdi) 2980 movdqa %xmm0,80(%rdi) 2981 movdqa %xmm0,96(%rdi) 2982 movdqa %xmm0,112(%rdi) 2983 leaq 128(%rdi),%rdi 2984 subq $64,%r9 2985 jnz L$sqrx8x_zero 2986 2987 movq 0(%rsi),%rdx 2988 2989 xorq %r10,%r10 2990 xorq %r11,%r11 2991 xorq %r12,%r12 2992 xorq %r13,%r13 2993 xorq %r14,%r14 2994 xorq %r15,%r15 2995 leaq 48+8(%rsp),%rdi 2996 xorq %rbp,%rbp 2997 jmp L$sqrx8x_outer_loop 2998 2999.p2align 5 3000L$sqrx8x_outer_loop: 3001 mulxq 8(%rsi),%r8,%rax 3002 adcxq %r9,%r8 3003 adoxq %rax,%r10 3004 mulxq 16(%rsi),%r9,%rax 3005 adcxq %r10,%r9 3006 adoxq %rax,%r11 3007.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 3008 adcxq %r11,%r10 3009 adoxq %rax,%r12 3010.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 3011 adcxq %r12,%r11 3012 adoxq %rax,%r13 3013 mulxq 40(%rsi),%r12,%rax 3014 adcxq %r13,%r12 3015 adoxq %rax,%r14 3016 mulxq 48(%rsi),%r13,%rax 3017 adcxq %r14,%r13 3018 adoxq %r15,%rax 3019 mulxq 56(%rsi),%r14,%r15 3020 movq 8(%rsi),%rdx 3021 adcxq %rax,%r14 3022 adoxq %rbp,%r15 3023 adcq 64(%rdi),%r15 3024 movq %r8,8(%rdi) 3025 movq %r9,16(%rdi) 3026 sbbq %rcx,%rcx 3027 xorq %rbp,%rbp 3028 3029 3030 mulxq 16(%rsi),%r8,%rbx 3031 mulxq 24(%rsi),%r9,%rax 3032 adcxq %r10,%r8 3033 adoxq %rbx,%r9 3034 mulxq 32(%rsi),%r10,%rbx 3035 adcxq %r11,%r9 3036 adoxq %rax,%r10 3037.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 3038 adcxq %r12,%r10 3039 adoxq %rbx,%r11 3040.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 3041 adcxq %r13,%r11 3042 adoxq %r14,%r12 3043.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 3044 movq 16(%rsi),%rdx 3045 adcxq %rax,%r12 3046 adoxq %rbx,%r13 3047 adcxq %r15,%r13 3048 adoxq %rbp,%r14 3049 adcxq %rbp,%r14 3050 3051 movq %r8,24(%rdi) 3052 movq %r9,32(%rdi) 3053 3054 mulxq 24(%rsi),%r8,%rbx 3055 mulxq 32(%rsi),%r9,%rax 3056 adcxq %r10,%r8 3057 adoxq %rbx,%r9 3058 mulxq 40(%rsi),%r10,%rbx 3059 adcxq %r11,%r9 3060 adoxq %rax,%r10 3061.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 3062 adcxq %r12,%r10 
3063 adoxq %r13,%r11 3064.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 3065.byte 0x3e 3066 movq 24(%rsi),%rdx 3067 adcxq %rbx,%r11 3068 adoxq %rax,%r12 3069 adcxq %r14,%r12 3070 movq %r8,40(%rdi) 3071 movq %r9,48(%rdi) 3072 mulxq 32(%rsi),%r8,%rax 3073 adoxq %rbp,%r13 3074 adcxq %rbp,%r13 3075 3076 mulxq 40(%rsi),%r9,%rbx 3077 adcxq %r10,%r8 3078 adoxq %rax,%r9 3079 mulxq 48(%rsi),%r10,%rax 3080 adcxq %r11,%r9 3081 adoxq %r12,%r10 3082 mulxq 56(%rsi),%r11,%r12 3083 movq 32(%rsi),%rdx 3084 movq 40(%rsi),%r14 3085 adcxq %rbx,%r10 3086 adoxq %rax,%r11 3087 movq 48(%rsi),%r15 3088 adcxq %r13,%r11 3089 adoxq %rbp,%r12 3090 adcxq %rbp,%r12 3091 3092 movq %r8,56(%rdi) 3093 movq %r9,64(%rdi) 3094 3095 mulxq %r14,%r9,%rax 3096 movq 56(%rsi),%r8 3097 adcxq %r10,%r9 3098 mulxq %r15,%r10,%rbx 3099 adoxq %rax,%r10 3100 adcxq %r11,%r10 3101 mulxq %r8,%r11,%rax 3102 movq %r14,%rdx 3103 adoxq %rbx,%r11 3104 adcxq %r12,%r11 3105 3106 adcxq %rbp,%rax 3107 3108 mulxq %r15,%r14,%rbx 3109 mulxq %r8,%r12,%r13 3110 movq %r15,%rdx 3111 leaq 64(%rsi),%rsi 3112 adcxq %r14,%r11 3113 adoxq %rbx,%r12 3114 adcxq %rax,%r12 3115 adoxq %rbp,%r13 3116 3117.byte 0x67,0x67 3118 mulxq %r8,%r8,%r14 3119 adcxq %r8,%r13 3120 adcxq %rbp,%r14 3121 3122 cmpq 8+8(%rsp),%rsi 3123 je L$sqrx8x_outer_break 3124 3125 negq %rcx 3126 movq $-8,%rcx 3127 movq %rbp,%r15 3128 movq 64(%rdi),%r8 3129 adcxq 72(%rdi),%r9 3130 adcxq 80(%rdi),%r10 3131 adcxq 88(%rdi),%r11 3132 adcq 96(%rdi),%r12 3133 adcq 104(%rdi),%r13 3134 adcq 112(%rdi),%r14 3135 adcq 120(%rdi),%r15 3136 leaq (%rsi),%rbp 3137 leaq 128(%rdi),%rdi 3138 sbbq %rax,%rax 3139 3140 movq -64(%rsi),%rdx 3141 movq %rax,16+8(%rsp) 3142 movq %rdi,24+8(%rsp) 3143 3144 3145 xorl %eax,%eax 3146 jmp L$sqrx8x_loop 3147 3148.p2align 5 3149L$sqrx8x_loop: 3150 movq %r8,%rbx 3151 mulxq 0(%rbp),%rax,%r8 3152 adcxq %rax,%rbx 3153 adoxq %r9,%r8 3154 3155 mulxq 8(%rbp),%rax,%r9 3156 adcxq %rax,%r8 3157 adoxq %r10,%r9 3158 3159 mulxq 16(%rbp),%rax,%r10 3160 adcxq %rax,%r9 3161 adoxq %r11,%r10 3162 3163 mulxq 24(%rbp),%rax,%r11 3164 adcxq %rax,%r10 3165 adoxq %r12,%r11 3166 3167.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3168 adcxq %rax,%r11 3169 adoxq %r13,%r12 3170 3171 mulxq 40(%rbp),%rax,%r13 3172 adcxq %rax,%r12 3173 adoxq %r14,%r13 3174 3175 mulxq 48(%rbp),%rax,%r14 3176 movq %rbx,(%rdi,%rcx,8) 3177 movl $0,%ebx 3178 adcxq %rax,%r13 3179 adoxq %r15,%r14 3180 3181.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 3182 movq 8(%rsi,%rcx,8),%rdx 3183 adcxq %rax,%r14 3184 adoxq %rbx,%r15 3185 adcxq %rbx,%r15 3186 3187.byte 0x67 3188 incq %rcx 3189 jnz L$sqrx8x_loop 3190 3191 leaq 64(%rbp),%rbp 3192 movq $-8,%rcx 3193 cmpq 8+8(%rsp),%rbp 3194 je L$sqrx8x_break 3195 3196 subq 16+8(%rsp),%rbx 3197.byte 0x66 3198 movq -64(%rsi),%rdx 3199 adcxq 0(%rdi),%r8 3200 adcxq 8(%rdi),%r9 3201 adcq 16(%rdi),%r10 3202 adcq 24(%rdi),%r11 3203 adcq 32(%rdi),%r12 3204 adcq 40(%rdi),%r13 3205 adcq 48(%rdi),%r14 3206 adcq 56(%rdi),%r15 3207 leaq 64(%rdi),%rdi 3208.byte 0x67 3209 sbbq %rax,%rax 3210 xorl %ebx,%ebx 3211 movq %rax,16+8(%rsp) 3212 jmp L$sqrx8x_loop 3213 3214.p2align 5 3215L$sqrx8x_break: 3216 xorq %rbp,%rbp 3217 subq 16+8(%rsp),%rbx 3218 adcxq %rbp,%r8 3219 movq 24+8(%rsp),%rcx 3220 adcxq %rbp,%r9 3221 movq 0(%rsi),%rdx 3222 adcq $0,%r10 3223 movq %r8,0(%rdi) 3224 adcq $0,%r11 3225 adcq $0,%r12 3226 adcq $0,%r13 3227 adcq $0,%r14 3228 adcq $0,%r15 3229 cmpq %rcx,%rdi 3230 je L$sqrx8x_outer_loop 3231 3232 movq %r9,8(%rdi) 3233 movq 8(%rcx),%r9 3234 movq %r10,16(%rdi) 3235 movq 16(%rcx),%r10 3236 movq 
    movq 24(%rcx),%r11
    movq %r12,32(%rdi)
    movq 32(%rcx),%r12
    movq %r13,40(%rdi)
    movq 40(%rcx),%r13
    movq %r14,48(%rdi)
    movq 48(%rcx),%r14
    movq %r15,56(%rdi)
    movq 56(%rcx),%r15
    movq %rcx,%rdi
    jmp L$sqrx8x_outer_loop

.p2align 5
L$sqrx8x_outer_break:
    movq %r9,72(%rdi)
.byte 102,72,15,126,217
    movq %r10,80(%rdi)
    movq %r11,88(%rdi)
    movq %r12,96(%rdi)
    movq %r13,104(%rdi)
    movq %r14,112(%rdi)
    leaq 48+8(%rsp),%rdi
    movq (%rsi,%rcx,1),%rdx

    movq 8(%rdi),%r11
    xorq %r10,%r10
    movq 0+8(%rsp),%r9
    adoxq %r11,%r11
    movq 16(%rdi),%r12
    movq 24(%rdi),%r13

.p2align 5
L$sqrx4x_shift_n_add:
    mulxq %rdx,%rax,%rbx
    adoxq %r12,%r12
    adcxq %r10,%rax
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
    adoxq %r13,%r13
    adcxq %r11,%rbx
    movq 40(%rdi),%r11
    movq %rax,0(%rdi)
    movq %rbx,8(%rdi)

    mulxq %rdx,%rax,%rbx
    adoxq %r10,%r10
    adcxq %r12,%rax
    movq 16(%rsi,%rcx,1),%rdx
    movq 48(%rdi),%r12
    adoxq %r11,%r11
    adcxq %r13,%rbx
    movq 56(%rdi),%r13
    movq %rax,16(%rdi)
    movq %rbx,24(%rdi)

    mulxq %rdx,%rax,%rbx
    adoxq %r12,%r12
    adcxq %r10,%rax
    movq 24(%rsi,%rcx,1),%rdx
    leaq 32(%rcx),%rcx
    movq 64(%rdi),%r10
    adoxq %r13,%r13
    adcxq %r11,%rbx
    movq 72(%rdi),%r11
    movq %rax,32(%rdi)
    movq %rbx,40(%rdi)

    mulxq %rdx,%rax,%rbx
    adoxq %r10,%r10
    adcxq %r12,%rax
    jrcxz L$sqrx4x_shift_n_add_break
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
    adoxq %r11,%r11
    adcxq %r13,%rbx
    movq 80(%rdi),%r12
    movq 88(%rdi),%r13
    movq %rax,48(%rdi)
    movq %rbx,56(%rdi)
    leaq 64(%rdi),%rdi
    nop
    jmp L$sqrx4x_shift_n_add

.p2align 5
L$sqrx4x_shift_n_add_break:
    adcxq %r13,%rbx
    movq %rax,48(%rdi)
    movq %rbx,56(%rdi)
    leaq 64(%rdi),%rdi
.byte 102,72,15,126,213
__bn_sqrx8x_reduction:
    xorl %eax,%eax
    movq 32+8(%rsp),%rbx
    movq 48+8(%rsp),%rdx
    leaq -64(%rbp,%r9,1),%rcx

    movq %rcx,0+8(%rsp)
    movq %rdi,8+8(%rsp)

    leaq 48+8(%rsp),%rdi
    jmp L$sqrx8x_reduction_loop

.p2align 5
L$sqrx8x_reduction_loop:
    movq 8(%rdi),%r9
    movq 16(%rdi),%r10
    movq 24(%rdi),%r11
    movq 32(%rdi),%r12
    movq %rdx,%r8
    imulq %rbx,%rdx
    movq 40(%rdi),%r13
    movq 48(%rdi),%r14
    movq 56(%rdi),%r15
    movq %rax,24+8(%rsp)

    leaq 64(%rdi),%rdi
    xorq %rsi,%rsi
    movq $-8,%rcx
    jmp L$sqrx8x_reduce

.p2align 5
L$sqrx8x_reduce:
    movq %r8,%rbx
    mulxq 0(%rbp),%rax,%r8
    adcxq %rbx,%rax
    adoxq %r9,%r8

    mulxq 8(%rbp),%rbx,%r9
    adcxq %rbx,%r8
    adoxq %r10,%r9

    mulxq 16(%rbp),%rbx,%r10
    adcxq %rbx,%r9
    adoxq %r11,%r10

    mulxq 24(%rbp),%rbx,%r11
    adcxq %rbx,%r10
    adoxq %r12,%r11

.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
    movq %rdx,%rax
    movq %r8,%rdx
    adcxq %rbx,%r11
    adoxq %r13,%r12

    mulxq 32+8(%rsp),%rbx,%rdx
    movq %rax,%rdx
    movq %rax,64+48+8(%rsp,%rcx,8)

    mulxq 40(%rbp),%rax,%r13
    adcxq %rax,%r12
    adoxq %r14,%r13

    mulxq 48(%rbp),%rax,%r14
    adcxq %rax,%r13
    adoxq %r15,%r14

    mulxq 56(%rbp),%rax,%r15
    movq %rbx,%rdx
    adcxq %rax,%r14
    adoxq %rsi,%r15
    adcxq %rsi,%r15

.byte 0x67,0x67,0x67
    incq %rcx
    jnz L$sqrx8x_reduce

    movq %rsi,%rax
    cmpq 0+8(%rsp),%rbp
    jae L$sqrx8x_no_tail

    movq 48+8(%rsp),%rdx
    addq 0(%rdi),%r8
    leaq 64(%rbp),%rbp
    movq $-8,%rcx
    adcxq 8(%rdi),%r9
    adcxq 16(%rdi),%r10
    adcq 24(%rdi),%r11
    adcq 32(%rdi),%r12
    adcq 40(%rdi),%r13
    adcq 48(%rdi),%r14
    adcq 56(%rdi),%r15
    leaq 64(%rdi),%rdi
    sbbq %rax,%rax

    xorq %rsi,%rsi
    movq %rax,16+8(%rsp)
    jmp L$sqrx8x_tail

.p2align 5
L$sqrx8x_tail:
    movq %r8,%rbx
    mulxq 0(%rbp),%rax,%r8
    adcxq %rax,%rbx
    adoxq %r9,%r8

    mulxq 8(%rbp),%rax,%r9
    adcxq %rax,%r8
    adoxq %r10,%r9

    mulxq 16(%rbp),%rax,%r10
    adcxq %rax,%r9
    adoxq %r11,%r10

    mulxq 24(%rbp),%rax,%r11
    adcxq %rax,%r10
    adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
    adcxq %rax,%r11
    adoxq %r13,%r12

    mulxq 40(%rbp),%rax,%r13
    adcxq %rax,%r12
    adoxq %r14,%r13

    mulxq 48(%rbp),%rax,%r14
    adcxq %rax,%r13
    adoxq %r15,%r14

    mulxq 56(%rbp),%rax,%r15
    movq 72+48+8(%rsp,%rcx,8),%rdx
    adcxq %rax,%r14
    adoxq %rsi,%r15
    movq %rbx,(%rdi,%rcx,8)
    movq %r8,%rbx
    adcxq %rsi,%r15

    incq %rcx
    jnz L$sqrx8x_tail

    cmpq 0+8(%rsp),%rbp
    jae L$sqrx8x_tail_done

    subq 16+8(%rsp),%rsi
    movq 48+8(%rsp),%rdx
    leaq 64(%rbp),%rbp
    adcq 0(%rdi),%r8
    adcq 8(%rdi),%r9
    adcq 16(%rdi),%r10
    adcq 24(%rdi),%r11
    adcq 32(%rdi),%r12
    adcq 40(%rdi),%r13
    adcq 48(%rdi),%r14
    adcq 56(%rdi),%r15
    leaq 64(%rdi),%rdi
    sbbq %rax,%rax
    subq $8,%rcx

    xorq %rsi,%rsi
    movq %rax,16+8(%rsp)
    jmp L$sqrx8x_tail

.p2align 5
L$sqrx8x_tail_done:
    xorq %rax,%rax
    addq 24+8(%rsp),%r8
    adcq $0,%r9
    adcq $0,%r10
    adcq $0,%r11
    adcq $0,%r12
    adcq $0,%r13
    adcq $0,%r14
    adcq $0,%r15
    adcq $0,%rax

    subq 16+8(%rsp),%rsi
L$sqrx8x_no_tail:
    adcq 0(%rdi),%r8
.byte 102,72,15,126,217
    adcq 8(%rdi),%r9
    movq 56(%rbp),%rsi
.byte 102,72,15,126,213
    adcq 16(%rdi),%r10
    adcq 24(%rdi),%r11
    adcq 32(%rdi),%r12
    adcq 40(%rdi),%r13
    adcq 48(%rdi),%r14
    adcq 56(%rdi),%r15
    adcq $0,%rax

    movq 32+8(%rsp),%rbx
    movq 64(%rdi,%rcx,1),%rdx

    movq %r8,0(%rdi)
    leaq 64(%rdi),%r8
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)
    movq %r12,32(%rdi)
    movq %r13,40(%rdi)
    movq %r14,48(%rdi)
    movq %r15,56(%rdi)

    leaq 64(%rdi,%rcx,1),%rdi
    cmpq 8+8(%rsp),%r8
    jb L$sqrx8x_reduction_loop
    .byte 0xf3,0xc3


# Branch-free final reduction: conditionally subtracts the modulus at %rbp from
# the result, four limbs per iteration, selecting with the all-ones/all-zeros
# mask in %rax.
.p2align 5

__bn_postx4x_internal:

    movq 0(%rbp),%r12
    movq %rcx,%r10
    movq %rcx,%r9
    negq %rax
    sarq $3+2,%rcx

.byte 102,72,15,126,202
.byte 102,72,15,126,206
    decq %r12
    movq 8(%rbp),%r13
    xorq %r8,%r8
    movq 16(%rbp),%r14
    movq 24(%rbp),%r15
    jmp L$sqrx4x_sub_entry

.p2align 4
L$sqrx4x_sub:
    movq 0(%rbp),%r12
    movq 8(%rbp),%r13
    movq 16(%rbp),%r14
    movq 24(%rbp),%r15
L$sqrx4x_sub_entry:
    andnq %rax,%r12,%r12
    leaq 32(%rbp),%rbp
    andnq %rax,%r13,%r13
    andnq %rax,%r14,%r14
    andnq %rax,%r15,%r15

    negq %r8
    adcq 0(%rdi),%r12
    adcq 8(%rdi),%r13
    adcq 16(%rdi),%r14
    adcq 24(%rdi),%r15
    movq %r12,0(%rdx)
    leaq 32(%rdi),%rdi
    movq %r13,8(%rdx)
    sbbq %r8,%r8
    movq %r14,16(%rdx)
    movq %r15,24(%rdx)
    leaq 32(%rdx),%rdx

    incq %rcx
    jnz L$sqrx4x_sub

    negq %r9

    .byte 0xf3,0xc3


# Stores an %esi-limb value from %rdi into slot %rcx of the 32-entry window
# table at %rdx; successive limbs of one entry are 256 bytes (32 slots x 8
# bytes) apart.
.globl _GFp_bn_scatter5
.private_extern _GFp_bn_scatter5

.p2align 4
_GFp_bn_scatter5:

    cmpl $0,%esi
    jz L$scatter_epilogue
    leaq (%rdx,%rcx,8),%rdx
L$scatter:
    movq (%rdi),%rax
    leaq 8(%rdi),%rdi
    movq %rax,(%rdx)
    leaq 256(%rdx),%rdx
    subl $1,%esi
    jnz L$scatter
L$scatter_epilogue:
    .byte 0xf3,0xc3


# Constant-time gather: builds SSE2 equality masks from the index in %ecx, then
# reads every slot of the table at %rdx and masks away all but the selected
# entry, so the memory access pattern does not depend on the (secret) index.
.globl _GFp_bn_gather5
.private_extern _GFp_bn_gather5

.p2align 5
_GFp_bn_gather5:

L$SEH_begin_GFp_bn_gather5:

.byte 0x4c,0x8d,0x14,0x24

.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
    leaq L$inc(%rip),%rax
    andq $-16,%rsp

    movd %ecx,%xmm5
    movdqa 0(%rax),%xmm0
    movdqa 16(%rax),%xmm1
    leaq 128(%rdx),%r11
    leaq 128(%rsp),%rax

    pshufd $0,%xmm5,%xmm5
    movdqa %xmm1,%xmm4
    movdqa %xmm1,%xmm2
    paddd %xmm0,%xmm1
    pcmpeqd %xmm5,%xmm0
    movdqa %xmm4,%xmm3

    paddd %xmm1,%xmm2
    pcmpeqd %xmm5,%xmm1
    movdqa %xmm0,-128(%rax)
    movdqa %xmm4,%xmm0

    paddd %xmm2,%xmm3
    pcmpeqd %xmm5,%xmm2
    movdqa %xmm1,-112(%rax)
    movdqa %xmm4,%xmm1

    paddd %xmm3,%xmm0
    pcmpeqd %xmm5,%xmm3
    movdqa %xmm2,-96(%rax)
    movdqa %xmm4,%xmm2
    paddd %xmm0,%xmm1
    pcmpeqd %xmm5,%xmm0
    movdqa %xmm3,-80(%rax)
    movdqa %xmm4,%xmm3

    paddd %xmm1,%xmm2
    pcmpeqd %xmm5,%xmm1
    movdqa %xmm0,-64(%rax)
    movdqa %xmm4,%xmm0

    paddd %xmm2,%xmm3
    pcmpeqd %xmm5,%xmm2
    movdqa %xmm1,-48(%rax)
    movdqa %xmm4,%xmm1

    paddd %xmm3,%xmm0
    pcmpeqd %xmm5,%xmm3
    movdqa %xmm2,-32(%rax)
    movdqa %xmm4,%xmm2
    paddd %xmm0,%xmm1
    pcmpeqd %xmm5,%xmm0
    movdqa %xmm3,-16(%rax)
    movdqa %xmm4,%xmm3

    paddd %xmm1,%xmm2
    pcmpeqd %xmm5,%xmm1
    movdqa %xmm0,0(%rax)
    movdqa %xmm4,%xmm0

    paddd %xmm2,%xmm3
    pcmpeqd %xmm5,%xmm2
    movdqa %xmm1,16(%rax)
    movdqa %xmm4,%xmm1

    paddd %xmm3,%xmm0
    pcmpeqd %xmm5,%xmm3
    movdqa %xmm2,32(%rax)
    movdqa %xmm4,%xmm2
    paddd %xmm0,%xmm1
    pcmpeqd %xmm5,%xmm0
    movdqa %xmm3,48(%rax)
    movdqa %xmm4,%xmm3

    paddd %xmm1,%xmm2
    pcmpeqd %xmm5,%xmm1
    movdqa %xmm0,64(%rax)
    movdqa %xmm4,%xmm0

    paddd %xmm2,%xmm3
    pcmpeqd %xmm5,%xmm2
    movdqa %xmm1,80(%rax)
    movdqa %xmm4,%xmm1

    paddd %xmm3,%xmm0
    pcmpeqd %xmm5,%xmm3
    movdqa %xmm2,96(%rax)
    movdqa %xmm4,%xmm2
    movdqa %xmm3,112(%rax)
    jmp L$gather

.p2align 5
L$gather:
    pxor %xmm4,%xmm4
    pxor %xmm5,%xmm5
    movdqa -128(%r11),%xmm0
    movdqa -112(%r11),%xmm1
    movdqa -96(%r11),%xmm2
    pand -128(%rax),%xmm0
    movdqa -80(%r11),%xmm3
    pand -112(%rax),%xmm1
    por %xmm0,%xmm4
    pand -96(%rax),%xmm2
    por %xmm1,%xmm5
    pand -80(%rax),%xmm3
    por %xmm2,%xmm4
    por %xmm3,%xmm5
    movdqa -64(%r11),%xmm0
    movdqa -48(%r11),%xmm1
    movdqa -32(%r11),%xmm2
    pand -64(%rax),%xmm0
    movdqa -16(%r11),%xmm3
    pand -48(%rax),%xmm1
    por %xmm0,%xmm4
    pand -32(%rax),%xmm2
    por %xmm1,%xmm5
    pand -16(%rax),%xmm3
    por %xmm2,%xmm4
    por %xmm3,%xmm5
    movdqa 0(%r11),%xmm0
    movdqa 16(%r11),%xmm1
    movdqa 32(%r11),%xmm2
    pand 0(%rax),%xmm0
    movdqa 48(%r11),%xmm3
    pand 16(%rax),%xmm1
    por %xmm0,%xmm4
    pand 32(%rax),%xmm2
    por %xmm1,%xmm5
    pand 48(%rax),%xmm3
    por %xmm2,%xmm4
    por %xmm3,%xmm5
    movdqa 64(%r11),%xmm0
    movdqa 80(%r11),%xmm1
    movdqa 96(%r11),%xmm2
    pand 64(%rax),%xmm0
    movdqa 112(%r11),%xmm3
    pand 80(%rax),%xmm1
    por %xmm0,%xmm4
    pand 96(%rax),%xmm2
    por %xmm1,%xmm5
    pand 112(%rax),%xmm3
    por %xmm2,%xmm4
    por %xmm3,%xmm5
    por %xmm5,%xmm4
    leaq 256(%r11),%r11
    pshufd $0x4e,%xmm4,%xmm0
    por %xmm4,%xmm0
    movq %xmm0,(%rdi)
    leaq 8(%rdi),%rdi
    subl $1,%esi
    jnz L$gather

    leaq (%r10),%rsp

    .byte 0xf3,0xc3
L$SEH_end_GFp_bn_gather5:

.p2align 6
L$inc:
.long 0,0, 1,1
.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif
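
# Layout note (inferred from the 256-byte strides in the scatter/gather code
# above, not stated elsewhere in this file): the window table holds 32
# interleaved entries, so limb i of entry j lives at table + j*8 + i*256. The
# gather reads all 32 slots for every limb and combines them with PAND/POR,
# keeping the memory access pattern independent of the secret index.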