1# This file is generated from a similarly-named Perl script in the BoringSSL 2# source tree. Do not edit by hand. 3 4#if defined(__has_feature) 5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 6#define OPENSSL_NO_ASM 7#endif 8#endif 9 10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) 11.text 12 13.extern GFp_ia32cap_P 14.hidden GFp_ia32cap_P 15 16.globl GFp_bn_mul_mont_gather5 17.hidden GFp_bn_mul_mont_gather5 18.type GFp_bn_mul_mont_gather5,@function 19.align 64 20GFp_bn_mul_mont_gather5: 21.cfi_startproc 22 movl %r9d,%r9d 23 movq %rsp,%rax 24.cfi_def_cfa_register %rax 25 testl $7,%r9d 26 jnz .Lmul_enter 27 leaq GFp_ia32cap_P(%rip),%r11 28 movl 8(%r11),%r11d 29 jmp .Lmul4x_enter 30 31.align 16 32.Lmul_enter: 33 movd 8(%rsp),%xmm5 34 pushq %rbx 35.cfi_offset %rbx,-16 36 pushq %rbp 37.cfi_offset %rbp,-24 38 pushq %r12 39.cfi_offset %r12,-32 40 pushq %r13 41.cfi_offset %r13,-40 42 pushq %r14 43.cfi_offset %r14,-48 44 pushq %r15 45.cfi_offset %r15,-56 46 47 negq %r9 48 movq %rsp,%r11 49 leaq -280(%rsp,%r9,8),%r10 50 negq %r9 51 andq $-1024,%r10 52 53 54 55 56 57 58 59 60 61 subq %r10,%r11 62 andq $-4096,%r11 63 leaq (%r10,%r11,1),%rsp 64 movq (%rsp),%r11 65 cmpq %r10,%rsp 66 ja .Lmul_page_walk 67 jmp .Lmul_page_walk_done 68 69.Lmul_page_walk: 70 leaq -4096(%rsp),%rsp 71 movq (%rsp),%r11 72 cmpq %r10,%rsp 73 ja .Lmul_page_walk 74.Lmul_page_walk_done: 75 76 leaq .Linc(%rip),%r10 77 movq %rax,8(%rsp,%r9,8) 78.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 79.Lmul_body: 80 81 leaq 128(%rdx),%r12 82 movdqa 0(%r10),%xmm0 83 movdqa 16(%r10),%xmm1 84 leaq 24-112(%rsp,%r9,8),%r10 85 andq $-16,%r10 86 87 pshufd $0,%xmm5,%xmm5 88 movdqa %xmm1,%xmm4 89 movdqa %xmm1,%xmm2 90 paddd %xmm0,%xmm1 91 pcmpeqd %xmm5,%xmm0 92.byte 0x67 93 movdqa %xmm4,%xmm3 94 paddd %xmm1,%xmm2 95 pcmpeqd %xmm5,%xmm1 96 movdqa %xmm0,112(%r10) 97 movdqa %xmm4,%xmm0 98 99 paddd %xmm2,%xmm3 100 pcmpeqd %xmm5,%xmm2 101 movdqa %xmm1,128(%r10) 102 movdqa %xmm4,%xmm1 
103 104 paddd %xmm3,%xmm0 105 pcmpeqd %xmm5,%xmm3 106 movdqa %xmm2,144(%r10) 107 movdqa %xmm4,%xmm2 108 109 paddd %xmm0,%xmm1 110 pcmpeqd %xmm5,%xmm0 111 movdqa %xmm3,160(%r10) 112 movdqa %xmm4,%xmm3 113 paddd %xmm1,%xmm2 114 pcmpeqd %xmm5,%xmm1 115 movdqa %xmm0,176(%r10) 116 movdqa %xmm4,%xmm0 117 118 paddd %xmm2,%xmm3 119 pcmpeqd %xmm5,%xmm2 120 movdqa %xmm1,192(%r10) 121 movdqa %xmm4,%xmm1 122 123 paddd %xmm3,%xmm0 124 pcmpeqd %xmm5,%xmm3 125 movdqa %xmm2,208(%r10) 126 movdqa %xmm4,%xmm2 127 128 paddd %xmm0,%xmm1 129 pcmpeqd %xmm5,%xmm0 130 movdqa %xmm3,224(%r10) 131 movdqa %xmm4,%xmm3 132 paddd %xmm1,%xmm2 133 pcmpeqd %xmm5,%xmm1 134 movdqa %xmm0,240(%r10) 135 movdqa %xmm4,%xmm0 136 137 paddd %xmm2,%xmm3 138 pcmpeqd %xmm5,%xmm2 139 movdqa %xmm1,256(%r10) 140 movdqa %xmm4,%xmm1 141 142 paddd %xmm3,%xmm0 143 pcmpeqd %xmm5,%xmm3 144 movdqa %xmm2,272(%r10) 145 movdqa %xmm4,%xmm2 146 147 paddd %xmm0,%xmm1 148 pcmpeqd %xmm5,%xmm0 149 movdqa %xmm3,288(%r10) 150 movdqa %xmm4,%xmm3 151 paddd %xmm1,%xmm2 152 pcmpeqd %xmm5,%xmm1 153 movdqa %xmm0,304(%r10) 154 155 paddd %xmm2,%xmm3 156.byte 0x67 157 pcmpeqd %xmm5,%xmm2 158 movdqa %xmm1,320(%r10) 159 160 pcmpeqd %xmm5,%xmm3 161 movdqa %xmm2,336(%r10) 162 pand 64(%r12),%xmm0 163 164 pand 80(%r12),%xmm1 165 pand 96(%r12),%xmm2 166 movdqa %xmm3,352(%r10) 167 pand 112(%r12),%xmm3 168 por %xmm2,%xmm0 169 por %xmm3,%xmm1 170 movdqa -128(%r12),%xmm4 171 movdqa -112(%r12),%xmm5 172 movdqa -96(%r12),%xmm2 173 pand 112(%r10),%xmm4 174 movdqa -80(%r12),%xmm3 175 pand 128(%r10),%xmm5 176 por %xmm4,%xmm0 177 pand 144(%r10),%xmm2 178 por %xmm5,%xmm1 179 pand 160(%r10),%xmm3 180 por %xmm2,%xmm0 181 por %xmm3,%xmm1 182 movdqa -64(%r12),%xmm4 183 movdqa -48(%r12),%xmm5 184 movdqa -32(%r12),%xmm2 185 pand 176(%r10),%xmm4 186 movdqa -16(%r12),%xmm3 187 pand 192(%r10),%xmm5 188 por %xmm4,%xmm0 189 pand 208(%r10),%xmm2 190 por %xmm5,%xmm1 191 pand 224(%r10),%xmm3 192 por %xmm2,%xmm0 193 por %xmm3,%xmm1 194 movdqa 0(%r12),%xmm4 195 movdqa 
16(%r12),%xmm5 196 movdqa 32(%r12),%xmm2 197 pand 240(%r10),%xmm4 198 movdqa 48(%r12),%xmm3 199 pand 256(%r10),%xmm5 200 por %xmm4,%xmm0 201 pand 272(%r10),%xmm2 202 por %xmm5,%xmm1 203 pand 288(%r10),%xmm3 204 por %xmm2,%xmm0 205 por %xmm3,%xmm1 206 por %xmm1,%xmm0 207 pshufd $0x4e,%xmm0,%xmm1 208 por %xmm1,%xmm0 209 leaq 256(%r12),%r12 210.byte 102,72,15,126,195 211 212 movq (%r8),%r8 213 movq (%rsi),%rax 214 215 xorq %r14,%r14 216 xorq %r15,%r15 217 218 movq %r8,%rbp 219 mulq %rbx 220 movq %rax,%r10 221 movq (%rcx),%rax 222 223 imulq %r10,%rbp 224 movq %rdx,%r11 225 226 mulq %rbp 227 addq %rax,%r10 228 movq 8(%rsi),%rax 229 adcq $0,%rdx 230 movq %rdx,%r13 231 232 leaq 1(%r15),%r15 233 jmp .L1st_enter 234 235.align 16 236.L1st: 237 addq %rax,%r13 238 movq (%rsi,%r15,8),%rax 239 adcq $0,%rdx 240 addq %r11,%r13 241 movq %r10,%r11 242 adcq $0,%rdx 243 movq %r13,-16(%rsp,%r15,8) 244 movq %rdx,%r13 245 246.L1st_enter: 247 mulq %rbx 248 addq %rax,%r11 249 movq (%rcx,%r15,8),%rax 250 adcq $0,%rdx 251 leaq 1(%r15),%r15 252 movq %rdx,%r10 253 254 mulq %rbp 255 cmpq %r9,%r15 256 jne .L1st 257 258 259 addq %rax,%r13 260 adcq $0,%rdx 261 addq %r11,%r13 262 adcq $0,%rdx 263 movq %r13,-16(%rsp,%r9,8) 264 movq %rdx,%r13 265 movq %r10,%r11 266 267 xorq %rdx,%rdx 268 addq %r11,%r13 269 adcq $0,%rdx 270 movq %r13,-8(%rsp,%r9,8) 271 movq %rdx,(%rsp,%r9,8) 272 273 leaq 1(%r14),%r14 274 jmp .Louter 275.align 16 276.Louter: 277 leaq 24+128(%rsp,%r9,8),%rdx 278 andq $-16,%rdx 279 pxor %xmm4,%xmm4 280 pxor %xmm5,%xmm5 281 movdqa -128(%r12),%xmm0 282 movdqa -112(%r12),%xmm1 283 movdqa -96(%r12),%xmm2 284 movdqa -80(%r12),%xmm3 285 pand -128(%rdx),%xmm0 286 pand -112(%rdx),%xmm1 287 por %xmm0,%xmm4 288 pand -96(%rdx),%xmm2 289 por %xmm1,%xmm5 290 pand -80(%rdx),%xmm3 291 por %xmm2,%xmm4 292 por %xmm3,%xmm5 293 movdqa -64(%r12),%xmm0 294 movdqa -48(%r12),%xmm1 295 movdqa -32(%r12),%xmm2 296 movdqa -16(%r12),%xmm3 297 pand -64(%rdx),%xmm0 298 pand -48(%rdx),%xmm1 299 por %xmm0,%xmm4 300 
pand -32(%rdx),%xmm2 301 por %xmm1,%xmm5 302 pand -16(%rdx),%xmm3 303 por %xmm2,%xmm4 304 por %xmm3,%xmm5 305 movdqa 0(%r12),%xmm0 306 movdqa 16(%r12),%xmm1 307 movdqa 32(%r12),%xmm2 308 movdqa 48(%r12),%xmm3 309 pand 0(%rdx),%xmm0 310 pand 16(%rdx),%xmm1 311 por %xmm0,%xmm4 312 pand 32(%rdx),%xmm2 313 por %xmm1,%xmm5 314 pand 48(%rdx),%xmm3 315 por %xmm2,%xmm4 316 por %xmm3,%xmm5 317 movdqa 64(%r12),%xmm0 318 movdqa 80(%r12),%xmm1 319 movdqa 96(%r12),%xmm2 320 movdqa 112(%r12),%xmm3 321 pand 64(%rdx),%xmm0 322 pand 80(%rdx),%xmm1 323 por %xmm0,%xmm4 324 pand 96(%rdx),%xmm2 325 por %xmm1,%xmm5 326 pand 112(%rdx),%xmm3 327 por %xmm2,%xmm4 328 por %xmm3,%xmm5 329 por %xmm5,%xmm4 330 pshufd $0x4e,%xmm4,%xmm0 331 por %xmm4,%xmm0 332 leaq 256(%r12),%r12 333 334 movq (%rsi),%rax 335.byte 102,72,15,126,195 336 337 xorq %r15,%r15 338 movq %r8,%rbp 339 movq (%rsp),%r10 340 341 mulq %rbx 342 addq %rax,%r10 343 movq (%rcx),%rax 344 adcq $0,%rdx 345 346 imulq %r10,%rbp 347 movq %rdx,%r11 348 349 mulq %rbp 350 addq %rax,%r10 351 movq 8(%rsi),%rax 352 adcq $0,%rdx 353 movq 8(%rsp),%r10 354 movq %rdx,%r13 355 356 leaq 1(%r15),%r15 357 jmp .Linner_enter 358 359.align 16 360.Linner: 361 addq %rax,%r13 362 movq (%rsi,%r15,8),%rax 363 adcq $0,%rdx 364 addq %r10,%r13 365 movq (%rsp,%r15,8),%r10 366 adcq $0,%rdx 367 movq %r13,-16(%rsp,%r15,8) 368 movq %rdx,%r13 369 370.Linner_enter: 371 mulq %rbx 372 addq %rax,%r11 373 movq (%rcx,%r15,8),%rax 374 adcq $0,%rdx 375 addq %r11,%r10 376 movq %rdx,%r11 377 adcq $0,%r11 378 leaq 1(%r15),%r15 379 380 mulq %rbp 381 cmpq %r9,%r15 382 jne .Linner 383 384 addq %rax,%r13 385 adcq $0,%rdx 386 addq %r10,%r13 387 movq (%rsp,%r9,8),%r10 388 adcq $0,%rdx 389 movq %r13,-16(%rsp,%r9,8) 390 movq %rdx,%r13 391 392 xorq %rdx,%rdx 393 addq %r11,%r13 394 adcq $0,%rdx 395 addq %r10,%r13 396 adcq $0,%rdx 397 movq %r13,-8(%rsp,%r9,8) 398 movq %rdx,(%rsp,%r9,8) 399 400 leaq 1(%r14),%r14 401 cmpq %r9,%r14 402 jb .Louter 403 404 xorq %r14,%r14 405 movq (%rsp),%rax 
406 leaq (%rsp),%rsi 407 movq %r9,%r15 408 jmp .Lsub 409.align 16 410.Lsub: sbbq (%rcx,%r14,8),%rax 411 movq %rax,(%rdi,%r14,8) 412 movq 8(%rsi,%r14,8),%rax 413 leaq 1(%r14),%r14 414 decq %r15 415 jnz .Lsub 416 417 sbbq $0,%rax 418 movq $-1,%rbx 419 xorq %rax,%rbx 420 xorq %r14,%r14 421 movq %r9,%r15 422 423.Lcopy: 424 movq (%rdi,%r14,8),%rcx 425 movq (%rsp,%r14,8),%rdx 426 andq %rbx,%rcx 427 andq %rax,%rdx 428 movq %r14,(%rsp,%r14,8) 429 orq %rcx,%rdx 430 movq %rdx,(%rdi,%r14,8) 431 leaq 1(%r14),%r14 432 subq $1,%r15 433 jnz .Lcopy 434 435 movq 8(%rsp,%r9,8),%rsi 436.cfi_def_cfa %rsi,8 437 movq $1,%rax 438 439 movq -48(%rsi),%r15 440.cfi_restore %r15 441 movq -40(%rsi),%r14 442.cfi_restore %r14 443 movq -32(%rsi),%r13 444.cfi_restore %r13 445 movq -24(%rsi),%r12 446.cfi_restore %r12 447 movq -16(%rsi),%rbp 448.cfi_restore %rbp 449 movq -8(%rsi),%rbx 450.cfi_restore %rbx 451 leaq (%rsi),%rsp 452.cfi_def_cfa_register %rsp 453.Lmul_epilogue: 454 .byte 0xf3,0xc3 455.cfi_endproc 456.size GFp_bn_mul_mont_gather5,.-GFp_bn_mul_mont_gather5 457.type bn_mul4x_mont_gather5,@function 458.align 32 459bn_mul4x_mont_gather5: 460.cfi_startproc 461.byte 0x67 462 movq %rsp,%rax 463.cfi_def_cfa_register %rax 464.Lmul4x_enter: 465 andl $0x80108,%r11d 466 cmpl $0x80108,%r11d 467 je .Lmulx4x_enter 468 pushq %rbx 469.cfi_offset %rbx,-16 470 pushq %rbp 471.cfi_offset %rbp,-24 472 pushq %r12 473.cfi_offset %r12,-32 474 pushq %r13 475.cfi_offset %r13,-40 476 pushq %r14 477.cfi_offset %r14,-48 478 pushq %r15 479.cfi_offset %r15,-56 480.Lmul4x_prologue: 481 482.byte 0x67 483 shll $3,%r9d 484 leaq (%r9,%r9,2),%r10 485 negq %r9 486 487 488 489 490 491 492 493 494 495 496 leaq -320(%rsp,%r9,2),%r11 497 movq %rsp,%rbp 498 subq %rdi,%r11 499 andq $4095,%r11 500 cmpq %r11,%r10 501 jb .Lmul4xsp_alt 502 subq %r11,%rbp 503 leaq -320(%rbp,%r9,2),%rbp 504 jmp .Lmul4xsp_done 505 506.align 32 507.Lmul4xsp_alt: 508 leaq 4096-320(,%r9,2),%r10 509 leaq -320(%rbp,%r9,2),%rbp 510 subq %r10,%r11 511 movq 
$0,%r10 512 cmovcq %r10,%r11 513 subq %r11,%rbp 514.Lmul4xsp_done: 515 andq $-64,%rbp 516 movq %rsp,%r11 517 subq %rbp,%r11 518 andq $-4096,%r11 519 leaq (%r11,%rbp,1),%rsp 520 movq (%rsp),%r10 521 cmpq %rbp,%rsp 522 ja .Lmul4x_page_walk 523 jmp .Lmul4x_page_walk_done 524 525.Lmul4x_page_walk: 526 leaq -4096(%rsp),%rsp 527 movq (%rsp),%r10 528 cmpq %rbp,%rsp 529 ja .Lmul4x_page_walk 530.Lmul4x_page_walk_done: 531 532 negq %r9 533 534 movq %rax,40(%rsp) 535.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 536.Lmul4x_body: 537 538 call mul4x_internal 539 540 movq 40(%rsp),%rsi 541.cfi_def_cfa %rsi,8 542 movq $1,%rax 543 544 movq -48(%rsi),%r15 545.cfi_restore %r15 546 movq -40(%rsi),%r14 547.cfi_restore %r14 548 movq -32(%rsi),%r13 549.cfi_restore %r13 550 movq -24(%rsi),%r12 551.cfi_restore %r12 552 movq -16(%rsi),%rbp 553.cfi_restore %rbp 554 movq -8(%rsi),%rbx 555.cfi_restore %rbx 556 leaq (%rsi),%rsp 557.cfi_def_cfa_register %rsp 558.Lmul4x_epilogue: 559 .byte 0xf3,0xc3 560.cfi_endproc 561.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 562 563.type mul4x_internal,@function 564.align 32 565mul4x_internal: 566.cfi_startproc 567 shlq $5,%r9 568 movd 8(%rax),%xmm5 569 leaq .Linc(%rip),%rax 570 leaq 128(%rdx,%r9,1),%r13 571 shrq $5,%r9 572 movdqa 0(%rax),%xmm0 573 movdqa 16(%rax),%xmm1 574 leaq 88-112(%rsp,%r9,1),%r10 575 leaq 128(%rdx),%r12 576 577 pshufd $0,%xmm5,%xmm5 578 movdqa %xmm1,%xmm4 579.byte 0x67,0x67 580 movdqa %xmm1,%xmm2 581 paddd %xmm0,%xmm1 582 pcmpeqd %xmm5,%xmm0 583.byte 0x67 584 movdqa %xmm4,%xmm3 585 paddd %xmm1,%xmm2 586 pcmpeqd %xmm5,%xmm1 587 movdqa %xmm0,112(%r10) 588 movdqa %xmm4,%xmm0 589 590 paddd %xmm2,%xmm3 591 pcmpeqd %xmm5,%xmm2 592 movdqa %xmm1,128(%r10) 593 movdqa %xmm4,%xmm1 594 595 paddd %xmm3,%xmm0 596 pcmpeqd %xmm5,%xmm3 597 movdqa %xmm2,144(%r10) 598 movdqa %xmm4,%xmm2 599 600 paddd %xmm0,%xmm1 601 pcmpeqd %xmm5,%xmm0 602 movdqa %xmm3,160(%r10) 603 movdqa %xmm4,%xmm3 604 paddd %xmm1,%xmm2 605 pcmpeqd %xmm5,%xmm1 606 movdqa 
%xmm0,176(%r10) 607 movdqa %xmm4,%xmm0 608 609 paddd %xmm2,%xmm3 610 pcmpeqd %xmm5,%xmm2 611 movdqa %xmm1,192(%r10) 612 movdqa %xmm4,%xmm1 613 614 paddd %xmm3,%xmm0 615 pcmpeqd %xmm5,%xmm3 616 movdqa %xmm2,208(%r10) 617 movdqa %xmm4,%xmm2 618 619 paddd %xmm0,%xmm1 620 pcmpeqd %xmm5,%xmm0 621 movdqa %xmm3,224(%r10) 622 movdqa %xmm4,%xmm3 623 paddd %xmm1,%xmm2 624 pcmpeqd %xmm5,%xmm1 625 movdqa %xmm0,240(%r10) 626 movdqa %xmm4,%xmm0 627 628 paddd %xmm2,%xmm3 629 pcmpeqd %xmm5,%xmm2 630 movdqa %xmm1,256(%r10) 631 movdqa %xmm4,%xmm1 632 633 paddd %xmm3,%xmm0 634 pcmpeqd %xmm5,%xmm3 635 movdqa %xmm2,272(%r10) 636 movdqa %xmm4,%xmm2 637 638 paddd %xmm0,%xmm1 639 pcmpeqd %xmm5,%xmm0 640 movdqa %xmm3,288(%r10) 641 movdqa %xmm4,%xmm3 642 paddd %xmm1,%xmm2 643 pcmpeqd %xmm5,%xmm1 644 movdqa %xmm0,304(%r10) 645 646 paddd %xmm2,%xmm3 647.byte 0x67 648 pcmpeqd %xmm5,%xmm2 649 movdqa %xmm1,320(%r10) 650 651 pcmpeqd %xmm5,%xmm3 652 movdqa %xmm2,336(%r10) 653 pand 64(%r12),%xmm0 654 655 pand 80(%r12),%xmm1 656 pand 96(%r12),%xmm2 657 movdqa %xmm3,352(%r10) 658 pand 112(%r12),%xmm3 659 por %xmm2,%xmm0 660 por %xmm3,%xmm1 661 movdqa -128(%r12),%xmm4 662 movdqa -112(%r12),%xmm5 663 movdqa -96(%r12),%xmm2 664 pand 112(%r10),%xmm4 665 movdqa -80(%r12),%xmm3 666 pand 128(%r10),%xmm5 667 por %xmm4,%xmm0 668 pand 144(%r10),%xmm2 669 por %xmm5,%xmm1 670 pand 160(%r10),%xmm3 671 por %xmm2,%xmm0 672 por %xmm3,%xmm1 673 movdqa -64(%r12),%xmm4 674 movdqa -48(%r12),%xmm5 675 movdqa -32(%r12),%xmm2 676 pand 176(%r10),%xmm4 677 movdqa -16(%r12),%xmm3 678 pand 192(%r10),%xmm5 679 por %xmm4,%xmm0 680 pand 208(%r10),%xmm2 681 por %xmm5,%xmm1 682 pand 224(%r10),%xmm3 683 por %xmm2,%xmm0 684 por %xmm3,%xmm1 685 movdqa 0(%r12),%xmm4 686 movdqa 16(%r12),%xmm5 687 movdqa 32(%r12),%xmm2 688 pand 240(%r10),%xmm4 689 movdqa 48(%r12),%xmm3 690 pand 256(%r10),%xmm5 691 por %xmm4,%xmm0 692 pand 272(%r10),%xmm2 693 por %xmm5,%xmm1 694 pand 288(%r10),%xmm3 695 por %xmm2,%xmm0 696 por %xmm3,%xmm1 697 por 
%xmm1,%xmm0 698 pshufd $0x4e,%xmm0,%xmm1 699 por %xmm1,%xmm0 700 leaq 256(%r12),%r12 701.byte 102,72,15,126,195 702 703 movq %r13,16+8(%rsp) 704 movq %rdi,56+8(%rsp) 705 706 movq (%r8),%r8 707 movq (%rsi),%rax 708 leaq (%rsi,%r9,1),%rsi 709 negq %r9 710 711 movq %r8,%rbp 712 mulq %rbx 713 movq %rax,%r10 714 movq (%rcx),%rax 715 716 imulq %r10,%rbp 717 leaq 64+8(%rsp),%r14 718 movq %rdx,%r11 719 720 mulq %rbp 721 addq %rax,%r10 722 movq 8(%rsi,%r9,1),%rax 723 adcq $0,%rdx 724 movq %rdx,%rdi 725 726 mulq %rbx 727 addq %rax,%r11 728 movq 8(%rcx),%rax 729 adcq $0,%rdx 730 movq %rdx,%r10 731 732 mulq %rbp 733 addq %rax,%rdi 734 movq 16(%rsi,%r9,1),%rax 735 adcq $0,%rdx 736 addq %r11,%rdi 737 leaq 32(%r9),%r15 738 leaq 32(%rcx),%rcx 739 adcq $0,%rdx 740 movq %rdi,(%r14) 741 movq %rdx,%r13 742 jmp .L1st4x 743 744.align 32 745.L1st4x: 746 mulq %rbx 747 addq %rax,%r10 748 movq -16(%rcx),%rax 749 leaq 32(%r14),%r14 750 adcq $0,%rdx 751 movq %rdx,%r11 752 753 mulq %rbp 754 addq %rax,%r13 755 movq -8(%rsi,%r15,1),%rax 756 adcq $0,%rdx 757 addq %r10,%r13 758 adcq $0,%rdx 759 movq %r13,-24(%r14) 760 movq %rdx,%rdi 761 762 mulq %rbx 763 addq %rax,%r11 764 movq -8(%rcx),%rax 765 adcq $0,%rdx 766 movq %rdx,%r10 767 768 mulq %rbp 769 addq %rax,%rdi 770 movq (%rsi,%r15,1),%rax 771 adcq $0,%rdx 772 addq %r11,%rdi 773 adcq $0,%rdx 774 movq %rdi,-16(%r14) 775 movq %rdx,%r13 776 777 mulq %rbx 778 addq %rax,%r10 779 movq 0(%rcx),%rax 780 adcq $0,%rdx 781 movq %rdx,%r11 782 783 mulq %rbp 784 addq %rax,%r13 785 movq 8(%rsi,%r15,1),%rax 786 adcq $0,%rdx 787 addq %r10,%r13 788 adcq $0,%rdx 789 movq %r13,-8(%r14) 790 movq %rdx,%rdi 791 792 mulq %rbx 793 addq %rax,%r11 794 movq 8(%rcx),%rax 795 adcq $0,%rdx 796 movq %rdx,%r10 797 798 mulq %rbp 799 addq %rax,%rdi 800 movq 16(%rsi,%r15,1),%rax 801 adcq $0,%rdx 802 addq %r11,%rdi 803 leaq 32(%rcx),%rcx 804 adcq $0,%rdx 805 movq %rdi,(%r14) 806 movq %rdx,%r13 807 808 addq $32,%r15 809 jnz .L1st4x 810 811 mulq %rbx 812 addq %rax,%r10 813 movq 
-16(%rcx),%rax 814 leaq 32(%r14),%r14 815 adcq $0,%rdx 816 movq %rdx,%r11 817 818 mulq %rbp 819 addq %rax,%r13 820 movq -8(%rsi),%rax 821 adcq $0,%rdx 822 addq %r10,%r13 823 adcq $0,%rdx 824 movq %r13,-24(%r14) 825 movq %rdx,%rdi 826 827 mulq %rbx 828 addq %rax,%r11 829 movq -8(%rcx),%rax 830 adcq $0,%rdx 831 movq %rdx,%r10 832 833 mulq %rbp 834 addq %rax,%rdi 835 movq (%rsi,%r9,1),%rax 836 adcq $0,%rdx 837 addq %r11,%rdi 838 adcq $0,%rdx 839 movq %rdi,-16(%r14) 840 movq %rdx,%r13 841 842 leaq (%rcx,%r9,1),%rcx 843 844 xorq %rdi,%rdi 845 addq %r10,%r13 846 adcq $0,%rdi 847 movq %r13,-8(%r14) 848 849 jmp .Louter4x 850 851.align 32 852.Louter4x: 853 leaq 16+128(%r14),%rdx 854 pxor %xmm4,%xmm4 855 pxor %xmm5,%xmm5 856 movdqa -128(%r12),%xmm0 857 movdqa -112(%r12),%xmm1 858 movdqa -96(%r12),%xmm2 859 movdqa -80(%r12),%xmm3 860 pand -128(%rdx),%xmm0 861 pand -112(%rdx),%xmm1 862 por %xmm0,%xmm4 863 pand -96(%rdx),%xmm2 864 por %xmm1,%xmm5 865 pand -80(%rdx),%xmm3 866 por %xmm2,%xmm4 867 por %xmm3,%xmm5 868 movdqa -64(%r12),%xmm0 869 movdqa -48(%r12),%xmm1 870 movdqa -32(%r12),%xmm2 871 movdqa -16(%r12),%xmm3 872 pand -64(%rdx),%xmm0 873 pand -48(%rdx),%xmm1 874 por %xmm0,%xmm4 875 pand -32(%rdx),%xmm2 876 por %xmm1,%xmm5 877 pand -16(%rdx),%xmm3 878 por %xmm2,%xmm4 879 por %xmm3,%xmm5 880 movdqa 0(%r12),%xmm0 881 movdqa 16(%r12),%xmm1 882 movdqa 32(%r12),%xmm2 883 movdqa 48(%r12),%xmm3 884 pand 0(%rdx),%xmm0 885 pand 16(%rdx),%xmm1 886 por %xmm0,%xmm4 887 pand 32(%rdx),%xmm2 888 por %xmm1,%xmm5 889 pand 48(%rdx),%xmm3 890 por %xmm2,%xmm4 891 por %xmm3,%xmm5 892 movdqa 64(%r12),%xmm0 893 movdqa 80(%r12),%xmm1 894 movdqa 96(%r12),%xmm2 895 movdqa 112(%r12),%xmm3 896 pand 64(%rdx),%xmm0 897 pand 80(%rdx),%xmm1 898 por %xmm0,%xmm4 899 pand 96(%rdx),%xmm2 900 por %xmm1,%xmm5 901 pand 112(%rdx),%xmm3 902 por %xmm2,%xmm4 903 por %xmm3,%xmm5 904 por %xmm5,%xmm4 905 pshufd $0x4e,%xmm4,%xmm0 906 por %xmm4,%xmm0 907 leaq 256(%r12),%r12 908.byte 102,72,15,126,195 909 910 movq 
(%r14,%r9,1),%r10 911 movq %r8,%rbp 912 mulq %rbx 913 addq %rax,%r10 914 movq (%rcx),%rax 915 adcq $0,%rdx 916 917 imulq %r10,%rbp 918 movq %rdx,%r11 919 movq %rdi,(%r14) 920 921 leaq (%r14,%r9,1),%r14 922 923 mulq %rbp 924 addq %rax,%r10 925 movq 8(%rsi,%r9,1),%rax 926 adcq $0,%rdx 927 movq %rdx,%rdi 928 929 mulq %rbx 930 addq %rax,%r11 931 movq 8(%rcx),%rax 932 adcq $0,%rdx 933 addq 8(%r14),%r11 934 adcq $0,%rdx 935 movq %rdx,%r10 936 937 mulq %rbp 938 addq %rax,%rdi 939 movq 16(%rsi,%r9,1),%rax 940 adcq $0,%rdx 941 addq %r11,%rdi 942 leaq 32(%r9),%r15 943 leaq 32(%rcx),%rcx 944 adcq $0,%rdx 945 movq %rdx,%r13 946 jmp .Linner4x 947 948.align 32 949.Linner4x: 950 mulq %rbx 951 addq %rax,%r10 952 movq -16(%rcx),%rax 953 adcq $0,%rdx 954 addq 16(%r14),%r10 955 leaq 32(%r14),%r14 956 adcq $0,%rdx 957 movq %rdx,%r11 958 959 mulq %rbp 960 addq %rax,%r13 961 movq -8(%rsi,%r15,1),%rax 962 adcq $0,%rdx 963 addq %r10,%r13 964 adcq $0,%rdx 965 movq %rdi,-32(%r14) 966 movq %rdx,%rdi 967 968 mulq %rbx 969 addq %rax,%r11 970 movq -8(%rcx),%rax 971 adcq $0,%rdx 972 addq -8(%r14),%r11 973 adcq $0,%rdx 974 movq %rdx,%r10 975 976 mulq %rbp 977 addq %rax,%rdi 978 movq (%rsi,%r15,1),%rax 979 adcq $0,%rdx 980 addq %r11,%rdi 981 adcq $0,%rdx 982 movq %r13,-24(%r14) 983 movq %rdx,%r13 984 985 mulq %rbx 986 addq %rax,%r10 987 movq 0(%rcx),%rax 988 adcq $0,%rdx 989 addq (%r14),%r10 990 adcq $0,%rdx 991 movq %rdx,%r11 992 993 mulq %rbp 994 addq %rax,%r13 995 movq 8(%rsi,%r15,1),%rax 996 adcq $0,%rdx 997 addq %r10,%r13 998 adcq $0,%rdx 999 movq %rdi,-16(%r14) 1000 movq %rdx,%rdi 1001 1002 mulq %rbx 1003 addq %rax,%r11 1004 movq 8(%rcx),%rax 1005 adcq $0,%rdx 1006 addq 8(%r14),%r11 1007 adcq $0,%rdx 1008 movq %rdx,%r10 1009 1010 mulq %rbp 1011 addq %rax,%rdi 1012 movq 16(%rsi,%r15,1),%rax 1013 adcq $0,%rdx 1014 addq %r11,%rdi 1015 leaq 32(%rcx),%rcx 1016 adcq $0,%rdx 1017 movq %r13,-8(%r14) 1018 movq %rdx,%r13 1019 1020 addq $32,%r15 1021 jnz .Linner4x 1022 1023 mulq %rbx 1024 addq 
%rax,%r10 1025 movq -16(%rcx),%rax 1026 adcq $0,%rdx 1027 addq 16(%r14),%r10 1028 leaq 32(%r14),%r14 1029 adcq $0,%rdx 1030 movq %rdx,%r11 1031 1032 mulq %rbp 1033 addq %rax,%r13 1034 movq -8(%rsi),%rax 1035 adcq $0,%rdx 1036 addq %r10,%r13 1037 adcq $0,%rdx 1038 movq %rdi,-32(%r14) 1039 movq %rdx,%rdi 1040 1041 mulq %rbx 1042 addq %rax,%r11 1043 movq %rbp,%rax 1044 movq -8(%rcx),%rbp 1045 adcq $0,%rdx 1046 addq -8(%r14),%r11 1047 adcq $0,%rdx 1048 movq %rdx,%r10 1049 1050 mulq %rbp 1051 addq %rax,%rdi 1052 movq (%rsi,%r9,1),%rax 1053 adcq $0,%rdx 1054 addq %r11,%rdi 1055 adcq $0,%rdx 1056 movq %r13,-24(%r14) 1057 movq %rdx,%r13 1058 1059 movq %rdi,-16(%r14) 1060 leaq (%rcx,%r9,1),%rcx 1061 1062 xorq %rdi,%rdi 1063 addq %r10,%r13 1064 adcq $0,%rdi 1065 addq (%r14),%r13 1066 adcq $0,%rdi 1067 movq %r13,-8(%r14) 1068 1069 cmpq 16+8(%rsp),%r12 1070 jb .Louter4x 1071 xorq %rax,%rax 1072 subq %r13,%rbp 1073 adcq %r15,%r15 1074 orq %r15,%rdi 1075 subq %rdi,%rax 1076 leaq (%r14,%r9,1),%rbx 1077 movq (%rcx),%r12 1078 leaq (%rcx),%rbp 1079 movq %r9,%rcx 1080 sarq $3+2,%rcx 1081 movq 56+8(%rsp),%rdi 1082 decq %r12 1083 xorq %r10,%r10 1084 movq 8(%rbp),%r13 1085 movq 16(%rbp),%r14 1086 movq 24(%rbp),%r15 1087 jmp .Lsqr4x_sub_entry 1088.cfi_endproc 1089.size mul4x_internal,.-mul4x_internal 1090.globl GFp_bn_power5 1091.hidden GFp_bn_power5 1092.type GFp_bn_power5,@function 1093.align 32 1094GFp_bn_power5: 1095.cfi_startproc 1096 movq %rsp,%rax 1097.cfi_def_cfa_register %rax 1098 leaq GFp_ia32cap_P(%rip),%r11 1099 movl 8(%r11),%r11d 1100 andl $0x80108,%r11d 1101 cmpl $0x80108,%r11d 1102 je .Lpowerx5_enter 1103 pushq %rbx 1104.cfi_offset %rbx,-16 1105 pushq %rbp 1106.cfi_offset %rbp,-24 1107 pushq %r12 1108.cfi_offset %r12,-32 1109 pushq %r13 1110.cfi_offset %r13,-40 1111 pushq %r14 1112.cfi_offset %r14,-48 1113 pushq %r15 1114.cfi_offset %r15,-56 1115.Lpower5_prologue: 1116 1117 shll $3,%r9d 1118 leal (%r9,%r9,2),%r10d 1119 negq %r9 1120 movq (%r8),%r8 1121 1122 1123 1124 1125 
1126 1127 1128 1129 leaq -320(%rsp,%r9,2),%r11 1130 movq %rsp,%rbp 1131 subq %rdi,%r11 1132 andq $4095,%r11 1133 cmpq %r11,%r10 1134 jb .Lpwr_sp_alt 1135 subq %r11,%rbp 1136 leaq -320(%rbp,%r9,2),%rbp 1137 jmp .Lpwr_sp_done 1138 1139.align 32 1140.Lpwr_sp_alt: 1141 leaq 4096-320(,%r9,2),%r10 1142 leaq -320(%rbp,%r9,2),%rbp 1143 subq %r10,%r11 1144 movq $0,%r10 1145 cmovcq %r10,%r11 1146 subq %r11,%rbp 1147.Lpwr_sp_done: 1148 andq $-64,%rbp 1149 movq %rsp,%r11 1150 subq %rbp,%r11 1151 andq $-4096,%r11 1152 leaq (%r11,%rbp,1),%rsp 1153 movq (%rsp),%r10 1154 cmpq %rbp,%rsp 1155 ja .Lpwr_page_walk 1156 jmp .Lpwr_page_walk_done 1157 1158.Lpwr_page_walk: 1159 leaq -4096(%rsp),%rsp 1160 movq (%rsp),%r10 1161 cmpq %rbp,%rsp 1162 ja .Lpwr_page_walk 1163.Lpwr_page_walk_done: 1164 1165 movq %r9,%r10 1166 negq %r9 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 movq %r8,32(%rsp) 1178 movq %rax,40(%rsp) 1179.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 1180.Lpower5_body: 1181.byte 102,72,15,110,207 1182.byte 102,72,15,110,209 1183.byte 102,73,15,110,218 1184.byte 102,72,15,110,226 1185 1186 call __bn_sqr8x_internal 1187 call __bn_post4x_internal 1188 call __bn_sqr8x_internal 1189 call __bn_post4x_internal 1190 call __bn_sqr8x_internal 1191 call __bn_post4x_internal 1192 call __bn_sqr8x_internal 1193 call __bn_post4x_internal 1194 call __bn_sqr8x_internal 1195 call __bn_post4x_internal 1196 1197.byte 102,72,15,126,209 1198.byte 102,72,15,126,226 1199 movq %rsi,%rdi 1200 movq 40(%rsp),%rax 1201 leaq 32(%rsp),%r8 1202 1203 call mul4x_internal 1204 1205 movq 40(%rsp),%rsi 1206.cfi_def_cfa %rsi,8 1207 movq $1,%rax 1208 movq -48(%rsi),%r15 1209.cfi_restore %r15 1210 movq -40(%rsi),%r14 1211.cfi_restore %r14 1212 movq -32(%rsi),%r13 1213.cfi_restore %r13 1214 movq -24(%rsi),%r12 1215.cfi_restore %r12 1216 movq -16(%rsi),%rbp 1217.cfi_restore %rbp 1218 movq -8(%rsi),%rbx 1219.cfi_restore %rbx 1220 leaq (%rsi),%rsp 1221.cfi_def_cfa_register %rsp 1222.Lpower5_epilogue: 1223 .byte 
0xf3,0xc3 1224.cfi_endproc 1225.size GFp_bn_power5,.-GFp_bn_power5 1226 1227.globl GFp_bn_sqr8x_internal 1228.hidden GFp_bn_sqr8x_internal 1229.hidden GFp_bn_sqr8x_internal 1230.type GFp_bn_sqr8x_internal,@function 1231.align 32 1232GFp_bn_sqr8x_internal: 1233__bn_sqr8x_internal: 1234.cfi_startproc 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 leaq 32(%r10),%rbp 1309 leaq (%rsi,%r9,1),%rsi 1310 1311 movq %r9,%rcx 1312 1313 1314 movq -32(%rsi,%rbp,1),%r14 1315 leaq 48+8(%rsp,%r9,2),%rdi 1316 movq -24(%rsi,%rbp,1),%rax 1317 leaq -32(%rdi,%rbp,1),%rdi 1318 movq -16(%rsi,%rbp,1),%rbx 1319 movq %rax,%r15 1320 1321 mulq %r14 1322 movq %rax,%r10 1323 movq %rbx,%rax 1324 movq %rdx,%r11 1325 movq %r10,-24(%rdi,%rbp,1) 1326 1327 mulq %r14 1328 addq %rax,%r11 1329 movq %rbx,%rax 1330 adcq $0,%rdx 1331 movq %r11,-16(%rdi,%rbp,1) 1332 movq %rdx,%r10 1333 1334 1335 movq -8(%rsi,%rbp,1),%rbx 1336 mulq %r15 1337 movq %rax,%r12 1338 movq %rbx,%rax 1339 movq %rdx,%r13 1340 1341 leaq (%rbp),%rcx 1342 mulq %r14 1343 addq %rax,%r10 1344 movq %rbx,%rax 1345 movq %rdx,%r11 1346 adcq $0,%r11 1347 addq %r12,%r10 1348 adcq $0,%r11 1349 movq %r10,-8(%rdi,%rcx,1) 1350 jmp .Lsqr4x_1st 1351 1352.align 32 1353.Lsqr4x_1st: 1354 movq (%rsi,%rcx,1),%rbx 1355 mulq %r15 1356 addq %rax,%r13 1357 movq %rbx,%rax 1358 movq %rdx,%r12 1359 adcq $0,%r12 1360 1361 mulq %r14 1362 addq %rax,%r11 1363 movq %rbx,%rax 1364 movq 8(%rsi,%rcx,1),%rbx 1365 movq %rdx,%r10 1366 adcq $0,%r10 1367 addq %r13,%r11 1368 adcq $0,%r10 1369 1370 1371 mulq %r15 1372 addq %rax,%r12 1373 movq %rbx,%rax 1374 movq %r11,(%rdi,%rcx,1) 1375 movq %rdx,%r13 1376 adcq $0,%r13 1377 1378 mulq %r14 1379 addq %rax,%r10 1380 
movq %rbx,%rax 1381 movq 16(%rsi,%rcx,1),%rbx 1382 movq %rdx,%r11 1383 adcq $0,%r11 1384 addq %r12,%r10 1385 adcq $0,%r11 1386 1387 mulq %r15 1388 addq %rax,%r13 1389 movq %rbx,%rax 1390 movq %r10,8(%rdi,%rcx,1) 1391 movq %rdx,%r12 1392 adcq $0,%r12 1393 1394 mulq %r14 1395 addq %rax,%r11 1396 movq %rbx,%rax 1397 movq 24(%rsi,%rcx,1),%rbx 1398 movq %rdx,%r10 1399 adcq $0,%r10 1400 addq %r13,%r11 1401 adcq $0,%r10 1402 1403 1404 mulq %r15 1405 addq %rax,%r12 1406 movq %rbx,%rax 1407 movq %r11,16(%rdi,%rcx,1) 1408 movq %rdx,%r13 1409 adcq $0,%r13 1410 leaq 32(%rcx),%rcx 1411 1412 mulq %r14 1413 addq %rax,%r10 1414 movq %rbx,%rax 1415 movq %rdx,%r11 1416 adcq $0,%r11 1417 addq %r12,%r10 1418 adcq $0,%r11 1419 movq %r10,-8(%rdi,%rcx,1) 1420 1421 cmpq $0,%rcx 1422 jne .Lsqr4x_1st 1423 1424 mulq %r15 1425 addq %rax,%r13 1426 leaq 16(%rbp),%rbp 1427 adcq $0,%rdx 1428 addq %r11,%r13 1429 adcq $0,%rdx 1430 1431 movq %r13,(%rdi) 1432 movq %rdx,%r12 1433 movq %rdx,8(%rdi) 1434 jmp .Lsqr4x_outer 1435 1436.align 32 1437.Lsqr4x_outer: 1438 movq -32(%rsi,%rbp,1),%r14 1439 leaq 48+8(%rsp,%r9,2),%rdi 1440 movq -24(%rsi,%rbp,1),%rax 1441 leaq -32(%rdi,%rbp,1),%rdi 1442 movq -16(%rsi,%rbp,1),%rbx 1443 movq %rax,%r15 1444 1445 mulq %r14 1446 movq -24(%rdi,%rbp,1),%r10 1447 addq %rax,%r10 1448 movq %rbx,%rax 1449 adcq $0,%rdx 1450 movq %r10,-24(%rdi,%rbp,1) 1451 movq %rdx,%r11 1452 1453 mulq %r14 1454 addq %rax,%r11 1455 movq %rbx,%rax 1456 adcq $0,%rdx 1457 addq -16(%rdi,%rbp,1),%r11 1458 movq %rdx,%r10 1459 adcq $0,%r10 1460 movq %r11,-16(%rdi,%rbp,1) 1461 1462 xorq %r12,%r12 1463 1464 movq -8(%rsi,%rbp,1),%rbx 1465 mulq %r15 1466 addq %rax,%r12 1467 movq %rbx,%rax 1468 adcq $0,%rdx 1469 addq -8(%rdi,%rbp,1),%r12 1470 movq %rdx,%r13 1471 adcq $0,%r13 1472 1473 mulq %r14 1474 addq %rax,%r10 1475 movq %rbx,%rax 1476 adcq $0,%rdx 1477 addq %r12,%r10 1478 movq %rdx,%r11 1479 adcq $0,%r11 1480 movq %r10,-8(%rdi,%rbp,1) 1481 1482 leaq (%rbp),%rcx 1483 jmp .Lsqr4x_inner 1484 1485.align 32 
1486.Lsqr4x_inner: 1487 movq (%rsi,%rcx,1),%rbx 1488 mulq %r15 1489 addq %rax,%r13 1490 movq %rbx,%rax 1491 movq %rdx,%r12 1492 adcq $0,%r12 1493 addq (%rdi,%rcx,1),%r13 1494 adcq $0,%r12 1495 1496.byte 0x67 1497 mulq %r14 1498 addq %rax,%r11 1499 movq %rbx,%rax 1500 movq 8(%rsi,%rcx,1),%rbx 1501 movq %rdx,%r10 1502 adcq $0,%r10 1503 addq %r13,%r11 1504 adcq $0,%r10 1505 1506 mulq %r15 1507 addq %rax,%r12 1508 movq %r11,(%rdi,%rcx,1) 1509 movq %rbx,%rax 1510 movq %rdx,%r13 1511 adcq $0,%r13 1512 addq 8(%rdi,%rcx,1),%r12 1513 leaq 16(%rcx),%rcx 1514 adcq $0,%r13 1515 1516 mulq %r14 1517 addq %rax,%r10 1518 movq %rbx,%rax 1519 adcq $0,%rdx 1520 addq %r12,%r10 1521 movq %rdx,%r11 1522 adcq $0,%r11 1523 movq %r10,-8(%rdi,%rcx,1) 1524 1525 cmpq $0,%rcx 1526 jne .Lsqr4x_inner 1527 1528.byte 0x67 1529 mulq %r15 1530 addq %rax,%r13 1531 adcq $0,%rdx 1532 addq %r11,%r13 1533 adcq $0,%rdx 1534 1535 movq %r13,(%rdi) 1536 movq %rdx,%r12 1537 movq %rdx,8(%rdi) 1538 1539 addq $16,%rbp 1540 jnz .Lsqr4x_outer 1541 1542 1543 movq -32(%rsi),%r14 1544 leaq 48+8(%rsp,%r9,2),%rdi 1545 movq -24(%rsi),%rax 1546 leaq -32(%rdi,%rbp,1),%rdi 1547 movq -16(%rsi),%rbx 1548 movq %rax,%r15 1549 1550 mulq %r14 1551 addq %rax,%r10 1552 movq %rbx,%rax 1553 movq %rdx,%r11 1554 adcq $0,%r11 1555 1556 mulq %r14 1557 addq %rax,%r11 1558 movq %rbx,%rax 1559 movq %r10,-24(%rdi) 1560 movq %rdx,%r10 1561 adcq $0,%r10 1562 addq %r13,%r11 1563 movq -8(%rsi),%rbx 1564 adcq $0,%r10 1565 1566 mulq %r15 1567 addq %rax,%r12 1568 movq %rbx,%rax 1569 movq %r11,-16(%rdi) 1570 movq %rdx,%r13 1571 adcq $0,%r13 1572 1573 mulq %r14 1574 addq %rax,%r10 1575 movq %rbx,%rax 1576 movq %rdx,%r11 1577 adcq $0,%r11 1578 addq %r12,%r10 1579 adcq $0,%r11 1580 movq %r10,-8(%rdi) 1581 1582 mulq %r15 1583 addq %rax,%r13 1584 movq -16(%rsi),%rax 1585 adcq $0,%rdx 1586 addq %r11,%r13 1587 adcq $0,%rdx 1588 1589 movq %r13,(%rdi) 1590 movq %rdx,%r12 1591 movq %rdx,8(%rdi) 1592 1593 mulq %rbx 1594 addq $16,%rbp 1595 xorq %r14,%r14 1596 
subq %r9,%rbp 1597 xorq %r15,%r15 1598 1599 addq %r12,%rax 1600 adcq $0,%rdx 1601 movq %rax,8(%rdi) 1602 movq %rdx,16(%rdi) 1603 movq %r15,24(%rdi) 1604 1605 movq -16(%rsi,%rbp,1),%rax 1606 leaq 48+8(%rsp),%rdi 1607 xorq %r10,%r10 1608 movq 8(%rdi),%r11 1609 1610 leaq (%r14,%r10,2),%r12 1611 shrq $63,%r10 1612 leaq (%rcx,%r11,2),%r13 1613 shrq $63,%r11 1614 orq %r10,%r13 1615 movq 16(%rdi),%r10 1616 movq %r11,%r14 1617 mulq %rax 1618 negq %r15 1619 movq 24(%rdi),%r11 1620 adcq %rax,%r12 1621 movq -8(%rsi,%rbp,1),%rax 1622 movq %r12,(%rdi) 1623 adcq %rdx,%r13 1624 1625 leaq (%r14,%r10,2),%rbx 1626 movq %r13,8(%rdi) 1627 sbbq %r15,%r15 1628 shrq $63,%r10 1629 leaq (%rcx,%r11,2),%r8 1630 shrq $63,%r11 1631 orq %r10,%r8 1632 movq 32(%rdi),%r10 1633 movq %r11,%r14 1634 mulq %rax 1635 negq %r15 1636 movq 40(%rdi),%r11 1637 adcq %rax,%rbx 1638 movq 0(%rsi,%rbp,1),%rax 1639 movq %rbx,16(%rdi) 1640 adcq %rdx,%r8 1641 leaq 16(%rbp),%rbp 1642 movq %r8,24(%rdi) 1643 sbbq %r15,%r15 1644 leaq 64(%rdi),%rdi 1645 jmp .Lsqr4x_shift_n_add 1646 1647.align 32 1648.Lsqr4x_shift_n_add: 1649 leaq (%r14,%r10,2),%r12 1650 shrq $63,%r10 1651 leaq (%rcx,%r11,2),%r13 1652 shrq $63,%r11 1653 orq %r10,%r13 1654 movq -16(%rdi),%r10 1655 movq %r11,%r14 1656 mulq %rax 1657 negq %r15 1658 movq -8(%rdi),%r11 1659 adcq %rax,%r12 1660 movq -8(%rsi,%rbp,1),%rax 1661 movq %r12,-32(%rdi) 1662 adcq %rdx,%r13 1663 1664 leaq (%r14,%r10,2),%rbx 1665 movq %r13,-24(%rdi) 1666 sbbq %r15,%r15 1667 shrq $63,%r10 1668 leaq (%rcx,%r11,2),%r8 1669 shrq $63,%r11 1670 orq %r10,%r8 1671 movq 0(%rdi),%r10 1672 movq %r11,%r14 1673 mulq %rax 1674 negq %r15 1675 movq 8(%rdi),%r11 1676 adcq %rax,%rbx 1677 movq 0(%rsi,%rbp,1),%rax 1678 movq %rbx,-16(%rdi) 1679 adcq %rdx,%r8 1680 1681 leaq (%r14,%r10,2),%r12 1682 movq %r8,-8(%rdi) 1683 sbbq %r15,%r15 1684 shrq $63,%r10 1685 leaq (%rcx,%r11,2),%r13 1686 shrq $63,%r11 1687 orq %r10,%r13 1688 movq 16(%rdi),%r10 1689 movq %r11,%r14 1690 mulq %rax 1691 negq %r15 1692 movq 
24(%rdi),%r11 1693 adcq %rax,%r12 1694 movq 8(%rsi,%rbp,1),%rax 1695 movq %r12,0(%rdi) 1696 adcq %rdx,%r13 1697 1698 leaq (%r14,%r10,2),%rbx 1699 movq %r13,8(%rdi) 1700 sbbq %r15,%r15 1701 shrq $63,%r10 1702 leaq (%rcx,%r11,2),%r8 1703 shrq $63,%r11 1704 orq %r10,%r8 1705 movq 32(%rdi),%r10 1706 movq %r11,%r14 1707 mulq %rax 1708 negq %r15 1709 movq 40(%rdi),%r11 1710 adcq %rax,%rbx 1711 movq 16(%rsi,%rbp,1),%rax 1712 movq %rbx,16(%rdi) 1713 adcq %rdx,%r8 1714 movq %r8,24(%rdi) 1715 sbbq %r15,%r15 1716 leaq 64(%rdi),%rdi 1717 addq $32,%rbp 1718 jnz .Lsqr4x_shift_n_add 1719 1720 leaq (%r14,%r10,2),%r12 1721.byte 0x67 1722 shrq $63,%r10 1723 leaq (%rcx,%r11,2),%r13 1724 shrq $63,%r11 1725 orq %r10,%r13 1726 movq -16(%rdi),%r10 1727 movq %r11,%r14 1728 mulq %rax 1729 negq %r15 1730 movq -8(%rdi),%r11 1731 adcq %rax,%r12 1732 movq -8(%rsi),%rax 1733 movq %r12,-32(%rdi) 1734 adcq %rdx,%r13 1735 1736 leaq (%r14,%r10,2),%rbx 1737 movq %r13,-24(%rdi) 1738 sbbq %r15,%r15 1739 shrq $63,%r10 1740 leaq (%rcx,%r11,2),%r8 1741 shrq $63,%r11 1742 orq %r10,%r8 1743 mulq %rax 1744 negq %r15 1745 adcq %rax,%rbx 1746 adcq %rdx,%r8 1747 movq %rbx,-16(%rdi) 1748 movq %r8,-8(%rdi) 1749.byte 102,72,15,126,213 1750__bn_sqr8x_reduction: 1751 xorq %rax,%rax 1752 leaq (%r9,%rbp,1),%rcx 1753 leaq 48+8(%rsp,%r9,2),%rdx 1754 movq %rcx,0+8(%rsp) 1755 leaq 48+8(%rsp,%r9,1),%rdi 1756 movq %rdx,8+8(%rsp) 1757 negq %r9 1758 jmp .L8x_reduction_loop 1759 1760.align 32 1761.L8x_reduction_loop: 1762 leaq (%rdi,%r9,1),%rdi 1763.byte 0x66 1764 movq 0(%rdi),%rbx 1765 movq 8(%rdi),%r9 1766 movq 16(%rdi),%r10 1767 movq 24(%rdi),%r11 1768 movq 32(%rdi),%r12 1769 movq 40(%rdi),%r13 1770 movq 48(%rdi),%r14 1771 movq 56(%rdi),%r15 1772 movq %rax,(%rdx) 1773 leaq 64(%rdi),%rdi 1774 1775.byte 0x67 1776 movq %rbx,%r8 1777 imulq 32+8(%rsp),%rbx 1778 movq 0(%rbp),%rax 1779 movl $8,%ecx 1780 jmp .L8x_reduce 1781 1782.align 32 1783.L8x_reduce: 1784 mulq %rbx 1785 movq 8(%rbp),%rax 1786 negq %r8 1787 movq %rdx,%r8 1788 
adcq $0,%r8 1789 1790 mulq %rbx 1791 addq %rax,%r9 1792 movq 16(%rbp),%rax 1793 adcq $0,%rdx 1794 addq %r9,%r8 1795 movq %rbx,48-8+8(%rsp,%rcx,8) 1796 movq %rdx,%r9 1797 adcq $0,%r9 1798 1799 mulq %rbx 1800 addq %rax,%r10 1801 movq 24(%rbp),%rax 1802 adcq $0,%rdx 1803 addq %r10,%r9 1804 movq 32+8(%rsp),%rsi 1805 movq %rdx,%r10 1806 adcq $0,%r10 1807 1808 mulq %rbx 1809 addq %rax,%r11 1810 movq 32(%rbp),%rax 1811 adcq $0,%rdx 1812 imulq %r8,%rsi 1813 addq %r11,%r10 1814 movq %rdx,%r11 1815 adcq $0,%r11 1816 1817 mulq %rbx 1818 addq %rax,%r12 1819 movq 40(%rbp),%rax 1820 adcq $0,%rdx 1821 addq %r12,%r11 1822 movq %rdx,%r12 1823 adcq $0,%r12 1824 1825 mulq %rbx 1826 addq %rax,%r13 1827 movq 48(%rbp),%rax 1828 adcq $0,%rdx 1829 addq %r13,%r12 1830 movq %rdx,%r13 1831 adcq $0,%r13 1832 1833 mulq %rbx 1834 addq %rax,%r14 1835 movq 56(%rbp),%rax 1836 adcq $0,%rdx 1837 addq %r14,%r13 1838 movq %rdx,%r14 1839 adcq $0,%r14 1840 1841 mulq %rbx 1842 movq %rsi,%rbx 1843 addq %rax,%r15 1844 movq 0(%rbp),%rax 1845 adcq $0,%rdx 1846 addq %r15,%r14 1847 movq %rdx,%r15 1848 adcq $0,%r15 1849 1850 decl %ecx 1851 jnz .L8x_reduce 1852 1853 leaq 64(%rbp),%rbp 1854 xorq %rax,%rax 1855 movq 8+8(%rsp),%rdx 1856 cmpq 0+8(%rsp),%rbp 1857 jae .L8x_no_tail 1858 1859.byte 0x66 1860 addq 0(%rdi),%r8 1861 adcq 8(%rdi),%r9 1862 adcq 16(%rdi),%r10 1863 adcq 24(%rdi),%r11 1864 adcq 32(%rdi),%r12 1865 adcq 40(%rdi),%r13 1866 adcq 48(%rdi),%r14 1867 adcq 56(%rdi),%r15 1868 sbbq %rsi,%rsi 1869 1870 movq 48+56+8(%rsp),%rbx 1871 movl $8,%ecx 1872 movq 0(%rbp),%rax 1873 jmp .L8x_tail 1874 1875.align 32 1876.L8x_tail: 1877 mulq %rbx 1878 addq %rax,%r8 1879 movq 8(%rbp),%rax 1880 movq %r8,(%rdi) 1881 movq %rdx,%r8 1882 adcq $0,%r8 1883 1884 mulq %rbx 1885 addq %rax,%r9 1886 movq 16(%rbp),%rax 1887 adcq $0,%rdx 1888 addq %r9,%r8 1889 leaq 8(%rdi),%rdi 1890 movq %rdx,%r9 1891 adcq $0,%r9 1892 1893 mulq %rbx 1894 addq %rax,%r10 1895 movq 24(%rbp),%rax 1896 adcq $0,%rdx 1897 addq %r10,%r9 1898 movq %rdx,%r10 
1899 adcq $0,%r10 1900 1901 mulq %rbx 1902 addq %rax,%r11 1903 movq 32(%rbp),%rax 1904 adcq $0,%rdx 1905 addq %r11,%r10 1906 movq %rdx,%r11 1907 adcq $0,%r11 1908 1909 mulq %rbx 1910 addq %rax,%r12 1911 movq 40(%rbp),%rax 1912 adcq $0,%rdx 1913 addq %r12,%r11 1914 movq %rdx,%r12 1915 adcq $0,%r12 1916 1917 mulq %rbx 1918 addq %rax,%r13 1919 movq 48(%rbp),%rax 1920 adcq $0,%rdx 1921 addq %r13,%r12 1922 movq %rdx,%r13 1923 adcq $0,%r13 1924 1925 mulq %rbx 1926 addq %rax,%r14 1927 movq 56(%rbp),%rax 1928 adcq $0,%rdx 1929 addq %r14,%r13 1930 movq %rdx,%r14 1931 adcq $0,%r14 1932 1933 mulq %rbx 1934 movq 48-16+8(%rsp,%rcx,8),%rbx 1935 addq %rax,%r15 1936 adcq $0,%rdx 1937 addq %r15,%r14 1938 movq 0(%rbp),%rax 1939 movq %rdx,%r15 1940 adcq $0,%r15 1941 1942 decl %ecx 1943 jnz .L8x_tail 1944 1945 leaq 64(%rbp),%rbp 1946 movq 8+8(%rsp),%rdx 1947 cmpq 0+8(%rsp),%rbp 1948 jae .L8x_tail_done 1949 1950 movq 48+56+8(%rsp),%rbx 1951 negq %rsi 1952 movq 0(%rbp),%rax 1953 adcq 0(%rdi),%r8 1954 adcq 8(%rdi),%r9 1955 adcq 16(%rdi),%r10 1956 adcq 24(%rdi),%r11 1957 adcq 32(%rdi),%r12 1958 adcq 40(%rdi),%r13 1959 adcq 48(%rdi),%r14 1960 adcq 56(%rdi),%r15 1961 sbbq %rsi,%rsi 1962 1963 movl $8,%ecx 1964 jmp .L8x_tail 1965 1966.align 32 1967.L8x_tail_done: 1968 xorq %rax,%rax 1969 addq (%rdx),%r8 1970 adcq $0,%r9 1971 adcq $0,%r10 1972 adcq $0,%r11 1973 adcq $0,%r12 1974 adcq $0,%r13 1975 adcq $0,%r14 1976 adcq $0,%r15 1977 adcq $0,%rax 1978 1979 negq %rsi 1980.L8x_no_tail: 1981 adcq 0(%rdi),%r8 1982 adcq 8(%rdi),%r9 1983 adcq 16(%rdi),%r10 1984 adcq 24(%rdi),%r11 1985 adcq 32(%rdi),%r12 1986 adcq 40(%rdi),%r13 1987 adcq 48(%rdi),%r14 1988 adcq 56(%rdi),%r15 1989 adcq $0,%rax 1990 movq -8(%rbp),%rcx 1991 xorq %rsi,%rsi 1992 1993.byte 102,72,15,126,213 1994 1995 movq %r8,0(%rdi) 1996 movq %r9,8(%rdi) 1997.byte 102,73,15,126,217 1998 movq %r10,16(%rdi) 1999 movq %r11,24(%rdi) 2000 movq %r12,32(%rdi) 2001 movq %r13,40(%rdi) 2002 movq %r14,48(%rdi) 2003 movq %r15,56(%rdi) 2004 leaq 
64(%rdi),%rdi 2005 2006 cmpq %rdx,%rdi 2007 jb .L8x_reduction_loop 2008 .byte 0xf3,0xc3 2009.cfi_endproc 2010.size GFp_bn_sqr8x_internal,.-GFp_bn_sqr8x_internal 2011.type __bn_post4x_internal,@function 2012.align 32 2013__bn_post4x_internal: 2014.cfi_startproc 2015 movq 0(%rbp),%r12 2016 leaq (%rdi,%r9,1),%rbx 2017 movq %r9,%rcx 2018.byte 102,72,15,126,207 2019 negq %rax 2020.byte 102,72,15,126,206 2021 sarq $3+2,%rcx 2022 decq %r12 2023 xorq %r10,%r10 2024 movq 8(%rbp),%r13 2025 movq 16(%rbp),%r14 2026 movq 24(%rbp),%r15 2027 jmp .Lsqr4x_sub_entry 2028 2029.align 16 2030.Lsqr4x_sub: 2031 movq 0(%rbp),%r12 2032 movq 8(%rbp),%r13 2033 movq 16(%rbp),%r14 2034 movq 24(%rbp),%r15 2035.Lsqr4x_sub_entry: 2036 leaq 32(%rbp),%rbp 2037 notq %r12 2038 notq %r13 2039 notq %r14 2040 notq %r15 2041 andq %rax,%r12 2042 andq %rax,%r13 2043 andq %rax,%r14 2044 andq %rax,%r15 2045 2046 negq %r10 2047 adcq 0(%rbx),%r12 2048 adcq 8(%rbx),%r13 2049 adcq 16(%rbx),%r14 2050 adcq 24(%rbx),%r15 2051 movq %r12,0(%rdi) 2052 leaq 32(%rbx),%rbx 2053 movq %r13,8(%rdi) 2054 sbbq %r10,%r10 2055 movq %r14,16(%rdi) 2056 movq %r15,24(%rdi) 2057 leaq 32(%rdi),%rdi 2058 2059 incq %rcx 2060 jnz .Lsqr4x_sub 2061 2062 movq %r9,%r10 2063 negq %r9 2064 .byte 0xf3,0xc3 2065.cfi_endproc 2066.size __bn_post4x_internal,.-__bn_post4x_internal 2067.globl GFp_bn_from_montgomery 2068.hidden GFp_bn_from_montgomery 2069.type GFp_bn_from_montgomery,@function 2070.align 32 2071GFp_bn_from_montgomery: 2072.cfi_startproc 2073 testl $7,%r9d 2074 jz bn_from_mont8x 2075 xorl %eax,%eax 2076 .byte 0xf3,0xc3 2077.cfi_endproc 2078.size GFp_bn_from_montgomery,.-GFp_bn_from_montgomery 2079 2080.type bn_from_mont8x,@function 2081.align 32 2082bn_from_mont8x: 2083.cfi_startproc 2084.byte 0x67 2085 movq %rsp,%rax 2086.cfi_def_cfa_register %rax 2087 pushq %rbx 2088.cfi_offset %rbx,-16 2089 pushq %rbp 2090.cfi_offset %rbp,-24 2091 pushq %r12 2092.cfi_offset %r12,-32 2093 pushq %r13 2094.cfi_offset %r13,-40 2095 pushq %r14 
2096.cfi_offset %r14,-48 2097 pushq %r15 2098.cfi_offset %r15,-56 2099.Lfrom_prologue: 2100 2101 shll $3,%r9d 2102 leaq (%r9,%r9,2),%r10 2103 negq %r9 2104 movq (%r8),%r8 2105 2106 2107 2108 2109 2110 2111 2112 2113 leaq -320(%rsp,%r9,2),%r11 2114 movq %rsp,%rbp 2115 subq %rdi,%r11 2116 andq $4095,%r11 2117 cmpq %r11,%r10 2118 jb .Lfrom_sp_alt 2119 subq %r11,%rbp 2120 leaq -320(%rbp,%r9,2),%rbp 2121 jmp .Lfrom_sp_done 2122 2123.align 32 2124.Lfrom_sp_alt: 2125 leaq 4096-320(,%r9,2),%r10 2126 leaq -320(%rbp,%r9,2),%rbp 2127 subq %r10,%r11 2128 movq $0,%r10 2129 cmovcq %r10,%r11 2130 subq %r11,%rbp 2131.Lfrom_sp_done: 2132 andq $-64,%rbp 2133 movq %rsp,%r11 2134 subq %rbp,%r11 2135 andq $-4096,%r11 2136 leaq (%r11,%rbp,1),%rsp 2137 movq (%rsp),%r10 2138 cmpq %rbp,%rsp 2139 ja .Lfrom_page_walk 2140 jmp .Lfrom_page_walk_done 2141 2142.Lfrom_page_walk: 2143 leaq -4096(%rsp),%rsp 2144 movq (%rsp),%r10 2145 cmpq %rbp,%rsp 2146 ja .Lfrom_page_walk 2147.Lfrom_page_walk_done: 2148 2149 movq %r9,%r10 2150 negq %r9 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 movq %r8,32(%rsp) 2162 movq %rax,40(%rsp) 2163.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2164.Lfrom_body: 2165 movq %r9,%r11 2166 leaq 48(%rsp),%rax 2167 pxor %xmm0,%xmm0 2168 jmp .Lmul_by_1 2169 2170.align 32 2171.Lmul_by_1: 2172 movdqu (%rsi),%xmm1 2173 movdqu 16(%rsi),%xmm2 2174 movdqu 32(%rsi),%xmm3 2175 movdqa %xmm0,(%rax,%r9,1) 2176 movdqu 48(%rsi),%xmm4 2177 movdqa %xmm0,16(%rax,%r9,1) 2178.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 2179 movdqa %xmm1,(%rax) 2180 movdqa %xmm0,32(%rax,%r9,1) 2181 movdqa %xmm2,16(%rax) 2182 movdqa %xmm0,48(%rax,%r9,1) 2183 movdqa %xmm3,32(%rax) 2184 movdqa %xmm4,48(%rax) 2185 leaq 64(%rax),%rax 2186 subq $64,%r11 2187 jnz .Lmul_by_1 2188 2189.byte 102,72,15,110,207 2190.byte 102,72,15,110,209 2191.byte 0x67 2192 movq %rcx,%rbp 2193.byte 102,73,15,110,218 2194 leaq GFp_ia32cap_P(%rip),%r11 2195 movl 8(%r11),%r11d 2196 andl $0x80108,%r11d 2197 cmpl $0x80108,%r11d 2198 jne 
.Lfrom_mont_nox 2199 2200 leaq (%rax,%r9,1),%rdi 2201 call __bn_sqrx8x_reduction 2202 call __bn_postx4x_internal 2203 2204 pxor %xmm0,%xmm0 2205 leaq 48(%rsp),%rax 2206 jmp .Lfrom_mont_zero 2207 2208.align 32 2209.Lfrom_mont_nox: 2210 call __bn_sqr8x_reduction 2211 call __bn_post4x_internal 2212 2213 pxor %xmm0,%xmm0 2214 leaq 48(%rsp),%rax 2215 jmp .Lfrom_mont_zero 2216 2217.align 32 2218.Lfrom_mont_zero: 2219 movq 40(%rsp),%rsi 2220.cfi_def_cfa %rsi,8 2221 movdqa %xmm0,0(%rax) 2222 movdqa %xmm0,16(%rax) 2223 movdqa %xmm0,32(%rax) 2224 movdqa %xmm0,48(%rax) 2225 leaq 64(%rax),%rax 2226 subq $32,%r9 2227 jnz .Lfrom_mont_zero 2228 2229 movq $1,%rax 2230 movq -48(%rsi),%r15 2231.cfi_restore %r15 2232 movq -40(%rsi),%r14 2233.cfi_restore %r14 2234 movq -32(%rsi),%r13 2235.cfi_restore %r13 2236 movq -24(%rsi),%r12 2237.cfi_restore %r12 2238 movq -16(%rsi),%rbp 2239.cfi_restore %rbp 2240 movq -8(%rsi),%rbx 2241.cfi_restore %rbx 2242 leaq (%rsi),%rsp 2243.cfi_def_cfa_register %rsp 2244.Lfrom_epilogue: 2245 .byte 0xf3,0xc3 2246.cfi_endproc 2247.size bn_from_mont8x,.-bn_from_mont8x 2248.type bn_mulx4x_mont_gather5,@function 2249.align 32 2250bn_mulx4x_mont_gather5: 2251.cfi_startproc 2252 movq %rsp,%rax 2253.cfi_def_cfa_register %rax 2254.Lmulx4x_enter: 2255 pushq %rbx 2256.cfi_offset %rbx,-16 2257 pushq %rbp 2258.cfi_offset %rbp,-24 2259 pushq %r12 2260.cfi_offset %r12,-32 2261 pushq %r13 2262.cfi_offset %r13,-40 2263 pushq %r14 2264.cfi_offset %r14,-48 2265 pushq %r15 2266.cfi_offset %r15,-56 2267.Lmulx4x_prologue: 2268 2269 shll $3,%r9d 2270 leaq (%r9,%r9,2),%r10 2271 negq %r9 2272 movq (%r8),%r8 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 leaq -320(%rsp,%r9,2),%r11 2284 movq %rsp,%rbp 2285 subq %rdi,%r11 2286 andq $4095,%r11 2287 cmpq %r11,%r10 2288 jb .Lmulx4xsp_alt 2289 subq %r11,%rbp 2290 leaq -320(%rbp,%r9,2),%rbp 2291 jmp .Lmulx4xsp_done 2292 2293.Lmulx4xsp_alt: 2294 leaq 4096-320(,%r9,2),%r10 2295 leaq -320(%rbp,%r9,2),%rbp 2296 subq %r10,%r11 2297 
movq $0,%r10 2298 cmovcq %r10,%r11 2299 subq %r11,%rbp 2300.Lmulx4xsp_done: 2301 andq $-64,%rbp 2302 movq %rsp,%r11 2303 subq %rbp,%r11 2304 andq $-4096,%r11 2305 leaq (%r11,%rbp,1),%rsp 2306 movq (%rsp),%r10 2307 cmpq %rbp,%rsp 2308 ja .Lmulx4x_page_walk 2309 jmp .Lmulx4x_page_walk_done 2310 2311.Lmulx4x_page_walk: 2312 leaq -4096(%rsp),%rsp 2313 movq (%rsp),%r10 2314 cmpq %rbp,%rsp 2315 ja .Lmulx4x_page_walk 2316.Lmulx4x_page_walk_done: 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 movq %r8,32(%rsp) 2331 movq %rax,40(%rsp) 2332.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2333.Lmulx4x_body: 2334 call mulx4x_internal 2335 2336 movq 40(%rsp),%rsi 2337.cfi_def_cfa %rsi,8 2338 movq $1,%rax 2339 2340 movq -48(%rsi),%r15 2341.cfi_restore %r15 2342 movq -40(%rsi),%r14 2343.cfi_restore %r14 2344 movq -32(%rsi),%r13 2345.cfi_restore %r13 2346 movq -24(%rsi),%r12 2347.cfi_restore %r12 2348 movq -16(%rsi),%rbp 2349.cfi_restore %rbp 2350 movq -8(%rsi),%rbx 2351.cfi_restore %rbx 2352 leaq (%rsi),%rsp 2353.cfi_def_cfa_register %rsp 2354.Lmulx4x_epilogue: 2355 .byte 0xf3,0xc3 2356.cfi_endproc 2357.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2358 2359.type mulx4x_internal,@function 2360.align 32 2361mulx4x_internal: 2362.cfi_startproc 2363 movq %r9,8(%rsp) 2364 movq %r9,%r10 2365 negq %r9 2366 shlq $5,%r9 2367 negq %r10 2368 leaq 128(%rdx,%r9,1),%r13 2369 shrq $5+5,%r9 2370 movd 8(%rax),%xmm5 2371 subq $1,%r9 2372 leaq .Linc(%rip),%rax 2373 movq %r13,16+8(%rsp) 2374 movq %r9,24+8(%rsp) 2375 movq %rdi,56+8(%rsp) 2376 movdqa 0(%rax),%xmm0 2377 movdqa 16(%rax),%xmm1 2378 leaq 88-112(%rsp,%r10,1),%r10 2379 leaq 128(%rdx),%rdi 2380 2381 pshufd $0,%xmm5,%xmm5 2382 movdqa %xmm1,%xmm4 2383.byte 0x67 2384 movdqa %xmm1,%xmm2 2385.byte 0x67 2386 paddd %xmm0,%xmm1 2387 pcmpeqd %xmm5,%xmm0 2388 movdqa %xmm4,%xmm3 2389 paddd %xmm1,%xmm2 2390 pcmpeqd %xmm5,%xmm1 2391 movdqa %xmm0,112(%r10) 2392 movdqa %xmm4,%xmm0 2393 2394 paddd %xmm2,%xmm3 2395 pcmpeqd 
%xmm5,%xmm2 2396 movdqa %xmm1,128(%r10) 2397 movdqa %xmm4,%xmm1 2398 2399 paddd %xmm3,%xmm0 2400 pcmpeqd %xmm5,%xmm3 2401 movdqa %xmm2,144(%r10) 2402 movdqa %xmm4,%xmm2 2403 2404 paddd %xmm0,%xmm1 2405 pcmpeqd %xmm5,%xmm0 2406 movdqa %xmm3,160(%r10) 2407 movdqa %xmm4,%xmm3 2408 paddd %xmm1,%xmm2 2409 pcmpeqd %xmm5,%xmm1 2410 movdqa %xmm0,176(%r10) 2411 movdqa %xmm4,%xmm0 2412 2413 paddd %xmm2,%xmm3 2414 pcmpeqd %xmm5,%xmm2 2415 movdqa %xmm1,192(%r10) 2416 movdqa %xmm4,%xmm1 2417 2418 paddd %xmm3,%xmm0 2419 pcmpeqd %xmm5,%xmm3 2420 movdqa %xmm2,208(%r10) 2421 movdqa %xmm4,%xmm2 2422 2423 paddd %xmm0,%xmm1 2424 pcmpeqd %xmm5,%xmm0 2425 movdqa %xmm3,224(%r10) 2426 movdqa %xmm4,%xmm3 2427 paddd %xmm1,%xmm2 2428 pcmpeqd %xmm5,%xmm1 2429 movdqa %xmm0,240(%r10) 2430 movdqa %xmm4,%xmm0 2431 2432 paddd %xmm2,%xmm3 2433 pcmpeqd %xmm5,%xmm2 2434 movdqa %xmm1,256(%r10) 2435 movdqa %xmm4,%xmm1 2436 2437 paddd %xmm3,%xmm0 2438 pcmpeqd %xmm5,%xmm3 2439 movdqa %xmm2,272(%r10) 2440 movdqa %xmm4,%xmm2 2441 2442 paddd %xmm0,%xmm1 2443 pcmpeqd %xmm5,%xmm0 2444 movdqa %xmm3,288(%r10) 2445 movdqa %xmm4,%xmm3 2446.byte 0x67 2447 paddd %xmm1,%xmm2 2448 pcmpeqd %xmm5,%xmm1 2449 movdqa %xmm0,304(%r10) 2450 2451 paddd %xmm2,%xmm3 2452 pcmpeqd %xmm5,%xmm2 2453 movdqa %xmm1,320(%r10) 2454 2455 pcmpeqd %xmm5,%xmm3 2456 movdqa %xmm2,336(%r10) 2457 2458 pand 64(%rdi),%xmm0 2459 pand 80(%rdi),%xmm1 2460 pand 96(%rdi),%xmm2 2461 movdqa %xmm3,352(%r10) 2462 pand 112(%rdi),%xmm3 2463 por %xmm2,%xmm0 2464 por %xmm3,%xmm1 2465 movdqa -128(%rdi),%xmm4 2466 movdqa -112(%rdi),%xmm5 2467 movdqa -96(%rdi),%xmm2 2468 pand 112(%r10),%xmm4 2469 movdqa -80(%rdi),%xmm3 2470 pand 128(%r10),%xmm5 2471 por %xmm4,%xmm0 2472 pand 144(%r10),%xmm2 2473 por %xmm5,%xmm1 2474 pand 160(%r10),%xmm3 2475 por %xmm2,%xmm0 2476 por %xmm3,%xmm1 2477 movdqa -64(%rdi),%xmm4 2478 movdqa -48(%rdi),%xmm5 2479 movdqa -32(%rdi),%xmm2 2480 pand 176(%r10),%xmm4 2481 movdqa -16(%rdi),%xmm3 2482 pand 192(%r10),%xmm5 2483 por %xmm4,%xmm0 
2484 pand 208(%r10),%xmm2 2485 por %xmm5,%xmm1 2486 pand 224(%r10),%xmm3 2487 por %xmm2,%xmm0 2488 por %xmm3,%xmm1 2489 movdqa 0(%rdi),%xmm4 2490 movdqa 16(%rdi),%xmm5 2491 movdqa 32(%rdi),%xmm2 2492 pand 240(%r10),%xmm4 2493 movdqa 48(%rdi),%xmm3 2494 pand 256(%r10),%xmm5 2495 por %xmm4,%xmm0 2496 pand 272(%r10),%xmm2 2497 por %xmm5,%xmm1 2498 pand 288(%r10),%xmm3 2499 por %xmm2,%xmm0 2500 por %xmm3,%xmm1 2501 pxor %xmm1,%xmm0 2502 pshufd $0x4e,%xmm0,%xmm1 2503 por %xmm1,%xmm0 2504 leaq 256(%rdi),%rdi 2505.byte 102,72,15,126,194 2506 leaq 64+32+8(%rsp),%rbx 2507 2508 movq %rdx,%r9 2509 mulxq 0(%rsi),%r8,%rax 2510 mulxq 8(%rsi),%r11,%r12 2511 addq %rax,%r11 2512 mulxq 16(%rsi),%rax,%r13 2513 adcq %rax,%r12 2514 adcq $0,%r13 2515 mulxq 24(%rsi),%rax,%r14 2516 2517 movq %r8,%r15 2518 imulq 32+8(%rsp),%r8 2519 xorq %rbp,%rbp 2520 movq %r8,%rdx 2521 2522 movq %rdi,8+8(%rsp) 2523 2524 leaq 32(%rsi),%rsi 2525 adcxq %rax,%r13 2526 adcxq %rbp,%r14 2527 2528 mulxq 0(%rcx),%rax,%r10 2529 adcxq %rax,%r15 2530 adoxq %r11,%r10 2531 mulxq 8(%rcx),%rax,%r11 2532 adcxq %rax,%r10 2533 adoxq %r12,%r11 2534 mulxq 16(%rcx),%rax,%r12 2535 movq 24+8(%rsp),%rdi 2536 movq %r10,-32(%rbx) 2537 adcxq %rax,%r11 2538 adoxq %r13,%r12 2539 mulxq 24(%rcx),%rax,%r15 2540 movq %r9,%rdx 2541 movq %r11,-24(%rbx) 2542 adcxq %rax,%r12 2543 adoxq %rbp,%r15 2544 leaq 32(%rcx),%rcx 2545 movq %r12,-16(%rbx) 2546 jmp .Lmulx4x_1st 2547 2548.align 32 2549.Lmulx4x_1st: 2550 adcxq %rbp,%r15 2551 mulxq 0(%rsi),%r10,%rax 2552 adcxq %r14,%r10 2553 mulxq 8(%rsi),%r11,%r14 2554 adcxq %rax,%r11 2555 mulxq 16(%rsi),%r12,%rax 2556 adcxq %r14,%r12 2557 mulxq 24(%rsi),%r13,%r14 2558.byte 0x67,0x67 2559 movq %r8,%rdx 2560 adcxq %rax,%r13 2561 adcxq %rbp,%r14 2562 leaq 32(%rsi),%rsi 2563 leaq 32(%rbx),%rbx 2564 2565 adoxq %r15,%r10 2566 mulxq 0(%rcx),%rax,%r15 2567 adcxq %rax,%r10 2568 adoxq %r15,%r11 2569 mulxq 8(%rcx),%rax,%r15 2570 adcxq %rax,%r11 2571 adoxq %r15,%r12 2572 mulxq 16(%rcx),%rax,%r15 2573 movq 
%r10,-40(%rbx) 2574 adcxq %rax,%r12 2575 movq %r11,-32(%rbx) 2576 adoxq %r15,%r13 2577 mulxq 24(%rcx),%rax,%r15 2578 movq %r9,%rdx 2579 movq %r12,-24(%rbx) 2580 adcxq %rax,%r13 2581 adoxq %rbp,%r15 2582 leaq 32(%rcx),%rcx 2583 movq %r13,-16(%rbx) 2584 2585 decq %rdi 2586 jnz .Lmulx4x_1st 2587 2588 movq 8(%rsp),%rax 2589 adcq %rbp,%r15 2590 leaq (%rsi,%rax,1),%rsi 2591 addq %r15,%r14 2592 movq 8+8(%rsp),%rdi 2593 adcq %rbp,%rbp 2594 movq %r14,-8(%rbx) 2595 jmp .Lmulx4x_outer 2596 2597.align 32 2598.Lmulx4x_outer: 2599 leaq 16-256(%rbx),%r10 2600 pxor %xmm4,%xmm4 2601.byte 0x67,0x67 2602 pxor %xmm5,%xmm5 2603 movdqa -128(%rdi),%xmm0 2604 movdqa -112(%rdi),%xmm1 2605 movdqa -96(%rdi),%xmm2 2606 pand 256(%r10),%xmm0 2607 movdqa -80(%rdi),%xmm3 2608 pand 272(%r10),%xmm1 2609 por %xmm0,%xmm4 2610 pand 288(%r10),%xmm2 2611 por %xmm1,%xmm5 2612 pand 304(%r10),%xmm3 2613 por %xmm2,%xmm4 2614 por %xmm3,%xmm5 2615 movdqa -64(%rdi),%xmm0 2616 movdqa -48(%rdi),%xmm1 2617 movdqa -32(%rdi),%xmm2 2618 pand 320(%r10),%xmm0 2619 movdqa -16(%rdi),%xmm3 2620 pand 336(%r10),%xmm1 2621 por %xmm0,%xmm4 2622 pand 352(%r10),%xmm2 2623 por %xmm1,%xmm5 2624 pand 368(%r10),%xmm3 2625 por %xmm2,%xmm4 2626 por %xmm3,%xmm5 2627 movdqa 0(%rdi),%xmm0 2628 movdqa 16(%rdi),%xmm1 2629 movdqa 32(%rdi),%xmm2 2630 pand 384(%r10),%xmm0 2631 movdqa 48(%rdi),%xmm3 2632 pand 400(%r10),%xmm1 2633 por %xmm0,%xmm4 2634 pand 416(%r10),%xmm2 2635 por %xmm1,%xmm5 2636 pand 432(%r10),%xmm3 2637 por %xmm2,%xmm4 2638 por %xmm3,%xmm5 2639 movdqa 64(%rdi),%xmm0 2640 movdqa 80(%rdi),%xmm1 2641 movdqa 96(%rdi),%xmm2 2642 pand 448(%r10),%xmm0 2643 movdqa 112(%rdi),%xmm3 2644 pand 464(%r10),%xmm1 2645 por %xmm0,%xmm4 2646 pand 480(%r10),%xmm2 2647 por %xmm1,%xmm5 2648 pand 496(%r10),%xmm3 2649 por %xmm2,%xmm4 2650 por %xmm3,%xmm5 2651 por %xmm5,%xmm4 2652 pshufd $0x4e,%xmm4,%xmm0 2653 por %xmm4,%xmm0 2654 leaq 256(%rdi),%rdi 2655.byte 102,72,15,126,194 2656 2657 movq %rbp,(%rbx) 2658 leaq 32(%rbx,%rax,1),%rbx 2659 mulxq 
0(%rsi),%r8,%r11 2660 xorq %rbp,%rbp 2661 movq %rdx,%r9 2662 mulxq 8(%rsi),%r14,%r12 2663 adoxq -32(%rbx),%r8 2664 adcxq %r14,%r11 2665 mulxq 16(%rsi),%r15,%r13 2666 adoxq -24(%rbx),%r11 2667 adcxq %r15,%r12 2668 mulxq 24(%rsi),%rdx,%r14 2669 adoxq -16(%rbx),%r12 2670 adcxq %rdx,%r13 2671 leaq (%rcx,%rax,1),%rcx 2672 leaq 32(%rsi),%rsi 2673 adoxq -8(%rbx),%r13 2674 adcxq %rbp,%r14 2675 adoxq %rbp,%r14 2676 2677 movq %r8,%r15 2678 imulq 32+8(%rsp),%r8 2679 2680 movq %r8,%rdx 2681 xorq %rbp,%rbp 2682 movq %rdi,8+8(%rsp) 2683 2684 mulxq 0(%rcx),%rax,%r10 2685 adcxq %rax,%r15 2686 adoxq %r11,%r10 2687 mulxq 8(%rcx),%rax,%r11 2688 adcxq %rax,%r10 2689 adoxq %r12,%r11 2690 mulxq 16(%rcx),%rax,%r12 2691 adcxq %rax,%r11 2692 adoxq %r13,%r12 2693 mulxq 24(%rcx),%rax,%r15 2694 movq %r9,%rdx 2695 movq 24+8(%rsp),%rdi 2696 movq %r10,-32(%rbx) 2697 adcxq %rax,%r12 2698 movq %r11,-24(%rbx) 2699 adoxq %rbp,%r15 2700 movq %r12,-16(%rbx) 2701 leaq 32(%rcx),%rcx 2702 jmp .Lmulx4x_inner 2703 2704.align 32 2705.Lmulx4x_inner: 2706 mulxq 0(%rsi),%r10,%rax 2707 adcxq %rbp,%r15 2708 adoxq %r14,%r10 2709 mulxq 8(%rsi),%r11,%r14 2710 adcxq 0(%rbx),%r10 2711 adoxq %rax,%r11 2712 mulxq 16(%rsi),%r12,%rax 2713 adcxq 8(%rbx),%r11 2714 adoxq %r14,%r12 2715 mulxq 24(%rsi),%r13,%r14 2716 movq %r8,%rdx 2717 adcxq 16(%rbx),%r12 2718 adoxq %rax,%r13 2719 adcxq 24(%rbx),%r13 2720 adoxq %rbp,%r14 2721 leaq 32(%rsi),%rsi 2722 leaq 32(%rbx),%rbx 2723 adcxq %rbp,%r14 2724 2725 adoxq %r15,%r10 2726 mulxq 0(%rcx),%rax,%r15 2727 adcxq %rax,%r10 2728 adoxq %r15,%r11 2729 mulxq 8(%rcx),%rax,%r15 2730 adcxq %rax,%r11 2731 adoxq %r15,%r12 2732 mulxq 16(%rcx),%rax,%r15 2733 movq %r10,-40(%rbx) 2734 adcxq %rax,%r12 2735 adoxq %r15,%r13 2736 movq %r11,-32(%rbx) 2737 mulxq 24(%rcx),%rax,%r15 2738 movq %r9,%rdx 2739 leaq 32(%rcx),%rcx 2740 movq %r12,-24(%rbx) 2741 adcxq %rax,%r13 2742 adoxq %rbp,%r15 2743 movq %r13,-16(%rbx) 2744 2745 decq %rdi 2746 jnz .Lmulx4x_inner 2747 2748 movq 0+8(%rsp),%rax 2749 adcq 
%rbp,%r15 2750 subq 0(%rbx),%rdi 2751 movq 8+8(%rsp),%rdi 2752 movq 16+8(%rsp),%r10 2753 adcq %r15,%r14 2754 leaq (%rsi,%rax,1),%rsi 2755 adcq %rbp,%rbp 2756 movq %r14,-8(%rbx) 2757 2758 cmpq %r10,%rdi 2759 jb .Lmulx4x_outer 2760 2761 movq -8(%rcx),%r10 2762 movq %rbp,%r8 2763 movq (%rcx,%rax,1),%r12 2764 leaq (%rcx,%rax,1),%rbp 2765 movq %rax,%rcx 2766 leaq (%rbx,%rax,1),%rdi 2767 xorl %eax,%eax 2768 xorq %r15,%r15 2769 subq %r14,%r10 2770 adcq %r15,%r15 2771 orq %r15,%r8 2772 sarq $3+2,%rcx 2773 subq %r8,%rax 2774 movq 56+8(%rsp),%rdx 2775 decq %r12 2776 movq 8(%rbp),%r13 2777 xorq %r8,%r8 2778 movq 16(%rbp),%r14 2779 movq 24(%rbp),%r15 2780 jmp .Lsqrx4x_sub_entry 2781.cfi_endproc 2782.size mulx4x_internal,.-mulx4x_internal 2783.type bn_powerx5,@function 2784.align 32 2785bn_powerx5: 2786.cfi_startproc 2787 movq %rsp,%rax 2788.cfi_def_cfa_register %rax 2789.Lpowerx5_enter: 2790 pushq %rbx 2791.cfi_offset %rbx,-16 2792 pushq %rbp 2793.cfi_offset %rbp,-24 2794 pushq %r12 2795.cfi_offset %r12,-32 2796 pushq %r13 2797.cfi_offset %r13,-40 2798 pushq %r14 2799.cfi_offset %r14,-48 2800 pushq %r15 2801.cfi_offset %r15,-56 2802.Lpowerx5_prologue: 2803 2804 shll $3,%r9d 2805 leaq (%r9,%r9,2),%r10 2806 negq %r9 2807 movq (%r8),%r8 2808 2809 2810 2811 2812 2813 2814 2815 2816 leaq -320(%rsp,%r9,2),%r11 2817 movq %rsp,%rbp 2818 subq %rdi,%r11 2819 andq $4095,%r11 2820 cmpq %r11,%r10 2821 jb .Lpwrx_sp_alt 2822 subq %r11,%rbp 2823 leaq -320(%rbp,%r9,2),%rbp 2824 jmp .Lpwrx_sp_done 2825 2826.align 32 2827.Lpwrx_sp_alt: 2828 leaq 4096-320(,%r9,2),%r10 2829 leaq -320(%rbp,%r9,2),%rbp 2830 subq %r10,%r11 2831 movq $0,%r10 2832 cmovcq %r10,%r11 2833 subq %r11,%rbp 2834.Lpwrx_sp_done: 2835 andq $-64,%rbp 2836 movq %rsp,%r11 2837 subq %rbp,%r11 2838 andq $-4096,%r11 2839 leaq (%r11,%rbp,1),%rsp 2840 movq (%rsp),%r10 2841 cmpq %rbp,%rsp 2842 ja .Lpwrx_page_walk 2843 jmp .Lpwrx_page_walk_done 2844 2845.Lpwrx_page_walk: 2846 leaq -4096(%rsp),%rsp 2847 movq (%rsp),%r10 2848 cmpq %rbp,%rsp 
2849 ja .Lpwrx_page_walk 2850.Lpwrx_page_walk_done: 2851 2852 movq %r9,%r10 2853 negq %r9 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 pxor %xmm0,%xmm0 2867.byte 102,72,15,110,207 2868.byte 102,72,15,110,209 2869.byte 102,73,15,110,218 2870.byte 102,72,15,110,226 2871 movq %r8,32(%rsp) 2872 movq %rax,40(%rsp) 2873.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2874.Lpowerx5_body: 2875 2876 call __bn_sqrx8x_internal 2877 call __bn_postx4x_internal 2878 call __bn_sqrx8x_internal 2879 call __bn_postx4x_internal 2880 call __bn_sqrx8x_internal 2881 call __bn_postx4x_internal 2882 call __bn_sqrx8x_internal 2883 call __bn_postx4x_internal 2884 call __bn_sqrx8x_internal 2885 call __bn_postx4x_internal 2886 2887 movq %r10,%r9 2888 movq %rsi,%rdi 2889.byte 102,72,15,126,209 2890.byte 102,72,15,126,226 2891 movq 40(%rsp),%rax 2892 2893 call mulx4x_internal 2894 2895 movq 40(%rsp),%rsi 2896.cfi_def_cfa %rsi,8 2897 movq $1,%rax 2898 2899 movq -48(%rsi),%r15 2900.cfi_restore %r15 2901 movq -40(%rsi),%r14 2902.cfi_restore %r14 2903 movq -32(%rsi),%r13 2904.cfi_restore %r13 2905 movq -24(%rsi),%r12 2906.cfi_restore %r12 2907 movq -16(%rsi),%rbp 2908.cfi_restore %rbp 2909 movq -8(%rsi),%rbx 2910.cfi_restore %rbx 2911 leaq (%rsi),%rsp 2912.cfi_def_cfa_register %rsp 2913.Lpowerx5_epilogue: 2914 .byte 0xf3,0xc3 2915.cfi_endproc 2916.size bn_powerx5,.-bn_powerx5 2917 2918.globl GFp_bn_sqrx8x_internal 2919.hidden GFp_bn_sqrx8x_internal 2920.type GFp_bn_sqrx8x_internal,@function 2921.align 32 2922GFp_bn_sqrx8x_internal: 2923__bn_sqrx8x_internal: 2924.cfi_startproc 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 leaq 48+8(%rsp),%rdi 2966 leaq (%rsi,%r9,1),%rbp 2967 movq %r9,0+8(%rsp) 2968 movq %rbp,8+8(%rsp) 2969 jmp .Lsqr8x_zero_start 2970 2971.align 32 2972.byte 
0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2973.Lsqrx8x_zero: 2974.byte 0x3e 2975 movdqa %xmm0,0(%rdi) 2976 movdqa %xmm0,16(%rdi) 2977 movdqa %xmm0,32(%rdi) 2978 movdqa %xmm0,48(%rdi) 2979.Lsqr8x_zero_start: 2980 movdqa %xmm0,64(%rdi) 2981 movdqa %xmm0,80(%rdi) 2982 movdqa %xmm0,96(%rdi) 2983 movdqa %xmm0,112(%rdi) 2984 leaq 128(%rdi),%rdi 2985 subq $64,%r9 2986 jnz .Lsqrx8x_zero 2987 2988 movq 0(%rsi),%rdx 2989 2990 xorq %r10,%r10 2991 xorq %r11,%r11 2992 xorq %r12,%r12 2993 xorq %r13,%r13 2994 xorq %r14,%r14 2995 xorq %r15,%r15 2996 leaq 48+8(%rsp),%rdi 2997 xorq %rbp,%rbp 2998 jmp .Lsqrx8x_outer_loop 2999 3000.align 32 3001.Lsqrx8x_outer_loop: 3002 mulxq 8(%rsi),%r8,%rax 3003 adcxq %r9,%r8 3004 adoxq %rax,%r10 3005 mulxq 16(%rsi),%r9,%rax 3006 adcxq %r10,%r9 3007 adoxq %rax,%r11 3008.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 3009 adcxq %r11,%r10 3010 adoxq %rax,%r12 3011.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 3012 adcxq %r12,%r11 3013 adoxq %rax,%r13 3014 mulxq 40(%rsi),%r12,%rax 3015 adcxq %r13,%r12 3016 adoxq %rax,%r14 3017 mulxq 48(%rsi),%r13,%rax 3018 adcxq %r14,%r13 3019 adoxq %r15,%rax 3020 mulxq 56(%rsi),%r14,%r15 3021 movq 8(%rsi),%rdx 3022 adcxq %rax,%r14 3023 adoxq %rbp,%r15 3024 adcq 64(%rdi),%r15 3025 movq %r8,8(%rdi) 3026 movq %r9,16(%rdi) 3027 sbbq %rcx,%rcx 3028 xorq %rbp,%rbp 3029 3030 3031 mulxq 16(%rsi),%r8,%rbx 3032 mulxq 24(%rsi),%r9,%rax 3033 adcxq %r10,%r8 3034 adoxq %rbx,%r9 3035 mulxq 32(%rsi),%r10,%rbx 3036 adcxq %r11,%r9 3037 adoxq %rax,%r10 3038.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 3039 adcxq %r12,%r10 3040 adoxq %rbx,%r11 3041.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 3042 adcxq %r13,%r11 3043 adoxq %r14,%r12 3044.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 3045 movq 16(%rsi),%rdx 3046 adcxq %rax,%r12 3047 adoxq %rbx,%r13 3048 adcxq %r15,%r13 3049 adoxq %rbp,%r14 3050 adcxq %rbp,%r14 3051 3052 movq %r8,24(%rdi) 3053 movq %r9,32(%rdi) 3054 3055 mulxq 24(%rsi),%r8,%rbx 3056 
mulxq 32(%rsi),%r9,%rax 3057 adcxq %r10,%r8 3058 adoxq %rbx,%r9 3059 mulxq 40(%rsi),%r10,%rbx 3060 adcxq %r11,%r9 3061 adoxq %rax,%r10 3062.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 3063 adcxq %r12,%r10 3064 adoxq %r13,%r11 3065.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 3066.byte 0x3e 3067 movq 24(%rsi),%rdx 3068 adcxq %rbx,%r11 3069 adoxq %rax,%r12 3070 adcxq %r14,%r12 3071 movq %r8,40(%rdi) 3072 movq %r9,48(%rdi) 3073 mulxq 32(%rsi),%r8,%rax 3074 adoxq %rbp,%r13 3075 adcxq %rbp,%r13 3076 3077 mulxq 40(%rsi),%r9,%rbx 3078 adcxq %r10,%r8 3079 adoxq %rax,%r9 3080 mulxq 48(%rsi),%r10,%rax 3081 adcxq %r11,%r9 3082 adoxq %r12,%r10 3083 mulxq 56(%rsi),%r11,%r12 3084 movq 32(%rsi),%rdx 3085 movq 40(%rsi),%r14 3086 adcxq %rbx,%r10 3087 adoxq %rax,%r11 3088 movq 48(%rsi),%r15 3089 adcxq %r13,%r11 3090 adoxq %rbp,%r12 3091 adcxq %rbp,%r12 3092 3093 movq %r8,56(%rdi) 3094 movq %r9,64(%rdi) 3095 3096 mulxq %r14,%r9,%rax 3097 movq 56(%rsi),%r8 3098 adcxq %r10,%r9 3099 mulxq %r15,%r10,%rbx 3100 adoxq %rax,%r10 3101 adcxq %r11,%r10 3102 mulxq %r8,%r11,%rax 3103 movq %r14,%rdx 3104 adoxq %rbx,%r11 3105 adcxq %r12,%r11 3106 3107 adcxq %rbp,%rax 3108 3109 mulxq %r15,%r14,%rbx 3110 mulxq %r8,%r12,%r13 3111 movq %r15,%rdx 3112 leaq 64(%rsi),%rsi 3113 adcxq %r14,%r11 3114 adoxq %rbx,%r12 3115 adcxq %rax,%r12 3116 adoxq %rbp,%r13 3117 3118.byte 0x67,0x67 3119 mulxq %r8,%r8,%r14 3120 adcxq %r8,%r13 3121 adcxq %rbp,%r14 3122 3123 cmpq 8+8(%rsp),%rsi 3124 je .Lsqrx8x_outer_break 3125 3126 negq %rcx 3127 movq $-8,%rcx 3128 movq %rbp,%r15 3129 movq 64(%rdi),%r8 3130 adcxq 72(%rdi),%r9 3131 adcxq 80(%rdi),%r10 3132 adcxq 88(%rdi),%r11 3133 adcq 96(%rdi),%r12 3134 adcq 104(%rdi),%r13 3135 adcq 112(%rdi),%r14 3136 adcq 120(%rdi),%r15 3137 leaq (%rsi),%rbp 3138 leaq 128(%rdi),%rdi 3139 sbbq %rax,%rax 3140 3141 movq -64(%rsi),%rdx 3142 movq %rax,16+8(%rsp) 3143 movq %rdi,24+8(%rsp) 3144 3145 3146 xorl %eax,%eax 3147 jmp .Lsqrx8x_loop 3148 3149.align 32 3150.Lsqrx8x_loop: 3151 
movq %r8,%rbx 3152 mulxq 0(%rbp),%rax,%r8 3153 adcxq %rax,%rbx 3154 adoxq %r9,%r8 3155 3156 mulxq 8(%rbp),%rax,%r9 3157 adcxq %rax,%r8 3158 adoxq %r10,%r9 3159 3160 mulxq 16(%rbp),%rax,%r10 3161 adcxq %rax,%r9 3162 adoxq %r11,%r10 3163 3164 mulxq 24(%rbp),%rax,%r11 3165 adcxq %rax,%r10 3166 adoxq %r12,%r11 3167 3168.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3169 adcxq %rax,%r11 3170 adoxq %r13,%r12 3171 3172 mulxq 40(%rbp),%rax,%r13 3173 adcxq %rax,%r12 3174 adoxq %r14,%r13 3175 3176 mulxq 48(%rbp),%rax,%r14 3177 movq %rbx,(%rdi,%rcx,8) 3178 movl $0,%ebx 3179 adcxq %rax,%r13 3180 adoxq %r15,%r14 3181 3182.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 3183 movq 8(%rsi,%rcx,8),%rdx 3184 adcxq %rax,%r14 3185 adoxq %rbx,%r15 3186 adcxq %rbx,%r15 3187 3188.byte 0x67 3189 incq %rcx 3190 jnz .Lsqrx8x_loop 3191 3192 leaq 64(%rbp),%rbp 3193 movq $-8,%rcx 3194 cmpq 8+8(%rsp),%rbp 3195 je .Lsqrx8x_break 3196 3197 subq 16+8(%rsp),%rbx 3198.byte 0x66 3199 movq -64(%rsi),%rdx 3200 adcxq 0(%rdi),%r8 3201 adcxq 8(%rdi),%r9 3202 adcq 16(%rdi),%r10 3203 adcq 24(%rdi),%r11 3204 adcq 32(%rdi),%r12 3205 adcq 40(%rdi),%r13 3206 adcq 48(%rdi),%r14 3207 adcq 56(%rdi),%r15 3208 leaq 64(%rdi),%rdi 3209.byte 0x67 3210 sbbq %rax,%rax 3211 xorl %ebx,%ebx 3212 movq %rax,16+8(%rsp) 3213 jmp .Lsqrx8x_loop 3214 3215.align 32 3216.Lsqrx8x_break: 3217 xorq %rbp,%rbp 3218 subq 16+8(%rsp),%rbx 3219 adcxq %rbp,%r8 3220 movq 24+8(%rsp),%rcx 3221 adcxq %rbp,%r9 3222 movq 0(%rsi),%rdx 3223 adcq $0,%r10 3224 movq %r8,0(%rdi) 3225 adcq $0,%r11 3226 adcq $0,%r12 3227 adcq $0,%r13 3228 adcq $0,%r14 3229 adcq $0,%r15 3230 cmpq %rcx,%rdi 3231 je .Lsqrx8x_outer_loop 3232 3233 movq %r9,8(%rdi) 3234 movq 8(%rcx),%r9 3235 movq %r10,16(%rdi) 3236 movq 16(%rcx),%r10 3237 movq %r11,24(%rdi) 3238 movq 24(%rcx),%r11 3239 movq %r12,32(%rdi) 3240 movq 32(%rcx),%r12 3241 movq %r13,40(%rdi) 3242 movq 40(%rcx),%r13 3243 movq %r14,48(%rdi) 3244 movq 48(%rcx),%r14 3245 movq %r15,56(%rdi) 3246 movq 56(%rcx),%r15 
# ----------------------------------------------------------------------------
# Tail of GFp_bn_sqrx8x_internal (the function begins earlier in the file),
# followed by __bn_postx4x_internal, GFp_bn_scatter5 and GFp_bn_gather5.
# Generated MULX/ADCX/ADOX (BMI2+ADX) Montgomery squaring code from the
# CRYPTOGAMS x86_64-mont5 perlasm script.  The several `.byte` sequences are
# deliberately hand-encoded instructions (decoded form noted beside each) —
# do not replace them with mnemonics; the exact encoding/length is intended.
# NOTE(review): register roles below are inferred from the visible code and
# the generator's conventions — confirm against the function prologue, which
# is outside this chunk.
# ----------------------------------------------------------------------------
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	# End of the cross-product accumulation: flush the last carries of the
	# current 8-limb window out to the temporary result buffer at %rdi.
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217	# movq %xmm3,%rcx
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi		# %rdi = tp[], the stack result buffer
	movq	(%rsi,%rcx,1),%rdx	# %rdx = a[0] (first limb to square)

	movq	8(%rdi),%r11
	xorq	%r10,%r10		# clear %r10 and both carry chains (CF/OF)
	movq	0+8(%rsp),%r9
	adoxq	%r11,%r11		# start doubling the cross products (OF chain)
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


.align	32
# Squaring fix-up: double the accumulated cross products (tp[] * 2 via the
# ADOX chain) and add the per-limb squares a[i]^2 (mulx %rdx,... squares the
# limb held in %rdx) via the ADCX chain.  Four limbs per iteration; %rcx is a
# negative byte-scaled index that counts up to zero.
.Lsqrx4x_shift_n_add:
	mulxq	%rdx,%rax,%rbx		# %rbx:%rax = a[i]^2
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# movq 8(%rsi,%rcx,1),%rdx
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# movq 32(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx	# next limb to square
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx		# advance index by 4 limbs (32 bytes)
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break	# %rcx reached 0: last group
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# movq 0(%rsi,%rcx,1),%rdx
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213	# movq %xmm2,%rbp  (modulus pointer restored)
# ----------------------------------------------------------------------------
# __bn_sqrx8x_internal falls through here: Montgomery reduction of the
# double-width square in tp[] modulo n (%rbp), eight limbs per outer pass.
# 32+8(%rsp) appears to hold n0 = -n^-1 mod 2^64 (see the imul/mulx uses
# below) — standard Montgomery setup; confirm against the prologue.
# ----------------------------------------------------------------------------
__bn_sqrx8x_reduction:
	xorl	%eax,%eax		# top carry accumulator = 0
	movq	32+8(%rsp),%rbx		# %rbx = n0
	movq	48+8(%rsp),%rdx		# %rdx = tp[0]
	leaq	-64(%rbp,%r9,1),%rcx	# %rcx = end-of-modulus sentinel

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)		# remember end of tp[] for loop exit test

	leaq	48+8(%rsp),%rdi		# %rdi = tp[]
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	# Load the next eight limbs of tp[] and compute the first Montgomery
	# multiplier m = tp[0] * n0 (kept in %rdx for the mulx chain below).
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx		# %rdx = m = tp[0] * n0 (mod 2^64)
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		# stash running top carry

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		# %rsi = 0; also clears CF and OF
	movq	$-8,%rcx		# eight inner iterations
	jmp	.Lsqrx8x_reduce

.align	32
# Inner reduction: accumulate m * n[0..7] into the window using the dual
# ADCX (CF) / ADOX (OF) carry chains; each iteration also derives the next
# multiplier m' = (new tp[0]) * n0 and saves m for the tail pass.
.Lsqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax		# low product must cancel tp[0] exactly
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rbx,%r12
	movq	%rdx,%rax		# save current m
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	# %rbx = next m = new-tp[0] * n0
	movq	%rax,%rdx		# restore current m for remaining limbs
	movq	%rax,64+48+8(%rsp,%rcx,8)	# save m for .Lsqrx8x_tail

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		# switch to next multiplier m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		# fold OF chain into %r15 (+0)
	adcxq	%rsi,%r15		# fold CF chain into %r15 (+0)

.byte	0x67,0x67,0x67	# addr-size prefixes as padding — presumably for
			# front-end alignment of the loop branch; keep as-is
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp		# reached the last modulus block?
	jae	.Lsqrx8x_no_tail

	# More modulus limbs remain: add the next tp[] block in and run the
	# tail passes with the multipliers saved above.
	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		# %rax = -CF (borrow/carry mask)

	xorq	%rsi,%rsi		# clear CF/OF for the next dual chain
	movq	%rax,16+8(%rsp)		# remember carry into this block
	jmp	.Lsqrx8x_tail

.align	32
# Tail pass: same dual-carry-chain structure as .Lsqrx8x_reduce, but reuses
# the multipliers m saved at 64+48+8(%rsp,%rcx,8) instead of deriving them.
.Lsqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	# next saved multiplier m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)	# store reduced limb
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp		# more modulus blocks?
	jae	.Lsqrx8x_tail_done

	# Advance to the next block: replay the saved inter-block carry
	# (16+8(%rsp)) into CF, then absorb the next tp[] block.
	subq	16+8(%rsp),%rsi
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	# Fold the outer-loop top carry (24+8(%rsp)) through the window;
	# %rax collects the final carry out.
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		# replay last inter-block carry into CF
.Lsqrx8x_no_tail:
	# Add the upper half of the square (still in tp[]) to the reduced
	# window and write the result back; %rax ends as the top carry.
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217	# movq %xmm3,%rcx
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		# top modulus limb (for the caller's fix-up)
.byte	102,72,15,126,213	# movq %xmm2,%rbp
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax

	movq	32+8(%rsp),%rbx
	movq	64(%rdi,%rcx,1),%rdx

	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8		# processed the whole tp[] buffer?
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	GFp_bn_sqrx8x_internal,.-GFp_bn_sqrx8x_internal
# ----------------------------------------------------------------------------
# __bn_postx4x_internal: constant-time final step of the mulx path.  Using
# the borrow mask in %rax (0 or all-ones), conditionally subtracts the
# modulus (%rbp) from the result while copying it to %rdx, four limbs per
# iteration, with no data-dependent branches (ANDN + ADC carry chain).
# Internal helper — custom register contract, not a C-callable function.
# ----------------------------------------------------------------------------
.align	32
.type	__bn_postx4x_internal,@function
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax
	sarq	$3+2,%rcx		# byte count -> count of 4-limb groups
.byte	102,72,15,126,202	# movq %xmm1,%rdx  (destination pointer)
.byte	102,72,15,126,206	# movq %xmm1,%rsi
	decq	%r12			# pre-bias first limb for the ~n + 1 trick
	movq	8(%rbp),%r13
	xorq	%r8,%r8			# initial borrow = 0
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	# rK = ~n[i] & mask: with mask = -1 the ADC chain below computes
	# r - n; with mask = 0 it just copies r.  Branch-free on purpose.
	andnq	%rax,%r12,%r12
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			# reload saved borrow into CF
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			# save borrow for next group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			# restore byte count for the caller

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
# ----------------------------------------------------------------------------
# void GFp_bn_scatter5(const BN_ULONG *in (%rdi), size_t num (%esi),
#                      void *table (%rdx), size_t power (%rcx))
# Stores num qwords from in[] into the power table at a 256-byte stride,
# starting at table + power*8 — the layout GFp_bn_gather5 reads back.
# ----------------------------------------------------------------------------
.globl	GFp_bn_scatter5
.hidden GFp_bn_scatter5
.type	GFp_bn_scatter5,@function
.align	16
GFp_bn_scatter5:
.cfi_startproc
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	# num == 0: nothing to store
	leaq	(%rdx,%rcx,8),%rdx	# &table[power]
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# 32 entries * 8 bytes between limbs
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	GFp_bn_scatter5,.-GFp_bn_scatter5

# ----------------------------------------------------------------------------
# void GFp_bn_gather5(BN_ULONG *out (%rdi), size_t num (%esi),
#                     const void *table (%rdx), size_t power (%ecx))
# Constant-time gather: reads entry `power` of the 32-entry scatter table
# WITHOUT a data-dependent address.  It builds 32 dword equality masks
# (index == power ? -1 : 0) on the stack, then ANDs every table entry with
# its mask and ORs the results together — all 32 entries are always touched,
# so the access pattern is independent of the secret index.
# ----------------------------------------------------------------------------
.globl	GFp_bn_gather5
.hidden GFp_bn_gather5
.type	GFp_bn_gather5,@function
.align	32
GFp_bn_gather5:
.cfi_startproc
.LSEH_begin_GFp_bn_gather5:
	# Hand-encoded prologue (exact encoding pinned for the Win64 SEH
	# stub that brackets .LSEH_begin/.LSEH_end):
.byte	0x4c,0x8d,0x14,0x24		# leaq (%rsp),%r10 — save caller rsp
.cfi_def_cfa_register	%r10
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# subq $0x108,%rsp — mask scratch
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		# 16-byte align for movdqa stores

	movd	%ecx,%xmm5		# xmm5 = power, broadcast below
	movdqa	0(%rax),%xmm0		# {0,0,1,1}
	movdqa	16(%rax),%xmm1		# {2,2,2,2} increment
	leaq	128(%rdx),%r11		# biased table pointer
	leaq	128(%rsp),%rax		# biased mask-buffer pointer

	pshufd	$0,%xmm5,%xmm5		# broadcast power to all four dwords

	# Generate the 16 pcmpeqd masks for indices 0..31 (two indices per
	# xmm register) by repeatedly adding {2,2,2,2} and comparing with the
	# broadcast power; store them at -128(%rax)..112(%rax).
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
# One output qword per iteration: AND all 32 table rows (at %r11) with their
# masks (at %rax) and OR everything down to one value.  Every row is read on
# every iteration regardless of `power` — constant-time by construction.
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11		# next limb row of the table
	pshufd	$0x4e,%xmm4,%xmm0	# fold high qword onto low qword
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)		# emit the selected limb
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		# restore caller's stack pointer
.cfi_def_cfa_register	%rsp
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_GFp_bn_gather5:
.cfi_endproc
.size	GFp_bn_gather5,.-GFp_bn_gather5
.align	64
# {0,0,1,1} / {2,2,2,2}: seed and increment for the mask generator above.
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
# <appro@openssl.org>" — ASCII banner, NUL-terminated.
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3785#endif 3786.section .note.GNU-stack,"",@progbits 3787