1#if defined(__x86_64__) 2.text 3 4.extern OPENSSL_ia32cap_P 5.hidden OPENSSL_ia32cap_P 6 7.globl bn_mul_mont_gather5 8.hidden bn_mul_mont_gather5 9.type bn_mul_mont_gather5,@function 10.align 64 11bn_mul_mont_gather5: 12.cfi_startproc 13 movl %r9d,%r9d 14 movq %rsp,%rax 15.cfi_def_cfa_register %rax 16 testl $7,%r9d 17 jnz .Lmul_enter 18 jmp .Lmul4x_enter 19 20.align 16 21.Lmul_enter: 22 movd 8(%rsp),%xmm5 23 pushq %rbx 24.cfi_offset %rbx,-16 25 pushq %rbp 26.cfi_offset %rbp,-24 27 pushq %r12 28.cfi_offset %r12,-32 29 pushq %r13 30.cfi_offset %r13,-40 31 pushq %r14 32.cfi_offset %r14,-48 33 pushq %r15 34.cfi_offset %r15,-56 35 36 negq %r9 37 movq %rsp,%r11 38 leaq -280(%rsp,%r9,8),%r10 39 negq %r9 40 andq $-1024,%r10 41 42 43 44 45 46 47 48 49 50 subq %r10,%r11 51 andq $-4096,%r11 52 leaq (%r10,%r11,1),%rsp 53 movq (%rsp),%r11 54 cmpq %r10,%rsp 55 ja .Lmul_page_walk 56 jmp .Lmul_page_walk_done 57 58.Lmul_page_walk: 59 leaq -4096(%rsp),%rsp 60 movq (%rsp),%r11 61 cmpq %r10,%rsp 62 ja .Lmul_page_walk 63.Lmul_page_walk_done: 64 65 leaq .Linc(%rip),%r10 66 movq %rax,8(%rsp,%r9,8) 67.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 68.Lmul_body: 69 70 leaq 128(%rdx),%r12 71 movdqa 0(%r10),%xmm0 72 movdqa 16(%r10),%xmm1 73 leaq 24-112(%rsp,%r9,8),%r10 74 andq $-16,%r10 75 76 pshufd $0,%xmm5,%xmm5 77 movdqa %xmm1,%xmm4 78 movdqa %xmm1,%xmm2 79 paddd %xmm0,%xmm1 80 pcmpeqd %xmm5,%xmm0 81.byte 0x67 82 movdqa %xmm4,%xmm3 83 paddd %xmm1,%xmm2 84 pcmpeqd %xmm5,%xmm1 85 movdqa %xmm0,112(%r10) 86 movdqa %xmm4,%xmm0 87 88 paddd %xmm2,%xmm3 89 pcmpeqd %xmm5,%xmm2 90 movdqa %xmm1,128(%r10) 91 movdqa %xmm4,%xmm1 92 93 paddd %xmm3,%xmm0 94 pcmpeqd %xmm5,%xmm3 95 movdqa %xmm2,144(%r10) 96 movdqa %xmm4,%xmm2 97 98 paddd %xmm0,%xmm1 99 pcmpeqd %xmm5,%xmm0 100 movdqa %xmm3,160(%r10) 101 movdqa %xmm4,%xmm3 102 paddd %xmm1,%xmm2 103 pcmpeqd %xmm5,%xmm1 104 movdqa %xmm0,176(%r10) 105 movdqa %xmm4,%xmm0 106 107 paddd %xmm2,%xmm3 108 pcmpeqd %xmm5,%xmm2 109 movdqa 
%xmm1,192(%r10) 110 movdqa %xmm4,%xmm1 111 112 paddd %xmm3,%xmm0 113 pcmpeqd %xmm5,%xmm3 114 movdqa %xmm2,208(%r10) 115 movdqa %xmm4,%xmm2 116 117 paddd %xmm0,%xmm1 118 pcmpeqd %xmm5,%xmm0 119 movdqa %xmm3,224(%r10) 120 movdqa %xmm4,%xmm3 121 paddd %xmm1,%xmm2 122 pcmpeqd %xmm5,%xmm1 123 movdqa %xmm0,240(%r10) 124 movdqa %xmm4,%xmm0 125 126 paddd %xmm2,%xmm3 127 pcmpeqd %xmm5,%xmm2 128 movdqa %xmm1,256(%r10) 129 movdqa %xmm4,%xmm1 130 131 paddd %xmm3,%xmm0 132 pcmpeqd %xmm5,%xmm3 133 movdqa %xmm2,272(%r10) 134 movdqa %xmm4,%xmm2 135 136 paddd %xmm0,%xmm1 137 pcmpeqd %xmm5,%xmm0 138 movdqa %xmm3,288(%r10) 139 movdqa %xmm4,%xmm3 140 paddd %xmm1,%xmm2 141 pcmpeqd %xmm5,%xmm1 142 movdqa %xmm0,304(%r10) 143 144 paddd %xmm2,%xmm3 145.byte 0x67 146 pcmpeqd %xmm5,%xmm2 147 movdqa %xmm1,320(%r10) 148 149 pcmpeqd %xmm5,%xmm3 150 movdqa %xmm2,336(%r10) 151 pand 64(%r12),%xmm0 152 153 pand 80(%r12),%xmm1 154 pand 96(%r12),%xmm2 155 movdqa %xmm3,352(%r10) 156 pand 112(%r12),%xmm3 157 por %xmm2,%xmm0 158 por %xmm3,%xmm1 159 movdqa -128(%r12),%xmm4 160 movdqa -112(%r12),%xmm5 161 movdqa -96(%r12),%xmm2 162 pand 112(%r10),%xmm4 163 movdqa -80(%r12),%xmm3 164 pand 128(%r10),%xmm5 165 por %xmm4,%xmm0 166 pand 144(%r10),%xmm2 167 por %xmm5,%xmm1 168 pand 160(%r10),%xmm3 169 por %xmm2,%xmm0 170 por %xmm3,%xmm1 171 movdqa -64(%r12),%xmm4 172 movdqa -48(%r12),%xmm5 173 movdqa -32(%r12),%xmm2 174 pand 176(%r10),%xmm4 175 movdqa -16(%r12),%xmm3 176 pand 192(%r10),%xmm5 177 por %xmm4,%xmm0 178 pand 208(%r10),%xmm2 179 por %xmm5,%xmm1 180 pand 224(%r10),%xmm3 181 por %xmm2,%xmm0 182 por %xmm3,%xmm1 183 movdqa 0(%r12),%xmm4 184 movdqa 16(%r12),%xmm5 185 movdqa 32(%r12),%xmm2 186 pand 240(%r10),%xmm4 187 movdqa 48(%r12),%xmm3 188 pand 256(%r10),%xmm5 189 por %xmm4,%xmm0 190 pand 272(%r10),%xmm2 191 por %xmm5,%xmm1 192 pand 288(%r10),%xmm3 193 por %xmm2,%xmm0 194 por %xmm3,%xmm1 195 por %xmm1,%xmm0 196 pshufd $0x4e,%xmm0,%xmm1 197 por %xmm1,%xmm0 198 leaq 256(%r12),%r12 199.byte 
102,72,15,126,195 200 201 movq (%r8),%r8 202 movq (%rsi),%rax 203 204 xorq %r14,%r14 205 xorq %r15,%r15 206 207 movq %r8,%rbp 208 mulq %rbx 209 movq %rax,%r10 210 movq (%rcx),%rax 211 212 imulq %r10,%rbp 213 movq %rdx,%r11 214 215 mulq %rbp 216 addq %rax,%r10 217 movq 8(%rsi),%rax 218 adcq $0,%rdx 219 movq %rdx,%r13 220 221 leaq 1(%r15),%r15 222 jmp .L1st_enter 223 224.align 16 225.L1st: 226 addq %rax,%r13 227 movq (%rsi,%r15,8),%rax 228 adcq $0,%rdx 229 addq %r11,%r13 230 movq %r10,%r11 231 adcq $0,%rdx 232 movq %r13,-16(%rsp,%r15,8) 233 movq %rdx,%r13 234 235.L1st_enter: 236 mulq %rbx 237 addq %rax,%r11 238 movq (%rcx,%r15,8),%rax 239 adcq $0,%rdx 240 leaq 1(%r15),%r15 241 movq %rdx,%r10 242 243 mulq %rbp 244 cmpq %r9,%r15 245 jne .L1st 246 247 248 addq %rax,%r13 249 adcq $0,%rdx 250 addq %r11,%r13 251 adcq $0,%rdx 252 movq %r13,-16(%rsp,%r9,8) 253 movq %rdx,%r13 254 movq %r10,%r11 255 256 xorq %rdx,%rdx 257 addq %r11,%r13 258 adcq $0,%rdx 259 movq %r13,-8(%rsp,%r9,8) 260 movq %rdx,(%rsp,%r9,8) 261 262 leaq 1(%r14),%r14 263 jmp .Louter 264.align 16 265.Louter: 266 leaq 24+128(%rsp,%r9,8),%rdx 267 andq $-16,%rdx 268 pxor %xmm4,%xmm4 269 pxor %xmm5,%xmm5 270 movdqa -128(%r12),%xmm0 271 movdqa -112(%r12),%xmm1 272 movdqa -96(%r12),%xmm2 273 movdqa -80(%r12),%xmm3 274 pand -128(%rdx),%xmm0 275 pand -112(%rdx),%xmm1 276 por %xmm0,%xmm4 277 pand -96(%rdx),%xmm2 278 por %xmm1,%xmm5 279 pand -80(%rdx),%xmm3 280 por %xmm2,%xmm4 281 por %xmm3,%xmm5 282 movdqa -64(%r12),%xmm0 283 movdqa -48(%r12),%xmm1 284 movdqa -32(%r12),%xmm2 285 movdqa -16(%r12),%xmm3 286 pand -64(%rdx),%xmm0 287 pand -48(%rdx),%xmm1 288 por %xmm0,%xmm4 289 pand -32(%rdx),%xmm2 290 por %xmm1,%xmm5 291 pand -16(%rdx),%xmm3 292 por %xmm2,%xmm4 293 por %xmm3,%xmm5 294 movdqa 0(%r12),%xmm0 295 movdqa 16(%r12),%xmm1 296 movdqa 32(%r12),%xmm2 297 movdqa 48(%r12),%xmm3 298 pand 0(%rdx),%xmm0 299 pand 16(%rdx),%xmm1 300 por %xmm0,%xmm4 301 pand 32(%rdx),%xmm2 302 por %xmm1,%xmm5 303 pand 48(%rdx),%xmm3 304 por 
%xmm2,%xmm4 305 por %xmm3,%xmm5 306 movdqa 64(%r12),%xmm0 307 movdqa 80(%r12),%xmm1 308 movdqa 96(%r12),%xmm2 309 movdqa 112(%r12),%xmm3 310 pand 64(%rdx),%xmm0 311 pand 80(%rdx),%xmm1 312 por %xmm0,%xmm4 313 pand 96(%rdx),%xmm2 314 por %xmm1,%xmm5 315 pand 112(%rdx),%xmm3 316 por %xmm2,%xmm4 317 por %xmm3,%xmm5 318 por %xmm5,%xmm4 319 pshufd $0x4e,%xmm4,%xmm0 320 por %xmm4,%xmm0 321 leaq 256(%r12),%r12 322 323 movq (%rsi),%rax 324.byte 102,72,15,126,195 325 326 xorq %r15,%r15 327 movq %r8,%rbp 328 movq (%rsp),%r10 329 330 mulq %rbx 331 addq %rax,%r10 332 movq (%rcx),%rax 333 adcq $0,%rdx 334 335 imulq %r10,%rbp 336 movq %rdx,%r11 337 338 mulq %rbp 339 addq %rax,%r10 340 movq 8(%rsi),%rax 341 adcq $0,%rdx 342 movq 8(%rsp),%r10 343 movq %rdx,%r13 344 345 leaq 1(%r15),%r15 346 jmp .Linner_enter 347 348.align 16 349.Linner: 350 addq %rax,%r13 351 movq (%rsi,%r15,8),%rax 352 adcq $0,%rdx 353 addq %r10,%r13 354 movq (%rsp,%r15,8),%r10 355 adcq $0,%rdx 356 movq %r13,-16(%rsp,%r15,8) 357 movq %rdx,%r13 358 359.Linner_enter: 360 mulq %rbx 361 addq %rax,%r11 362 movq (%rcx,%r15,8),%rax 363 adcq $0,%rdx 364 addq %r11,%r10 365 movq %rdx,%r11 366 adcq $0,%r11 367 leaq 1(%r15),%r15 368 369 mulq %rbp 370 cmpq %r9,%r15 371 jne .Linner 372 373 addq %rax,%r13 374 adcq $0,%rdx 375 addq %r10,%r13 376 movq (%rsp,%r9,8),%r10 377 adcq $0,%rdx 378 movq %r13,-16(%rsp,%r9,8) 379 movq %rdx,%r13 380 381 xorq %rdx,%rdx 382 addq %r11,%r13 383 adcq $0,%rdx 384 addq %r10,%r13 385 adcq $0,%rdx 386 movq %r13,-8(%rsp,%r9,8) 387 movq %rdx,(%rsp,%r9,8) 388 389 leaq 1(%r14),%r14 390 cmpq %r9,%r14 391 jb .Louter 392 393 xorq %r14,%r14 394 movq (%rsp),%rax 395 leaq (%rsp),%rsi 396 movq %r9,%r15 397 jmp .Lsub 398.align 16 399.Lsub: sbbq (%rcx,%r14,8),%rax 400 movq %rax,(%rdi,%r14,8) 401 movq 8(%rsi,%r14,8),%rax 402 leaq 1(%r14),%r14 403 decq %r15 404 jnz .Lsub 405 406 sbbq $0,%rax 407 xorq %r14,%r14 408 andq %rax,%rsi 409 notq %rax 410 movq %rdi,%rcx 411 andq %rax,%rcx 412 movq %r9,%r15 413 orq %rcx,%rsi 
414.align 16 415.Lcopy: 416 movq (%rsi,%r14,8),%rax 417 movq %r14,(%rsp,%r14,8) 418 movq %rax,(%rdi,%r14,8) 419 leaq 1(%r14),%r14 420 subq $1,%r15 421 jnz .Lcopy 422 423 movq 8(%rsp,%r9,8),%rsi 424.cfi_def_cfa %rsi,8 425 movq $1,%rax 426 427 movq -48(%rsi),%r15 428.cfi_restore %r15 429 movq -40(%rsi),%r14 430.cfi_restore %r14 431 movq -32(%rsi),%r13 432.cfi_restore %r13 433 movq -24(%rsi),%r12 434.cfi_restore %r12 435 movq -16(%rsi),%rbp 436.cfi_restore %rbp 437 movq -8(%rsi),%rbx 438.cfi_restore %rbx 439 leaq (%rsi),%rsp 440.cfi_def_cfa_register %rsp 441.Lmul_epilogue: 442 .byte 0xf3,0xc3 443.cfi_endproc 444.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 445.type bn_mul4x_mont_gather5,@function 446.align 32 447bn_mul4x_mont_gather5: 448.cfi_startproc 449.byte 0x67 450 movq %rsp,%rax 451.cfi_def_cfa_register %rax 452.Lmul4x_enter: 453 pushq %rbx 454.cfi_offset %rbx,-16 455 pushq %rbp 456.cfi_offset %rbp,-24 457 pushq %r12 458.cfi_offset %r12,-32 459 pushq %r13 460.cfi_offset %r13,-40 461 pushq %r14 462.cfi_offset %r14,-48 463 pushq %r15 464.cfi_offset %r15,-56 465.Lmul4x_prologue: 466 467.byte 0x67 468 shll $3,%r9d 469 leaq (%r9,%r9,2),%r10 470 negq %r9 471 472 473 474 475 476 477 478 479 480 481 leaq -320(%rsp,%r9,2),%r11 482 movq %rsp,%rbp 483 subq %rdi,%r11 484 andq $4095,%r11 485 cmpq %r11,%r10 486 jb .Lmul4xsp_alt 487 subq %r11,%rbp 488 leaq -320(%rbp,%r9,2),%rbp 489 jmp .Lmul4xsp_done 490 491.align 32 492.Lmul4xsp_alt: 493 leaq 4096-320(,%r9,2),%r10 494 leaq -320(%rbp,%r9,2),%rbp 495 subq %r10,%r11 496 movq $0,%r10 497 cmovcq %r10,%r11 498 subq %r11,%rbp 499.Lmul4xsp_done: 500 andq $-64,%rbp 501 movq %rsp,%r11 502 subq %rbp,%r11 503 andq $-4096,%r11 504 leaq (%r11,%rbp,1),%rsp 505 movq (%rsp),%r10 506 cmpq %rbp,%rsp 507 ja .Lmul4x_page_walk 508 jmp .Lmul4x_page_walk_done 509 510.Lmul4x_page_walk: 511 leaq -4096(%rsp),%rsp 512 movq (%rsp),%r10 513 cmpq %rbp,%rsp 514 ja .Lmul4x_page_walk 515.Lmul4x_page_walk_done: 516 517 negq %r9 518 519 movq %rax,40(%rsp) 
520.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 521.Lmul4x_body: 522 523 call mul4x_internal 524 525 movq 40(%rsp),%rsi 526.cfi_def_cfa %rsi,8 527 movq $1,%rax 528 529 movq -48(%rsi),%r15 530.cfi_restore %r15 531 movq -40(%rsi),%r14 532.cfi_restore %r14 533 movq -32(%rsi),%r13 534.cfi_restore %r13 535 movq -24(%rsi),%r12 536.cfi_restore %r12 537 movq -16(%rsi),%rbp 538.cfi_restore %rbp 539 movq -8(%rsi),%rbx 540.cfi_restore %rbx 541 leaq (%rsi),%rsp 542.cfi_def_cfa_register %rsp 543.Lmul4x_epilogue: 544 .byte 0xf3,0xc3 545.cfi_endproc 546.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 547 548.type mul4x_internal,@function 549.align 32 550mul4x_internal: 551 shlq $5,%r9 552 movd 8(%rax),%xmm5 553 leaq .Linc(%rip),%rax 554 leaq 128(%rdx,%r9,1),%r13 555 shrq $5,%r9 556 movdqa 0(%rax),%xmm0 557 movdqa 16(%rax),%xmm1 558 leaq 88-112(%rsp,%r9,1),%r10 559 leaq 128(%rdx),%r12 560 561 pshufd $0,%xmm5,%xmm5 562 movdqa %xmm1,%xmm4 563.byte 0x67,0x67 564 movdqa %xmm1,%xmm2 565 paddd %xmm0,%xmm1 566 pcmpeqd %xmm5,%xmm0 567.byte 0x67 568 movdqa %xmm4,%xmm3 569 paddd %xmm1,%xmm2 570 pcmpeqd %xmm5,%xmm1 571 movdqa %xmm0,112(%r10) 572 movdqa %xmm4,%xmm0 573 574 paddd %xmm2,%xmm3 575 pcmpeqd %xmm5,%xmm2 576 movdqa %xmm1,128(%r10) 577 movdqa %xmm4,%xmm1 578 579 paddd %xmm3,%xmm0 580 pcmpeqd %xmm5,%xmm3 581 movdqa %xmm2,144(%r10) 582 movdqa %xmm4,%xmm2 583 584 paddd %xmm0,%xmm1 585 pcmpeqd %xmm5,%xmm0 586 movdqa %xmm3,160(%r10) 587 movdqa %xmm4,%xmm3 588 paddd %xmm1,%xmm2 589 pcmpeqd %xmm5,%xmm1 590 movdqa %xmm0,176(%r10) 591 movdqa %xmm4,%xmm0 592 593 paddd %xmm2,%xmm3 594 pcmpeqd %xmm5,%xmm2 595 movdqa %xmm1,192(%r10) 596 movdqa %xmm4,%xmm1 597 598 paddd %xmm3,%xmm0 599 pcmpeqd %xmm5,%xmm3 600 movdqa %xmm2,208(%r10) 601 movdqa %xmm4,%xmm2 602 603 paddd %xmm0,%xmm1 604 pcmpeqd %xmm5,%xmm0 605 movdqa %xmm3,224(%r10) 606 movdqa %xmm4,%xmm3 607 paddd %xmm1,%xmm2 608 pcmpeqd %xmm5,%xmm1 609 movdqa %xmm0,240(%r10) 610 movdqa %xmm4,%xmm0 611 612 paddd %xmm2,%xmm3 613 pcmpeqd 
%xmm5,%xmm2 614 movdqa %xmm1,256(%r10) 615 movdqa %xmm4,%xmm1 616 617 paddd %xmm3,%xmm0 618 pcmpeqd %xmm5,%xmm3 619 movdqa %xmm2,272(%r10) 620 movdqa %xmm4,%xmm2 621 622 paddd %xmm0,%xmm1 623 pcmpeqd %xmm5,%xmm0 624 movdqa %xmm3,288(%r10) 625 movdqa %xmm4,%xmm3 626 paddd %xmm1,%xmm2 627 pcmpeqd %xmm5,%xmm1 628 movdqa %xmm0,304(%r10) 629 630 paddd %xmm2,%xmm3 631.byte 0x67 632 pcmpeqd %xmm5,%xmm2 633 movdqa %xmm1,320(%r10) 634 635 pcmpeqd %xmm5,%xmm3 636 movdqa %xmm2,336(%r10) 637 pand 64(%r12),%xmm0 638 639 pand 80(%r12),%xmm1 640 pand 96(%r12),%xmm2 641 movdqa %xmm3,352(%r10) 642 pand 112(%r12),%xmm3 643 por %xmm2,%xmm0 644 por %xmm3,%xmm1 645 movdqa -128(%r12),%xmm4 646 movdqa -112(%r12),%xmm5 647 movdqa -96(%r12),%xmm2 648 pand 112(%r10),%xmm4 649 movdqa -80(%r12),%xmm3 650 pand 128(%r10),%xmm5 651 por %xmm4,%xmm0 652 pand 144(%r10),%xmm2 653 por %xmm5,%xmm1 654 pand 160(%r10),%xmm3 655 por %xmm2,%xmm0 656 por %xmm3,%xmm1 657 movdqa -64(%r12),%xmm4 658 movdqa -48(%r12),%xmm5 659 movdqa -32(%r12),%xmm2 660 pand 176(%r10),%xmm4 661 movdqa -16(%r12),%xmm3 662 pand 192(%r10),%xmm5 663 por %xmm4,%xmm0 664 pand 208(%r10),%xmm2 665 por %xmm5,%xmm1 666 pand 224(%r10),%xmm3 667 por %xmm2,%xmm0 668 por %xmm3,%xmm1 669 movdqa 0(%r12),%xmm4 670 movdqa 16(%r12),%xmm5 671 movdqa 32(%r12),%xmm2 672 pand 240(%r10),%xmm4 673 movdqa 48(%r12),%xmm3 674 pand 256(%r10),%xmm5 675 por %xmm4,%xmm0 676 pand 272(%r10),%xmm2 677 por %xmm5,%xmm1 678 pand 288(%r10),%xmm3 679 por %xmm2,%xmm0 680 por %xmm3,%xmm1 681 por %xmm1,%xmm0 682 pshufd $0x4e,%xmm0,%xmm1 683 por %xmm1,%xmm0 684 leaq 256(%r12),%r12 685.byte 102,72,15,126,195 686 687 movq %r13,16+8(%rsp) 688 movq %rdi,56+8(%rsp) 689 690 movq (%r8),%r8 691 movq (%rsi),%rax 692 leaq (%rsi,%r9,1),%rsi 693 negq %r9 694 695 movq %r8,%rbp 696 mulq %rbx 697 movq %rax,%r10 698 movq (%rcx),%rax 699 700 imulq %r10,%rbp 701 leaq 64+8(%rsp),%r14 702 movq %rdx,%r11 703 704 mulq %rbp 705 addq %rax,%r10 706 movq 8(%rsi,%r9,1),%rax 707 adcq $0,%rdx 708 
movq %rdx,%rdi 709 710 mulq %rbx 711 addq %rax,%r11 712 movq 8(%rcx),%rax 713 adcq $0,%rdx 714 movq %rdx,%r10 715 716 mulq %rbp 717 addq %rax,%rdi 718 movq 16(%rsi,%r9,1),%rax 719 adcq $0,%rdx 720 addq %r11,%rdi 721 leaq 32(%r9),%r15 722 leaq 32(%rcx),%rcx 723 adcq $0,%rdx 724 movq %rdi,(%r14) 725 movq %rdx,%r13 726 jmp .L1st4x 727 728.align 32 729.L1st4x: 730 mulq %rbx 731 addq %rax,%r10 732 movq -16(%rcx),%rax 733 leaq 32(%r14),%r14 734 adcq $0,%rdx 735 movq %rdx,%r11 736 737 mulq %rbp 738 addq %rax,%r13 739 movq -8(%rsi,%r15,1),%rax 740 adcq $0,%rdx 741 addq %r10,%r13 742 adcq $0,%rdx 743 movq %r13,-24(%r14) 744 movq %rdx,%rdi 745 746 mulq %rbx 747 addq %rax,%r11 748 movq -8(%rcx),%rax 749 adcq $0,%rdx 750 movq %rdx,%r10 751 752 mulq %rbp 753 addq %rax,%rdi 754 movq (%rsi,%r15,1),%rax 755 adcq $0,%rdx 756 addq %r11,%rdi 757 adcq $0,%rdx 758 movq %rdi,-16(%r14) 759 movq %rdx,%r13 760 761 mulq %rbx 762 addq %rax,%r10 763 movq 0(%rcx),%rax 764 adcq $0,%rdx 765 movq %rdx,%r11 766 767 mulq %rbp 768 addq %rax,%r13 769 movq 8(%rsi,%r15,1),%rax 770 adcq $0,%rdx 771 addq %r10,%r13 772 adcq $0,%rdx 773 movq %r13,-8(%r14) 774 movq %rdx,%rdi 775 776 mulq %rbx 777 addq %rax,%r11 778 movq 8(%rcx),%rax 779 adcq $0,%rdx 780 movq %rdx,%r10 781 782 mulq %rbp 783 addq %rax,%rdi 784 movq 16(%rsi,%r15,1),%rax 785 adcq $0,%rdx 786 addq %r11,%rdi 787 leaq 32(%rcx),%rcx 788 adcq $0,%rdx 789 movq %rdi,(%r14) 790 movq %rdx,%r13 791 792 addq $32,%r15 793 jnz .L1st4x 794 795 mulq %rbx 796 addq %rax,%r10 797 movq -16(%rcx),%rax 798 leaq 32(%r14),%r14 799 adcq $0,%rdx 800 movq %rdx,%r11 801 802 mulq %rbp 803 addq %rax,%r13 804 movq -8(%rsi),%rax 805 adcq $0,%rdx 806 addq %r10,%r13 807 adcq $0,%rdx 808 movq %r13,-24(%r14) 809 movq %rdx,%rdi 810 811 mulq %rbx 812 addq %rax,%r11 813 movq -8(%rcx),%rax 814 adcq $0,%rdx 815 movq %rdx,%r10 816 817 mulq %rbp 818 addq %rax,%rdi 819 movq (%rsi,%r9,1),%rax 820 adcq $0,%rdx 821 addq %r11,%rdi 822 adcq $0,%rdx 823 movq %rdi,-16(%r14) 824 movq %rdx,%r13 
825 826 leaq (%rcx,%r9,1),%rcx 827 828 xorq %rdi,%rdi 829 addq %r10,%r13 830 adcq $0,%rdi 831 movq %r13,-8(%r14) 832 833 jmp .Louter4x 834 835.align 32 836.Louter4x: 837 leaq 16+128(%r14),%rdx 838 pxor %xmm4,%xmm4 839 pxor %xmm5,%xmm5 840 movdqa -128(%r12),%xmm0 841 movdqa -112(%r12),%xmm1 842 movdqa -96(%r12),%xmm2 843 movdqa -80(%r12),%xmm3 844 pand -128(%rdx),%xmm0 845 pand -112(%rdx),%xmm1 846 por %xmm0,%xmm4 847 pand -96(%rdx),%xmm2 848 por %xmm1,%xmm5 849 pand -80(%rdx),%xmm3 850 por %xmm2,%xmm4 851 por %xmm3,%xmm5 852 movdqa -64(%r12),%xmm0 853 movdqa -48(%r12),%xmm1 854 movdqa -32(%r12),%xmm2 855 movdqa -16(%r12),%xmm3 856 pand -64(%rdx),%xmm0 857 pand -48(%rdx),%xmm1 858 por %xmm0,%xmm4 859 pand -32(%rdx),%xmm2 860 por %xmm1,%xmm5 861 pand -16(%rdx),%xmm3 862 por %xmm2,%xmm4 863 por %xmm3,%xmm5 864 movdqa 0(%r12),%xmm0 865 movdqa 16(%r12),%xmm1 866 movdqa 32(%r12),%xmm2 867 movdqa 48(%r12),%xmm3 868 pand 0(%rdx),%xmm0 869 pand 16(%rdx),%xmm1 870 por %xmm0,%xmm4 871 pand 32(%rdx),%xmm2 872 por %xmm1,%xmm5 873 pand 48(%rdx),%xmm3 874 por %xmm2,%xmm4 875 por %xmm3,%xmm5 876 movdqa 64(%r12),%xmm0 877 movdqa 80(%r12),%xmm1 878 movdqa 96(%r12),%xmm2 879 movdqa 112(%r12),%xmm3 880 pand 64(%rdx),%xmm0 881 pand 80(%rdx),%xmm1 882 por %xmm0,%xmm4 883 pand 96(%rdx),%xmm2 884 por %xmm1,%xmm5 885 pand 112(%rdx),%xmm3 886 por %xmm2,%xmm4 887 por %xmm3,%xmm5 888 por %xmm5,%xmm4 889 pshufd $0x4e,%xmm4,%xmm0 890 por %xmm4,%xmm0 891 leaq 256(%r12),%r12 892.byte 102,72,15,126,195 893 894 movq (%r14,%r9,1),%r10 895 movq %r8,%rbp 896 mulq %rbx 897 addq %rax,%r10 898 movq (%rcx),%rax 899 adcq $0,%rdx 900 901 imulq %r10,%rbp 902 movq %rdx,%r11 903 movq %rdi,(%r14) 904 905 leaq (%r14,%r9,1),%r14 906 907 mulq %rbp 908 addq %rax,%r10 909 movq 8(%rsi,%r9,1),%rax 910 adcq $0,%rdx 911 movq %rdx,%rdi 912 913 mulq %rbx 914 addq %rax,%r11 915 movq 8(%rcx),%rax 916 adcq $0,%rdx 917 addq 8(%r14),%r11 918 adcq $0,%rdx 919 movq %rdx,%r10 920 921 mulq %rbp 922 addq %rax,%rdi 923 movq 
16(%rsi,%r9,1),%rax 924 adcq $0,%rdx 925 addq %r11,%rdi 926 leaq 32(%r9),%r15 927 leaq 32(%rcx),%rcx 928 adcq $0,%rdx 929 movq %rdx,%r13 930 jmp .Linner4x 931 932.align 32 933.Linner4x: 934 mulq %rbx 935 addq %rax,%r10 936 movq -16(%rcx),%rax 937 adcq $0,%rdx 938 addq 16(%r14),%r10 939 leaq 32(%r14),%r14 940 adcq $0,%rdx 941 movq %rdx,%r11 942 943 mulq %rbp 944 addq %rax,%r13 945 movq -8(%rsi,%r15,1),%rax 946 adcq $0,%rdx 947 addq %r10,%r13 948 adcq $0,%rdx 949 movq %rdi,-32(%r14) 950 movq %rdx,%rdi 951 952 mulq %rbx 953 addq %rax,%r11 954 movq -8(%rcx),%rax 955 adcq $0,%rdx 956 addq -8(%r14),%r11 957 adcq $0,%rdx 958 movq %rdx,%r10 959 960 mulq %rbp 961 addq %rax,%rdi 962 movq (%rsi,%r15,1),%rax 963 adcq $0,%rdx 964 addq %r11,%rdi 965 adcq $0,%rdx 966 movq %r13,-24(%r14) 967 movq %rdx,%r13 968 969 mulq %rbx 970 addq %rax,%r10 971 movq 0(%rcx),%rax 972 adcq $0,%rdx 973 addq (%r14),%r10 974 adcq $0,%rdx 975 movq %rdx,%r11 976 977 mulq %rbp 978 addq %rax,%r13 979 movq 8(%rsi,%r15,1),%rax 980 adcq $0,%rdx 981 addq %r10,%r13 982 adcq $0,%rdx 983 movq %rdi,-16(%r14) 984 movq %rdx,%rdi 985 986 mulq %rbx 987 addq %rax,%r11 988 movq 8(%rcx),%rax 989 adcq $0,%rdx 990 addq 8(%r14),%r11 991 adcq $0,%rdx 992 movq %rdx,%r10 993 994 mulq %rbp 995 addq %rax,%rdi 996 movq 16(%rsi,%r15,1),%rax 997 adcq $0,%rdx 998 addq %r11,%rdi 999 leaq 32(%rcx),%rcx 1000 adcq $0,%rdx 1001 movq %r13,-8(%r14) 1002 movq %rdx,%r13 1003 1004 addq $32,%r15 1005 jnz .Linner4x 1006 1007 mulq %rbx 1008 addq %rax,%r10 1009 movq -16(%rcx),%rax 1010 adcq $0,%rdx 1011 addq 16(%r14),%r10 1012 leaq 32(%r14),%r14 1013 adcq $0,%rdx 1014 movq %rdx,%r11 1015 1016 mulq %rbp 1017 addq %rax,%r13 1018 movq -8(%rsi),%rax 1019 adcq $0,%rdx 1020 addq %r10,%r13 1021 adcq $0,%rdx 1022 movq %rdi,-32(%r14) 1023 movq %rdx,%rdi 1024 1025 mulq %rbx 1026 addq %rax,%r11 1027 movq %rbp,%rax 1028 movq -8(%rcx),%rbp 1029 adcq $0,%rdx 1030 addq -8(%r14),%r11 1031 adcq $0,%rdx 1032 movq %rdx,%r10 1033 1034 mulq %rbp 1035 addq %rax,%rdi 
1036 movq (%rsi,%r9,1),%rax 1037 adcq $0,%rdx 1038 addq %r11,%rdi 1039 adcq $0,%rdx 1040 movq %r13,-24(%r14) 1041 movq %rdx,%r13 1042 1043 movq %rdi,-16(%r14) 1044 leaq (%rcx,%r9,1),%rcx 1045 1046 xorq %rdi,%rdi 1047 addq %r10,%r13 1048 adcq $0,%rdi 1049 addq (%r14),%r13 1050 adcq $0,%rdi 1051 movq %r13,-8(%r14) 1052 1053 cmpq 16+8(%rsp),%r12 1054 jb .Louter4x 1055 xorq %rax,%rax 1056 subq %r13,%rbp 1057 adcq %r15,%r15 1058 orq %r15,%rdi 1059 subq %rdi,%rax 1060 leaq (%r14,%r9,1),%rbx 1061 movq (%rcx),%r12 1062 leaq (%rcx),%rbp 1063 movq %r9,%rcx 1064 sarq $3+2,%rcx 1065 movq 56+8(%rsp),%rdi 1066 decq %r12 1067 xorq %r10,%r10 1068 movq 8(%rbp),%r13 1069 movq 16(%rbp),%r14 1070 movq 24(%rbp),%r15 1071 jmp .Lsqr4x_sub_entry 1072.size mul4x_internal,.-mul4x_internal 1073.globl bn_power5 1074.hidden bn_power5 1075.type bn_power5,@function 1076.align 32 1077bn_power5: 1078.cfi_startproc 1079 movq %rsp,%rax 1080.cfi_def_cfa_register %rax 1081 pushq %rbx 1082.cfi_offset %rbx,-16 1083 pushq %rbp 1084.cfi_offset %rbp,-24 1085 pushq %r12 1086.cfi_offset %r12,-32 1087 pushq %r13 1088.cfi_offset %r13,-40 1089 pushq %r14 1090.cfi_offset %r14,-48 1091 pushq %r15 1092.cfi_offset %r15,-56 1093.Lpower5_prologue: 1094 1095 shll $3,%r9d 1096 leal (%r9,%r9,2),%r10d 1097 negq %r9 1098 movq (%r8),%r8 1099 1100 1101 1102 1103 1104 1105 1106 1107 leaq -320(%rsp,%r9,2),%r11 1108 movq %rsp,%rbp 1109 subq %rdi,%r11 1110 andq $4095,%r11 1111 cmpq %r11,%r10 1112 jb .Lpwr_sp_alt 1113 subq %r11,%rbp 1114 leaq -320(%rbp,%r9,2),%rbp 1115 jmp .Lpwr_sp_done 1116 1117.align 32 1118.Lpwr_sp_alt: 1119 leaq 4096-320(,%r9,2),%r10 1120 leaq -320(%rbp,%r9,2),%rbp 1121 subq %r10,%r11 1122 movq $0,%r10 1123 cmovcq %r10,%r11 1124 subq %r11,%rbp 1125.Lpwr_sp_done: 1126 andq $-64,%rbp 1127 movq %rsp,%r11 1128 subq %rbp,%r11 1129 andq $-4096,%r11 1130 leaq (%r11,%rbp,1),%rsp 1131 movq (%rsp),%r10 1132 cmpq %rbp,%rsp 1133 ja .Lpwr_page_walk 1134 jmp .Lpwr_page_walk_done 1135 1136.Lpwr_page_walk: 1137 leaq 
-4096(%rsp),%rsp 1138 movq (%rsp),%r10 1139 cmpq %rbp,%rsp 1140 ja .Lpwr_page_walk 1141.Lpwr_page_walk_done: 1142 1143 movq %r9,%r10 1144 negq %r9 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 movq %r8,32(%rsp) 1156 movq %rax,40(%rsp) 1157.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 1158.Lpower5_body: 1159.byte 102,72,15,110,207 1160.byte 102,72,15,110,209 1161.byte 102,73,15,110,218 1162.byte 102,72,15,110,226 1163 1164 call __bn_sqr8x_internal 1165 call __bn_post4x_internal 1166 call __bn_sqr8x_internal 1167 call __bn_post4x_internal 1168 call __bn_sqr8x_internal 1169 call __bn_post4x_internal 1170 call __bn_sqr8x_internal 1171 call __bn_post4x_internal 1172 call __bn_sqr8x_internal 1173 call __bn_post4x_internal 1174 1175.byte 102,72,15,126,209 1176.byte 102,72,15,126,226 1177 movq %rsi,%rdi 1178 movq 40(%rsp),%rax 1179 leaq 32(%rsp),%r8 1180 1181 call mul4x_internal 1182 1183 movq 40(%rsp),%rsi 1184.cfi_def_cfa %rsi,8 1185 movq $1,%rax 1186 movq -48(%rsi),%r15 1187.cfi_restore %r15 1188 movq -40(%rsi),%r14 1189.cfi_restore %r14 1190 movq -32(%rsi),%r13 1191.cfi_restore %r13 1192 movq -24(%rsi),%r12 1193.cfi_restore %r12 1194 movq -16(%rsi),%rbp 1195.cfi_restore %rbp 1196 movq -8(%rsi),%rbx 1197.cfi_restore %rbx 1198 leaq (%rsi),%rsp 1199.cfi_def_cfa_register %rsp 1200.Lpower5_epilogue: 1201 .byte 0xf3,0xc3 1202.cfi_endproc 1203.size bn_power5,.-bn_power5 1204 1205.globl bn_sqr8x_internal 1206.hidden bn_sqr8x_internal 1207.hidden bn_sqr8x_internal 1208.type bn_sqr8x_internal,@function 1209.align 32 1210bn_sqr8x_internal: 1211__bn_sqr8x_internal: 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 leaq 32(%r10),%rbp 1286 leaq 
(%rsi,%r9,1),%rsi 1287 1288 movq %r9,%rcx 1289 1290 1291 movq -32(%rsi,%rbp,1),%r14 1292 leaq 48+8(%rsp,%r9,2),%rdi 1293 movq -24(%rsi,%rbp,1),%rax 1294 leaq -32(%rdi,%rbp,1),%rdi 1295 movq -16(%rsi,%rbp,1),%rbx 1296 movq %rax,%r15 1297 1298 mulq %r14 1299 movq %rax,%r10 1300 movq %rbx,%rax 1301 movq %rdx,%r11 1302 movq %r10,-24(%rdi,%rbp,1) 1303 1304 mulq %r14 1305 addq %rax,%r11 1306 movq %rbx,%rax 1307 adcq $0,%rdx 1308 movq %r11,-16(%rdi,%rbp,1) 1309 movq %rdx,%r10 1310 1311 1312 movq -8(%rsi,%rbp,1),%rbx 1313 mulq %r15 1314 movq %rax,%r12 1315 movq %rbx,%rax 1316 movq %rdx,%r13 1317 1318 leaq (%rbp),%rcx 1319 mulq %r14 1320 addq %rax,%r10 1321 movq %rbx,%rax 1322 movq %rdx,%r11 1323 adcq $0,%r11 1324 addq %r12,%r10 1325 adcq $0,%r11 1326 movq %r10,-8(%rdi,%rcx,1) 1327 jmp .Lsqr4x_1st 1328 1329.align 32 1330.Lsqr4x_1st: 1331 movq (%rsi,%rcx,1),%rbx 1332 mulq %r15 1333 addq %rax,%r13 1334 movq %rbx,%rax 1335 movq %rdx,%r12 1336 adcq $0,%r12 1337 1338 mulq %r14 1339 addq %rax,%r11 1340 movq %rbx,%rax 1341 movq 8(%rsi,%rcx,1),%rbx 1342 movq %rdx,%r10 1343 adcq $0,%r10 1344 addq %r13,%r11 1345 adcq $0,%r10 1346 1347 1348 mulq %r15 1349 addq %rax,%r12 1350 movq %rbx,%rax 1351 movq %r11,(%rdi,%rcx,1) 1352 movq %rdx,%r13 1353 adcq $0,%r13 1354 1355 mulq %r14 1356 addq %rax,%r10 1357 movq %rbx,%rax 1358 movq 16(%rsi,%rcx,1),%rbx 1359 movq %rdx,%r11 1360 adcq $0,%r11 1361 addq %r12,%r10 1362 adcq $0,%r11 1363 1364 mulq %r15 1365 addq %rax,%r13 1366 movq %rbx,%rax 1367 movq %r10,8(%rdi,%rcx,1) 1368 movq %rdx,%r12 1369 adcq $0,%r12 1370 1371 mulq %r14 1372 addq %rax,%r11 1373 movq %rbx,%rax 1374 movq 24(%rsi,%rcx,1),%rbx 1375 movq %rdx,%r10 1376 adcq $0,%r10 1377 addq %r13,%r11 1378 adcq $0,%r10 1379 1380 1381 mulq %r15 1382 addq %rax,%r12 1383 movq %rbx,%rax 1384 movq %r11,16(%rdi,%rcx,1) 1385 movq %rdx,%r13 1386 adcq $0,%r13 1387 leaq 32(%rcx),%rcx 1388 1389 mulq %r14 1390 addq %rax,%r10 1391 movq %rbx,%rax 1392 movq %rdx,%r11 1393 adcq $0,%r11 1394 addq %r12,%r10 1395 
adcq $0,%r11 1396 movq %r10,-8(%rdi,%rcx,1) 1397 1398 cmpq $0,%rcx 1399 jne .Lsqr4x_1st 1400 1401 mulq %r15 1402 addq %rax,%r13 1403 leaq 16(%rbp),%rbp 1404 adcq $0,%rdx 1405 addq %r11,%r13 1406 adcq $0,%rdx 1407 1408 movq %r13,(%rdi) 1409 movq %rdx,%r12 1410 movq %rdx,8(%rdi) 1411 jmp .Lsqr4x_outer 1412 1413.align 32 1414.Lsqr4x_outer: 1415 movq -32(%rsi,%rbp,1),%r14 1416 leaq 48+8(%rsp,%r9,2),%rdi 1417 movq -24(%rsi,%rbp,1),%rax 1418 leaq -32(%rdi,%rbp,1),%rdi 1419 movq -16(%rsi,%rbp,1),%rbx 1420 movq %rax,%r15 1421 1422 mulq %r14 1423 movq -24(%rdi,%rbp,1),%r10 1424 addq %rax,%r10 1425 movq %rbx,%rax 1426 adcq $0,%rdx 1427 movq %r10,-24(%rdi,%rbp,1) 1428 movq %rdx,%r11 1429 1430 mulq %r14 1431 addq %rax,%r11 1432 movq %rbx,%rax 1433 adcq $0,%rdx 1434 addq -16(%rdi,%rbp,1),%r11 1435 movq %rdx,%r10 1436 adcq $0,%r10 1437 movq %r11,-16(%rdi,%rbp,1) 1438 1439 xorq %r12,%r12 1440 1441 movq -8(%rsi,%rbp,1),%rbx 1442 mulq %r15 1443 addq %rax,%r12 1444 movq %rbx,%rax 1445 adcq $0,%rdx 1446 addq -8(%rdi,%rbp,1),%r12 1447 movq %rdx,%r13 1448 adcq $0,%r13 1449 1450 mulq %r14 1451 addq %rax,%r10 1452 movq %rbx,%rax 1453 adcq $0,%rdx 1454 addq %r12,%r10 1455 movq %rdx,%r11 1456 adcq $0,%r11 1457 movq %r10,-8(%rdi,%rbp,1) 1458 1459 leaq (%rbp),%rcx 1460 jmp .Lsqr4x_inner 1461 1462.align 32 1463.Lsqr4x_inner: 1464 movq (%rsi,%rcx,1),%rbx 1465 mulq %r15 1466 addq %rax,%r13 1467 movq %rbx,%rax 1468 movq %rdx,%r12 1469 adcq $0,%r12 1470 addq (%rdi,%rcx,1),%r13 1471 adcq $0,%r12 1472 1473.byte 0x67 1474 mulq %r14 1475 addq %rax,%r11 1476 movq %rbx,%rax 1477 movq 8(%rsi,%rcx,1),%rbx 1478 movq %rdx,%r10 1479 adcq $0,%r10 1480 addq %r13,%r11 1481 adcq $0,%r10 1482 1483 mulq %r15 1484 addq %rax,%r12 1485 movq %r11,(%rdi,%rcx,1) 1486 movq %rbx,%rax 1487 movq %rdx,%r13 1488 adcq $0,%r13 1489 addq 8(%rdi,%rcx,1),%r12 1490 leaq 16(%rcx),%rcx 1491 adcq $0,%r13 1492 1493 mulq %r14 1494 addq %rax,%r10 1495 movq %rbx,%rax 1496 adcq $0,%rdx 1497 addq %r12,%r10 1498 movq %rdx,%r11 1499 adcq 
$0,%r11 1500 movq %r10,-8(%rdi,%rcx,1) 1501 1502 cmpq $0,%rcx 1503 jne .Lsqr4x_inner 1504 1505.byte 0x67 1506 mulq %r15 1507 addq %rax,%r13 1508 adcq $0,%rdx 1509 addq %r11,%r13 1510 adcq $0,%rdx 1511 1512 movq %r13,(%rdi) 1513 movq %rdx,%r12 1514 movq %rdx,8(%rdi) 1515 1516 addq $16,%rbp 1517 jnz .Lsqr4x_outer 1518 1519 1520 movq -32(%rsi),%r14 1521 leaq 48+8(%rsp,%r9,2),%rdi 1522 movq -24(%rsi),%rax 1523 leaq -32(%rdi,%rbp,1),%rdi 1524 movq -16(%rsi),%rbx 1525 movq %rax,%r15 1526 1527 mulq %r14 1528 addq %rax,%r10 1529 movq %rbx,%rax 1530 movq %rdx,%r11 1531 adcq $0,%r11 1532 1533 mulq %r14 1534 addq %rax,%r11 1535 movq %rbx,%rax 1536 movq %r10,-24(%rdi) 1537 movq %rdx,%r10 1538 adcq $0,%r10 1539 addq %r13,%r11 1540 movq -8(%rsi),%rbx 1541 adcq $0,%r10 1542 1543 mulq %r15 1544 addq %rax,%r12 1545 movq %rbx,%rax 1546 movq %r11,-16(%rdi) 1547 movq %rdx,%r13 1548 adcq $0,%r13 1549 1550 mulq %r14 1551 addq %rax,%r10 1552 movq %rbx,%rax 1553 movq %rdx,%r11 1554 adcq $0,%r11 1555 addq %r12,%r10 1556 adcq $0,%r11 1557 movq %r10,-8(%rdi) 1558 1559 mulq %r15 1560 addq %rax,%r13 1561 movq -16(%rsi),%rax 1562 adcq $0,%rdx 1563 addq %r11,%r13 1564 adcq $0,%rdx 1565 1566 movq %r13,(%rdi) 1567 movq %rdx,%r12 1568 movq %rdx,8(%rdi) 1569 1570 mulq %rbx 1571 addq $16,%rbp 1572 xorq %r14,%r14 1573 subq %r9,%rbp 1574 xorq %r15,%r15 1575 1576 addq %r12,%rax 1577 adcq $0,%rdx 1578 movq %rax,8(%rdi) 1579 movq %rdx,16(%rdi) 1580 movq %r15,24(%rdi) 1581 1582 movq -16(%rsi,%rbp,1),%rax 1583 leaq 48+8(%rsp),%rdi 1584 xorq %r10,%r10 1585 movq 8(%rdi),%r11 1586 1587 leaq (%r14,%r10,2),%r12 1588 shrq $63,%r10 1589 leaq (%rcx,%r11,2),%r13 1590 shrq $63,%r11 1591 orq %r10,%r13 1592 movq 16(%rdi),%r10 1593 movq %r11,%r14 1594 mulq %rax 1595 negq %r15 1596 movq 24(%rdi),%r11 1597 adcq %rax,%r12 1598 movq -8(%rsi,%rbp,1),%rax 1599 movq %r12,(%rdi) 1600 adcq %rdx,%r13 1601 1602 leaq (%r14,%r10,2),%rbx 1603 movq %r13,8(%rdi) 1604 sbbq %r15,%r15 1605 shrq $63,%r10 1606 leaq (%rcx,%r11,2),%r8 1607 
shrq $63,%r11 1608 orq %r10,%r8 1609 movq 32(%rdi),%r10 1610 movq %r11,%r14 1611 mulq %rax 1612 negq %r15 1613 movq 40(%rdi),%r11 1614 adcq %rax,%rbx 1615 movq 0(%rsi,%rbp,1),%rax 1616 movq %rbx,16(%rdi) 1617 adcq %rdx,%r8 1618 leaq 16(%rbp),%rbp 1619 movq %r8,24(%rdi) 1620 sbbq %r15,%r15 1621 leaq 64(%rdi),%rdi 1622 jmp .Lsqr4x_shift_n_add 1623 1624.align 32 1625.Lsqr4x_shift_n_add: 1626 leaq (%r14,%r10,2),%r12 1627 shrq $63,%r10 1628 leaq (%rcx,%r11,2),%r13 1629 shrq $63,%r11 1630 orq %r10,%r13 1631 movq -16(%rdi),%r10 1632 movq %r11,%r14 1633 mulq %rax 1634 negq %r15 1635 movq -8(%rdi),%r11 1636 adcq %rax,%r12 1637 movq -8(%rsi,%rbp,1),%rax 1638 movq %r12,-32(%rdi) 1639 adcq %rdx,%r13 1640 1641 leaq (%r14,%r10,2),%rbx 1642 movq %r13,-24(%rdi) 1643 sbbq %r15,%r15 1644 shrq $63,%r10 1645 leaq (%rcx,%r11,2),%r8 1646 shrq $63,%r11 1647 orq %r10,%r8 1648 movq 0(%rdi),%r10 1649 movq %r11,%r14 1650 mulq %rax 1651 negq %r15 1652 movq 8(%rdi),%r11 1653 adcq %rax,%rbx 1654 movq 0(%rsi,%rbp,1),%rax 1655 movq %rbx,-16(%rdi) 1656 adcq %rdx,%r8 1657 1658 leaq (%r14,%r10,2),%r12 1659 movq %r8,-8(%rdi) 1660 sbbq %r15,%r15 1661 shrq $63,%r10 1662 leaq (%rcx,%r11,2),%r13 1663 shrq $63,%r11 1664 orq %r10,%r13 1665 movq 16(%rdi),%r10 1666 movq %r11,%r14 1667 mulq %rax 1668 negq %r15 1669 movq 24(%rdi),%r11 1670 adcq %rax,%r12 1671 movq 8(%rsi,%rbp,1),%rax 1672 movq %r12,0(%rdi) 1673 adcq %rdx,%r13 1674 1675 leaq (%r14,%r10,2),%rbx 1676 movq %r13,8(%rdi) 1677 sbbq %r15,%r15 1678 shrq $63,%r10 1679 leaq (%rcx,%r11,2),%r8 1680 shrq $63,%r11 1681 orq %r10,%r8 1682 movq 32(%rdi),%r10 1683 movq %r11,%r14 1684 mulq %rax 1685 negq %r15 1686 movq 40(%rdi),%r11 1687 adcq %rax,%rbx 1688 movq 16(%rsi,%rbp,1),%rax 1689 movq %rbx,16(%rdi) 1690 adcq %rdx,%r8 1691 movq %r8,24(%rdi) 1692 sbbq %r15,%r15 1693 leaq 64(%rdi),%rdi 1694 addq $32,%rbp 1695 jnz .Lsqr4x_shift_n_add 1696 1697 leaq (%r14,%r10,2),%r12 1698.byte 0x67 1699 shrq $63,%r10 1700 leaq (%rcx,%r11,2),%r13 1701 shrq $63,%r11 1702 orq 
%r10,%r13 1703 movq -16(%rdi),%r10 1704 movq %r11,%r14 1705 mulq %rax 1706 negq %r15 1707 movq -8(%rdi),%r11 1708 adcq %rax,%r12 1709 movq -8(%rsi),%rax 1710 movq %r12,-32(%rdi) 1711 adcq %rdx,%r13 1712 1713 leaq (%r14,%r10,2),%rbx 1714 movq %r13,-24(%rdi) 1715 sbbq %r15,%r15 1716 shrq $63,%r10 1717 leaq (%rcx,%r11,2),%r8 1718 shrq $63,%r11 1719 orq %r10,%r8 1720 mulq %rax 1721 negq %r15 1722 adcq %rax,%rbx 1723 adcq %rdx,%r8 1724 movq %rbx,-16(%rdi) 1725 movq %r8,-8(%rdi) 1726.byte 102,72,15,126,213 1727__bn_sqr8x_reduction: 1728 xorq %rax,%rax 1729 leaq (%r9,%rbp,1),%rcx 1730 leaq 48+8(%rsp,%r9,2),%rdx 1731 movq %rcx,0+8(%rsp) 1732 leaq 48+8(%rsp,%r9,1),%rdi 1733 movq %rdx,8+8(%rsp) 1734 negq %r9 1735 jmp .L8x_reduction_loop 1736 1737.align 32 1738.L8x_reduction_loop: 1739 leaq (%rdi,%r9,1),%rdi 1740.byte 0x66 1741 movq 0(%rdi),%rbx 1742 movq 8(%rdi),%r9 1743 movq 16(%rdi),%r10 1744 movq 24(%rdi),%r11 1745 movq 32(%rdi),%r12 1746 movq 40(%rdi),%r13 1747 movq 48(%rdi),%r14 1748 movq 56(%rdi),%r15 1749 movq %rax,(%rdx) 1750 leaq 64(%rdi),%rdi 1751 1752.byte 0x67 1753 movq %rbx,%r8 1754 imulq 32+8(%rsp),%rbx 1755 movq 0(%rbp),%rax 1756 movl $8,%ecx 1757 jmp .L8x_reduce 1758 1759.align 32 1760.L8x_reduce: 1761 mulq %rbx 1762 movq 8(%rbp),%rax 1763 negq %r8 1764 movq %rdx,%r8 1765 adcq $0,%r8 1766 1767 mulq %rbx 1768 addq %rax,%r9 1769 movq 16(%rbp),%rax 1770 adcq $0,%rdx 1771 addq %r9,%r8 1772 movq %rbx,48-8+8(%rsp,%rcx,8) 1773 movq %rdx,%r9 1774 adcq $0,%r9 1775 1776 mulq %rbx 1777 addq %rax,%r10 1778 movq 24(%rbp),%rax 1779 adcq $0,%rdx 1780 addq %r10,%r9 1781 movq 32+8(%rsp),%rsi 1782 movq %rdx,%r10 1783 adcq $0,%r10 1784 1785 mulq %rbx 1786 addq %rax,%r11 1787 movq 32(%rbp),%rax 1788 adcq $0,%rdx 1789 imulq %r8,%rsi 1790 addq %r11,%r10 1791 movq %rdx,%r11 1792 adcq $0,%r11 1793 1794 mulq %rbx 1795 addq %rax,%r12 1796 movq 40(%rbp),%rax 1797 adcq $0,%rdx 1798 addq %r12,%r11 1799 movq %rdx,%r12 1800 adcq $0,%r12 1801 1802 mulq %rbx 1803 addq %rax,%r13 1804 movq 
48(%rbp),%rax 1805 adcq $0,%rdx 1806 addq %r13,%r12 1807 movq %rdx,%r13 1808 adcq $0,%r13 1809 1810 mulq %rbx 1811 addq %rax,%r14 1812 movq 56(%rbp),%rax 1813 adcq $0,%rdx 1814 addq %r14,%r13 1815 movq %rdx,%r14 1816 adcq $0,%r14 1817 1818 mulq %rbx 1819 movq %rsi,%rbx 1820 addq %rax,%r15 1821 movq 0(%rbp),%rax 1822 adcq $0,%rdx 1823 addq %r15,%r14 1824 movq %rdx,%r15 1825 adcq $0,%r15 1826 1827 decl %ecx 1828 jnz .L8x_reduce 1829 1830 leaq 64(%rbp),%rbp 1831 xorq %rax,%rax 1832 movq 8+8(%rsp),%rdx 1833 cmpq 0+8(%rsp),%rbp 1834 jae .L8x_no_tail 1835 1836.byte 0x66 1837 addq 0(%rdi),%r8 1838 adcq 8(%rdi),%r9 1839 adcq 16(%rdi),%r10 1840 adcq 24(%rdi),%r11 1841 adcq 32(%rdi),%r12 1842 adcq 40(%rdi),%r13 1843 adcq 48(%rdi),%r14 1844 adcq 56(%rdi),%r15 1845 sbbq %rsi,%rsi 1846 1847 movq 48+56+8(%rsp),%rbx 1848 movl $8,%ecx 1849 movq 0(%rbp),%rax 1850 jmp .L8x_tail 1851 1852.align 32 1853.L8x_tail: 1854 mulq %rbx 1855 addq %rax,%r8 1856 movq 8(%rbp),%rax 1857 movq %r8,(%rdi) 1858 movq %rdx,%r8 1859 adcq $0,%r8 1860 1861 mulq %rbx 1862 addq %rax,%r9 1863 movq 16(%rbp),%rax 1864 adcq $0,%rdx 1865 addq %r9,%r8 1866 leaq 8(%rdi),%rdi 1867 movq %rdx,%r9 1868 adcq $0,%r9 1869 1870 mulq %rbx 1871 addq %rax,%r10 1872 movq 24(%rbp),%rax 1873 adcq $0,%rdx 1874 addq %r10,%r9 1875 movq %rdx,%r10 1876 adcq $0,%r10 1877 1878 mulq %rbx 1879 addq %rax,%r11 1880 movq 32(%rbp),%rax 1881 adcq $0,%rdx 1882 addq %r11,%r10 1883 movq %rdx,%r11 1884 adcq $0,%r11 1885 1886 mulq %rbx 1887 addq %rax,%r12 1888 movq 40(%rbp),%rax 1889 adcq $0,%rdx 1890 addq %r12,%r11 1891 movq %rdx,%r12 1892 adcq $0,%r12 1893 1894 mulq %rbx 1895 addq %rax,%r13 1896 movq 48(%rbp),%rax 1897 adcq $0,%rdx 1898 addq %r13,%r12 1899 movq %rdx,%r13 1900 adcq $0,%r13 1901 1902 mulq %rbx 1903 addq %rax,%r14 1904 movq 56(%rbp),%rax 1905 adcq $0,%rdx 1906 addq %r14,%r13 1907 movq %rdx,%r14 1908 adcq $0,%r14 1909 1910 mulq %rbx 1911 movq 48-16+8(%rsp,%rcx,8),%rbx 1912 addq %rax,%r15 1913 adcq $0,%rdx 1914 addq %r15,%r14 1915 
# ----------------------------------------------------------------------------
# Tail of the Montgomery scatter/gather module (CRYPTOGAMS x86_64-mont5,
# AT&T syntax, ELF, System V AMD64 ABI).  Machine-generated constant-time
# code: comments added only, instructions untouched.  Raw ".byte" sequences
# are deliberate hand-picked encodings (decoded in comments below).
# ----------------------------------------------------------------------------

# --- continuation of bn_sqr8x_internal / __bn_sqr8x_reduction .L8x_tail ---
# r8..r15 = running 8-limb window, rbp = modulus pointer, rbx = n0 product,
# rdi = tp (temporary result), ecx = limb countdown for the 8x inner loop.
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp			# advance modulus pointer by 8 limbs
	movq	8+8(%rsp),%rdx			# end-of-t[] marker saved by reduction setup
	cmpq	0+8(%rsp),%rbp			# past end of modulus?
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx		# reload saved n0*tp[0] factor
	negq	%rsi				# restore carry saved in rsi (0 or -1)
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8			# fold next 8 limbs of tp[] into window
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi			# rsi = -(carry out), preserved across loop

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax			# rax accumulates the top carry
	addq	(%rdx),%r8			# add stashed top word
	adcq	$0,%r9				# propagate carry through the window
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi				# re-materialize saved carry for the adc chain
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax				# rax = final top-most carry
	movq	-8(%rbp),%rcx			# top limb of modulus
	xorq	%rsi,%rsi

.byte	102,72,15,126,213			# movq %xmm2,%rbp - restore pointer kept in xmm2
	movq	%r8,0(%rdi)			# store reduced 8-limb window back to tp[]
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217			# movq %xmm3,%r9 - restore value kept in xmm3
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi			# more of t[] left to reduce?
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3			# rep ret (branch-predictor-friendly return)
.size	bn_sqr8x_internal,.-bn_sqr8x_internal

# ----------------------------------------------------------------------------
# __bn_post4x_internal - final conditional subtraction after reduction.
# Computes rp[] = tp[] - np[] if the subtraction does not borrow (taking the
# saved top carry in rax into account), else rp[] = tp[], in constant time:
# np[] limbs are masked with rax (all-ones or zero) via not/and before the
# borrow chain, so both cases execute identical instructions.
# In (by register, set up by callers in this file):
#   rbp = np (modulus), rdi = rp (output), r9 = num*8 (byte count),
#   rax = 0 or -1 mask from the top carry, xmm1/xmm2 hold rdi/pointer copies.
# NOTE(review): exact xmm staging is established by the callers - confirm
# against bn_from_mont8x below, which loads xmm1/xmm2/xmm3 before calling.
# ----------------------------------------------------------------------------
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx		# rbx = tp = rp + num*8
	movq	%r9,%rcx
.byte	102,72,15,126,207			# movq %xmm1,%rdi - real output pointer
	negq	%rax
.byte	102,72,15,126,206			# movq %xmm1,%rsi
	sarq	$3+2,%rcx			# rcx = -(num/4): loop count, 4 limbs/iter
	decq	%r12				# prime CF for the borrow chain below
	xorq	%r10,%r10			# r10 = running borrow (0 or -1)
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12			# next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12				# ~np masked by rax: subtract np only when
	notq	%r13				# rax == -1 (two's-complement via adc below)
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10				# restore borrow into CF
	adcq	0(%rbx),%r12			# tp[i] + (~np[i] & mask) + carry
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10			# save borrow for next iteration
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx				# rcx counts up from -(num/4) to 0
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9				# restore negated num*8 for the caller
	.byte	0xf3,0xc3			# rep ret
.size	__bn_post4x_internal,.-__bn_post4x_internal

# ----------------------------------------------------------------------------
# bn_from_montgomery(rp=rdi, ap=rsi, not_used=rdx, np=rcx, n0=r8, num=r9d)
# Dispatch stub: converts from Montgomery representation only when num is a
# multiple of 8 (tail-jumps to bn_from_mont8x); otherwise returns 0 so the
# caller falls back to another code path.
# ----------------------------------------------------------------------------
.globl	bn_from_montgomery
.hidden	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x			# tail call, same register arguments
	xorl	%eax,%eax			# unsupported width: return 0
	.byte	0xf3,0xc3			# rep ret
.size	bn_from_montgomery,.-bn_from_montgomery

# ----------------------------------------------------------------------------
# bn_from_mont8x - out-of-Montgomery conversion for num % 8 == 0.
# Copies ap[] into a scratch frame, zero-pads an equal-sized upper half,
# then runs __bn_sqr8x_reduction + __bn_post4x_internal on it (i.e. computes
# a*R^-1 mod n by reducing a zero-extended value), and wipes the frame.
# Standard SysV args as in bn_from_montgomery above.  Clobbers the usual
# volatile set; callee-saved regs are pushed in the prologue with CFI.
# ----------------------------------------------------------------------------
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67					# addr-size prefix: alignment padding
	movq	%rsp,%rax			# rax = original rsp, saved for unwind
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d				# r9 = num*8 (byte length)
	leaq	(%r9,%r9,2),%r10		# r10 = num*8*3
	negq	%r9
	movq	(%r8),%r8			# r8 = *n0

	# Pick a stack frame placement that avoids aliasing rdi (the output)
	# in the cache/page sense; two candidate layouts, then page-walk to
	# guarantee every new stack page is touched in order (stack probing).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp			# 64-byte align the new stack base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10			# probe current page
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp		# touch each page on the way down
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9				# r9 = +num*8 again

	movq	%r8,32(%rsp)			# stash n0
	movq	%rax,40(%rsp)			# stash original rsp for the epilogue
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08	# CFA = *(rsp+40)+8
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax			# rax = scratch t[]
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	# Copy 64 bytes of ap[] to the low half of t[] while zeroing the
	# mirrored 64 bytes in the high half (t[num..2*num-1] = 0).
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 0x40(%rsi),%rsi (long encoding)
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207			# movq %rdi,%xmm1 - save rp for post4x
.byte	102,72,15,110,209			# movq %rcx,%xmm2 - save np
.byte	0x67					# padding prefix
	movq	%rcx,%rbp			# rbp = np for the reduction
.byte	102,73,15,110,218			# movq %r10,%xmm3 - save -num*8
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi			# recover original rsp
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)			# wipe the secret-bearing scratch frame
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax				# return 1 (success)
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp			# unwind the variable-size frame
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x

# ----------------------------------------------------------------------------
# bn_scatter5(inp=rdi, num=esi, tbl=rdx, idx=rcx)
# Writes num qwords from inp[] into a power table at stride 256 bytes,
# starting at tbl + idx*8; i.e. tbl[idx + 32*i] = inp[i].  The layout is the
# one bn_gather5 and the gather code in bn_mul_mont_gather5 read back.
# ----------------------------------------------------------------------------
.globl	bn_scatter5
.hidden	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue		# num == 0: nothing to do
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx			# 32 entries * 8 bytes between limbs
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3			# rep ret
.size	bn_scatter5,.-bn_scatter5

# ----------------------------------------------------------------------------
# bn_gather5(out=rdi, num=esi, tbl=rdx, idx=ecx)
# Cache-timing-safe table lookup: instead of indexing, it builds 16 16-byte
# equality masks (entry k == idx ? all-ones : 0) from .Linc on the stack,
# then for every limb reads ALL 32 table entries and ANDs/ORs them with the
# masks, so the memory access pattern is independent of idx.
# ----------------------------------------------------------------------------
.globl	bn_gather5
.hidden	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:
	# Fixed-length encodings so the Win64 SEH prologue size is stable:
.byte	0x4c,0x8d,0x14,0x24			# leaq (%rsp),%r10 - save rsp
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# subq $0x108,%rsp - mask scratch
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp			# movdqa below needs 16-byte alignment

	movd	%ecx,%xmm5			# xmm5 = idx broadcast below
	movdqa	0(%rax),%xmm0			# {0,0,1,1}
	movdqa	16(%rax),%xmm1			# {2,2,2,2} increment
	leaq	128(%rdx),%r11			# bias pointers so offsets span -128..112
	leaq	128(%rsp),%rax

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	# Unrolled mask generation: compare counters {0..31} (two per xmm,
	# stepped by +2) against idx, storing 16 masks at rax-128..rax+112.
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)			# 16th and last mask
	jmp	.Lgather

.align	32
.Lgather:
	# One output limb per iteration: read all 32 entries (512 bytes) at
	# r11, AND each with its precomputed mask, OR everything together.
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11			# next limb row of the table
	pshufd	$0x4e,%xmm4,%xmm0		# fold high qword onto low
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)			# emit selected limb
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp			# restore rsp saved at entry
	.byte	0xf3,0xc3			# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5

# Counter seed/increment pairs for the mask generators above (and the one in
# bn_mul_mont_gather5): first pair starts the lane counters at {0,0,1,1},
# second is the per-step increment {2,2,2,2}.
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# ASCII: "Montgomery Multiplication with scatter/gather for x86_64,
# CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif