;-----------------------------------------------------------------------
; GHASH for 32-bit x86 (NASM syntax).
;
; Exports seven routines, in three implementation families:
;   _gcm_gmult_4bit_x86 / _gcm_ghash_4bit_x86   integer registers only
;   _gcm_gmult_4bit_mmx / _gcm_ghash_4bit_mmx   MMX registers
;   _gcm_init_clmul / _gcm_gmult_clmul / _gcm_ghash_clmul
;                                               XMM + carry-less multiply
;
; Calling convention (as visible in the code): all arguments are dwords
; read from the stack, every routine returns with a plain `ret` (caller
; cleans up), and ebp/ebx/esi/edi are saved and restored wherever they
; are used.  eax, ecx, edx, flags and the MMX/XMM registers touched by
; each routine are clobbered.
;
; NOTE(review): the argument names used in the comments below (state,
; table, input, length) are descriptive only; confirm them against the
; C prototypes that declare these symbols.
;-----------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes .00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
section	.text	code
%endif

;-----------------------------------------------------------------------
; _gcm_gmult_4bit_x86(arg1, arg2)
; In:    arg1 = pointer to a 16-byte state block (read, then overwritten)
;        arg2 = pointer to a table of 16-byte entries, indexed by
;               nibble*16
; Out:   updated 16-byte block stored back through arg1
; Stack: 84 bytes of locals
;          [esp+0  .. esp+15]  copy of the input block
;          [esp+16 .. esp+79]  16 reduction dwords (the same values as
;                              the odd dwords of L$rem_4bit)
; After the four pushes and `sub esp,84`, arg1 is at [104+esp] and arg2
; at [108+esp].
;-----------------------------------------------------------------------
global	_gcm_gmult_4bit_x86
align	16
_gcm_gmult_4bit_x86:
L$_gcm_gmult_4bit_x86_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp,84
	mov	edi,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	ebp,DWORD [edi]
	mov	edx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	ebx,DWORD [12+edi]
	; Build the 16-entry reduction table on the stack.
	mov	DWORD [16+esp],0
	mov	DWORD [20+esp],471859200
	mov	DWORD [24+esp],943718400
	mov	DWORD [28+esp],610271232
	mov	DWORD [32+esp],1887436800
	mov	DWORD [36+esp],1822425088
	mov	DWORD [40+esp],1220542464
	mov	DWORD [44+esp],1423966208
	mov	DWORD [48+esp],3774873600
	mov	DWORD [52+esp],4246732800
	mov	DWORD [56+esp],3644850176
	mov	DWORD [60+esp],3311403008
	mov	DWORD [64+esp],2441084928
	mov	DWORD [68+esp],2376073216
	mov	DWORD [72+esp],2847932416
	mov	DWORD [76+esp],3051356160
	; Keep a byte-addressable copy of the input block at [esp..esp+15].
	mov	DWORD [esp],ebp
	mov	DWORD [4+esp],edx
	mov	DWORD [8+esp],ecx
	mov	DWORD [12+esp],ebx
	; Seed the accumulator with the table entry selected by the low
	; nibble of byte 15 (ebx>>20 & 0xF0 == nibble*16).
	shr	ebx,20
	and	ebx,240
	mov	ebp,DWORD [4+ebx*1+esi]
	mov	edx,DWORD [ebx*1+esi]
	mov	ecx,DWORD [12+ebx*1+esi]
	mov	ebx,DWORD [8+ebx*1+esi]
	xor	eax,eax
	mov	edi,15			; edi = index of the byte being consumed
	jmp	NEAR L$000x86_loop
align	16
L$000x86_loop:
	; First half: shift the 128-bit accumulator right by 4, fold in the
	; reduction dword for the nibble shifted out, then add the table
	; entry for the high nibble of byte [edi].
	mov	al,bl
	shrd	ebx,ecx,4
	and	al,15
	shrd	ecx,edx,4
	shrd	edx,ebp,4
	shr	ebp,4
	xor	ebp,DWORD [16+eax*4+esp]
	mov	al,BYTE [edi*1+esp]
	and	al,240
	xor	ebx,DWORD [8+eax*1+esi]
	xor	ecx,DWORD [12+eax*1+esi]
	xor	edx,DWORD [eax*1+esi]
	xor	ebp,DWORD [4+eax*1+esi]
	dec	edi
	js	NEAR L$001x86_break	; byte 0 fully consumed
	; Second half: same step for the low nibble of the next byte down.
	mov	al,bl
	shrd	ebx,ecx,4
	and	al,15
	shrd	ecx,edx,4
	shrd	edx,ebp,4
	shr	ebp,4
	xor	ebp,DWORD [16+eax*4+esp]
	mov	al,BYTE [edi*1+esp]
	shl	al,4
	xor	ebx,DWORD [8+eax*1+esi]
	xor	ecx,DWORD [12+eax*1+esi]
	xor	edx,DWORD [eax*1+esi]
	xor	ebp,DWORD [4+eax*1+esi]
	jmp	NEAR L$000x86_loop
align	16
L$001x86_break:
	; Byte-swap each dword and store the result back through arg1.
	bswap	ebx
	bswap	ecx
	bswap	edx
	bswap	ebp
	mov	edi,DWORD [104+esp]
	mov	DWORD [12+edi],ebx
	mov	DWORD [8+edi],ecx
	mov	DWORD [4+edi],edx
	mov	DWORD [edi],ebp
	add	esp,84
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;-----------------------------------------------------------------------
; _gcm_ghash_4bit_x86(arg1, arg2, arg3, arg4)
; In:    arg1 = pointer to the 16-byte state block (read and written)
;        arg2 = pointer to the table of 16-byte entries
;        arg3 = pointer to the input data
;        arg4 = input length in bytes
; The input is consumed in 16-byte steps.  The loop body runs before the
; bound is tested, so at least one 16-byte block is always processed.
; Stack: same 84-byte local layout as _gcm_gmult_4bit_x86; the arg3 slot
;        [112+esp] is reused as the running input pointer and the arg4
;        slot [116+esp] as the end pointer (arg3 + arg4).
;-----------------------------------------------------------------------
global	_gcm_ghash_4bit_x86
align	16
_gcm_ghash_4bit_x86:
L$_gcm_ghash_4bit_x86_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp,84
	mov	ebx,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	edi,DWORD [112+esp]
	mov	ecx,DWORD [116+esp]
	add	ecx,edi			; ecx = end-of-input pointer
	mov	DWORD [116+esp],ecx
	mov	ebp,DWORD [ebx]
	mov	edx,DWORD [4+ebx]
	mov	ecx,DWORD [8+ebx]
	mov	ebx,DWORD [12+ebx]
	; Build the 16-entry reduction table on the stack.
	mov	DWORD [16+esp],0
	mov	DWORD [20+esp],471859200
	mov	DWORD [24+esp],943718400
	mov	DWORD [28+esp],610271232
	mov	DWORD [32+esp],1887436800
	mov	DWORD [36+esp],1822425088
	mov	DWORD [40+esp],1220542464
	mov	DWORD [44+esp],1423966208
	mov	DWORD [48+esp],3774873600
	mov	DWORD [52+esp],4246732800
	mov	DWORD [56+esp],3644850176
	mov	DWORD [60+esp],3311403008
	mov	DWORD [64+esp],2441084928
	mov	DWORD [68+esp],2376073216
	mov	DWORD [72+esp],2847932416
	mov	DWORD [76+esp],3051356160
align	16
L$002x86_outer_loop:
	; XOR the next 16 input bytes into the state and keep a
	; byte-addressable copy at [esp..esp+15].
	xor	ebx,DWORD [12+edi]
	xor	ecx,DWORD [8+edi]
	xor	edx,DWORD [4+edi]
	xor	ebp,DWORD [edi]
	mov	DWORD [12+esp],ebx
	mov	DWORD [8+esp],ecx
	mov	DWORD [4+esp],edx
	mov	DWORD [esp],ebp
	shr	ebx,20
	and	ebx,240
	mov	ebp,DWORD [4+ebx*1+esi]
	mov	edx,DWORD [ebx*1+esi]
	mov	ecx,DWORD [12+ebx*1+esi]
	mov	ebx,DWORD [8+ebx*1+esi]
	xor	eax,eax
	mov	edi,15			; edi = index of the byte being consumed
	jmp	NEAR L$003x86_loop
align	16
L$003x86_loop:
	; Same two-nibble step as L$000x86_loop in _gcm_gmult_4bit_x86.
	mov	al,bl
	shrd	ebx,ecx,4
	and	al,15
	shrd	ecx,edx,4
	shrd	edx,ebp,4
	shr	ebp,4
	xor	ebp,DWORD [16+eax*4+esp]
	mov	al,BYTE [edi*1+esp]
	and	al,240
	xor	ebx,DWORD [8+eax*1+esi]
	xor	ecx,DWORD [12+eax*1+esi]
	xor	edx,DWORD [eax*1+esi]
	xor	ebp,DWORD [4+eax*1+esi]
	dec	edi
	js	NEAR L$004x86_break
	mov	al,bl
	shrd	ebx,ecx,4
	and	al,15
	shrd	ecx,edx,4
	shrd	edx,ebp,4
	shr	ebp,4
	xor	ebp,DWORD [16+eax*4+esp]
	mov	al,BYTE [edi*1+esp]
	shl	al,4
	xor	ebx,DWORD [8+eax*1+esi]
	xor	ecx,DWORD [12+eax*1+esi]
	xor	edx,DWORD [eax*1+esi]
	xor	ebp,DWORD [4+eax*1+esi]
	jmp	NEAR L$003x86_loop
align	16
L$004x86_break:
	bswap	ebx
	bswap	ecx
	bswap	edx
	bswap	ebp
	; Advance the input pointer by 16 and loop while it is below the
	; end pointer (unsigned compare).
	mov	edi,DWORD [112+esp]
	lea	edi,[16+edi]
	cmp	edi,DWORD [116+esp]
	mov	DWORD [112+esp],edi
	jb	NEAR L$002x86_outer_loop
	mov	edi,DWORD [104+esp]
	mov	DWORD [12+edi],ebx
	mov	DWORD [8+edi],ecx
	mov	DWORD [4+edi],edx
	mov	DWORD [edi],ebp
	add	esp,84
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;-----------------------------------------------------------------------
; _gcm_gmult_4bit_mmx(arg1, arg2)
; In:    arg1 = pointer to the 16-byte state block (read and written)
;        arg2 = pointer to the table of 16-byte entries
; Uses mm0-mm2 and executes `emms` before storing the result.
; L$rem_4bit is located position-independently with a call/pop pair.
; Register roles inside the loop:
;   edi = state pointer, esi = table pointer, eax = &L$rem_4bit,
;   ebp = index of the next state byte, ecx/edx = nibble*16 offsets,
;   ebx = low nibble of the accumulator (index into L$rem_4bit).
;-----------------------------------------------------------------------
global	_gcm_gmult_4bit_mmx
align	16
_gcm_gmult_4bit_mmx:
L$_gcm_gmult_4bit_mmx_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	call	L$005pic_point
L$005pic_point:
	pop	eax
	lea	eax,[(L$rem_4bit-L$005pic_point)+eax]
	movzx	ebx,BYTE [15+edi]
	xor	ecx,ecx
	mov	edx,ebx
	mov	cl,dl
	mov	ebp,14
	shl	cl,4			; ecx = (low nibble of byte 15) * 16
	and	edx,240			; edx = (high nibble of byte 15) * 16
	movq	mm0,[8+ecx*1+esi]
	movq	mm1,[ecx*1+esi]
	movd	ebx,mm0
	jmp	NEAR L$006mmx_loop
align	16
L$006mmx_loop:
	psrlq	mm0,4
	and	ebx,15
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+edx*1+esi]
	mov	cl,BYTE [ebp*1+edi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	dec	ebp			; flags survive until the `js` below
	movd	ebx,mm0
	pxor	mm1,[edx*1+esi]
	mov	edx,ecx
	pxor	mm0,mm2
	js	NEAR L$007mmx_break
	shl	cl,4
	and	ebx,15
	psrlq	mm0,4
	and	edx,240
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+ecx*1+esi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	movd	ebx,mm0
	pxor	mm1,[ecx*1+esi]
	pxor	mm0,mm2
	jmp	NEAR L$006mmx_loop
align	16
L$007mmx_break:
	; Final byte: process its low nibble, then its high nibble.
	shl	cl,4
	and	ebx,15
	psrlq	mm0,4
	and	edx,240
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+ecx*1+esi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	movd	ebx,mm0
	pxor	mm1,[ecx*1+esi]
	pxor	mm0,mm2
	psrlq	mm0,4
	and	ebx,15
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+edx*1+esi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	movd	ebx,mm0
	pxor	mm1,[edx*1+esi]
	pxor	mm0,mm2
	; Split mm0/mm1 into four dwords, byte-swap and store.
	psrlq	mm0,32
	movd	edx,mm1
	psrlq	mm1,32
	movd	ecx,mm0
	movd	ebp,mm1
	bswap	ebx
	bswap	edx
	bswap	ecx
	bswap	ebp
	emms
	mov	DWORD [12+edi],ebx
	mov	DWORD [4+edi],edx
	mov	DWORD [8+edi],ecx
	mov	DWORD [edi],ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;-----------------------------------------------------------------------
; _gcm_ghash_4bit_mmx(arg1, arg2, arg3, arg4)
; In:    arg1 = pointer to the 16-byte state block (read and written)
;        arg2 = pointer to 16 table entries of 16 bytes each
;        arg3 = pointer to the input data
;        arg4 = input length in bytes
; The outer loop consumes 16 input bytes per iteration and exits when
; the input pointer EQUALS arg3+arg4 (`jne`), so the length is expected
; to be a non-zero multiple of 16.
; Uses mm0-mm7 plus pinsrw/pshufw on MMX registers; `emms` on exit.
;
; Frame (esp is realigned to 64 bytes; the original esp is kept in the
; frame and restored on exit):
;   [esp+0   .. +15 ]  16 bytes: low byte of (dword at entry[i]+8) << 4
;   [esp+16  .. +143]  16 qwords: entry[i] qword at offset 8
;   [esp+144 .. +271]  16 qwords: entry[i] qword at offset 0
;   [esp+272 .. +399]  16 qwords: (offset-8 qword >> 4) | (offset-0 qword << 60)
;   [esp+400 .. +527]  16 qwords: offset-0 qword >> 4
;   [esp+528 .. +535]  low 8 bytes of the state XOR input block
;   [esp+536]          state dword 2 XOR input dword 2
;   [esp+544]          arg1 (state pointer)
;   [esp+548]          running input pointer
;   [esp+552]          end-of-input pointer
;   [esp+556]          caller's esp
;-----------------------------------------------------------------------
global	_gcm_ghash_4bit_mmx
align	16
_gcm_ghash_4bit_mmx:
L$_gcm_ghash_4bit_mmx_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	eax,DWORD [20+esp]
	mov	ebx,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	mov	edx,DWORD [32+esp]
	mov	ebp,esp			; remember esp before realigning
	call	L$008pic_point
L$008pic_point:
	pop	esi
	lea	esi,[(L$rem_8bit-L$008pic_point)+esi]
	sub	esp,544
	and	esp,-64
	sub	esp,16
	add	edx,ecx			; edx = end-of-input pointer
	mov	DWORD [544+esp],eax
	mov	DWORD [552+esp],edx
	mov	DWORD [556+esp],ebp
	add	ebx,128			; bias so all 16 entries use disp8 offsets
	lea	edi,[144+esp]
	lea	ebp,[400+esp]
	; ---- Build the stack tables (16 unrolled, interleaved steps) ----
	mov	edx,DWORD [ebx-120]
	movq	mm0,[ebx-120]
	movq	mm3,[ebx-128]
	shl	edx,4
	mov	BYTE [esp],dl
	mov	edx,DWORD [ebx-104]
	movq	mm2,[ebx-104]
	movq	mm5,[ebx-112]
	movq	[edi-128],mm0
	psrlq	mm0,4
	movq	[edi],mm3
	movq	mm7,mm3
	psrlq	mm3,4
	shl	edx,4
	mov	BYTE [1+esp],dl
	mov	edx,DWORD [ebx-88]
	movq	mm1,[ebx-88]
	psllq	mm7,60
	movq	mm4,[ebx-96]
	por	mm0,mm7
	movq	[edi-120],mm2
	psrlq	mm2,4
	movq	[8+edi],mm5
	movq	mm6,mm5
	movq	[ebp-128],mm0
	psrlq	mm5,4
	movq	[ebp],mm3
	shl	edx,4
	mov	BYTE [2+esp],dl
	mov	edx,DWORD [ebx-72]
	movq	mm0,[ebx-72]
	psllq	mm6,60
	movq	mm3,[ebx-80]
	por	mm2,mm6
	movq	[edi-112],mm1
	psrlq	mm1,4
	movq	[16+edi],mm4
	movq	mm7,mm4
	movq	[ebp-120],mm2
	psrlq	mm4,4
	movq	[8+ebp],mm5
	shl	edx,4
	mov	BYTE [3+esp],dl
	mov	edx,DWORD [ebx-56]
	movq	mm2,[ebx-56]
	psllq	mm7,60
	movq	mm5,[ebx-64]
	por	mm1,mm7
	movq	[edi-104],mm0
	psrlq	mm0,4
	movq	[24+edi],mm3
	movq	mm6,mm3
	movq	[ebp-112],mm1
	psrlq	mm3,4
	movq	[16+ebp],mm4
	shl	edx,4
	mov	BYTE [4+esp],dl
	mov	edx,DWORD [ebx-40]
	movq	mm1,[ebx-40]
	psllq	mm6,60
	movq	mm4,[ebx-48]
	por	mm0,mm6
	movq	[edi-96],mm2
	psrlq	mm2,4
	movq	[32+edi],mm5
	movq	mm7,mm5
	movq	[ebp-104],mm0
	psrlq	mm5,4
	movq	[24+ebp],mm3
	shl	edx,4
	mov	BYTE [5+esp],dl
	mov	edx,DWORD [ebx-24]
	movq	mm0,[ebx-24]
	psllq	mm7,60
	movq	mm3,[ebx-32]
	por	mm2,mm7
	movq	[edi-88],mm1
	psrlq	mm1,4
	movq	[40+edi],mm4
	movq	mm6,mm4
	movq	[ebp-96],mm2
	psrlq	mm4,4
	movq	[32+ebp],mm5
	shl	edx,4
	mov	BYTE [6+esp],dl
	mov	edx,DWORD [ebx-8]
	movq	mm2,[ebx-8]
	psllq	mm6,60
	movq	mm5,[ebx-16]
	por	mm1,mm6
	movq	[edi-80],mm0
	psrlq	mm0,4
	movq	[48+edi],mm3
	movq	mm7,mm3
	movq	[ebp-88],mm1
	psrlq	mm3,4
	movq	[40+ebp],mm4
	shl	edx,4
	mov	BYTE [7+esp],dl
	mov	edx,DWORD [8+ebx]
	movq	mm1,[8+ebx]
	psllq	mm7,60
	movq	mm4,[ebx]
	por	mm0,mm7
	movq	[edi-72],mm2
	psrlq	mm2,4
	movq	[56+edi],mm5
	movq	mm6,mm5
	movq	[ebp-80],mm0
	psrlq	mm5,4
	movq	[48+ebp],mm3
	shl	edx,4
	mov	BYTE [8+esp],dl
	mov	edx,DWORD [24+ebx]
	movq	mm0,[24+ebx]
	psllq	mm6,60
	movq	mm3,[16+ebx]
	por	mm2,mm6
	movq	[edi-64],mm1
	psrlq	mm1,4
	movq	[64+edi],mm4
	movq	mm7,mm4
	movq	[ebp-72],mm2
	psrlq	mm4,4
	movq	[56+ebp],mm5
	shl	edx,4
	mov	BYTE [9+esp],dl
	mov	edx,DWORD [40+ebx]
	movq	mm2,[40+ebx]
	psllq	mm7,60
	movq	mm5,[32+ebx]
	por	mm1,mm7
	movq	[edi-56],mm0
	psrlq	mm0,4
	movq	[72+edi],mm3
	movq	mm6,mm3
	movq	[ebp-64],mm1
	psrlq	mm3,4
	movq	[64+ebp],mm4
	shl	edx,4
	mov	BYTE [10+esp],dl
	mov	edx,DWORD [56+ebx]
	movq	mm1,[56+ebx]
	psllq	mm6,60
	movq	mm4,[48+ebx]
	por	mm0,mm6
	movq	[edi-48],mm2
	psrlq	mm2,4
	movq	[80+edi],mm5
	movq	mm7,mm5
	movq	[ebp-56],mm0
	psrlq	mm5,4
	movq	[72+ebp],mm3
	shl	edx,4
	mov	BYTE [11+esp],dl
	mov	edx,DWORD [72+ebx]
	movq	mm0,[72+ebx]
	psllq	mm7,60
	movq	mm3,[64+ebx]
	por	mm2,mm7
	movq	[edi-40],mm1
	psrlq	mm1,4
	movq	[88+edi],mm4
	movq	mm6,mm4
	movq	[ebp-48],mm2
	psrlq	mm4,4
	movq	[80+ebp],mm5
	shl	edx,4
	mov	BYTE [12+esp],dl
	mov	edx,DWORD [88+ebx]
	movq	mm2,[88+ebx]
	psllq	mm6,60
	movq	mm5,[80+ebx]
	por	mm1,mm6
	movq	[edi-32],mm0
	psrlq	mm0,4
	movq	[96+edi],mm3
	movq	mm7,mm3
	movq	[ebp-40],mm1
	psrlq	mm3,4
	movq	[88+ebp],mm4
	shl	edx,4
	mov	BYTE [13+esp],dl
	mov	edx,DWORD [104+ebx]
	movq	mm1,[104+ebx]
	psllq	mm7,60
	movq	mm4,[96+ebx]
	por	mm0,mm7
	movq	[edi-24],mm2
	psrlq	mm2,4
	movq	[104+edi],mm5
	movq	mm6,mm5
	movq	[ebp-32],mm0
	psrlq	mm5,4
	movq	[96+ebp],mm3
	shl	edx,4
	mov	BYTE [14+esp],dl
	mov	edx,DWORD [120+ebx]
	movq	mm0,[120+ebx]
	psllq	mm6,60
	movq	mm3,[112+ebx]
	por	mm2,mm6
	movq	[edi-16],mm1
	psrlq	mm1,4
	movq	[112+edi],mm4
	movq	mm7,mm4
	movq	[ebp-24],mm2
	psrlq	mm4,4
	movq	[104+ebp],mm5
	shl	edx,4
	mov	BYTE [15+esp],dl
	psllq	mm7,60
	por	mm1,mm7
	movq	[edi-8],mm0
	psrlq	mm0,4
	movq	[120+edi],mm3
	movq	mm6,mm3
	movq	[ebp-16],mm1
	psrlq	mm3,4
	movq	[112+ebp],mm4
	psllq	mm6,60
	por	mm0,mm6
	movq	[ebp-8],mm0
	movq	[120+ebp],mm3
	; ---- Load the running state: mm6 = bytes 0..7, ebx/edx = dwords 2/3
	movq	mm6,[eax]
	mov	ebx,DWORD [8+eax]
	mov	edx,DWORD [12+eax]
align	16
L$009outer:
	; XOR in the next 16 input bytes, spill the pieces that are needed
	; later, then consume the block one byte at a time.  Each byte is
	; rotated into dl, split into a low nibble (al) and a high nibble
	; (ebp or edi, alternating) that index the stack tables; reduction
	; words are fetched from L$rem_8bit with pinsrw.
	xor	edx,DWORD [12+ecx]
	xor	ebx,DWORD [8+ecx]
	pxor	mm6,[ecx]
	lea	ecx,[16+ecx]
	mov	DWORD [536+esp],ebx
	movq	[528+esp],mm6
	mov	DWORD [548+esp],ecx
	xor	eax,eax
	rol	edx,8
	mov	al,dl
	mov	ebp,eax
	and	al,15
	shr	ebp,4
	pxor	mm0,mm0
	rol	edx,8
	pxor	mm1,mm1
	pxor	mm2,mm2
	movq	mm7,[16+eax*8+esp]
	movq	mm6,[144+eax*8+esp]
	mov	al,dl
	movd	ebx,mm7
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	shr	edi,4
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	shr	ebp,4
	pinsrw	mm2,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	mov	edx,DWORD [536+esp]	; next four state bytes
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	edi,4
	pinsrw	mm1,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	ebp,4
	pinsrw	mm0,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	edi,4
	pinsrw	mm2,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	ebp,4
	pinsrw	mm1,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	mov	edx,DWORD [532+esp]	; next four state bytes
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	edi,4
	pinsrw	mm0,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	ebp,4
	pinsrw	mm2,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	edi,4
	pinsrw	mm1,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	ebp,4
	pinsrw	mm0,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	mov	edx,DWORD [528+esp]	; last four state bytes
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	edi,4
	pinsrw	mm2,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	ebp,4
	pinsrw	mm1,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	edi,4
	pinsrw	mm0,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	ebp,4
	pinsrw	mm2,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	; NOTE(review): the value loaded into edx here is never read; edx
	; is overwritten by `movd edx,mm7` below before its next use.
	mov	edx,DWORD [524+esp]
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	edi,4
	pinsrw	mm1,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	pxor	mm6,[144+eax*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	movzx	ebx,bl
	; Tail: final 4-bit shift, fold in the pending reduction words and
	; convert the result back to memory byte order.
	pxor	mm2,mm2
	psllq	mm1,4
	movd	ecx,mm7
	psrlq	mm7,4
	movq	mm3,mm6
	psrlq	mm6,4
	shl	ecx,4
	pxor	mm7,[16+edi*8+esp]
	psllq	mm3,60
	movzx	ecx,cl
	pxor	mm7,mm3
	pxor	mm6,[144+edi*8+esp]
	pinsrw	mm0,WORD [ebx*2+esi],2
	pxor	mm6,mm1
	movd	edx,mm7
	pinsrw	mm2,WORD [ecx*2+esi],3
	psllq	mm0,12
	pxor	mm6,mm0
	psrlq	mm7,32
	pxor	mm6,mm2
	mov	ecx,DWORD [548+esp]	; reload running input pointer
	movd	ebx,mm7
	movq	mm3,mm6
	psllw	mm6,8
	psrlw	mm3,8
	por	mm6,mm3			; swap the bytes within each word
	bswap	edx
	pshufw	mm6,mm6,27		; reverse the four words
	bswap	ebx
	cmp	ecx,DWORD [552+esp]
	jne	NEAR L$009outer
	mov	eax,DWORD [544+esp]
	mov	DWORD [12+eax],edx
	mov	DWORD [8+eax],ebx
	movq	[eax],mm6
	mov	esp,DWORD [556+esp]	; restore the caller's esp
	emms
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;-----------------------------------------------------------------------
; _gcm_init_clmul(arg1, arg2)
; In:    arg1 = output pointer; 48 bytes are written ([+0], [+16], [+32])
;        arg2 = pointer to a 16-byte input value (unaligned load)
; Uses xmm0-xmm5; touches no callee-saved integer register.
; The `db` sequences are raw encodings of pclmulqdq / palignr; the
; decoded mnemonic is given in the comment on each line.
;-----------------------------------------------------------------------
global	_gcm_init_clmul
align	16
_gcm_init_clmul:
L$_gcm_init_clmul_begin:
	mov	edx,DWORD [4+esp]
	mov	eax,DWORD [8+esp]
	call	L$010pic
L$010pic:
	pop	ecx
	lea	ecx,[(L$bswap-L$010pic)+ecx]
	movdqu	xmm2,[eax]
	pshufd	xmm2,xmm2,78		; swap the two 64-bit halves
	; Shift the 128-bit value left by one bit; xmm5 becomes an all-ones
	; mask when the top dword was negative and selects the constant at
	; L$bswap+16, which is then XORed in.
	pshufd	xmm4,xmm2,255
	movdqa	xmm3,xmm2
	psllq	xmm2,1
	pxor	xmm5,xmm5
	psrlq	xmm3,63
	pcmpgtd	xmm5,xmm4
	pslldq	xmm3,8
	por	xmm2,xmm3
	pand	xmm5,[16+ecx]
	pxor	xmm2,xmm5
	; Multiply xmm2 by itself (three carry-less multiplies).
	movdqa	xmm0,xmm2
	movdqa	xmm1,xmm0
	pshufd	xmm3,xmm0,78
	pshufd	xmm4,xmm2,78
	pxor	xmm3,xmm0
	pxor	xmm4,xmm2
db	102,15,58,68,194,0		; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17		; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,220,0		; pclmulqdq xmm3,xmm4,0x00
	xorps	xmm3,xmm0
	xorps	xmm3,xmm1
	movdqa	xmm4,xmm3
	psrldq	xmm3,8
	pslldq	xmm4,8
	pxor	xmm1,xmm3
	pxor	xmm0,xmm4
	; Reduce the 256-bit product xmm1:xmm0 back to 128 bits in xmm0.
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
	; Store the value, its square, and the packed XORs of their halves.
	pshufd	xmm3,xmm2,78
	pshufd	xmm4,xmm0,78
	pxor	xmm3,xmm2
	movdqu	[edx],xmm2
	pxor	xmm4,xmm0
	movdqu	[16+edx],xmm0
db	102,15,58,15,227,8		; palignr xmm4,xmm3,8
	movdqu	[32+edx],xmm4
	ret

;-----------------------------------------------------------------------
; _gcm_gmult_clmul(arg1, arg2)
; In:    arg1 = pointer to the 16-byte state block (read and written)
;        arg2 = pointer to the table; [arg2] and [arg2+32] are read
; Uses xmm0-xmm5; touches no callee-saved integer register.
;-----------------------------------------------------------------------
global	_gcm_gmult_clmul
align	16
_gcm_gmult_clmul:
L$_gcm_gmult_clmul_begin:
	mov	eax,DWORD [4+esp]
	mov	edx,DWORD [8+esp]
	call	L$011pic
L$011pic:
	pop	ecx
	lea	ecx,[(L$bswap-L$011pic)+ecx]
	movdqu	xmm0,[eax]
	movdqa	xmm5,[ecx]		; byte-reversal shuffle mask
	movups	xmm2,[edx]
db	102,15,56,0,197			; pshufb xmm0,xmm5
	movups	xmm4,[32+edx]
	movdqa	xmm1,xmm0
	pshufd	xmm3,xmm0,78
	pxor	xmm3,xmm0
db	102,15,58,68,194,0		; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17		; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,220,0		; pclmulqdq xmm3,xmm4,0x00
	xorps	xmm3,xmm0
	xorps	xmm3,xmm1
	movdqa	xmm4,xmm3
	psrldq	xmm3,8
	pslldq	xmm4,8
	pxor	xmm1,xmm3
	pxor	xmm0,xmm4
	; Reduce the 256-bit product xmm1:xmm0 back to 128 bits in xmm0.
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
db	102,15,56,0,197			; pshufb xmm0,xmm5
	movdqu	[eax],xmm0
	ret

;-----------------------------------------------------------------------
; _gcm_ghash_clmul(arg1, arg2, arg3, arg4)
; In:    arg1 = pointer to the 16-byte state block (read and written)
;        arg2 = pointer to the table; [+0], [+16] and [+32] are read
;        arg3 = pointer to the input data
;        arg4 = input length in bytes
; Control flow: a length of exactly 16 goes straight to the single-block
; tail; otherwise blocks are consumed two at a time (L$015mod_loop /
; L$014even_tail) and a leftover block, if any, falls through to
; L$013odd_tail.
; NOTE(review): a length of 0 is not special-cased (ebx becomes -16 and
; the two-block path is taken) -- presumably callers never pass 0;
; confirm at the call sites.
; Uses xmm0-xmm7; saves/restores ebp, ebx, esi, edi.
;-----------------------------------------------------------------------
global	_gcm_ghash_clmul
align	16
_gcm_ghash_clmul:
L$_gcm_ghash_clmul_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	eax,DWORD [20+esp]
	mov	edx,DWORD [24+esp]
	mov	esi,DWORD [28+esp]
	mov	ebx,DWORD [32+esp]
	call	L$012pic
L$012pic:
	pop	ecx
	lea	ecx,[(L$bswap-L$012pic)+ecx]
	movdqu	xmm0,[eax]
	movdqa	xmm5,[ecx]		; byte-reversal shuffle mask
	movdqu	xmm2,[edx]
db	102,15,56,0,197			; pshufb xmm0,xmm5
	sub	ebx,16
	jz	NEAR L$013odd_tail	; exactly one block
	; Prime the pipeline with the first two input blocks.
	movdqu	xmm3,[esi]
	movdqu	xmm6,[16+esi]
db	102,15,56,0,221			; pshufb xmm3,xmm5
db	102,15,56,0,245			; pshufb xmm6,xmm5
	movdqu	xmm5,[32+edx]
	pxor	xmm0,xmm3
	pshufd	xmm3,xmm6,78
	movdqa	xmm7,xmm6
	pxor	xmm3,xmm6
	lea	esi,[32+esi]
db	102,15,58,68,242,0		; pclmulqdq xmm6,xmm2,0x00
db	102,15,58,68,250,17		; pclmulqdq xmm7,xmm2,0x11
db	102,15,58,68,221,0		; pclmulqdq xmm3,xmm5,0x00
	movups	xmm2,[16+edx]
	nop
	sub	ebx,32
	jbe	NEAR L$014even_tail
	jmp	NEAR L$015mod_loop
align	32
L$015mod_loop:
	; Finish the multiply for the previous pair while loading and
	; starting the multiply for the next pair of input blocks.
	pshufd	xmm4,xmm0,78
	movdqa	xmm1,xmm0
	pxor	xmm4,xmm0
	nop
db	102,15,58,68,194,0		; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17		; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,229,16		; pclmulqdq xmm4,xmm5,0x10
	movups	xmm2,[edx]
	xorps	xmm0,xmm6
	movdqa	xmm5,[ecx]
	xorps	xmm1,xmm7
	movdqu	xmm7,[esi]
	pxor	xmm3,xmm0
	movdqu	xmm6,[16+esi]
	pxor	xmm3,xmm1
db	102,15,56,0,253			; pshufb xmm7,xmm5
	pxor	xmm4,xmm3
	movdqa	xmm3,xmm4
	psrldq	xmm4,8
	pslldq	xmm3,8
	pxor	xmm1,xmm4
	pxor	xmm0,xmm3
db	102,15,56,0,245			; pshufb xmm6,xmm5
	pxor	xmm1,xmm7
	movdqa	xmm7,xmm6
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
db	102,15,58,68,242,0		; pclmulqdq xmm6,xmm2,0x00
	movups	xmm5,[32+edx]
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	pshufd	xmm3,xmm7,78
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm3,xmm7
	pxor	xmm1,xmm4
db	102,15,58,68,250,17		; pclmulqdq xmm7,xmm2,0x11
	movups	xmm2,[16+edx]
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
db	102,15,58,68,221,0		; pclmulqdq xmm3,xmm5,0x00
	lea	esi,[32+esi]
	sub	ebx,32
	ja	NEAR L$015mod_loop
L$014even_tail:
	; Complete the last pending pair without loading further input.
	pshufd	xmm4,xmm0,78
	movdqa	xmm1,xmm0
	pxor	xmm4,xmm0
db	102,15,58,68,194,0		; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17		; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,229,16		; pclmulqdq xmm4,xmm5,0x10
	movdqa	xmm5,[ecx]
	xorps	xmm0,xmm6
	xorps	xmm1,xmm7
	pxor	xmm3,xmm0
	pxor	xmm3,xmm1
	pxor	xmm4,xmm3
	movdqa	xmm3,xmm4
	psrldq	xmm4,8
	pslldq	xmm3,8
	pxor	xmm1,xmm4
	pxor	xmm0,xmm3
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
	test	ebx,ebx
	jnz	NEAR L$016done		; no leftover block
	movups	xmm2,[edx]
L$013odd_tail:
	; Single remaining block: XOR it in and multiply once.
	movdqu	xmm3,[esi]
db	102,15,56,0,221			; pshufb xmm3,xmm5
	pxor	xmm0,xmm3
	movdqa	xmm1,xmm0
	pshufd	xmm3,xmm0,78
	pshufd	xmm4,xmm2,78
	pxor	xmm3,xmm0
	pxor	xmm4,xmm2
db	102,15,58,68,194,0		; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17		; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,220,0		; pclmulqdq xmm3,xmm4,0x00
	xorps	xmm3,xmm0
	xorps	xmm3,xmm1
	movdqa	xmm4,xmm3
	psrldq	xmm3,8
	pslldq	xmm4,8
	pxor	xmm1,xmm3
	pxor	xmm0,xmm4
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
L$016done:
db	102,15,56,0,197			; pshufb xmm0,xmm5
	movdqu	[eax],xmm0
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;-----------------------------------------------------------------------
; Constant data (kept in the code section and reached via call/pop PIC).
;-----------------------------------------------------------------------
align	64
; L$bswap+0 : pshufb control mask that reverses the 16 bytes of a vector.
; L$bswap+16: 16-byte constant (low byte 1, top byte 194 = 0xC2) ANDed
;             with the carry mask in _gcm_init_clmul.
L$bswap:
db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
align	64
; 256 words, indexed by byte value (used by _gcm_ghash_4bit_mmx via
; `pinsrw mmN,WORD [reg*2+esi]`).
L$rem_8bit:
dw	0,450,900,582,1800,1738,1164,1358
dw	3600,4050,3476,3158,2328,2266,2716,2910
dw	7200,7650,8100,7782,6952,6890,6316,6510
dw	4656,5106,4532,4214,5432,5370,5820,6014
dw	14400,14722,15300,14854,16200,16010,15564,15630
dw	13904,14226,13780,13334,12632,12442,13020,13086
dw	9312,9634,10212,9766,9064,8874,8428,8494
dw	10864,11186,10740,10294,11640,11450,12028,12094
dw	28800,28994,29444,29382,30600,30282,29708,30158
dw	32400,32594,32020,31958,31128,30810,31260,31710
dw	27808,28002,28452,28390,27560,27242,26668,27118
dw	25264,25458,24884,24822,26040,25722,26172,26622
dw	18624,18690,19268,19078,20424,19978,19532,19854
dw	18128,18194,17748,17558,16856,16410,16988,17310
dw	21728,21794,22372,22182,21480,21034,20588,20910
dw	23280,23346,22900,22710,24056,23610,24188,24510
dw	57600,57538,57988,58182,58888,59338,58764,58446
dw	61200,61138,60564,60758,59416,59866,60316,59998
dw	64800,64738,65188,65382,64040,64490,63916,63598
dw	62256,62194,61620,61814,62520,62970,63420,63102
dw	55616,55426,56004,56070,56904,57226,56780,56334
dw	55120,54930,54484,54550,53336,53658,54236,53790
dw	50528,50338,50916,50982,49768,50090,49644,49198
dw	52080,51890,51444,51510,52344,52666,53244,52798
dw	37248,36930,37380,37830,38536,38730,38156,38094
dw	40848,40530,39956,40406,39064,39258,39708,39646
dw	36256,35938,36388,36838,35496,35690,35116,35054
dw	33712,33394,32820,33270,33976,34170,34620,34558
dw	43456,43010,43588,43910,44744,44810,44364,44174
dw	42960,42514,42068,42390,41176,41242,41820,41630
dw	46560,46114,46692,47014,45800,45866,45420,45230
dw	48112,47666,47220,47542,48376,48442,49020,48830
align	64
; 16 qwords (low dword 0, high dword = reduction value), indexed by
; nibble (used by _gcm_gmult_4bit_mmx via `pxor mm1,[ebx*8+eax]`).
L$rem_4bit:
dd	0,0,0,471859200,0,943718400,0,610271232
dd	0,1887436800,0,1822425088,0,1220542464,0,1423966208
dd	0,3774873600,0,4246732800,0,3644850176,0,3311403008
dd	0,2441084928,0,2376073216,0,2847932416,0,3051356160
; NUL-terminated ASCII: "GHASH for x86, CRYPTOGAMS by <appro@openssl.org>"
db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
db	0