%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes .00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
section	.text	code
%endif
;extern	_OPENSSL_ia32cap_P
;-----------------------------------------------------------------------
; _bn_mul_mont
;
; 32-bit x86 routine (NASM syntax).  The identification string stored
; after the code reads "Montgomery Multiplication for x86, CRYPTOGAMS by
; <appro@openssl.org>".
;
; Arguments: six dwords on the stack.  The routine returns with a plain
; `ret`, so the caller removes them.  After the four register pushes the
; arguments are found at:
;   [20+esp] arg0  pointer to the output vector (written in common_tail)
;   [24+esp] arg1  pointer to the first input vector
;   [28+esp] arg2  pointer to the second input vector
;   [32+esp] arg3  pointer to the vector subtracted in common_tail
;   [36+esp] arg4  pointer; only the first dword it points to is read
;   [40+esp] arg5  signed word count
; NOTE(review): presumably these are rp, ap, bp, np, n0 and num of the
; C-side bn_mul_mont prototype -- confirm against the declaring header.
;
; Returns: eax = 0 without touching memory when arg5 < 4 (signed),
;          eax = 1 otherwise.
; Preserves ebp, ebx, esi, edi (pushed on entry, popped on exit).
; Clobbers eax, ecx, edx, flags; the SSE2 path also uses mm0-mm7 and
; executes emms before joining the common tail.
;
; Private frame (addressed from the relocated esp):
;   [0+esp]   word count minus one (written by the squaring path only)
;   [4+esp]   copy of arg0
;   [8+esp]   copy of arg1
;   [12+esp]  copy of arg2; the integer multiply path advances it as a
;             cursor, the squaring path reuses the slot as an index
;   [16+esp]  copy of arg3
;   [20+esp]  dword loaded from [arg4]
;   [24+esp]  caller's esp, restored before returning
;   [28+esp]  integer multiply path: arg2 + 4*arg5 (end-of-vector pointer)
;   [32+esp]  temporary vector of arg5+2 dwords
;-----------------------------------------------------------------------
global	_bn_mul_mont
align	16
_bn_mul_mont:
L$_bn_mul_mont_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	xor	eax,eax				; return value for the early exit
	mov	edi,DWORD [40+esp]		; edi = arg5 (word count)
	cmp	edi,4
	jl	NEAR L$000just_leave		; counts below 4 are refused
	lea	esi,[20+esp]			; esi -> argument block
	lea	edx,[24+esp]			; edx = address of the arg1 slot
	add	edi,2				; two extra words on top of the temp vector
	neg	edi
	lea	ebp,[edi*4+esp-32]		; ebp = candidate frame base: 32 + 4*(count+2) bytes below esp
	neg	edi
	; Frame placement relative to edx: first make (ebp-edx) a multiple of
	; 2048, then force bit 11 of ebp to differ from bit 11 of edx.
	mov	eax,ebp
	sub	eax,edx
	and	eax,2047
	sub	ebp,eax
	xor	edx,ebp
	and	edx,2048
	xor	edx,2048
	sub	ebp,edx
	and	ebp,-64				; 64-byte align the frame base
	; Stack probe.  The frame can lie more than one 4096-byte page below
	; the current esp.  Instead of jumping esp straight to ebp, land within
	; one page of the old esp and step down a page at a time, reading one
	; dword at every step, so that stack pages are touched in order and a
	; guard page is never skipped.
	mov	eax,esp
	sub	eax,ebp
	and	eax,-4096			; eax = whole pages between esp and ebp
	mov	edx,esp				; edx = caller's esp, stored at [24+esp] below
	lea	esp,[eax*1+ebp]			; less than one page below the old esp
	mov	eax,DWORD [esp]			; touch this page
	cmp	esp,ebp
	ja	NEAR L$016page_walk
	jmp	NEAR L$017page_walk_done
align	16
L$016page_walk:
	lea	esp,[esp-4096]
	mov	eax,DWORD [esp]			; touch the next lower page
	cmp	esp,ebp
	ja	NEAR L$016page_walk
L$017page_walk_done:
	; esp == ebp here.  Copy the argument block into the frame.
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	ebp,DWORD [12+esi]
	mov	esi,DWORD [16+esi]
	mov	esi,DWORD [esi]			; dereference arg4 once
	mov	DWORD [4+esp],eax
	mov	DWORD [8+esp],ebx
	mov	DWORD [12+esp],ecx
	mov	DWORD [16+esp],ebp
	mov	DWORD [20+esp],esi
	lea	ebx,[edi-3]			; edi = count+2, so ebx = count-1
	mov	DWORD [24+esp],edx		; remember the caller's esp
	lea	eax,[_OPENSSL_ia32cap_P]
	bt	DWORD [eax],26			; bit 26 of the first capability dword selects the SSE2 path
	jnc	NEAR L$001non_sse2
	; ---- SSE2 path: 32x32->64 multiplies via pmuludq on MMX registers ----
	mov	eax,-1
	movd	mm7,eax				; mm7 = low-dword mask
	mov	esi,DWORD [8+esp]		; esi = arg1
	mov	edi,DWORD [12+esp]		; edi = arg2
	mov	ebp,DWORD [16+esp]		; ebp = arg3
	xor	edx,edx				; edx = outer index
	xor	ecx,ecx				; ecx = inner index
	movd	mm4,DWORD [edi]
	movd	mm5,DWORD [esi]
	movd	mm3,DWORD [ebp]
	pmuludq	mm5,mm4
	movq	mm2,mm5
	movq	mm0,mm5
	pand	mm0,mm7
	pmuludq	mm5,[20+esp]			; multiply by the word taken from [arg4]
	pmuludq	mm3,mm5
	paddq	mm3,mm0
	movd	mm1,DWORD [4+ebp]
	movd	mm0,DWORD [4+esi]
	psrlq	mm2,32
	psrlq	mm3,32
	inc	ecx
align	16
L$0021st:
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	movd	mm1,DWORD [4+ecx*4+ebp]
	paddq	mm3,mm0
	movd	mm0,DWORD [4+ecx*4+esi]
	psrlq	mm2,32
	movd	DWORD [28+ecx*4+esp],mm3	; temp[ecx-1]
	psrlq	mm3,32
	lea	ecx,[1+ecx]
	cmp	ecx,ebx
	jl	NEAR L$0021st
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	paddq	mm3,mm0
	movd	DWORD [28+ecx*4+esp],mm3
	psrlq	mm2,32
	psrlq	mm3,32
	paddq	mm3,mm2
	movq	[32+ebx*4+esp],mm3		; top two words of the temp vector
	inc	edx
L$003outer:
	xor	ecx,ecx
	movd	mm4,DWORD [edx*4+edi]		; next word of the arg2 vector
	movd	mm5,DWORD [esi]
	movd	mm6,DWORD [32+esp]
	movd	mm3,DWORD [ebp]
	pmuludq	mm5,mm4
	paddq	mm5,mm6
	movq	mm0,mm5
	movq	mm2,mm5
	pand	mm0,mm7
	pmuludq	mm5,[20+esp]
	pmuludq	mm3,mm5
	paddq	mm3,mm0
	movd	mm6,DWORD [36+esp]
	movd	mm1,DWORD [4+ebp]
	movd	mm0,DWORD [4+esi]
	psrlq	mm2,32
	psrlq	mm3,32
	paddq	mm2,mm6
	inc	ecx
	dec	ebx				; ebx doubles as the inner loop counter
L$004inner:
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	movd	mm6,DWORD [36+ecx*4+esp]
	pand	mm0,mm7
	movd	mm1,DWORD [4+ecx*4+ebp]
	paddq	mm3,mm0
	movd	mm0,DWORD [4+ecx*4+esi]
	psrlq	mm2,32
	movd	DWORD [28+ecx*4+esp],mm3
	psrlq	mm3,32
	paddq	mm2,mm6
	dec	ebx
	lea	ecx,[1+ecx]			; lea keeps the flags from dec for jnz
	jnz	NEAR L$004inner
	mov	ebx,ecx				; restore ebx = count-1
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	paddq	mm3,mm0
	movd	DWORD [28+ecx*4+esp],mm3
	psrlq	mm2,32
	psrlq	mm3,32
	movd	mm6,DWORD [36+ebx*4+esp]
	paddq	mm3,mm2
	paddq	mm3,mm6
	movq	[32+ebx*4+esp],mm3
	lea	edx,[1+edx]
	cmp	edx,ebx
	jle	NEAR L$003outer
	emms					; leave MMX state before returning to the caller
	jmp	NEAR L$005common_tail
	; ---- integer path ----
align	16
L$001non_sse2:
	mov	esi,DWORD [8+esp]		; esi = arg1
	lea	ebp,[1+ebx]
	mov	edi,DWORD [12+esp]		; edi = arg2
	xor	ecx,ecx
	mov	edx,esi
	and	ebp,1				; ebp = count & 1
	sub	edx,edi				; edx = arg1 - arg2
	lea	eax,[4+ebx*4+edi]		; eax = arg2 + 4*count
	or	ebp,edx				; ZF set only if arg1 == arg2 and count is even
	mov	edi,DWORD [edi]			; mov leaves the flags from `or` intact
	jz	NEAR L$006bn_sqr_mont
	mov	DWORD [28+esp],eax		; end pointer for the arg2 cursor
	mov	eax,DWORD [esi]
	xor	edx,edx
align	16
L$007mull:
	mov	ebp,edx
	mul	edi
	add	ebp,eax
	lea	ecx,[1+ecx]
	adc	edx,0
	mov	eax,DWORD [ecx*4+esi]
	cmp	ecx,ebx
	mov	DWORD [28+ecx*4+esp],ebp	; temp[ecx-1]
	jl	NEAR L$007mull
	mov	ebp,edx
	mul	edi
	mov	edi,DWORD [20+esp]
	add	eax,ebp
	mov	esi,DWORD [16+esp]		; esi = arg3
	adc	edx,0
	imul	edi,DWORD [32+esp]		; edi = temp[0] * word from [arg4] (low 32 bits)
	mov	DWORD [32+ebx*4+esp],eax
	xor	ecx,ecx
	mov	DWORD [36+ebx*4+esp],edx
	mov	DWORD [40+ebx*4+esp],ecx
	mov	eax,DWORD [esi]
	mul	edi
	add	eax,DWORD [32+esp]		; only the carry out of this add is kept
	mov	eax,DWORD [4+esi]
	adc	edx,0
	inc	ecx
	jmp	NEAR L$0082ndmadd
align	16
L$0091stmadd:
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ecx*4+esp]
	lea	ecx,[1+ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD [28+ecx*4+esp],ebp
	jl	NEAR L$0091stmadd
	mov	ebp,edx
	mul	edi
	add	eax,DWORD [32+ebx*4+esp]
	mov	edi,DWORD [20+esp]
	adc	edx,0
	mov	esi,DWORD [16+esp]
	add	ebp,eax
	adc	edx,0
	imul	edi,DWORD [32+esp]
	xor	ecx,ecx
	add	edx,DWORD [36+ebx*4+esp]
	mov	DWORD [32+ebx*4+esp],ebp
	adc	ecx,0
	mov	eax,DWORD [esi]
	mov	DWORD [36+ebx*4+esp],edx
	mov	DWORD [40+ebx*4+esp],ecx
	mul	edi
	add	eax,DWORD [32+esp]
	mov	eax,DWORD [4+esi]
	adc	edx,0
	mov	ecx,1
align	16
L$0082ndmadd:
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ecx*4+esp]
	lea	ecx,[1+ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD [24+ecx*4+esp],ebp	; result is stored one word lower than it was read
	jl	NEAR L$0082ndmadd
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ebx*4+esp]
	adc	edx,0
	add	ebp,eax
	adc	edx,0
	mov	DWORD [28+ebx*4+esp],ebp
	xor	eax,eax
	mov	ecx,DWORD [12+esp]		; ecx = arg2 cursor
	add	edx,DWORD [36+ebx*4+esp]
	adc	eax,DWORD [40+ebx*4+esp]
	lea	ecx,[4+ecx]			; advance the cursor by one word
	mov	DWORD [32+ebx*4+esp],edx
	cmp	ecx,DWORD [28+esp]		; reached arg2 + 4*count?
	mov	DWORD [36+ebx*4+esp],eax
	je	NEAR L$005common_tail
	mov	edi,DWORD [ecx]
	mov	esi,DWORD [8+esp]
	mov	DWORD [12+esp],ecx
	xor	ecx,ecx
	xor	edx,edx
	mov	eax,DWORD [esi]
	jmp	NEAR L$0091stmadd
	; ---- integer squaring path (arg1 == arg2, even count) ----
align	16
L$006bn_sqr_mont:
	mov	DWORD [esp],ebx			; [0+esp] = count-1
	mov	DWORD [12+esp],ecx		; [12+esp] = outer index, starts at 0
	mov	eax,edi
	mul	edi
	mov	DWORD [32+esp],eax
	mov	ebx,edx
	shr	edx,1
	and	ebx,1
	inc	ecx
align	16
L$010sqr:
	mov	eax,DWORD [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	lea	ecx,[1+ecx]
	adc	edx,0
	lea	ebp,[eax*2+ebx]			; doubled low word plus the bit carried in ebx
	shr	eax,31				; bit shifted out by the doubling
	cmp	ecx,DWORD [esp]
	mov	ebx,eax
	mov	DWORD [28+ecx*4+esp],ebp
	jl	NEAR L$010sqr
	mov	eax,DWORD [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	mov	edi,DWORD [20+esp]
	adc	edx,0
	mov	esi,DWORD [16+esp]		; esi = arg3
	lea	ebp,[eax*2+ebx]
	imul	edi,DWORD [32+esp]
	shr	eax,31
	mov	DWORD [32+ecx*4+esp],ebp
	lea	ebp,[edx*2+eax]
	mov	eax,DWORD [esi]
	shr	edx,31
	mov	DWORD [36+ecx*4+esp],ebp
	mov	DWORD [40+ecx*4+esp],edx
	mul	edi
	add	eax,DWORD [32+esp]
	mov	ebx,ecx
	adc	edx,0
	mov	eax,DWORD [4+esi]
	mov	ecx,1
align	16
L$0113rdmadd:
	; Two words of the arg3 vector are processed per iteration.
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ecx*4+esp]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [4+ecx*4+esi]
	adc	edx,0
	mov	DWORD [28+ecx*4+esp],ebp
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [36+ecx*4+esp]
	lea	ecx,[2+ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD [24+ecx*4+esp],ebp
	jl	NEAR L$0113rdmadd
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD [32+ebx*4+esp]
	adc	edx,0
	add	ebp,eax
	adc	edx,0
	mov	DWORD [28+ebx*4+esp],ebp
	mov	ecx,DWORD [12+esp]		; ecx = outer index
	xor	eax,eax
	mov	esi,DWORD [8+esp]		; esi = arg1
	add	edx,DWORD [36+ebx*4+esp]
	adc	eax,DWORD [40+ebx*4+esp]
	mov	DWORD [32+ebx*4+esp],edx
	cmp	ecx,ebx
	mov	DWORD [36+ebx*4+esp],eax
	je	NEAR L$005common_tail		; outer index reached count-1
	mov	edi,DWORD [4+ecx*4+esi]
	lea	ecx,[1+ecx]
	mov	eax,edi
	mov	DWORD [12+esp],ecx
	mul	edi
	add	eax,DWORD [32+ecx*4+esp]
	adc	edx,0
	mov	DWORD [32+ecx*4+esp],eax
	xor	ebp,ebp
	cmp	ecx,ebx
	lea	ecx,[1+ecx]			; lea keeps the flags from cmp for je
	je	NEAR L$012sqrlast
	mov	ebx,edx
	shr	edx,1
	and	ebx,1
align	16
L$013sqradd:
	mov	eax,DWORD [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	lea	ebp,[eax*1+eax]
	adc	edx,0
	shr	eax,31
	add	ebp,DWORD [32+ecx*4+esp]
	lea	ecx,[1+ecx]
	adc	eax,0
	add	ebp,ebx
	adc	eax,0
	cmp	ecx,DWORD [esp]
	mov	DWORD [28+ecx*4+esp],ebp
	mov	ebx,eax
	jle	NEAR L$013sqradd
	mov	ebp,edx
	add	edx,edx
	shr	ebp,31
	add	edx,ebx
	adc	ebp,0
L$012sqrlast:
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [16+esp]
	imul	edi,DWORD [32+esp]
	add	edx,DWORD [32+ecx*4+esp]
	mov	eax,DWORD [esi]
	adc	ebp,0
	mov	DWORD [32+ecx*4+esp],edx
	mov	DWORD [36+ecx*4+esp],ebp
	mul	edi
	add	eax,DWORD [32+esp]
	lea	ebx,[ecx-1]
	adc	edx,0
	mov	ecx,1
	mov	eax,DWORD [4+esi]
	jmp	NEAR L$0113rdmadd
	; ---- common tail: subtract, select, scrub, return ----
align	16
L$005common_tail:
	mov	ebp,DWORD [16+esp]		; ebp = arg3
	mov	edi,DWORD [4+esp]		; edi = arg0 (output)
	lea	esi,[32+esp]			; esi -> temp vector
	mov	eax,DWORD [esi]
	mov	ecx,ebx				; ecx = count-1
	xor	edx,edx				; index = 0; also clears CF for the first sbb
align	16
L$014sub:
	sbb	eax,DWORD [edx*4+ebp]
	mov	DWORD [edx*4+edi],eax		; out[i] = temp[i] - arg3[i] - borrow
	dec	ecx				; dec leaves CF (the borrow) untouched
	mov	eax,DWORD [4+edx*4+esi]
	lea	edx,[1+edx]
	jge	NEAR L$014sub
	sbb	eax,0				; fold the final borrow into the top temp word
align	16
L$015copy:
	; eax acts as a bit mask: out[i] = ((temp[i] ^ out[i]) & eax) ^ out[i],
	; i.e. temp[i] where mask bits are 1 and the difference where they are 0.
	mov	edx,DWORD [ebx*4+esi]
	mov	ebp,DWORD [ebx*4+edi]
	xor	edx,ebp
	and	edx,eax
	xor	edx,ebp
	mov	DWORD [ebx*4+esi],ecx		; overwrite the temp word (ecx is -1 after the sub loop)
	mov	DWORD [ebx*4+edi],edx
	dec	ebx
	jge	NEAR L$015copy
	mov	esp,DWORD [24+esp]		; back to the caller's stack
	mov	eax,1
L$000just_leave:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; ASCII, NUL-terminated:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
db	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
db	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
db	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
db	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
db	111,114,103,62,0
segment	.bss
common	_OPENSSL_ia32cap_P 16