#if defined(__aarch64__)
.text

// -----------------------------------------------------------------------
// int bn_mul_mont(rp=x0, ap=x1, bp=x2, np=x3, n0=x4, num=x5)
//
// Montgomery multiplication: rp[] = ap[] * bp[] * R^-1 mod np[],
// where the arrays are num 64-bit limbs and x4 points at the
// precomputed n0 = -np[0]^-1 mod 2^64 (loaded with `ldr x4,[x4]`).
// Returns 1 in x0 (see epilogue).  AAPCS64; x19-x24 are saved in the
// 64-byte frame; temporary tp[] lives on a 16-byte-aligned alloca.
//
// Dispatch: num divisible by 8 -> __bn_sqr8x_mont (which itself falls
// through to the 4x path unless ap==bp); num divisible by 4 ->
// __bn_mul4x_mont; otherwise the word-by-word loop below.
// -----------------------------------------------------------------------
.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7			// num % 8 == 0 ?
	b.eq	__bn_sqr8x_mont
	tst	x5,#3			// num % 4 == 0 ?
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// reserve num*8 bytes for tp[]
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3		// num *= 8 (byte count from here on)
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so (sp must stay 16-aligned)
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*) sets C iff x6 != 0
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

	// First outer iteration (i=0): accumulate ap[j]*bp[0] and
	// np[j]*m1 into tp[].  Carry is threaded through x7/x13.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Outer loop over remaining bp[i], i = 1..num-1.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = tp[0]*n0
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*) same carry trick as above
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr		// modulo-scheduled: consumes C from before the branch
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Constant-time select between tp[] and rp[] on the borrow
	// flag (csel ... lo), wiping tp[] as we go.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release alloca
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

// -----------------------------------------------------------------------
// __bn_sqr8x_mont: Montgomery squaring for num divisible by 8.
// Only used when ap == bp (checked below); otherwise falls through to
// the 4x multiplication path.  Body continues on the following lines.
// -----------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2			// squaring only valid if ap == bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	// (continuation of __bn_sqr8x_mont: prologue started on the
	// preceding lines with `stp x29,x30,[sp,#-128]!`)
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]		// a[0..7] live in x6-x13 throughout
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// t[] gets 2*num limbs (num*16 bytes)
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

	// Zero the 2*num-limb t[] scratch area, 16 limbs per pass.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num] (end sentinel)
	add	x1,x1,#8*8
	mov	x19,xzr			// x19-x26: 8-limb accumulator window
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                             a[f]a[1]........................
	//                                                 a[8]a[2]
	//                         a[f]a[2]........................
	//                                             a[8]a[3]
	//                     a[f]a[3]........................
	//                                         a[8]a[4]
	//                 a[f]a[4]........................
	//                                     a[8]a[5]
	//             a[f]a[5]........................
	//                                 a[8]a[6]
	//         a[f]a[6]........................
	//                             a[8]a[7]
	//     a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// extr chains shift t[] left by 1 bit
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo		// constant-time select on borrow
	stp	xzr,xzr,[x2,#8*0]	// wipe t[]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

	// num == 8 special case: whole result is still in registers.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]	// wipe t[] as we go
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	// (continuation of .Lsqr8x8_post_condition: borrow flag from the
	// preceding sbcs chain is still live and drives the csels below)
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo		// constant-time select on borrow
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release alloca
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

// -----------------------------------------------------------------------
// __bn_mul4x_mont: Montgomery multiplication processing 4 limbs of a[]
// and n[] at a time, used when num is divisible by 4 (and as the
// fallback from __bn_sqr8x_mont when ap != bp).  Same register
// arguments as bn_mul_mont.  x28 cycles 0..24 (and #31 mask) to index
// the current b[i] within a 4-word group; x0 carries the
// modulo-scheduled overflow; x30 holds the running top-most carry.
// -----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// num*8 bytes for t[] ...
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca (... plus 4 spare limbs)

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num] (end sentinel)
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// x19-x22: 4-limb accumulator window
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// b[i] index wraps every 4 words
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 accumulates top-most carry
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// constant-time select on borrow
	stp	xzr,xzr,[x26,#8*0]	// wipe t[]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

	// num == 4 special case: whole result is still in registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe t[]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release alloca
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif