#if defined(__aarch64__)
.text

// bn_mul_mont(rp, ap, bp, np, n0, num)
//
// Montgomery multiplication: rp[] = ap[] * bp[] * R^-1 mod np[],
// where R = 2^(64*num).  Register contract (as used throughout this
// file's own comments):
//	x0 = rp   (result, num 64-bit words)
//	x1 = ap   (first operand)
//	x2 = bp   (second operand; bp == ap selects the squaring path)
//	x3 = np   (modulus)
//	x4 = &n0  (pointer to the Montgomery constant -np^-1 mod 2^64)
//	x5 = num  (length in 64-bit words)
// Returns 1 in x0.  Temporary storage is alloca'd below sp and wiped
// before return.
//
// Dispatch: num divisible by 8 goes to the 8x squaring/4x code,
// num divisible by 4 goes to the 4x multiply code, everything else
// falls through to the generic one-word-at-a-time loop below.
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

// First outer iteration: tp[] = ap[]*bp[0] + m1*np[], one word per pass.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

// Remaining outer iterations: tp[] = (tp[] + ap[]*bp[i] + m1*np[]) / 2^64.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Constant-flow select between tp[] (borrowed) and the
	// subtracted value already in rp[]; tp is wiped as we go.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

// __bn_sqr8x_mont: squaring path for num divisible by 8.  Requires
// ap == bp (checked below; otherwise falls through to the 4x multiply).
// Same argument registers as bn_mul_mont.  Strategy per the comments
// below: compute the off-diagonal products a[i]*a[j] (i>j) once,
// double them by shifting, add the diagonal squares a[i]^2, then do
// 512 bits of Montgomery reduction per iteration.
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

// Zero the 2*num-word temporary area, 16 words per pass.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// 1-bit left shift via extract
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Constant-flow select between t[] and t[]-n[]; wipe t[] as we go.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

// num == 8 special case: the whole result is still in registers.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

// __bn_mul4x_mont: multiplication path for num divisible by 4
// (and the ap != bp fallback from the 8x squaring entry).
// Same argument registers as bn_mul_mont.  Processes a[0..3]/n[0..3]
// blocks of four limbs at a time, interleaving the Montgomery
// reduction with the multiplication.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// x28 cycles 8,16,24,0 as b[i] index
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) same carry trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Constant-flow select between t[] and t[]-n[]; wipe t[] as we go.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// num == 4 special case: the whole result is still in registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// ASCII: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif