// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include <GFp/arm_arch.h>

.text

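// Note on the interface (an editorial annotation, not generated code):
// assuming the conventional bn_mul_mont signature, which this file does
// not state itself, the arguments arrive as x0=rp (result), x1=ap,
// x2=bp, x3=np (modulus), x4=&n0 (with n0 = -np[0]^-1 mod 2^64) and
// x5=num (number of 64-bit limbs). The dispatch below selects a
// specialization: num divisible by 8 goes to __bn_sqr8x_mont, num
// divisible by 4 goes to __bn_mul4x_mont, and the generic loop at
// .Lmul_mont handles the rest.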
.globl GFp_bn_mul_mont
.hidden GFp_bn_mul_mont
.type GFp_bn_mul_mont,%function
.align 5
GFp_bn_mul_mont:
    AARCH64_SIGN_LINK_REGISTER
    tst x5,#7
    b.eq __bn_sqr8x_mont
    tst x5,#3
    b.eq __bn_mul4x_mont
.Lmul_mont:
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]

    ldr x9,[x2],#8 // bp[0]
    sub x22,sp,x5,lsl#3
    ldp x7,x8,[x1],#16 // ap[0..1]
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    and x22,x22,#-16 // ABI says so
    ldp x13,x14,[x3],#16 // np[0..1]

    mul x6,x7,x9 // ap[0]*bp[0]
    sub x21,x5,#16 // j=num-2
    umulh x7,x7,x9
    mul x10,x8,x9 // ap[1]*bp[0]
    umulh x11,x8,x9

    mul x15,x6,x4 // "tp[0]"*n0
    mov sp,x22 // alloca

    // (*) mul x12,x13,x15 // np[0]*m1
    umulh x13,x13,x15
    mul x16,x14,x15 // np[1]*m1
    // (*) adds x12,x12,x6 // discarded
    // (*) On the removal of the first multiplication and addition
    //     instructions: the outcome of the first addition is
    //     guaranteed to be zero, which leaves just two computationally
    //     significant outcomes: it either carries or it does not. So
    //     when does it carry? Following the operations shows that the
    //     condition for a carry is simply x6 being non-zero, so the
    //     carry can be calculated by adding -1 to x6. That is what
    //     the next instruction does.
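    // Note (editorial, not generated code): a worked check of the trick
    // above. "subs xzr,x6,#1" computes x6-1 and discards the result; on
    // AArch64 the carry flag is set when a subtraction does not borrow,
    // so C = (x6 >= 1) = (x6 != 0). For example, x6 = 0 gives 0-1, which
    // borrows, so C = 0; any non-zero x6 leaves C = 1. The "adc" below
    // then folds in exactly the carry that the omitted mul/adds pair
    // would have produced, since lo(np[0]*m1) + x6 == 0 mod 2^64 and
    // that sum carries precisely when x6 != 0.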
    subs xzr,x6,#1 // (*)
    umulh x17,x14,x15
    adc x13,x13,xzr
    cbz x21,.L1st_skip

.L1st:
    ldr x8,[x1],#8
    adds x6,x10,x7
    sub x21,x21,#8 // j--
    adc x7,x11,xzr

    ldr x14,[x3],#8
    adds x12,x16,x13
    mul x10,x8,x9 // ap[j]*bp[0]
    adc x13,x17,xzr
    umulh x11,x8,x9

    adds x12,x12,x6
    mul x16,x14,x15 // np[j]*m1
    adc x13,x13,xzr
    umulh x17,x14,x15
    str x12,[x22],#8 // tp[j-1]
    cbnz x21,.L1st

.L1st_skip:
    adds x6,x10,x7
    sub x1,x1,x5 // rewind x1
    adc x7,x11,xzr

    adds x12,x16,x13
    sub x3,x3,x5 // rewind x3
    adc x13,x17,xzr

    adds x12,x12,x6
    sub x20,x5,#8 // i=num-1
    adcs x13,x13,x7

    adc x19,xzr,xzr // topmost overflow bit
    stp x12,x13,[x22]

.Louter:
    ldr x9,[x2],#8 // bp[i]
    ldp x7,x8,[x1],#16
    ldr x23,[sp] // tp[0]
    add x22,sp,#8

    mul x6,x7,x9 // ap[0]*bp[i]
    sub x21,x5,#16 // j=num-2
    umulh x7,x7,x9
    ldp x13,x14,[x3],#16
    mul x10,x8,x9 // ap[1]*bp[i]
    adds x6,x6,x23
    umulh x11,x8,x9
    adc x7,x7,xzr

    mul x15,x6,x4
    sub x20,x20,#8 // i--

    // (*) mul x12,x13,x15 // np[0]*m1
    umulh x13,x13,x15
    mul x16,x14,x15 // np[1]*m1
    // (*) adds x12,x12,x6
    subs xzr,x6,#1 // (*)
    umulh x17,x14,x15
    cbz x21,.Linner_skip

.Linner:
    ldr x8,[x1],#8
    adc x13,x13,xzr
    ldr x23,[x22],#8 // tp[j]
    adds x6,x10,x7
    sub x21,x21,#8 // j--
    adc x7,x11,xzr

    adds x12,x16,x13
    ldr x14,[x3],#8
    adc x13,x17,xzr

    mul x10,x8,x9 // ap[j]*bp[i]
    adds x6,x6,x23
    umulh x11,x8,x9
    adc x7,x7,xzr

    mul x16,x14,x15 // np[j]*m1
    adds x12,x12,x6
    umulh x17,x14,x15
    str x12,[x22,#-16] // tp[j-1]
    cbnz x21,.Linner

.Linner_skip:
    ldr x23,[x22],#8 // tp[j]
    adc x13,x13,xzr
    adds x6,x10,x7
    sub x1,x1,x5 // rewind x1
    adc x7,x11,xzr

    adds x12,x16,x13
    sub x3,x3,x5 // rewind x3
    adcs x13,x17,x19
    adc x19,xzr,xzr

    adds x6,x6,x23
    adc x7,x7,xzr

    adds x12,x12,x6
    adcs x13,x13,x7
    adc x19,x19,xzr // topmost overflow bit
    stp x12,x13,[x22,#-16]

    cbnz x20,.Louter

    // Final step. We check whether the result is larger than the
    // modulus and, if it is, subtract the modulus. But comparison
    // implies subtraction, so instead we subtract the modulus, check
    // whether the subtraction borrowed, and conditionally copy the
    // original value back.
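    // Note (editorial, not generated code): after the .Lsub loop below,
    // "sbcs x19,x19,xzr" folds the final borrow into the topmost
    // overflow bit, so the flags read "lo" (carry clear) exactly when
    // tp < np, i.e. when the subtraction must be discarded. Each
    // "csel ...,lo" in .Lcond_copy then picks tp[j] in that case and
    // the stored difference rp[j] = tp[j]-np[j] otherwise, with no
    // data-dependent branch, wiping the tp scratch area as it goes.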
    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    ldr x14,[x3],#8 // np[0]
    subs x21,x5,#8 // j=num-1 and clear borrow
    mov x1,x0
.Lsub:
    sbcs x8,x23,x14 // tp[j]-np[j]
    ldr x23,[x22],#8
    sub x21,x21,#8 // j--
    ldr x14,[x3],#8
    str x8,[x1],#8 // rp[j]=tp[j]-np[j]
    cbnz x21,.Lsub

    sbcs x8,x23,x14
    sbcs x19,x19,xzr // did it borrow?
    str x8,[x1],#8 // rp[num-1]

    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    ldr x8,[x0],#8 // rp[0]
    sub x5,x5,#8 // num--
    nop
.Lcond_copy:
    sub x5,x5,#8 // num--
    csel x14,x23,x8,lo // did it borrow?
    ldr x23,[x22],#8
    ldr x8,[x0],#8
    str xzr,[x22,#-16] // wipe tp
    str x14,[x0,#-16]
    cbnz x5,.Lcond_copy

    csel x14,x23,x8,lo
    str xzr,[x22,#-8] // wipe tp
    str x14,[x0,#-8]

    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldr x29,[sp],#64
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size GFp_bn_mul_mont,.-GFp_bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
    // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont
    // is jumped to only from GFp_bn_mul_mont, which has already signed
    // the return address.
    cmp x1,x2
    b.ne __bn_mul4x_mont
.Lsqr8x_mont:
    stp x29,x30,[sp,#-128]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    stp x0,x3,[sp,#96] // offload rp and np

    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    ldp x10,x11,[x1,#8*4]
    ldp x12,x13,[x1,#8*6]

    sub x2,sp,x5,lsl#4
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    mov sp,x2 // alloca
    sub x27,x5,#8*8
    b .Lsqr8x_zero_start

.Lsqr8x_zero:
    sub x27,x27,#8*8
    stp xzr,xzr,[x2,#8*0]
    stp xzr,xzr,[x2,#8*2]
    stp xzr,xzr,[x2,#8*4]
    stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
    stp xzr,xzr,[x2,#8*8]
    stp xzr,xzr,[x2,#8*10]
    stp xzr,xzr,[x2,#8*12]
    stp xzr,xzr,[x2,#8*14]
    add x2,x2,#8*16
    cbnz x27,.Lsqr8x_zero

    add x3,x1,x5
    add x1,x1,#8*8
    mov x19,xzr
    mov x20,xzr
    mov x21,xzr
    mov x22,xzr
    mov x23,xzr
    mov x24,xzr
    mov x25,xzr
    mov x26,xzr
    mov x2,sp
    str x4,[x29,#112] // offload n0

    // Multiply everything but a[i]*a[i]; see the note after the
    // schedule below for the squaring identity this implements.
.align 4
.Lsqr8x_outer_loop:
    // a[1]a[0] (i)
    // a[2]a[0]
    // a[3]a[0]
    // a[4]a[0]
    // a[5]a[0]
    // a[6]a[0]
    // a[7]a[0]
    // a[2]a[1] (ii)
    // a[3]a[1]
    // a[4]a[1]
    // a[5]a[1]
    // a[6]a[1]
    // a[7]a[1]
    // a[3]a[2] (iii)
    // a[4]a[2]
    // a[5]a[2]
    // a[6]a[2]
    // a[7]a[2]
    // a[4]a[3] (iv)
    // a[5]a[3]
    // a[6]a[3]
    // a[7]a[3]
    // a[5]a[4] (v)
    // a[6]a[4]
    // a[7]a[4]
    // a[6]a[5] (vi)
    // a[7]a[5]
    // a[7]a[6] (vii)

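    // Note (editorial, not generated code): the schedule above lists the
    // cross products a[i]*a[j], i < j, that this loop accumulates. It
    // relies on the squaring identity, with a = sum(a[i]*2^(64*i)):
    //
    //   a^2 = sum(a[i]^2 * 2^(128*i)) + 2*sum_{i<j}(a[i]*a[j] * 2^(64*(i+j)))
    //
    // The doubling and the a[i]^2 diagonal terms are added later,
    // starting at .Lsqr8x_outer_break.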
    mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
    mul x15,x8,x6
    mul x16,x9,x6
    mul x17,x10,x6
    adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
    mul x14,x11,x6
    adcs x21,x21,x15
    mul x15,x12,x6
    adcs x22,x22,x16
    mul x16,x13,x6
    adcs x23,x23,x17
    umulh x17,x7,x6 // hi(a[1..7]*a[0])
    adcs x24,x24,x14
    umulh x14,x8,x6
    adcs x25,x25,x15
    umulh x15,x9,x6
    adcs x26,x26,x16
    umulh x16,x10,x6
    stp x19,x20,[x2],#8*2 // t[0..1]
    adc x19,xzr,xzr // t[8]
    adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
    umulh x17,x11,x6
    adcs x22,x22,x14
    umulh x14,x12,x6
    adcs x23,x23,x15
    umulh x15,x13,x6
    adcs x24,x24,x16
    mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
    adcs x25,x25,x17
    mul x17,x9,x7
    adcs x26,x26,x14
    mul x14,x10,x7
    adc x19,x19,x15

    mul x15,x11,x7
    adds x22,x22,x16
    mul x16,x12,x7
    adcs x23,x23,x17
    mul x17,x13,x7
    adcs x24,x24,x14
    umulh x14,x8,x7 // hi(a[2..7]*a[1])
    adcs x25,x25,x15
    umulh x15,x9,x7
    adcs x26,x26,x16
    umulh x16,x10,x7
    adcs x19,x19,x17
    umulh x17,x11,x7
    stp x21,x22,[x2],#8*2 // t[2..3]
    adc x20,xzr,xzr // t[9]
    adds x23,x23,x14
    umulh x14,x12,x7
    adcs x24,x24,x15
    umulh x15,x13,x7
    adcs x25,x25,x16
    mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
    adcs x26,x26,x17
    mul x17,x10,x8
    adcs x19,x19,x14
    mul x14,x11,x8
    adc x20,x20,x15

    mul x15,x12,x8
    adds x24,x24,x16
    mul x16,x13,x8
    adcs x25,x25,x17
    umulh x17,x9,x8 // hi(a[3..7]*a[2])
    adcs x26,x26,x14
    umulh x14,x10,x8
    adcs x19,x19,x15
    umulh x15,x11,x8
    adcs x20,x20,x16
    umulh x16,x12,x8
    stp x23,x24,[x2],#8*2 // t[4..5]
    adc x21,xzr,xzr // t[10]
    adds x25,x25,x17
    umulh x17,x13,x8
    adcs x26,x26,x14
    mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
    adcs x19,x19,x15
    mul x15,x11,x9
    adcs x20,x20,x16
    mul x16,x12,x9
    adc x21,x21,x17

    mul x17,x13,x9
    adds x26,x26,x14
    umulh x14,x10,x9 // hi(a[4..7]*a[3])
    adcs x19,x19,x15
    umulh x15,x11,x9
    adcs x20,x20,x16
    umulh x16,x12,x9
    adcs x21,x21,x17
    umulh x17,x13,x9
    stp x25,x26,[x2],#8*2 // t[6..7]
    adc x22,xzr,xzr // t[11]
    adds x19,x19,x14
    mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
    adcs x20,x20,x15
    mul x15,x12,x10
    adcs x21,x21,x16
    mul x16,x13,x10
    adc x22,x22,x17

    umulh x17,x11,x10 // hi(a[5..7]*a[4])
    adds x20,x20,x14
    umulh x14,x12,x10
    adcs x21,x21,x15
    umulh x15,x13,x10
    adcs x22,x22,x16
    mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
    adc x23,xzr,xzr // t[12]
    adds x21,x21,x17
    mul x17,x13,x11
    adcs x22,x22,x14
    umulh x14,x12,x11 // hi(a[6..7]*a[5])
    adc x23,x23,x15

    umulh x15,x13,x11
    adds x22,x22,x16
    mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
    adcs x23,x23,x17
    umulh x17,x13,x12 // hi(a[7]*a[6])
    adc x24,xzr,xzr // t[13]
    adds x23,x23,x14
    sub x27,x3,x1 // done yet?
    adc x24,x24,x15

    adds x24,x24,x16
    sub x14,x3,x5 // rewound ap
    adc x25,xzr,xzr // t[14]
    add x25,x25,x17

    cbz x27,.Lsqr8x_outer_break

    mov x4,x6
    ldp x6,x7,[x2,#8*0]
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    adds x19,x19,x6
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x0,x1
    adcs x26,xzr,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved below
    mov x27,#-8*8

    // a[8]a[0]
    // a[9]a[0]
    // a[a]a[0]
    // a[b]a[0]
    // a[c]a[0]
    // a[d]a[0]
    // a[e]a[0]
    // a[f]a[0]
    // a[8]a[1]
    // a[f]a[1]........................
    // a[8]a[2]
    // a[f]a[2]........................
    // a[8]a[3]
    // a[f]a[3]........................
    // a[8]a[4]
    // a[f]a[4]........................
    // a[8]a[5]
    // a[f]a[5]........................
    // a[8]a[6]
    // a[f]a[6]........................
    // a[8]a[7]
    // a[f]a[7]........................
.Lsqr8x_mul:
    mul x14,x6,x4
    adc x28,xzr,xzr // carry bit, modulo-scheduled
    mul x15,x7,x4
    add x27,x27,#8
    mul x16,x8,x4
    mul x17,x9,x4
    adds x19,x19,x14
    mul x14,x10,x4
    adcs x20,x20,x15
    mul x15,x11,x4
    adcs x21,x21,x16
    mul x16,x12,x4
    adcs x22,x22,x17
    mul x17,x13,x4
    adcs x23,x23,x14
    umulh x14,x6,x4
    adcs x24,x24,x15
    umulh x15,x7,x4
    adcs x25,x25,x16
    umulh x16,x8,x4
    adcs x26,x26,x17
    umulh x17,x9,x4
    adc x28,x28,xzr
    str x19,[x2],#8
    adds x19,x20,x14
    umulh x14,x10,x4
    adcs x20,x21,x15
    umulh x15,x11,x4
    adcs x21,x22,x16
    umulh x16,x12,x4
    adcs x22,x23,x17
    umulh x17,x13,x4
    ldr x4,[x0,x27]
    adcs x23,x24,x14
    adcs x24,x25,x15
    adcs x25,x26,x16
    adcs x26,x28,x17
    //adc x28,xzr,xzr // moved above
    cbnz x27,.Lsqr8x_mul
    // note that carry flag is guaranteed
    // to be zero at this point
    cmp x1,x3 // done yet?
    b.eq .Lsqr8x_break

    ldp x6,x7,[x2,#8*0]
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    adds x19,x19,x6
    ldr x4,[x0,#-8*8]
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x27,#-8*8
    adcs x26,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved above
    b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
    ldp x6,x7,[x0,#8*0]
    add x1,x0,#8*8
    ldp x8,x9,[x0,#8*2]
    sub x14,x3,x1 // is it last iteration?
    ldp x10,x11,[x0,#8*4]
    sub x15,x2,x14
    ldp x12,x13,[x0,#8*6]
    cbz x14,.Lsqr8x_outer_loop

    stp x19,x20,[x2,#8*0]
    ldp x19,x20,[x15,#8*0]
    stp x21,x22,[x2,#8*2]
    ldp x21,x22,[x15,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[x15,#8*4]
    stp x25,x26,[x2,#8*6]
    mov x2,x15
    ldp x25,x26,[x15,#8*6]
    b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
    // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
    ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
    ldp x15,x16,[sp,#8*1]
    ldp x11,x13,[x14,#8*2]
    add x1,x14,#8*4
    ldp x17,x14,[sp,#8*3]

    stp x19,x20,[x2,#8*0]
    mul x19,x7,x7
    stp x21,x22,[x2,#8*2]
    umulh x7,x7,x7
    stp x23,x24,[x2,#8*4]
    mul x8,x9,x9
    stp x25,x26,[x2,#8*6]
    mov x2,sp
    umulh x9,x9,x9
    adds x20,x7,x15,lsl#1
    extr x15,x16,x15,#63
    sub x27,x5,#8*4

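    // Note (editorial, not generated code): the loop below performs the
    // doubling with extr instead of an add/adc chain. For instance
    // "extr x16,x17,x16,#63" computes (x17:x16) >> 63, i.e. x17<<1 with
    // the top bit of x16 shifted in, so each word is doubled while
    // inheriting the bit shifted out of its lower neighbour. The doubled
    // words are then added, with carry, to the a[i]^2 diagonal terms
    // produced by the mul/umulh pairs.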
.Lsqr4x_shift_n_add:
    adcs x21,x8,x15
    extr x16,x17,x16,#63
    sub x27,x27,#8*4
    adcs x22,x9,x16
    ldp x15,x16,[x2,#8*5]
    mul x10,x11,x11
    ldp x7,x9,[x1],#8*2
    umulh x11,x11,x11
    mul x12,x13,x13
    umulh x13,x13,x13
    extr x17,x14,x17,#63
    stp x19,x20,[x2,#8*0]
    adcs x23,x10,x17
    extr x14,x15,x14,#63
    stp x21,x22,[x2,#8*2]
    adcs x24,x11,x14
    ldp x17,x14,[x2,#8*7]
    extr x15,x16,x15,#63
    adcs x25,x12,x15
    extr x16,x17,x16,#63
    adcs x26,x13,x16
    ldp x15,x16,[x2,#8*9]
    mul x6,x7,x7
    ldp x11,x13,[x1],#8*2
    umulh x7,x7,x7
    mul x8,x9,x9
    umulh x9,x9,x9
    stp x23,x24,[x2,#8*4]
    extr x17,x14,x17,#63
    stp x25,x26,[x2,#8*6]
    add x2,x2,#8*8
    adcs x19,x6,x17
    extr x14,x15,x14,#63
    adcs x20,x7,x14
    ldp x17,x14,[x2,#8*3]
    extr x15,x16,x15,#63
    cbnz x27,.Lsqr4x_shift_n_add
    ldp x1,x4,[x29,#104] // pull np and n0

    adcs x21,x8,x15
    extr x16,x17,x16,#63
    adcs x22,x9,x16
    ldp x15,x16,[x2,#8*5]
    mul x10,x11,x11
    umulh x11,x11,x11
    stp x19,x20,[x2,#8*0]
    mul x12,x13,x13
    umulh x13,x13,x13
    stp x21,x22,[x2,#8*2]
    extr x17,x14,x17,#63
    adcs x23,x10,x17
    extr x14,x15,x14,#63
    ldp x19,x20,[sp,#8*0]
    adcs x24,x11,x14
    extr x15,x16,x15,#63
    ldp x6,x7,[x1,#8*0]
    adcs x25,x12,x15
    extr x16,xzr,x16,#63
    ldp x8,x9,[x1,#8*2]
    adc x26,x13,x16
    ldp x10,x11,[x1,#8*4]

    // Reduce by 512 bits per iteration
    mul x28,x4,x19 // t[0]*n0
    ldp x12,x13,[x1,#8*6]
    add x3,x1,x5
    ldp x21,x22,[sp,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[sp,#8*4]
    stp x25,x26,[x2,#8*6]
    ldp x25,x26,[sp,#8*6]
    add x1,x1,#8*8
    mov x30,xzr // initial top-most carry
    mov x2,sp
    mov x27,#8

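    // Note (editorial, not generated code): each pass of the loop below
    // clears the low 512 bits of t. For every word, m = t[0]*n0 mod 2^64
    // with n0 = -n[0]^-1 mod 2^64, so t[0] + lo(m*n[0]) == 0 mod 2^64
    // and t + m*n can be shifted down one word without losing anything;
    // eight such steps make up one pass. The "(*)" lines reuse the same
    // omitted-multiplication carry trick as in .Lmul_mont above.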
.Lsqr8x_reduction:
    // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
    mul x15,x7,x28
    sub x27,x27,#1
    mul x16,x8,x28
    str x28,[x2],#8 // put aside t[0]*n0 for tail processing
    mul x17,x9,x28
    // (*) adds xzr,x19,x14
    subs xzr,x19,#1 // (*)
    mul x14,x10,x28
    adcs x19,x20,x15
    mul x15,x11,x28
    adcs x20,x21,x16
    mul x16,x12,x28
    adcs x21,x22,x17
    mul x17,x13,x28
    adcs x22,x23,x14
    umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
    adcs x23,x24,x15
    umulh x15,x7,x28
    adcs x24,x25,x16
    umulh x16,x8,x28
    adcs x25,x26,x17
    umulh x17,x9,x28
    adc x26,xzr,xzr
    adds x19,x19,x14
    umulh x14,x10,x28
    adcs x20,x20,x15
    umulh x15,x11,x28
    adcs x21,x21,x16
    umulh x16,x12,x28
    adcs x22,x22,x17
    umulh x17,x13,x28
    mul x28,x4,x19 // next t[0]*n0
    adcs x23,x23,x14
    adcs x24,x24,x15
    adcs x25,x25,x16
    adc x26,x26,x17
    cbnz x27,.Lsqr8x_reduction

    ldp x14,x15,[x2,#8*0]
    ldp x16,x17,[x2,#8*2]
    mov x0,x2
    sub x27,x3,x1 // done yet?
    adds x19,x19,x14
    adcs x20,x20,x15
    ldp x14,x15,[x2,#8*4]
    adcs x21,x21,x16
    adcs x22,x22,x17
    ldp x16,x17,[x2,#8*6]
    adcs x23,x23,x14
    adcs x24,x24,x15
    adcs x25,x25,x16
    adcs x26,x26,x17
    //adc x28,xzr,xzr // moved below
    cbz x27,.Lsqr8x8_post_condition

    ldr x4,[x2,#-8*8]
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    ldp x10,x11,[x1,#8*4]
    mov x27,#-8*8
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8

.Lsqr8x_tail:
    mul x14,x6,x4
    adc x28,xzr,xzr // carry bit, modulo-scheduled
    mul x15,x7,x4
    add x27,x27,#8
    mul x16,x8,x4
    mul x17,x9,x4
    adds x19,x19,x14
    mul x14,x10,x4
    adcs x20,x20,x15
    mul x15,x11,x4
    adcs x21,x21,x16
    mul x16,x12,x4
    adcs x22,x22,x17
    mul x17,x13,x4
    adcs x23,x23,x14
    umulh x14,x6,x4
    adcs x24,x24,x15
    umulh x15,x7,x4
    adcs x25,x25,x16
    umulh x16,x8,x4
    adcs x26,x26,x17
    umulh x17,x9,x4
    adc x28,x28,xzr
    str x19,[x2],#8
    adds x19,x20,x14
    umulh x14,x10,x4
    adcs x20,x21,x15
    umulh x15,x11,x4
    adcs x21,x22,x16
    umulh x16,x12,x4
    adcs x22,x23,x17
    umulh x17,x13,x4
    ldr x4,[x0,x27]
    adcs x23,x24,x14
    adcs x24,x25,x15
    adcs x25,x26,x16
    adcs x26,x28,x17
    //adc x28,xzr,xzr // moved above
    cbnz x27,.Lsqr8x_tail
    // note that carry flag is guaranteed
    // to be zero at this point
    ldp x6,x7,[x2,#8*0]
    sub x27,x3,x1 // done yet?
    sub x16,x3,x5 // rewound np
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    cbz x27,.Lsqr8x_tail_break

    ldr x4,[x0,#-8*8]
    adds x19,x19,x6
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x27,#-8*8
    adcs x26,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved above
    b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
    ldr x4,[x29,#112] // pull n0
    add x27,x2,#8*8 // end of current t[num] window

    subs xzr,x30,#1 // "move" top-most carry to carry bit
    adcs x14,x19,x6
    adcs x15,x20,x7
    ldp x19,x20,[x0,#8*0]
    adcs x21,x21,x8
    ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
    adcs x22,x22,x9
    ldp x8,x9,[x16,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x16,#8*4]
    adcs x25,x25,x12
    adcs x26,x26,x13
    ldp x12,x13,[x16,#8*6]
    add x1,x16,#8*8
    adc x30,xzr,xzr // top-most carry
    mul x28,x4,x19
    stp x14,x15,[x2,#8*0]
    stp x21,x22,[x2,#8*2]
    ldp x21,x22,[x0,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[x0,#8*4]
    cmp x27,x29 // did we hit the bottom?
    stp x25,x26,[x2,#8*6]
    mov x2,x0 // slide the window
    ldp x25,x26,[x0,#8*6]
    mov x27,#8
    b.ne .Lsqr8x_reduction

    // Final step. We check whether the result is larger than the
    // modulus and, if it is, subtract the modulus. But comparison
    // implies subtraction, so instead we subtract the modulus, check
    // whether the subtraction borrowed, and conditionally copy the
    // original value back.
    ldr x0,[x29,#96] // pull rp
    add x2,x2,#8*8
    subs x14,x19,x6
    sbcs x15,x20,x7
    sub x27,x5,#8*8
    mov x3,x0 // x0 copy

.Lsqr8x_sub:
    sbcs x16,x21,x8
    ldp x6,x7,[x1,#8*0]
    sbcs x17,x22,x9
    stp x14,x15,[x0,#8*0]
    sbcs x14,x23,x10
    ldp x8,x9,[x1,#8*2]
    sbcs x15,x24,x11
    stp x16,x17,[x0,#8*2]
    sbcs x16,x25,x12
    ldp x10,x11,[x1,#8*4]
    sbcs x17,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    ldp x19,x20,[x2,#8*0]
    sub x27,x27,#8*8
    ldp x21,x22,[x2,#8*2]
    ldp x23,x24,[x2,#8*4]
    ldp x25,x26,[x2,#8*6]
    add x2,x2,#8*8
    stp x14,x15,[x0,#8*4]
    sbcs x14,x19,x6
    stp x16,x17,[x0,#8*6]
    add x0,x0,#8*8
    sbcs x15,x20,x7
    cbnz x27,.Lsqr8x_sub

    sbcs x16,x21,x8
    mov x2,sp
    add x1,sp,x5
    ldp x6,x7,[x3,#8*0]
    sbcs x17,x22,x9
    stp x14,x15,[x0,#8*0]
    sbcs x14,x23,x10
    ldp x8,x9,[x3,#8*2]
    sbcs x15,x24,x11
    stp x16,x17,[x0,#8*2]
    sbcs x16,x25,x12
    ldp x19,x20,[x1,#8*0]
    sbcs x17,x26,x13
    ldp x21,x22,[x1,#8*2]
    sbcs xzr,x30,xzr // did it borrow?
    ldr x30,[x29,#8] // pull return address
    stp x14,x15,[x0,#8*4]
    stp x16,x17,[x0,#8*6]

    sub x27,x5,#8*4
.Lsqr4x_cond_copy:
    sub x27,x27,#8*4
    csel x14,x19,x6,lo
    stp xzr,xzr,[x2,#8*0]
    csel x15,x20,x7,lo
    ldp x6,x7,[x3,#8*4]
    ldp x19,x20,[x1,#8*4]
    csel x16,x21,x8,lo
    stp xzr,xzr,[x2,#8*2]
    add x2,x2,#8*4
    csel x17,x22,x9,lo
    ldp x8,x9,[x3,#8*6]
    ldp x21,x22,[x1,#8*6]
    add x1,x1,#8*4
    stp x14,x15,[x3,#8*0]
    stp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    stp xzr,xzr,[x1,#8*0]
    stp xzr,xzr,[x1,#8*2]
    cbnz x27,.Lsqr4x_cond_copy

    csel x14,x19,x6,lo
    stp xzr,xzr,[x2,#8*0]
    csel x15,x20,x7,lo
    stp xzr,xzr,[x2,#8*2]
    csel x16,x21,x8,lo
    csel x17,x22,x9,lo
    stp x14,x15,[x3,#8*0]
    stp x16,x17,[x3,#8*2]

    b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
    adc x28,xzr,xzr
    ldr x30,[x29,#8] // pull return address
    // x19-x26,x28 hold result, x6-x13 hold modulus
    subs x6,x19,x6
    ldr x1,[x29,#96] // pull rp
    sbcs x7,x20,x7
    stp xzr,xzr,[sp,#8*0]
    sbcs x8,x21,x8
    stp xzr,xzr,[sp,#8*2]
    sbcs x9,x22,x9
    stp xzr,xzr,[sp,#8*4]
    sbcs x10,x23,x10
    stp xzr,xzr,[sp,#8*6]
    sbcs x11,x24,x11
    stp xzr,xzr,[sp,#8*8]
    sbcs x12,x25,x12
    stp xzr,xzr,[sp,#8*10]
    sbcs x13,x26,x13
    stp xzr,xzr,[sp,#8*12]
    sbcs x28,x28,xzr // did it borrow?
    stp xzr,xzr,[sp,#8*14]

    // x6-x13 hold result-modulus
    csel x6,x19,x6,lo
    csel x7,x20,x7,lo
    csel x8,x21,x8,lo
    csel x9,x22,x9,lo
    stp x6,x7,[x1,#8*0]
    csel x10,x23,x10,lo
    csel x11,x24,x11,lo
    stp x8,x9,[x1,#8*2]
    csel x12,x25,x12,lo
    csel x13,x26,x13,lo
    stp x10,x11,[x1,#8*4]
    stp x12,x13,[x1,#8*6]

.Lsqr8x_done:
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldr x29,[sp],#128
    // x30 is popped earlier
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
    // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont
    // is jumped to only from GFp_bn_mul_mont or __bn_sqr8x_mont, which
    // have already signed the return address.
    stp x29,x30,[sp,#-128]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]

    sub x26,sp,x5,lsl#3
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    sub sp,x26,#8*4 // alloca

    add x10,x2,x5
    add x27,x1,x5
    stp x0,x10,[x29,#96] // offload rp and &b[num]

    ldr x24,[x2,#8*0] // b[0]
    ldp x6,x7,[x1,#8*0] // a[0..3]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    mov x19,xzr
    mov x20,xzr
    mov x21,xzr
    mov x22,xzr
    ldp x14,x15,[x3,#8*0] // n[0..3]
    ldp x16,x17,[x3,#8*2]
    adds x3,x3,#8*4 // clear carry bit
    mov x0,xzr
    mov x28,#0
    mov x26,sp

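    // Note (editorial, not generated code): in the loops below x28 steps
    // through 8, 16, 24, 0 via "add x28,x28,#8; and x28,x28,#31", so
    // "ldr x24,[x2,x28]" walks b[1..3] and then wraps to offset 0. The
    // wrapped load fetches b[0] again for the tail pass over the next
    // four a words, hence the "next b[i] (or b[0])" comments.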
.Loop_mul4x_1st_reduction:
    mul x10,x6,x24 // lo(a[0..3]*b[0])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[0..3]*b[0])
    adcs x20,x20,x11
    mul x25,x19,x4 // t[0]*n0
    adcs x21,x21,x12
    umulh x11,x7,x24
    adcs x22,x22,x13
    umulh x12,x8,x24
    adc x23,xzr,xzr
    umulh x13,x9,x24
    ldr x24,[x2,x28] // next b[i] (or b[0])
    adds x20,x20,x10
    // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
    str x25,[x26],#8 // put aside t[0]*n0 for tail processing
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    // (*) adds xzr,x19,x10
    subs xzr,x19,#1 // (*)
    umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
    adcs x19,x20,x11
    umulh x11,x15,x25
    adcs x20,x21,x12
    umulh x12,x16,x25
    adcs x21,x22,x13
    umulh x13,x17,x25
    adcs x22,x23,x0
    adc x0,xzr,xzr
    adds x19,x19,x10
    sub x10,x27,x1
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_1st_reduction

    cbz x10,.Lmul4x4_post_condition

    ldp x6,x7,[x1,#8*0] // a[4..7]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    ldr x25,[sp] // t[0]*n0
    ldp x14,x15,[x3,#8*0] // n[4..7]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4

.Loop_mul4x_1st_tail:
    mul x10,x6,x24 // lo(a[4..7]*b[i])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[4..7]*b[i])
    adcs x20,x20,x11
    umulh x11,x7,x24
    adcs x21,x21,x12
    umulh x12,x8,x24
    adcs x22,x22,x13
    umulh x13,x9,x24
    adc x23,xzr,xzr
    ldr x24,[x2,x28] // next b[i] (or b[0])
    adds x20,x20,x10
    mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    adds x19,x19,x10
    umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
    adcs x20,x20,x11
    umulh x11,x15,x25
    adcs x21,x21,x12
    umulh x12,x16,x25
    adcs x22,x22,x13
    adcs x23,x23,x0
    umulh x13,x17,x25
    adc x0,xzr,xzr
    ldr x25,[sp,x28] // next t[0]*n0
    str x19,[x26],#8 // result!!!
    adds x19,x20,x10
    sub x10,x27,x1 // done yet?
    adcs x20,x21,x11
    adcs x21,x22,x12
    adcs x22,x23,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_1st_tail

    sub x11,x27,x5 // rewound x1
    cbz x10,.Lmul4x_proceed

    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    ldp x14,x15,[x3,#8*0]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
    ldr x24,[x2,#8*4]! // *++b
    adc x30,x0,xzr
    ldp x6,x7,[x11,#8*0] // a[0..3]
    sub x3,x3,x5 // rewind np
    ldp x8,x9,[x11,#8*2]
    add x1,x11,#8*4

    stp x19,x20,[x26,#8*0] // result!!!
    ldp x19,x20,[sp,#8*4] // t[0..3]
    stp x21,x22,[x26,#8*2] // result!!!
    ldp x21,x22,[sp,#8*6]

    ldp x14,x15,[x3,#8*0] // n[0..3]
    mov x26,sp
    ldp x16,x17,[x3,#8*2]
    adds x3,x3,#8*4 // clear carry bit
    mov x0,xzr

.align 4
.Loop_mul4x_reduction:
    mul x10,x6,x24 // lo(a[0..3]*b[4])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[0..3]*b[4])
    adcs x20,x20,x11
    mul x25,x19,x4 // t[0]*n0
    adcs x21,x21,x12
    umulh x11,x7,x24
    adcs x22,x22,x13
    umulh x12,x8,x24
    adc x23,xzr,xzr
    umulh x13,x9,x24
    ldr x24,[x2,x28] // next b[i]
    adds x20,x20,x10
    // (*) mul x10,x14,x25
    str x25,[x26],#8 // put aside t[0]*n0 for tail processing
    adcs x21,x21,x11
    mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    // (*) adds xzr,x19,x10
    subs xzr,x19,#1 // (*)
    umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
    adcs x19,x20,x11
    umulh x11,x15,x25
    adcs x20,x21,x12
    umulh x12,x16,x25
    adcs x21,x22,x13
    umulh x13,x17,x25
    adcs x22,x23,x0
    adc x0,xzr,xzr
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_reduction

    adc x0,x0,xzr
    ldp x10,x11,[x26,#8*4] // t[4..7]
    ldp x12,x13,[x26,#8*6]
    ldp x6,x7,[x1,#8*0] // a[4..7]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr

    ldr x25,[sp] // t[0]*n0
    ldp x14,x15,[x3,#8*0] // n[4..7]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4

.align 4
.Loop_mul4x_tail:
    mul x10,x6,x24 // lo(a[4..7]*b[4])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[4..7]*b[4])
    adcs x20,x20,x11
    umulh x11,x7,x24
    adcs x21,x21,x12
    umulh x12,x8,x24
    adcs x22,x22,x13
    umulh x13,x9,x24
    adc x23,xzr,xzr
    ldr x24,[x2,x28] // next b[i]
    adds x20,x20,x10
    mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    adds x19,x19,x10
    umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
    adcs x20,x20,x11
    umulh x11,x15,x25
    adcs x21,x21,x12
    umulh x12,x16,x25
    adcs x22,x22,x13
    umulh x13,x17,x25
    adcs x23,x23,x0
    ldr x25,[sp,x28] // next t[0]*n0
    adc x0,xzr,xzr
    str x19,[x26],#8 // result!!!
    adds x19,x20,x10
    sub x10,x27,x1 // done yet?
    adcs x20,x21,x11
    adcs x21,x22,x12
    adcs x22,x23,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_tail

    sub x11,x3,x5 // rewound np?
    adc x0,x0,xzr
    cbz x10,.Loop_mul4x_break

    ldp x10,x11,[x26,#8*4]
    ldp x12,x13,[x26,#8*6]
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    ldp x14,x15,[x3,#8*0]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
    ldp x12,x13,[x29,#96] // pull rp and &b[num]
    adds x19,x19,x30
    add x2,x2,#8*4 // bp++
    adcs x20,x20,xzr
    sub x1,x1,x5 // rewind ap
    adcs x21,x21,xzr
    stp x19,x20,[x26,#8*0] // result!!!
    adcs x22,x22,xzr
    ldp x19,x20,[sp,#8*4] // t[0..3]
    adc x30,x0,xzr
    stp x21,x22,[x26,#8*2] // result!!!
    cmp x2,x13 // done yet?
    ldp x21,x22,[sp,#8*6]
    ldp x14,x15,[x11,#8*0] // n[0..3]
    ldp x16,x17,[x11,#8*2]
    add x3,x11,#8*4
    b.eq .Lmul4x_post

    ldr x24,[x2]
    ldp x6,x7,[x1,#8*0] // a[0..3]
    ldp x8,x9,[x1,#8*2]
    adds x1,x1,#8*4 // clear carry bit
    mov x0,xzr
    mov x26,sp
    b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
    // Final step. We check whether the result is larger than the
    // modulus and, if it is, subtract the modulus. But comparison
    // implies subtraction, so instead we subtract the modulus, check
    // whether the subtraction borrowed, and conditionally copy the
    // original value back.
    mov x0,x12
    mov x27,x12 // x0 copy
    subs x10,x19,x14
    add x26,sp,#8*8
    sbcs x11,x20,x15
    sub x28,x5,#8*4

.Lmul4x_sub:
    sbcs x12,x21,x16
    ldp x14,x15,[x3,#8*0]
    sub x28,x28,#8*4
    ldp x19,x20,[x26,#8*0]
    sbcs x13,x22,x17
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    ldp x21,x22,[x26,#8*2]
    add x26,x26,#8*4
    stp x10,x11,[x0,#8*0]
    sbcs x10,x19,x14
    stp x12,x13,[x0,#8*2]
    add x0,x0,#8*4
    sbcs x11,x20,x15
    cbnz x28,.Lmul4x_sub

    sbcs x12,x21,x16
    mov x26,sp
    add x1,sp,#8*4
    ldp x6,x7,[x27,#8*0]
    sbcs x13,x22,x17
    stp x10,x11,[x0,#8*0]
    ldp x8,x9,[x27,#8*2]
    stp x12,x13,[x0,#8*2]
    ldp x19,x20,[x1,#8*0]
    ldp x21,x22,[x1,#8*2]
    sbcs xzr,x30,xzr // did it borrow?
    ldr x30,[x29,#8] // pull return address

    sub x28,x5,#8*4
.Lmul4x_cond_copy:
    sub x28,x28,#8*4
    csel x10,x19,x6,lo
    stp xzr,xzr,[x26,#8*0]
    csel x11,x20,x7,lo
    ldp x6,x7,[x27,#8*4]
    ldp x19,x20,[x1,#8*4]
    csel x12,x21,x8,lo
    stp xzr,xzr,[x26,#8*2]
    add x26,x26,#8*4
    csel x13,x22,x9,lo
    ldp x8,x9,[x27,#8*6]
    ldp x21,x22,[x1,#8*6]
    add x1,x1,#8*4
    stp x10,x11,[x27,#8*0]
    stp x12,x13,[x27,#8*2]
    add x27,x27,#8*4
    cbnz x28,.Lmul4x_cond_copy

    csel x10,x19,x6,lo
    stp xzr,xzr,[x26,#8*0]
    csel x11,x20,x7,lo
    stp xzr,xzr,[x26,#8*2]
    csel x12,x21,x8,lo
    stp xzr,xzr,[x26,#8*3]
    csel x13,x22,x9,lo
    stp xzr,xzr,[x26,#8*4]
    stp x10,x11,[x27,#8*0]
    stp x12,x13,[x27,#8*2]

    b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
    adc x0,x0,xzr
    ldr x1,[x29,#96] // pull rp
    // x19-x22,x0 hold result, x14-x17 hold modulus
    subs x6,x19,x14
    ldr x30,[x29,#8] // pull return address
    sbcs x7,x20,x15
    stp xzr,xzr,[sp,#8*0]
    sbcs x8,x21,x16
    stp xzr,xzr,[sp,#8*2]
    sbcs x9,x22,x17
    stp xzr,xzr,[sp,#8*4]
    sbcs xzr,x0,xzr // did it borrow?
    stp xzr,xzr,[sp,#8*6]

    // x6-x9 hold result-modulus
    csel x6,x19,x6,lo
    csel x7,x20,x7,lo
    csel x8,x21,x8,lo
    csel x9,x22,x9,lo
    stp x6,x7,[x1,#8*0]
    stp x8,x9,[x1,#8*2]

.Lmul4x_done:
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldr x29,[sp],#128
    // x30 is popped earlier
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits