1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#if !defined(__has_feature) 5#define __has_feature(x) 0 6#endif 7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 8#define OPENSSL_NO_ASM 9#endif 10 11#if !defined(OPENSSL_NO_ASM) 12#if defined(__arm__) 13#include <GFp/arm_arch.h> 14 15.text 16#if defined(__thumb2__) 17.syntax unified 18.thumb 19#else 20.code 32 21#endif 22 23.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 24.align 2 25.align 6 26.type __ecp_nistz256_mul_by_2,%function 27.align 4 28__ecp_nistz256_mul_by_2: 29 ldr r4,[r1,#0] 30 ldr r5,[r1,#4] 31 ldr r6,[r1,#8] 32 adds r4,r4,r4 @ a[0:7]+=a[0:7], i.e. add with itself 33 ldr r7,[r1,#12] 34 adcs r5,r5,r5 35 ldr r8,[r1,#16] 36 adcs r6,r6,r6 37 ldr r9,[r1,#20] 38 adcs r7,r7,r7 39 ldr r10,[r1,#24] 40 adcs r8,r8,r8 41 ldr r11,[r1,#28] 42 adcs r9,r9,r9 43 adcs r10,r10,r10 44 mov r3,#0 45 adcs r11,r11,r11 46 adc r3,r3,#0 47 48 b .Lreduce_by_sub 49.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 50 51@ void GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8], 52@ const BN_ULONG r2[8]); 53.globl GFp_nistz256_add 54.hidden GFp_nistz256_add 55.type GFp_nistz256_add,%function 56.align 4 57GFp_nistz256_add: 58 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 59 bl __ecp_nistz256_add 60#if __ARM_ARCH__>=5 || !defined(__thumb__) 61 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 62#else 63 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 64 bx lr @ interoperable with Thumb ISA:-) 65#endif 66.size GFp_nistz256_add,.-GFp_nistz256_add 67 68.type __ecp_nistz256_add,%function 69.align 4 70__ecp_nistz256_add: 71 str lr,[sp,#-4]! @ push lr 72 73 ldr r4,[r1,#0] 74 ldr r5,[r1,#4] 75 ldr r6,[r1,#8] 76 ldr r7,[r1,#12] 77 ldr r8,[r1,#16] 78 ldr r3,[r2,#0] 79 ldr r9,[r1,#20] 80 ldr r12,[r2,#4] 81 ldr r10,[r1,#24] 82 ldr r14,[r2,#8] 83 ldr r11,[r1,#28] 84 ldr r1,[r2,#12] 85 adds r4,r4,r3 86 ldr r3,[r2,#16] 87 adcs r5,r5,r12 88 ldr r12,[r2,#20] 89 adcs r6,r6,r14 90 ldr r14,[r2,#24] 91 adcs r7,r7,r1 92 ldr r1,[r2,#28] 93 adcs r8,r8,r3 94 adcs r9,r9,r12 95 adcs r10,r10,r14 96 mov r3,#0 97 adcs r11,r11,r1 98 adc r3,r3,#0 99 ldr lr,[sp],#4 @ pop lr 100 101.Lreduce_by_sub: 102 103 @ if a+b >= modulus, subtract modulus. 104 @ 105 @ But since comparison implies subtraction, we subtract 106 @ modulus and then add it back if subtraction borrowed. 107 108 subs r4,r4,#-1 109 sbcs r5,r5,#-1 110 sbcs r6,r6,#-1 111 sbcs r7,r7,#0 112 sbcs r8,r8,#0 113 sbcs r9,r9,#0 114 sbcs r10,r10,#1 115 sbcs r11,r11,#-1 116 sbc r3,r3,#0 117 118 @ Note that because mod has special form, i.e. consists of 119 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 120 @ using value of borrow as a whole or extracting single bit. 121 @ Follow r3 register... 122 123 adds r4,r4,r3 @ add synthesized modulus 124 adcs r5,r5,r3 125 str r4,[r0,#0] 126 adcs r6,r6,r3 127 str r5,[r0,#4] 128 adcs r7,r7,#0 129 str r6,[r0,#8] 130 adcs r8,r8,#0 131 str r7,[r0,#12] 132 adcs r9,r9,#0 133 str r8,[r0,#16] 134 adcs r10,r10,r3,lsr#31 135 str r9,[r0,#20] 136 adcs r11,r11,r3 137 str r10,[r0,#24] 138 str r11,[r0,#28] 139 140 mov pc,lr 141.size __ecp_nistz256_add,.-__ecp_nistz256_add 142 143.type __ecp_nistz256_mul_by_3,%function 144.align 4 145__ecp_nistz256_mul_by_3: 146 str lr,[sp,#-4]! @ push lr 147 148 @ As multiplication by 3 is performed as 2*n+n, below are inline 149 @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see 150 @ corresponding subroutines for details. 151 152 ldr r4,[r1,#0] 153 ldr r5,[r1,#4] 154 ldr r6,[r1,#8] 155 adds r4,r4,r4 @ a[0:7]+=a[0:7] 156 ldr r7,[r1,#12] 157 adcs r5,r5,r5 158 ldr r8,[r1,#16] 159 adcs r6,r6,r6 160 ldr r9,[r1,#20] 161 adcs r7,r7,r7 162 ldr r10,[r1,#24] 163 adcs r8,r8,r8 164 ldr r11,[r1,#28] 165 adcs r9,r9,r9 166 adcs r10,r10,r10 167 mov r3,#0 168 adcs r11,r11,r11 169 adc r3,r3,#0 170 171 subs r4,r4,#-1 @ .Lreduce_by_sub but without stores 172 sbcs r5,r5,#-1 173 sbcs r6,r6,#-1 174 sbcs r7,r7,#0 175 sbcs r8,r8,#0 176 sbcs r9,r9,#0 177 sbcs r10,r10,#1 178 sbcs r11,r11,#-1 179 sbc r3,r3,#0 180 181 adds r4,r4,r3 @ add synthesized modulus 182 adcs r5,r5,r3 183 adcs r6,r6,r3 184 adcs r7,r7,#0 185 adcs r8,r8,#0 186 ldr r2,[r1,#0] 187 adcs r9,r9,#0 188 ldr r12,[r1,#4] 189 adcs r10,r10,r3,lsr#31 190 ldr r14,[r1,#8] 191 adc r11,r11,r3 192 193 ldr r3,[r1,#12] 194 adds r4,r4,r2 @ 2*a[0:7]+=a[0:7] 195 ldr r2,[r1,#16] 196 adcs r5,r5,r12 197 ldr r12,[r1,#20] 198 adcs r6,r6,r14 199 ldr r14,[r1,#24] 200 adcs r7,r7,r3 201 ldr r1,[r1,#28] 202 adcs r8,r8,r2 203 adcs r9,r9,r12 204 adcs r10,r10,r14 205 mov r3,#0 206 adcs r11,r11,r1 207 adc r3,r3,#0 208 ldr lr,[sp],#4 @ pop lr 209 210 b .Lreduce_by_sub 211.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 212 213.type __ecp_nistz256_div_by_2,%function 214.align 4 215__ecp_nistz256_div_by_2: 216 @ ret = (a is odd ? a+mod : a) >> 1 217 218 ldr r4,[r1,#0] 219 ldr r5,[r1,#4] 220 ldr r6,[r1,#8] 221 mov r3,r4,lsl#31 @ place least significant bit to most 222 @ significant position, now arithmetic 223 @ right shift by 31 will produce -1 or 224 @ 0, while logical right shift 1 or 0, 225 @ this is how modulus is conditionally 226 @ synthesized in this case... 227 ldr r7,[r1,#12] 228 adds r4,r4,r3,asr#31 229 ldr r8,[r1,#16] 230 adcs r5,r5,r3,asr#31 231 ldr r9,[r1,#20] 232 adcs r6,r6,r3,asr#31 233 ldr r10,[r1,#24] 234 adcs r7,r7,#0 235 ldr r11,[r1,#28] 236 adcs r8,r8,#0 237 mov r4,r4,lsr#1 @ a[0:7]>>=1, we can start early 238 @ because it doesn't affect flags 239 adcs r9,r9,#0 240 orr r4,r4,r5,lsl#31 241 adcs r10,r10,r3,lsr#31 242 mov r2,#0 243 adcs r11,r11,r3,asr#31 244 mov r5,r5,lsr#1 245 adc r2,r2,#0 @ top-most carry bit from addition 246 247 orr r5,r5,r6,lsl#31 248 mov r6,r6,lsr#1 249 str r4,[r0,#0] 250 orr r6,r6,r7,lsl#31 251 mov r7,r7,lsr#1 252 str r5,[r0,#4] 253 orr r7,r7,r8,lsl#31 254 mov r8,r8,lsr#1 255 str r6,[r0,#8] 256 orr r8,r8,r9,lsl#31 257 mov r9,r9,lsr#1 258 str r7,[r0,#12] 259 orr r9,r9,r10,lsl#31 260 mov r10,r10,lsr#1 261 str r8,[r0,#16] 262 orr r10,r10,r11,lsl#31 263 mov r11,r11,lsr#1 264 str r9,[r0,#20] 265 orr r11,r11,r2,lsl#31 @ don't forget the top-most carry bit 266 str r10,[r0,#24] 267 str r11,[r0,#28] 268 269 mov pc,lr 270.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 271 272.type __ecp_nistz256_sub,%function 273.align 4 274__ecp_nistz256_sub: 275 str lr,[sp,#-4]! @ push lr 276 277 ldr r4,[r1,#0] 278 ldr r5,[r1,#4] 279 ldr r6,[r1,#8] 280 ldr r7,[r1,#12] 281 ldr r8,[r1,#16] 282 ldr r3,[r2,#0] 283 ldr r9,[r1,#20] 284 ldr r12,[r2,#4] 285 ldr r10,[r1,#24] 286 ldr r14,[r2,#8] 287 ldr r11,[r1,#28] 288 ldr r1,[r2,#12] 289 subs r4,r4,r3 290 ldr r3,[r2,#16] 291 sbcs r5,r5,r12 292 ldr r12,[r2,#20] 293 sbcs r6,r6,r14 294 ldr r14,[r2,#24] 295 sbcs r7,r7,r1 296 ldr r1,[r2,#28] 297 sbcs r8,r8,r3 298 sbcs r9,r9,r12 299 sbcs r10,r10,r14 300 sbcs r11,r11,r1 301 sbc r3,r3,r3 @ broadcast borrow bit 302 ldr lr,[sp],#4 @ pop lr 303 304.Lreduce_by_add: 305 306 @ if a-b borrows, add modulus. 307 @ 308 @ Note that because mod has special form, i.e. consists of 309 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 310 @ broadcasting borrow bit to a register, r3, and using it as 311 @ a whole or extracting single bit. 312 313 adds r4,r4,r3 @ add synthesized modulus 314 adcs r5,r5,r3 315 str r4,[r0,#0] 316 adcs r6,r6,r3 317 str r5,[r0,#4] 318 adcs r7,r7,#0 319 str r6,[r0,#8] 320 adcs r8,r8,#0 321 str r7,[r0,#12] 322 adcs r9,r9,#0 323 str r8,[r0,#16] 324 adcs r10,r10,r3,lsr#31 325 str r9,[r0,#20] 326 adcs r11,r11,r3 327 str r10,[r0,#24] 328 str r11,[r0,#28] 329 330 mov pc,lr 331.size __ecp_nistz256_sub,.-__ecp_nistz256_sub 332 333@ void GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]); 334.globl GFp_nistz256_neg 335.hidden GFp_nistz256_neg 336.type GFp_nistz256_neg,%function 337.align 4 338GFp_nistz256_neg: 339 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 340 bl __ecp_nistz256_neg 341#if __ARM_ARCH__>=5 || !defined(__thumb__) 342 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 343#else 344 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 345 bx lr @ interoperable with Thumb ISA:-) 346#endif 347.size GFp_nistz256_neg,.-GFp_nistz256_neg 348 349.type __ecp_nistz256_neg,%function 350.align 4 351__ecp_nistz256_neg: 352 ldr r4,[r1,#0] 353 eor r3,r3,r3 354 ldr r5,[r1,#4] 355 ldr r6,[r1,#8] 356 subs r4,r3,r4 357 ldr r7,[r1,#12] 358 sbcs r5,r3,r5 359 ldr r8,[r1,#16] 360 sbcs r6,r3,r6 361 ldr r9,[r1,#20] 362 sbcs r7,r3,r7 363 ldr r10,[r1,#24] 364 sbcs r8,r3,r8 365 ldr r11,[r1,#28] 366 sbcs r9,r3,r9 367 sbcs r10,r3,r10 368 sbcs r11,r3,r11 369 sbc r3,r3,r3 370 371 b .Lreduce_by_add 372.size __ecp_nistz256_neg,.-__ecp_nistz256_neg 373@ void GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8], 374@ const BN_ULONG r2[8]); 375.globl GFp_nistz256_mul_mont 376.hidden GFp_nistz256_mul_mont 377.type GFp_nistz256_mul_mont,%function 378.align 4 379GFp_nistz256_mul_mont: 380 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 381 bl __ecp_nistz256_mul_mont 382#if __ARM_ARCH__>=5 || !defined(__thumb__) 383 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 384#else 385 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 386 bx lr @ interoperable with Thumb ISA:-) 387#endif 388.size GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont 389 390.type __ecp_nistz256_mul_mont,%function 391.align 4 392__ecp_nistz256_mul_mont: 393 stmdb sp!,{r0,r1,r2,lr} @ make a copy of arguments too 394 395 ldr r2,[r2,#0] @ b[0] 396 ldmia r1,{r4,r5,r6,r7,r8,r9,r10,r11} 397 398 umull r3,r14,r4,r2 @ r[0]=a[0]*b[0] 399 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy a[0-7] to stack, so 400 @ that it can be addressed 401 @ without spending register 402 @ on address 403 umull r4,r0,r5,r2 @ r[1]=a[1]*b[0] 404 umull r5,r1,r6,r2 405 adds r4,r4,r14 @ accumulate high part of mult 406 umull r6,r12,r7,r2 407 adcs r5,r5,r0 408 umull r7,r14,r8,r2 409 adcs r6,r6,r1 410 umull r8,r0,r9,r2 411 adcs r7,r7,r12 412 umull r9,r1,r10,r2 413 adcs r8,r8,r14 414 umull r10,r12,r11,r2 415 adcs r9,r9,r0 416 adcs r10,r10,r1 417 eor r14,r14,r14 @ first overflow bit is zero 418 adc r11,r12,#0 419 @ multiplication-less reduction 1 420 adds r6,r6,r3 @ r[3]+=r[0] 421 ldr r2,[sp,#40] @ restore b_ptr 422 adcs r7,r7,#0 @ r[4]+=0 423 adcs r8,r8,#0 @ r[5]+=0 424 adcs r9,r9,r3 @ r[6]+=r[0] 425 ldr r1,[sp,#0] @ load a[0] 426 adcs r10,r10,#0 @ r[7]+=0 427 ldr r2,[r2,#4*1] @ load b[i] 428 adcs r11,r11,r3 @ r[8]+=r[0] 429 eor r0,r0,r0 430 adc r14,r14,#0 @ overflow bit 431 subs r10,r10,r3 @ r[7]-=r[0] 432 ldr r12,[sp,#4] @ a[1] 433 sbcs r11,r11,#0 @ r[8]-=0 434 umlal r4,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 435 eor r1,r1,r1 436 sbc r3,r14,#0 @ overflow bit, keep in mind 437 @ that netto result is 438 @ addition of a value which 439 @ makes underflow impossible 440 441 ldr r14,[sp,#8] @ a[2] 442 umlal r5,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 443 str r3,[sp,#36] @ temporarily offload overflow 444 eor r12,r12,r12 445 ldr r3,[sp,#12] @ a[3], r3 is alias r3 446 umlal r6,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 447 eor r14,r14,r14 448 adds r5,r5,r0 @ accumulate high part of mult 449 ldr r0,[sp,#16] @ a[4] 450 umlal r7,r14,r3,r2 @ "r[3]"+=a[3]*b[i] 451 eor r3,r3,r3 452 adcs r6,r6,r1 453 ldr r1,[sp,#20] @ a[5] 454 umlal r8,r3,r0,r2 @ "r[4]"+=a[4]*b[i] 455 eor r0,r0,r0 456 adcs r7,r7,r12 457 ldr r12,[sp,#24] @ a[6] 458 umlal r9,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 459 eor r1,r1,r1 460 adcs r8,r8,r14 461 ldr r14,[sp,#28] @ a[7] 462 umlal r10,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 463 eor r12,r12,r12 464 adcs r9,r9,r3 465 ldr r3,[sp,#36] @ restore overflow bit 466 umlal r11,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 467 eor r14,r14,r14 468 adcs r10,r10,r0 469 adcs r11,r11,r1 470 adcs r3,r3,r12 471 adc r14,r14,#0 @ new overflow bit 472 @ multiplication-less reduction 2 473 adds r7,r7,r4 @ r[3]+=r[0] 474 ldr r2,[sp,#40] @ restore b_ptr 475 adcs r8,r8,#0 @ r[4]+=0 476 adcs r9,r9,#0 @ r[5]+=0 477 adcs r10,r10,r4 @ r[6]+=r[0] 478 ldr r1,[sp,#0] @ load a[0] 479 adcs r11,r11,#0 @ r[7]+=0 480 ldr r2,[r2,#4*2] @ load b[i] 481 adcs r3,r3,r4 @ r[8]+=r[0] 482 eor r0,r0,r0 483 adc r14,r14,#0 @ overflow bit 484 subs r11,r11,r4 @ r[7]-=r[0] 485 ldr r12,[sp,#4] @ a[1] 486 sbcs r3,r3,#0 @ r[8]-=0 487 umlal r5,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 488 eor r1,r1,r1 489 sbc r4,r14,#0 @ overflow bit, keep in mind 490 @ that netto result is 491 @ addition of a value which 492 @ makes underflow impossible 493 494 ldr r14,[sp,#8] @ a[2] 495 umlal r6,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 496 str r4,[sp,#36] @ temporarily offload overflow 497 eor r12,r12,r12 498 ldr r4,[sp,#12] @ a[3], r4 is alias r4 499 umlal r7,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 500 eor r14,r14,r14 501 adds r6,r6,r0 @ accumulate high part of mult 502 ldr r0,[sp,#16] @ a[4] 503 umlal r8,r14,r4,r2 @ "r[3]"+=a[3]*b[i] 504 eor r4,r4,r4 505 adcs r7,r7,r1 506 ldr r1,[sp,#20] @ a[5] 507 umlal r9,r4,r0,r2 @ "r[4]"+=a[4]*b[i] 508 eor r0,r0,r0 509 adcs r8,r8,r12 510 ldr r12,[sp,#24] @ a[6] 511 umlal r10,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 512 eor r1,r1,r1 513 adcs r9,r9,r14 514 ldr r14,[sp,#28] @ a[7] 515 umlal r11,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 516 eor r12,r12,r12 517 adcs r10,r10,r4 518 ldr r4,[sp,#36] @ restore overflow bit 519 umlal r3,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 520 eor r14,r14,r14 521 adcs r11,r11,r0 522 adcs r3,r3,r1 523 adcs r4,r4,r12 524 adc r14,r14,#0 @ new overflow bit 525 @ multiplication-less reduction 3 526 adds r8,r8,r5 @ r[3]+=r[0] 527 ldr r2,[sp,#40] @ restore b_ptr 528 adcs r9,r9,#0 @ r[4]+=0 529 adcs r10,r10,#0 @ r[5]+=0 530 adcs r11,r11,r5 @ r[6]+=r[0] 531 ldr r1,[sp,#0] @ load a[0] 532 adcs r3,r3,#0 @ r[7]+=0 533 ldr r2,[r2,#4*3] @ load b[i] 534 adcs r4,r4,r5 @ r[8]+=r[0] 535 eor r0,r0,r0 536 adc r14,r14,#0 @ overflow bit 537 subs r3,r3,r5 @ r[7]-=r[0] 538 ldr r12,[sp,#4] @ a[1] 539 sbcs r4,r4,#0 @ r[8]-=0 540 umlal r6,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 541 eor r1,r1,r1 542 sbc r5,r14,#0 @ overflow bit, keep in mind 543 @ that netto result is 544 @ addition of a value which 545 @ makes underflow impossible 546 547 ldr r14,[sp,#8] @ a[2] 548 umlal r7,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 549 str r5,[sp,#36] @ temporarily offload overflow 550 eor r12,r12,r12 551 ldr r5,[sp,#12] @ a[3], r5 is alias r5 552 umlal r8,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 553 eor r14,r14,r14 554 adds r7,r7,r0 @ accumulate high part of mult 555 ldr r0,[sp,#16] @ a[4] 556 umlal r9,r14,r5,r2 @ "r[3]"+=a[3]*b[i] 557 eor r5,r5,r5 558 adcs r8,r8,r1 559 ldr r1,[sp,#20] @ a[5] 560 umlal r10,r5,r0,r2 @ "r[4]"+=a[4]*b[i] 561 eor r0,r0,r0 562 adcs r9,r9,r12 563 ldr r12,[sp,#24] @ a[6] 564 umlal r11,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 565 eor r1,r1,r1 566 adcs r10,r10,r14 567 ldr r14,[sp,#28] @ a[7] 568 umlal r3,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 569 eor r12,r12,r12 570 adcs r11,r11,r5 571 ldr r5,[sp,#36] @ restore overflow bit 572 umlal r4,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 573 eor r14,r14,r14 574 adcs r3,r3,r0 575 adcs r4,r4,r1 576 adcs r5,r5,r12 577 adc r14,r14,#0 @ new overflow bit 578 @ multiplication-less reduction 4 579 adds r9,r9,r6 @ r[3]+=r[0] 580 ldr r2,[sp,#40] @ restore b_ptr 581 adcs r10,r10,#0 @ r[4]+=0 582 adcs r11,r11,#0 @ r[5]+=0 583 adcs r3,r3,r6 @ r[6]+=r[0] 584 ldr r1,[sp,#0] @ load a[0] 585 adcs r4,r4,#0 @ r[7]+=0 586 ldr r2,[r2,#4*4] @ load b[i] 587 adcs r5,r5,r6 @ r[8]+=r[0] 588 eor r0,r0,r0 589 adc r14,r14,#0 @ overflow bit 590 subs r4,r4,r6 @ r[7]-=r[0] 591 ldr r12,[sp,#4] @ a[1] 592 sbcs r5,r5,#0 @ r[8]-=0 593 umlal r7,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 594 eor r1,r1,r1 595 sbc r6,r14,#0 @ overflow bit, keep in mind 596 @ that netto result is 597 @ addition of a value which 598 @ makes underflow impossible 599 600 ldr r14,[sp,#8] @ a[2] 601 umlal r8,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 602 str r6,[sp,#36] @ temporarily offload overflow 603 eor r12,r12,r12 604 ldr r6,[sp,#12] @ a[3], r6 is alias r6 605 umlal r9,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 606 eor r14,r14,r14 607 adds r8,r8,r0 @ accumulate high part of mult 608 ldr r0,[sp,#16] @ a[4] 609 umlal r10,r14,r6,r2 @ "r[3]"+=a[3]*b[i] 610 eor r6,r6,r6 611 adcs r9,r9,r1 612 ldr r1,[sp,#20] @ a[5] 613 umlal r11,r6,r0,r2 @ "r[4]"+=a[4]*b[i] 614 eor r0,r0,r0 615 adcs r10,r10,r12 616 ldr r12,[sp,#24] @ a[6] 617 umlal r3,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 618 eor r1,r1,r1 619 adcs r11,r11,r14 620 ldr r14,[sp,#28] @ a[7] 621 umlal r4,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 622 eor r12,r12,r12 623 adcs r3,r3,r6 624 ldr r6,[sp,#36] @ restore overflow bit 625 umlal r5,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 626 eor r14,r14,r14 627 adcs r4,r4,r0 628 adcs r5,r5,r1 629 adcs r6,r6,r12 630 adc r14,r14,#0 @ new overflow bit 631 @ multiplication-less reduction 5 632 adds r10,r10,r7 @ r[3]+=r[0] 633 ldr r2,[sp,#40] @ restore b_ptr 634 adcs r11,r11,#0 @ r[4]+=0 635 adcs r3,r3,#0 @ r[5]+=0 636 adcs r4,r4,r7 @ r[6]+=r[0] 637 ldr r1,[sp,#0] @ load a[0] 638 adcs r5,r5,#0 @ r[7]+=0 639 ldr r2,[r2,#4*5] @ load b[i] 640 adcs r6,r6,r7 @ r[8]+=r[0] 641 eor r0,r0,r0 642 adc r14,r14,#0 @ overflow bit 643 subs r5,r5,r7 @ r[7]-=r[0] 644 ldr r12,[sp,#4] @ a[1] 645 sbcs r6,r6,#0 @ r[8]-=0 646 umlal r8,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 647 eor r1,r1,r1 648 sbc r7,r14,#0 @ overflow bit, keep in mind 649 @ that netto result is 650 @ addition of a value which 651 @ makes underflow impossible 652 653 ldr r14,[sp,#8] @ a[2] 654 umlal r9,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 655 str r7,[sp,#36] @ temporarily offload overflow 656 eor r12,r12,r12 657 ldr r7,[sp,#12] @ a[3], r7 is alias r7 658 umlal r10,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 659 eor r14,r14,r14 660 adds r9,r9,r0 @ accumulate high part of mult 661 ldr r0,[sp,#16] @ a[4] 662 umlal r11,r14,r7,r2 @ "r[3]"+=a[3]*b[i] 663 eor r7,r7,r7 664 adcs r10,r10,r1 665 ldr r1,[sp,#20] @ a[5] 666 umlal r3,r7,r0,r2 @ "r[4]"+=a[4]*b[i] 667 eor r0,r0,r0 668 adcs r11,r11,r12 669 ldr r12,[sp,#24] @ a[6] 670 umlal r4,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 671 eor r1,r1,r1 672 adcs r3,r3,r14 673 ldr r14,[sp,#28] @ a[7] 674 umlal r5,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 675 eor r12,r12,r12 676 adcs r4,r4,r7 677 ldr r7,[sp,#36] @ restore overflow bit 678 umlal r6,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 679 eor r14,r14,r14 680 adcs r5,r5,r0 681 adcs r6,r6,r1 682 adcs r7,r7,r12 683 adc r14,r14,#0 @ new overflow bit 684 @ multiplication-less reduction 6 685 adds r11,r11,r8 @ r[3]+=r[0] 686 ldr r2,[sp,#40] @ restore b_ptr 687 adcs r3,r3,#0 @ r[4]+=0 688 adcs r4,r4,#0 @ r[5]+=0 689 adcs r5,r5,r8 @ r[6]+=r[0] 690 ldr r1,[sp,#0] @ load a[0] 691 adcs r6,r6,#0 @ r[7]+=0 692 ldr r2,[r2,#4*6] @ load b[i] 693 adcs r7,r7,r8 @ r[8]+=r[0] 694 eor r0,r0,r0 695 adc r14,r14,#0 @ overflow bit 696 subs r6,r6,r8 @ r[7]-=r[0] 697 ldr r12,[sp,#4] @ a[1] 698 sbcs r7,r7,#0 @ r[8]-=0 699 umlal r9,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 700 eor r1,r1,r1 701 sbc r8,r14,#0 @ overflow bit, keep in mind 702 @ that netto result is 703 @ addition of a value which 704 @ makes underflow impossible 705 706 ldr r14,[sp,#8] @ a[2] 707 umlal r10,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 708 str r8,[sp,#36] @ temporarily offload overflow 709 eor r12,r12,r12 710 ldr r8,[sp,#12] @ a[3], r8 is alias r8 711 umlal r11,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 712 eor r14,r14,r14 713 adds r10,r10,r0 @ accumulate high part of mult 714 ldr r0,[sp,#16] @ a[4] 715 umlal r3,r14,r8,r2 @ "r[3]"+=a[3]*b[i] 716 eor r8,r8,r8 717 adcs r11,r11,r1 718 ldr r1,[sp,#20] @ a[5] 719 umlal r4,r8,r0,r2 @ "r[4]"+=a[4]*b[i] 720 eor r0,r0,r0 721 adcs r3,r3,r12 722 ldr r12,[sp,#24] @ a[6] 723 umlal r5,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 724 eor r1,r1,r1 725 adcs r4,r4,r14 726 ldr r14,[sp,#28] @ a[7] 727 umlal r6,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 728 eor r12,r12,r12 729 adcs r5,r5,r8 730 ldr r8,[sp,#36] @ restore overflow bit 731 umlal r7,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 732 eor r14,r14,r14 733 adcs r6,r6,r0 734 adcs r7,r7,r1 735 adcs r8,r8,r12 736 adc r14,r14,#0 @ new overflow bit 737 @ multiplication-less reduction 7 738 adds r3,r3,r9 @ r[3]+=r[0] 739 ldr r2,[sp,#40] @ restore b_ptr 740 adcs r4,r4,#0 @ r[4]+=0 741 adcs r5,r5,#0 @ r[5]+=0 742 adcs r6,r6,r9 @ r[6]+=r[0] 743 ldr r1,[sp,#0] @ load a[0] 744 adcs r7,r7,#0 @ r[7]+=0 745 ldr r2,[r2,#4*7] @ load b[i] 746 adcs r8,r8,r9 @ r[8]+=r[0] 747 eor r0,r0,r0 748 adc r14,r14,#0 @ overflow bit 749 subs r7,r7,r9 @ r[7]-=r[0] 750 ldr r12,[sp,#4] @ a[1] 751 sbcs r8,r8,#0 @ r[8]-=0 752 umlal r10,r0,r1,r2 @ "r[0]"+=a[0]*b[i] 753 eor r1,r1,r1 754 sbc r9,r14,#0 @ overflow bit, keep in mind 755 @ that netto result is 756 @ addition of a value which 757 @ makes underflow impossible 758 759 ldr r14,[sp,#8] @ a[2] 760 umlal r11,r1,r12,r2 @ "r[1]"+=a[1]*b[i] 761 str r9,[sp,#36] @ temporarily offload overflow 762 eor r12,r12,r12 763 ldr r9,[sp,#12] @ a[3], r9 is alias r9 764 umlal r3,r12,r14,r2 @ "r[2]"+=a[2]*b[i] 765 eor r14,r14,r14 766 adds r11,r11,r0 @ accumulate high part of mult 767 ldr r0,[sp,#16] @ a[4] 768 umlal r4,r14,r9,r2 @ "r[3]"+=a[3]*b[i] 769 eor r9,r9,r9 770 adcs r3,r3,r1 771 ldr r1,[sp,#20] @ a[5] 772 umlal r5,r9,r0,r2 @ "r[4]"+=a[4]*b[i] 773 eor r0,r0,r0 774 adcs r4,r4,r12 775 ldr r12,[sp,#24] @ a[6] 776 umlal r6,r0,r1,r2 @ "r[5]"+=a[5]*b[i] 777 eor r1,r1,r1 778 adcs r5,r5,r14 779 ldr r14,[sp,#28] @ a[7] 780 umlal r7,r1,r12,r2 @ "r[6]"+=a[6]*b[i] 781 eor r12,r12,r12 782 adcs r6,r6,r9 783 ldr r9,[sp,#36] @ restore overflow bit 784 umlal r8,r12,r14,r2 @ "r[7]"+=a[7]*b[i] 785 eor r14,r14,r14 786 adcs r7,r7,r0 787 adcs r8,r8,r1 788 adcs r9,r9,r12 789 adc r14,r14,#0 @ new overflow bit 790 @ last multiplication-less reduction 791 adds r4,r4,r10 792 ldr r0,[sp,#32] @ restore r_ptr 793 adcs r5,r5,#0 794 adcs r6,r6,#0 795 adcs r7,r7,r10 796 adcs r8,r8,#0 797 adcs r9,r9,r10 798 adc r14,r14,#0 799 subs r8,r8,r10 800 sbcs r9,r9,#0 801 sbc r10,r14,#0 @ overflow bit 802 803 @ Final step is "if result > mod, subtract mod", but we do it 804 @ "other way around", namely subtract modulus from result 805 @ and if it borrowed, add modulus back. 806 807 adds r11,r11,#1 @ subs r11,r11,#-1 808 adcs r3,r3,#0 @ sbcs r3,r3,#-1 809 adcs r4,r4,#0 @ sbcs r4,r4,#-1 810 sbcs r5,r5,#0 811 sbcs r6,r6,#0 812 sbcs r7,r7,#0 813 sbcs r8,r8,#1 814 adcs r9,r9,#0 @ sbcs r9,r9,#-1 815 ldr lr,[sp,#44] @ restore lr 816 sbc r10,r10,#0 @ broadcast borrow bit 817 add sp,sp,#48 818 819 @ Note that because mod has special form, i.e. consists of 820 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 821 @ broadcasting borrow bit to a register, r10, and using it as 822 @ a whole or extracting single bit. 823 824 adds r11,r11,r10 @ add modulus or zero 825 adcs r3,r3,r10 826 str r11,[r0,#0] 827 adcs r4,r4,r10 828 str r3,[r0,#4] 829 adcs r5,r5,#0 830 str r4,[r0,#8] 831 adcs r6,r6,#0 832 str r5,[r0,#12] 833 adcs r7,r7,#0 834 str r6,[r0,#16] 835 adcs r8,r8,r10,lsr#31 836 str r7,[r0,#20] 837 adc r9,r9,r10 838 str r8,[r0,#24] 839 str r9,[r0,#28] 840 841 mov pc,lr 842.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 843.type __ecp_nistz256_sub_from,%function 844.align 5 845__ecp_nistz256_sub_from: 846 str lr,[sp,#-4]! @ push lr 847 848 ldr r10,[r2,#0] 849 ldr r12,[r2,#4] 850 ldr r14,[r2,#8] 851 ldr r1,[r2,#12] 852 subs r11,r11,r10 853 ldr r10,[r2,#16] 854 sbcs r3,r3,r12 855 ldr r12,[r2,#20] 856 sbcs r4,r4,r14 857 ldr r14,[r2,#24] 858 sbcs r5,r5,r1 859 ldr r1,[r2,#28] 860 sbcs r6,r6,r10 861 sbcs r7,r7,r12 862 sbcs r8,r8,r14 863 sbcs r9,r9,r1 864 sbc r2,r2,r2 @ broadcast borrow bit 865 ldr lr,[sp],#4 @ pop lr 866 867 adds r11,r11,r2 @ add synthesized modulus 868 adcs r3,r3,r2 869 str r11,[r0,#0] 870 adcs r4,r4,r2 871 str r3,[r0,#4] 872 adcs r5,r5,#0 873 str r4,[r0,#8] 874 adcs r6,r6,#0 875 str r5,[r0,#12] 876 adcs r7,r7,#0 877 str r6,[r0,#16] 878 adcs r8,r8,r2,lsr#31 879 str r7,[r0,#20] 880 adcs r9,r9,r2 881 str r8,[r0,#24] 882 str r9,[r0,#28] 883 884 mov pc,lr 885.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 886 887.type __ecp_nistz256_sub_morf,%function 888.align 5 889__ecp_nistz256_sub_morf: 890 str lr,[sp,#-4]! @ push lr 891 892 ldr r10,[r2,#0] 893 ldr r12,[r2,#4] 894 ldr r14,[r2,#8] 895 ldr r1,[r2,#12] 896 subs r11,r10,r11 897 ldr r10,[r2,#16] 898 sbcs r3,r12,r3 899 ldr r12,[r2,#20] 900 sbcs r4,r14,r4 901 ldr r14,[r2,#24] 902 sbcs r5,r1,r5 903 ldr r1,[r2,#28] 904 sbcs r6,r10,r6 905 sbcs r7,r12,r7 906 sbcs r8,r14,r8 907 sbcs r9,r1,r9 908 sbc r2,r2,r2 @ broadcast borrow bit 909 ldr lr,[sp],#4 @ pop lr 910 911 adds r11,r11,r2 @ add synthesized modulus 912 adcs r3,r3,r2 913 str r11,[r0,#0] 914 adcs r4,r4,r2 915 str r3,[r0,#4] 916 adcs r5,r5,#0 917 str r4,[r0,#8] 918 adcs r6,r6,#0 919 str r5,[r0,#12] 920 adcs r7,r7,#0 921 str r6,[r0,#16] 922 adcs r8,r8,r2,lsr#31 923 str r7,[r0,#20] 924 adcs r9,r9,r2 925 str r8,[r0,#24] 926 str r9,[r0,#28] 927 928 mov pc,lr 929.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 930 931.type __ecp_nistz256_add_self,%function 932.align 4 933__ecp_nistz256_add_self: 934 adds r11,r11,r11 @ a[0:7]+=a[0:7] 935 adcs r3,r3,r3 936 adcs r4,r4,r4 937 adcs r5,r5,r5 938 adcs r6,r6,r6 939 adcs r7,r7,r7 940 adcs r8,r8,r8 941 mov r2,#0 942 adcs r9,r9,r9 943 adc r2,r2,#0 944 945 @ if a+b >= modulus, subtract modulus. 946 @ 947 @ But since comparison implies subtraction, we subtract 948 @ modulus and then add it back if subtraction borrowed. 949 950 subs r11,r11,#-1 951 sbcs r3,r3,#-1 952 sbcs r4,r4,#-1 953 sbcs r5,r5,#0 954 sbcs r6,r6,#0 955 sbcs r7,r7,#0 956 sbcs r8,r8,#1 957 sbcs r9,r9,#-1 958 sbc r2,r2,#0 959 960 @ Note that because mod has special form, i.e. consists of 961 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 962 @ using value of borrow as a whole or extracting single bit. 963 @ Follow r2 register... 964 965 adds r11,r11,r2 @ add synthesized modulus 966 adcs r3,r3,r2 967 str r11,[r0,#0] 968 adcs r4,r4,r2 969 str r3,[r0,#4] 970 adcs r5,r5,#0 971 str r4,[r0,#8] 972 adcs r6,r6,#0 973 str r5,[r0,#12] 974 adcs r7,r7,#0 975 str r6,[r0,#16] 976 adcs r8,r8,r2,lsr#31 977 str r7,[r0,#20] 978 adcs r9,r9,r2 979 str r8,[r0,#24] 980 str r9,[r0,#28] 981 982 mov pc,lr 983.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self 984 985.globl GFp_nistz256_point_double 986.hidden GFp_nistz256_point_double 987.type GFp_nistz256_point_double,%function 988.align 5 989GFp_nistz256_point_double: 990 stmdb sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ push from r0, unusual, but intentional 991 sub sp,sp,#32*5 992 993.Lpoint_double_shortcut: 994 add r3,sp,#96 995 ldmia r1!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy in_x 996 stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} 997 998 add r0,sp,#0 999 bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); 1000 1001 add r2,r1,#32 1002 add r1,r1,#32 1003 add r0,sp,#64 1004 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); 1005 1006 add r1,sp,#0 1007 add r2,sp,#0 1008 add r0,sp,#0 1009 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); 1010 1011 ldr r2,[sp,#32*5+4] 1012 add r1,r2,#32 1013 add r2,r2,#64 1014 add r0,sp,#128 1015 bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); 1016 1017 ldr r0,[sp,#32*5] 1018 add r0,r0,#64 1019 bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); 1020 1021 add r1,sp,#96 1022 add r2,sp,#64 1023 add r0,sp,#32 1024 bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); 1025 1026 add r1,sp,#96 1027 add r2,sp,#64 1028 add r0,sp,#64 1029 bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); 1030 1031 add r1,sp,#0 1032 add r2,sp,#0 1033 add r0,sp,#128 1034 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); 1035 1036 add r1,sp,#64 1037 add r2,sp,#32 1038 add r0,sp,#32 1039 bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); 1040 1041 ldr r0,[sp,#32*5] 1042 add r1,sp,#128 1043 add r0,r0,#32 1044 bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); 1045 1046 add r1,sp,#32 1047 add r0,sp,#32 1048 bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); 1049 1050 add r1,sp,#96 1051 add r2,sp,#0 1052 add r0,sp,#0 1053 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); 1054 1055 add r0,sp,#128 1056 bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); 1057 1058 ldr r0,[sp,#32*5] 1059 add r1,sp,#32 1060 add r2,sp,#32 1061 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); 1062 1063 add r2,sp,#128 1064 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); 1065 1066 add r2,sp,#0 1067 add r0,sp,#0 1068 bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); 1069 1070 add r1,sp,#32 1071 add r2,sp,#0 1072 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); 1073 1074 ldr r0,[sp,#32*5] 1075 add r2,r0,#32 1076 add r0,r0,#32 1077 bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); 1078 1079 add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3" 1080#if __ARM_ARCH__>=5 || !defined(__thumb__) 1081 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 1082#else 1083 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 1084 bx lr @ interoperable with Thumb ISA:-) 1085#endif 1086.size GFp_nistz256_point_double,.-GFp_nistz256_point_double 1087#endif 1088#endif // !OPENSSL_NO_ASM 1089.section .note.GNU-stack,"",%progbits 1090