// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>


.private_extern	_GFp_armcap_P

.section	__TEXT,__const

// Constant pool.
//   Lsigma  16 bytes; read little-endian they spell "expand 32-byte k".
//   Lone    the vector {1,0,0,0}; the NEON code loads it into v31 and
//           uses it to step the first 32-bit lane of the counter row.
// The .byte run after Lone is the NUL-terminated ASCII string
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
.align	5
Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
Lone:
.long	1,0,0,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

//----------------------------------------------------------------------
// _GFp_ChaCha20_ctr32
//
// Register roles on entry, as read off the code below:
//   x0  destination pointer (input XOR keystream is stored here)
//   x1  source pointer
//   x2  byte count; zero returns at once through Labort
//   x3  key pointer (32 bytes are loaded)
//   x4  counter block pointer (16 bytes are loaded)
//
// Dispatch: a count of 192 or more is handed to ChaCha20_neon when the
// ARMV7_NEON bit is set in _GFp_armcap_P. Everything else runs the
// integer-only code at Lshort, one 64-byte block per Loop_outer pass.
//
// Frame: 96 bytes holding x29/x30 and x19-x28, plus 64 bytes of scratch
// at sp that the tail code uses for one keystream block. x30 is reused
// as a data register after being saved.
//----------------------------------------------------------------------
.globl	_GFp_ChaCha20_ctr32
.private_extern	_GFp_ChaCha20_ctr32

.align	5
_GFp_ChaCha20_ctr32:
	AARCH64_VALID_CALL_TARGET
	cbz	x2,Labort
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:_GFp_armcap_P
#else
	adrp	x5,_GFp_armcap_P@PAGE
#endif
	cmp	x2,#192
	b.lo	Lshort
	ldr	w17,[x5,_GFp_armcap_P@PAGEOFF]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma@PAGE
	add	x5,x5,Lsigma@PAGEOFF
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	// x22..x28 and x30 keep the 16-word input state, two words each.
	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __ARMEB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// Working copy of the state, one 32-bit word per register:
	// w5-w8 row 0, w9-w12 row 1, w13-w16 row 2, w17/w19/w20/w21 row 3.
Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	// The flags written by this subs are not touched again until the
	// b.lo / b.hi after the rounds: nothing in between sets flags.
	mov	x4,#10
	subs	x2,x2,#64
	// Ten passes. Each pass mixes the four columns and then the four
	// diagonals with add / xor / rotate-right by 16, 20, 24 and 25.
Loop:
	sub	x4,x4,#1
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	// Borrow from the subs above: fewer than 64 bytes were left.
	b.lo	Ltail

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	// Still the flags from subs: loop while bytes remain.
	b.hi	Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
Labort:
	ret

	// Partial final block. Ltail first undoes the subs so that x2 is
	// the remaining byte count. Less_than_64 is also entered from
	// Ltail_neon with x2 already holding that count.
	// Pointers are advanced to the end of the data and x2 is negated so
	// one index register counts up to zero. x0 is biased by -1 because
	// the store in Loop_tail happens after x2 has been incremented.
.align	4
Ltail:
	add	x2,x2,#64
Less_than_64:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Park the keystream block in the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	// Overwrite the keystream copy on the stack with zeros.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret



//----------------------------------------------------------------------
// ChaCha20_neon
//
// Reached by branch from _GFp_ChaCha20_ctr32 with x0-x4 unchanged. It
// builds the same 96-byte frame; a count of 512 or more continues at
// L512_or_more_neon.
//
// Each Loop_outer_neon pass produces 256 bytes: one block in the
// general registers (same layout as the scalar code) interleaved with
// three blocks in v0-v3, v4-v7 and v16-v19. The vector counter rows
// v27, v28 and v29 are one, two and three ahead of the scalar counter,
// so the scalar block is written first, then the three vector blocks.
//
// Vector registers used here are v0-v7 and v16-v31 only.
//----------------------------------------------------------------------
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma@PAGE
	add	x5,x5,Lsigma@PAGEOFF
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon

	sub	sp,sp,#64

	// v24 sigma, v25/v26 key, v27 counter row, v31 = Lone (x5 has been
	// post-incremented past Lsigma by the first ld1).
	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	// As in the scalar code, the flags from this subs stay live until
	// the b.lo / b.hi that follow the rounds.
	mov	x4,#10
	subs	x2,x2,#256
	// Scalar and vector instructions alternate; the order is part of
	// the hand scheduling and is kept exactly. On the vector side the
	// rotates are done with rev32 on halfwords (16) and ushr+sli pairs
	// (20/12, 24/8, 25/7); ext re-aligns rows between the two halves.
Loop_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,Loop_neon

	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	// Borrow from the subs above: fewer than 256 bytes were left.
	b.lo	Ltail_neon

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	// Still the flags from subs: loop while bytes remain.
	b.hi	Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

	// Fewer than 256 bytes remain and four keystream blocks are ready.
	// Whole 64-byte pieces are consumed in order (scalar block, v0-v3,
	// v4-v7). Each cmp against 64 both selects the partial-block path
	// (b.lo) and reports an exact finish (b.eq after the stores).
Ltail_neon:
	add	x2,x2,#256
	cmp	x2,#64
	b.lo	Less_than_64

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64

	// The next unused keystream block goes to the stack scratch area
	// for the byte-wise tail.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	Last_neon

Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	Last_neon
Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	Last_neon

	// Same negative-index byte loop as Loop_tail in the scalar code.
.align	4
Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	// Overwrite the keystream copy on the stack with zeros.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// ChaCha20_512_neon: only its first two instructions fall in this span;
// the rest of the routine follows.
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
824 add x29,sp,#0 825 826 adrp x5,Lsigma@PAGE 827 add x5,x5,Lsigma@PAGEOFF 828 stp x19,x20,[sp,#16] 829 stp x21,x22,[sp,#32] 830 stp x23,x24,[sp,#48] 831 stp x25,x26,[sp,#64] 832 stp x27,x28,[sp,#80] 833 834L512_or_more_neon: 835 sub sp,sp,#128+64 836 837 ldp x22,x23,[x5] // load sigma 838 ld1 {v24.4s},[x5],#16 839 ldp x24,x25,[x3] // load key 840 ldp x26,x27,[x3,#16] 841 ld1 {v25.4s,v26.4s},[x3] 842 ldp x28,x30,[x4] // load counter 843 ld1 {v27.4s},[x4] 844 ld1 {v31.4s},[x5] 845#ifdef __ARMEB__ 846 rev64 v24.4s,v24.4s 847 ror x24,x24,#32 848 ror x25,x25,#32 849 ror x26,x26,#32 850 ror x27,x27,#32 851 ror x28,x28,#32 852 ror x30,x30,#32 853#endif 854 add v27.4s,v27.4s,v31.4s // += 1 855 stp q24,q25,[sp,#0] // off-load key block, invariant part 856 add v27.4s,v27.4s,v31.4s // not typo 857 str q26,[sp,#32] 858 add v28.4s,v27.4s,v31.4s 859 add v29.4s,v28.4s,v31.4s 860 add v30.4s,v29.4s,v31.4s 861 shl v31.4s,v31.4s,#2 // 1 -> 4 862 863 stp d8,d9,[sp,#128+0] // meet ABI requirements 864 stp d10,d11,[sp,#128+16] 865 stp d12,d13,[sp,#128+32] 866 stp d14,d15,[sp,#128+48] 867 868 sub x2,x2,#512 // not typo 869 870Loop_outer_512_neon: 871 mov v0.16b,v24.16b 872 mov v4.16b,v24.16b 873 mov v8.16b,v24.16b 874 mov v12.16b,v24.16b 875 mov v16.16b,v24.16b 876 mov v20.16b,v24.16b 877 mov v1.16b,v25.16b 878 mov w5,w22 // unpack key block 879 mov v5.16b,v25.16b 880 lsr x6,x22,#32 881 mov v9.16b,v25.16b 882 mov w7,w23 883 mov v13.16b,v25.16b 884 lsr x8,x23,#32 885 mov v17.16b,v25.16b 886 mov w9,w24 887 mov v21.16b,v25.16b 888 lsr x10,x24,#32 889 mov v3.16b,v27.16b 890 mov w11,w25 891 mov v7.16b,v28.16b 892 lsr x12,x25,#32 893 mov v11.16b,v29.16b 894 mov w13,w26 895 mov v15.16b,v30.16b 896 lsr x14,x26,#32 897 mov v2.16b,v26.16b 898 mov w15,w27 899 mov v6.16b,v26.16b 900 lsr x16,x27,#32 901 add v19.4s,v3.4s,v31.4s // +4 902 mov w17,w28 903 add v23.4s,v7.4s,v31.4s // +4 904 lsr x19,x28,#32 905 mov v10.16b,v26.16b 906 mov w20,w30 907 mov v14.16b,v26.16b 908 lsr x21,x30,#32 909 mov 
v18.16b,v26.16b 910 stp q27,q28,[sp,#48] // off-load key block, variable part 911 mov v22.16b,v26.16b 912 str q29,[sp,#80] 913 914 mov x4,#5 915 subs x2,x2,#512 916Loop_upper_neon: 917 sub x4,x4,#1 918 add v0.4s,v0.4s,v1.4s 919 add w5,w5,w9 920 add v4.4s,v4.4s,v5.4s 921 add w6,w6,w10 922 add v8.4s,v8.4s,v9.4s 923 add w7,w7,w11 924 add v12.4s,v12.4s,v13.4s 925 add w8,w8,w12 926 add v16.4s,v16.4s,v17.4s 927 eor w17,w17,w5 928 add v20.4s,v20.4s,v21.4s 929 eor w19,w19,w6 930 eor v3.16b,v3.16b,v0.16b 931 eor w20,w20,w7 932 eor v7.16b,v7.16b,v4.16b 933 eor w21,w21,w8 934 eor v11.16b,v11.16b,v8.16b 935 ror w17,w17,#16 936 eor v15.16b,v15.16b,v12.16b 937 ror w19,w19,#16 938 eor v19.16b,v19.16b,v16.16b 939 ror w20,w20,#16 940 eor v23.16b,v23.16b,v20.16b 941 ror w21,w21,#16 942 rev32 v3.8h,v3.8h 943 add w13,w13,w17 944 rev32 v7.8h,v7.8h 945 add w14,w14,w19 946 rev32 v11.8h,v11.8h 947 add w15,w15,w20 948 rev32 v15.8h,v15.8h 949 add w16,w16,w21 950 rev32 v19.8h,v19.8h 951 eor w9,w9,w13 952 rev32 v23.8h,v23.8h 953 eor w10,w10,w14 954 add v2.4s,v2.4s,v3.4s 955 eor w11,w11,w15 956 add v6.4s,v6.4s,v7.4s 957 eor w12,w12,w16 958 add v10.4s,v10.4s,v11.4s 959 ror w9,w9,#20 960 add v14.4s,v14.4s,v15.4s 961 ror w10,w10,#20 962 add v18.4s,v18.4s,v19.4s 963 ror w11,w11,#20 964 add v22.4s,v22.4s,v23.4s 965 ror w12,w12,#20 966 eor v24.16b,v1.16b,v2.16b 967 add w5,w5,w9 968 eor v25.16b,v5.16b,v6.16b 969 add w6,w6,w10 970 eor v26.16b,v9.16b,v10.16b 971 add w7,w7,w11 972 eor v27.16b,v13.16b,v14.16b 973 add w8,w8,w12 974 eor v28.16b,v17.16b,v18.16b 975 eor w17,w17,w5 976 eor v29.16b,v21.16b,v22.16b 977 eor w19,w19,w6 978 ushr v1.4s,v24.4s,#20 979 eor w20,w20,w7 980 ushr v5.4s,v25.4s,#20 981 eor w21,w21,w8 982 ushr v9.4s,v26.4s,#20 983 ror w17,w17,#24 984 ushr v13.4s,v27.4s,#20 985 ror w19,w19,#24 986 ushr v17.4s,v28.4s,#20 987 ror w20,w20,#24 988 ushr v21.4s,v29.4s,#20 989 ror w21,w21,#24 990 sli v1.4s,v24.4s,#12 991 add w13,w13,w17 992 sli v5.4s,v25.4s,#12 993 add w14,w14,w19 994 sli 
v9.4s,v26.4s,#12 995 add w15,w15,w20 996 sli v13.4s,v27.4s,#12 997 add w16,w16,w21 998 sli v17.4s,v28.4s,#12 999 eor w9,w9,w13 1000 sli v21.4s,v29.4s,#12 1001 eor w10,w10,w14 1002 add v0.4s,v0.4s,v1.4s 1003 eor w11,w11,w15 1004 add v4.4s,v4.4s,v5.4s 1005 eor w12,w12,w16 1006 add v8.4s,v8.4s,v9.4s 1007 ror w9,w9,#25 1008 add v12.4s,v12.4s,v13.4s 1009 ror w10,w10,#25 1010 add v16.4s,v16.4s,v17.4s 1011 ror w11,w11,#25 1012 add v20.4s,v20.4s,v21.4s 1013 ror w12,w12,#25 1014 eor v24.16b,v3.16b,v0.16b 1015 add w5,w5,w10 1016 eor v25.16b,v7.16b,v4.16b 1017 add w6,w6,w11 1018 eor v26.16b,v11.16b,v8.16b 1019 add w7,w7,w12 1020 eor v27.16b,v15.16b,v12.16b 1021 add w8,w8,w9 1022 eor v28.16b,v19.16b,v16.16b 1023 eor w21,w21,w5 1024 eor v29.16b,v23.16b,v20.16b 1025 eor w17,w17,w6 1026 ushr v3.4s,v24.4s,#24 1027 eor w19,w19,w7 1028 ushr v7.4s,v25.4s,#24 1029 eor w20,w20,w8 1030 ushr v11.4s,v26.4s,#24 1031 ror w21,w21,#16 1032 ushr v15.4s,v27.4s,#24 1033 ror w17,w17,#16 1034 ushr v19.4s,v28.4s,#24 1035 ror w19,w19,#16 1036 ushr v23.4s,v29.4s,#24 1037 ror w20,w20,#16 1038 sli v3.4s,v24.4s,#8 1039 add w15,w15,w21 1040 sli v7.4s,v25.4s,#8 1041 add w16,w16,w17 1042 sli v11.4s,v26.4s,#8 1043 add w13,w13,w19 1044 sli v15.4s,v27.4s,#8 1045 add w14,w14,w20 1046 sli v19.4s,v28.4s,#8 1047 eor w10,w10,w15 1048 sli v23.4s,v29.4s,#8 1049 eor w11,w11,w16 1050 add v2.4s,v2.4s,v3.4s 1051 eor w12,w12,w13 1052 add v6.4s,v6.4s,v7.4s 1053 eor w9,w9,w14 1054 add v10.4s,v10.4s,v11.4s 1055 ror w10,w10,#20 1056 add v14.4s,v14.4s,v15.4s 1057 ror w11,w11,#20 1058 add v18.4s,v18.4s,v19.4s 1059 ror w12,w12,#20 1060 add v22.4s,v22.4s,v23.4s 1061 ror w9,w9,#20 1062 eor v24.16b,v1.16b,v2.16b 1063 add w5,w5,w10 1064 eor v25.16b,v5.16b,v6.16b 1065 add w6,w6,w11 1066 eor v26.16b,v9.16b,v10.16b 1067 add w7,w7,w12 1068 eor v27.16b,v13.16b,v14.16b 1069 add w8,w8,w9 1070 eor v28.16b,v17.16b,v18.16b 1071 eor w21,w21,w5 1072 eor v29.16b,v21.16b,v22.16b 1073 eor w17,w17,w6 1074 ushr v1.4s,v24.4s,#25 1075 eor w19,w19,w7 
1076 ushr v5.4s,v25.4s,#25 1077 eor w20,w20,w8 1078 ushr v9.4s,v26.4s,#25 1079 ror w21,w21,#24 1080 ushr v13.4s,v27.4s,#25 1081 ror w17,w17,#24 1082 ushr v17.4s,v28.4s,#25 1083 ror w19,w19,#24 1084 ushr v21.4s,v29.4s,#25 1085 ror w20,w20,#24 1086 sli v1.4s,v24.4s,#7 1087 add w15,w15,w21 1088 sli v5.4s,v25.4s,#7 1089 add w16,w16,w17 1090 sli v9.4s,v26.4s,#7 1091 add w13,w13,w19 1092 sli v13.4s,v27.4s,#7 1093 add w14,w14,w20 1094 sli v17.4s,v28.4s,#7 1095 eor w10,w10,w15 1096 sli v21.4s,v29.4s,#7 1097 eor w11,w11,w16 1098 ext v2.16b,v2.16b,v2.16b,#8 1099 eor w12,w12,w13 1100 ext v6.16b,v6.16b,v6.16b,#8 1101 eor w9,w9,w14 1102 ext v10.16b,v10.16b,v10.16b,#8 1103 ror w10,w10,#25 1104 ext v14.16b,v14.16b,v14.16b,#8 1105 ror w11,w11,#25 1106 ext v18.16b,v18.16b,v18.16b,#8 1107 ror w12,w12,#25 1108 ext v22.16b,v22.16b,v22.16b,#8 1109 ror w9,w9,#25 1110 ext v3.16b,v3.16b,v3.16b,#12 1111 ext v7.16b,v7.16b,v7.16b,#12 1112 ext v11.16b,v11.16b,v11.16b,#12 1113 ext v15.16b,v15.16b,v15.16b,#12 1114 ext v19.16b,v19.16b,v19.16b,#12 1115 ext v23.16b,v23.16b,v23.16b,#12 1116 ext v1.16b,v1.16b,v1.16b,#4 1117 ext v5.16b,v5.16b,v5.16b,#4 1118 ext v9.16b,v9.16b,v9.16b,#4 1119 ext v13.16b,v13.16b,v13.16b,#4 1120 ext v17.16b,v17.16b,v17.16b,#4 1121 ext v21.16b,v21.16b,v21.16b,#4 1122 add v0.4s,v0.4s,v1.4s 1123 add w5,w5,w9 1124 add v4.4s,v4.4s,v5.4s 1125 add w6,w6,w10 1126 add v8.4s,v8.4s,v9.4s 1127 add w7,w7,w11 1128 add v12.4s,v12.4s,v13.4s 1129 add w8,w8,w12 1130 add v16.4s,v16.4s,v17.4s 1131 eor w17,w17,w5 1132 add v20.4s,v20.4s,v21.4s 1133 eor w19,w19,w6 1134 eor v3.16b,v3.16b,v0.16b 1135 eor w20,w20,w7 1136 eor v7.16b,v7.16b,v4.16b 1137 eor w21,w21,w8 1138 eor v11.16b,v11.16b,v8.16b 1139 ror w17,w17,#16 1140 eor v15.16b,v15.16b,v12.16b 1141 ror w19,w19,#16 1142 eor v19.16b,v19.16b,v16.16b 1143 ror w20,w20,#16 1144 eor v23.16b,v23.16b,v20.16b 1145 ror w21,w21,#16 1146 rev32 v3.8h,v3.8h 1147 add w13,w13,w17 1148 rev32 v7.8h,v7.8h 1149 add w14,w14,w19 1150 rev32 v11.8h,v11.8h 1151 add 
w15,w15,w20 1152 rev32 v15.8h,v15.8h 1153 add w16,w16,w21 1154 rev32 v19.8h,v19.8h 1155 eor w9,w9,w13 1156 rev32 v23.8h,v23.8h 1157 eor w10,w10,w14 1158 add v2.4s,v2.4s,v3.4s 1159 eor w11,w11,w15 1160 add v6.4s,v6.4s,v7.4s 1161 eor w12,w12,w16 1162 add v10.4s,v10.4s,v11.4s 1163 ror w9,w9,#20 1164 add v14.4s,v14.4s,v15.4s 1165 ror w10,w10,#20 1166 add v18.4s,v18.4s,v19.4s 1167 ror w11,w11,#20 1168 add v22.4s,v22.4s,v23.4s 1169 ror w12,w12,#20 1170 eor v24.16b,v1.16b,v2.16b 1171 add w5,w5,w9 1172 eor v25.16b,v5.16b,v6.16b 1173 add w6,w6,w10 1174 eor v26.16b,v9.16b,v10.16b 1175 add w7,w7,w11 1176 eor v27.16b,v13.16b,v14.16b 1177 add w8,w8,w12 1178 eor v28.16b,v17.16b,v18.16b 1179 eor w17,w17,w5 1180 eor v29.16b,v21.16b,v22.16b 1181 eor w19,w19,w6 1182 ushr v1.4s,v24.4s,#20 1183 eor w20,w20,w7 1184 ushr v5.4s,v25.4s,#20 1185 eor w21,w21,w8 1186 ushr v9.4s,v26.4s,#20 1187 ror w17,w17,#24 1188 ushr v13.4s,v27.4s,#20 1189 ror w19,w19,#24 1190 ushr v17.4s,v28.4s,#20 1191 ror w20,w20,#24 1192 ushr v21.4s,v29.4s,#20 1193 ror w21,w21,#24 1194 sli v1.4s,v24.4s,#12 1195 add w13,w13,w17 1196 sli v5.4s,v25.4s,#12 1197 add w14,w14,w19 1198 sli v9.4s,v26.4s,#12 1199 add w15,w15,w20 1200 sli v13.4s,v27.4s,#12 1201 add w16,w16,w21 1202 sli v17.4s,v28.4s,#12 1203 eor w9,w9,w13 1204 sli v21.4s,v29.4s,#12 1205 eor w10,w10,w14 1206 add v0.4s,v0.4s,v1.4s 1207 eor w11,w11,w15 1208 add v4.4s,v4.4s,v5.4s 1209 eor w12,w12,w16 1210 add v8.4s,v8.4s,v9.4s 1211 ror w9,w9,#25 1212 add v12.4s,v12.4s,v13.4s 1213 ror w10,w10,#25 1214 add v16.4s,v16.4s,v17.4s 1215 ror w11,w11,#25 1216 add v20.4s,v20.4s,v21.4s 1217 ror w12,w12,#25 1218 eor v24.16b,v3.16b,v0.16b 1219 add w5,w5,w10 1220 eor v25.16b,v7.16b,v4.16b 1221 add w6,w6,w11 1222 eor v26.16b,v11.16b,v8.16b 1223 add w7,w7,w12 1224 eor v27.16b,v15.16b,v12.16b 1225 add w8,w8,w9 1226 eor v28.16b,v19.16b,v16.16b 1227 eor w21,w21,w5 1228 eor v29.16b,v23.16b,v20.16b 1229 eor w17,w17,w6 1230 ushr v3.4s,v24.4s,#24 1231 eor w19,w19,w7 1232 ushr 
v7.4s,v25.4s,#24 1233 eor w20,w20,w8 1234 ushr v11.4s,v26.4s,#24 1235 ror w21,w21,#16 1236 ushr v15.4s,v27.4s,#24 1237 ror w17,w17,#16 1238 ushr v19.4s,v28.4s,#24 1239 ror w19,w19,#16 1240 ushr v23.4s,v29.4s,#24 1241 ror w20,w20,#16 1242 sli v3.4s,v24.4s,#8 1243 add w15,w15,w21 1244 sli v7.4s,v25.4s,#8 1245 add w16,w16,w17 1246 sli v11.4s,v26.4s,#8 1247 add w13,w13,w19 1248 sli v15.4s,v27.4s,#8 1249 add w14,w14,w20 1250 sli v19.4s,v28.4s,#8 1251 eor w10,w10,w15 1252 sli v23.4s,v29.4s,#8 1253 eor w11,w11,w16 1254 add v2.4s,v2.4s,v3.4s 1255 eor w12,w12,w13 1256 add v6.4s,v6.4s,v7.4s 1257 eor w9,w9,w14 1258 add v10.4s,v10.4s,v11.4s 1259 ror w10,w10,#20 1260 add v14.4s,v14.4s,v15.4s 1261 ror w11,w11,#20 1262 add v18.4s,v18.4s,v19.4s 1263 ror w12,w12,#20 1264 add v22.4s,v22.4s,v23.4s 1265 ror w9,w9,#20 1266 eor v24.16b,v1.16b,v2.16b 1267 add w5,w5,w10 1268 eor v25.16b,v5.16b,v6.16b 1269 add w6,w6,w11 1270 eor v26.16b,v9.16b,v10.16b 1271 add w7,w7,w12 1272 eor v27.16b,v13.16b,v14.16b 1273 add w8,w8,w9 1274 eor v28.16b,v17.16b,v18.16b 1275 eor w21,w21,w5 1276 eor v29.16b,v21.16b,v22.16b 1277 eor w17,w17,w6 1278 ushr v1.4s,v24.4s,#25 1279 eor w19,w19,w7 1280 ushr v5.4s,v25.4s,#25 1281 eor w20,w20,w8 1282 ushr v9.4s,v26.4s,#25 1283 ror w21,w21,#24 1284 ushr v13.4s,v27.4s,#25 1285 ror w17,w17,#24 1286 ushr v17.4s,v28.4s,#25 1287 ror w19,w19,#24 1288 ushr v21.4s,v29.4s,#25 1289 ror w20,w20,#24 1290 sli v1.4s,v24.4s,#7 1291 add w15,w15,w21 1292 sli v5.4s,v25.4s,#7 1293 add w16,w16,w17 1294 sli v9.4s,v26.4s,#7 1295 add w13,w13,w19 1296 sli v13.4s,v27.4s,#7 1297 add w14,w14,w20 1298 sli v17.4s,v28.4s,#7 1299 eor w10,w10,w15 1300 sli v21.4s,v29.4s,#7 1301 eor w11,w11,w16 1302 ext v2.16b,v2.16b,v2.16b,#8 1303 eor w12,w12,w13 1304 ext v6.16b,v6.16b,v6.16b,#8 1305 eor w9,w9,w14 1306 ext v10.16b,v10.16b,v10.16b,#8 1307 ror w10,w10,#25 1308 ext v14.16b,v14.16b,v14.16b,#8 1309 ror w11,w11,#25 1310 ext v18.16b,v18.16b,v18.16b,#8 1311 ror w12,w12,#25 1312 ext v22.16b,v22.16b,v22.16b,#8 
1313 ror w9,w9,#25 1314 ext v3.16b,v3.16b,v3.16b,#4 1315 ext v7.16b,v7.16b,v7.16b,#4 1316 ext v11.16b,v11.16b,v11.16b,#4 1317 ext v15.16b,v15.16b,v15.16b,#4 1318 ext v19.16b,v19.16b,v19.16b,#4 1319 ext v23.16b,v23.16b,v23.16b,#4 1320 ext v1.16b,v1.16b,v1.16b,#12 1321 ext v5.16b,v5.16b,v5.16b,#12 1322 ext v9.16b,v9.16b,v9.16b,#12 1323 ext v13.16b,v13.16b,v13.16b,#12 1324 ext v17.16b,v17.16b,v17.16b,#12 1325 ext v21.16b,v21.16b,v21.16b,#12 1326 cbnz x4,Loop_upper_neon 1327 1328 add w5,w5,w22 // accumulate key block 1329 add x6,x6,x22,lsr#32 1330 add w7,w7,w23 1331 add x8,x8,x23,lsr#32 1332 add w9,w9,w24 1333 add x10,x10,x24,lsr#32 1334 add w11,w11,w25 1335 add x12,x12,x25,lsr#32 1336 add w13,w13,w26 1337 add x14,x14,x26,lsr#32 1338 add w15,w15,w27 1339 add x16,x16,x27,lsr#32 1340 add w17,w17,w28 1341 add x19,x19,x28,lsr#32 1342 add w20,w20,w30 1343 add x21,x21,x30,lsr#32 1344 1345 add x5,x5,x6,lsl#32 // pack 1346 add x7,x7,x8,lsl#32 1347 ldp x6,x8,[x1,#0] // load input 1348 add x9,x9,x10,lsl#32 1349 add x11,x11,x12,lsl#32 1350 ldp x10,x12,[x1,#16] 1351 add x13,x13,x14,lsl#32 1352 add x15,x15,x16,lsl#32 1353 ldp x14,x16,[x1,#32] 1354 add x17,x17,x19,lsl#32 1355 add x20,x20,x21,lsl#32 1356 ldp x19,x21,[x1,#48] 1357 add x1,x1,#64 1358#ifdef __ARMEB__ 1359 rev x5,x5 1360 rev x7,x7 1361 rev x9,x9 1362 rev x11,x11 1363 rev x13,x13 1364 rev x15,x15 1365 rev x17,x17 1366 rev x20,x20 1367#endif 1368 eor x5,x5,x6 1369 eor x7,x7,x8 1370 eor x9,x9,x10 1371 eor x11,x11,x12 1372 eor x13,x13,x14 1373 eor x15,x15,x16 1374 eor x17,x17,x19 1375 eor x20,x20,x21 1376 1377 stp x5,x7,[x0,#0] // store output 1378 add x28,x28,#1 // increment counter 1379 mov w5,w22 // unpack key block 1380 lsr x6,x22,#32 1381 stp x9,x11,[x0,#16] 1382 mov w7,w23 1383 lsr x8,x23,#32 1384 stp x13,x15,[x0,#32] 1385 mov w9,w24 1386 lsr x10,x24,#32 1387 stp x17,x20,[x0,#48] 1388 add x0,x0,#64 1389 mov w11,w25 1390 lsr x12,x25,#32 1391 mov w13,w26 1392 lsr x14,x26,#32 1393 mov w15,w27 1394 lsr x16,x27,#32 1395 mov 
w17,w28 1396 lsr x19,x28,#32 1397 mov w20,w30 1398 lsr x21,x30,#32 1399 1400 mov x4,#5 1401Loop_lower_neon: 1402 sub x4,x4,#1 1403 add v0.4s,v0.4s,v1.4s 1404 add w5,w5,w9 1405 add v4.4s,v4.4s,v5.4s 1406 add w6,w6,w10 1407 add v8.4s,v8.4s,v9.4s 1408 add w7,w7,w11 1409 add v12.4s,v12.4s,v13.4s 1410 add w8,w8,w12 1411 add v16.4s,v16.4s,v17.4s 1412 eor w17,w17,w5 1413 add v20.4s,v20.4s,v21.4s 1414 eor w19,w19,w6 1415 eor v3.16b,v3.16b,v0.16b 1416 eor w20,w20,w7 1417 eor v7.16b,v7.16b,v4.16b 1418 eor w21,w21,w8 1419 eor v11.16b,v11.16b,v8.16b 1420 ror w17,w17,#16 1421 eor v15.16b,v15.16b,v12.16b 1422 ror w19,w19,#16 1423 eor v19.16b,v19.16b,v16.16b 1424 ror w20,w20,#16 1425 eor v23.16b,v23.16b,v20.16b 1426 ror w21,w21,#16 1427 rev32 v3.8h,v3.8h 1428 add w13,w13,w17 1429 rev32 v7.8h,v7.8h 1430 add w14,w14,w19 1431 rev32 v11.8h,v11.8h 1432 add w15,w15,w20 1433 rev32 v15.8h,v15.8h 1434 add w16,w16,w21 1435 rev32 v19.8h,v19.8h 1436 eor w9,w9,w13 1437 rev32 v23.8h,v23.8h 1438 eor w10,w10,w14 1439 add v2.4s,v2.4s,v3.4s 1440 eor w11,w11,w15 1441 add v6.4s,v6.4s,v7.4s 1442 eor w12,w12,w16 1443 add v10.4s,v10.4s,v11.4s 1444 ror w9,w9,#20 1445 add v14.4s,v14.4s,v15.4s 1446 ror w10,w10,#20 1447 add v18.4s,v18.4s,v19.4s 1448 ror w11,w11,#20 1449 add v22.4s,v22.4s,v23.4s 1450 ror w12,w12,#20 1451 eor v24.16b,v1.16b,v2.16b 1452 add w5,w5,w9 1453 eor v25.16b,v5.16b,v6.16b 1454 add w6,w6,w10 1455 eor v26.16b,v9.16b,v10.16b 1456 add w7,w7,w11 1457 eor v27.16b,v13.16b,v14.16b 1458 add w8,w8,w12 1459 eor v28.16b,v17.16b,v18.16b 1460 eor w17,w17,w5 1461 eor v29.16b,v21.16b,v22.16b 1462 eor w19,w19,w6 1463 ushr v1.4s,v24.4s,#20 1464 eor w20,w20,w7 1465 ushr v5.4s,v25.4s,#20 1466 eor w21,w21,w8 1467 ushr v9.4s,v26.4s,#20 1468 ror w17,w17,#24 1469 ushr v13.4s,v27.4s,#20 1470 ror w19,w19,#24 1471 ushr v17.4s,v28.4s,#20 1472 ror w20,w20,#24 1473 ushr v21.4s,v29.4s,#20 1474 ror w21,w21,#24 1475 sli v1.4s,v24.4s,#12 1476 add w13,w13,w17 1477 sli v5.4s,v25.4s,#12 1478 add w14,w14,w19 1479 sli 
v9.4s,v26.4s,#12 1480 add w15,w15,w20 1481 sli v13.4s,v27.4s,#12 1482 add w16,w16,w21 1483 sli v17.4s,v28.4s,#12 1484 eor w9,w9,w13 1485 sli v21.4s,v29.4s,#12 1486 eor w10,w10,w14 1487 add v0.4s,v0.4s,v1.4s 1488 eor w11,w11,w15 1489 add v4.4s,v4.4s,v5.4s 1490 eor w12,w12,w16 1491 add v8.4s,v8.4s,v9.4s 1492 ror w9,w9,#25 1493 add v12.4s,v12.4s,v13.4s 1494 ror w10,w10,#25 1495 add v16.4s,v16.4s,v17.4s 1496 ror w11,w11,#25 1497 add v20.4s,v20.4s,v21.4s 1498 ror w12,w12,#25 1499 eor v24.16b,v3.16b,v0.16b 1500 add w5,w5,w10 1501 eor v25.16b,v7.16b,v4.16b 1502 add w6,w6,w11 1503 eor v26.16b,v11.16b,v8.16b 1504 add w7,w7,w12 1505 eor v27.16b,v15.16b,v12.16b 1506 add w8,w8,w9 1507 eor v28.16b,v19.16b,v16.16b 1508 eor w21,w21,w5 1509 eor v29.16b,v23.16b,v20.16b 1510 eor w17,w17,w6 1511 ushr v3.4s,v24.4s,#24 1512 eor w19,w19,w7 1513 ushr v7.4s,v25.4s,#24 1514 eor w20,w20,w8 1515 ushr v11.4s,v26.4s,#24 1516 ror w21,w21,#16 1517 ushr v15.4s,v27.4s,#24 1518 ror w17,w17,#16 1519 ushr v19.4s,v28.4s,#24 1520 ror w19,w19,#16 1521 ushr v23.4s,v29.4s,#24 1522 ror w20,w20,#16 1523 sli v3.4s,v24.4s,#8 1524 add w15,w15,w21 1525 sli v7.4s,v25.4s,#8 1526 add w16,w16,w17 1527 sli v11.4s,v26.4s,#8 1528 add w13,w13,w19 1529 sli v15.4s,v27.4s,#8 1530 add w14,w14,w20 1531 sli v19.4s,v28.4s,#8 1532 eor w10,w10,w15 1533 sli v23.4s,v29.4s,#8 1534 eor w11,w11,w16 1535 add v2.4s,v2.4s,v3.4s 1536 eor w12,w12,w13 1537 add v6.4s,v6.4s,v7.4s 1538 eor w9,w9,w14 1539 add v10.4s,v10.4s,v11.4s 1540 ror w10,w10,#20 1541 add v14.4s,v14.4s,v15.4s 1542 ror w11,w11,#20 1543 add v18.4s,v18.4s,v19.4s 1544 ror w12,w12,#20 1545 add v22.4s,v22.4s,v23.4s 1546 ror w9,w9,#20 1547 eor v24.16b,v1.16b,v2.16b 1548 add w5,w5,w10 1549 eor v25.16b,v5.16b,v6.16b 1550 add w6,w6,w11 1551 eor v26.16b,v9.16b,v10.16b 1552 add w7,w7,w12 1553 eor v27.16b,v13.16b,v14.16b 1554 add w8,w8,w9 1555 eor v28.16b,v17.16b,v18.16b 1556 eor w21,w21,w5 1557 eor v29.16b,v21.16b,v22.16b 1558 eor w17,w17,w6 1559 ushr v1.4s,v24.4s,#25 1560 eor 
w19,w19,w7 1561 ushr v5.4s,v25.4s,#25 1562 eor w20,w20,w8 1563 ushr v9.4s,v26.4s,#25 1564 ror w21,w21,#24 1565 ushr v13.4s,v27.4s,#25 1566 ror w17,w17,#24 1567 ushr v17.4s,v28.4s,#25 1568 ror w19,w19,#24 1569 ushr v21.4s,v29.4s,#25 1570 ror w20,w20,#24 1571 sli v1.4s,v24.4s,#7 1572 add w15,w15,w21 1573 sli v5.4s,v25.4s,#7 1574 add w16,w16,w17 1575 sli v9.4s,v26.4s,#7 1576 add w13,w13,w19 1577 sli v13.4s,v27.4s,#7 1578 add w14,w14,w20 1579 sli v17.4s,v28.4s,#7 1580 eor w10,w10,w15 1581 sli v21.4s,v29.4s,#7 1582 eor w11,w11,w16 1583 ext v2.16b,v2.16b,v2.16b,#8 1584 eor w12,w12,w13 1585 ext v6.16b,v6.16b,v6.16b,#8 1586 eor w9,w9,w14 1587 ext v10.16b,v10.16b,v10.16b,#8 1588 ror w10,w10,#25 1589 ext v14.16b,v14.16b,v14.16b,#8 1590 ror w11,w11,#25 1591 ext v18.16b,v18.16b,v18.16b,#8 1592 ror w12,w12,#25 1593 ext v22.16b,v22.16b,v22.16b,#8 1594 ror w9,w9,#25 1595 ext v3.16b,v3.16b,v3.16b,#12 1596 ext v7.16b,v7.16b,v7.16b,#12 1597 ext v11.16b,v11.16b,v11.16b,#12 1598 ext v15.16b,v15.16b,v15.16b,#12 1599 ext v19.16b,v19.16b,v19.16b,#12 1600 ext v23.16b,v23.16b,v23.16b,#12 1601 ext v1.16b,v1.16b,v1.16b,#4 1602 ext v5.16b,v5.16b,v5.16b,#4 1603 ext v9.16b,v9.16b,v9.16b,#4 1604 ext v13.16b,v13.16b,v13.16b,#4 1605 ext v17.16b,v17.16b,v17.16b,#4 1606 ext v21.16b,v21.16b,v21.16b,#4 1607 add v0.4s,v0.4s,v1.4s 1608 add w5,w5,w9 1609 add v4.4s,v4.4s,v5.4s 1610 add w6,w6,w10 1611 add v8.4s,v8.4s,v9.4s 1612 add w7,w7,w11 1613 add v12.4s,v12.4s,v13.4s 1614 add w8,w8,w12 1615 add v16.4s,v16.4s,v17.4s 1616 eor w17,w17,w5 1617 add v20.4s,v20.4s,v21.4s 1618 eor w19,w19,w6 1619 eor v3.16b,v3.16b,v0.16b 1620 eor w20,w20,w7 1621 eor v7.16b,v7.16b,v4.16b 1622 eor w21,w21,w8 1623 eor v11.16b,v11.16b,v8.16b 1624 ror w17,w17,#16 1625 eor v15.16b,v15.16b,v12.16b 1626 ror w19,w19,#16 1627 eor v19.16b,v19.16b,v16.16b 1628 ror w20,w20,#16 1629 eor v23.16b,v23.16b,v20.16b 1630 ror w21,w21,#16 1631 rev32 v3.8h,v3.8h 1632 add w13,w13,w17 1633 rev32 v7.8h,v7.8h 1634 add w14,w14,w19 1635 rev32 
v11.8h,v11.8h 1636 add w15,w15,w20 1637 rev32 v15.8h,v15.8h 1638 add w16,w16,w21 1639 rev32 v19.8h,v19.8h 1640 eor w9,w9,w13 1641 rev32 v23.8h,v23.8h 1642 eor w10,w10,w14 1643 add v2.4s,v2.4s,v3.4s 1644 eor w11,w11,w15 1645 add v6.4s,v6.4s,v7.4s 1646 eor w12,w12,w16 1647 add v10.4s,v10.4s,v11.4s 1648 ror w9,w9,#20 1649 add v14.4s,v14.4s,v15.4s 1650 ror w10,w10,#20 1651 add v18.4s,v18.4s,v19.4s 1652 ror w11,w11,#20 1653 add v22.4s,v22.4s,v23.4s 1654 ror w12,w12,#20 1655 eor v24.16b,v1.16b,v2.16b 1656 add w5,w5,w9 1657 eor v25.16b,v5.16b,v6.16b 1658 add w6,w6,w10 1659 eor v26.16b,v9.16b,v10.16b 1660 add w7,w7,w11 1661 eor v27.16b,v13.16b,v14.16b 1662 add w8,w8,w12 1663 eor v28.16b,v17.16b,v18.16b 1664 eor w17,w17,w5 1665 eor v29.16b,v21.16b,v22.16b 1666 eor w19,w19,w6 1667 ushr v1.4s,v24.4s,#20 1668 eor w20,w20,w7 1669 ushr v5.4s,v25.4s,#20 1670 eor w21,w21,w8 1671 ushr v9.4s,v26.4s,#20 1672 ror w17,w17,#24 1673 ushr v13.4s,v27.4s,#20 1674 ror w19,w19,#24 1675 ushr v17.4s,v28.4s,#20 1676 ror w20,w20,#24 1677 ushr v21.4s,v29.4s,#20 1678 ror w21,w21,#24 1679 sli v1.4s,v24.4s,#12 1680 add w13,w13,w17 1681 sli v5.4s,v25.4s,#12 1682 add w14,w14,w19 1683 sli v9.4s,v26.4s,#12 1684 add w15,w15,w20 1685 sli v13.4s,v27.4s,#12 1686 add w16,w16,w21 1687 sli v17.4s,v28.4s,#12 1688 eor w9,w9,w13 1689 sli v21.4s,v29.4s,#12 1690 eor w10,w10,w14 1691 add v0.4s,v0.4s,v1.4s 1692 eor w11,w11,w15 1693 add v4.4s,v4.4s,v5.4s 1694 eor w12,w12,w16 1695 add v8.4s,v8.4s,v9.4s 1696 ror w9,w9,#25 1697 add v12.4s,v12.4s,v13.4s 1698 ror w10,w10,#25 1699 add v16.4s,v16.4s,v17.4s 1700 ror w11,w11,#25 1701 add v20.4s,v20.4s,v21.4s 1702 ror w12,w12,#25 1703 eor v24.16b,v3.16b,v0.16b 1704 add w5,w5,w10 1705 eor v25.16b,v7.16b,v4.16b 1706 add w6,w6,w11 1707 eor v26.16b,v11.16b,v8.16b 1708 add w7,w7,w12 1709 eor v27.16b,v15.16b,v12.16b 1710 add w8,w8,w9 1711 eor v28.16b,v19.16b,v16.16b 1712 eor w21,w21,w5 1713 eor v29.16b,v23.16b,v20.16b 1714 eor w17,w17,w6 1715 ushr v3.4s,v24.4s,#24 1716 eor w19,w19,w7 
1717 ushr v7.4s,v25.4s,#24 1718 eor w20,w20,w8 1719 ushr v11.4s,v26.4s,#24 1720 ror w21,w21,#16 1721 ushr v15.4s,v27.4s,#24 1722 ror w17,w17,#16 1723 ushr v19.4s,v28.4s,#24 1724 ror w19,w19,#16 1725 ushr v23.4s,v29.4s,#24 1726 ror w20,w20,#16 1727 sli v3.4s,v24.4s,#8 1728 add w15,w15,w21 1729 sli v7.4s,v25.4s,#8 1730 add w16,w16,w17 1731 sli v11.4s,v26.4s,#8 1732 add w13,w13,w19 1733 sli v15.4s,v27.4s,#8 1734 add w14,w14,w20 1735 sli v19.4s,v28.4s,#8 1736 eor w10,w10,w15 1737 sli v23.4s,v29.4s,#8 1738 eor w11,w11,w16 1739 add v2.4s,v2.4s,v3.4s 1740 eor w12,w12,w13 1741 add v6.4s,v6.4s,v7.4s 1742 eor w9,w9,w14 1743 add v10.4s,v10.4s,v11.4s 1744 ror w10,w10,#20 1745 add v14.4s,v14.4s,v15.4s 1746 ror w11,w11,#20 1747 add v18.4s,v18.4s,v19.4s 1748 ror w12,w12,#20 1749 add v22.4s,v22.4s,v23.4s 1750 ror w9,w9,#20 1751 eor v24.16b,v1.16b,v2.16b 1752 add w5,w5,w10 1753 eor v25.16b,v5.16b,v6.16b 1754 add w6,w6,w11 1755 eor v26.16b,v9.16b,v10.16b 1756 add w7,w7,w12 1757 eor v27.16b,v13.16b,v14.16b 1758 add w8,w8,w9 1759 eor v28.16b,v17.16b,v18.16b 1760 eor w21,w21,w5 1761 eor v29.16b,v21.16b,v22.16b 1762 eor w17,w17,w6 1763 ushr v1.4s,v24.4s,#25 1764 eor w19,w19,w7 1765 ushr v5.4s,v25.4s,#25 1766 eor w20,w20,w8 1767 ushr v9.4s,v26.4s,#25 1768 ror w21,w21,#24 1769 ushr v13.4s,v27.4s,#25 1770 ror w17,w17,#24 1771 ushr v17.4s,v28.4s,#25 1772 ror w19,w19,#24 1773 ushr v21.4s,v29.4s,#25 1774 ror w20,w20,#24 1775 sli v1.4s,v24.4s,#7 1776 add w15,w15,w21 1777 sli v5.4s,v25.4s,#7 1778 add w16,w16,w17 1779 sli v9.4s,v26.4s,#7 1780 add w13,w13,w19 1781 sli v13.4s,v27.4s,#7 1782 add w14,w14,w20 1783 sli v17.4s,v28.4s,#7 1784 eor w10,w10,w15 1785 sli v21.4s,v29.4s,#7 1786 eor w11,w11,w16 1787 ext v2.16b,v2.16b,v2.16b,#8 1788 eor w12,w12,w13 1789 ext v6.16b,v6.16b,v6.16b,#8 1790 eor w9,w9,w14 1791 ext v10.16b,v10.16b,v10.16b,#8 1792 ror w10,w10,#25 1793 ext v14.16b,v14.16b,v14.16b,#8 1794 ror w11,w11,#25 1795 ext v18.16b,v18.16b,v18.16b,#8 1796 ror w12,w12,#25 1797 ext 
v22.16b,v22.16b,v22.16b,#8 1798 ror w9,w9,#25 1799 ext v3.16b,v3.16b,v3.16b,#4 1800 ext v7.16b,v7.16b,v7.16b,#4 1801 ext v11.16b,v11.16b,v11.16b,#4 1802 ext v15.16b,v15.16b,v15.16b,#4 1803 ext v19.16b,v19.16b,v19.16b,#4 1804 ext v23.16b,v23.16b,v23.16b,#4 1805 ext v1.16b,v1.16b,v1.16b,#12 1806 ext v5.16b,v5.16b,v5.16b,#12 1807 ext v9.16b,v9.16b,v9.16b,#12 1808 ext v13.16b,v13.16b,v13.16b,#12 1809 ext v17.16b,v17.16b,v17.16b,#12 1810 ext v21.16b,v21.16b,v21.16b,#12 1811 cbnz x4,Loop_lower_neon 1812 1813 add w5,w5,w22 // accumulate key block 1814 ldp q24,q25,[sp,#0] 1815 add x6,x6,x22,lsr#32 1816 ldp q26,q27,[sp,#32] 1817 add w7,w7,w23 1818 ldp q28,q29,[sp,#64] 1819 add x8,x8,x23,lsr#32 1820 add v0.4s,v0.4s,v24.4s 1821 add w9,w9,w24 1822 add v4.4s,v4.4s,v24.4s 1823 add x10,x10,x24,lsr#32 1824 add v8.4s,v8.4s,v24.4s 1825 add w11,w11,w25 1826 add v12.4s,v12.4s,v24.4s 1827 add x12,x12,x25,lsr#32 1828 add v16.4s,v16.4s,v24.4s 1829 add w13,w13,w26 1830 add v20.4s,v20.4s,v24.4s 1831 add x14,x14,x26,lsr#32 1832 add v2.4s,v2.4s,v26.4s 1833 add w15,w15,w27 1834 add v6.4s,v6.4s,v26.4s 1835 add x16,x16,x27,lsr#32 1836 add v10.4s,v10.4s,v26.4s 1837 add w17,w17,w28 1838 add v14.4s,v14.4s,v26.4s 1839 add x19,x19,x28,lsr#32 1840 add v18.4s,v18.4s,v26.4s 1841 add w20,w20,w30 1842 add v22.4s,v22.4s,v26.4s 1843 add x21,x21,x30,lsr#32 1844 add v19.4s,v19.4s,v31.4s // +4 1845 add x5,x5,x6,lsl#32 // pack 1846 add v23.4s,v23.4s,v31.4s // +4 1847 add x7,x7,x8,lsl#32 1848 add v3.4s,v3.4s,v27.4s 1849 ldp x6,x8,[x1,#0] // load input 1850 add v7.4s,v7.4s,v28.4s 1851 add x9,x9,x10,lsl#32 1852 add v11.4s,v11.4s,v29.4s 1853 add x11,x11,x12,lsl#32 1854 add v15.4s,v15.4s,v30.4s 1855 ldp x10,x12,[x1,#16] 1856 add v19.4s,v19.4s,v27.4s 1857 add x13,x13,x14,lsl#32 1858 add v23.4s,v23.4s,v28.4s 1859 add x15,x15,x16,lsl#32 1860 add v1.4s,v1.4s,v25.4s 1861 ldp x14,x16,[x1,#32] 1862 add v5.4s,v5.4s,v25.4s 1863 add x17,x17,x19,lsl#32 1864 add v9.4s,v9.4s,v25.4s 1865 add x20,x20,x21,lsl#32 1866 add 
v13.4s,v13.4s,v25.4s 1867 ldp x19,x21,[x1,#48] 1868 add v17.4s,v17.4s,v25.4s 1869 add x1,x1,#64 1870 add v21.4s,v21.4s,v25.4s 1871 1872#ifdef __ARMEB__ 1873 rev x5,x5 1874 rev x7,x7 1875 rev x9,x9 1876 rev x11,x11 1877 rev x13,x13 1878 rev x15,x15 1879 rev x17,x17 1880 rev x20,x20 1881#endif 1882 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1883 eor x5,x5,x6 1884 eor x7,x7,x8 1885 eor x9,x9,x10 1886 eor x11,x11,x12 1887 eor x13,x13,x14 1888 eor v0.16b,v0.16b,v24.16b 1889 eor x15,x15,x16 1890 eor v1.16b,v1.16b,v25.16b 1891 eor x17,x17,x19 1892 eor v2.16b,v2.16b,v26.16b 1893 eor x20,x20,x21 1894 eor v3.16b,v3.16b,v27.16b 1895 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1896 1897 stp x5,x7,[x0,#0] // store output 1898 add x28,x28,#7 // increment counter 1899 stp x9,x11,[x0,#16] 1900 stp x13,x15,[x0,#32] 1901 stp x17,x20,[x0,#48] 1902 add x0,x0,#64 1903 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1904 1905 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1906 eor v4.16b,v4.16b,v24.16b 1907 eor v5.16b,v5.16b,v25.16b 1908 eor v6.16b,v6.16b,v26.16b 1909 eor v7.16b,v7.16b,v27.16b 1910 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1911 1912 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1913 eor v8.16b,v8.16b,v0.16b 1914 ldp q24,q25,[sp,#0] 1915 eor v9.16b,v9.16b,v1.16b 1916 ldp q26,q27,[sp,#32] 1917 eor v10.16b,v10.16b,v2.16b 1918 eor v11.16b,v11.16b,v3.16b 1919 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1920 1921 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1922 eor v12.16b,v12.16b,v4.16b 1923 eor v13.16b,v13.16b,v5.16b 1924 eor v14.16b,v14.16b,v6.16b 1925 eor v15.16b,v15.16b,v7.16b 1926 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1927 1928 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1929 eor v16.16b,v16.16b,v8.16b 1930 eor v17.16b,v17.16b,v9.16b 1931 eor v18.16b,v18.16b,v10.16b 1932 eor v19.16b,v19.16b,v11.16b 1933 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1934 1935 shl v0.4s,v31.4s,#1 // 4 -> 8 1936 eor v20.16b,v20.16b,v12.16b 1937 eor v21.16b,v21.16b,v13.16b 1938 
eor v22.16b,v22.16b,v14.16b 1939 eor v23.16b,v23.16b,v15.16b 1940 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1941 1942 add v27.4s,v27.4s,v0.4s // += 8 1943 add v28.4s,v28.4s,v0.4s 1944 add v29.4s,v29.4s,v0.4s 1945 add v30.4s,v30.4s,v0.4s 1946 1947 b.hs Loop_outer_512_neon 1948 1949 adds x2,x2,#512 1950 ushr v0.4s,v31.4s,#2 // 4 -> 1 1951 1952 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1953 ldp d10,d11,[sp,#128+16] 1954 ldp d12,d13,[sp,#128+32] 1955 ldp d14,d15,[sp,#128+48] 1956 1957 stp q24,q31,[sp,#0] // wipe off-load area 1958 stp q24,q31,[sp,#32] 1959 stp q24,q31,[sp,#64] 1960 1961 b.eq Ldone_512_neon 1962 1963 cmp x2,#192 1964 sub v27.4s,v27.4s,v0.4s // -= 1 1965 sub v28.4s,v28.4s,v0.4s 1966 sub v29.4s,v29.4s,v0.4s 1967 add sp,sp,#128 1968 b.hs Loop_outer_neon 1969 1970 eor v25.16b,v25.16b,v25.16b 1971 eor v26.16b,v26.16b,v26.16b 1972 eor v27.16b,v27.16b,v27.16b 1973 eor v28.16b,v28.16b,v28.16b 1974 eor v29.16b,v29.16b,v29.16b 1975 eor v30.16b,v30.16b,v30.16b 1976 b Loop_outer 1977 1978Ldone_512_neon: 1979 ldp x19,x20,[x29,#16] 1980 add sp,sp,#128+64 1981 ldp x21,x22,[x29,#32] 1982 ldp x23,x24,[x29,#48] 1983 ldp x25,x26,[x29,#64] 1984 ldp x27,x28,[x29,#80] 1985 ldp x29,x30,[sp],#96 1986 AARCH64_VALIDATE_LINK_REGISTER 1987 ret 1988 1989#endif // !OPENSSL_NO_ASM 1990