// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>


.private_extern	_OPENSSL_armcap_P

.section	__TEXT,__const

// Constant data shared by every code path in this file.
//   Lsigma: the 16 bytes "expand 32-byte k", stored as two 64-bit words.
//   Lone:   the 32-bit vector {1,0,0,0}; the NEON paths add it to the
//           counter block to step the first counter word.
// The .byte string is the NUL-terminated ASCII text
//   "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
.align	5
Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
Lone:
.long	1,0,0,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

//-----------------------------------------------------------------------
// _ChaCha20_ctr32: XOR a buffer with the ChaCha20 keystream.
//
// Registers on entry, as read by the code below:
//   x0  output pointer
//   x1  input pointer
//   x2  length in bytes (a zero length returns immediately via Labort)
//   x3  key, 32 bytes, loaded as four 64-bit words
//   x4  counter block, 16 bytes, loaded as two 64-bit words
// NOTE(review): presumably this backs the C prototype
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the declaring header.
//
// Dispatch: a length of 192 bytes or more branches to ChaCha20_neon when
// the ARMV7_NEON bit is set in _OPENSSL_armcap_P. Everything else takes the
// scalar path at Lshort, one 64-byte block per Loop_outer iteration.
//
// Scalar register layout:
//   x22,x23          sigma, two 32-bit words per register
//   x24-x27          key
//   x28,x30          counter block (x28 is stepped by 1 per block)
//   w5-w17,w19-w21   the sixteen 32-bit working state words
//   x4               round counter inside Loop (reused as a pointer in
//                    the tail)
//
// Stack: a 96-byte frame holds x29/x30 and x19-x28. The 64 bytes below it
// are scratch for the keystream of a final partial block, and are zeroed
// after use.
//-----------------------------------------------------------------------
.globl	_ChaCha20_ctr32
.private_extern	_ChaCha20_ctr32

.align	5
_ChaCha20_ctr32:
	AARCH64_VALID_CALL_TARGET
	cbz	x2,Labort			// nothing to do for len == 0
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:_OPENSSL_armcap_P
#else
	adrp	x5,_OPENSSL_armcap_P@PAGE
#endif
	cmp	x2,#192
	b.lo	Lshort				// short inputs stay scalar
	ldr	w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma@PAGE
	add	x5,x5,Lsigma@PAGEOFF
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// scratch for a partial final block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __ARMEB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the double round
	// The flags set here survive until b.lo Ltail / b.hi Loop_outer:
	// nothing in between is a flag-setting instruction.
	subs	x2,x2,#64
Loop:
	// ror by 16/20/24/25 is a left-rotate by 16/12/8/7.
	// First half: (w5..w8, w9..w12, w13..w16, w17/w19/w20/w21) paired
	// position for position.
	sub	x4,x4,#1
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: the same steps with the register groups paired one,
	// two and three positions apart.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	Ltail				// fewer than 64 bytes were left

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	Loop_outer			// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
Labort:
	ret

// Partial final block. Ltail is entered with x2 = remaining - 64, so it
// adds the 64 back. Less_than_64 is also a branch target of ChaCha20_neon,
// which arrives with x2 already holding the remaining byte count and the
// scalar state accumulated but not yet packed.
.align	4
Ltail:
	add	x2,x2,#64
Less_than_64:
	// Point x1, x0 and x4 at the end of input, output and scratch, then
	// walk a negative index up to zero. x0 is biased by -1 because the
	// index is incremented before the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]			// spill keystream block to scratch
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

Loop_tail:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	stp	xzr,xzr,[sp,#0]			// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret



// ChaCha20_neon entry. It is reached by a branch from _ChaCha20_ctr32, so
// x0-x4 still hold that routine's arguments. The rest of the routine
// follows directly below.
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
//-----------------------------------------------------------------------
// ChaCha20_neon, continued. The label, AARCH64_SIGN_LINK_REGISTER and the
// push of x29/x30 sit immediately above this point.
//
// Registers on entry are those of _ChaCha20_ctr32, which branches here:
//   x0 output, x1 input, x2 length in bytes, x3 key, x4 counter block.
// A length of 512 bytes or more jumps on to L512_or_more_neon, a label
// inside ChaCha20_512_neon.
//
// Each Loop_outer_neon iteration produces 256 bytes: one block in the
// scalar registers (same layout as the scalar routine) interleaved with
// three blocks in NEON registers.
//   v24            sigma
//   v25,v26        key
//   v27,v28,v29    counter block + 1, + 2, + 3 for the three NEON blocks
//   v31            {1,0,0,0} from Lone, then shifted left to {4,0,0,0}
//   v0-v3, v4-v7, v16-v19   working state of the three NEON blocks
//   v20-v23        temporaries, also used for input data
// Output order per iteration: scalar block, then v0-v3, v4-v7, v16-v19.
// Only v0-v7 and v16-v31 are written here, so no vector registers are
// saved.
//
// Stack: the 96-byte frame of x29/x30 and x19-x28, plus 64 bytes of
// scratch for the keystream of a final partial block.
//-----------------------------------------------------------------------
	add	x29,sp,#0

	adrp	x5,Lsigma@PAGE
	add	x5,x5,Lsigma@PAGEOFF
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16		// post-increment leaves x5 at Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s	// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2	// 1 -> 4

Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the double round
	// The flags set here survive until b.lo Ltail_neon /
	// b.hi Loop_outer_neon: nothing in between sets flags.
	subs	x2,x2,#256
Loop_neon:
	// Scalar and NEON instructions alternate. NEON left-rotates are
	// rev32 on .8h (by 16) and ushr+sli pairs (by 12, 8, 7). The ext
	// instructions rotate the lanes of rows 1-3 between the two halves
	// and rotate them back at the end.
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,Loop_neon

	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	Ltail_neon			// fewer than 256 bytes were left

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s	// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	Loop_outer_neon			// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Fewer than 256 bytes remain. Whole 64-byte blocks are emitted in
// keystream order (scalar, v0-v3, v4-v7). The keystream for a trailing
// partial block is spilled to the stack scratch area and consumed one
// byte at a time at Last_neon.
Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes remaining
	cmp	x2,#64
	b.lo	Less_than_64			// tail code shared with the scalar routine

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	Ldone_neon			// flags from cmp x2,#64: exactly 64 left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]	// keystream for the partial block
	b	Last_neon

Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	Last_neon
Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	Last_neon

.align	4
Last_neon:
	// Point x1, x0 and x4 at the end of input, output and scratch, then
	// walk a negative index up to zero. x0 is biased by -1 because the
	// index is incremented before the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

Loop_tail_neon:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	stp	xzr,xzr,[sp,#0]			// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// ChaCha20_512_neon entry. The rest of its prologue and body follow
// directly below.
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
827 add x29,sp,#0 828 829 adrp x5,Lsigma@PAGE 830 add x5,x5,Lsigma@PAGEOFF 831 stp x19,x20,[sp,#16] 832 stp x21,x22,[sp,#32] 833 stp x23,x24,[sp,#48] 834 stp x25,x26,[sp,#64] 835 stp x27,x28,[sp,#80] 836 837L512_or_more_neon: 838 sub sp,sp,#128+64 839 840 ldp x22,x23,[x5] // load sigma 841 ld1 {v24.4s},[x5],#16 842 ldp x24,x25,[x3] // load key 843 ldp x26,x27,[x3,#16] 844 ld1 {v25.4s,v26.4s},[x3] 845 ldp x28,x30,[x4] // load counter 846 ld1 {v27.4s},[x4] 847 ld1 {v31.4s},[x5] 848#ifdef __ARMEB__ 849 rev64 v24.4s,v24.4s 850 ror x24,x24,#32 851 ror x25,x25,#32 852 ror x26,x26,#32 853 ror x27,x27,#32 854 ror x28,x28,#32 855 ror x30,x30,#32 856#endif 857 add v27.4s,v27.4s,v31.4s // += 1 858 stp q24,q25,[sp,#0] // off-load key block, invariant part 859 add v27.4s,v27.4s,v31.4s // not typo 860 str q26,[sp,#32] 861 add v28.4s,v27.4s,v31.4s 862 add v29.4s,v28.4s,v31.4s 863 add v30.4s,v29.4s,v31.4s 864 shl v31.4s,v31.4s,#2 // 1 -> 4 865 866 stp d8,d9,[sp,#128+0] // meet ABI requirements 867 stp d10,d11,[sp,#128+16] 868 stp d12,d13,[sp,#128+32] 869 stp d14,d15,[sp,#128+48] 870 871 sub x2,x2,#512 // not typo 872 873Loop_outer_512_neon: 874 mov v0.16b,v24.16b 875 mov v4.16b,v24.16b 876 mov v8.16b,v24.16b 877 mov v12.16b,v24.16b 878 mov v16.16b,v24.16b 879 mov v20.16b,v24.16b 880 mov v1.16b,v25.16b 881 mov w5,w22 // unpack key block 882 mov v5.16b,v25.16b 883 lsr x6,x22,#32 884 mov v9.16b,v25.16b 885 mov w7,w23 886 mov v13.16b,v25.16b 887 lsr x8,x23,#32 888 mov v17.16b,v25.16b 889 mov w9,w24 890 mov v21.16b,v25.16b 891 lsr x10,x24,#32 892 mov v3.16b,v27.16b 893 mov w11,w25 894 mov v7.16b,v28.16b 895 lsr x12,x25,#32 896 mov v11.16b,v29.16b 897 mov w13,w26 898 mov v15.16b,v30.16b 899 lsr x14,x26,#32 900 mov v2.16b,v26.16b 901 mov w15,w27 902 mov v6.16b,v26.16b 903 lsr x16,x27,#32 904 add v19.4s,v3.4s,v31.4s // +4 905 mov w17,w28 906 add v23.4s,v7.4s,v31.4s // +4 907 lsr x19,x28,#32 908 mov v10.16b,v26.16b 909 mov w20,w30 910 mov v14.16b,v26.16b 911 lsr x21,x30,#32 912 mov 
v18.16b,v26.16b 913 stp q27,q28,[sp,#48] // off-load key block, variable part 914 mov v22.16b,v26.16b 915 str q29,[sp,#80] 916 917 mov x4,#5 918 subs x2,x2,#512 919Loop_upper_neon: 920 sub x4,x4,#1 921 add v0.4s,v0.4s,v1.4s 922 add w5,w5,w9 923 add v4.4s,v4.4s,v5.4s 924 add w6,w6,w10 925 add v8.4s,v8.4s,v9.4s 926 add w7,w7,w11 927 add v12.4s,v12.4s,v13.4s 928 add w8,w8,w12 929 add v16.4s,v16.4s,v17.4s 930 eor w17,w17,w5 931 add v20.4s,v20.4s,v21.4s 932 eor w19,w19,w6 933 eor v3.16b,v3.16b,v0.16b 934 eor w20,w20,w7 935 eor v7.16b,v7.16b,v4.16b 936 eor w21,w21,w8 937 eor v11.16b,v11.16b,v8.16b 938 ror w17,w17,#16 939 eor v15.16b,v15.16b,v12.16b 940 ror w19,w19,#16 941 eor v19.16b,v19.16b,v16.16b 942 ror w20,w20,#16 943 eor v23.16b,v23.16b,v20.16b 944 ror w21,w21,#16 945 rev32 v3.8h,v3.8h 946 add w13,w13,w17 947 rev32 v7.8h,v7.8h 948 add w14,w14,w19 949 rev32 v11.8h,v11.8h 950 add w15,w15,w20 951 rev32 v15.8h,v15.8h 952 add w16,w16,w21 953 rev32 v19.8h,v19.8h 954 eor w9,w9,w13 955 rev32 v23.8h,v23.8h 956 eor w10,w10,w14 957 add v2.4s,v2.4s,v3.4s 958 eor w11,w11,w15 959 add v6.4s,v6.4s,v7.4s 960 eor w12,w12,w16 961 add v10.4s,v10.4s,v11.4s 962 ror w9,w9,#20 963 add v14.4s,v14.4s,v15.4s 964 ror w10,w10,#20 965 add v18.4s,v18.4s,v19.4s 966 ror w11,w11,#20 967 add v22.4s,v22.4s,v23.4s 968 ror w12,w12,#20 969 eor v24.16b,v1.16b,v2.16b 970 add w5,w5,w9 971 eor v25.16b,v5.16b,v6.16b 972 add w6,w6,w10 973 eor v26.16b,v9.16b,v10.16b 974 add w7,w7,w11 975 eor v27.16b,v13.16b,v14.16b 976 add w8,w8,w12 977 eor v28.16b,v17.16b,v18.16b 978 eor w17,w17,w5 979 eor v29.16b,v21.16b,v22.16b 980 eor w19,w19,w6 981 ushr v1.4s,v24.4s,#20 982 eor w20,w20,w7 983 ushr v5.4s,v25.4s,#20 984 eor w21,w21,w8 985 ushr v9.4s,v26.4s,#20 986 ror w17,w17,#24 987 ushr v13.4s,v27.4s,#20 988 ror w19,w19,#24 989 ushr v17.4s,v28.4s,#20 990 ror w20,w20,#24 991 ushr v21.4s,v29.4s,#20 992 ror w21,w21,#24 993 sli v1.4s,v24.4s,#12 994 add w13,w13,w17 995 sli v5.4s,v25.4s,#12 996 add w14,w14,w19 997 sli 
v9.4s,v26.4s,#12 998 add w15,w15,w20 999 sli v13.4s,v27.4s,#12 1000 add w16,w16,w21 1001 sli v17.4s,v28.4s,#12 1002 eor w9,w9,w13 1003 sli v21.4s,v29.4s,#12 1004 eor w10,w10,w14 1005 add v0.4s,v0.4s,v1.4s 1006 eor w11,w11,w15 1007 add v4.4s,v4.4s,v5.4s 1008 eor w12,w12,w16 1009 add v8.4s,v8.4s,v9.4s 1010 ror w9,w9,#25 1011 add v12.4s,v12.4s,v13.4s 1012 ror w10,w10,#25 1013 add v16.4s,v16.4s,v17.4s 1014 ror w11,w11,#25 1015 add v20.4s,v20.4s,v21.4s 1016 ror w12,w12,#25 1017 eor v24.16b,v3.16b,v0.16b 1018 add w5,w5,w10 1019 eor v25.16b,v7.16b,v4.16b 1020 add w6,w6,w11 1021 eor v26.16b,v11.16b,v8.16b 1022 add w7,w7,w12 1023 eor v27.16b,v15.16b,v12.16b 1024 add w8,w8,w9 1025 eor v28.16b,v19.16b,v16.16b 1026 eor w21,w21,w5 1027 eor v29.16b,v23.16b,v20.16b 1028 eor w17,w17,w6 1029 ushr v3.4s,v24.4s,#24 1030 eor w19,w19,w7 1031 ushr v7.4s,v25.4s,#24 1032 eor w20,w20,w8 1033 ushr v11.4s,v26.4s,#24 1034 ror w21,w21,#16 1035 ushr v15.4s,v27.4s,#24 1036 ror w17,w17,#16 1037 ushr v19.4s,v28.4s,#24 1038 ror w19,w19,#16 1039 ushr v23.4s,v29.4s,#24 1040 ror w20,w20,#16 1041 sli v3.4s,v24.4s,#8 1042 add w15,w15,w21 1043 sli v7.4s,v25.4s,#8 1044 add w16,w16,w17 1045 sli v11.4s,v26.4s,#8 1046 add w13,w13,w19 1047 sli v15.4s,v27.4s,#8 1048 add w14,w14,w20 1049 sli v19.4s,v28.4s,#8 1050 eor w10,w10,w15 1051 sli v23.4s,v29.4s,#8 1052 eor w11,w11,w16 1053 add v2.4s,v2.4s,v3.4s 1054 eor w12,w12,w13 1055 add v6.4s,v6.4s,v7.4s 1056 eor w9,w9,w14 1057 add v10.4s,v10.4s,v11.4s 1058 ror w10,w10,#20 1059 add v14.4s,v14.4s,v15.4s 1060 ror w11,w11,#20 1061 add v18.4s,v18.4s,v19.4s 1062 ror w12,w12,#20 1063 add v22.4s,v22.4s,v23.4s 1064 ror w9,w9,#20 1065 eor v24.16b,v1.16b,v2.16b 1066 add w5,w5,w10 1067 eor v25.16b,v5.16b,v6.16b 1068 add w6,w6,w11 1069 eor v26.16b,v9.16b,v10.16b 1070 add w7,w7,w12 1071 eor v27.16b,v13.16b,v14.16b 1072 add w8,w8,w9 1073 eor v28.16b,v17.16b,v18.16b 1074 eor w21,w21,w5 1075 eor v29.16b,v21.16b,v22.16b 1076 eor w17,w17,w6 1077 ushr v1.4s,v24.4s,#25 1078 eor 
w19,w19,w7 1079 ushr v5.4s,v25.4s,#25 1080 eor w20,w20,w8 1081 ushr v9.4s,v26.4s,#25 1082 ror w21,w21,#24 1083 ushr v13.4s,v27.4s,#25 1084 ror w17,w17,#24 1085 ushr v17.4s,v28.4s,#25 1086 ror w19,w19,#24 1087 ushr v21.4s,v29.4s,#25 1088 ror w20,w20,#24 1089 sli v1.4s,v24.4s,#7 1090 add w15,w15,w21 1091 sli v5.4s,v25.4s,#7 1092 add w16,w16,w17 1093 sli v9.4s,v26.4s,#7 1094 add w13,w13,w19 1095 sli v13.4s,v27.4s,#7 1096 add w14,w14,w20 1097 sli v17.4s,v28.4s,#7 1098 eor w10,w10,w15 1099 sli v21.4s,v29.4s,#7 1100 eor w11,w11,w16 1101 ext v2.16b,v2.16b,v2.16b,#8 1102 eor w12,w12,w13 1103 ext v6.16b,v6.16b,v6.16b,#8 1104 eor w9,w9,w14 1105 ext v10.16b,v10.16b,v10.16b,#8 1106 ror w10,w10,#25 1107 ext v14.16b,v14.16b,v14.16b,#8 1108 ror w11,w11,#25 1109 ext v18.16b,v18.16b,v18.16b,#8 1110 ror w12,w12,#25 1111 ext v22.16b,v22.16b,v22.16b,#8 1112 ror w9,w9,#25 1113 ext v3.16b,v3.16b,v3.16b,#12 1114 ext v7.16b,v7.16b,v7.16b,#12 1115 ext v11.16b,v11.16b,v11.16b,#12 1116 ext v15.16b,v15.16b,v15.16b,#12 1117 ext v19.16b,v19.16b,v19.16b,#12 1118 ext v23.16b,v23.16b,v23.16b,#12 1119 ext v1.16b,v1.16b,v1.16b,#4 1120 ext v5.16b,v5.16b,v5.16b,#4 1121 ext v9.16b,v9.16b,v9.16b,#4 1122 ext v13.16b,v13.16b,v13.16b,#4 1123 ext v17.16b,v17.16b,v17.16b,#4 1124 ext v21.16b,v21.16b,v21.16b,#4 1125 add v0.4s,v0.4s,v1.4s 1126 add w5,w5,w9 1127 add v4.4s,v4.4s,v5.4s 1128 add w6,w6,w10 1129 add v8.4s,v8.4s,v9.4s 1130 add w7,w7,w11 1131 add v12.4s,v12.4s,v13.4s 1132 add w8,w8,w12 1133 add v16.4s,v16.4s,v17.4s 1134 eor w17,w17,w5 1135 add v20.4s,v20.4s,v21.4s 1136 eor w19,w19,w6 1137 eor v3.16b,v3.16b,v0.16b 1138 eor w20,w20,w7 1139 eor v7.16b,v7.16b,v4.16b 1140 eor w21,w21,w8 1141 eor v11.16b,v11.16b,v8.16b 1142 ror w17,w17,#16 1143 eor v15.16b,v15.16b,v12.16b 1144 ror w19,w19,#16 1145 eor v19.16b,v19.16b,v16.16b 1146 ror w20,w20,#16 1147 eor v23.16b,v23.16b,v20.16b 1148 ror w21,w21,#16 1149 rev32 v3.8h,v3.8h 1150 add w13,w13,w17 1151 rev32 v7.8h,v7.8h 1152 add w14,w14,w19 1153 rev32 
v11.8h,v11.8h 1154 add w15,w15,w20 1155 rev32 v15.8h,v15.8h 1156 add w16,w16,w21 1157 rev32 v19.8h,v19.8h 1158 eor w9,w9,w13 1159 rev32 v23.8h,v23.8h 1160 eor w10,w10,w14 1161 add v2.4s,v2.4s,v3.4s 1162 eor w11,w11,w15 1163 add v6.4s,v6.4s,v7.4s 1164 eor w12,w12,w16 1165 add v10.4s,v10.4s,v11.4s 1166 ror w9,w9,#20 1167 add v14.4s,v14.4s,v15.4s 1168 ror w10,w10,#20 1169 add v18.4s,v18.4s,v19.4s 1170 ror w11,w11,#20 1171 add v22.4s,v22.4s,v23.4s 1172 ror w12,w12,#20 1173 eor v24.16b,v1.16b,v2.16b 1174 add w5,w5,w9 1175 eor v25.16b,v5.16b,v6.16b 1176 add w6,w6,w10 1177 eor v26.16b,v9.16b,v10.16b 1178 add w7,w7,w11 1179 eor v27.16b,v13.16b,v14.16b 1180 add w8,w8,w12 1181 eor v28.16b,v17.16b,v18.16b 1182 eor w17,w17,w5 1183 eor v29.16b,v21.16b,v22.16b 1184 eor w19,w19,w6 1185 ushr v1.4s,v24.4s,#20 1186 eor w20,w20,w7 1187 ushr v5.4s,v25.4s,#20 1188 eor w21,w21,w8 1189 ushr v9.4s,v26.4s,#20 1190 ror w17,w17,#24 1191 ushr v13.4s,v27.4s,#20 1192 ror w19,w19,#24 1193 ushr v17.4s,v28.4s,#20 1194 ror w20,w20,#24 1195 ushr v21.4s,v29.4s,#20 1196 ror w21,w21,#24 1197 sli v1.4s,v24.4s,#12 1198 add w13,w13,w17 1199 sli v5.4s,v25.4s,#12 1200 add w14,w14,w19 1201 sli v9.4s,v26.4s,#12 1202 add w15,w15,w20 1203 sli v13.4s,v27.4s,#12 1204 add w16,w16,w21 1205 sli v17.4s,v28.4s,#12 1206 eor w9,w9,w13 1207 sli v21.4s,v29.4s,#12 1208 eor w10,w10,w14 1209 add v0.4s,v0.4s,v1.4s 1210 eor w11,w11,w15 1211 add v4.4s,v4.4s,v5.4s 1212 eor w12,w12,w16 1213 add v8.4s,v8.4s,v9.4s 1214 ror w9,w9,#25 1215 add v12.4s,v12.4s,v13.4s 1216 ror w10,w10,#25 1217 add v16.4s,v16.4s,v17.4s 1218 ror w11,w11,#25 1219 add v20.4s,v20.4s,v21.4s 1220 ror w12,w12,#25 1221 eor v24.16b,v3.16b,v0.16b 1222 add w5,w5,w10 1223 eor v25.16b,v7.16b,v4.16b 1224 add w6,w6,w11 1225 eor v26.16b,v11.16b,v8.16b 1226 add w7,w7,w12 1227 eor v27.16b,v15.16b,v12.16b 1228 add w8,w8,w9 1229 eor v28.16b,v19.16b,v16.16b 1230 eor w21,w21,w5 1231 eor v29.16b,v23.16b,v20.16b 1232 eor w17,w17,w6 1233 ushr v3.4s,v24.4s,#24 1234 eor w19,w19,w7 
1235 ushr v7.4s,v25.4s,#24 1236 eor w20,w20,w8 1237 ushr v11.4s,v26.4s,#24 1238 ror w21,w21,#16 1239 ushr v15.4s,v27.4s,#24 1240 ror w17,w17,#16 1241 ushr v19.4s,v28.4s,#24 1242 ror w19,w19,#16 1243 ushr v23.4s,v29.4s,#24 1244 ror w20,w20,#16 1245 sli v3.4s,v24.4s,#8 1246 add w15,w15,w21 1247 sli v7.4s,v25.4s,#8 1248 add w16,w16,w17 1249 sli v11.4s,v26.4s,#8 1250 add w13,w13,w19 1251 sli v15.4s,v27.4s,#8 1252 add w14,w14,w20 1253 sli v19.4s,v28.4s,#8 1254 eor w10,w10,w15 1255 sli v23.4s,v29.4s,#8 1256 eor w11,w11,w16 1257 add v2.4s,v2.4s,v3.4s 1258 eor w12,w12,w13 1259 add v6.4s,v6.4s,v7.4s 1260 eor w9,w9,w14 1261 add v10.4s,v10.4s,v11.4s 1262 ror w10,w10,#20 1263 add v14.4s,v14.4s,v15.4s 1264 ror w11,w11,#20 1265 add v18.4s,v18.4s,v19.4s 1266 ror w12,w12,#20 1267 add v22.4s,v22.4s,v23.4s 1268 ror w9,w9,#20 1269 eor v24.16b,v1.16b,v2.16b 1270 add w5,w5,w10 1271 eor v25.16b,v5.16b,v6.16b 1272 add w6,w6,w11 1273 eor v26.16b,v9.16b,v10.16b 1274 add w7,w7,w12 1275 eor v27.16b,v13.16b,v14.16b 1276 add w8,w8,w9 1277 eor v28.16b,v17.16b,v18.16b 1278 eor w21,w21,w5 1279 eor v29.16b,v21.16b,v22.16b 1280 eor w17,w17,w6 1281 ushr v1.4s,v24.4s,#25 1282 eor w19,w19,w7 1283 ushr v5.4s,v25.4s,#25 1284 eor w20,w20,w8 1285 ushr v9.4s,v26.4s,#25 1286 ror w21,w21,#24 1287 ushr v13.4s,v27.4s,#25 1288 ror w17,w17,#24 1289 ushr v17.4s,v28.4s,#25 1290 ror w19,w19,#24 1291 ushr v21.4s,v29.4s,#25 1292 ror w20,w20,#24 1293 sli v1.4s,v24.4s,#7 1294 add w15,w15,w21 1295 sli v5.4s,v25.4s,#7 1296 add w16,w16,w17 1297 sli v9.4s,v26.4s,#7 1298 add w13,w13,w19 1299 sli v13.4s,v27.4s,#7 1300 add w14,w14,w20 1301 sli v17.4s,v28.4s,#7 1302 eor w10,w10,w15 1303 sli v21.4s,v29.4s,#7 1304 eor w11,w11,w16 1305 ext v2.16b,v2.16b,v2.16b,#8 1306 eor w12,w12,w13 1307 ext v6.16b,v6.16b,v6.16b,#8 1308 eor w9,w9,w14 1309 ext v10.16b,v10.16b,v10.16b,#8 1310 ror w10,w10,#25 1311 ext v14.16b,v14.16b,v14.16b,#8 1312 ror w11,w11,#25 1313 ext v18.16b,v18.16b,v18.16b,#8 1314 ror w12,w12,#25 1315 ext 
v22.16b,v22.16b,v22.16b,#8 1316 ror w9,w9,#25 1317 ext v3.16b,v3.16b,v3.16b,#4 1318 ext v7.16b,v7.16b,v7.16b,#4 1319 ext v11.16b,v11.16b,v11.16b,#4 1320 ext v15.16b,v15.16b,v15.16b,#4 1321 ext v19.16b,v19.16b,v19.16b,#4 1322 ext v23.16b,v23.16b,v23.16b,#4 1323 ext v1.16b,v1.16b,v1.16b,#12 1324 ext v5.16b,v5.16b,v5.16b,#12 1325 ext v9.16b,v9.16b,v9.16b,#12 1326 ext v13.16b,v13.16b,v13.16b,#12 1327 ext v17.16b,v17.16b,v17.16b,#12 1328 ext v21.16b,v21.16b,v21.16b,#12 1329 cbnz x4,Loop_upper_neon 1330 1331 add w5,w5,w22 // accumulate key block 1332 add x6,x6,x22,lsr#32 1333 add w7,w7,w23 1334 add x8,x8,x23,lsr#32 1335 add w9,w9,w24 1336 add x10,x10,x24,lsr#32 1337 add w11,w11,w25 1338 add x12,x12,x25,lsr#32 1339 add w13,w13,w26 1340 add x14,x14,x26,lsr#32 1341 add w15,w15,w27 1342 add x16,x16,x27,lsr#32 1343 add w17,w17,w28 1344 add x19,x19,x28,lsr#32 1345 add w20,w20,w30 1346 add x21,x21,x30,lsr#32 1347 1348 add x5,x5,x6,lsl#32 // pack 1349 add x7,x7,x8,lsl#32 1350 ldp x6,x8,[x1,#0] // load input 1351 add x9,x9,x10,lsl#32 1352 add x11,x11,x12,lsl#32 1353 ldp x10,x12,[x1,#16] 1354 add x13,x13,x14,lsl#32 1355 add x15,x15,x16,lsl#32 1356 ldp x14,x16,[x1,#32] 1357 add x17,x17,x19,lsl#32 1358 add x20,x20,x21,lsl#32 1359 ldp x19,x21,[x1,#48] 1360 add x1,x1,#64 1361#ifdef __ARMEB__ 1362 rev x5,x5 1363 rev x7,x7 1364 rev x9,x9 1365 rev x11,x11 1366 rev x13,x13 1367 rev x15,x15 1368 rev x17,x17 1369 rev x20,x20 1370#endif 1371 eor x5,x5,x6 1372 eor x7,x7,x8 1373 eor x9,x9,x10 1374 eor x11,x11,x12 1375 eor x13,x13,x14 1376 eor x15,x15,x16 1377 eor x17,x17,x19 1378 eor x20,x20,x21 1379 1380 stp x5,x7,[x0,#0] // store output 1381 add x28,x28,#1 // increment counter 1382 mov w5,w22 // unpack key block 1383 lsr x6,x22,#32 1384 stp x9,x11,[x0,#16] 1385 mov w7,w23 1386 lsr x8,x23,#32 1387 stp x13,x15,[x0,#32] 1388 mov w9,w24 1389 lsr x10,x24,#32 1390 stp x17,x20,[x0,#48] 1391 add x0,x0,#64 1392 mov w11,w25 1393 lsr x12,x25,#32 1394 mov w13,w26 1395 lsr x14,x26,#32 1396 mov w15,w27 
1397 lsr x16,x27,#32 1398 mov w17,w28 1399 lsr x19,x28,#32 1400 mov w20,w30 1401 lsr x21,x30,#32 1402 1403 mov x4,#5 1404Loop_lower_neon: 1405 sub x4,x4,#1 1406 add v0.4s,v0.4s,v1.4s 1407 add w5,w5,w9 1408 add v4.4s,v4.4s,v5.4s 1409 add w6,w6,w10 1410 add v8.4s,v8.4s,v9.4s 1411 add w7,w7,w11 1412 add v12.4s,v12.4s,v13.4s 1413 add w8,w8,w12 1414 add v16.4s,v16.4s,v17.4s 1415 eor w17,w17,w5 1416 add v20.4s,v20.4s,v21.4s 1417 eor w19,w19,w6 1418 eor v3.16b,v3.16b,v0.16b 1419 eor w20,w20,w7 1420 eor v7.16b,v7.16b,v4.16b 1421 eor w21,w21,w8 1422 eor v11.16b,v11.16b,v8.16b 1423 ror w17,w17,#16 1424 eor v15.16b,v15.16b,v12.16b 1425 ror w19,w19,#16 1426 eor v19.16b,v19.16b,v16.16b 1427 ror w20,w20,#16 1428 eor v23.16b,v23.16b,v20.16b 1429 ror w21,w21,#16 1430 rev32 v3.8h,v3.8h 1431 add w13,w13,w17 1432 rev32 v7.8h,v7.8h 1433 add w14,w14,w19 1434 rev32 v11.8h,v11.8h 1435 add w15,w15,w20 1436 rev32 v15.8h,v15.8h 1437 add w16,w16,w21 1438 rev32 v19.8h,v19.8h 1439 eor w9,w9,w13 1440 rev32 v23.8h,v23.8h 1441 eor w10,w10,w14 1442 add v2.4s,v2.4s,v3.4s 1443 eor w11,w11,w15 1444 add v6.4s,v6.4s,v7.4s 1445 eor w12,w12,w16 1446 add v10.4s,v10.4s,v11.4s 1447 ror w9,w9,#20 1448 add v14.4s,v14.4s,v15.4s 1449 ror w10,w10,#20 1450 add v18.4s,v18.4s,v19.4s 1451 ror w11,w11,#20 1452 add v22.4s,v22.4s,v23.4s 1453 ror w12,w12,#20 1454 eor v24.16b,v1.16b,v2.16b 1455 add w5,w5,w9 1456 eor v25.16b,v5.16b,v6.16b 1457 add w6,w6,w10 1458 eor v26.16b,v9.16b,v10.16b 1459 add w7,w7,w11 1460 eor v27.16b,v13.16b,v14.16b 1461 add w8,w8,w12 1462 eor v28.16b,v17.16b,v18.16b 1463 eor w17,w17,w5 1464 eor v29.16b,v21.16b,v22.16b 1465 eor w19,w19,w6 1466 ushr v1.4s,v24.4s,#20 1467 eor w20,w20,w7 1468 ushr v5.4s,v25.4s,#20 1469 eor w21,w21,w8 1470 ushr v9.4s,v26.4s,#20 1471 ror w17,w17,#24 1472 ushr v13.4s,v27.4s,#20 1473 ror w19,w19,#24 1474 ushr v17.4s,v28.4s,#20 1475 ror w20,w20,#24 1476 ushr v21.4s,v29.4s,#20 1477 ror w21,w21,#24 1478 sli v1.4s,v24.4s,#12 1479 add w13,w13,w17 1480 sli v5.4s,v25.4s,#12 1481 
add w14,w14,w19 1482 sli v9.4s,v26.4s,#12 1483 add w15,w15,w20 1484 sli v13.4s,v27.4s,#12 1485 add w16,w16,w21 1486 sli v17.4s,v28.4s,#12 1487 eor w9,w9,w13 1488 sli v21.4s,v29.4s,#12 1489 eor w10,w10,w14 1490 add v0.4s,v0.4s,v1.4s 1491 eor w11,w11,w15 1492 add v4.4s,v4.4s,v5.4s 1493 eor w12,w12,w16 1494 add v8.4s,v8.4s,v9.4s 1495 ror w9,w9,#25 1496 add v12.4s,v12.4s,v13.4s 1497 ror w10,w10,#25 1498 add v16.4s,v16.4s,v17.4s 1499 ror w11,w11,#25 1500 add v20.4s,v20.4s,v21.4s 1501 ror w12,w12,#25 1502 eor v24.16b,v3.16b,v0.16b 1503 add w5,w5,w10 1504 eor v25.16b,v7.16b,v4.16b 1505 add w6,w6,w11 1506 eor v26.16b,v11.16b,v8.16b 1507 add w7,w7,w12 1508 eor v27.16b,v15.16b,v12.16b 1509 add w8,w8,w9 1510 eor v28.16b,v19.16b,v16.16b 1511 eor w21,w21,w5 1512 eor v29.16b,v23.16b,v20.16b 1513 eor w17,w17,w6 1514 ushr v3.4s,v24.4s,#24 1515 eor w19,w19,w7 1516 ushr v7.4s,v25.4s,#24 1517 eor w20,w20,w8 1518 ushr v11.4s,v26.4s,#24 1519 ror w21,w21,#16 1520 ushr v15.4s,v27.4s,#24 1521 ror w17,w17,#16 1522 ushr v19.4s,v28.4s,#24 1523 ror w19,w19,#16 1524 ushr v23.4s,v29.4s,#24 1525 ror w20,w20,#16 1526 sli v3.4s,v24.4s,#8 1527 add w15,w15,w21 1528 sli v7.4s,v25.4s,#8 1529 add w16,w16,w17 1530 sli v11.4s,v26.4s,#8 1531 add w13,w13,w19 1532 sli v15.4s,v27.4s,#8 1533 add w14,w14,w20 1534 sli v19.4s,v28.4s,#8 1535 eor w10,w10,w15 1536 sli v23.4s,v29.4s,#8 1537 eor w11,w11,w16 1538 add v2.4s,v2.4s,v3.4s 1539 eor w12,w12,w13 1540 add v6.4s,v6.4s,v7.4s 1541 eor w9,w9,w14 1542 add v10.4s,v10.4s,v11.4s 1543 ror w10,w10,#20 1544 add v14.4s,v14.4s,v15.4s 1545 ror w11,w11,#20 1546 add v18.4s,v18.4s,v19.4s 1547 ror w12,w12,#20 1548 add v22.4s,v22.4s,v23.4s 1549 ror w9,w9,#20 1550 eor v24.16b,v1.16b,v2.16b 1551 add w5,w5,w10 1552 eor v25.16b,v5.16b,v6.16b 1553 add w6,w6,w11 1554 eor v26.16b,v9.16b,v10.16b 1555 add w7,w7,w12 1556 eor v27.16b,v13.16b,v14.16b 1557 add w8,w8,w9 1558 eor v28.16b,v17.16b,v18.16b 1559 eor w21,w21,w5 1560 eor v29.16b,v21.16b,v22.16b 1561 eor w17,w17,w6 1562 ushr 
v1.4s,v24.4s,#25 1563 eor w19,w19,w7 1564 ushr v5.4s,v25.4s,#25 1565 eor w20,w20,w8 1566 ushr v9.4s,v26.4s,#25 1567 ror w21,w21,#24 1568 ushr v13.4s,v27.4s,#25 1569 ror w17,w17,#24 1570 ushr v17.4s,v28.4s,#25 1571 ror w19,w19,#24 1572 ushr v21.4s,v29.4s,#25 1573 ror w20,w20,#24 1574 sli v1.4s,v24.4s,#7 1575 add w15,w15,w21 1576 sli v5.4s,v25.4s,#7 1577 add w16,w16,w17 1578 sli v9.4s,v26.4s,#7 1579 add w13,w13,w19 1580 sli v13.4s,v27.4s,#7 1581 add w14,w14,w20 1582 sli v17.4s,v28.4s,#7 1583 eor w10,w10,w15 1584 sli v21.4s,v29.4s,#7 1585 eor w11,w11,w16 1586 ext v2.16b,v2.16b,v2.16b,#8 1587 eor w12,w12,w13 1588 ext v6.16b,v6.16b,v6.16b,#8 1589 eor w9,w9,w14 1590 ext v10.16b,v10.16b,v10.16b,#8 1591 ror w10,w10,#25 1592 ext v14.16b,v14.16b,v14.16b,#8 1593 ror w11,w11,#25 1594 ext v18.16b,v18.16b,v18.16b,#8 1595 ror w12,w12,#25 1596 ext v22.16b,v22.16b,v22.16b,#8 1597 ror w9,w9,#25 1598 ext v3.16b,v3.16b,v3.16b,#12 1599 ext v7.16b,v7.16b,v7.16b,#12 1600 ext v11.16b,v11.16b,v11.16b,#12 1601 ext v15.16b,v15.16b,v15.16b,#12 1602 ext v19.16b,v19.16b,v19.16b,#12 1603 ext v23.16b,v23.16b,v23.16b,#12 1604 ext v1.16b,v1.16b,v1.16b,#4 1605 ext v5.16b,v5.16b,v5.16b,#4 1606 ext v9.16b,v9.16b,v9.16b,#4 1607 ext v13.16b,v13.16b,v13.16b,#4 1608 ext v17.16b,v17.16b,v17.16b,#4 1609 ext v21.16b,v21.16b,v21.16b,#4 1610 add v0.4s,v0.4s,v1.4s 1611 add w5,w5,w9 1612 add v4.4s,v4.4s,v5.4s 1613 add w6,w6,w10 1614 add v8.4s,v8.4s,v9.4s 1615 add w7,w7,w11 1616 add v12.4s,v12.4s,v13.4s 1617 add w8,w8,w12 1618 add v16.4s,v16.4s,v17.4s 1619 eor w17,w17,w5 1620 add v20.4s,v20.4s,v21.4s 1621 eor w19,w19,w6 1622 eor v3.16b,v3.16b,v0.16b 1623 eor w20,w20,w7 1624 eor v7.16b,v7.16b,v4.16b 1625 eor w21,w21,w8 1626 eor v11.16b,v11.16b,v8.16b 1627 ror w17,w17,#16 1628 eor v15.16b,v15.16b,v12.16b 1629 ror w19,w19,#16 1630 eor v19.16b,v19.16b,v16.16b 1631 ror w20,w20,#16 1632 eor v23.16b,v23.16b,v20.16b 1633 ror w21,w21,#16 1634 rev32 v3.8h,v3.8h 1635 add w13,w13,w17 1636 rev32 v7.8h,v7.8h 1637 add 
w14,w14,w19 1638 rev32 v11.8h,v11.8h 1639 add w15,w15,w20 1640 rev32 v15.8h,v15.8h 1641 add w16,w16,w21 1642 rev32 v19.8h,v19.8h 1643 eor w9,w9,w13 1644 rev32 v23.8h,v23.8h 1645 eor w10,w10,w14 1646 add v2.4s,v2.4s,v3.4s 1647 eor w11,w11,w15 1648 add v6.4s,v6.4s,v7.4s 1649 eor w12,w12,w16 1650 add v10.4s,v10.4s,v11.4s 1651 ror w9,w9,#20 1652 add v14.4s,v14.4s,v15.4s 1653 ror w10,w10,#20 1654 add v18.4s,v18.4s,v19.4s 1655 ror w11,w11,#20 1656 add v22.4s,v22.4s,v23.4s 1657 ror w12,w12,#20 1658 eor v24.16b,v1.16b,v2.16b 1659 add w5,w5,w9 1660 eor v25.16b,v5.16b,v6.16b 1661 add w6,w6,w10 1662 eor v26.16b,v9.16b,v10.16b 1663 add w7,w7,w11 1664 eor v27.16b,v13.16b,v14.16b 1665 add w8,w8,w12 1666 eor v28.16b,v17.16b,v18.16b 1667 eor w17,w17,w5 1668 eor v29.16b,v21.16b,v22.16b 1669 eor w19,w19,w6 1670 ushr v1.4s,v24.4s,#20 1671 eor w20,w20,w7 1672 ushr v5.4s,v25.4s,#20 1673 eor w21,w21,w8 1674 ushr v9.4s,v26.4s,#20 1675 ror w17,w17,#24 1676 ushr v13.4s,v27.4s,#20 1677 ror w19,w19,#24 1678 ushr v17.4s,v28.4s,#20 1679 ror w20,w20,#24 1680 ushr v21.4s,v29.4s,#20 1681 ror w21,w21,#24 1682 sli v1.4s,v24.4s,#12 1683 add w13,w13,w17 1684 sli v5.4s,v25.4s,#12 1685 add w14,w14,w19 1686 sli v9.4s,v26.4s,#12 1687 add w15,w15,w20 1688 sli v13.4s,v27.4s,#12 1689 add w16,w16,w21 1690 sli v17.4s,v28.4s,#12 1691 eor w9,w9,w13 1692 sli v21.4s,v29.4s,#12 1693 eor w10,w10,w14 1694 add v0.4s,v0.4s,v1.4s 1695 eor w11,w11,w15 1696 add v4.4s,v4.4s,v5.4s 1697 eor w12,w12,w16 1698 add v8.4s,v8.4s,v9.4s 1699 ror w9,w9,#25 1700 add v12.4s,v12.4s,v13.4s 1701 ror w10,w10,#25 1702 add v16.4s,v16.4s,v17.4s 1703 ror w11,w11,#25 1704 add v20.4s,v20.4s,v21.4s 1705 ror w12,w12,#25 1706 eor v24.16b,v3.16b,v0.16b 1707 add w5,w5,w10 1708 eor v25.16b,v7.16b,v4.16b 1709 add w6,w6,w11 1710 eor v26.16b,v11.16b,v8.16b 1711 add w7,w7,w12 1712 eor v27.16b,v15.16b,v12.16b 1713 add w8,w8,w9 1714 eor v28.16b,v19.16b,v16.16b 1715 eor w21,w21,w5 1716 eor v29.16b,v23.16b,v20.16b 1717 eor w17,w17,w6 1718 ushr 
v3.4s,v24.4s,#24 1719 eor w19,w19,w7 1720 ushr v7.4s,v25.4s,#24 1721 eor w20,w20,w8 1722 ushr v11.4s,v26.4s,#24 1723 ror w21,w21,#16 1724 ushr v15.4s,v27.4s,#24 1725 ror w17,w17,#16 1726 ushr v19.4s,v28.4s,#24 1727 ror w19,w19,#16 1728 ushr v23.4s,v29.4s,#24 1729 ror w20,w20,#16 1730 sli v3.4s,v24.4s,#8 1731 add w15,w15,w21 1732 sli v7.4s,v25.4s,#8 1733 add w16,w16,w17 1734 sli v11.4s,v26.4s,#8 1735 add w13,w13,w19 1736 sli v15.4s,v27.4s,#8 1737 add w14,w14,w20 1738 sli v19.4s,v28.4s,#8 1739 eor w10,w10,w15 1740 sli v23.4s,v29.4s,#8 1741 eor w11,w11,w16 1742 add v2.4s,v2.4s,v3.4s 1743 eor w12,w12,w13 1744 add v6.4s,v6.4s,v7.4s 1745 eor w9,w9,w14 1746 add v10.4s,v10.4s,v11.4s 1747 ror w10,w10,#20 1748 add v14.4s,v14.4s,v15.4s 1749 ror w11,w11,#20 1750 add v18.4s,v18.4s,v19.4s 1751 ror w12,w12,#20 1752 add v22.4s,v22.4s,v23.4s 1753 ror w9,w9,#20 1754 eor v24.16b,v1.16b,v2.16b 1755 add w5,w5,w10 1756 eor v25.16b,v5.16b,v6.16b 1757 add w6,w6,w11 1758 eor v26.16b,v9.16b,v10.16b 1759 add w7,w7,w12 1760 eor v27.16b,v13.16b,v14.16b 1761 add w8,w8,w9 1762 eor v28.16b,v17.16b,v18.16b 1763 eor w21,w21,w5 1764 eor v29.16b,v21.16b,v22.16b 1765 eor w17,w17,w6 1766 ushr v1.4s,v24.4s,#25 1767 eor w19,w19,w7 1768 ushr v5.4s,v25.4s,#25 1769 eor w20,w20,w8 1770 ushr v9.4s,v26.4s,#25 1771 ror w21,w21,#24 1772 ushr v13.4s,v27.4s,#25 1773 ror w17,w17,#24 1774 ushr v17.4s,v28.4s,#25 1775 ror w19,w19,#24 1776 ushr v21.4s,v29.4s,#25 1777 ror w20,w20,#24 1778 sli v1.4s,v24.4s,#7 1779 add w15,w15,w21 1780 sli v5.4s,v25.4s,#7 1781 add w16,w16,w17 1782 sli v9.4s,v26.4s,#7 1783 add w13,w13,w19 1784 sli v13.4s,v27.4s,#7 1785 add w14,w14,w20 1786 sli v17.4s,v28.4s,#7 1787 eor w10,w10,w15 1788 sli v21.4s,v29.4s,#7 1789 eor w11,w11,w16 1790 ext v2.16b,v2.16b,v2.16b,#8 1791 eor w12,w12,w13 1792 ext v6.16b,v6.16b,v6.16b,#8 1793 eor w9,w9,w14 1794 ext v10.16b,v10.16b,v10.16b,#8 1795 ror w10,w10,#25 1796 ext v14.16b,v14.16b,v14.16b,#8 1797 ror w11,w11,#25 1798 ext v18.16b,v18.16b,v18.16b,#8 1799 ror 
w12,w12,#25 1800 ext v22.16b,v22.16b,v22.16b,#8 1801 ror w9,w9,#25 1802 ext v3.16b,v3.16b,v3.16b,#4 1803 ext v7.16b,v7.16b,v7.16b,#4 1804 ext v11.16b,v11.16b,v11.16b,#4 1805 ext v15.16b,v15.16b,v15.16b,#4 1806 ext v19.16b,v19.16b,v19.16b,#4 1807 ext v23.16b,v23.16b,v23.16b,#4 1808 ext v1.16b,v1.16b,v1.16b,#12 1809 ext v5.16b,v5.16b,v5.16b,#12 1810 ext v9.16b,v9.16b,v9.16b,#12 1811 ext v13.16b,v13.16b,v13.16b,#12 1812 ext v17.16b,v17.16b,v17.16b,#12 1813 ext v21.16b,v21.16b,v21.16b,#12 1814 cbnz x4,Loop_lower_neon 1815 1816 add w5,w5,w22 // accumulate key block 1817 ldp q24,q25,[sp,#0] 1818 add x6,x6,x22,lsr#32 1819 ldp q26,q27,[sp,#32] 1820 add w7,w7,w23 1821 ldp q28,q29,[sp,#64] 1822 add x8,x8,x23,lsr#32 1823 add v0.4s,v0.4s,v24.4s 1824 add w9,w9,w24 1825 add v4.4s,v4.4s,v24.4s 1826 add x10,x10,x24,lsr#32 1827 add v8.4s,v8.4s,v24.4s 1828 add w11,w11,w25 1829 add v12.4s,v12.4s,v24.4s 1830 add x12,x12,x25,lsr#32 1831 add v16.4s,v16.4s,v24.4s 1832 add w13,w13,w26 1833 add v20.4s,v20.4s,v24.4s 1834 add x14,x14,x26,lsr#32 1835 add v2.4s,v2.4s,v26.4s 1836 add w15,w15,w27 1837 add v6.4s,v6.4s,v26.4s 1838 add x16,x16,x27,lsr#32 1839 add v10.4s,v10.4s,v26.4s 1840 add w17,w17,w28 1841 add v14.4s,v14.4s,v26.4s 1842 add x19,x19,x28,lsr#32 1843 add v18.4s,v18.4s,v26.4s 1844 add w20,w20,w30 1845 add v22.4s,v22.4s,v26.4s 1846 add x21,x21,x30,lsr#32 1847 add v19.4s,v19.4s,v31.4s // +4 1848 add x5,x5,x6,lsl#32 // pack 1849 add v23.4s,v23.4s,v31.4s // +4 1850 add x7,x7,x8,lsl#32 1851 add v3.4s,v3.4s,v27.4s 1852 ldp x6,x8,[x1,#0] // load input 1853 add v7.4s,v7.4s,v28.4s 1854 add x9,x9,x10,lsl#32 1855 add v11.4s,v11.4s,v29.4s 1856 add x11,x11,x12,lsl#32 1857 add v15.4s,v15.4s,v30.4s 1858 ldp x10,x12,[x1,#16] 1859 add v19.4s,v19.4s,v27.4s 1860 add x13,x13,x14,lsl#32 1861 add v23.4s,v23.4s,v28.4s 1862 add x15,x15,x16,lsl#32 1863 add v1.4s,v1.4s,v25.4s 1864 ldp x14,x16,[x1,#32] 1865 add v5.4s,v5.4s,v25.4s 1866 add x17,x17,x19,lsl#32 1867 add v9.4s,v9.4s,v25.4s 1868 add 
x20,x20,x21,lsl#32 1869 add v13.4s,v13.4s,v25.4s 1870 ldp x19,x21,[x1,#48] 1871 add v17.4s,v17.4s,v25.4s 1872 add x1,x1,#64 1873 add v21.4s,v21.4s,v25.4s 1874 1875#ifdef __ARMEB__ 1876 rev x5,x5 1877 rev x7,x7 1878 rev x9,x9 1879 rev x11,x11 1880 rev x13,x13 1881 rev x15,x15 1882 rev x17,x17 1883 rev x20,x20 1884#endif 1885 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1886 eor x5,x5,x6 1887 eor x7,x7,x8 1888 eor x9,x9,x10 1889 eor x11,x11,x12 1890 eor x13,x13,x14 1891 eor v0.16b,v0.16b,v24.16b 1892 eor x15,x15,x16 1893 eor v1.16b,v1.16b,v25.16b 1894 eor x17,x17,x19 1895 eor v2.16b,v2.16b,v26.16b 1896 eor x20,x20,x21 1897 eor v3.16b,v3.16b,v27.16b 1898 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1899 1900 stp x5,x7,[x0,#0] // store output 1901 add x28,x28,#7 // increment counter 1902 stp x9,x11,[x0,#16] 1903 stp x13,x15,[x0,#32] 1904 stp x17,x20,[x0,#48] 1905 add x0,x0,#64 1906 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1907 1908 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1909 eor v4.16b,v4.16b,v24.16b 1910 eor v5.16b,v5.16b,v25.16b 1911 eor v6.16b,v6.16b,v26.16b 1912 eor v7.16b,v7.16b,v27.16b 1913 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1914 1915 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1916 eor v8.16b,v8.16b,v0.16b 1917 ldp q24,q25,[sp,#0] 1918 eor v9.16b,v9.16b,v1.16b 1919 ldp q26,q27,[sp,#32] 1920 eor v10.16b,v10.16b,v2.16b 1921 eor v11.16b,v11.16b,v3.16b 1922 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1923 1924 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1925 eor v12.16b,v12.16b,v4.16b 1926 eor v13.16b,v13.16b,v5.16b 1927 eor v14.16b,v14.16b,v6.16b 1928 eor v15.16b,v15.16b,v7.16b 1929 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1930 1931 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1932 eor v16.16b,v16.16b,v8.16b 1933 eor v17.16b,v17.16b,v9.16b 1934 eor v18.16b,v18.16b,v10.16b 1935 eor v19.16b,v19.16b,v11.16b 1936 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1937 1938 shl v0.4s,v31.4s,#1 // 4 -> 8 1939 eor v20.16b,v20.16b,v12.16b 1940 eor 
v21.16b,v21.16b,v13.16b 1941 eor v22.16b,v22.16b,v14.16b 1942 eor v23.16b,v23.16b,v15.16b 1943 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1944 1945 add v27.4s,v27.4s,v0.4s // += 8 1946 add v28.4s,v28.4s,v0.4s 1947 add v29.4s,v29.4s,v0.4s 1948 add v30.4s,v30.4s,v0.4s 1949 1950 b.hs Loop_outer_512_neon 1951 1952 adds x2,x2,#512 1953 ushr v0.4s,v31.4s,#2 // 4 -> 1 1954 1955 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1956 ldp d10,d11,[sp,#128+16] 1957 ldp d12,d13,[sp,#128+32] 1958 ldp d14,d15,[sp,#128+48] 1959 1960 stp q24,q31,[sp,#0] // wipe off-load area 1961 stp q24,q31,[sp,#32] 1962 stp q24,q31,[sp,#64] 1963 1964 b.eq Ldone_512_neon 1965 1966 cmp x2,#192 1967 sub v27.4s,v27.4s,v0.4s // -= 1 1968 sub v28.4s,v28.4s,v0.4s 1969 sub v29.4s,v29.4s,v0.4s 1970 add sp,sp,#128 1971 b.hs Loop_outer_neon 1972 1973 eor v25.16b,v25.16b,v25.16b 1974 eor v26.16b,v26.16b,v26.16b 1975 eor v27.16b,v27.16b,v27.16b 1976 eor v28.16b,v28.16b,v28.16b 1977 eor v29.16b,v29.16b,v29.16b 1978 eor v30.16b,v30.16b,v30.16b 1979 b Loop_outer 1980 1981Ldone_512_neon: 1982 ldp x19,x20,[x29,#16] 1983 add sp,sp,#128+64 1984 ldp x21,x22,[x29,#32] 1985 ldp x23,x24,[x29,#48] 1986 ldp x25,x26,[x29,#64] 1987 ldp x27,x28,[x29,#80] 1988 ldp x29,x30,[sp],#96 1989 AARCH64_VALIDATE_LINK_REGISTER 1990 ret 1991 1992#endif // !OPENSSL_NO_ASM 1993