// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Target: AArch64, Mach-O assembler syntax (underscore-prefixed globals,
// @PAGE/@PAGEOFF relocations), run through the C preprocessor.

// Builds under MemorySanitizer force OPENSSL_NO_ASM, which compiles out
// everything guarded by the "#if !defined(OPENSSL_NO_ASM)" below.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>



.section	__TEXT,__const

.align	5
// First 16 bytes of the cipher state. Stored as two 64-bit values whose
// little-endian byte image is the ASCII text "expand 32-byte k".
Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}. It sits directly after Lsigma, so the NEON code reaches
// it by post-incrementing the Lsigma pointer by 16, and uses it to step the
// first 32-bit lane of the counter row.
Lone:
.long	1,0,0,0
// Offset from this location to _OPENSSL_armcap_P (32-bit under __ILP32__,
// 64-bit otherwise). Not referenced by the code visible in this part of the
// file; the entry point below locates the variable with adrp/@PAGE instead.
LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	_OPENSSL_armcap_P-.
#else
.quad	_OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

.globl	_ChaCha20_ctr32
.private_extern	_ChaCha20_ctr32

//----------------------------------------------------------------------
// _ChaCha20_ctr32
//
// XORs the input with a key stream generated 64 bytes at a time and writes
// the result to the output.
//
// Register roles on entry, as used by the code below:
//   x0  output pointer (advanced by 64 per block written)
//   x1  input pointer  (advanced by 64 per block read)
//   x2  length in bytes; a length of zero returns immediately
//   x3  pointer to 32 bytes of key material
//   x4  pointer to 16 bytes of counter block (only read; x4 is then reused
//       as the round-loop counter)
// NOTE(review): presumably this corresponds to a C prototype of the form
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// -- confirm against the C header.
//
// Dispatch: inputs of at least 192 bytes go to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else runs the scalar
// code at Lshort.
//
// Stack: a 96-byte frame holds x29/x30 and x19-x28. A further 64 bytes below
// it is scratch that only the tail path uses (one block of key stream); it
// is zeroed again before returning.
//
// Register allocation in the scalar path:
//   x22,x23          sigma   (state words 0-3, two 32-bit words per register)
//   x24-x27          key     (state words 4-11)
//   x28,x30          counter block (state words 12-15); x28 is incremented
//                    once per block produced
//   w5-w17,w19-w21   the 16 working words of the current block
//                    (x18 is not used)
//----------------------------------------------------------------------
.align	5
_ChaCha20_ctr32:
	cbz	x2,Labort			// nothing to do for zero length
	adrp	x5,_OPENSSL_armcap_P@PAGE
	cmp	x2,#192
	b.lo	Lshort				// short input: stay scalar
	add	x5,x5,_OPENSSL_armcap_P@PAGEOFF
	ldr	w17,[x5]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon			// NEON available: use vector path

Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma@PAGE
	add	x5,x5,Lsigma@PAGEOFF
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte scratch for the tail path

	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef	__ARMEB__
	// Big-endian builds only: swap the two 32-bit halves of each
	// register loaded from caller memory.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One pass of Loop_outer produces one 64-byte block.
Loop_outer:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the double round below
	// The flags set here are consumed by "b.lo Ltail" and "b.hi Loop_outer"
	// further down; no instruction in between writes the flags.
	subs	x2,x2,#64
Loop:
	sub	x4,x4,#1
	// Column step on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20)
	// (w8,w12,w16,w21). The right-rotations by 16/20/24/25 are
	// left-rotations by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal step on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19)
	// (w8,w9,w14,w20), same operations and rotation amounts.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	// Add the input state back in. The odd-numbered words use 64-bit adds
	// with the high half of the packed register; any carry above bit 31 is
	// discarded by the "lsl#32" in the pack step that follows.
	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	Ltail				// fewer than 64 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	// Big-endian builds only: byte-reverse the key stream before the XOR.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6			// key stream XOR input
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	Loop_outer			// more than 64 bytes were left (flags from subs)

	// Epilogue: drop the scratch area and restore callee-saved registers.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
Labort:
	ret

// Tail: 1..63 bytes remain. The key-stream block still sits unpacked in
// w5..w21; it is packed, spilled to the 64-byte stack scratch and applied to
// the input one byte at a time.
.align	4
Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes remaining
// Less_than_64 is also branched to from Ltail_neon in ChaCha20_neon, which
// sets up the same frame and scratch layout.
Less_than_64:
	sub	x0,x0,#1			// bias: the store below indexes with x2 after its increment
	add	x1,x1,x2			// x1 = end of input
	add	x0,x0,x2			// x0 = end of output - 1
	add	x4,sp,x2			// x4 = scratch + remaining
	neg	x2,x2				// x2 counts up from -remaining to 0

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]			// spill key stream to scratch
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

Loop_tail:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// key-stream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	// Wipe the key stream left in the stack scratch.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret



// ChaCha20_neon: vector path, reached by branch from _ChaCha20_ctr32 (length
// of at least 192 bytes and ARMV7_NEON set) with x0-x4 still holding the
// values passed to _ChaCha20_ctr32. No .globl directive is visible for this
// label.
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	// ChaCha20_neon, continued (the label and the initial push of x29/x30
	// immediately precede this point).
	//
	// Each pass of Loop_outer_neon produces 256 bytes: one block in the
	// scalar registers (same allocation as the scalar path: x22-x28,x30
	// hold the packed input state, w5-w17,w19-w21 the working words) and
	// three blocks in NEON registers:
	//   v24         sigma row          v25,v26     key rows
	//   v27,v28,v29 counter rows for the three NEON blocks
	//   v31         counter increment ({1,0,0,0} at first, then shifted to 4)
	//   v0-v3, v4-v7, v16-v19  working rows of the three NEON blocks
	//   v20-v23     temporaries (rotations, then input data)
	// Only v0-v7 and v16-v31 are used here, so v8-v15 are left untouched.
	add	x29,sp,#0

	adrp	x5,Lsigma@PAGE
	add	x5,x5,Lsigma@PAGEOFF
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon		// 512 bytes or more: continue in ChaCha20_512_neon

	sub	sp,sp,#64			// 64-byte scratch for the tail path

	ldp	x22,x23,[x5]			// load sigma
	ld1	{v24.4s},[x5],#16		// x5 now points at Lone
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]			// v31 = {1,0,0,0}
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	// The scalar block uses the counter as loaded (x28); the three NEON
	// blocks use counter+1, counter+2 and counter+3.
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

Loop_outer_neon:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the double round below
	// The flags set here are consumed by "b.lo Ltail_neon" and
	// "b.hi Loop_outer_neon"; nothing in between writes the flags.
	subs	x2,x2,#256
Loop_neon:
	// Scalar and NEON instructions are interleaved; both streams perform
	// the same add/xor/rotate sequence. In the NEON stream, rev32 on .8h
	// rotates each 32-bit lane by 16, and each ushr #(32-n) / sli #n pair
	// rotates left by n (12, 8 and 7).
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate rows 1-3 of each NEON block by 4/8/12 bytes so that the
	// second half of the iteration works on the diagonals.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Undo the earlier row rotation (the 4- and 12-byte amounts swapped).
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,Loop_neon

	// Add the input state back into all four blocks. As in the scalar
	// path, carries out of the 64-bit adds are dropped by the later
	// "lsl#32" pack.
	add	w5,w5,w22			// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	Ltail_neon			// fewer than 256 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Output order within the 256 bytes: scalar block, then v0-v3, v4-v7,
	// v16-v19. v20-v23 and then v0-v3 are reused as input buffers.
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	Loop_outer_neon			// more than 256 bytes were left (flags from subs)

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

// Tail of the NEON path: 1..255 bytes remain and four key-stream blocks have
// been computed. Whole 64-byte blocks are emitted in order; the first unused
// block is then spilled to the stack scratch for the byte-wise loop.
Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes remaining
	cmp	x2,#64
	b.lo	Less_than_64			// under one block: reuse the scalar tail in _ChaCha20_ctr32

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	Ldone_neon			// exactly 64 bytes were left (flags from cmp)
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	Ldone_neon			// exactly 128 bytes were left (flags from cmp)
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	Ldone_neon			// exactly 192 bytes were left (flags from cmp)
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]	// fourth block covers the partial tail
	b	Last_neon

Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]	// second block covers the partial tail
	b	Last_neon
Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]	// third block covers the partial tail
	b	Last_neon

// Byte-wise XOR of the remaining x2 (1..63) bytes against the key stream
// spilled at [sp].
.align	4
Last_neon:
	sub	x0,x0,#1			// bias: the store below indexes with x2 after its increment
	add	x1,x1,x2			// x1 = end of input
	add	x0,x0,x2			// x0 = end of output - 1
	add	x4,sp,x2			// x4 = scratch + remaining
	neg	x2,x2				// x2 counts up from -remaining to 0

Loop_tail_neon:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// key-stream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	// Wipe the key stream left in the stack scratch.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret


// ChaCha20_512_neon: its body continues past this point. ChaCha20_neon
// branches into it at the L512_or_more_neon label for inputs of 512 bytes or
// more, after having set up the same 96-byte frame itself.
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
820 add x29,sp,#0 821 822 adrp x5,Lsigma@PAGE 823 add x5,x5,Lsigma@PAGEOFF 824 stp x19,x20,[sp,#16] 825 stp x21,x22,[sp,#32] 826 stp x23,x24,[sp,#48] 827 stp x25,x26,[sp,#64] 828 stp x27,x28,[sp,#80] 829 830L512_or_more_neon: 831 sub sp,sp,#128+64 832 833 ldp x22,x23,[x5] // load sigma 834 ld1 {v24.4s},[x5],#16 835 ldp x24,x25,[x3] // load key 836 ldp x26,x27,[x3,#16] 837 ld1 {v25.4s,v26.4s},[x3] 838 ldp x28,x30,[x4] // load counter 839 ld1 {v27.4s},[x4] 840 ld1 {v31.4s},[x5] 841#ifdef __ARMEB__ 842 rev64 v24.4s,v24.4s 843 ror x24,x24,#32 844 ror x25,x25,#32 845 ror x26,x26,#32 846 ror x27,x27,#32 847 ror x28,x28,#32 848 ror x30,x30,#32 849#endif 850 add v27.4s,v27.4s,v31.4s // += 1 851 stp q24,q25,[sp,#0] // off-load key block, invariant part 852 add v27.4s,v27.4s,v31.4s // not typo 853 str q26,[sp,#32] 854 add v28.4s,v27.4s,v31.4s 855 add v29.4s,v28.4s,v31.4s 856 add v30.4s,v29.4s,v31.4s 857 shl v31.4s,v31.4s,#2 // 1 -> 4 858 859 stp d8,d9,[sp,#128+0] // meet ABI requirements 860 stp d10,d11,[sp,#128+16] 861 stp d12,d13,[sp,#128+32] 862 stp d14,d15,[sp,#128+48] 863 864 sub x2,x2,#512 // not typo 865 866Loop_outer_512_neon: 867 mov v0.16b,v24.16b 868 mov v4.16b,v24.16b 869 mov v8.16b,v24.16b 870 mov v12.16b,v24.16b 871 mov v16.16b,v24.16b 872 mov v20.16b,v24.16b 873 mov v1.16b,v25.16b 874 mov w5,w22 // unpack key block 875 mov v5.16b,v25.16b 876 lsr x6,x22,#32 877 mov v9.16b,v25.16b 878 mov w7,w23 879 mov v13.16b,v25.16b 880 lsr x8,x23,#32 881 mov v17.16b,v25.16b 882 mov w9,w24 883 mov v21.16b,v25.16b 884 lsr x10,x24,#32 885 mov v3.16b,v27.16b 886 mov w11,w25 887 mov v7.16b,v28.16b 888 lsr x12,x25,#32 889 mov v11.16b,v29.16b 890 mov w13,w26 891 mov v15.16b,v30.16b 892 lsr x14,x26,#32 893 mov v2.16b,v26.16b 894 mov w15,w27 895 mov v6.16b,v26.16b 896 lsr x16,x27,#32 897 add v19.4s,v3.4s,v31.4s // +4 898 mov w17,w28 899 add v23.4s,v7.4s,v31.4s // +4 900 lsr x19,x28,#32 901 mov v10.16b,v26.16b 902 mov w20,w30 903 mov v14.16b,v26.16b 904 lsr x21,x30,#32 905 mov 
v18.16b,v26.16b 906 stp q27,q28,[sp,#48] // off-load key block, variable part 907 mov v22.16b,v26.16b 908 str q29,[sp,#80] 909 910 mov x4,#5 911 subs x2,x2,#512 912Loop_upper_neon: 913 sub x4,x4,#1 914 add v0.4s,v0.4s,v1.4s 915 add w5,w5,w9 916 add v4.4s,v4.4s,v5.4s 917 add w6,w6,w10 918 add v8.4s,v8.4s,v9.4s 919 add w7,w7,w11 920 add v12.4s,v12.4s,v13.4s 921 add w8,w8,w12 922 add v16.4s,v16.4s,v17.4s 923 eor w17,w17,w5 924 add v20.4s,v20.4s,v21.4s 925 eor w19,w19,w6 926 eor v3.16b,v3.16b,v0.16b 927 eor w20,w20,w7 928 eor v7.16b,v7.16b,v4.16b 929 eor w21,w21,w8 930 eor v11.16b,v11.16b,v8.16b 931 ror w17,w17,#16 932 eor v15.16b,v15.16b,v12.16b 933 ror w19,w19,#16 934 eor v19.16b,v19.16b,v16.16b 935 ror w20,w20,#16 936 eor v23.16b,v23.16b,v20.16b 937 ror w21,w21,#16 938 rev32 v3.8h,v3.8h 939 add w13,w13,w17 940 rev32 v7.8h,v7.8h 941 add w14,w14,w19 942 rev32 v11.8h,v11.8h 943 add w15,w15,w20 944 rev32 v15.8h,v15.8h 945 add w16,w16,w21 946 rev32 v19.8h,v19.8h 947 eor w9,w9,w13 948 rev32 v23.8h,v23.8h 949 eor w10,w10,w14 950 add v2.4s,v2.4s,v3.4s 951 eor w11,w11,w15 952 add v6.4s,v6.4s,v7.4s 953 eor w12,w12,w16 954 add v10.4s,v10.4s,v11.4s 955 ror w9,w9,#20 956 add v14.4s,v14.4s,v15.4s 957 ror w10,w10,#20 958 add v18.4s,v18.4s,v19.4s 959 ror w11,w11,#20 960 add v22.4s,v22.4s,v23.4s 961 ror w12,w12,#20 962 eor v24.16b,v1.16b,v2.16b 963 add w5,w5,w9 964 eor v25.16b,v5.16b,v6.16b 965 add w6,w6,w10 966 eor v26.16b,v9.16b,v10.16b 967 add w7,w7,w11 968 eor v27.16b,v13.16b,v14.16b 969 add w8,w8,w12 970 eor v28.16b,v17.16b,v18.16b 971 eor w17,w17,w5 972 eor v29.16b,v21.16b,v22.16b 973 eor w19,w19,w6 974 ushr v1.4s,v24.4s,#20 975 eor w20,w20,w7 976 ushr v5.4s,v25.4s,#20 977 eor w21,w21,w8 978 ushr v9.4s,v26.4s,#20 979 ror w17,w17,#24 980 ushr v13.4s,v27.4s,#20 981 ror w19,w19,#24 982 ushr v17.4s,v28.4s,#20 983 ror w20,w20,#24 984 ushr v21.4s,v29.4s,#20 985 ror w21,w21,#24 986 sli v1.4s,v24.4s,#12 987 add w13,w13,w17 988 sli v5.4s,v25.4s,#12 989 add w14,w14,w19 990 sli 
v9.4s,v26.4s,#12 991 add w15,w15,w20 992 sli v13.4s,v27.4s,#12 993 add w16,w16,w21 994 sli v17.4s,v28.4s,#12 995 eor w9,w9,w13 996 sli v21.4s,v29.4s,#12 997 eor w10,w10,w14 998 add v0.4s,v0.4s,v1.4s 999 eor w11,w11,w15 1000 add v4.4s,v4.4s,v5.4s 1001 eor w12,w12,w16 1002 add v8.4s,v8.4s,v9.4s 1003 ror w9,w9,#25 1004 add v12.4s,v12.4s,v13.4s 1005 ror w10,w10,#25 1006 add v16.4s,v16.4s,v17.4s 1007 ror w11,w11,#25 1008 add v20.4s,v20.4s,v21.4s 1009 ror w12,w12,#25 1010 eor v24.16b,v3.16b,v0.16b 1011 add w5,w5,w10 1012 eor v25.16b,v7.16b,v4.16b 1013 add w6,w6,w11 1014 eor v26.16b,v11.16b,v8.16b 1015 add w7,w7,w12 1016 eor v27.16b,v15.16b,v12.16b 1017 add w8,w8,w9 1018 eor v28.16b,v19.16b,v16.16b 1019 eor w21,w21,w5 1020 eor v29.16b,v23.16b,v20.16b 1021 eor w17,w17,w6 1022 ushr v3.4s,v24.4s,#24 1023 eor w19,w19,w7 1024 ushr v7.4s,v25.4s,#24 1025 eor w20,w20,w8 1026 ushr v11.4s,v26.4s,#24 1027 ror w21,w21,#16 1028 ushr v15.4s,v27.4s,#24 1029 ror w17,w17,#16 1030 ushr v19.4s,v28.4s,#24 1031 ror w19,w19,#16 1032 ushr v23.4s,v29.4s,#24 1033 ror w20,w20,#16 1034 sli v3.4s,v24.4s,#8 1035 add w15,w15,w21 1036 sli v7.4s,v25.4s,#8 1037 add w16,w16,w17 1038 sli v11.4s,v26.4s,#8 1039 add w13,w13,w19 1040 sli v15.4s,v27.4s,#8 1041 add w14,w14,w20 1042 sli v19.4s,v28.4s,#8 1043 eor w10,w10,w15 1044 sli v23.4s,v29.4s,#8 1045 eor w11,w11,w16 1046 add v2.4s,v2.4s,v3.4s 1047 eor w12,w12,w13 1048 add v6.4s,v6.4s,v7.4s 1049 eor w9,w9,w14 1050 add v10.4s,v10.4s,v11.4s 1051 ror w10,w10,#20 1052 add v14.4s,v14.4s,v15.4s 1053 ror w11,w11,#20 1054 add v18.4s,v18.4s,v19.4s 1055 ror w12,w12,#20 1056 add v22.4s,v22.4s,v23.4s 1057 ror w9,w9,#20 1058 eor v24.16b,v1.16b,v2.16b 1059 add w5,w5,w10 1060 eor v25.16b,v5.16b,v6.16b 1061 add w6,w6,w11 1062 eor v26.16b,v9.16b,v10.16b 1063 add w7,w7,w12 1064 eor v27.16b,v13.16b,v14.16b 1065 add w8,w8,w9 1066 eor v28.16b,v17.16b,v18.16b 1067 eor w21,w21,w5 1068 eor v29.16b,v21.16b,v22.16b 1069 eor w17,w17,w6 1070 ushr v1.4s,v24.4s,#25 1071 eor w19,w19,w7 1072 
ushr v5.4s,v25.4s,#25 1073 eor w20,w20,w8 1074 ushr v9.4s,v26.4s,#25 1075 ror w21,w21,#24 1076 ushr v13.4s,v27.4s,#25 1077 ror w17,w17,#24 1078 ushr v17.4s,v28.4s,#25 1079 ror w19,w19,#24 1080 ushr v21.4s,v29.4s,#25 1081 ror w20,w20,#24 1082 sli v1.4s,v24.4s,#7 1083 add w15,w15,w21 1084 sli v5.4s,v25.4s,#7 1085 add w16,w16,w17 1086 sli v9.4s,v26.4s,#7 1087 add w13,w13,w19 1088 sli v13.4s,v27.4s,#7 1089 add w14,w14,w20 1090 sli v17.4s,v28.4s,#7 1091 eor w10,w10,w15 1092 sli v21.4s,v29.4s,#7 1093 eor w11,w11,w16 1094 ext v2.16b,v2.16b,v2.16b,#8 1095 eor w12,w12,w13 1096 ext v6.16b,v6.16b,v6.16b,#8 1097 eor w9,w9,w14 1098 ext v10.16b,v10.16b,v10.16b,#8 1099 ror w10,w10,#25 1100 ext v14.16b,v14.16b,v14.16b,#8 1101 ror w11,w11,#25 1102 ext v18.16b,v18.16b,v18.16b,#8 1103 ror w12,w12,#25 1104 ext v22.16b,v22.16b,v22.16b,#8 1105 ror w9,w9,#25 1106 ext v3.16b,v3.16b,v3.16b,#12 1107 ext v7.16b,v7.16b,v7.16b,#12 1108 ext v11.16b,v11.16b,v11.16b,#12 1109 ext v15.16b,v15.16b,v15.16b,#12 1110 ext v19.16b,v19.16b,v19.16b,#12 1111 ext v23.16b,v23.16b,v23.16b,#12 1112 ext v1.16b,v1.16b,v1.16b,#4 1113 ext v5.16b,v5.16b,v5.16b,#4 1114 ext v9.16b,v9.16b,v9.16b,#4 1115 ext v13.16b,v13.16b,v13.16b,#4 1116 ext v17.16b,v17.16b,v17.16b,#4 1117 ext v21.16b,v21.16b,v21.16b,#4 1118 add v0.4s,v0.4s,v1.4s 1119 add w5,w5,w9 1120 add v4.4s,v4.4s,v5.4s 1121 add w6,w6,w10 1122 add v8.4s,v8.4s,v9.4s 1123 add w7,w7,w11 1124 add v12.4s,v12.4s,v13.4s 1125 add w8,w8,w12 1126 add v16.4s,v16.4s,v17.4s 1127 eor w17,w17,w5 1128 add v20.4s,v20.4s,v21.4s 1129 eor w19,w19,w6 1130 eor v3.16b,v3.16b,v0.16b 1131 eor w20,w20,w7 1132 eor v7.16b,v7.16b,v4.16b 1133 eor w21,w21,w8 1134 eor v11.16b,v11.16b,v8.16b 1135 ror w17,w17,#16 1136 eor v15.16b,v15.16b,v12.16b 1137 ror w19,w19,#16 1138 eor v19.16b,v19.16b,v16.16b 1139 ror w20,w20,#16 1140 eor v23.16b,v23.16b,v20.16b 1141 ror w21,w21,#16 1142 rev32 v3.8h,v3.8h 1143 add w13,w13,w17 1144 rev32 v7.8h,v7.8h 1145 add w14,w14,w19 1146 rev32 v11.8h,v11.8h 1147 add 
w15,w15,w20 1148 rev32 v15.8h,v15.8h 1149 add w16,w16,w21 1150 rev32 v19.8h,v19.8h 1151 eor w9,w9,w13 1152 rev32 v23.8h,v23.8h 1153 eor w10,w10,w14 1154 add v2.4s,v2.4s,v3.4s 1155 eor w11,w11,w15 1156 add v6.4s,v6.4s,v7.4s 1157 eor w12,w12,w16 1158 add v10.4s,v10.4s,v11.4s 1159 ror w9,w9,#20 1160 add v14.4s,v14.4s,v15.4s 1161 ror w10,w10,#20 1162 add v18.4s,v18.4s,v19.4s 1163 ror w11,w11,#20 1164 add v22.4s,v22.4s,v23.4s 1165 ror w12,w12,#20 1166 eor v24.16b,v1.16b,v2.16b 1167 add w5,w5,w9 1168 eor v25.16b,v5.16b,v6.16b 1169 add w6,w6,w10 1170 eor v26.16b,v9.16b,v10.16b 1171 add w7,w7,w11 1172 eor v27.16b,v13.16b,v14.16b 1173 add w8,w8,w12 1174 eor v28.16b,v17.16b,v18.16b 1175 eor w17,w17,w5 1176 eor v29.16b,v21.16b,v22.16b 1177 eor w19,w19,w6 1178 ushr v1.4s,v24.4s,#20 1179 eor w20,w20,w7 1180 ushr v5.4s,v25.4s,#20 1181 eor w21,w21,w8 1182 ushr v9.4s,v26.4s,#20 1183 ror w17,w17,#24 1184 ushr v13.4s,v27.4s,#20 1185 ror w19,w19,#24 1186 ushr v17.4s,v28.4s,#20 1187 ror w20,w20,#24 1188 ushr v21.4s,v29.4s,#20 1189 ror w21,w21,#24 1190 sli v1.4s,v24.4s,#12 1191 add w13,w13,w17 1192 sli v5.4s,v25.4s,#12 1193 add w14,w14,w19 1194 sli v9.4s,v26.4s,#12 1195 add w15,w15,w20 1196 sli v13.4s,v27.4s,#12 1197 add w16,w16,w21 1198 sli v17.4s,v28.4s,#12 1199 eor w9,w9,w13 1200 sli v21.4s,v29.4s,#12 1201 eor w10,w10,w14 1202 add v0.4s,v0.4s,v1.4s 1203 eor w11,w11,w15 1204 add v4.4s,v4.4s,v5.4s 1205 eor w12,w12,w16 1206 add v8.4s,v8.4s,v9.4s 1207 ror w9,w9,#25 1208 add v12.4s,v12.4s,v13.4s 1209 ror w10,w10,#25 1210 add v16.4s,v16.4s,v17.4s 1211 ror w11,w11,#25 1212 add v20.4s,v20.4s,v21.4s 1213 ror w12,w12,#25 1214 eor v24.16b,v3.16b,v0.16b 1215 add w5,w5,w10 1216 eor v25.16b,v7.16b,v4.16b 1217 add w6,w6,w11 1218 eor v26.16b,v11.16b,v8.16b 1219 add w7,w7,w12 1220 eor v27.16b,v15.16b,v12.16b 1221 add w8,w8,w9 1222 eor v28.16b,v19.16b,v16.16b 1223 eor w21,w21,w5 1224 eor v29.16b,v23.16b,v20.16b 1225 eor w17,w17,w6 1226 ushr v3.4s,v24.4s,#24 1227 eor w19,w19,w7 1228 ushr 
v7.4s,v25.4s,#24 1229 eor w20,w20,w8 1230 ushr v11.4s,v26.4s,#24 1231 ror w21,w21,#16 1232 ushr v15.4s,v27.4s,#24 1233 ror w17,w17,#16 1234 ushr v19.4s,v28.4s,#24 1235 ror w19,w19,#16 1236 ushr v23.4s,v29.4s,#24 1237 ror w20,w20,#16 1238 sli v3.4s,v24.4s,#8 1239 add w15,w15,w21 1240 sli v7.4s,v25.4s,#8 1241 add w16,w16,w17 1242 sli v11.4s,v26.4s,#8 1243 add w13,w13,w19 1244 sli v15.4s,v27.4s,#8 1245 add w14,w14,w20 1246 sli v19.4s,v28.4s,#8 1247 eor w10,w10,w15 1248 sli v23.4s,v29.4s,#8 1249 eor w11,w11,w16 1250 add v2.4s,v2.4s,v3.4s 1251 eor w12,w12,w13 1252 add v6.4s,v6.4s,v7.4s 1253 eor w9,w9,w14 1254 add v10.4s,v10.4s,v11.4s 1255 ror w10,w10,#20 1256 add v14.4s,v14.4s,v15.4s 1257 ror w11,w11,#20 1258 add v18.4s,v18.4s,v19.4s 1259 ror w12,w12,#20 1260 add v22.4s,v22.4s,v23.4s 1261 ror w9,w9,#20 1262 eor v24.16b,v1.16b,v2.16b 1263 add w5,w5,w10 1264 eor v25.16b,v5.16b,v6.16b 1265 add w6,w6,w11 1266 eor v26.16b,v9.16b,v10.16b 1267 add w7,w7,w12 1268 eor v27.16b,v13.16b,v14.16b 1269 add w8,w8,w9 1270 eor v28.16b,v17.16b,v18.16b 1271 eor w21,w21,w5 1272 eor v29.16b,v21.16b,v22.16b 1273 eor w17,w17,w6 1274 ushr v1.4s,v24.4s,#25 1275 eor w19,w19,w7 1276 ushr v5.4s,v25.4s,#25 1277 eor w20,w20,w8 1278 ushr v9.4s,v26.4s,#25 1279 ror w21,w21,#24 1280 ushr v13.4s,v27.4s,#25 1281 ror w17,w17,#24 1282 ushr v17.4s,v28.4s,#25 1283 ror w19,w19,#24 1284 ushr v21.4s,v29.4s,#25 1285 ror w20,w20,#24 1286 sli v1.4s,v24.4s,#7 1287 add w15,w15,w21 1288 sli v5.4s,v25.4s,#7 1289 add w16,w16,w17 1290 sli v9.4s,v26.4s,#7 1291 add w13,w13,w19 1292 sli v13.4s,v27.4s,#7 1293 add w14,w14,w20 1294 sli v17.4s,v28.4s,#7 1295 eor w10,w10,w15 1296 sli v21.4s,v29.4s,#7 1297 eor w11,w11,w16 1298 ext v2.16b,v2.16b,v2.16b,#8 1299 eor w12,w12,w13 1300 ext v6.16b,v6.16b,v6.16b,#8 1301 eor w9,w9,w14 1302 ext v10.16b,v10.16b,v10.16b,#8 1303 ror w10,w10,#25 1304 ext v14.16b,v14.16b,v14.16b,#8 1305 ror w11,w11,#25 1306 ext v18.16b,v18.16b,v18.16b,#8 1307 ror w12,w12,#25 1308 ext v22.16b,v22.16b,v22.16b,#8 
1309 ror w9,w9,#25 1310 ext v3.16b,v3.16b,v3.16b,#4 1311 ext v7.16b,v7.16b,v7.16b,#4 1312 ext v11.16b,v11.16b,v11.16b,#4 1313 ext v15.16b,v15.16b,v15.16b,#4 1314 ext v19.16b,v19.16b,v19.16b,#4 1315 ext v23.16b,v23.16b,v23.16b,#4 1316 ext v1.16b,v1.16b,v1.16b,#12 1317 ext v5.16b,v5.16b,v5.16b,#12 1318 ext v9.16b,v9.16b,v9.16b,#12 1319 ext v13.16b,v13.16b,v13.16b,#12 1320 ext v17.16b,v17.16b,v17.16b,#12 1321 ext v21.16b,v21.16b,v21.16b,#12 1322 cbnz x4,Loop_upper_neon 1323 1324 add w5,w5,w22 // accumulate key block 1325 add x6,x6,x22,lsr#32 1326 add w7,w7,w23 1327 add x8,x8,x23,lsr#32 1328 add w9,w9,w24 1329 add x10,x10,x24,lsr#32 1330 add w11,w11,w25 1331 add x12,x12,x25,lsr#32 1332 add w13,w13,w26 1333 add x14,x14,x26,lsr#32 1334 add w15,w15,w27 1335 add x16,x16,x27,lsr#32 1336 add w17,w17,w28 1337 add x19,x19,x28,lsr#32 1338 add w20,w20,w30 1339 add x21,x21,x30,lsr#32 1340 1341 add x5,x5,x6,lsl#32 // pack 1342 add x7,x7,x8,lsl#32 1343 ldp x6,x8,[x1,#0] // load input 1344 add x9,x9,x10,lsl#32 1345 add x11,x11,x12,lsl#32 1346 ldp x10,x12,[x1,#16] 1347 add x13,x13,x14,lsl#32 1348 add x15,x15,x16,lsl#32 1349 ldp x14,x16,[x1,#32] 1350 add x17,x17,x19,lsl#32 1351 add x20,x20,x21,lsl#32 1352 ldp x19,x21,[x1,#48] 1353 add x1,x1,#64 1354#ifdef __ARMEB__ 1355 rev x5,x5 1356 rev x7,x7 1357 rev x9,x9 1358 rev x11,x11 1359 rev x13,x13 1360 rev x15,x15 1361 rev x17,x17 1362 rev x20,x20 1363#endif 1364 eor x5,x5,x6 1365 eor x7,x7,x8 1366 eor x9,x9,x10 1367 eor x11,x11,x12 1368 eor x13,x13,x14 1369 eor x15,x15,x16 1370 eor x17,x17,x19 1371 eor x20,x20,x21 1372 1373 stp x5,x7,[x0,#0] // store output 1374 add x28,x28,#1 // increment counter 1375 mov w5,w22 // unpack key block 1376 lsr x6,x22,#32 1377 stp x9,x11,[x0,#16] 1378 mov w7,w23 1379 lsr x8,x23,#32 1380 stp x13,x15,[x0,#32] 1381 mov w9,w24 1382 lsr x10,x24,#32 1383 stp x17,x20,[x0,#48] 1384 add x0,x0,#64 1385 mov w11,w25 1386 lsr x12,x25,#32 1387 mov w13,w26 1388 lsr x14,x26,#32 1389 mov w15,w27 1390 lsr x16,x27,#32 1391 mov 
w17,w28 1392 lsr x19,x28,#32 1393 mov w20,w30 1394 lsr x21,x30,#32 1395 1396 mov x4,#5 1397Loop_lower_neon: 1398 sub x4,x4,#1 1399 add v0.4s,v0.4s,v1.4s 1400 add w5,w5,w9 1401 add v4.4s,v4.4s,v5.4s 1402 add w6,w6,w10 1403 add v8.4s,v8.4s,v9.4s 1404 add w7,w7,w11 1405 add v12.4s,v12.4s,v13.4s 1406 add w8,w8,w12 1407 add v16.4s,v16.4s,v17.4s 1408 eor w17,w17,w5 1409 add v20.4s,v20.4s,v21.4s 1410 eor w19,w19,w6 1411 eor v3.16b,v3.16b,v0.16b 1412 eor w20,w20,w7 1413 eor v7.16b,v7.16b,v4.16b 1414 eor w21,w21,w8 1415 eor v11.16b,v11.16b,v8.16b 1416 ror w17,w17,#16 1417 eor v15.16b,v15.16b,v12.16b 1418 ror w19,w19,#16 1419 eor v19.16b,v19.16b,v16.16b 1420 ror w20,w20,#16 1421 eor v23.16b,v23.16b,v20.16b 1422 ror w21,w21,#16 1423 rev32 v3.8h,v3.8h 1424 add w13,w13,w17 1425 rev32 v7.8h,v7.8h 1426 add w14,w14,w19 1427 rev32 v11.8h,v11.8h 1428 add w15,w15,w20 1429 rev32 v15.8h,v15.8h 1430 add w16,w16,w21 1431 rev32 v19.8h,v19.8h 1432 eor w9,w9,w13 1433 rev32 v23.8h,v23.8h 1434 eor w10,w10,w14 1435 add v2.4s,v2.4s,v3.4s 1436 eor w11,w11,w15 1437 add v6.4s,v6.4s,v7.4s 1438 eor w12,w12,w16 1439 add v10.4s,v10.4s,v11.4s 1440 ror w9,w9,#20 1441 add v14.4s,v14.4s,v15.4s 1442 ror w10,w10,#20 1443 add v18.4s,v18.4s,v19.4s 1444 ror w11,w11,#20 1445 add v22.4s,v22.4s,v23.4s 1446 ror w12,w12,#20 1447 eor v24.16b,v1.16b,v2.16b 1448 add w5,w5,w9 1449 eor v25.16b,v5.16b,v6.16b 1450 add w6,w6,w10 1451 eor v26.16b,v9.16b,v10.16b 1452 add w7,w7,w11 1453 eor v27.16b,v13.16b,v14.16b 1454 add w8,w8,w12 1455 eor v28.16b,v17.16b,v18.16b 1456 eor w17,w17,w5 1457 eor v29.16b,v21.16b,v22.16b 1458 eor w19,w19,w6 1459 ushr v1.4s,v24.4s,#20 1460 eor w20,w20,w7 1461 ushr v5.4s,v25.4s,#20 1462 eor w21,w21,w8 1463 ushr v9.4s,v26.4s,#20 1464 ror w17,w17,#24 1465 ushr v13.4s,v27.4s,#20 1466 ror w19,w19,#24 1467 ushr v17.4s,v28.4s,#20 1468 ror w20,w20,#24 1469 ushr v21.4s,v29.4s,#20 1470 ror w21,w21,#24 1471 sli v1.4s,v24.4s,#12 1472 add w13,w13,w17 1473 sli v5.4s,v25.4s,#12 1474 add w14,w14,w19 1475 sli 
v9.4s,v26.4s,#12 1476 add w15,w15,w20 1477 sli v13.4s,v27.4s,#12 1478 add w16,w16,w21 1479 sli v17.4s,v28.4s,#12 1480 eor w9,w9,w13 1481 sli v21.4s,v29.4s,#12 1482 eor w10,w10,w14 1483 add v0.4s,v0.4s,v1.4s 1484 eor w11,w11,w15 1485 add v4.4s,v4.4s,v5.4s 1486 eor w12,w12,w16 1487 add v8.4s,v8.4s,v9.4s 1488 ror w9,w9,#25 1489 add v12.4s,v12.4s,v13.4s 1490 ror w10,w10,#25 1491 add v16.4s,v16.4s,v17.4s 1492 ror w11,w11,#25 1493 add v20.4s,v20.4s,v21.4s 1494 ror w12,w12,#25 1495 eor v24.16b,v3.16b,v0.16b 1496 add w5,w5,w10 1497 eor v25.16b,v7.16b,v4.16b 1498 add w6,w6,w11 1499 eor v26.16b,v11.16b,v8.16b 1500 add w7,w7,w12 1501 eor v27.16b,v15.16b,v12.16b 1502 add w8,w8,w9 1503 eor v28.16b,v19.16b,v16.16b 1504 eor w21,w21,w5 1505 eor v29.16b,v23.16b,v20.16b 1506 eor w17,w17,w6 1507 ushr v3.4s,v24.4s,#24 1508 eor w19,w19,w7 1509 ushr v7.4s,v25.4s,#24 1510 eor w20,w20,w8 1511 ushr v11.4s,v26.4s,#24 1512 ror w21,w21,#16 1513 ushr v15.4s,v27.4s,#24 1514 ror w17,w17,#16 1515 ushr v19.4s,v28.4s,#24 1516 ror w19,w19,#16 1517 ushr v23.4s,v29.4s,#24 1518 ror w20,w20,#16 1519 sli v3.4s,v24.4s,#8 1520 add w15,w15,w21 1521 sli v7.4s,v25.4s,#8 1522 add w16,w16,w17 1523 sli v11.4s,v26.4s,#8 1524 add w13,w13,w19 1525 sli v15.4s,v27.4s,#8 1526 add w14,w14,w20 1527 sli v19.4s,v28.4s,#8 1528 eor w10,w10,w15 1529 sli v23.4s,v29.4s,#8 1530 eor w11,w11,w16 1531 add v2.4s,v2.4s,v3.4s 1532 eor w12,w12,w13 1533 add v6.4s,v6.4s,v7.4s 1534 eor w9,w9,w14 1535 add v10.4s,v10.4s,v11.4s 1536 ror w10,w10,#20 1537 add v14.4s,v14.4s,v15.4s 1538 ror w11,w11,#20 1539 add v18.4s,v18.4s,v19.4s 1540 ror w12,w12,#20 1541 add v22.4s,v22.4s,v23.4s 1542 ror w9,w9,#20 1543 eor v24.16b,v1.16b,v2.16b 1544 add w5,w5,w10 1545 eor v25.16b,v5.16b,v6.16b 1546 add w6,w6,w11 1547 eor v26.16b,v9.16b,v10.16b 1548 add w7,w7,w12 1549 eor v27.16b,v13.16b,v14.16b 1550 add w8,w8,w9 1551 eor v28.16b,v17.16b,v18.16b 1552 eor w21,w21,w5 1553 eor v29.16b,v21.16b,v22.16b 1554 eor w17,w17,w6 1555 ushr v1.4s,v24.4s,#25 1556 eor 
w19,w19,w7 1557 ushr v5.4s,v25.4s,#25 1558 eor w20,w20,w8 1559 ushr v9.4s,v26.4s,#25 1560 ror w21,w21,#24 1561 ushr v13.4s,v27.4s,#25 1562 ror w17,w17,#24 1563 ushr v17.4s,v28.4s,#25 1564 ror w19,w19,#24 1565 ushr v21.4s,v29.4s,#25 1566 ror w20,w20,#24 1567 sli v1.4s,v24.4s,#7 1568 add w15,w15,w21 1569 sli v5.4s,v25.4s,#7 1570 add w16,w16,w17 1571 sli v9.4s,v26.4s,#7 1572 add w13,w13,w19 1573 sli v13.4s,v27.4s,#7 1574 add w14,w14,w20 1575 sli v17.4s,v28.4s,#7 1576 eor w10,w10,w15 1577 sli v21.4s,v29.4s,#7 1578 eor w11,w11,w16 1579 ext v2.16b,v2.16b,v2.16b,#8 1580 eor w12,w12,w13 1581 ext v6.16b,v6.16b,v6.16b,#8 1582 eor w9,w9,w14 1583 ext v10.16b,v10.16b,v10.16b,#8 1584 ror w10,w10,#25 1585 ext v14.16b,v14.16b,v14.16b,#8 1586 ror w11,w11,#25 1587 ext v18.16b,v18.16b,v18.16b,#8 1588 ror w12,w12,#25 1589 ext v22.16b,v22.16b,v22.16b,#8 1590 ror w9,w9,#25 1591 ext v3.16b,v3.16b,v3.16b,#12 1592 ext v7.16b,v7.16b,v7.16b,#12 1593 ext v11.16b,v11.16b,v11.16b,#12 1594 ext v15.16b,v15.16b,v15.16b,#12 1595 ext v19.16b,v19.16b,v19.16b,#12 1596 ext v23.16b,v23.16b,v23.16b,#12 1597 ext v1.16b,v1.16b,v1.16b,#4 1598 ext v5.16b,v5.16b,v5.16b,#4 1599 ext v9.16b,v9.16b,v9.16b,#4 1600 ext v13.16b,v13.16b,v13.16b,#4 1601 ext v17.16b,v17.16b,v17.16b,#4 1602 ext v21.16b,v21.16b,v21.16b,#4 1603 add v0.4s,v0.4s,v1.4s 1604 add w5,w5,w9 1605 add v4.4s,v4.4s,v5.4s 1606 add w6,w6,w10 1607 add v8.4s,v8.4s,v9.4s 1608 add w7,w7,w11 1609 add v12.4s,v12.4s,v13.4s 1610 add w8,w8,w12 1611 add v16.4s,v16.4s,v17.4s 1612 eor w17,w17,w5 1613 add v20.4s,v20.4s,v21.4s 1614 eor w19,w19,w6 1615 eor v3.16b,v3.16b,v0.16b 1616 eor w20,w20,w7 1617 eor v7.16b,v7.16b,v4.16b 1618 eor w21,w21,w8 1619 eor v11.16b,v11.16b,v8.16b 1620 ror w17,w17,#16 1621 eor v15.16b,v15.16b,v12.16b 1622 ror w19,w19,#16 1623 eor v19.16b,v19.16b,v16.16b 1624 ror w20,w20,#16 1625 eor v23.16b,v23.16b,v20.16b 1626 ror w21,w21,#16 1627 rev32 v3.8h,v3.8h 1628 add w13,w13,w17 1629 rev32 v7.8h,v7.8h 1630 add w14,w14,w19 1631 rev32 
v11.8h,v11.8h 1632 add w15,w15,w20 1633 rev32 v15.8h,v15.8h 1634 add w16,w16,w21 1635 rev32 v19.8h,v19.8h 1636 eor w9,w9,w13 1637 rev32 v23.8h,v23.8h 1638 eor w10,w10,w14 1639 add v2.4s,v2.4s,v3.4s 1640 eor w11,w11,w15 1641 add v6.4s,v6.4s,v7.4s 1642 eor w12,w12,w16 1643 add v10.4s,v10.4s,v11.4s 1644 ror w9,w9,#20 1645 add v14.4s,v14.4s,v15.4s 1646 ror w10,w10,#20 1647 add v18.4s,v18.4s,v19.4s 1648 ror w11,w11,#20 1649 add v22.4s,v22.4s,v23.4s 1650 ror w12,w12,#20 1651 eor v24.16b,v1.16b,v2.16b 1652 add w5,w5,w9 1653 eor v25.16b,v5.16b,v6.16b 1654 add w6,w6,w10 1655 eor v26.16b,v9.16b,v10.16b 1656 add w7,w7,w11 1657 eor v27.16b,v13.16b,v14.16b 1658 add w8,w8,w12 1659 eor v28.16b,v17.16b,v18.16b 1660 eor w17,w17,w5 1661 eor v29.16b,v21.16b,v22.16b 1662 eor w19,w19,w6 1663 ushr v1.4s,v24.4s,#20 1664 eor w20,w20,w7 1665 ushr v5.4s,v25.4s,#20 1666 eor w21,w21,w8 1667 ushr v9.4s,v26.4s,#20 1668 ror w17,w17,#24 1669 ushr v13.4s,v27.4s,#20 1670 ror w19,w19,#24 1671 ushr v17.4s,v28.4s,#20 1672 ror w20,w20,#24 1673 ushr v21.4s,v29.4s,#20 1674 ror w21,w21,#24 1675 sli v1.4s,v24.4s,#12 1676 add w13,w13,w17 1677 sli v5.4s,v25.4s,#12 1678 add w14,w14,w19 1679 sli v9.4s,v26.4s,#12 1680 add w15,w15,w20 1681 sli v13.4s,v27.4s,#12 1682 add w16,w16,w21 1683 sli v17.4s,v28.4s,#12 1684 eor w9,w9,w13 1685 sli v21.4s,v29.4s,#12 1686 eor w10,w10,w14 1687 add v0.4s,v0.4s,v1.4s 1688 eor w11,w11,w15 1689 add v4.4s,v4.4s,v5.4s 1690 eor w12,w12,w16 1691 add v8.4s,v8.4s,v9.4s 1692 ror w9,w9,#25 1693 add v12.4s,v12.4s,v13.4s 1694 ror w10,w10,#25 1695 add v16.4s,v16.4s,v17.4s 1696 ror w11,w11,#25 1697 add v20.4s,v20.4s,v21.4s 1698 ror w12,w12,#25 1699 eor v24.16b,v3.16b,v0.16b 1700 add w5,w5,w10 1701 eor v25.16b,v7.16b,v4.16b 1702 add w6,w6,w11 1703 eor v26.16b,v11.16b,v8.16b 1704 add w7,w7,w12 1705 eor v27.16b,v15.16b,v12.16b 1706 add w8,w8,w9 1707 eor v28.16b,v19.16b,v16.16b 1708 eor w21,w21,w5 1709 eor v29.16b,v23.16b,v20.16b 1710 eor w17,w17,w6 1711 ushr v3.4s,v24.4s,#24 1712 eor w19,w19,w7 
1713 ushr v7.4s,v25.4s,#24 1714 eor w20,w20,w8 1715 ushr v11.4s,v26.4s,#24 1716 ror w21,w21,#16 1717 ushr v15.4s,v27.4s,#24 1718 ror w17,w17,#16 1719 ushr v19.4s,v28.4s,#24 1720 ror w19,w19,#16 1721 ushr v23.4s,v29.4s,#24 1722 ror w20,w20,#16 1723 sli v3.4s,v24.4s,#8 1724 add w15,w15,w21 1725 sli v7.4s,v25.4s,#8 1726 add w16,w16,w17 1727 sli v11.4s,v26.4s,#8 1728 add w13,w13,w19 1729 sli v15.4s,v27.4s,#8 1730 add w14,w14,w20 1731 sli v19.4s,v28.4s,#8 1732 eor w10,w10,w15 1733 sli v23.4s,v29.4s,#8 1734 eor w11,w11,w16 1735 add v2.4s,v2.4s,v3.4s 1736 eor w12,w12,w13 1737 add v6.4s,v6.4s,v7.4s 1738 eor w9,w9,w14 1739 add v10.4s,v10.4s,v11.4s 1740 ror w10,w10,#20 1741 add v14.4s,v14.4s,v15.4s 1742 ror w11,w11,#20 1743 add v18.4s,v18.4s,v19.4s 1744 ror w12,w12,#20 1745 add v22.4s,v22.4s,v23.4s 1746 ror w9,w9,#20 1747 eor v24.16b,v1.16b,v2.16b 1748 add w5,w5,w10 1749 eor v25.16b,v5.16b,v6.16b 1750 add w6,w6,w11 1751 eor v26.16b,v9.16b,v10.16b 1752 add w7,w7,w12 1753 eor v27.16b,v13.16b,v14.16b 1754 add w8,w8,w9 1755 eor v28.16b,v17.16b,v18.16b 1756 eor w21,w21,w5 1757 eor v29.16b,v21.16b,v22.16b 1758 eor w17,w17,w6 1759 ushr v1.4s,v24.4s,#25 1760 eor w19,w19,w7 1761 ushr v5.4s,v25.4s,#25 1762 eor w20,w20,w8 1763 ushr v9.4s,v26.4s,#25 1764 ror w21,w21,#24 1765 ushr v13.4s,v27.4s,#25 1766 ror w17,w17,#24 1767 ushr v17.4s,v28.4s,#25 1768 ror w19,w19,#24 1769 ushr v21.4s,v29.4s,#25 1770 ror w20,w20,#24 1771 sli v1.4s,v24.4s,#7 1772 add w15,w15,w21 1773 sli v5.4s,v25.4s,#7 1774 add w16,w16,w17 1775 sli v9.4s,v26.4s,#7 1776 add w13,w13,w19 1777 sli v13.4s,v27.4s,#7 1778 add w14,w14,w20 1779 sli v17.4s,v28.4s,#7 1780 eor w10,w10,w15 1781 sli v21.4s,v29.4s,#7 1782 eor w11,w11,w16 1783 ext v2.16b,v2.16b,v2.16b,#8 1784 eor w12,w12,w13 1785 ext v6.16b,v6.16b,v6.16b,#8 1786 eor w9,w9,w14 1787 ext v10.16b,v10.16b,v10.16b,#8 1788 ror w10,w10,#25 1789 ext v14.16b,v14.16b,v14.16b,#8 1790 ror w11,w11,#25 1791 ext v18.16b,v18.16b,v18.16b,#8 1792 ror w12,w12,#25 1793 ext 
v22.16b,v22.16b,v22.16b,#8 1794 ror w9,w9,#25 1795 ext v3.16b,v3.16b,v3.16b,#4 1796 ext v7.16b,v7.16b,v7.16b,#4 1797 ext v11.16b,v11.16b,v11.16b,#4 1798 ext v15.16b,v15.16b,v15.16b,#4 1799 ext v19.16b,v19.16b,v19.16b,#4 1800 ext v23.16b,v23.16b,v23.16b,#4 1801 ext v1.16b,v1.16b,v1.16b,#12 1802 ext v5.16b,v5.16b,v5.16b,#12 1803 ext v9.16b,v9.16b,v9.16b,#12 1804 ext v13.16b,v13.16b,v13.16b,#12 1805 ext v17.16b,v17.16b,v17.16b,#12 1806 ext v21.16b,v21.16b,v21.16b,#12 1807 cbnz x4,Loop_lower_neon 1808 1809 add w5,w5,w22 // accumulate key block 1810 ldp q24,q25,[sp,#0] 1811 add x6,x6,x22,lsr#32 1812 ldp q26,q27,[sp,#32] 1813 add w7,w7,w23 1814 ldp q28,q29,[sp,#64] 1815 add x8,x8,x23,lsr#32 1816 add v0.4s,v0.4s,v24.4s 1817 add w9,w9,w24 1818 add v4.4s,v4.4s,v24.4s 1819 add x10,x10,x24,lsr#32 1820 add v8.4s,v8.4s,v24.4s 1821 add w11,w11,w25 1822 add v12.4s,v12.4s,v24.4s 1823 add x12,x12,x25,lsr#32 1824 add v16.4s,v16.4s,v24.4s 1825 add w13,w13,w26 1826 add v20.4s,v20.4s,v24.4s 1827 add x14,x14,x26,lsr#32 1828 add v2.4s,v2.4s,v26.4s 1829 add w15,w15,w27 1830 add v6.4s,v6.4s,v26.4s 1831 add x16,x16,x27,lsr#32 1832 add v10.4s,v10.4s,v26.4s 1833 add w17,w17,w28 1834 add v14.4s,v14.4s,v26.4s 1835 add x19,x19,x28,lsr#32 1836 add v18.4s,v18.4s,v26.4s 1837 add w20,w20,w30 1838 add v22.4s,v22.4s,v26.4s 1839 add x21,x21,x30,lsr#32 1840 add v19.4s,v19.4s,v31.4s // +4 1841 add x5,x5,x6,lsl#32 // pack 1842 add v23.4s,v23.4s,v31.4s // +4 1843 add x7,x7,x8,lsl#32 1844 add v3.4s,v3.4s,v27.4s 1845 ldp x6,x8,[x1,#0] // load input 1846 add v7.4s,v7.4s,v28.4s 1847 add x9,x9,x10,lsl#32 1848 add v11.4s,v11.4s,v29.4s 1849 add x11,x11,x12,lsl#32 1850 add v15.4s,v15.4s,v30.4s 1851 ldp x10,x12,[x1,#16] 1852 add v19.4s,v19.4s,v27.4s 1853 add x13,x13,x14,lsl#32 1854 add v23.4s,v23.4s,v28.4s 1855 add x15,x15,x16,lsl#32 1856 add v1.4s,v1.4s,v25.4s 1857 ldp x14,x16,[x1,#32] 1858 add v5.4s,v5.4s,v25.4s 1859 add x17,x17,x19,lsl#32 1860 add v9.4s,v9.4s,v25.4s 1861 add x20,x20,x21,lsl#32 1862 add 
v13.4s,v13.4s,v25.4s 1863 ldp x19,x21,[x1,#48] 1864 add v17.4s,v17.4s,v25.4s 1865 add x1,x1,#64 1866 add v21.4s,v21.4s,v25.4s 1867 1868#ifdef __ARMEB__ 1869 rev x5,x5 1870 rev x7,x7 1871 rev x9,x9 1872 rev x11,x11 1873 rev x13,x13 1874 rev x15,x15 1875 rev x17,x17 1876 rev x20,x20 1877#endif 1878 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1879 eor x5,x5,x6 1880 eor x7,x7,x8 1881 eor x9,x9,x10 1882 eor x11,x11,x12 1883 eor x13,x13,x14 1884 eor v0.16b,v0.16b,v24.16b 1885 eor x15,x15,x16 1886 eor v1.16b,v1.16b,v25.16b 1887 eor x17,x17,x19 1888 eor v2.16b,v2.16b,v26.16b 1889 eor x20,x20,x21 1890 eor v3.16b,v3.16b,v27.16b 1891 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1892 1893 stp x5,x7,[x0,#0] // store output 1894 add x28,x28,#7 // increment counter 1895 stp x9,x11,[x0,#16] 1896 stp x13,x15,[x0,#32] 1897 stp x17,x20,[x0,#48] 1898 add x0,x0,#64 1899 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1900 1901 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1902 eor v4.16b,v4.16b,v24.16b 1903 eor v5.16b,v5.16b,v25.16b 1904 eor v6.16b,v6.16b,v26.16b 1905 eor v7.16b,v7.16b,v27.16b 1906 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1907 1908 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1909 eor v8.16b,v8.16b,v0.16b 1910 ldp q24,q25,[sp,#0] 1911 eor v9.16b,v9.16b,v1.16b 1912 ldp q26,q27,[sp,#32] 1913 eor v10.16b,v10.16b,v2.16b 1914 eor v11.16b,v11.16b,v3.16b 1915 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1916 1917 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1918 eor v12.16b,v12.16b,v4.16b 1919 eor v13.16b,v13.16b,v5.16b 1920 eor v14.16b,v14.16b,v6.16b 1921 eor v15.16b,v15.16b,v7.16b 1922 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1923 1924 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1925 eor v16.16b,v16.16b,v8.16b 1926 eor v17.16b,v17.16b,v9.16b 1927 eor v18.16b,v18.16b,v10.16b 1928 eor v19.16b,v19.16b,v11.16b 1929 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1930 1931 shl v0.4s,v31.4s,#1 // 4 -> 8 1932 eor v20.16b,v20.16b,v12.16b 1933 eor v21.16b,v21.16b,v13.16b 1934 
eor v22.16b,v22.16b,v14.16b 1935 eor v23.16b,v23.16b,v15.16b 1936 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1937 1938 add v27.4s,v27.4s,v0.4s // += 8 1939 add v28.4s,v28.4s,v0.4s 1940 add v29.4s,v29.4s,v0.4s 1941 add v30.4s,v30.4s,v0.4s 1942 1943 b.hs Loop_outer_512_neon 1944 1945 adds x2,x2,#512 1946 ushr v0.4s,v31.4s,#2 // 4 -> 1 1947 1948 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1949 ldp d10,d11,[sp,#128+16] 1950 ldp d12,d13,[sp,#128+32] 1951 ldp d14,d15,[sp,#128+48] 1952 1953 stp q24,q31,[sp,#0] // wipe off-load area 1954 stp q24,q31,[sp,#32] 1955 stp q24,q31,[sp,#64] 1956 1957 b.eq Ldone_512_neon 1958 1959 cmp x2,#192 1960 sub v27.4s,v27.4s,v0.4s // -= 1 1961 sub v28.4s,v28.4s,v0.4s 1962 sub v29.4s,v29.4s,v0.4s 1963 add sp,sp,#128 1964 b.hs Loop_outer_neon 1965 1966 eor v25.16b,v25.16b,v25.16b 1967 eor v26.16b,v26.16b,v26.16b 1968 eor v27.16b,v27.16b,v27.16b 1969 eor v28.16b,v28.16b,v28.16b 1970 eor v29.16b,v29.16b,v29.16b 1971 eor v30.16b,v30.16b,v30.16b 1972 b Loop_outer 1973 1974Ldone_512_neon: 1975 ldp x19,x20,[x29,#16] 1976 add sp,sp,#128+64 1977 ldp x21,x22,[x29,#32] 1978 ldp x23,x24,[x29,#48] 1979 ldp x25,x26,[x29,#64] 1980 ldp x27,x28,[x29,#80] 1981 ldp x29,x30,[sp],#96 1982 ret 1983 1984#endif // !OPENSSL_NO_ASM 1985