#if defined(__aarch64__)
#include <openssl/arm_arch.h>

.text



// Constant pool. It sits in .text so that the code can reach it with adr.
.align	5
// The 16 bytes "expand 32-byte k" stored as two little-endian 64-bit words;
// they form the first row of the cipher state.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}. It directly follows .Lsigma, so the NEON code reaches it
// through the post-incremented .Lsigma pointer and uses it to step the
// block counter lane.
.Lone:
.long	1,0,0,0
// Offset from this location to OPENSSL_armcap_P. ChaCha20_ctr32 adds the
// address of this word to the stored offset to locate the capability word.
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

//----------------------------------------------------------------------
// ChaCha20_ctr32
//
// XORs the input with ChaCha20 key stream and writes the result out.
//
// Inputs, as consumed by the code below:
//   x0 = output pointer
//   x1 = input pointer
//   x2 = length in bytes (the function returns at once when it is zero)
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to the 16-byte counter block
//
// Dispatch: when the length is at least 192 bytes and the ARMV7_NEON bit
// is set in OPENSSL_armcap_P, control transfers to ChaCha20_neon.
// Otherwise the scalar code at .Lshort produces one 64-byte block per
// pass of .Loop_outer.
//
// Scalar register roles:
//   x22,x23      sigma, two 32-bit words per register
//   x24-x27      key, two 32-bit words per register
//   x28,x30      counter block, two 32-bit words per register
//   w5-w17,
//   w19-w21      the sixteen working words of the current block
//   x4           round counter inside .Loop
//
// Stack: a 96-byte frame holding x29, x30 and x19-x28, plus 64 bytes of
// scratch below it that the tail code uses for one key stream block.
// x30 is used as a data register and is reloaded in every epilogue.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.hidden	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort			// nothing to do for a zero length
	adr	x5,.LOPENSSL_armcap_P
	cmp	x2,#192
	b.lo	.Lshort				// short inputs always take the scalar path
#ifdef	__ILP32__
	ldrsw	x6,[x5]
#else
	ldr	x6,[x5]
#endif
	ldr	w17,[x6,x5]			// offset + own address = OPENSSL_armcap_P
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0			// x29 anchors the save area for the epilogues

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte scratch block used by the tail code

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__ARMEB__
	// Big-endian build: swap the two 32-bit halves of every loaded pair.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	// Split the packed state into sixteen 32-bit working words.
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// ten passes of .Loop
	// The flags set here are consumed by b.lo .Ltail and b.hi .Loop_outer
	// further down; nothing in between writes the condition flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// First half: four interleaved quarter-rounds on the word groups
	// (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21).
	// The rotates are written as ror by 16/20/24/25.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: the same sequence on the word groups
	// (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the original state back in. The odd words use a 64-bit add; any
	// carry above bit 31 is shifted out by the lsl#32 in the pack step.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left before the subs

	// Full block: pack word pairs into 64-bit registers while loading the
	// 64 input bytes into the registers that have just been freed.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// NOTE(review): this is a 64-bit add on a register holding two state
	// words, so a wrap of the low word carries into the upper one --
	// confirm that callers never depend on 32-bit counter wraparound.
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// flags from the subs: more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left (below 64)
	// Also entered from ChaCha20_neon with x2 below 64. Both functions use
	// the same frame layout, so the epilogue below serves either caller.
.Less_than_64:
	// Point x1, x0 and x4 at the end of input, output and scratch block,
	// then walk a negative index up to zero. x0 is biased by one because
	// the index is incremented before the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Park the key stream block in the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	// One byte per pass: output = input xor key stream.
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the key stream block left on the stack with zeros.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------
// ChaCha20_neon
//
// Not declared .globl; the visible code reaches it by a branch from
// ChaCha20_ctr32, with x0-x4 holding the same values as on entry there.
//
// Lengths of 512 bytes or more branch on to .L512_or_more_neon after the
// frame has been set up. Otherwise each pass of .Loop_outer_neon makes
// 256 bytes of key stream: one block in the scalar registers (same roles
// as in ChaCha20_ctr32) and three blocks in NEON registers.
//
// NEON register roles:
//   v24          sigma row
//   v25,v26      key rows
//   v27,v28,v29  counter rows of the three NEON blocks (scalar counter
//                plus 1, 2 and 3)
//   v31          counter step: {1,0,0,0} at first, then shifted to 4
//   v0-v3, v4-v7, v16-v19   working rows of the three NEON blocks
//   v20-v23      temporaries for rotates and input data
//
// Stack: the same 96-byte frame plus 64-byte scratch as ChaCha20_ctr32.
//----------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16	// the post-increment leaves x5 at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

.Loop_outer_neon:
	// Initialise the scalar block and the three NEON blocks from the
	// saved state; scalar and vector instructions are interleaved.
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// ten passes of .Loop_neon
	// As in the scalar code, these flags are read only after the round
	// loop, by b.lo .Ltail_neon and b.hi .Loop_outer_neon.
	subs	x2,x2,#256
.Loop_neon:
	// First half. Vector rotates: rev32 on halfwords rotates each 32-bit
	// lane by 16; the ushr+sli pairs rotate left by 12, 8 and 7.
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the lanes of the second, third and fourth row of each NEON
	// block by 4, 8 and 12 bytes ahead of the second half.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// Second half: same operations; the scalar side switches to the word
	// groups (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20).
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Undo the lane rotation applied before the second half (the 4 and
	// 12 byte amounts are swapped, the 8 byte one repeats).
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the original state to all four blocks; each NEON block gets
	// its own counter row (v27, v28, v29).
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon			// fewer than 256 bytes were left

	// Full 256 bytes: the scalar block covers bytes 0..63 and the NEON
	// blocks v0-v3, v4-v7, v16-v19 cover the following three 64-byte
	// chunks.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input data

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon		// flags from the subs: more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes left (below 256)
	cmp	x2,#64
	b.lo	.Less_than_64			// shared byte-wise tail inside ChaCha20_ctr32

	// At least 64 bytes left: emit the scalar block first.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	// Each b.eq below still sees the flags of the preceding cmp x2,#64;
	// the stores, eor and add in between leave them untouched.
	b.eq	.Ldone_neon			// exactly 64 bytes were left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// A partial chunk remains: park the next unused key stream block in
	// the stack scratch area for the byte-wise loop.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	// Same end-pointer and negative-index setup as .Less_than_64.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the key stream block left on the stack with zeros.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
808 add x29,sp,#0 809 810 adr x5,.Lsigma 811 stp x19,x20,[sp,#16] 812 stp x21,x22,[sp,#32] 813 stp x23,x24,[sp,#48] 814 stp x25,x26,[sp,#64] 815 stp x27,x28,[sp,#80] 816 817.L512_or_more_neon: 818 sub sp,sp,#128+64 819 820 ldp x22,x23,[x5] // load sigma 821 ld1 {v24.4s},[x5],#16 822 ldp x24,x25,[x3] // load key 823 ldp x26,x27,[x3,#16] 824 ld1 {v25.4s,v26.4s},[x3] 825 ldp x28,x30,[x4] // load counter 826 ld1 {v27.4s},[x4] 827 ld1 {v31.4s},[x5] 828#ifdef __ARMEB__ 829 rev64 v24.4s,v24.4s 830 ror x24,x24,#32 831 ror x25,x25,#32 832 ror x26,x26,#32 833 ror x27,x27,#32 834 ror x28,x28,#32 835 ror x30,x30,#32 836#endif 837 add v27.4s,v27.4s,v31.4s // += 1 838 stp q24,q25,[sp,#0] // off-load key block, invariant part 839 add v27.4s,v27.4s,v31.4s // not typo 840 str q26,[sp,#32] 841 add v28.4s,v27.4s,v31.4s 842 add v29.4s,v28.4s,v31.4s 843 add v30.4s,v29.4s,v31.4s 844 shl v31.4s,v31.4s,#2 // 1 -> 4 845 846 stp d8,d9,[sp,#128+0] // meet ABI requirements 847 stp d10,d11,[sp,#128+16] 848 stp d12,d13,[sp,#128+32] 849 stp d14,d15,[sp,#128+48] 850 851 sub x2,x2,#512 // not typo 852 853.Loop_outer_512_neon: 854 mov v0.16b,v24.16b 855 mov v4.16b,v24.16b 856 mov v8.16b,v24.16b 857 mov v12.16b,v24.16b 858 mov v16.16b,v24.16b 859 mov v20.16b,v24.16b 860 mov v1.16b,v25.16b 861 mov w5,w22 // unpack key block 862 mov v5.16b,v25.16b 863 lsr x6,x22,#32 864 mov v9.16b,v25.16b 865 mov w7,w23 866 mov v13.16b,v25.16b 867 lsr x8,x23,#32 868 mov v17.16b,v25.16b 869 mov w9,w24 870 mov v21.16b,v25.16b 871 lsr x10,x24,#32 872 mov v3.16b,v27.16b 873 mov w11,w25 874 mov v7.16b,v28.16b 875 lsr x12,x25,#32 876 mov v11.16b,v29.16b 877 mov w13,w26 878 mov v15.16b,v30.16b 879 lsr x14,x26,#32 880 mov v2.16b,v26.16b 881 mov w15,w27 882 mov v6.16b,v26.16b 883 lsr x16,x27,#32 884 add v19.4s,v3.4s,v31.4s // +4 885 mov w17,w28 886 add v23.4s,v7.4s,v31.4s // +4 887 lsr x19,x28,#32 888 mov v10.16b,v26.16b 889 mov w20,w30 890 mov v14.16b,v26.16b 891 lsr x21,x30,#32 892 mov v18.16b,v26.16b 893 stp 
q27,q28,[sp,#48] // off-load key block, variable part 894 mov v22.16b,v26.16b 895 str q29,[sp,#80] 896 897 mov x4,#5 898 subs x2,x2,#512 899.Loop_upper_neon: 900 sub x4,x4,#1 901 add v0.4s,v0.4s,v1.4s 902 add w5,w5,w9 903 add v4.4s,v4.4s,v5.4s 904 add w6,w6,w10 905 add v8.4s,v8.4s,v9.4s 906 add w7,w7,w11 907 add v12.4s,v12.4s,v13.4s 908 add w8,w8,w12 909 add v16.4s,v16.4s,v17.4s 910 eor w17,w17,w5 911 add v20.4s,v20.4s,v21.4s 912 eor w19,w19,w6 913 eor v3.16b,v3.16b,v0.16b 914 eor w20,w20,w7 915 eor v7.16b,v7.16b,v4.16b 916 eor w21,w21,w8 917 eor v11.16b,v11.16b,v8.16b 918 ror w17,w17,#16 919 eor v15.16b,v15.16b,v12.16b 920 ror w19,w19,#16 921 eor v19.16b,v19.16b,v16.16b 922 ror w20,w20,#16 923 eor v23.16b,v23.16b,v20.16b 924 ror w21,w21,#16 925 rev32 v3.8h,v3.8h 926 add w13,w13,w17 927 rev32 v7.8h,v7.8h 928 add w14,w14,w19 929 rev32 v11.8h,v11.8h 930 add w15,w15,w20 931 rev32 v15.8h,v15.8h 932 add w16,w16,w21 933 rev32 v19.8h,v19.8h 934 eor w9,w9,w13 935 rev32 v23.8h,v23.8h 936 eor w10,w10,w14 937 add v2.4s,v2.4s,v3.4s 938 eor w11,w11,w15 939 add v6.4s,v6.4s,v7.4s 940 eor w12,w12,w16 941 add v10.4s,v10.4s,v11.4s 942 ror w9,w9,#20 943 add v14.4s,v14.4s,v15.4s 944 ror w10,w10,#20 945 add v18.4s,v18.4s,v19.4s 946 ror w11,w11,#20 947 add v22.4s,v22.4s,v23.4s 948 ror w12,w12,#20 949 eor v24.16b,v1.16b,v2.16b 950 add w5,w5,w9 951 eor v25.16b,v5.16b,v6.16b 952 add w6,w6,w10 953 eor v26.16b,v9.16b,v10.16b 954 add w7,w7,w11 955 eor v27.16b,v13.16b,v14.16b 956 add w8,w8,w12 957 eor v28.16b,v17.16b,v18.16b 958 eor w17,w17,w5 959 eor v29.16b,v21.16b,v22.16b 960 eor w19,w19,w6 961 ushr v1.4s,v24.4s,#20 962 eor w20,w20,w7 963 ushr v5.4s,v25.4s,#20 964 eor w21,w21,w8 965 ushr v9.4s,v26.4s,#20 966 ror w17,w17,#24 967 ushr v13.4s,v27.4s,#20 968 ror w19,w19,#24 969 ushr v17.4s,v28.4s,#20 970 ror w20,w20,#24 971 ushr v21.4s,v29.4s,#20 972 ror w21,w21,#24 973 sli v1.4s,v24.4s,#12 974 add w13,w13,w17 975 sli v5.4s,v25.4s,#12 976 add w14,w14,w19 977 sli v9.4s,v26.4s,#12 978 add 
w15,w15,w20 979 sli v13.4s,v27.4s,#12 980 add w16,w16,w21 981 sli v17.4s,v28.4s,#12 982 eor w9,w9,w13 983 sli v21.4s,v29.4s,#12 984 eor w10,w10,w14 985 add v0.4s,v0.4s,v1.4s 986 eor w11,w11,w15 987 add v4.4s,v4.4s,v5.4s 988 eor w12,w12,w16 989 add v8.4s,v8.4s,v9.4s 990 ror w9,w9,#25 991 add v12.4s,v12.4s,v13.4s 992 ror w10,w10,#25 993 add v16.4s,v16.4s,v17.4s 994 ror w11,w11,#25 995 add v20.4s,v20.4s,v21.4s 996 ror w12,w12,#25 997 eor v24.16b,v3.16b,v0.16b 998 add w5,w5,w10 999 eor v25.16b,v7.16b,v4.16b 1000 add w6,w6,w11 1001 eor v26.16b,v11.16b,v8.16b 1002 add w7,w7,w12 1003 eor v27.16b,v15.16b,v12.16b 1004 add w8,w8,w9 1005 eor v28.16b,v19.16b,v16.16b 1006 eor w21,w21,w5 1007 eor v29.16b,v23.16b,v20.16b 1008 eor w17,w17,w6 1009 ushr v3.4s,v24.4s,#24 1010 eor w19,w19,w7 1011 ushr v7.4s,v25.4s,#24 1012 eor w20,w20,w8 1013 ushr v11.4s,v26.4s,#24 1014 ror w21,w21,#16 1015 ushr v15.4s,v27.4s,#24 1016 ror w17,w17,#16 1017 ushr v19.4s,v28.4s,#24 1018 ror w19,w19,#16 1019 ushr v23.4s,v29.4s,#24 1020 ror w20,w20,#16 1021 sli v3.4s,v24.4s,#8 1022 add w15,w15,w21 1023 sli v7.4s,v25.4s,#8 1024 add w16,w16,w17 1025 sli v11.4s,v26.4s,#8 1026 add w13,w13,w19 1027 sli v15.4s,v27.4s,#8 1028 add w14,w14,w20 1029 sli v19.4s,v28.4s,#8 1030 eor w10,w10,w15 1031 sli v23.4s,v29.4s,#8 1032 eor w11,w11,w16 1033 add v2.4s,v2.4s,v3.4s 1034 eor w12,w12,w13 1035 add v6.4s,v6.4s,v7.4s 1036 eor w9,w9,w14 1037 add v10.4s,v10.4s,v11.4s 1038 ror w10,w10,#20 1039 add v14.4s,v14.4s,v15.4s 1040 ror w11,w11,#20 1041 add v18.4s,v18.4s,v19.4s 1042 ror w12,w12,#20 1043 add v22.4s,v22.4s,v23.4s 1044 ror w9,w9,#20 1045 eor v24.16b,v1.16b,v2.16b 1046 add w5,w5,w10 1047 eor v25.16b,v5.16b,v6.16b 1048 add w6,w6,w11 1049 eor v26.16b,v9.16b,v10.16b 1050 add w7,w7,w12 1051 eor v27.16b,v13.16b,v14.16b 1052 add w8,w8,w9 1053 eor v28.16b,v17.16b,v18.16b 1054 eor w21,w21,w5 1055 eor v29.16b,v21.16b,v22.16b 1056 eor w17,w17,w6 1057 ushr v1.4s,v24.4s,#25 1058 eor w19,w19,w7 1059 ushr v5.4s,v25.4s,#25 1060 eor 
w20,w20,w8 1061 ushr v9.4s,v26.4s,#25 1062 ror w21,w21,#24 1063 ushr v13.4s,v27.4s,#25 1064 ror w17,w17,#24 1065 ushr v17.4s,v28.4s,#25 1066 ror w19,w19,#24 1067 ushr v21.4s,v29.4s,#25 1068 ror w20,w20,#24 1069 sli v1.4s,v24.4s,#7 1070 add w15,w15,w21 1071 sli v5.4s,v25.4s,#7 1072 add w16,w16,w17 1073 sli v9.4s,v26.4s,#7 1074 add w13,w13,w19 1075 sli v13.4s,v27.4s,#7 1076 add w14,w14,w20 1077 sli v17.4s,v28.4s,#7 1078 eor w10,w10,w15 1079 sli v21.4s,v29.4s,#7 1080 eor w11,w11,w16 1081 ext v2.16b,v2.16b,v2.16b,#8 1082 eor w12,w12,w13 1083 ext v6.16b,v6.16b,v6.16b,#8 1084 eor w9,w9,w14 1085 ext v10.16b,v10.16b,v10.16b,#8 1086 ror w10,w10,#25 1087 ext v14.16b,v14.16b,v14.16b,#8 1088 ror w11,w11,#25 1089 ext v18.16b,v18.16b,v18.16b,#8 1090 ror w12,w12,#25 1091 ext v22.16b,v22.16b,v22.16b,#8 1092 ror w9,w9,#25 1093 ext v3.16b,v3.16b,v3.16b,#12 1094 ext v7.16b,v7.16b,v7.16b,#12 1095 ext v11.16b,v11.16b,v11.16b,#12 1096 ext v15.16b,v15.16b,v15.16b,#12 1097 ext v19.16b,v19.16b,v19.16b,#12 1098 ext v23.16b,v23.16b,v23.16b,#12 1099 ext v1.16b,v1.16b,v1.16b,#4 1100 ext v5.16b,v5.16b,v5.16b,#4 1101 ext v9.16b,v9.16b,v9.16b,#4 1102 ext v13.16b,v13.16b,v13.16b,#4 1103 ext v17.16b,v17.16b,v17.16b,#4 1104 ext v21.16b,v21.16b,v21.16b,#4 1105 add v0.4s,v0.4s,v1.4s 1106 add w5,w5,w9 1107 add v4.4s,v4.4s,v5.4s 1108 add w6,w6,w10 1109 add v8.4s,v8.4s,v9.4s 1110 add w7,w7,w11 1111 add v12.4s,v12.4s,v13.4s 1112 add w8,w8,w12 1113 add v16.4s,v16.4s,v17.4s 1114 eor w17,w17,w5 1115 add v20.4s,v20.4s,v21.4s 1116 eor w19,w19,w6 1117 eor v3.16b,v3.16b,v0.16b 1118 eor w20,w20,w7 1119 eor v7.16b,v7.16b,v4.16b 1120 eor w21,w21,w8 1121 eor v11.16b,v11.16b,v8.16b 1122 ror w17,w17,#16 1123 eor v15.16b,v15.16b,v12.16b 1124 ror w19,w19,#16 1125 eor v19.16b,v19.16b,v16.16b 1126 ror w20,w20,#16 1127 eor v23.16b,v23.16b,v20.16b 1128 ror w21,w21,#16 1129 rev32 v3.8h,v3.8h 1130 add w13,w13,w17 1131 rev32 v7.8h,v7.8h 1132 add w14,w14,w19 1133 rev32 v11.8h,v11.8h 1134 add w15,w15,w20 1135 rev32 v15.8h,v15.8h 
1136 add w16,w16,w21 1137 rev32 v19.8h,v19.8h 1138 eor w9,w9,w13 1139 rev32 v23.8h,v23.8h 1140 eor w10,w10,w14 1141 add v2.4s,v2.4s,v3.4s 1142 eor w11,w11,w15 1143 add v6.4s,v6.4s,v7.4s 1144 eor w12,w12,w16 1145 add v10.4s,v10.4s,v11.4s 1146 ror w9,w9,#20 1147 add v14.4s,v14.4s,v15.4s 1148 ror w10,w10,#20 1149 add v18.4s,v18.4s,v19.4s 1150 ror w11,w11,#20 1151 add v22.4s,v22.4s,v23.4s 1152 ror w12,w12,#20 1153 eor v24.16b,v1.16b,v2.16b 1154 add w5,w5,w9 1155 eor v25.16b,v5.16b,v6.16b 1156 add w6,w6,w10 1157 eor v26.16b,v9.16b,v10.16b 1158 add w7,w7,w11 1159 eor v27.16b,v13.16b,v14.16b 1160 add w8,w8,w12 1161 eor v28.16b,v17.16b,v18.16b 1162 eor w17,w17,w5 1163 eor v29.16b,v21.16b,v22.16b 1164 eor w19,w19,w6 1165 ushr v1.4s,v24.4s,#20 1166 eor w20,w20,w7 1167 ushr v5.4s,v25.4s,#20 1168 eor w21,w21,w8 1169 ushr v9.4s,v26.4s,#20 1170 ror w17,w17,#24 1171 ushr v13.4s,v27.4s,#20 1172 ror w19,w19,#24 1173 ushr v17.4s,v28.4s,#20 1174 ror w20,w20,#24 1175 ushr v21.4s,v29.4s,#20 1176 ror w21,w21,#24 1177 sli v1.4s,v24.4s,#12 1178 add w13,w13,w17 1179 sli v5.4s,v25.4s,#12 1180 add w14,w14,w19 1181 sli v9.4s,v26.4s,#12 1182 add w15,w15,w20 1183 sli v13.4s,v27.4s,#12 1184 add w16,w16,w21 1185 sli v17.4s,v28.4s,#12 1186 eor w9,w9,w13 1187 sli v21.4s,v29.4s,#12 1188 eor w10,w10,w14 1189 add v0.4s,v0.4s,v1.4s 1190 eor w11,w11,w15 1191 add v4.4s,v4.4s,v5.4s 1192 eor w12,w12,w16 1193 add v8.4s,v8.4s,v9.4s 1194 ror w9,w9,#25 1195 add v12.4s,v12.4s,v13.4s 1196 ror w10,w10,#25 1197 add v16.4s,v16.4s,v17.4s 1198 ror w11,w11,#25 1199 add v20.4s,v20.4s,v21.4s 1200 ror w12,w12,#25 1201 eor v24.16b,v3.16b,v0.16b 1202 add w5,w5,w10 1203 eor v25.16b,v7.16b,v4.16b 1204 add w6,w6,w11 1205 eor v26.16b,v11.16b,v8.16b 1206 add w7,w7,w12 1207 eor v27.16b,v15.16b,v12.16b 1208 add w8,w8,w9 1209 eor v28.16b,v19.16b,v16.16b 1210 eor w21,w21,w5 1211 eor v29.16b,v23.16b,v20.16b 1212 eor w17,w17,w6 1213 ushr v3.4s,v24.4s,#24 1214 eor w19,w19,w7 1215 ushr v7.4s,v25.4s,#24 1216 eor w20,w20,w8 1217 ushr 
v11.4s,v26.4s,#24 1218 ror w21,w21,#16 1219 ushr v15.4s,v27.4s,#24 1220 ror w17,w17,#16 1221 ushr v19.4s,v28.4s,#24 1222 ror w19,w19,#16 1223 ushr v23.4s,v29.4s,#24 1224 ror w20,w20,#16 1225 sli v3.4s,v24.4s,#8 1226 add w15,w15,w21 1227 sli v7.4s,v25.4s,#8 1228 add w16,w16,w17 1229 sli v11.4s,v26.4s,#8 1230 add w13,w13,w19 1231 sli v15.4s,v27.4s,#8 1232 add w14,w14,w20 1233 sli v19.4s,v28.4s,#8 1234 eor w10,w10,w15 1235 sli v23.4s,v29.4s,#8 1236 eor w11,w11,w16 1237 add v2.4s,v2.4s,v3.4s 1238 eor w12,w12,w13 1239 add v6.4s,v6.4s,v7.4s 1240 eor w9,w9,w14 1241 add v10.4s,v10.4s,v11.4s 1242 ror w10,w10,#20 1243 add v14.4s,v14.4s,v15.4s 1244 ror w11,w11,#20 1245 add v18.4s,v18.4s,v19.4s 1246 ror w12,w12,#20 1247 add v22.4s,v22.4s,v23.4s 1248 ror w9,w9,#20 1249 eor v24.16b,v1.16b,v2.16b 1250 add w5,w5,w10 1251 eor v25.16b,v5.16b,v6.16b 1252 add w6,w6,w11 1253 eor v26.16b,v9.16b,v10.16b 1254 add w7,w7,w12 1255 eor v27.16b,v13.16b,v14.16b 1256 add w8,w8,w9 1257 eor v28.16b,v17.16b,v18.16b 1258 eor w21,w21,w5 1259 eor v29.16b,v21.16b,v22.16b 1260 eor w17,w17,w6 1261 ushr v1.4s,v24.4s,#25 1262 eor w19,w19,w7 1263 ushr v5.4s,v25.4s,#25 1264 eor w20,w20,w8 1265 ushr v9.4s,v26.4s,#25 1266 ror w21,w21,#24 1267 ushr v13.4s,v27.4s,#25 1268 ror w17,w17,#24 1269 ushr v17.4s,v28.4s,#25 1270 ror w19,w19,#24 1271 ushr v21.4s,v29.4s,#25 1272 ror w20,w20,#24 1273 sli v1.4s,v24.4s,#7 1274 add w15,w15,w21 1275 sli v5.4s,v25.4s,#7 1276 add w16,w16,w17 1277 sli v9.4s,v26.4s,#7 1278 add w13,w13,w19 1279 sli v13.4s,v27.4s,#7 1280 add w14,w14,w20 1281 sli v17.4s,v28.4s,#7 1282 eor w10,w10,w15 1283 sli v21.4s,v29.4s,#7 1284 eor w11,w11,w16 1285 ext v2.16b,v2.16b,v2.16b,#8 1286 eor w12,w12,w13 1287 ext v6.16b,v6.16b,v6.16b,#8 1288 eor w9,w9,w14 1289 ext v10.16b,v10.16b,v10.16b,#8 1290 ror w10,w10,#25 1291 ext v14.16b,v14.16b,v14.16b,#8 1292 ror w11,w11,#25 1293 ext v18.16b,v18.16b,v18.16b,#8 1294 ror w12,w12,#25 1295 ext v22.16b,v22.16b,v22.16b,#8 1296 ror w9,w9,#25 1297 ext 
v3.16b,v3.16b,v3.16b,#4 1298 ext v7.16b,v7.16b,v7.16b,#4 1299 ext v11.16b,v11.16b,v11.16b,#4 1300 ext v15.16b,v15.16b,v15.16b,#4 1301 ext v19.16b,v19.16b,v19.16b,#4 1302 ext v23.16b,v23.16b,v23.16b,#4 1303 ext v1.16b,v1.16b,v1.16b,#12 1304 ext v5.16b,v5.16b,v5.16b,#12 1305 ext v9.16b,v9.16b,v9.16b,#12 1306 ext v13.16b,v13.16b,v13.16b,#12 1307 ext v17.16b,v17.16b,v17.16b,#12 1308 ext v21.16b,v21.16b,v21.16b,#12 1309 cbnz x4,.Loop_upper_neon 1310 1311 add w5,w5,w22 // accumulate key block 1312 add x6,x6,x22,lsr#32 1313 add w7,w7,w23 1314 add x8,x8,x23,lsr#32 1315 add w9,w9,w24 1316 add x10,x10,x24,lsr#32 1317 add w11,w11,w25 1318 add x12,x12,x25,lsr#32 1319 add w13,w13,w26 1320 add x14,x14,x26,lsr#32 1321 add w15,w15,w27 1322 add x16,x16,x27,lsr#32 1323 add w17,w17,w28 1324 add x19,x19,x28,lsr#32 1325 add w20,w20,w30 1326 add x21,x21,x30,lsr#32 1327 1328 add x5,x5,x6,lsl#32 // pack 1329 add x7,x7,x8,lsl#32 1330 ldp x6,x8,[x1,#0] // load input 1331 add x9,x9,x10,lsl#32 1332 add x11,x11,x12,lsl#32 1333 ldp x10,x12,[x1,#16] 1334 add x13,x13,x14,lsl#32 1335 add x15,x15,x16,lsl#32 1336 ldp x14,x16,[x1,#32] 1337 add x17,x17,x19,lsl#32 1338 add x20,x20,x21,lsl#32 1339 ldp x19,x21,[x1,#48] 1340 add x1,x1,#64 1341#ifdef __ARMEB__ 1342 rev x5,x5 1343 rev x7,x7 1344 rev x9,x9 1345 rev x11,x11 1346 rev x13,x13 1347 rev x15,x15 1348 rev x17,x17 1349 rev x20,x20 1350#endif 1351 eor x5,x5,x6 1352 eor x7,x7,x8 1353 eor x9,x9,x10 1354 eor x11,x11,x12 1355 eor x13,x13,x14 1356 eor x15,x15,x16 1357 eor x17,x17,x19 1358 eor x20,x20,x21 1359 1360 stp x5,x7,[x0,#0] // store output 1361 add x28,x28,#1 // increment counter 1362 mov w5,w22 // unpack key block 1363 lsr x6,x22,#32 1364 stp x9,x11,[x0,#16] 1365 mov w7,w23 1366 lsr x8,x23,#32 1367 stp x13,x15,[x0,#32] 1368 mov w9,w24 1369 lsr x10,x24,#32 1370 stp x17,x20,[x0,#48] 1371 add x0,x0,#64 1372 mov w11,w25 1373 lsr x12,x25,#32 1374 mov w13,w26 1375 lsr x14,x26,#32 1376 mov w15,w27 1377 lsr x16,x27,#32 1378 mov w17,w28 1379 lsr 
x19,x28,#32 1380 mov w20,w30 1381 lsr x21,x30,#32 1382 1383 mov x4,#5 1384.Loop_lower_neon: 1385 sub x4,x4,#1 1386 add v0.4s,v0.4s,v1.4s 1387 add w5,w5,w9 1388 add v4.4s,v4.4s,v5.4s 1389 add w6,w6,w10 1390 add v8.4s,v8.4s,v9.4s 1391 add w7,w7,w11 1392 add v12.4s,v12.4s,v13.4s 1393 add w8,w8,w12 1394 add v16.4s,v16.4s,v17.4s 1395 eor w17,w17,w5 1396 add v20.4s,v20.4s,v21.4s 1397 eor w19,w19,w6 1398 eor v3.16b,v3.16b,v0.16b 1399 eor w20,w20,w7 1400 eor v7.16b,v7.16b,v4.16b 1401 eor w21,w21,w8 1402 eor v11.16b,v11.16b,v8.16b 1403 ror w17,w17,#16 1404 eor v15.16b,v15.16b,v12.16b 1405 ror w19,w19,#16 1406 eor v19.16b,v19.16b,v16.16b 1407 ror w20,w20,#16 1408 eor v23.16b,v23.16b,v20.16b 1409 ror w21,w21,#16 1410 rev32 v3.8h,v3.8h 1411 add w13,w13,w17 1412 rev32 v7.8h,v7.8h 1413 add w14,w14,w19 1414 rev32 v11.8h,v11.8h 1415 add w15,w15,w20 1416 rev32 v15.8h,v15.8h 1417 add w16,w16,w21 1418 rev32 v19.8h,v19.8h 1419 eor w9,w9,w13 1420 rev32 v23.8h,v23.8h 1421 eor w10,w10,w14 1422 add v2.4s,v2.4s,v3.4s 1423 eor w11,w11,w15 1424 add v6.4s,v6.4s,v7.4s 1425 eor w12,w12,w16 1426 add v10.4s,v10.4s,v11.4s 1427 ror w9,w9,#20 1428 add v14.4s,v14.4s,v15.4s 1429 ror w10,w10,#20 1430 add v18.4s,v18.4s,v19.4s 1431 ror w11,w11,#20 1432 add v22.4s,v22.4s,v23.4s 1433 ror w12,w12,#20 1434 eor v24.16b,v1.16b,v2.16b 1435 add w5,w5,w9 1436 eor v25.16b,v5.16b,v6.16b 1437 add w6,w6,w10 1438 eor v26.16b,v9.16b,v10.16b 1439 add w7,w7,w11 1440 eor v27.16b,v13.16b,v14.16b 1441 add w8,w8,w12 1442 eor v28.16b,v17.16b,v18.16b 1443 eor w17,w17,w5 1444 eor v29.16b,v21.16b,v22.16b 1445 eor w19,w19,w6 1446 ushr v1.4s,v24.4s,#20 1447 eor w20,w20,w7 1448 ushr v5.4s,v25.4s,#20 1449 eor w21,w21,w8 1450 ushr v9.4s,v26.4s,#20 1451 ror w17,w17,#24 1452 ushr v13.4s,v27.4s,#20 1453 ror w19,w19,#24 1454 ushr v17.4s,v28.4s,#20 1455 ror w20,w20,#24 1456 ushr v21.4s,v29.4s,#20 1457 ror w21,w21,#24 1458 sli v1.4s,v24.4s,#12 1459 add w13,w13,w17 1460 sli v5.4s,v25.4s,#12 1461 add w14,w14,w19 1462 sli v9.4s,v26.4s,#12 
1463 add w15,w15,w20 1464 sli v13.4s,v27.4s,#12 1465 add w16,w16,w21 1466 sli v17.4s,v28.4s,#12 1467 eor w9,w9,w13 1468 sli v21.4s,v29.4s,#12 1469 eor w10,w10,w14 1470 add v0.4s,v0.4s,v1.4s 1471 eor w11,w11,w15 1472 add v4.4s,v4.4s,v5.4s 1473 eor w12,w12,w16 1474 add v8.4s,v8.4s,v9.4s 1475 ror w9,w9,#25 1476 add v12.4s,v12.4s,v13.4s 1477 ror w10,w10,#25 1478 add v16.4s,v16.4s,v17.4s 1479 ror w11,w11,#25 1480 add v20.4s,v20.4s,v21.4s 1481 ror w12,w12,#25 1482 eor v24.16b,v3.16b,v0.16b 1483 add w5,w5,w10 1484 eor v25.16b,v7.16b,v4.16b 1485 add w6,w6,w11 1486 eor v26.16b,v11.16b,v8.16b 1487 add w7,w7,w12 1488 eor v27.16b,v15.16b,v12.16b 1489 add w8,w8,w9 1490 eor v28.16b,v19.16b,v16.16b 1491 eor w21,w21,w5 1492 eor v29.16b,v23.16b,v20.16b 1493 eor w17,w17,w6 1494 ushr v3.4s,v24.4s,#24 1495 eor w19,w19,w7 1496 ushr v7.4s,v25.4s,#24 1497 eor w20,w20,w8 1498 ushr v11.4s,v26.4s,#24 1499 ror w21,w21,#16 1500 ushr v15.4s,v27.4s,#24 1501 ror w17,w17,#16 1502 ushr v19.4s,v28.4s,#24 1503 ror w19,w19,#16 1504 ushr v23.4s,v29.4s,#24 1505 ror w20,w20,#16 1506 sli v3.4s,v24.4s,#8 1507 add w15,w15,w21 1508 sli v7.4s,v25.4s,#8 1509 add w16,w16,w17 1510 sli v11.4s,v26.4s,#8 1511 add w13,w13,w19 1512 sli v15.4s,v27.4s,#8 1513 add w14,w14,w20 1514 sli v19.4s,v28.4s,#8 1515 eor w10,w10,w15 1516 sli v23.4s,v29.4s,#8 1517 eor w11,w11,w16 1518 add v2.4s,v2.4s,v3.4s 1519 eor w12,w12,w13 1520 add v6.4s,v6.4s,v7.4s 1521 eor w9,w9,w14 1522 add v10.4s,v10.4s,v11.4s 1523 ror w10,w10,#20 1524 add v14.4s,v14.4s,v15.4s 1525 ror w11,w11,#20 1526 add v18.4s,v18.4s,v19.4s 1527 ror w12,w12,#20 1528 add v22.4s,v22.4s,v23.4s 1529 ror w9,w9,#20 1530 eor v24.16b,v1.16b,v2.16b 1531 add w5,w5,w10 1532 eor v25.16b,v5.16b,v6.16b 1533 add w6,w6,w11 1534 eor v26.16b,v9.16b,v10.16b 1535 add w7,w7,w12 1536 eor v27.16b,v13.16b,v14.16b 1537 add w8,w8,w9 1538 eor v28.16b,v17.16b,v18.16b 1539 eor w21,w21,w5 1540 eor v29.16b,v21.16b,v22.16b 1541 eor w17,w17,w6 1542 ushr v1.4s,v24.4s,#25 1543 eor w19,w19,w7 1544 ushr 
v5.4s,v25.4s,#25 1545 eor w20,w20,w8 1546 ushr v9.4s,v26.4s,#25 1547 ror w21,w21,#24 1548 ushr v13.4s,v27.4s,#25 1549 ror w17,w17,#24 1550 ushr v17.4s,v28.4s,#25 1551 ror w19,w19,#24 1552 ushr v21.4s,v29.4s,#25 1553 ror w20,w20,#24 1554 sli v1.4s,v24.4s,#7 1555 add w15,w15,w21 1556 sli v5.4s,v25.4s,#7 1557 add w16,w16,w17 1558 sli v9.4s,v26.4s,#7 1559 add w13,w13,w19 1560 sli v13.4s,v27.4s,#7 1561 add w14,w14,w20 1562 sli v17.4s,v28.4s,#7 1563 eor w10,w10,w15 1564 sli v21.4s,v29.4s,#7 1565 eor w11,w11,w16 1566 ext v2.16b,v2.16b,v2.16b,#8 1567 eor w12,w12,w13 1568 ext v6.16b,v6.16b,v6.16b,#8 1569 eor w9,w9,w14 1570 ext v10.16b,v10.16b,v10.16b,#8 1571 ror w10,w10,#25 1572 ext v14.16b,v14.16b,v14.16b,#8 1573 ror w11,w11,#25 1574 ext v18.16b,v18.16b,v18.16b,#8 1575 ror w12,w12,#25 1576 ext v22.16b,v22.16b,v22.16b,#8 1577 ror w9,w9,#25 1578 ext v3.16b,v3.16b,v3.16b,#12 1579 ext v7.16b,v7.16b,v7.16b,#12 1580 ext v11.16b,v11.16b,v11.16b,#12 1581 ext v15.16b,v15.16b,v15.16b,#12 1582 ext v19.16b,v19.16b,v19.16b,#12 1583 ext v23.16b,v23.16b,v23.16b,#12 1584 ext v1.16b,v1.16b,v1.16b,#4 1585 ext v5.16b,v5.16b,v5.16b,#4 1586 ext v9.16b,v9.16b,v9.16b,#4 1587 ext v13.16b,v13.16b,v13.16b,#4 1588 ext v17.16b,v17.16b,v17.16b,#4 1589 ext v21.16b,v21.16b,v21.16b,#4 1590 add v0.4s,v0.4s,v1.4s 1591 add w5,w5,w9 1592 add v4.4s,v4.4s,v5.4s 1593 add w6,w6,w10 1594 add v8.4s,v8.4s,v9.4s 1595 add w7,w7,w11 1596 add v12.4s,v12.4s,v13.4s 1597 add w8,w8,w12 1598 add v16.4s,v16.4s,v17.4s 1599 eor w17,w17,w5 1600 add v20.4s,v20.4s,v21.4s 1601 eor w19,w19,w6 1602 eor v3.16b,v3.16b,v0.16b 1603 eor w20,w20,w7 1604 eor v7.16b,v7.16b,v4.16b 1605 eor w21,w21,w8 1606 eor v11.16b,v11.16b,v8.16b 1607 ror w17,w17,#16 1608 eor v15.16b,v15.16b,v12.16b 1609 ror w19,w19,#16 1610 eor v19.16b,v19.16b,v16.16b 1611 ror w20,w20,#16 1612 eor v23.16b,v23.16b,v20.16b 1613 ror w21,w21,#16 1614 rev32 v3.8h,v3.8h 1615 add w13,w13,w17 1616 rev32 v7.8h,v7.8h 1617 add w14,w14,w19 1618 rev32 v11.8h,v11.8h 1619 add 
w15,w15,w20 1620 rev32 v15.8h,v15.8h 1621 add w16,w16,w21 1622 rev32 v19.8h,v19.8h 1623 eor w9,w9,w13 1624 rev32 v23.8h,v23.8h 1625 eor w10,w10,w14 1626 add v2.4s,v2.4s,v3.4s 1627 eor w11,w11,w15 1628 add v6.4s,v6.4s,v7.4s 1629 eor w12,w12,w16 1630 add v10.4s,v10.4s,v11.4s 1631 ror w9,w9,#20 1632 add v14.4s,v14.4s,v15.4s 1633 ror w10,w10,#20 1634 add v18.4s,v18.4s,v19.4s 1635 ror w11,w11,#20 1636 add v22.4s,v22.4s,v23.4s 1637 ror w12,w12,#20 1638 eor v24.16b,v1.16b,v2.16b 1639 add w5,w5,w9 1640 eor v25.16b,v5.16b,v6.16b 1641 add w6,w6,w10 1642 eor v26.16b,v9.16b,v10.16b 1643 add w7,w7,w11 1644 eor v27.16b,v13.16b,v14.16b 1645 add w8,w8,w12 1646 eor v28.16b,v17.16b,v18.16b 1647 eor w17,w17,w5 1648 eor v29.16b,v21.16b,v22.16b 1649 eor w19,w19,w6 1650 ushr v1.4s,v24.4s,#20 1651 eor w20,w20,w7 1652 ushr v5.4s,v25.4s,#20 1653 eor w21,w21,w8 1654 ushr v9.4s,v26.4s,#20 1655 ror w17,w17,#24 1656 ushr v13.4s,v27.4s,#20 1657 ror w19,w19,#24 1658 ushr v17.4s,v28.4s,#20 1659 ror w20,w20,#24 1660 ushr v21.4s,v29.4s,#20 1661 ror w21,w21,#24 1662 sli v1.4s,v24.4s,#12 1663 add w13,w13,w17 1664 sli v5.4s,v25.4s,#12 1665 add w14,w14,w19 1666 sli v9.4s,v26.4s,#12 1667 add w15,w15,w20 1668 sli v13.4s,v27.4s,#12 1669 add w16,w16,w21 1670 sli v17.4s,v28.4s,#12 1671 eor w9,w9,w13 1672 sli v21.4s,v29.4s,#12 1673 eor w10,w10,w14 1674 add v0.4s,v0.4s,v1.4s 1675 eor w11,w11,w15 1676 add v4.4s,v4.4s,v5.4s 1677 eor w12,w12,w16 1678 add v8.4s,v8.4s,v9.4s 1679 ror w9,w9,#25 1680 add v12.4s,v12.4s,v13.4s 1681 ror w10,w10,#25 1682 add v16.4s,v16.4s,v17.4s 1683 ror w11,w11,#25 1684 add v20.4s,v20.4s,v21.4s 1685 ror w12,w12,#25 1686 eor v24.16b,v3.16b,v0.16b 1687 add w5,w5,w10 1688 eor v25.16b,v7.16b,v4.16b 1689 add w6,w6,w11 1690 eor v26.16b,v11.16b,v8.16b 1691 add w7,w7,w12 1692 eor v27.16b,v15.16b,v12.16b 1693 add w8,w8,w9 1694 eor v28.16b,v19.16b,v16.16b 1695 eor w21,w21,w5 1696 eor v29.16b,v23.16b,v20.16b 1697 eor w17,w17,w6 1698 ushr v3.4s,v24.4s,#24 1699 eor w19,w19,w7 1700 ushr 
v7.4s,v25.4s,#24 1701 eor w20,w20,w8 1702 ushr v11.4s,v26.4s,#24 1703 ror w21,w21,#16 1704 ushr v15.4s,v27.4s,#24 1705 ror w17,w17,#16 1706 ushr v19.4s,v28.4s,#24 1707 ror w19,w19,#16 1708 ushr v23.4s,v29.4s,#24 1709 ror w20,w20,#16 1710 sli v3.4s,v24.4s,#8 1711 add w15,w15,w21 1712 sli v7.4s,v25.4s,#8 1713 add w16,w16,w17 1714 sli v11.4s,v26.4s,#8 1715 add w13,w13,w19 1716 sli v15.4s,v27.4s,#8 1717 add w14,w14,w20 1718 sli v19.4s,v28.4s,#8 1719 eor w10,w10,w15 1720 sli v23.4s,v29.4s,#8 1721 eor w11,w11,w16 1722 add v2.4s,v2.4s,v3.4s 1723 eor w12,w12,w13 1724 add v6.4s,v6.4s,v7.4s 1725 eor w9,w9,w14 1726 add v10.4s,v10.4s,v11.4s 1727 ror w10,w10,#20 1728 add v14.4s,v14.4s,v15.4s 1729 ror w11,w11,#20 1730 add v18.4s,v18.4s,v19.4s 1731 ror w12,w12,#20 1732 add v22.4s,v22.4s,v23.4s 1733 ror w9,w9,#20 1734 eor v24.16b,v1.16b,v2.16b 1735 add w5,w5,w10 1736 eor v25.16b,v5.16b,v6.16b 1737 add w6,w6,w11 1738 eor v26.16b,v9.16b,v10.16b 1739 add w7,w7,w12 1740 eor v27.16b,v13.16b,v14.16b 1741 add w8,w8,w9 1742 eor v28.16b,v17.16b,v18.16b 1743 eor w21,w21,w5 1744 eor v29.16b,v21.16b,v22.16b 1745 eor w17,w17,w6 1746 ushr v1.4s,v24.4s,#25 1747 eor w19,w19,w7 1748 ushr v5.4s,v25.4s,#25 1749 eor w20,w20,w8 1750 ushr v9.4s,v26.4s,#25 1751 ror w21,w21,#24 1752 ushr v13.4s,v27.4s,#25 1753 ror w17,w17,#24 1754 ushr v17.4s,v28.4s,#25 1755 ror w19,w19,#24 1756 ushr v21.4s,v29.4s,#25 1757 ror w20,w20,#24 1758 sli v1.4s,v24.4s,#7 1759 add w15,w15,w21 1760 sli v5.4s,v25.4s,#7 1761 add w16,w16,w17 1762 sli v9.4s,v26.4s,#7 1763 add w13,w13,w19 1764 sli v13.4s,v27.4s,#7 1765 add w14,w14,w20 1766 sli v17.4s,v28.4s,#7 1767 eor w10,w10,w15 1768 sli v21.4s,v29.4s,#7 1769 eor w11,w11,w16 1770 ext v2.16b,v2.16b,v2.16b,#8 1771 eor w12,w12,w13 1772 ext v6.16b,v6.16b,v6.16b,#8 1773 eor w9,w9,w14 1774 ext v10.16b,v10.16b,v10.16b,#8 1775 ror w10,w10,#25 1776 ext v14.16b,v14.16b,v14.16b,#8 1777 ror w11,w11,#25 1778 ext v18.16b,v18.16b,v18.16b,#8 1779 ror w12,w12,#25 1780 ext v22.16b,v22.16b,v22.16b,#8 
1781 ror w9,w9,#25 1782 ext v3.16b,v3.16b,v3.16b,#4 1783 ext v7.16b,v7.16b,v7.16b,#4 1784 ext v11.16b,v11.16b,v11.16b,#4 1785 ext v15.16b,v15.16b,v15.16b,#4 1786 ext v19.16b,v19.16b,v19.16b,#4 1787 ext v23.16b,v23.16b,v23.16b,#4 1788 ext v1.16b,v1.16b,v1.16b,#12 1789 ext v5.16b,v5.16b,v5.16b,#12 1790 ext v9.16b,v9.16b,v9.16b,#12 1791 ext v13.16b,v13.16b,v13.16b,#12 1792 ext v17.16b,v17.16b,v17.16b,#12 1793 ext v21.16b,v21.16b,v21.16b,#12 1794 cbnz x4,.Loop_lower_neon 1795 1796 add w5,w5,w22 // accumulate key block 1797 ldp q24,q25,[sp,#0] 1798 add x6,x6,x22,lsr#32 1799 ldp q26,q27,[sp,#32] 1800 add w7,w7,w23 1801 ldp q28,q29,[sp,#64] 1802 add x8,x8,x23,lsr#32 1803 add v0.4s,v0.4s,v24.4s 1804 add w9,w9,w24 1805 add v4.4s,v4.4s,v24.4s 1806 add x10,x10,x24,lsr#32 1807 add v8.4s,v8.4s,v24.4s 1808 add w11,w11,w25 1809 add v12.4s,v12.4s,v24.4s 1810 add x12,x12,x25,lsr#32 1811 add v16.4s,v16.4s,v24.4s 1812 add w13,w13,w26 1813 add v20.4s,v20.4s,v24.4s 1814 add x14,x14,x26,lsr#32 1815 add v2.4s,v2.4s,v26.4s 1816 add w15,w15,w27 1817 add v6.4s,v6.4s,v26.4s 1818 add x16,x16,x27,lsr#32 1819 add v10.4s,v10.4s,v26.4s 1820 add w17,w17,w28 1821 add v14.4s,v14.4s,v26.4s 1822 add x19,x19,x28,lsr#32 1823 add v18.4s,v18.4s,v26.4s 1824 add w20,w20,w30 1825 add v22.4s,v22.4s,v26.4s 1826 add x21,x21,x30,lsr#32 1827 add v19.4s,v19.4s,v31.4s // +4 1828 add x5,x5,x6,lsl#32 // pack 1829 add v23.4s,v23.4s,v31.4s // +4 1830 add x7,x7,x8,lsl#32 1831 add v3.4s,v3.4s,v27.4s 1832 ldp x6,x8,[x1,#0] // load input 1833 add v7.4s,v7.4s,v28.4s 1834 add x9,x9,x10,lsl#32 1835 add v11.4s,v11.4s,v29.4s 1836 add x11,x11,x12,lsl#32 1837 add v15.4s,v15.4s,v30.4s 1838 ldp x10,x12,[x1,#16] 1839 add v19.4s,v19.4s,v27.4s 1840 add x13,x13,x14,lsl#32 1841 add v23.4s,v23.4s,v28.4s 1842 add x15,x15,x16,lsl#32 1843 add v1.4s,v1.4s,v25.4s 1844 ldp x14,x16,[x1,#32] 1845 add v5.4s,v5.4s,v25.4s 1846 add x17,x17,x19,lsl#32 1847 add v9.4s,v9.4s,v25.4s 1848 add x20,x20,x21,lsl#32 1849 add v13.4s,v13.4s,v25.4s 1850 ldp 
x19,x21,[x1,#48] 1851 add v17.4s,v17.4s,v25.4s 1852 add x1,x1,#64 1853 add v21.4s,v21.4s,v25.4s 1854 1855#ifdef __ARMEB__ 1856 rev x5,x5 1857 rev x7,x7 1858 rev x9,x9 1859 rev x11,x11 1860 rev x13,x13 1861 rev x15,x15 1862 rev x17,x17 1863 rev x20,x20 1864#endif 1865 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1866 eor x5,x5,x6 1867 eor x7,x7,x8 1868 eor x9,x9,x10 1869 eor x11,x11,x12 1870 eor x13,x13,x14 1871 eor v0.16b,v0.16b,v24.16b 1872 eor x15,x15,x16 1873 eor v1.16b,v1.16b,v25.16b 1874 eor x17,x17,x19 1875 eor v2.16b,v2.16b,v26.16b 1876 eor x20,x20,x21 1877 eor v3.16b,v3.16b,v27.16b 1878 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1879 1880 stp x5,x7,[x0,#0] // store output 1881 add x28,x28,#7 // increment counter 1882 stp x9,x11,[x0,#16] 1883 stp x13,x15,[x0,#32] 1884 stp x17,x20,[x0,#48] 1885 add x0,x0,#64 1886 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1887 1888 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1889 eor v4.16b,v4.16b,v24.16b 1890 eor v5.16b,v5.16b,v25.16b 1891 eor v6.16b,v6.16b,v26.16b 1892 eor v7.16b,v7.16b,v27.16b 1893 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1894 1895 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1896 eor v8.16b,v8.16b,v0.16b 1897 ldp q24,q25,[sp,#0] 1898 eor v9.16b,v9.16b,v1.16b 1899 ldp q26,q27,[sp,#32] 1900 eor v10.16b,v10.16b,v2.16b 1901 eor v11.16b,v11.16b,v3.16b 1902 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1903 1904 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1905 eor v12.16b,v12.16b,v4.16b 1906 eor v13.16b,v13.16b,v5.16b 1907 eor v14.16b,v14.16b,v6.16b 1908 eor v15.16b,v15.16b,v7.16b 1909 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1910 1911 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1912 eor v16.16b,v16.16b,v8.16b 1913 eor v17.16b,v17.16b,v9.16b 1914 eor v18.16b,v18.16b,v10.16b 1915 eor v19.16b,v19.16b,v11.16b 1916 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1917 1918 shl v0.4s,v31.4s,#1 // 4 -> 8 1919 eor v20.16b,v20.16b,v12.16b 1920 eor v21.16b,v21.16b,v13.16b 1921 eor v22.16b,v22.16b,v14.16b 
1922 eor v23.16b,v23.16b,v15.16b 1923 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1924 1925 add v27.4s,v27.4s,v0.4s // += 8 1926 add v28.4s,v28.4s,v0.4s 1927 add v29.4s,v29.4s,v0.4s 1928 add v30.4s,v30.4s,v0.4s 1929 1930 b.hs .Loop_outer_512_neon 1931 1932 adds x2,x2,#512 1933 ushr v0.4s,v31.4s,#2 // 4 -> 1 1934 1935 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1936 ldp d10,d11,[sp,#128+16] 1937 ldp d12,d13,[sp,#128+32] 1938 ldp d14,d15,[sp,#128+48] 1939 1940 stp q24,q31,[sp,#0] // wipe off-load area 1941 stp q24,q31,[sp,#32] 1942 stp q24,q31,[sp,#64] 1943 1944 b.eq .Ldone_512_neon 1945 1946 cmp x2,#192 1947 sub v27.4s,v27.4s,v0.4s // -= 1 1948 sub v28.4s,v28.4s,v0.4s 1949 sub v29.4s,v29.4s,v0.4s 1950 add sp,sp,#128 1951 b.hs .Loop_outer_neon 1952 1953 eor v25.16b,v25.16b,v25.16b 1954 eor v26.16b,v26.16b,v26.16b 1955 eor v27.16b,v27.16b,v27.16b 1956 eor v28.16b,v28.16b,v28.16b 1957 eor v29.16b,v29.16b,v29.16b 1958 eor v30.16b,v30.16b,v30.16b 1959 b .Loop_outer 1960 1961.Ldone_512_neon: 1962 ldp x19,x20,[x29,#16] 1963 add sp,sp,#128+64 1964 ldp x21,x22,[x29,#32] 1965 ldp x23,x24,[x29,#48] 1966 ldp x25,x26,[x29,#64] 1967 ldp x27,x28,[x29,#80] 1968 ldp x29,x30,[sp],#96 1969 ret 1970.size ChaCha20_512_neon,.-ChaCha20_512_neon 1971#endif 1972