/*
 * ChaCha20 keystream XOR for x86-64 (GNU as, AT&T syntax).  The body is
 * wrapped in #if defined(__x86_64__).
 *
 * Entry point: ChaCha20_ctr32 (.globl, .hidden).  Registers as the code
 * reads them on entry:
 *   %rdi  destination buffer (XORed output is stored here)
 *   %rsi  source buffer
 *   %rdx  length in bytes; zero returns at once through .Lno_data, before
 *         anything is pushed
 *   %rcx  pointer to 32 bytes loaded as state words 4..11
 *         (presumably the 256-bit key -- confirm against the C prototype)
 *   %r8   pointer to 16 bytes loaded as state words 12..15.  Word 12 is
 *         the 32-bit block counter: it is the only word that is stepped
 *         per block.  (Words 13..15 are presumably the nonce -- confirm.)
 *
 * Dispatch, in the order the tests appear:
 *   1. ChaCha20_ctr32 loads the 8 bytes at OPENSSL_ia32cap_P+4 into %r10.
 *      Bit 9 of the low dword (testl $512) selects .LChaCha20_ssse3;
 *      otherwise the integer-only loop is used.  (NOTE(review): bit 9 is
 *      presumably the SSSE3 flag, going by the target label -- confirm
 *      against the capability vector layout.)
 *   2. ChaCha20_ssse3 jumps to .LChaCha20_4x when the length exceeds 128.
 *   3. ChaCha20_4x still relies on %r10 from step 1.  Bit 5 of the dword at
 *      OPENSSL_ia32cap_P+8 (%r10 >> 32, testq $32) selects .LChaCha20_8x.
 *      Otherwise, for lengths <= 192, it returns to .Ldo_sse3_after_all
 *      when bit 22 is set and bit 26 is clear in the dword at +4
 *      (mask 71303168 = 0x04400000, compare 4194304 = 0x00400000).
 *
 * Constant tables:
 *   .Lone            {1,0,0,0}: counter step for the one-block paths
 *   .Linc / .Lfour   {0,1,2,3} / {4,4,4,4}: per-lane counter offsets and
 *                    per-iteration step for the 4-block path
 *   .Lincy / .Leight counter offsets and step used by the 8-block path
 *   .Lrot16/.Lrot24  pshufb byte masks that rotate each 32-bit lane left
 *                    by 16 and by 8 bits respectively
 *   .Lsigma          the bytes of "expand 32-byte k" plus a NUL
 *   .Lzero, .Lzeroz, .Lfourz, .Lincz, .Lsixteen
 *                    not referenced by the code in this part of the file
 *   The unlabelled .byte string after .Lsixteen spells
 *   "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
 *
 * Hand-encoded instructions:
 *   .byte 102,72,15,126,214   is movq %xmm2,%rsi
 *   .byte 102,15,56,0,NN      is pshufb; NN is the ModRM byte, e.g. 222 is
 *                             pshufb %xmm6,%xmm3 and 199 is
 *                             pshufb %xmm7,%xmm0
 *   .byte 0xf3,0xc3           is rep ret
 *
 * Integer path (ChaCha20_ctr32 proper):
 *   Saves %rbx, %rbp, %r12-%r15 and reserves 64+24 bytes.  Frame layout:
 *   0..63(%rsp) keystream scratch for the tail, 16/32/48(%rsp) copies of
 *   state rows 1..3, 64+0 remaining length, 64+8 source pointer, 64+16
 *   destination pointer.  During the rounds row 0 lives in
 *   %eax,%ebx,%ecx,%edx, row 1 in %r8d-%r11d, row 3 in %r12d-%r15d.
 *   Row 2 is shared: two words sit in %esi/%edi while the other two are
 *   spilled at 32..47(%rsp) and swapped halfway through each half-round.
 *   %ebp counts 10 iterations of .Loop; each iteration is a column round
 *   followed by a diagonal round (rotations 16, 12, 8, 7).  After the
 *   rounds the input state is added back, %xmm3 is stepped by .Lone, and
 *   64 bytes are XORed from (%rsi) to (%rdi).  When fewer than 64 bytes
 *   remain, .Ltail writes the block to 0..63(%rsp) and .Loop_tail XORs it
 *   one byte at a time.
 *
 * ChaCha20_ssse3:
 *   One block per outer iteration with the four state rows in
 *   %xmm0-%xmm3.  %xmm6/%xmm7 hold .Lrot16/.Lrot24; rotations by 12 and 7
 *   use a shift pair plus por; pshufd $57/$78/$147 rotate the rows
 *   between the column and diagonal half of each double round.  %r8
 *   counts 10 double rounds.  %r9 keeps the entry %rsp and is copied back
 *   at .Ldone_ssse3.  The counter row at 48(%rsp) is stepped by .Lone in
 *   .Loop_outer_ssse3.  The tail is handled as in the integer path.
 *
 * ChaCha20_4x:
 *   Four blocks per outer iteration.  Every state word is broadcast
 *   across one xmm register (pshufd $0x00/$0x55/$0xaa/$0xff), so lane i
 *   belongs to block i; the counter lanes get .Linc added and advance by
 *   .Lfour.  The broadcast input state is kept at 64..319(%rsp), the
 *   upper part addressed through %rcx = %rsp+256, and 0..63(%rsp) is
 *   spill space because sixteen state registers plus temporaries do not
 *   fit in %xmm0-%xmm15.  %r10/%r11 point at .Lrot16/.Lrot24.  After the
 *   rounds and the feed-forward, the punpck{l,h}dq / punpck{l,h}qdq
 *   groups transpose the lanes back into four contiguous blocks.
 *   .Ltail4x XORs whole 64-byte blocks directly; the je .Ldone4x that
 *   follows each group still tests the flags of the cmpq $64/$128/$192
 *   that selected it, since the SSE moves and pxor in between leave
 *   EFLAGS untouched.  The last partial block goes through 0..63(%rsp)
 *   and .Loop_tail4x.  %r9 keeps the entry %rsp.
 *
 * Stack alignment: the movdqa stores to the stack need %rsp to be 16-byte
 * aligned after the frame adjustment.  With %rsp = 8 (mod 16) at entry
 * this holds for 6 pushes + 64+24, for 64+8, and for 0x140+8.
 *
 * ChaCha20_8x begins right after ChaCha20_4x and continues beyond this
 * part of the file.
 */
1#if defined(__x86_64__) 2.text 3 4.extern OPENSSL_ia32cap_P 5.hidden OPENSSL_ia32cap_P 6 7.align 64 8.Lzero: 9.long 0,0,0,0 10.Lone: 11.long 1,0,0,0 12.Linc: 13.long 0,1,2,3 14.Lfour: 15.long 4,4,4,4 16.Lincy: 17.long 0,2,4,6,1,3,5,7 18.Leight: 19.long 8,8,8,8,8,8,8,8 20.Lrot16: 21.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 22.Lrot24: 23.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 24.Lsigma: 25.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 26.align 64 27.Lzeroz: 28.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 29.Lfourz: 30.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 31.Lincz: 32.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 33.Lsixteen: 34.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 35.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 36.globl ChaCha20_ctr32 37.hidden ChaCha20_ctr32 38.type ChaCha20_ctr32,@function 39.align 64 40ChaCha20_ctr32: 41 cmpq $0,%rdx 42 je .Lno_data 43 movq OPENSSL_ia32cap_P+4(%rip),%r10 44 testl $512,%r10d 45 jnz .LChaCha20_ssse3 46 47 pushq %rbx 48 pushq %rbp 49 pushq %r12 50 pushq %r13 51 pushq %r14 52 pushq %r15 53 subq $64+24,%rsp 54.Lctr32_body: 55 56 57 movdqu (%rcx),%xmm1 58 movdqu 16(%rcx),%xmm2 59 movdqu (%r8),%xmm3 60 movdqa .Lone(%rip),%xmm4 61 62 63 movdqa %xmm1,16(%rsp) 64 movdqa %xmm2,32(%rsp) 65 movdqa %xmm3,48(%rsp) 66 movq %rdx,%rbp 67 jmp .Loop_outer 68 69.align 32 70.Loop_outer: 71 movl $0x61707865,%eax 72 movl $0x3320646e,%ebx 73 movl $0x79622d32,%ecx 74 movl $0x6b206574,%edx 75 movl 16(%rsp),%r8d 76 movl 20(%rsp),%r9d 77 movl 24(%rsp),%r10d 78 movl 28(%rsp),%r11d 79 movd %xmm3,%r12d 80 movl 52(%rsp),%r13d 81 movl 56(%rsp),%r14d 82 movl 60(%rsp),%r15d 83 84 movq %rbp,64+0(%rsp) 85 movl $10,%ebp 86 movq %rsi,64+8(%rsp) 87.byte 102,72,15,126,214 88 movq %rdi,64+16(%rsp) 89 movq %rsi,%rdi 90 shrq $32,%rdi 91 jmp 
.Loop 92 93.align 32 94.Loop: 95 addl %r8d,%eax 96 xorl %eax,%r12d 97 roll $16,%r12d 98 addl %r9d,%ebx 99 xorl %ebx,%r13d 100 roll $16,%r13d 101 addl %r12d,%esi 102 xorl %esi,%r8d 103 roll $12,%r8d 104 addl %r13d,%edi 105 xorl %edi,%r9d 106 roll $12,%r9d 107 addl %r8d,%eax 108 xorl %eax,%r12d 109 roll $8,%r12d 110 addl %r9d,%ebx 111 xorl %ebx,%r13d 112 roll $8,%r13d 113 addl %r12d,%esi 114 xorl %esi,%r8d 115 roll $7,%r8d 116 addl %r13d,%edi 117 xorl %edi,%r9d 118 roll $7,%r9d 119 movl %esi,32(%rsp) 120 movl %edi,36(%rsp) 121 movl 40(%rsp),%esi 122 movl 44(%rsp),%edi 123 addl %r10d,%ecx 124 xorl %ecx,%r14d 125 roll $16,%r14d 126 addl %r11d,%edx 127 xorl %edx,%r15d 128 roll $16,%r15d 129 addl %r14d,%esi 130 xorl %esi,%r10d 131 roll $12,%r10d 132 addl %r15d,%edi 133 xorl %edi,%r11d 134 roll $12,%r11d 135 addl %r10d,%ecx 136 xorl %ecx,%r14d 137 roll $8,%r14d 138 addl %r11d,%edx 139 xorl %edx,%r15d 140 roll $8,%r15d 141 addl %r14d,%esi 142 xorl %esi,%r10d 143 roll $7,%r10d 144 addl %r15d,%edi 145 xorl %edi,%r11d 146 roll $7,%r11d 147 addl %r9d,%eax 148 xorl %eax,%r15d 149 roll $16,%r15d 150 addl %r10d,%ebx 151 xorl %ebx,%r12d 152 roll $16,%r12d 153 addl %r15d,%esi 154 xorl %esi,%r9d 155 roll $12,%r9d 156 addl %r12d,%edi 157 xorl %edi,%r10d 158 roll $12,%r10d 159 addl %r9d,%eax 160 xorl %eax,%r15d 161 roll $8,%r15d 162 addl %r10d,%ebx 163 xorl %ebx,%r12d 164 roll $8,%r12d 165 addl %r15d,%esi 166 xorl %esi,%r9d 167 roll $7,%r9d 168 addl %r12d,%edi 169 xorl %edi,%r10d 170 roll $7,%r10d 171 movl %esi,40(%rsp) 172 movl %edi,44(%rsp) 173 movl 32(%rsp),%esi 174 movl 36(%rsp),%edi 175 addl %r11d,%ecx 176 xorl %ecx,%r13d 177 roll $16,%r13d 178 addl %r8d,%edx 179 xorl %edx,%r14d 180 roll $16,%r14d 181 addl %r13d,%esi 182 xorl %esi,%r11d 183 roll $12,%r11d 184 addl %r14d,%edi 185 xorl %edi,%r8d 186 roll $12,%r8d 187 addl %r11d,%ecx 188 xorl %ecx,%r13d 189 roll $8,%r13d 190 addl %r8d,%edx 191 xorl %edx,%r14d 192 roll $8,%r14d 193 addl %r13d,%esi 194 xorl %esi,%r11d 195 roll 
$7,%r11d 196 addl %r14d,%edi 197 xorl %edi,%r8d 198 roll $7,%r8d 199 decl %ebp 200 jnz .Loop 201 movl %edi,36(%rsp) 202 movl %esi,32(%rsp) 203 movq 64(%rsp),%rbp 204 movdqa %xmm2,%xmm1 205 movq 64+8(%rsp),%rsi 206 paddd %xmm4,%xmm3 207 movq 64+16(%rsp),%rdi 208 209 addl $0x61707865,%eax 210 addl $0x3320646e,%ebx 211 addl $0x79622d32,%ecx 212 addl $0x6b206574,%edx 213 addl 16(%rsp),%r8d 214 addl 20(%rsp),%r9d 215 addl 24(%rsp),%r10d 216 addl 28(%rsp),%r11d 217 addl 48(%rsp),%r12d 218 addl 52(%rsp),%r13d 219 addl 56(%rsp),%r14d 220 addl 60(%rsp),%r15d 221 paddd 32(%rsp),%xmm1 222 223 cmpq $64,%rbp 224 jb .Ltail 225 226 xorl 0(%rsi),%eax 227 xorl 4(%rsi),%ebx 228 xorl 8(%rsi),%ecx 229 xorl 12(%rsi),%edx 230 xorl 16(%rsi),%r8d 231 xorl 20(%rsi),%r9d 232 xorl 24(%rsi),%r10d 233 xorl 28(%rsi),%r11d 234 movdqu 32(%rsi),%xmm0 235 xorl 48(%rsi),%r12d 236 xorl 52(%rsi),%r13d 237 xorl 56(%rsi),%r14d 238 xorl 60(%rsi),%r15d 239 leaq 64(%rsi),%rsi 240 pxor %xmm1,%xmm0 241 242 movdqa %xmm2,32(%rsp) 243 movd %xmm3,48(%rsp) 244 245 movl %eax,0(%rdi) 246 movl %ebx,4(%rdi) 247 movl %ecx,8(%rdi) 248 movl %edx,12(%rdi) 249 movl %r8d,16(%rdi) 250 movl %r9d,20(%rdi) 251 movl %r10d,24(%rdi) 252 movl %r11d,28(%rdi) 253 movdqu %xmm0,32(%rdi) 254 movl %r12d,48(%rdi) 255 movl %r13d,52(%rdi) 256 movl %r14d,56(%rdi) 257 movl %r15d,60(%rdi) 258 leaq 64(%rdi),%rdi 259 260 subq $64,%rbp 261 jnz .Loop_outer 262 263 jmp .Ldone 264 265.align 16 266.Ltail: 267 movl %eax,0(%rsp) 268 movl %ebx,4(%rsp) 269 xorq %rbx,%rbx 270 movl %ecx,8(%rsp) 271 movl %edx,12(%rsp) 272 movl %r8d,16(%rsp) 273 movl %r9d,20(%rsp) 274 movl %r10d,24(%rsp) 275 movl %r11d,28(%rsp) 276 movdqa %xmm1,32(%rsp) 277 movl %r12d,48(%rsp) 278 movl %r13d,52(%rsp) 279 movl %r14d,56(%rsp) 280 movl %r15d,60(%rsp) 281 282.Loop_tail: 283 movzbl (%rsi,%rbx,1),%eax 284 movzbl (%rsp,%rbx,1),%edx 285 leaq 1(%rbx),%rbx 286 xorl %edx,%eax 287 movb %al,-1(%rdi,%rbx,1) 288 decq %rbp 289 jnz .Loop_tail 290 291.Ldone: 292 leaq 64+24+48(%rsp),%rsi 293 
/* ChaCha20_ctr32 epilogue: %rsi = %rsp+64+24+48, the stack pointer as it was before the six pushes; reload %r15..%rbx from below it, then unwind %rsp. */
movq -48(%rsi),%r15 294 movq -40(%rsi),%r14 295 movq -32(%rsi),%r13 296 movq -24(%rsi),%r12 297 movq -16(%rsi),%rbp 298 movq -8(%rsi),%rbx 299 leaq (%rsi),%rsp 300.Lno_data: 301 .byte 0xf3,0xc3 302.size ChaCha20_ctr32,.-ChaCha20_ctr32 303.type ChaCha20_ssse3,@function 304.align 32 305ChaCha20_ssse3: 306.LChaCha20_ssse3: 307 movq %rsp,%r9 308 cmpq $128,%rdx 309 ja .LChaCha20_4x 310 311.Ldo_sse3_after_all: 312 subq $64+8,%rsp 313 movdqa .Lsigma(%rip),%xmm0 314 movdqu (%rcx),%xmm1 315 movdqu 16(%rcx),%xmm2 316 movdqu (%r8),%xmm3 317 movdqa .Lrot16(%rip),%xmm6 318 movdqa .Lrot24(%rip),%xmm7 319 320 movdqa %xmm0,0(%rsp) 321 movdqa %xmm1,16(%rsp) 322 movdqa %xmm2,32(%rsp) 323 movdqa %xmm3,48(%rsp) 324 movq $10,%r8 325 jmp .Loop_ssse3 326 327.align 32 328.Loop_outer_ssse3: 329 movdqa .Lone(%rip),%xmm3 330 movdqa 0(%rsp),%xmm0 331 movdqa 16(%rsp),%xmm1 332 movdqa 32(%rsp),%xmm2 333 paddd 48(%rsp),%xmm3 334 movq $10,%r8 335 movdqa %xmm3,48(%rsp) 336 jmp .Loop_ssse3 337 338.align 32 339.Loop_ssse3: 340 paddd %xmm1,%xmm0 341 pxor %xmm0,%xmm3 342.byte 102,15,56,0,222 343 paddd %xmm3,%xmm2 344 pxor %xmm2,%xmm1 345 movdqa %xmm1,%xmm4 346 psrld $20,%xmm1 347 pslld $12,%xmm4 348 por %xmm4,%xmm1 349 paddd %xmm1,%xmm0 350 pxor %xmm0,%xmm3 351.byte 102,15,56,0,223 352 paddd %xmm3,%xmm2 353 pxor %xmm2,%xmm1 354 movdqa %xmm1,%xmm4 355 psrld $25,%xmm1 356 pslld $7,%xmm4 357 por %xmm4,%xmm1 358 pshufd $78,%xmm2,%xmm2 359 pshufd $57,%xmm1,%xmm1 360 pshufd $147,%xmm3,%xmm3 361 nop 362 paddd %xmm1,%xmm0 363 pxor %xmm0,%xmm3 364.byte 102,15,56,0,222 365 paddd %xmm3,%xmm2 366 pxor %xmm2,%xmm1 367 movdqa %xmm1,%xmm4 368 psrld $20,%xmm1 369 pslld $12,%xmm4 370 por %xmm4,%xmm1 371 paddd %xmm1,%xmm0 372 pxor %xmm0,%xmm3 373.byte 102,15,56,0,223 374 paddd %xmm3,%xmm2 375 pxor %xmm2,%xmm1 376 movdqa %xmm1,%xmm4 377 psrld $25,%xmm1 378 pslld $7,%xmm4 379 por %xmm4,%xmm1 380 pshufd $78,%xmm2,%xmm2 381 pshufd $147,%xmm1,%xmm1 382 pshufd $57,%xmm3,%xmm3 383 decq %r8 384 jnz .Loop_ssse3 385 paddd 
0(%rsp),%xmm0 386 paddd 16(%rsp),%xmm1 387 paddd 32(%rsp),%xmm2 388 paddd 48(%rsp),%xmm3 389 390 cmpq $64,%rdx 391 jb .Ltail_ssse3 392 393 movdqu 0(%rsi),%xmm4 394 movdqu 16(%rsi),%xmm5 395 pxor %xmm4,%xmm0 396 movdqu 32(%rsi),%xmm4 397 pxor %xmm5,%xmm1 398 movdqu 48(%rsi),%xmm5 399 leaq 64(%rsi),%rsi 400 pxor %xmm4,%xmm2 401 pxor %xmm5,%xmm3 402 403 movdqu %xmm0,0(%rdi) 404 movdqu %xmm1,16(%rdi) 405 movdqu %xmm2,32(%rdi) 406 movdqu %xmm3,48(%rdi) 407 leaq 64(%rdi),%rdi 408 409 subq $64,%rdx 410 jnz .Loop_outer_ssse3 411 412 jmp .Ldone_ssse3 413 414.align 16 415.Ltail_ssse3: 416 movdqa %xmm0,0(%rsp) 417 movdqa %xmm1,16(%rsp) 418 movdqa %xmm2,32(%rsp) 419 movdqa %xmm3,48(%rsp) 420 xorq %r8,%r8 421 422.Loop_tail_ssse3: 423 movzbl (%rsi,%r8,1),%eax 424 movzbl (%rsp,%r8,1),%ecx 425 leaq 1(%r8),%r8 426 xorl %ecx,%eax 427 movb %al,-1(%rdi,%r8,1) 428 decq %rdx 429 jnz .Loop_tail_ssse3 430 431.Ldone_ssse3: 432 leaq (%r9),%rsp 433.Lssse3_epilogue: 434 .byte 0xf3,0xc3 435.size ChaCha20_ssse3,.-ChaCha20_ssse3 436.type ChaCha20_4x,@function 437.align 32 438ChaCha20_4x: 439.LChaCha20_4x: 440 movq %rsp,%r9 441 movq %r10,%r11 442 shrq $32,%r10 443 testq $32,%r10 444 jnz .LChaCha20_8x 445 cmpq $192,%rdx 446 ja .Lproceed4x 447 448 andq $71303168,%r11 449 cmpq $4194304,%r11 450 je .Ldo_sse3_after_all 451 452.Lproceed4x: 453 subq $0x140+8,%rsp 454 movdqa .Lsigma(%rip),%xmm11 455 movdqu (%rcx),%xmm15 456 movdqu 16(%rcx),%xmm7 457 movdqu (%r8),%xmm3 458 leaq 256(%rsp),%rcx 459 leaq .Lrot16(%rip),%r10 460 leaq .Lrot24(%rip),%r11 461 462 pshufd $0x00,%xmm11,%xmm8 463 pshufd $0x55,%xmm11,%xmm9 464 movdqa %xmm8,64(%rsp) 465 pshufd $0xaa,%xmm11,%xmm10 466 movdqa %xmm9,80(%rsp) 467 pshufd $0xff,%xmm11,%xmm11 468 movdqa %xmm10,96(%rsp) 469 movdqa %xmm11,112(%rsp) 470 471 pshufd $0x00,%xmm15,%xmm12 472 pshufd $0x55,%xmm15,%xmm13 473 movdqa %xmm12,128-256(%rcx) 474 pshufd $0xaa,%xmm15,%xmm14 475 movdqa %xmm13,144-256(%rcx) 476 pshufd $0xff,%xmm15,%xmm15 477 movdqa %xmm14,160-256(%rcx) 478 
movdqa %xmm15,176-256(%rcx) 479 480 pshufd $0x00,%xmm7,%xmm4 481 pshufd $0x55,%xmm7,%xmm5 482 movdqa %xmm4,192-256(%rcx) 483 pshufd $0xaa,%xmm7,%xmm6 484 movdqa %xmm5,208-256(%rcx) 485 pshufd $0xff,%xmm7,%xmm7 486 movdqa %xmm6,224-256(%rcx) 487 movdqa %xmm7,240-256(%rcx) 488 489 pshufd $0x00,%xmm3,%xmm0 490 pshufd $0x55,%xmm3,%xmm1 491 paddd .Linc(%rip),%xmm0 492 pshufd $0xaa,%xmm3,%xmm2 493 movdqa %xmm1,272-256(%rcx) 494 pshufd $0xff,%xmm3,%xmm3 495 movdqa %xmm2,288-256(%rcx) 496 movdqa %xmm3,304-256(%rcx) 497 498 jmp .Loop_enter4x 499 500.align 32 501.Loop_outer4x: 502 movdqa 64(%rsp),%xmm8 503 movdqa 80(%rsp),%xmm9 504 movdqa 96(%rsp),%xmm10 505 movdqa 112(%rsp),%xmm11 506 movdqa 128-256(%rcx),%xmm12 507 movdqa 144-256(%rcx),%xmm13 508 movdqa 160-256(%rcx),%xmm14 509 movdqa 176-256(%rcx),%xmm15 510 movdqa 192-256(%rcx),%xmm4 511 movdqa 208-256(%rcx),%xmm5 512 movdqa 224-256(%rcx),%xmm6 513 movdqa 240-256(%rcx),%xmm7 514 movdqa 256-256(%rcx),%xmm0 515 movdqa 272-256(%rcx),%xmm1 516 movdqa 288-256(%rcx),%xmm2 517 movdqa 304-256(%rcx),%xmm3 518 paddd .Lfour(%rip),%xmm0 519 520.Loop_enter4x: 521 movdqa %xmm6,32(%rsp) 522 movdqa %xmm7,48(%rsp) 523 movdqa (%r10),%xmm7 524 movl $10,%eax 525 movdqa %xmm0,256-256(%rcx) 526 jmp .Loop4x 527 528.align 32 529.Loop4x: 530 paddd %xmm12,%xmm8 531 paddd %xmm13,%xmm9 532 pxor %xmm8,%xmm0 533 pxor %xmm9,%xmm1 534.byte 102,15,56,0,199 535.byte 102,15,56,0,207 536 paddd %xmm0,%xmm4 537 paddd %xmm1,%xmm5 538 pxor %xmm4,%xmm12 539 pxor %xmm5,%xmm13 540 movdqa %xmm12,%xmm6 541 pslld $12,%xmm12 542 psrld $20,%xmm6 543 movdqa %xmm13,%xmm7 544 pslld $12,%xmm13 545 por %xmm6,%xmm12 546 psrld $20,%xmm7 547 movdqa (%r11),%xmm6 548 por %xmm7,%xmm13 549 paddd %xmm12,%xmm8 550 paddd %xmm13,%xmm9 551 pxor %xmm8,%xmm0 552 pxor %xmm9,%xmm1 553.byte 102,15,56,0,198 554.byte 102,15,56,0,206 555 paddd %xmm0,%xmm4 556 paddd %xmm1,%xmm5 557 pxor %xmm4,%xmm12 558 pxor %xmm5,%xmm13 559 movdqa %xmm12,%xmm7 560 pslld $7,%xmm12 561 psrld $25,%xmm7 562 
movdqa %xmm13,%xmm6 563 pslld $7,%xmm13 564 por %xmm7,%xmm12 565 psrld $25,%xmm6 566 movdqa (%r10),%xmm7 567 por %xmm6,%xmm13 568 movdqa %xmm4,0(%rsp) 569 movdqa %xmm5,16(%rsp) 570 movdqa 32(%rsp),%xmm4 571 movdqa 48(%rsp),%xmm5 572 paddd %xmm14,%xmm10 573 paddd %xmm15,%xmm11 574 pxor %xmm10,%xmm2 575 pxor %xmm11,%xmm3 576.byte 102,15,56,0,215 577.byte 102,15,56,0,223 578 paddd %xmm2,%xmm4 579 paddd %xmm3,%xmm5 580 pxor %xmm4,%xmm14 581 pxor %xmm5,%xmm15 582 movdqa %xmm14,%xmm6 583 pslld $12,%xmm14 584 psrld $20,%xmm6 585 movdqa %xmm15,%xmm7 586 pslld $12,%xmm15 587 por %xmm6,%xmm14 588 psrld $20,%xmm7 589 movdqa (%r11),%xmm6 590 por %xmm7,%xmm15 591 paddd %xmm14,%xmm10 592 paddd %xmm15,%xmm11 593 pxor %xmm10,%xmm2 594 pxor %xmm11,%xmm3 595.byte 102,15,56,0,214 596.byte 102,15,56,0,222 597 paddd %xmm2,%xmm4 598 paddd %xmm3,%xmm5 599 pxor %xmm4,%xmm14 600 pxor %xmm5,%xmm15 601 movdqa %xmm14,%xmm7 602 pslld $7,%xmm14 603 psrld $25,%xmm7 604 movdqa %xmm15,%xmm6 605 pslld $7,%xmm15 606 por %xmm7,%xmm14 607 psrld $25,%xmm6 608 movdqa (%r10),%xmm7 609 por %xmm6,%xmm15 610 paddd %xmm13,%xmm8 611 paddd %xmm14,%xmm9 612 pxor %xmm8,%xmm3 613 pxor %xmm9,%xmm0 614.byte 102,15,56,0,223 615.byte 102,15,56,0,199 616 paddd %xmm3,%xmm4 617 paddd %xmm0,%xmm5 618 pxor %xmm4,%xmm13 619 pxor %xmm5,%xmm14 620 movdqa %xmm13,%xmm6 621 pslld $12,%xmm13 622 psrld $20,%xmm6 623 movdqa %xmm14,%xmm7 624 pslld $12,%xmm14 625 por %xmm6,%xmm13 626 psrld $20,%xmm7 627 movdqa (%r11),%xmm6 628 por %xmm7,%xmm14 629 paddd %xmm13,%xmm8 630 paddd %xmm14,%xmm9 631 pxor %xmm8,%xmm3 632 pxor %xmm9,%xmm0 633.byte 102,15,56,0,222 634.byte 102,15,56,0,198 635 paddd %xmm3,%xmm4 636 paddd %xmm0,%xmm5 637 pxor %xmm4,%xmm13 638 pxor %xmm5,%xmm14 639 movdqa %xmm13,%xmm7 640 pslld $7,%xmm13 641 psrld $25,%xmm7 642 movdqa %xmm14,%xmm6 643 pslld $7,%xmm14 644 por %xmm7,%xmm13 645 psrld $25,%xmm6 646 movdqa (%r10),%xmm7 647 por %xmm6,%xmm14 648 movdqa %xmm4,32(%rsp) 649 movdqa %xmm5,48(%rsp) 650 movdqa 0(%rsp),%xmm4 
651 movdqa 16(%rsp),%xmm5 652 paddd %xmm15,%xmm10 653 paddd %xmm12,%xmm11 654 pxor %xmm10,%xmm1 655 pxor %xmm11,%xmm2 656.byte 102,15,56,0,207 657.byte 102,15,56,0,215 658 paddd %xmm1,%xmm4 659 paddd %xmm2,%xmm5 660 pxor %xmm4,%xmm15 661 pxor %xmm5,%xmm12 662 movdqa %xmm15,%xmm6 663 pslld $12,%xmm15 664 psrld $20,%xmm6 665 movdqa %xmm12,%xmm7 666 pslld $12,%xmm12 667 por %xmm6,%xmm15 668 psrld $20,%xmm7 669 movdqa (%r11),%xmm6 670 por %xmm7,%xmm12 671 paddd %xmm15,%xmm10 672 paddd %xmm12,%xmm11 673 pxor %xmm10,%xmm1 674 pxor %xmm11,%xmm2 675.byte 102,15,56,0,206 676.byte 102,15,56,0,214 677 paddd %xmm1,%xmm4 678 paddd %xmm2,%xmm5 679 pxor %xmm4,%xmm15 680 pxor %xmm5,%xmm12 681 movdqa %xmm15,%xmm7 682 pslld $7,%xmm15 683 psrld $25,%xmm7 684 movdqa %xmm12,%xmm6 685 pslld $7,%xmm12 686 por %xmm7,%xmm15 687 psrld $25,%xmm6 688 movdqa (%r10),%xmm7 689 por %xmm6,%xmm12 690 decl %eax 691 jnz .Loop4x 692 693 paddd 64(%rsp),%xmm8 694 paddd 80(%rsp),%xmm9 695 paddd 96(%rsp),%xmm10 696 paddd 112(%rsp),%xmm11 697 698 movdqa %xmm8,%xmm6 699 punpckldq %xmm9,%xmm8 700 movdqa %xmm10,%xmm7 701 punpckldq %xmm11,%xmm10 702 punpckhdq %xmm9,%xmm6 703 punpckhdq %xmm11,%xmm7 704 movdqa %xmm8,%xmm9 705 punpcklqdq %xmm10,%xmm8 706 movdqa %xmm6,%xmm11 707 punpcklqdq %xmm7,%xmm6 708 punpckhqdq %xmm10,%xmm9 709 punpckhqdq %xmm7,%xmm11 710 paddd 128-256(%rcx),%xmm12 711 paddd 144-256(%rcx),%xmm13 712 paddd 160-256(%rcx),%xmm14 713 paddd 176-256(%rcx),%xmm15 714 715 movdqa %xmm8,0(%rsp) 716 movdqa %xmm9,16(%rsp) 717 movdqa 32(%rsp),%xmm8 718 movdqa 48(%rsp),%xmm9 719 720 movdqa %xmm12,%xmm10 721 punpckldq %xmm13,%xmm12 722 movdqa %xmm14,%xmm7 723 punpckldq %xmm15,%xmm14 724 punpckhdq %xmm13,%xmm10 725 punpckhdq %xmm15,%xmm7 726 movdqa %xmm12,%xmm13 727 punpcklqdq %xmm14,%xmm12 728 movdqa %xmm10,%xmm15 729 punpcklqdq %xmm7,%xmm10 730 punpckhqdq %xmm14,%xmm13 731 punpckhqdq %xmm7,%xmm15 732 paddd 192-256(%rcx),%xmm4 733 paddd 208-256(%rcx),%xmm5 734 paddd 224-256(%rcx),%xmm8 735 paddd 
240-256(%rcx),%xmm9 736 737 movdqa %xmm6,32(%rsp) 738 movdqa %xmm11,48(%rsp) 739 740 movdqa %xmm4,%xmm14 741 punpckldq %xmm5,%xmm4 742 movdqa %xmm8,%xmm7 743 punpckldq %xmm9,%xmm8 744 punpckhdq %xmm5,%xmm14 745 punpckhdq %xmm9,%xmm7 746 movdqa %xmm4,%xmm5 747 punpcklqdq %xmm8,%xmm4 748 movdqa %xmm14,%xmm9 749 punpcklqdq %xmm7,%xmm14 750 punpckhqdq %xmm8,%xmm5 751 punpckhqdq %xmm7,%xmm9 752 paddd 256-256(%rcx),%xmm0 753 paddd 272-256(%rcx),%xmm1 754 paddd 288-256(%rcx),%xmm2 755 paddd 304-256(%rcx),%xmm3 756 757 movdqa %xmm0,%xmm8 758 punpckldq %xmm1,%xmm0 759 movdqa %xmm2,%xmm7 760 punpckldq %xmm3,%xmm2 761 punpckhdq %xmm1,%xmm8 762 punpckhdq %xmm3,%xmm7 763 movdqa %xmm0,%xmm1 764 punpcklqdq %xmm2,%xmm0 765 movdqa %xmm8,%xmm3 766 punpcklqdq %xmm7,%xmm8 767 punpckhqdq %xmm2,%xmm1 768 punpckhqdq %xmm7,%xmm3 769 cmpq $256,%rdx 770 jb .Ltail4x 771 772 movdqu 0(%rsi),%xmm6 773 movdqu 16(%rsi),%xmm11 774 movdqu 32(%rsi),%xmm2 775 movdqu 48(%rsi),%xmm7 776 pxor 0(%rsp),%xmm6 777 pxor %xmm12,%xmm11 778 pxor %xmm4,%xmm2 779 pxor %xmm0,%xmm7 780 781 movdqu %xmm6,0(%rdi) 782 movdqu 64(%rsi),%xmm6 783 movdqu %xmm11,16(%rdi) 784 movdqu 80(%rsi),%xmm11 785 movdqu %xmm2,32(%rdi) 786 movdqu 96(%rsi),%xmm2 787 movdqu %xmm7,48(%rdi) 788 movdqu 112(%rsi),%xmm7 789 leaq 128(%rsi),%rsi 790 pxor 16(%rsp),%xmm6 791 pxor %xmm13,%xmm11 792 pxor %xmm5,%xmm2 793 pxor %xmm1,%xmm7 794 795 movdqu %xmm6,64(%rdi) 796 movdqu 0(%rsi),%xmm6 797 movdqu %xmm11,80(%rdi) 798 movdqu 16(%rsi),%xmm11 799 movdqu %xmm2,96(%rdi) 800 movdqu 32(%rsi),%xmm2 801 movdqu %xmm7,112(%rdi) 802 leaq 128(%rdi),%rdi 803 movdqu 48(%rsi),%xmm7 804 pxor 32(%rsp),%xmm6 805 pxor %xmm10,%xmm11 806 pxor %xmm14,%xmm2 807 pxor %xmm8,%xmm7 808 809 movdqu %xmm6,0(%rdi) 810 movdqu 64(%rsi),%xmm6 811 movdqu %xmm11,16(%rdi) 812 movdqu 80(%rsi),%xmm11 813 movdqu %xmm2,32(%rdi) 814 movdqu 96(%rsi),%xmm2 815 movdqu %xmm7,48(%rdi) 816 movdqu 112(%rsi),%xmm7 817 leaq 128(%rsi),%rsi 818 pxor 48(%rsp),%xmm6 819 pxor %xmm15,%xmm11 820 pxor 
%xmm9,%xmm2 821 pxor %xmm3,%xmm7 822 movdqu %xmm6,64(%rdi) 823 movdqu %xmm11,80(%rdi) 824 movdqu %xmm2,96(%rdi) 825 movdqu %xmm7,112(%rdi) 826 leaq 128(%rdi),%rdi 827 828 subq $256,%rdx 829 jnz .Loop_outer4x 830 831 jmp .Ldone4x 832 833.Ltail4x: 834 cmpq $192,%rdx 835 jae .L192_or_more4x 836 cmpq $128,%rdx 837 jae .L128_or_more4x 838 cmpq $64,%rdx 839 jae .L64_or_more4x 840 841 842 xorq %r10,%r10 843 844 movdqa %xmm12,16(%rsp) 845 movdqa %xmm4,32(%rsp) 846 movdqa %xmm0,48(%rsp) 847 jmp .Loop_tail4x 848 849.align 32 850.L64_or_more4x: 851 movdqu 0(%rsi),%xmm6 852 movdqu 16(%rsi),%xmm11 853 movdqu 32(%rsi),%xmm2 854 movdqu 48(%rsi),%xmm7 855 pxor 0(%rsp),%xmm6 856 pxor %xmm12,%xmm11 857 pxor %xmm4,%xmm2 858 pxor %xmm0,%xmm7 859 movdqu %xmm6,0(%rdi) 860 movdqu %xmm11,16(%rdi) 861 movdqu %xmm2,32(%rdi) 862 movdqu %xmm7,48(%rdi) 863 je .Ldone4x 864 865 movdqa 16(%rsp),%xmm6 866 leaq 64(%rsi),%rsi 867 xorq %r10,%r10 868 movdqa %xmm6,0(%rsp) 869 movdqa %xmm13,16(%rsp) 870 leaq 64(%rdi),%rdi 871 movdqa %xmm5,32(%rsp) 872 subq $64,%rdx 873 movdqa %xmm1,48(%rsp) 874 jmp .Loop_tail4x 875 876.align 32 877.L128_or_more4x: 878 movdqu 0(%rsi),%xmm6 879 movdqu 16(%rsi),%xmm11 880 movdqu 32(%rsi),%xmm2 881 movdqu 48(%rsi),%xmm7 882 pxor 0(%rsp),%xmm6 883 pxor %xmm12,%xmm11 884 pxor %xmm4,%xmm2 885 pxor %xmm0,%xmm7 886 887 movdqu %xmm6,0(%rdi) 888 movdqu 64(%rsi),%xmm6 889 movdqu %xmm11,16(%rdi) 890 movdqu 80(%rsi),%xmm11 891 movdqu %xmm2,32(%rdi) 892 movdqu 96(%rsi),%xmm2 893 movdqu %xmm7,48(%rdi) 894 movdqu 112(%rsi),%xmm7 895 pxor 16(%rsp),%xmm6 896 pxor %xmm13,%xmm11 897 pxor %xmm5,%xmm2 898 pxor %xmm1,%xmm7 899 movdqu %xmm6,64(%rdi) 900 movdqu %xmm11,80(%rdi) 901 movdqu %xmm2,96(%rdi) 902 movdqu %xmm7,112(%rdi) 903 je .Ldone4x 904 905 movdqa 32(%rsp),%xmm6 906 leaq 128(%rsi),%rsi 907 xorq %r10,%r10 908 movdqa %xmm6,0(%rsp) 909 movdqa %xmm10,16(%rsp) 910 leaq 128(%rdi),%rdi 911 movdqa %xmm14,32(%rsp) 912 subq $128,%rdx 913 movdqa %xmm8,48(%rsp) 914 jmp .Loop_tail4x 915 916.align 
32 917.L192_or_more4x: 918 movdqu 0(%rsi),%xmm6 919 movdqu 16(%rsi),%xmm11 920 movdqu 32(%rsi),%xmm2 921 movdqu 48(%rsi),%xmm7 922 pxor 0(%rsp),%xmm6 923 pxor %xmm12,%xmm11 924 pxor %xmm4,%xmm2 925 pxor %xmm0,%xmm7 926 927 movdqu %xmm6,0(%rdi) 928 movdqu 64(%rsi),%xmm6 929 movdqu %xmm11,16(%rdi) 930 movdqu 80(%rsi),%xmm11 931 movdqu %xmm2,32(%rdi) 932 movdqu 96(%rsi),%xmm2 933 movdqu %xmm7,48(%rdi) 934 movdqu 112(%rsi),%xmm7 935 leaq 128(%rsi),%rsi 936 pxor 16(%rsp),%xmm6 937 pxor %xmm13,%xmm11 938 pxor %xmm5,%xmm2 939 pxor %xmm1,%xmm7 940 941 movdqu %xmm6,64(%rdi) 942 movdqu 0(%rsi),%xmm6 943 movdqu %xmm11,80(%rdi) 944 movdqu 16(%rsi),%xmm11 945 movdqu %xmm2,96(%rdi) 946 movdqu 32(%rsi),%xmm2 947 movdqu %xmm7,112(%rdi) 948 leaq 128(%rdi),%rdi 949 movdqu 48(%rsi),%xmm7 950 pxor 32(%rsp),%xmm6 951 pxor %xmm10,%xmm11 952 pxor %xmm14,%xmm2 953 pxor %xmm8,%xmm7 954 movdqu %xmm6,0(%rdi) 955 movdqu %xmm11,16(%rdi) 956 movdqu %xmm2,32(%rdi) 957 movdqu %xmm7,48(%rdi) 958 je .Ldone4x 959 960 movdqa 48(%rsp),%xmm6 961 leaq 64(%rsi),%rsi 962 xorq %r10,%r10 963 movdqa %xmm6,0(%rsp) 964 movdqa %xmm15,16(%rsp) 965 leaq 64(%rdi),%rdi 966 movdqa %xmm9,32(%rsp) 967 subq $192,%rdx 968 movdqa %xmm3,48(%rsp) 969 970.Loop_tail4x: 971 movzbl (%rsi,%r10,1),%eax 972 movzbl (%rsp,%r10,1),%ecx 973 leaq 1(%r10),%r10 974 xorl %ecx,%eax 975 movb %al,-1(%rdi,%r10,1) 976 decq %rdx 977 jnz .Loop_tail4x 978 979.Ldone4x: 980 leaq (%r9),%rsp 981.L4x_epilogue: 982 .byte 0xf3,0xc3 983.size ChaCha20_4x,.-ChaCha20_4x 984.type ChaCha20_8x,@function 985.align 32 986ChaCha20_8x: 987.LChaCha20_8x: 988 movq %rsp,%r9 989 subq $0x280+8,%rsp 990 andq $-32,%rsp 991 vzeroupper 992 993 994 995 996 997 998 999 1000 1001 1002 vbroadcasti128 .Lsigma(%rip),%ymm11 1003 vbroadcasti128 (%rcx),%ymm3 1004 vbroadcasti128 16(%rcx),%ymm15 1005 vbroadcasti128 (%r8),%ymm7 1006 leaq 256(%rsp),%rcx 1007 leaq 512(%rsp),%rax 1008 leaq .Lrot16(%rip),%r10 1009 leaq .Lrot24(%rip),%r11 1010 1011 vpshufd $0x00,%ymm11,%ymm8 1012 vpshufd 
$0x55,%ymm11,%ymm9 1013 vmovdqa %ymm8,128-256(%rcx) 1014 vpshufd $0xaa,%ymm11,%ymm10 1015 vmovdqa %ymm9,160-256(%rcx) 1016 vpshufd $0xff,%ymm11,%ymm11 1017 vmovdqa %ymm10,192-256(%rcx) 1018 vmovdqa %ymm11,224-256(%rcx) 1019 1020 vpshufd $0x00,%ymm3,%ymm0 1021 vpshufd $0x55,%ymm3,%ymm1 1022 vmovdqa %ymm0,256-256(%rcx) 1023 vpshufd $0xaa,%ymm3,%ymm2 1024 vmovdqa %ymm1,288-256(%rcx) 1025 vpshufd $0xff,%ymm3,%ymm3 1026 vmovdqa %ymm2,320-256(%rcx) 1027 vmovdqa %ymm3,352-256(%rcx) 1028 1029 vpshufd $0x00,%ymm15,%ymm12 1030 vpshufd $0x55,%ymm15,%ymm13 1031 vmovdqa %ymm12,384-512(%rax) 1032 vpshufd $0xaa,%ymm15,%ymm14 1033 vmovdqa %ymm13,416-512(%rax) 1034 vpshufd $0xff,%ymm15,%ymm15 1035 vmovdqa %ymm14,448-512(%rax) 1036 vmovdqa %ymm15,480-512(%rax) 1037 1038 vpshufd $0x00,%ymm7,%ymm4 1039 vpshufd $0x55,%ymm7,%ymm5 1040 vpaddd .Lincy(%rip),%ymm4,%ymm4 1041 vpshufd $0xaa,%ymm7,%ymm6 1042 vmovdqa %ymm5,544-512(%rax) 1043 vpshufd $0xff,%ymm7,%ymm7 1044 vmovdqa %ymm6,576-512(%rax) 1045 vmovdqa %ymm7,608-512(%rax) 1046 1047 jmp .Loop_enter8x 1048 1049.align 32 1050.Loop_outer8x: 1051 vmovdqa 128-256(%rcx),%ymm8 1052 vmovdqa 160-256(%rcx),%ymm9 1053 vmovdqa 192-256(%rcx),%ymm10 1054 vmovdqa 224-256(%rcx),%ymm11 1055 vmovdqa 256-256(%rcx),%ymm0 1056 vmovdqa 288-256(%rcx),%ymm1 1057 vmovdqa 320-256(%rcx),%ymm2 1058 vmovdqa 352-256(%rcx),%ymm3 1059 vmovdqa 384-512(%rax),%ymm12 1060 vmovdqa 416-512(%rax),%ymm13 1061 vmovdqa 448-512(%rax),%ymm14 1062 vmovdqa 480-512(%rax),%ymm15 1063 vmovdqa 512-512(%rax),%ymm4 1064 vmovdqa 544-512(%rax),%ymm5 1065 vmovdqa 576-512(%rax),%ymm6 1066 vmovdqa 608-512(%rax),%ymm7 1067 vpaddd .Leight(%rip),%ymm4,%ymm4 1068 1069.Loop_enter8x: 1070 vmovdqa %ymm14,64(%rsp) 1071 vmovdqa %ymm15,96(%rsp) 1072 vbroadcasti128 (%r10),%ymm15 1073 vmovdqa %ymm4,512-512(%rax) 1074 movl $10,%eax 1075 jmp .Loop8x 1076 1077.align 32 1078.Loop8x: 1079 vpaddd %ymm0,%ymm8,%ymm8 1080 vpxor %ymm4,%ymm8,%ymm4 1081 vpshufb %ymm15,%ymm4,%ymm4 1082 vpaddd %ymm1,%ymm9,%ymm9 1083 
vpxor %ymm5,%ymm9,%ymm5 1084 vpshufb %ymm15,%ymm5,%ymm5 1085 vpaddd %ymm4,%ymm12,%ymm12 1086 vpxor %ymm0,%ymm12,%ymm0 1087 vpslld $12,%ymm0,%ymm14 1088 vpsrld $20,%ymm0,%ymm0 1089 vpor %ymm0,%ymm14,%ymm0 1090 vbroadcasti128 (%r11),%ymm14 1091 vpaddd %ymm5,%ymm13,%ymm13 1092 vpxor %ymm1,%ymm13,%ymm1 1093 vpslld $12,%ymm1,%ymm15 1094 vpsrld $20,%ymm1,%ymm1 1095 vpor %ymm1,%ymm15,%ymm1 1096 vpaddd %ymm0,%ymm8,%ymm8 1097 vpxor %ymm4,%ymm8,%ymm4 1098 vpshufb %ymm14,%ymm4,%ymm4 1099 vpaddd %ymm1,%ymm9,%ymm9 1100 vpxor %ymm5,%ymm9,%ymm5 1101 vpshufb %ymm14,%ymm5,%ymm5 1102 vpaddd %ymm4,%ymm12,%ymm12 1103 vpxor %ymm0,%ymm12,%ymm0 1104 vpslld $7,%ymm0,%ymm15 1105 vpsrld $25,%ymm0,%ymm0 1106 vpor %ymm0,%ymm15,%ymm0 1107 vbroadcasti128 (%r10),%ymm15 1108 vpaddd %ymm5,%ymm13,%ymm13 1109 vpxor %ymm1,%ymm13,%ymm1 1110 vpslld $7,%ymm1,%ymm14 1111 vpsrld $25,%ymm1,%ymm1 1112 vpor %ymm1,%ymm14,%ymm1 1113 vmovdqa %ymm12,0(%rsp) 1114 vmovdqa %ymm13,32(%rsp) 1115 vmovdqa 64(%rsp),%ymm12 1116 vmovdqa 96(%rsp),%ymm13 1117 vpaddd %ymm2,%ymm10,%ymm10 1118 vpxor %ymm6,%ymm10,%ymm6 1119 vpshufb %ymm15,%ymm6,%ymm6 1120 vpaddd %ymm3,%ymm11,%ymm11 1121 vpxor %ymm7,%ymm11,%ymm7 1122 vpshufb %ymm15,%ymm7,%ymm7 1123 vpaddd %ymm6,%ymm12,%ymm12 1124 vpxor %ymm2,%ymm12,%ymm2 1125 vpslld $12,%ymm2,%ymm14 1126 vpsrld $20,%ymm2,%ymm2 1127 vpor %ymm2,%ymm14,%ymm2 1128 vbroadcasti128 (%r11),%ymm14 1129 vpaddd %ymm7,%ymm13,%ymm13 1130 vpxor %ymm3,%ymm13,%ymm3 1131 vpslld $12,%ymm3,%ymm15 1132 vpsrld $20,%ymm3,%ymm3 1133 vpor %ymm3,%ymm15,%ymm3 1134 vpaddd %ymm2,%ymm10,%ymm10 1135 vpxor %ymm6,%ymm10,%ymm6 1136 vpshufb %ymm14,%ymm6,%ymm6 1137 vpaddd %ymm3,%ymm11,%ymm11 1138 vpxor %ymm7,%ymm11,%ymm7 1139 vpshufb %ymm14,%ymm7,%ymm7 1140 vpaddd %ymm6,%ymm12,%ymm12 1141 vpxor %ymm2,%ymm12,%ymm2 1142 vpslld $7,%ymm2,%ymm15 1143 vpsrld $25,%ymm2,%ymm2 1144 vpor %ymm2,%ymm15,%ymm2 1145 vbroadcasti128 (%r10),%ymm15 1146 vpaddd %ymm7,%ymm13,%ymm13 1147 vpxor %ymm3,%ymm13,%ymm3 1148 vpslld $7,%ymm3,%ymm14 1149 vpsrld 
$25,%ymm3,%ymm3 1150 vpor %ymm3,%ymm14,%ymm3 1151 vpaddd %ymm1,%ymm8,%ymm8 1152 vpxor %ymm7,%ymm8,%ymm7 1153 vpshufb %ymm15,%ymm7,%ymm7 1154 vpaddd %ymm2,%ymm9,%ymm9 1155 vpxor %ymm4,%ymm9,%ymm4 1156 vpshufb %ymm15,%ymm4,%ymm4 1157 vpaddd %ymm7,%ymm12,%ymm12 1158 vpxor %ymm1,%ymm12,%ymm1 1159 vpslld $12,%ymm1,%ymm14 1160 vpsrld $20,%ymm1,%ymm1 1161 vpor %ymm1,%ymm14,%ymm1 1162 vbroadcasti128 (%r11),%ymm14 1163 vpaddd %ymm4,%ymm13,%ymm13 1164 vpxor %ymm2,%ymm13,%ymm2 1165 vpslld $12,%ymm2,%ymm15 1166 vpsrld $20,%ymm2,%ymm2 1167 vpor %ymm2,%ymm15,%ymm2 1168 vpaddd %ymm1,%ymm8,%ymm8 1169 vpxor %ymm7,%ymm8,%ymm7 1170 vpshufb %ymm14,%ymm7,%ymm7 1171 vpaddd %ymm2,%ymm9,%ymm9 1172 vpxor %ymm4,%ymm9,%ymm4 1173 vpshufb %ymm14,%ymm4,%ymm4 1174 vpaddd %ymm7,%ymm12,%ymm12 1175 vpxor %ymm1,%ymm12,%ymm1 1176 vpslld $7,%ymm1,%ymm15 1177 vpsrld $25,%ymm1,%ymm1 1178 vpor %ymm1,%ymm15,%ymm1 1179 vbroadcasti128 (%r10),%ymm15 1180 vpaddd %ymm4,%ymm13,%ymm13 1181 vpxor %ymm2,%ymm13,%ymm2 1182 vpslld $7,%ymm2,%ymm14 1183 vpsrld $25,%ymm2,%ymm2 1184 vpor %ymm2,%ymm14,%ymm2 1185 vmovdqa %ymm12,64(%rsp) 1186 vmovdqa %ymm13,96(%rsp) 1187 vmovdqa 0(%rsp),%ymm12 1188 vmovdqa 32(%rsp),%ymm13 1189 vpaddd %ymm3,%ymm10,%ymm10 1190 vpxor %ymm5,%ymm10,%ymm5 1191 vpshufb %ymm15,%ymm5,%ymm5 1192 vpaddd %ymm0,%ymm11,%ymm11 1193 vpxor %ymm6,%ymm11,%ymm6 1194 vpshufb %ymm15,%ymm6,%ymm6 1195 vpaddd %ymm5,%ymm12,%ymm12 1196 vpxor %ymm3,%ymm12,%ymm3 1197 vpslld $12,%ymm3,%ymm14 1198 vpsrld $20,%ymm3,%ymm3 1199 vpor %ymm3,%ymm14,%ymm3 1200 vbroadcasti128 (%r11),%ymm14 1201 vpaddd %ymm6,%ymm13,%ymm13 1202 vpxor %ymm0,%ymm13,%ymm0 1203 vpslld $12,%ymm0,%ymm15 1204 vpsrld $20,%ymm0,%ymm0 1205 vpor %ymm0,%ymm15,%ymm0 1206 vpaddd %ymm3,%ymm10,%ymm10 1207 vpxor %ymm5,%ymm10,%ymm5 1208 vpshufb %ymm14,%ymm5,%ymm5 1209 vpaddd %ymm0,%ymm11,%ymm11 1210 vpxor %ymm6,%ymm11,%ymm6 1211 vpshufb %ymm14,%ymm6,%ymm6 1212 vpaddd %ymm5,%ymm12,%ymm12 1213 vpxor %ymm3,%ymm12,%ymm3 1214 vpslld $7,%ymm3,%ymm15 1215 vpsrld 
$25,%ymm3,%ymm3 1216 vpor %ymm3,%ymm15,%ymm3 1217 vbroadcasti128 (%r10),%ymm15 1218 vpaddd %ymm6,%ymm13,%ymm13 1219 vpxor %ymm0,%ymm13,%ymm0 1220 vpslld $7,%ymm0,%ymm14 1221 vpsrld $25,%ymm0,%ymm0 1222 vpor %ymm0,%ymm14,%ymm0 1223 decl %eax 1224 jnz .Loop8x 1225 1226 leaq 512(%rsp),%rax 1227 vpaddd 128-256(%rcx),%ymm8,%ymm8 1228 vpaddd 160-256(%rcx),%ymm9,%ymm9 1229 vpaddd 192-256(%rcx),%ymm10,%ymm10 1230 vpaddd 224-256(%rcx),%ymm11,%ymm11 1231 1232 vpunpckldq %ymm9,%ymm8,%ymm14 1233 vpunpckldq %ymm11,%ymm10,%ymm15 1234 vpunpckhdq %ymm9,%ymm8,%ymm8 1235 vpunpckhdq %ymm11,%ymm10,%ymm10 1236 vpunpcklqdq %ymm15,%ymm14,%ymm9 1237 vpunpckhqdq %ymm15,%ymm14,%ymm14 1238 vpunpcklqdq %ymm10,%ymm8,%ymm11 1239 vpunpckhqdq %ymm10,%ymm8,%ymm8 1240 vpaddd 256-256(%rcx),%ymm0,%ymm0 1241 vpaddd 288-256(%rcx),%ymm1,%ymm1 1242 vpaddd 320-256(%rcx),%ymm2,%ymm2 1243 vpaddd 352-256(%rcx),%ymm3,%ymm3 1244 1245 vpunpckldq %ymm1,%ymm0,%ymm10 1246 vpunpckldq %ymm3,%ymm2,%ymm15 1247 vpunpckhdq %ymm1,%ymm0,%ymm0 1248 vpunpckhdq %ymm3,%ymm2,%ymm2 1249 vpunpcklqdq %ymm15,%ymm10,%ymm1 1250 vpunpckhqdq %ymm15,%ymm10,%ymm10 1251 vpunpcklqdq %ymm2,%ymm0,%ymm3 1252 vpunpckhqdq %ymm2,%ymm0,%ymm0 1253 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1254 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1255 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1256 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1257 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1258 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1259 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1260 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1261 vmovdqa %ymm15,0(%rsp) 1262 vmovdqa %ymm9,32(%rsp) 1263 vmovdqa 64(%rsp),%ymm15 1264 vmovdqa 96(%rsp),%ymm9 1265 1266 vpaddd 384-512(%rax),%ymm12,%ymm12 1267 vpaddd 416-512(%rax),%ymm13,%ymm13 1268 vpaddd 448-512(%rax),%ymm15,%ymm15 1269 vpaddd 480-512(%rax),%ymm9,%ymm9 1270 1271 vpunpckldq %ymm13,%ymm12,%ymm2 1272 vpunpckldq %ymm9,%ymm15,%ymm8 1273 vpunpckhdq %ymm13,%ymm12,%ymm12 1274 vpunpckhdq %ymm9,%ymm15,%ymm15 1275 vpunpcklqdq %ymm8,%ymm2,%ymm13 1276 vpunpckhqdq 
%ymm8,%ymm2,%ymm2 1277 vpunpcklqdq %ymm15,%ymm12,%ymm9 1278 vpunpckhqdq %ymm15,%ymm12,%ymm12 1279 vpaddd 512-512(%rax),%ymm4,%ymm4 1280 vpaddd 544-512(%rax),%ymm5,%ymm5 1281 vpaddd 576-512(%rax),%ymm6,%ymm6 1282 vpaddd 608-512(%rax),%ymm7,%ymm7 1283 1284 vpunpckldq %ymm5,%ymm4,%ymm15 1285 vpunpckldq %ymm7,%ymm6,%ymm8 1286 vpunpckhdq %ymm5,%ymm4,%ymm4 1287 vpunpckhdq %ymm7,%ymm6,%ymm6 1288 vpunpcklqdq %ymm8,%ymm15,%ymm5 1289 vpunpckhqdq %ymm8,%ymm15,%ymm15 1290 vpunpcklqdq %ymm6,%ymm4,%ymm7 1291 vpunpckhqdq %ymm6,%ymm4,%ymm4 1292 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1293 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1294 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1295 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1296 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1297 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1298 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1299 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1300 vmovdqa 0(%rsp),%ymm6 1301 vmovdqa 32(%rsp),%ymm12 1302 1303 cmpq $512,%rdx 1304 jb .Ltail8x 1305 1306 vpxor 0(%rsi),%ymm6,%ymm6 1307 vpxor 32(%rsi),%ymm8,%ymm8 1308 vpxor 64(%rsi),%ymm1,%ymm1 1309 vpxor 96(%rsi),%ymm5,%ymm5 1310 leaq 128(%rsi),%rsi 1311 vmovdqu %ymm6,0(%rdi) 1312 vmovdqu %ymm8,32(%rdi) 1313 vmovdqu %ymm1,64(%rdi) 1314 vmovdqu %ymm5,96(%rdi) 1315 leaq 128(%rdi),%rdi 1316 1317 vpxor 0(%rsi),%ymm12,%ymm12 1318 vpxor 32(%rsi),%ymm13,%ymm13 1319 vpxor 64(%rsi),%ymm10,%ymm10 1320 vpxor 96(%rsi),%ymm15,%ymm15 1321 leaq 128(%rsi),%rsi 1322 vmovdqu %ymm12,0(%rdi) 1323 vmovdqu %ymm13,32(%rdi) 1324 vmovdqu %ymm10,64(%rdi) 1325 vmovdqu %ymm15,96(%rdi) 1326 leaq 128(%rdi),%rdi 1327 1328 vpxor 0(%rsi),%ymm14,%ymm14 1329 vpxor 32(%rsi),%ymm2,%ymm2 1330 vpxor 64(%rsi),%ymm3,%ymm3 1331 vpxor 96(%rsi),%ymm7,%ymm7 1332 leaq 128(%rsi),%rsi 1333 vmovdqu %ymm14,0(%rdi) 1334 vmovdqu %ymm2,32(%rdi) 1335 vmovdqu %ymm3,64(%rdi) 1336 vmovdqu %ymm7,96(%rdi) 1337 leaq 128(%rdi),%rdi 1338 1339 vpxor 0(%rsi),%ymm11,%ymm11 1340 vpxor 32(%rsi),%ymm9,%ymm9 1341 vpxor 64(%rsi),%ymm0,%ymm0 1342 vpxor 96(%rsi),%ymm4,%ymm4 1343 leaq 
128(%rsi),%rsi 1344 vmovdqu %ymm11,0(%rdi) 1345 vmovdqu %ymm9,32(%rdi) 1346 vmovdqu %ymm0,64(%rdi) 1347 vmovdqu %ymm4,96(%rdi) 1348 leaq 128(%rdi),%rdi 1349 1350 subq $512,%rdx 1351 jnz .Loop_outer8x 1352 1353 jmp .Ldone8x 1354 1355.Ltail8x: 1356 cmpq $448,%rdx 1357 jae .L448_or_more8x 1358 cmpq $384,%rdx 1359 jae .L384_or_more8x 1360 cmpq $320,%rdx 1361 jae .L320_or_more8x 1362 cmpq $256,%rdx 1363 jae .L256_or_more8x 1364 cmpq $192,%rdx 1365 jae .L192_or_more8x 1366 cmpq $128,%rdx 1367 jae .L128_or_more8x 1368 cmpq $64,%rdx 1369 jae .L64_or_more8x 1370 1371 xorq %r10,%r10 1372 vmovdqa %ymm6,0(%rsp) 1373 vmovdqa %ymm8,32(%rsp) 1374 jmp .Loop_tail8x 1375 1376.align 32 1377.L64_or_more8x: 1378 vpxor 0(%rsi),%ymm6,%ymm6 1379 vpxor 32(%rsi),%ymm8,%ymm8 1380 vmovdqu %ymm6,0(%rdi) 1381 vmovdqu %ymm8,32(%rdi) 1382 je .Ldone8x 1383 1384 leaq 64(%rsi),%rsi 1385 xorq %r10,%r10 1386 vmovdqa %ymm1,0(%rsp) 1387 leaq 64(%rdi),%rdi 1388 subq $64,%rdx 1389 vmovdqa %ymm5,32(%rsp) 1390 jmp .Loop_tail8x 1391 1392.align 32 1393.L128_or_more8x: 1394 vpxor 0(%rsi),%ymm6,%ymm6 1395 vpxor 32(%rsi),%ymm8,%ymm8 1396 vpxor 64(%rsi),%ymm1,%ymm1 1397 vpxor 96(%rsi),%ymm5,%ymm5 1398 vmovdqu %ymm6,0(%rdi) 1399 vmovdqu %ymm8,32(%rdi) 1400 vmovdqu %ymm1,64(%rdi) 1401 vmovdqu %ymm5,96(%rdi) 1402 je .Ldone8x 1403 1404 leaq 128(%rsi),%rsi 1405 xorq %r10,%r10 1406 vmovdqa %ymm12,0(%rsp) 1407 leaq 128(%rdi),%rdi 1408 subq $128,%rdx 1409 vmovdqa %ymm13,32(%rsp) 1410 jmp .Loop_tail8x 1411 1412.align 32 1413.L192_or_more8x: 1414 vpxor 0(%rsi),%ymm6,%ymm6 1415 vpxor 32(%rsi),%ymm8,%ymm8 1416 vpxor 64(%rsi),%ymm1,%ymm1 1417 vpxor 96(%rsi),%ymm5,%ymm5 1418 vpxor 128(%rsi),%ymm12,%ymm12 1419 vpxor 160(%rsi),%ymm13,%ymm13 1420 vmovdqu %ymm6,0(%rdi) 1421 vmovdqu %ymm8,32(%rdi) 1422 vmovdqu %ymm1,64(%rdi) 1423 vmovdqu %ymm5,96(%rdi) 1424 vmovdqu %ymm12,128(%rdi) 1425 vmovdqu %ymm13,160(%rdi) 1426 je .Ldone8x 1427 1428 leaq 192(%rsi),%rsi 1429 xorq %r10,%r10 1430 vmovdqa %ymm10,0(%rsp) 1431 leaq 192(%rdi),%rdi 
1432 subq $192,%rdx 1433 vmovdqa %ymm15,32(%rsp) 1434 jmp .Loop_tail8x 1435 1436.align 32 1437.L256_or_more8x: 1438 vpxor 0(%rsi),%ymm6,%ymm6 1439 vpxor 32(%rsi),%ymm8,%ymm8 1440 vpxor 64(%rsi),%ymm1,%ymm1 1441 vpxor 96(%rsi),%ymm5,%ymm5 1442 vpxor 128(%rsi),%ymm12,%ymm12 1443 vpxor 160(%rsi),%ymm13,%ymm13 1444 vpxor 192(%rsi),%ymm10,%ymm10 1445 vpxor 224(%rsi),%ymm15,%ymm15 1446 vmovdqu %ymm6,0(%rdi) 1447 vmovdqu %ymm8,32(%rdi) 1448 vmovdqu %ymm1,64(%rdi) 1449 vmovdqu %ymm5,96(%rdi) 1450 vmovdqu %ymm12,128(%rdi) 1451 vmovdqu %ymm13,160(%rdi) 1452 vmovdqu %ymm10,192(%rdi) 1453 vmovdqu %ymm15,224(%rdi) 1454 je .Ldone8x 1455 1456 leaq 256(%rsi),%rsi 1457 xorq %r10,%r10 1458 vmovdqa %ymm14,0(%rsp) 1459 leaq 256(%rdi),%rdi 1460 subq $256,%rdx 1461 vmovdqa %ymm2,32(%rsp) 1462 jmp .Loop_tail8x 1463 1464.align 32 1465.L320_or_more8x: 1466 vpxor 0(%rsi),%ymm6,%ymm6 1467 vpxor 32(%rsi),%ymm8,%ymm8 1468 vpxor 64(%rsi),%ymm1,%ymm1 1469 vpxor 96(%rsi),%ymm5,%ymm5 1470 vpxor 128(%rsi),%ymm12,%ymm12 1471 vpxor 160(%rsi),%ymm13,%ymm13 1472 vpxor 192(%rsi),%ymm10,%ymm10 1473 vpxor 224(%rsi),%ymm15,%ymm15 1474 vpxor 256(%rsi),%ymm14,%ymm14 1475 vpxor 288(%rsi),%ymm2,%ymm2 1476 vmovdqu %ymm6,0(%rdi) 1477 vmovdqu %ymm8,32(%rdi) 1478 vmovdqu %ymm1,64(%rdi) 1479 vmovdqu %ymm5,96(%rdi) 1480 vmovdqu %ymm12,128(%rdi) 1481 vmovdqu %ymm13,160(%rdi) 1482 vmovdqu %ymm10,192(%rdi) 1483 vmovdqu %ymm15,224(%rdi) 1484 vmovdqu %ymm14,256(%rdi) 1485 vmovdqu %ymm2,288(%rdi) 1486 je .Ldone8x 1487 1488 leaq 320(%rsi),%rsi 1489 xorq %r10,%r10 1490 vmovdqa %ymm3,0(%rsp) 1491 leaq 320(%rdi),%rdi 1492 subq $320,%rdx 1493 vmovdqa %ymm7,32(%rsp) 1494 jmp .Loop_tail8x 1495 1496.align 32 1497.L384_or_more8x: 1498 vpxor 0(%rsi),%ymm6,%ymm6 1499 vpxor 32(%rsi),%ymm8,%ymm8 1500 vpxor 64(%rsi),%ymm1,%ymm1 1501 vpxor 96(%rsi),%ymm5,%ymm5 1502 vpxor 128(%rsi),%ymm12,%ymm12 1503 vpxor 160(%rsi),%ymm13,%ymm13 1504 vpxor 192(%rsi),%ymm10,%ymm10 1505 vpxor 224(%rsi),%ymm15,%ymm15 1506 vpxor 256(%rsi),%ymm14,%ymm14 1507 
vpxor 288(%rsi),%ymm2,%ymm2 1508 vpxor 320(%rsi),%ymm3,%ymm3 1509 vpxor 352(%rsi),%ymm7,%ymm7 1510 vmovdqu %ymm6,0(%rdi) 1511 vmovdqu %ymm8,32(%rdi) 1512 vmovdqu %ymm1,64(%rdi) 1513 vmovdqu %ymm5,96(%rdi) 1514 vmovdqu %ymm12,128(%rdi) 1515 vmovdqu %ymm13,160(%rdi) 1516 vmovdqu %ymm10,192(%rdi) 1517 vmovdqu %ymm15,224(%rdi) 1518 vmovdqu %ymm14,256(%rdi) 1519 vmovdqu %ymm2,288(%rdi) 1520 vmovdqu %ymm3,320(%rdi) 1521 vmovdqu %ymm7,352(%rdi) 1522 je .Ldone8x 1523 1524 leaq 384(%rsi),%rsi 1525 xorq %r10,%r10 1526 vmovdqa %ymm11,0(%rsp) 1527 leaq 384(%rdi),%rdi 1528 subq $384,%rdx 1529 vmovdqa %ymm9,32(%rsp) 1530 jmp .Loop_tail8x 1531 1532.align 32 1533.L448_or_more8x: 1534 vpxor 0(%rsi),%ymm6,%ymm6 1535 vpxor 32(%rsi),%ymm8,%ymm8 1536 vpxor 64(%rsi),%ymm1,%ymm1 1537 vpxor 96(%rsi),%ymm5,%ymm5 1538 vpxor 128(%rsi),%ymm12,%ymm12 1539 vpxor 160(%rsi),%ymm13,%ymm13 1540 vpxor 192(%rsi),%ymm10,%ymm10 1541 vpxor 224(%rsi),%ymm15,%ymm15 1542 vpxor 256(%rsi),%ymm14,%ymm14 1543 vpxor 288(%rsi),%ymm2,%ymm2 1544 vpxor 320(%rsi),%ymm3,%ymm3 1545 vpxor 352(%rsi),%ymm7,%ymm7 1546 vpxor 384(%rsi),%ymm11,%ymm11 1547 vpxor 416(%rsi),%ymm9,%ymm9 1548 vmovdqu %ymm6,0(%rdi) 1549 vmovdqu %ymm8,32(%rdi) 1550 vmovdqu %ymm1,64(%rdi) 1551 vmovdqu %ymm5,96(%rdi) 1552 vmovdqu %ymm12,128(%rdi) 1553 vmovdqu %ymm13,160(%rdi) 1554 vmovdqu %ymm10,192(%rdi) 1555 vmovdqu %ymm15,224(%rdi) 1556 vmovdqu %ymm14,256(%rdi) 1557 vmovdqu %ymm2,288(%rdi) 1558 vmovdqu %ymm3,320(%rdi) 1559 vmovdqu %ymm7,352(%rdi) 1560 vmovdqu %ymm11,384(%rdi) 1561 vmovdqu %ymm9,416(%rdi) 1562 je .Ldone8x 1563 1564 leaq 448(%rsi),%rsi 1565 xorq %r10,%r10 1566 vmovdqa %ymm0,0(%rsp) 1567 leaq 448(%rdi),%rdi 1568 subq $448,%rdx 1569 vmovdqa %ymm4,32(%rsp) 1570 1571.Loop_tail8x: 1572 movzbl (%rsi,%r10,1),%eax 1573 movzbl (%rsp,%r10,1),%ecx 1574 leaq 1(%r10),%r10 1575 xorl %ecx,%eax 1576 movb %al,-1(%rdi,%r10,1) 1577 decq %rdx 1578 jnz .Loop_tail8x 1579 1580.Ldone8x: 1581 vzeroall 1582 leaq (%r9),%rsp 1583.L8x_epilogue: 1584 .byte 
0xf3,0xc3 1585.size ChaCha20_8x,.-ChaCha20_8x 1586#endif 1587