/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMCPY
# define MEMCPY		memcpy
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#if (defined SHARED || defined __PIC__)
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B

# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
	/* We first load PC into EBX.  */			\
	SETUP_PIC_REG(bx);					\
	/* Get the address of the jump table.  */		\
	addl	$(TABLE - .), %ebx;				\
	/* Get the entry and convert the relative offset to the \
	   absolute address.  */				\
	addl	(%ebx, INDEX, SCALE), %ebx;			\
	/* We loaded the jump table.  Go.  */			\
	jmp	*%ebx
#else

# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	jmp	*TABLE(, INDEX, SCALE)
#endif

	.section .text.ssse3,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	cmp	%eax, %edx
	jb	L(copy_forward)
	je	L(fwd_write_0bytes)
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)

	.p2align 4
L(memmove_bwd):
	add	%ecx, %eax
	cmp	%eax, %edx
	movl	SRC(%esp), %eax
	jb	L(copy_backward)

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)

L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
	.p2align 4
L(bk_write):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif

	.p2align 4
L(48bytesormore):
#ifndef USE_AS_MEMMOVE
	movlpd	(%eax), %xmm0
	movlpd	8(%eax), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
#else
	movdqu	(%eax), %xmm0
#endif
	PUSH (%edi)
	movl	%edx, %edi
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %edi
	add	%edi, %ecx
	sub	%edi, %eax

#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	mov	%eax, %edi
	jae	L(large_page)
	and	$0xf, %edi
	jz	L(shl_0)
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)

	.p2align 4
L(shl_0):
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	xor	%edi, %edi
	cmp	$127, %ecx
	ja	L(shl_0_gobble)
	lea	-32(%ecx), %ecx

	.p2align 4
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi

L(shl_0_end):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	add	%edi, %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	POP (%edi)
	lea	-128(%ecx), %ecx
	jae	L(shl_0_gobble_mem_loop)

	.p2align 4
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_cache_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_cache_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_cache_less_16bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	.p2align 4
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x280(%eax)
	prefetcht0 0x1c0(%edx)

	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_mem_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_mem_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_mem_less_16bytes):
	add	%ecx, %edx
	add	%ecx, %eax
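	/* Fewer than 16 bytes remain (%ecx < 0x10); finish the copy
	   through the aligned forward jump table below.  */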
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 418 419 .p2align 4 420L(shl_1): 421#ifndef USE_AS_MEMMOVE 422 movaps -1(%eax), %xmm1 423#else 424 movl DEST+4(%esp), %edi 425 movaps -1(%eax), %xmm1 426 movdqu %xmm0, (%edi) 427#endif 428#ifdef DATA_CACHE_SIZE_HALF 429 cmp $DATA_CACHE_SIZE_HALF, %ecx 430#else 431# if (defined SHARED || defined __PIC__) 432 SETUP_PIC_REG(bx) 433 add $_GLOBAL_OFFSET_TABLE_, %ebx 434 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 435# else 436 cmp __x86_data_cache_size_half, %ecx 437# endif 438#endif 439 jb L(sh_1_no_prefetch) 440 441 lea -64(%ecx), %ecx 442 443 .p2align 4 444L(Shl1LoopStart): 445 prefetcht0 0x1c0(%eax) 446 prefetcht0 0x1c0(%edx) 447 movaps 15(%eax), %xmm2 448 movaps 31(%eax), %xmm3 449 movaps 47(%eax), %xmm4 450 movaps 63(%eax), %xmm5 451 movaps %xmm5, %xmm7 452 palignr $1, %xmm4, %xmm5 453 palignr $1, %xmm3, %xmm4 454 movaps %xmm5, 48(%edx) 455 palignr $1, %xmm2, %xmm3 456 lea 64(%eax), %eax 457 palignr $1, %xmm1, %xmm2 458 movaps %xmm4, 32(%edx) 459 movaps %xmm3, 16(%edx) 460 movaps %xmm7, %xmm1 461 movaps %xmm2, (%edx) 462 lea 64(%edx), %edx 463 sub $64, %ecx 464 ja L(Shl1LoopStart) 465 466L(Shl1LoopLeave): 467 add $32, %ecx 468 jle L(shl_end_0) 469 470 movaps 15(%eax), %xmm2 471 movaps 31(%eax), %xmm3 472 palignr $1, %xmm2, %xmm3 473 palignr $1, %xmm1, %xmm2 474 movaps %xmm2, (%edx) 475 movaps %xmm3, 16(%edx) 476 lea 32(%edx, %ecx), %edx 477 lea 32(%eax, %ecx), %eax 478 POP (%edi) 479 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 480 481 CFI_PUSH (%edi) 482 483 .p2align 4 484L(sh_1_no_prefetch): 485 lea -32(%ecx), %ecx 486 lea -1(%eax), %eax 487 xor %edi, %edi 488 489 .p2align 4 490L(sh_1_no_prefetch_loop): 491 movdqa 16(%eax, %edi), %xmm2 492 sub $32, %ecx 493 movdqa 32(%eax, %edi), %xmm3 494 movdqa %xmm3, %xmm4 495 palignr $1, %xmm2, %xmm3 496 palignr $1, %xmm1, %xmm2 497 lea 32(%edi), %edi 498 movdqa %xmm2, -32(%edx, %edi) 499 movdqa %xmm3, -16(%edx, %edi) 500 jb L(sh_1_end_no_prefetch_loop) 501 502 movdqa 16(%eax, %edi), %xmm2 503 sub $32, %ecx 504 movdqa 32(%eax, %edi), %xmm3 505 movdqa %xmm3, %xmm1 506 palignr $1, %xmm2, %xmm3 507 palignr $1, %xmm4, %xmm2 508 lea 32(%edi), %edi 509 movdqa %xmm2, -32(%edx, %edi) 510 movdqa %xmm3, -16(%edx, %edi) 511 jae L(sh_1_no_prefetch_loop) 512 513L(sh_1_end_no_prefetch_loop): 514 lea 32(%ecx), %ecx 515 add %ecx, %edi 516 add %edi, %edx 517 lea 1(%edi, %eax), %eax 518 POP (%edi) 519 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 520 521 CFI_PUSH (%edi) 522 523 .p2align 4 524L(shl_2): 525#ifndef USE_AS_MEMMOVE 526 movaps -2(%eax), %xmm1 527#else 528 movl DEST+4(%esp), %edi 529 movaps -2(%eax), %xmm1 530 movdqu %xmm0, (%edi) 531#endif 532#ifdef DATA_CACHE_SIZE_HALF 533 cmp $DATA_CACHE_SIZE_HALF, %ecx 534#else 535# if (defined SHARED || defined __PIC__) 536 SETUP_PIC_REG(bx) 537 add $_GLOBAL_OFFSET_TABLE_, %ebx 538 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 539# else 540 cmp __x86_data_cache_size_half, %ecx 541# endif 542#endif 543 jb L(sh_2_no_prefetch) 544 545 lea -64(%ecx), %ecx 546 547 .p2align 4 548L(Shl2LoopStart): 549 prefetcht0 0x1c0(%eax) 550 prefetcht0 0x1c0(%edx) 551 movaps 14(%eax), %xmm2 552 movaps 30(%eax), %xmm3 553 movaps 46(%eax), %xmm4 554 movaps 62(%eax), %xmm5 555 movaps %xmm5, %xmm7 556 palignr $2, %xmm4, %xmm5 557 palignr $2, %xmm3, %xmm4 558 movaps %xmm5, 48(%edx) 559 palignr $2, %xmm2, %xmm3 560 lea 64(%eax), %eax 561 palignr $2, %xmm1, %xmm2 562 movaps %xmm4, 32(%edx) 563 movaps %xmm3, 16(%edx) 564 movaps %xmm7, %xmm1 565 movaps %xmm2, 
(%edx) 566 lea 64(%edx), %edx 567 sub $64, %ecx 568 ja L(Shl2LoopStart) 569 570L(Shl2LoopLeave): 571 add $32, %ecx 572 jle L(shl_end_0) 573 574 movaps 14(%eax), %xmm2 575 movaps 30(%eax), %xmm3 576 palignr $2, %xmm2, %xmm3 577 palignr $2, %xmm1, %xmm2 578 movaps %xmm2, (%edx) 579 movaps %xmm3, 16(%edx) 580 lea 32(%edx, %ecx), %edx 581 lea 32(%eax, %ecx), %eax 582 POP (%edi) 583 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 584 585 CFI_PUSH (%edi) 586 587 .p2align 4 588L(sh_2_no_prefetch): 589 lea -32(%ecx), %ecx 590 lea -2(%eax), %eax 591 xor %edi, %edi 592 593 .p2align 4 594L(sh_2_no_prefetch_loop): 595 movdqa 16(%eax, %edi), %xmm2 596 sub $32, %ecx 597 movdqa 32(%eax, %edi), %xmm3 598 movdqa %xmm3, %xmm4 599 palignr $2, %xmm2, %xmm3 600 palignr $2, %xmm1, %xmm2 601 lea 32(%edi), %edi 602 movdqa %xmm2, -32(%edx, %edi) 603 movdqa %xmm3, -16(%edx, %edi) 604 jb L(sh_2_end_no_prefetch_loop) 605 606 movdqa 16(%eax, %edi), %xmm2 607 sub $32, %ecx 608 movdqa 32(%eax, %edi), %xmm3 609 movdqa %xmm3, %xmm1 610 palignr $2, %xmm2, %xmm3 611 palignr $2, %xmm4, %xmm2 612 lea 32(%edi), %edi 613 movdqa %xmm2, -32(%edx, %edi) 614 movdqa %xmm3, -16(%edx, %edi) 615 jae L(sh_2_no_prefetch_loop) 616 617L(sh_2_end_no_prefetch_loop): 618 lea 32(%ecx), %ecx 619 add %ecx, %edi 620 add %edi, %edx 621 lea 2(%edi, %eax), %eax 622 POP (%edi) 623 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 624 625 CFI_PUSH (%edi) 626 627 .p2align 4 628L(shl_3): 629#ifndef USE_AS_MEMMOVE 630 movaps -3(%eax), %xmm1 631#else 632 movl DEST+4(%esp), %edi 633 movaps -3(%eax), %xmm1 634 movdqu %xmm0, (%edi) 635#endif 636#ifdef DATA_CACHE_SIZE_HALF 637 cmp $DATA_CACHE_SIZE_HALF, %ecx 638#else 639# if (defined SHARED || defined __PIC__) 640 SETUP_PIC_REG(bx) 641 add $_GLOBAL_OFFSET_TABLE_, %ebx 642 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 643# else 644 cmp __x86_data_cache_size_half, %ecx 645# endif 646#endif 647 jb L(sh_3_no_prefetch) 648 649 lea -64(%ecx), %ecx 650 651 .p2align 4 652L(Shl3LoopStart): 653 prefetcht0 0x1c0(%eax) 654 prefetcht0 0x1c0(%edx) 655 movaps 13(%eax), %xmm2 656 movaps 29(%eax), %xmm3 657 movaps 45(%eax), %xmm4 658 movaps 61(%eax), %xmm5 659 movaps %xmm5, %xmm7 660 palignr $3, %xmm4, %xmm5 661 palignr $3, %xmm3, %xmm4 662 movaps %xmm5, 48(%edx) 663 palignr $3, %xmm2, %xmm3 664 lea 64(%eax), %eax 665 palignr $3, %xmm1, %xmm2 666 movaps %xmm4, 32(%edx) 667 movaps %xmm3, 16(%edx) 668 movaps %xmm7, %xmm1 669 movaps %xmm2, (%edx) 670 lea 64(%edx), %edx 671 sub $64, %ecx 672 ja L(Shl3LoopStart) 673 674L(Shl3LoopLeave): 675 add $32, %ecx 676 jle L(shl_end_0) 677 678 movaps 13(%eax), %xmm2 679 movaps 29(%eax), %xmm3 680 palignr $3, %xmm2, %xmm3 681 palignr $3, %xmm1, %xmm2 682 movaps %xmm2, (%edx) 683 movaps %xmm3, 16(%edx) 684 lea 32(%edx, %ecx), %edx 685 lea 32(%eax, %ecx), %eax 686 POP (%edi) 687 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 688 689 CFI_PUSH (%edi) 690 691 .p2align 4 692L(sh_3_no_prefetch): 693 lea -32(%ecx), %ecx 694 lea -3(%eax), %eax 695 xor %edi, %edi 696 697 .p2align 4 698L(sh_3_no_prefetch_loop): 699 movdqa 16(%eax, %edi), %xmm2 700 sub $32, %ecx 701 movdqa 32(%eax, %edi), %xmm3 702 movdqa %xmm3, %xmm4 703 palignr $3, %xmm2, %xmm3 704 palignr $3, %xmm1, %xmm2 705 lea 32(%edi), %edi 706 movdqa %xmm2, -32(%edx, %edi) 707 movdqa %xmm3, -16(%edx, %edi) 708 709 jb L(sh_3_end_no_prefetch_loop) 710 711 movdqa 16(%eax, %edi), %xmm2 712 sub $32, %ecx 713 movdqa 32(%eax, %edi), %xmm3 714 movdqa %xmm3, %xmm1 715 palignr $3, %xmm2, %xmm3 716 palignr $3, %xmm4, %xmm2 717 lea 
32(%edi), %edi 718 movdqa %xmm2, -32(%edx, %edi) 719 movdqa %xmm3, -16(%edx, %edi) 720 721 jae L(sh_3_no_prefetch_loop) 722 723L(sh_3_end_no_prefetch_loop): 724 lea 32(%ecx), %ecx 725 add %ecx, %edi 726 add %edi, %edx 727 lea 3(%edi, %eax), %eax 728 POP (%edi) 729 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 730 731 CFI_PUSH (%edi) 732 733 .p2align 4 734L(shl_4): 735#ifndef USE_AS_MEMMOVE 736 movaps -4(%eax), %xmm1 737#else 738 movl DEST+4(%esp), %edi 739 movaps -4(%eax), %xmm1 740 movdqu %xmm0, (%edi) 741#endif 742#ifdef DATA_CACHE_SIZE_HALF 743 cmp $DATA_CACHE_SIZE_HALF, %ecx 744#else 745# if (defined SHARED || defined __PIC__) 746 SETUP_PIC_REG(bx) 747 add $_GLOBAL_OFFSET_TABLE_, %ebx 748 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 749# else 750 cmp __x86_data_cache_size_half, %ecx 751# endif 752#endif 753 jb L(sh_4_no_prefetch) 754 755 lea -64(%ecx), %ecx 756 757 .p2align 4 758L(Shl4LoopStart): 759 prefetcht0 0x1c0(%eax) 760 prefetcht0 0x1c0(%edx) 761 movaps 12(%eax), %xmm2 762 movaps 28(%eax), %xmm3 763 movaps 44(%eax), %xmm4 764 movaps 60(%eax), %xmm5 765 movaps %xmm5, %xmm7 766 palignr $4, %xmm4, %xmm5 767 palignr $4, %xmm3, %xmm4 768 movaps %xmm5, 48(%edx) 769 palignr $4, %xmm2, %xmm3 770 lea 64(%eax), %eax 771 palignr $4, %xmm1, %xmm2 772 movaps %xmm4, 32(%edx) 773 movaps %xmm3, 16(%edx) 774 movaps %xmm7, %xmm1 775 movaps %xmm2, (%edx) 776 lea 64(%edx), %edx 777 sub $64, %ecx 778 ja L(Shl4LoopStart) 779 780L(Shl4LoopLeave): 781 add $32, %ecx 782 jle L(shl_end_0) 783 784 movaps 12(%eax), %xmm2 785 movaps 28(%eax), %xmm3 786 palignr $4, %xmm2, %xmm3 787 palignr $4, %xmm1, %xmm2 788 movaps %xmm2, (%edx) 789 movaps %xmm3, 16(%edx) 790 lea 32(%edx, %ecx), %edx 791 lea 32(%eax, %ecx), %eax 792 POP (%edi) 793 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 794 795 CFI_PUSH (%edi) 796 797 .p2align 4 798L(sh_4_no_prefetch): 799 lea -32(%ecx), %ecx 800 lea -4(%eax), %eax 801 xor %edi, %edi 802 803 .p2align 4 804L(sh_4_no_prefetch_loop): 805 movdqa 16(%eax, %edi), %xmm2 806 sub $32, %ecx 807 movdqa 32(%eax, %edi), %xmm3 808 movdqa %xmm3, %xmm4 809 palignr $4, %xmm2, %xmm3 810 palignr $4, %xmm1, %xmm2 811 lea 32(%edi), %edi 812 movdqa %xmm2, -32(%edx, %edi) 813 movdqa %xmm3, -16(%edx, %edi) 814 815 jb L(sh_4_end_no_prefetch_loop) 816 817 movdqa 16(%eax, %edi), %xmm2 818 sub $32, %ecx 819 movdqa 32(%eax, %edi), %xmm3 820 movdqa %xmm3, %xmm1 821 palignr $4, %xmm2, %xmm3 822 palignr $4, %xmm4, %xmm2 823 lea 32(%edi), %edi 824 movdqa %xmm2, -32(%edx, %edi) 825 movdqa %xmm3, -16(%edx, %edi) 826 827 jae L(sh_4_no_prefetch_loop) 828 829L(sh_4_end_no_prefetch_loop): 830 lea 32(%ecx), %ecx 831 add %ecx, %edi 832 add %edi, %edx 833 lea 4(%edi, %eax), %eax 834 POP (%edi) 835 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 836 837 CFI_PUSH (%edi) 838 839 .p2align 4 840L(shl_5): 841#ifndef USE_AS_MEMMOVE 842 movaps -5(%eax), %xmm1 843#else 844 movl DEST+4(%esp), %edi 845 movaps -5(%eax), %xmm1 846 movdqu %xmm0, (%edi) 847#endif 848#ifdef DATA_CACHE_SIZE_HALF 849 cmp $DATA_CACHE_SIZE_HALF, %ecx 850#else 851# if (defined SHARED || defined __PIC__) 852 SETUP_PIC_REG(bx) 853 add $_GLOBAL_OFFSET_TABLE_, %ebx 854 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 855# else 856 cmp __x86_data_cache_size_half, %ecx 857# endif 858#endif 859 jb L(sh_5_no_prefetch) 860 861 lea -64(%ecx), %ecx 862 863 .p2align 4 864L(Shl5LoopStart): 865 prefetcht0 0x1c0(%eax) 866 prefetcht0 0x1c0(%edx) 867 movaps 11(%eax), %xmm2 868 movaps 27(%eax), %xmm3 869 movaps 43(%eax), %xmm4 870 movaps 59(%eax), %xmm5 
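	/* %xmm5 now holds the highest 16 source bytes of this 64-byte
	   chunk; the copy saved in %xmm7 below becomes the carry-in
	   (%xmm1) for the next iteration.  */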
871 movaps %xmm5, %xmm7 872 palignr $5, %xmm4, %xmm5 873 palignr $5, %xmm3, %xmm4 874 movaps %xmm5, 48(%edx) 875 palignr $5, %xmm2, %xmm3 876 lea 64(%eax), %eax 877 palignr $5, %xmm1, %xmm2 878 movaps %xmm4, 32(%edx) 879 movaps %xmm3, 16(%edx) 880 movaps %xmm7, %xmm1 881 movaps %xmm2, (%edx) 882 lea 64(%edx), %edx 883 sub $64, %ecx 884 ja L(Shl5LoopStart) 885 886L(Shl5LoopLeave): 887 add $32, %ecx 888 jle L(shl_end_0) 889 890 movaps 11(%eax), %xmm2 891 movaps 27(%eax), %xmm3 892 palignr $5, %xmm2, %xmm3 893 palignr $5, %xmm1, %xmm2 894 movaps %xmm2, (%edx) 895 movaps %xmm3, 16(%edx) 896 lea 32(%edx, %ecx), %edx 897 lea 32(%eax, %ecx), %eax 898 POP (%edi) 899 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 900 901 CFI_PUSH (%edi) 902 903 .p2align 4 904L(sh_5_no_prefetch): 905 lea -32(%ecx), %ecx 906 lea -5(%eax), %eax 907 xor %edi, %edi 908 909 .p2align 4 910L(sh_5_no_prefetch_loop): 911 movdqa 16(%eax, %edi), %xmm2 912 sub $32, %ecx 913 movdqa 32(%eax, %edi), %xmm3 914 movdqa %xmm3, %xmm4 915 palignr $5, %xmm2, %xmm3 916 palignr $5, %xmm1, %xmm2 917 lea 32(%edi), %edi 918 movdqa %xmm2, -32(%edx, %edi) 919 movdqa %xmm3, -16(%edx, %edi) 920 921 jb L(sh_5_end_no_prefetch_loop) 922 923 movdqa 16(%eax, %edi), %xmm2 924 sub $32, %ecx 925 movdqa 32(%eax, %edi), %xmm3 926 movdqa %xmm3, %xmm1 927 palignr $5, %xmm2, %xmm3 928 palignr $5, %xmm4, %xmm2 929 lea 32(%edi), %edi 930 movdqa %xmm2, -32(%edx, %edi) 931 movdqa %xmm3, -16(%edx, %edi) 932 933 jae L(sh_5_no_prefetch_loop) 934 935L(sh_5_end_no_prefetch_loop): 936 lea 32(%ecx), %ecx 937 add %ecx, %edi 938 add %edi, %edx 939 lea 5(%edi, %eax), %eax 940 POP (%edi) 941 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 942 943 CFI_PUSH (%edi) 944 945 .p2align 4 946L(shl_6): 947#ifndef USE_AS_MEMMOVE 948 movaps -6(%eax), %xmm1 949#else 950 movl DEST+4(%esp), %edi 951 movaps -6(%eax), %xmm1 952 movdqu %xmm0, (%edi) 953#endif 954#ifdef DATA_CACHE_SIZE_HALF 955 cmp $DATA_CACHE_SIZE_HALF, %ecx 956#else 957# if (defined SHARED || defined __PIC__) 958 SETUP_PIC_REG(bx) 959 add $_GLOBAL_OFFSET_TABLE_, %ebx 960 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 961# else 962 cmp __x86_data_cache_size_half, %ecx 963# endif 964#endif 965 jb L(sh_6_no_prefetch) 966 967 lea -64(%ecx), %ecx 968 969 .p2align 4 970L(Shl6LoopStart): 971 prefetcht0 0x1c0(%eax) 972 prefetcht0 0x1c0(%edx) 973 movaps 10(%eax), %xmm2 974 movaps 26(%eax), %xmm3 975 movaps 42(%eax), %xmm4 976 movaps 58(%eax), %xmm5 977 movaps %xmm5, %xmm7 978 palignr $6, %xmm4, %xmm5 979 palignr $6, %xmm3, %xmm4 980 movaps %xmm5, 48(%edx) 981 palignr $6, %xmm2, %xmm3 982 lea 64(%eax), %eax 983 palignr $6, %xmm1, %xmm2 984 movaps %xmm4, 32(%edx) 985 movaps %xmm3, 16(%edx) 986 movaps %xmm7, %xmm1 987 movaps %xmm2, (%edx) 988 lea 64(%edx), %edx 989 sub $64, %ecx 990 ja L(Shl6LoopStart) 991 992L(Shl6LoopLeave): 993 add $32, %ecx 994 jle L(shl_end_0) 995 996 movaps 10(%eax), %xmm2 997 movaps 26(%eax), %xmm3 998 palignr $6, %xmm2, %xmm3 999 palignr $6, %xmm1, %xmm2 1000 movaps %xmm2, (%edx) 1001 movaps %xmm3, 16(%edx) 1002 lea 32(%edx, %ecx), %edx 1003 lea 32(%eax, %ecx), %eax 1004 POP (%edi) 1005 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1006 1007 CFI_PUSH (%edi) 1008 1009 .p2align 4 1010L(sh_6_no_prefetch): 1011 lea -32(%ecx), %ecx 1012 lea -6(%eax), %eax 1013 xor %edi, %edi 1014 1015 .p2align 4 1016L(sh_6_no_prefetch_loop): 1017 movdqa 16(%eax, %edi), %xmm2 1018 sub $32, %ecx 1019 movdqa 32(%eax, %edi), %xmm3 1020 movdqa %xmm3, %xmm4 1021 palignr $6, %xmm2, %xmm3 1022 palignr $6, %xmm1, 
%xmm2 1023 lea 32(%edi), %edi 1024 movdqa %xmm2, -32(%edx, %edi) 1025 movdqa %xmm3, -16(%edx, %edi) 1026 1027 jb L(sh_6_end_no_prefetch_loop) 1028 1029 movdqa 16(%eax, %edi), %xmm2 1030 sub $32, %ecx 1031 movdqa 32(%eax, %edi), %xmm3 1032 movdqa %xmm3, %xmm1 1033 palignr $6, %xmm2, %xmm3 1034 palignr $6, %xmm4, %xmm2 1035 lea 32(%edi), %edi 1036 movdqa %xmm2, -32(%edx, %edi) 1037 movdqa %xmm3, -16(%edx, %edi) 1038 1039 jae L(sh_6_no_prefetch_loop) 1040 1041L(sh_6_end_no_prefetch_loop): 1042 lea 32(%ecx), %ecx 1043 add %ecx, %edi 1044 add %edi, %edx 1045 lea 6(%edi, %eax), %eax 1046 POP (%edi) 1047 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1048 1049 CFI_PUSH (%edi) 1050 1051 .p2align 4 1052L(shl_7): 1053#ifndef USE_AS_MEMMOVE 1054 movaps -7(%eax), %xmm1 1055#else 1056 movl DEST+4(%esp), %edi 1057 movaps -7(%eax), %xmm1 1058 movdqu %xmm0, (%edi) 1059#endif 1060#ifdef DATA_CACHE_SIZE_HALF 1061 cmp $DATA_CACHE_SIZE_HALF, %ecx 1062#else 1063# if (defined SHARED || defined __PIC__) 1064 SETUP_PIC_REG(bx) 1065 add $_GLOBAL_OFFSET_TABLE_, %ebx 1066 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1067# else 1068 cmp __x86_data_cache_size_half, %ecx 1069# endif 1070#endif 1071 jb L(sh_7_no_prefetch) 1072 1073 lea -64(%ecx), %ecx 1074 1075 .p2align 4 1076L(Shl7LoopStart): 1077 prefetcht0 0x1c0(%eax) 1078 prefetcht0 0x1c0(%edx) 1079 movaps 9(%eax), %xmm2 1080 movaps 25(%eax), %xmm3 1081 movaps 41(%eax), %xmm4 1082 movaps 57(%eax), %xmm5 1083 movaps %xmm5, %xmm7 1084 palignr $7, %xmm4, %xmm5 1085 palignr $7, %xmm3, %xmm4 1086 movaps %xmm5, 48(%edx) 1087 palignr $7, %xmm2, %xmm3 1088 lea 64(%eax), %eax 1089 palignr $7, %xmm1, %xmm2 1090 movaps %xmm4, 32(%edx) 1091 movaps %xmm3, 16(%edx) 1092 movaps %xmm7, %xmm1 1093 movaps %xmm2, (%edx) 1094 lea 64(%edx), %edx 1095 sub $64, %ecx 1096 ja L(Shl7LoopStart) 1097 1098L(Shl7LoopLeave): 1099 add $32, %ecx 1100 jle L(shl_end_0) 1101 1102 movaps 9(%eax), %xmm2 1103 movaps 25(%eax), %xmm3 1104 palignr $7, %xmm2, %xmm3 1105 palignr $7, %xmm1, %xmm2 1106 movaps %xmm2, (%edx) 1107 movaps %xmm3, 16(%edx) 1108 lea 32(%edx, %ecx), %edx 1109 lea 32(%eax, %ecx), %eax 1110 POP (%edi) 1111 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1112 1113 CFI_PUSH (%edi) 1114 1115 .p2align 4 1116L(sh_7_no_prefetch): 1117 lea -32(%ecx), %ecx 1118 lea -7(%eax), %eax 1119 xor %edi, %edi 1120 1121 .p2align 4 1122L(sh_7_no_prefetch_loop): 1123 movdqa 16(%eax, %edi), %xmm2 1124 sub $32, %ecx 1125 movdqa 32(%eax, %edi), %xmm3 1126 movdqa %xmm3, %xmm4 1127 palignr $7, %xmm2, %xmm3 1128 palignr $7, %xmm1, %xmm2 1129 lea 32(%edi), %edi 1130 movdqa %xmm2, -32(%edx, %edi) 1131 movdqa %xmm3, -16(%edx, %edi) 1132 jb L(sh_7_end_no_prefetch_loop) 1133 1134 movdqa 16(%eax, %edi), %xmm2 1135 sub $32, %ecx 1136 movdqa 32(%eax, %edi), %xmm3 1137 movdqa %xmm3, %xmm1 1138 palignr $7, %xmm2, %xmm3 1139 palignr $7, %xmm4, %xmm2 1140 lea 32(%edi), %edi 1141 movdqa %xmm2, -32(%edx, %edi) 1142 movdqa %xmm3, -16(%edx, %edi) 1143 jae L(sh_7_no_prefetch_loop) 1144 1145L(sh_7_end_no_prefetch_loop): 1146 lea 32(%ecx), %ecx 1147 add %ecx, %edi 1148 add %edi, %edx 1149 lea 7(%edi, %eax), %eax 1150 POP (%edi) 1151 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1152 1153 CFI_PUSH (%edi) 1154 1155 .p2align 4 1156L(shl_8): 1157#ifndef USE_AS_MEMMOVE 1158 movaps -8(%eax), %xmm1 1159#else 1160 movl DEST+4(%esp), %edi 1161 movaps -8(%eax), %xmm1 1162 movdqu %xmm0, (%edi) 1163#endif 1164#ifdef DATA_CACHE_SIZE_HALF 1165 cmp $DATA_CACHE_SIZE_HALF, %ecx 1166#else 1167# if (defined SHARED || defined 
__PIC__) 1168 SETUP_PIC_REG(bx) 1169 add $_GLOBAL_OFFSET_TABLE_, %ebx 1170 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1171# else 1172 cmp __x86_data_cache_size_half, %ecx 1173# endif 1174#endif 1175 jb L(sh_8_no_prefetch) 1176 1177 lea -64(%ecx), %ecx 1178 1179 .p2align 4 1180L(Shl8LoopStart): 1181 prefetcht0 0x1c0(%eax) 1182 prefetcht0 0x1c0(%edx) 1183 movaps 8(%eax), %xmm2 1184 movaps 24(%eax), %xmm3 1185 movaps 40(%eax), %xmm4 1186 movaps 56(%eax), %xmm5 1187 movaps %xmm5, %xmm7 1188 palignr $8, %xmm4, %xmm5 1189 palignr $8, %xmm3, %xmm4 1190 movaps %xmm5, 48(%edx) 1191 palignr $8, %xmm2, %xmm3 1192 lea 64(%eax), %eax 1193 palignr $8, %xmm1, %xmm2 1194 movaps %xmm4, 32(%edx) 1195 movaps %xmm3, 16(%edx) 1196 movaps %xmm7, %xmm1 1197 movaps %xmm2, (%edx) 1198 lea 64(%edx), %edx 1199 sub $64, %ecx 1200 ja L(Shl8LoopStart) 1201 1202L(LoopLeave8): 1203 add $32, %ecx 1204 jle L(shl_end_0) 1205 1206 movaps 8(%eax), %xmm2 1207 movaps 24(%eax), %xmm3 1208 palignr $8, %xmm2, %xmm3 1209 palignr $8, %xmm1, %xmm2 1210 movaps %xmm2, (%edx) 1211 movaps %xmm3, 16(%edx) 1212 lea 32(%edx, %ecx), %edx 1213 lea 32(%eax, %ecx), %eax 1214 POP (%edi) 1215 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1216 1217 CFI_PUSH (%edi) 1218 1219 .p2align 4 1220L(sh_8_no_prefetch): 1221 lea -32(%ecx), %ecx 1222 lea -8(%eax), %eax 1223 xor %edi, %edi 1224 1225 .p2align 4 1226L(sh_8_no_prefetch_loop): 1227 movdqa 16(%eax, %edi), %xmm2 1228 sub $32, %ecx 1229 movdqa 32(%eax, %edi), %xmm3 1230 movdqa %xmm3, %xmm4 1231 palignr $8, %xmm2, %xmm3 1232 palignr $8, %xmm1, %xmm2 1233 lea 32(%edi), %edi 1234 movdqa %xmm2, -32(%edx, %edi) 1235 movdqa %xmm3, -16(%edx, %edi) 1236 jb L(sh_8_end_no_prefetch_loop) 1237 1238 movdqa 16(%eax, %edi), %xmm2 1239 sub $32, %ecx 1240 movdqa 32(%eax, %edi), %xmm3 1241 movdqa %xmm3, %xmm1 1242 palignr $8, %xmm2, %xmm3 1243 palignr $8, %xmm4, %xmm2 1244 lea 32(%edi), %edi 1245 movdqa %xmm2, -32(%edx, %edi) 1246 movdqa %xmm3, -16(%edx, %edi) 1247 jae L(sh_8_no_prefetch_loop) 1248 1249L(sh_8_end_no_prefetch_loop): 1250 lea 32(%ecx), %ecx 1251 add %ecx, %edi 1252 add %edi, %edx 1253 lea 8(%edi, %eax), %eax 1254 POP (%edi) 1255 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1256 1257 CFI_PUSH (%edi) 1258 1259 .p2align 4 1260L(shl_9): 1261#ifndef USE_AS_MEMMOVE 1262 movaps -9(%eax), %xmm1 1263#else 1264 movl DEST+4(%esp), %edi 1265 movaps -9(%eax), %xmm1 1266 movdqu %xmm0, (%edi) 1267#endif 1268#ifdef DATA_CACHE_SIZE_HALF 1269 cmp $DATA_CACHE_SIZE_HALF, %ecx 1270#else 1271# if (defined SHARED || defined __PIC__) 1272 SETUP_PIC_REG(bx) 1273 add $_GLOBAL_OFFSET_TABLE_, %ebx 1274 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1275# else 1276 cmp __x86_data_cache_size_half, %ecx 1277# endif 1278#endif 1279 jb L(sh_9_no_prefetch) 1280 1281 lea -64(%ecx), %ecx 1282 1283 .p2align 4 1284L(Shl9LoopStart): 1285 prefetcht0 0x1c0(%eax) 1286 prefetcht0 0x1c0(%edx) 1287 movaps 7(%eax), %xmm2 1288 movaps 23(%eax), %xmm3 1289 movaps 39(%eax), %xmm4 1290 movaps 55(%eax), %xmm5 1291 movaps %xmm5, %xmm7 1292 palignr $9, %xmm4, %xmm5 1293 palignr $9, %xmm3, %xmm4 1294 movaps %xmm5, 48(%edx) 1295 palignr $9, %xmm2, %xmm3 1296 lea 64(%eax), %eax 1297 palignr $9, %xmm1, %xmm2 1298 movaps %xmm4, 32(%edx) 1299 movaps %xmm3, 16(%edx) 1300 movaps %xmm7, %xmm1 1301 movaps %xmm2, (%edx) 1302 lea 64(%edx), %edx 1303 sub $64, %ecx 1304 ja L(Shl9LoopStart) 1305 1306L(Shl9LoopLeave): 1307 add $32, %ecx 1308 jle L(shl_end_0) 1309 1310 movaps 7(%eax), %xmm2 1311 movaps 23(%eax), %xmm3 1312 palignr $9, %xmm2, 
%xmm3 1313 palignr $9, %xmm1, %xmm2 1314 1315 movaps %xmm2, (%edx) 1316 movaps %xmm3, 16(%edx) 1317 lea 32(%edx, %ecx), %edx 1318 lea 32(%eax, %ecx), %eax 1319 POP (%edi) 1320 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1321 1322 CFI_PUSH (%edi) 1323 1324 .p2align 4 1325L(sh_9_no_prefetch): 1326 lea -32(%ecx), %ecx 1327 lea -9(%eax), %eax 1328 xor %edi, %edi 1329 1330 .p2align 4 1331L(sh_9_no_prefetch_loop): 1332 movdqa 16(%eax, %edi), %xmm2 1333 sub $32, %ecx 1334 movdqa 32(%eax, %edi), %xmm3 1335 movdqa %xmm3, %xmm4 1336 palignr $9, %xmm2, %xmm3 1337 palignr $9, %xmm1, %xmm2 1338 lea 32(%edi), %edi 1339 movdqa %xmm2, -32(%edx, %edi) 1340 movdqa %xmm3, -16(%edx, %edi) 1341 jb L(sh_9_end_no_prefetch_loop) 1342 1343 movdqa 16(%eax, %edi), %xmm2 1344 sub $32, %ecx 1345 movdqa 32(%eax, %edi), %xmm3 1346 movdqa %xmm3, %xmm1 1347 palignr $9, %xmm2, %xmm3 1348 palignr $9, %xmm4, %xmm2 1349 lea 32(%edi), %edi 1350 movdqa %xmm2, -32(%edx, %edi) 1351 movdqa %xmm3, -16(%edx, %edi) 1352 jae L(sh_9_no_prefetch_loop) 1353 1354L(sh_9_end_no_prefetch_loop): 1355 lea 32(%ecx), %ecx 1356 add %ecx, %edi 1357 add %edi, %edx 1358 lea 9(%edi, %eax), %eax 1359 POP (%edi) 1360 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1361 1362 CFI_PUSH (%edi) 1363 1364 .p2align 4 1365L(shl_10): 1366#ifndef USE_AS_MEMMOVE 1367 movaps -10(%eax), %xmm1 1368#else 1369 movl DEST+4(%esp), %edi 1370 movaps -10(%eax), %xmm1 1371 movdqu %xmm0, (%edi) 1372#endif 1373#ifdef DATA_CACHE_SIZE_HALF 1374 cmp $DATA_CACHE_SIZE_HALF, %ecx 1375#else 1376# if (defined SHARED || defined __PIC__) 1377 SETUP_PIC_REG(bx) 1378 add $_GLOBAL_OFFSET_TABLE_, %ebx 1379 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1380# else 1381 cmp __x86_data_cache_size_half, %ecx 1382# endif 1383#endif 1384 jb L(sh_10_no_prefetch) 1385 1386 lea -64(%ecx), %ecx 1387 1388 .p2align 4 1389L(Shl10LoopStart): 1390 prefetcht0 0x1c0(%eax) 1391 prefetcht0 0x1c0(%edx) 1392 movaps 6(%eax), %xmm2 1393 movaps 22(%eax), %xmm3 1394 movaps 38(%eax), %xmm4 1395 movaps 54(%eax), %xmm5 1396 movaps %xmm5, %xmm7 1397 palignr $10, %xmm4, %xmm5 1398 palignr $10, %xmm3, %xmm4 1399 movaps %xmm5, 48(%edx) 1400 palignr $10, %xmm2, %xmm3 1401 lea 64(%eax), %eax 1402 palignr $10, %xmm1, %xmm2 1403 movaps %xmm4, 32(%edx) 1404 movaps %xmm3, 16(%edx) 1405 movaps %xmm7, %xmm1 1406 movaps %xmm2, (%edx) 1407 lea 64(%edx), %edx 1408 sub $64, %ecx 1409 ja L(Shl10LoopStart) 1410 1411L(Shl10LoopLeave): 1412 add $32, %ecx 1413 jle L(shl_end_0) 1414 1415 movaps 6(%eax), %xmm2 1416 movaps 22(%eax), %xmm3 1417 palignr $10, %xmm2, %xmm3 1418 palignr $10, %xmm1, %xmm2 1419 1420 movaps %xmm2, (%edx) 1421 movaps %xmm3, 16(%edx) 1422 lea 32(%edx, %ecx), %edx 1423 lea 32(%eax, %ecx), %eax 1424 POP (%edi) 1425 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1426 1427 CFI_PUSH (%edi) 1428 1429 .p2align 4 1430L(sh_10_no_prefetch): 1431 lea -32(%ecx), %ecx 1432 lea -10(%eax), %eax 1433 xor %edi, %edi 1434 1435 .p2align 4 1436L(sh_10_no_prefetch_loop): 1437 movdqa 16(%eax, %edi), %xmm2 1438 sub $32, %ecx 1439 movdqa 32(%eax, %edi), %xmm3 1440 movdqa %xmm3, %xmm4 1441 palignr $10, %xmm2, %xmm3 1442 palignr $10, %xmm1, %xmm2 1443 lea 32(%edi), %edi 1444 movdqa %xmm2, -32(%edx, %edi) 1445 movdqa %xmm3, -16(%edx, %edi) 1446 jb L(sh_10_end_no_prefetch_loop) 1447 1448 movdqa 16(%eax, %edi), %xmm2 1449 sub $32, %ecx 1450 movdqa 32(%eax, %edi), %xmm3 1451 movdqa %xmm3, %xmm1 1452 palignr $10, %xmm2, %xmm3 1453 palignr $10, %xmm4, %xmm2 1454 lea 32(%edi), %edi 1455 movdqa %xmm2, -32(%edx, %edi) 1456 
movdqa %xmm3, -16(%edx, %edi) 1457 jae L(sh_10_no_prefetch_loop) 1458 1459L(sh_10_end_no_prefetch_loop): 1460 lea 32(%ecx), %ecx 1461 add %ecx, %edi 1462 add %edi, %edx 1463 lea 10(%edi, %eax), %eax 1464 POP (%edi) 1465 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1466 1467 CFI_PUSH (%edi) 1468 1469 .p2align 4 1470L(shl_11): 1471#ifndef USE_AS_MEMMOVE 1472 movaps -11(%eax), %xmm1 1473#else 1474 movl DEST+4(%esp), %edi 1475 movaps -11(%eax), %xmm1 1476 movdqu %xmm0, (%edi) 1477#endif 1478#ifdef DATA_CACHE_SIZE_HALF 1479 cmp $DATA_CACHE_SIZE_HALF, %ecx 1480#else 1481# if (defined SHARED || defined __PIC__) 1482 SETUP_PIC_REG(bx) 1483 add $_GLOBAL_OFFSET_TABLE_, %ebx 1484 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1485# else 1486 cmp __x86_data_cache_size_half, %ecx 1487# endif 1488#endif 1489 jb L(sh_11_no_prefetch) 1490 1491 lea -64(%ecx), %ecx 1492 1493 .p2align 4 1494L(Shl11LoopStart): 1495 prefetcht0 0x1c0(%eax) 1496 prefetcht0 0x1c0(%edx) 1497 movaps 5(%eax), %xmm2 1498 movaps 21(%eax), %xmm3 1499 movaps 37(%eax), %xmm4 1500 movaps 53(%eax), %xmm5 1501 movaps %xmm5, %xmm7 1502 palignr $11, %xmm4, %xmm5 1503 palignr $11, %xmm3, %xmm4 1504 movaps %xmm5, 48(%edx) 1505 palignr $11, %xmm2, %xmm3 1506 lea 64(%eax), %eax 1507 palignr $11, %xmm1, %xmm2 1508 movaps %xmm4, 32(%edx) 1509 movaps %xmm3, 16(%edx) 1510 movaps %xmm7, %xmm1 1511 movaps %xmm2, (%edx) 1512 lea 64(%edx), %edx 1513 sub $64, %ecx 1514 ja L(Shl11LoopStart) 1515 1516L(Shl11LoopLeave): 1517 add $32, %ecx 1518 jle L(shl_end_0) 1519 1520 movaps 5(%eax), %xmm2 1521 movaps 21(%eax), %xmm3 1522 palignr $11, %xmm2, %xmm3 1523 palignr $11, %xmm1, %xmm2 1524 1525 movaps %xmm2, (%edx) 1526 movaps %xmm3, 16(%edx) 1527 lea 32(%edx, %ecx), %edx 1528 lea 32(%eax, %ecx), %eax 1529 POP (%edi) 1530 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1531 1532 CFI_PUSH (%edi) 1533 1534 .p2align 4 1535L(sh_11_no_prefetch): 1536 lea -32(%ecx), %ecx 1537 lea -11(%eax), %eax 1538 xor %edi, %edi 1539 1540 .p2align 4 1541L(sh_11_no_prefetch_loop): 1542 movdqa 16(%eax, %edi), %xmm2 1543 sub $32, %ecx 1544 movdqa 32(%eax, %edi), %xmm3 1545 movdqa %xmm3, %xmm4 1546 palignr $11, %xmm2, %xmm3 1547 palignr $11, %xmm1, %xmm2 1548 lea 32(%edi), %edi 1549 movdqa %xmm2, -32(%edx, %edi) 1550 movdqa %xmm3, -16(%edx, %edi) 1551 jb L(sh_11_end_no_prefetch_loop) 1552 1553 movdqa 16(%eax, %edi), %xmm2 1554 sub $32, %ecx 1555 movdqa 32(%eax, %edi), %xmm3 1556 movdqa %xmm3, %xmm1 1557 palignr $11, %xmm2, %xmm3 1558 palignr $11, %xmm4, %xmm2 1559 lea 32(%edi), %edi 1560 movdqa %xmm2, -32(%edx, %edi) 1561 movdqa %xmm3, -16(%edx, %edi) 1562 jae L(sh_11_no_prefetch_loop) 1563 1564L(sh_11_end_no_prefetch_loop): 1565 lea 32(%ecx), %ecx 1566 add %ecx, %edi 1567 add %edi, %edx 1568 lea 11(%edi, %eax), %eax 1569 POP (%edi) 1570 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1571 1572 CFI_PUSH (%edi) 1573 1574 .p2align 4 1575L(shl_12): 1576#ifndef USE_AS_MEMMOVE 1577 movaps -12(%eax), %xmm1 1578#else 1579 movl DEST+4(%esp), %edi 1580 movaps -12(%eax), %xmm1 1581 movdqu %xmm0, (%edi) 1582#endif 1583#ifdef DATA_CACHE_SIZE_HALF 1584 cmp $DATA_CACHE_SIZE_HALF, %ecx 1585#else 1586# if (defined SHARED || defined __PIC__) 1587 SETUP_PIC_REG(bx) 1588 add $_GLOBAL_OFFSET_TABLE_, %ebx 1589 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1590# else 1591 cmp __x86_data_cache_size_half, %ecx 1592# endif 1593#endif 1594 jb L(sh_12_no_prefetch) 1595 1596 lea -64(%ecx), %ecx 1597 1598 .p2align 4 1599L(Shl12LoopStart): 1600 prefetcht0 0x1c0(%eax) 1601 prefetcht0 
0x1c0(%edx) 1602 movaps 4(%eax), %xmm2 1603 movaps 20(%eax), %xmm3 1604 movaps 36(%eax), %xmm4 1605 movaps 52(%eax), %xmm5 1606 movaps %xmm5, %xmm7 1607 palignr $12, %xmm4, %xmm5 1608 palignr $12, %xmm3, %xmm4 1609 movaps %xmm5, 48(%edx) 1610 palignr $12, %xmm2, %xmm3 1611 lea 64(%eax), %eax 1612 palignr $12, %xmm1, %xmm2 1613 movaps %xmm4, 32(%edx) 1614 movaps %xmm3, 16(%edx) 1615 movaps %xmm7, %xmm1 1616 movaps %xmm2, (%edx) 1617 lea 64(%edx), %edx 1618 sub $64, %ecx 1619 ja L(Shl12LoopStart) 1620 1621L(Shl12LoopLeave): 1622 add $32, %ecx 1623 jle L(shl_end_0) 1624 1625 movaps 4(%eax), %xmm2 1626 movaps 20(%eax), %xmm3 1627 palignr $12, %xmm2, %xmm3 1628 palignr $12, %xmm1, %xmm2 1629 1630 movaps %xmm2, (%edx) 1631 movaps %xmm3, 16(%edx) 1632 lea 32(%edx, %ecx), %edx 1633 lea 32(%eax, %ecx), %eax 1634 POP (%edi) 1635 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1636 1637 CFI_PUSH (%edi) 1638 1639 .p2align 4 1640L(sh_12_no_prefetch): 1641 lea -32(%ecx), %ecx 1642 lea -12(%eax), %eax 1643 xor %edi, %edi 1644 1645 .p2align 4 1646L(sh_12_no_prefetch_loop): 1647 movdqa 16(%eax, %edi), %xmm2 1648 sub $32, %ecx 1649 movdqa 32(%eax, %edi), %xmm3 1650 movdqa %xmm3, %xmm4 1651 palignr $12, %xmm2, %xmm3 1652 palignr $12, %xmm1, %xmm2 1653 lea 32(%edi), %edi 1654 movdqa %xmm2, -32(%edx, %edi) 1655 movdqa %xmm3, -16(%edx, %edi) 1656 jb L(sh_12_end_no_prefetch_loop) 1657 1658 movdqa 16(%eax, %edi), %xmm2 1659 sub $32, %ecx 1660 movdqa 32(%eax, %edi), %xmm3 1661 movdqa %xmm3, %xmm1 1662 palignr $12, %xmm2, %xmm3 1663 palignr $12, %xmm4, %xmm2 1664 lea 32(%edi), %edi 1665 movdqa %xmm2, -32(%edx, %edi) 1666 movdqa %xmm3, -16(%edx, %edi) 1667 jae L(sh_12_no_prefetch_loop) 1668 1669L(sh_12_end_no_prefetch_loop): 1670 lea 32(%ecx), %ecx 1671 add %ecx, %edi 1672 add %edi, %edx 1673 lea 12(%edi, %eax), %eax 1674 POP (%edi) 1675 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1676 1677 CFI_PUSH (%edi) 1678 1679 .p2align 4 1680L(shl_13): 1681#ifndef USE_AS_MEMMOVE 1682 movaps -13(%eax), %xmm1 1683#else 1684 movl DEST+4(%esp), %edi 1685 movaps -13(%eax), %xmm1 1686 movdqu %xmm0, (%edi) 1687#endif 1688#ifdef DATA_CACHE_SIZE_HALF 1689 cmp $DATA_CACHE_SIZE_HALF, %ecx 1690#else 1691# if (defined SHARED || defined __PIC__) 1692 SETUP_PIC_REG(bx) 1693 add $_GLOBAL_OFFSET_TABLE_, %ebx 1694 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1695# else 1696 cmp __x86_data_cache_size_half, %ecx 1697# endif 1698#endif 1699 jb L(sh_13_no_prefetch) 1700 1701 lea -64(%ecx), %ecx 1702 1703 .p2align 4 1704L(Shl13LoopStart): 1705 prefetcht0 0x1c0(%eax) 1706 prefetcht0 0x1c0(%edx) 1707 movaps 3(%eax), %xmm2 1708 movaps 19(%eax), %xmm3 1709 movaps 35(%eax), %xmm4 1710 movaps 51(%eax), %xmm5 1711 movaps %xmm5, %xmm7 1712 palignr $13, %xmm4, %xmm5 1713 palignr $13, %xmm3, %xmm4 1714 movaps %xmm5, 48(%edx) 1715 palignr $13, %xmm2, %xmm3 1716 lea 64(%eax), %eax 1717 palignr $13, %xmm1, %xmm2 1718 movaps %xmm4, 32(%edx) 1719 movaps %xmm3, 16(%edx) 1720 movaps %xmm7, %xmm1 1721 movaps %xmm2, (%edx) 1722 lea 64(%edx), %edx 1723 sub $64, %ecx 1724 ja L(Shl13LoopStart) 1725 1726L(Shl13LoopLeave): 1727 add $32, %ecx 1728 jle L(shl_end_0) 1729 1730 movaps 3(%eax), %xmm2 1731 movaps 19(%eax), %xmm3 1732 palignr $13, %xmm2, %xmm3 1733 palignr $13, %xmm1, %xmm2 1734 1735 movaps %xmm2, (%edx) 1736 movaps %xmm3, 16(%edx) 1737 lea 32(%edx, %ecx), %edx 1738 lea 32(%eax, %ecx), %eax 1739 POP (%edi) 1740 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1741 1742 CFI_PUSH (%edi) 1743 1744 .p2align 4 1745L(sh_13_no_prefetch): 1746 
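	/* Reached when the copy fits in half of the data cache, so
	   software prefetch is not worth its overhead; use the plain
	   32-byte-per-iteration palignr loop below instead.  */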
lea -32(%ecx), %ecx 1747 lea -13(%eax), %eax 1748 xor %edi, %edi 1749 1750 .p2align 4 1751L(sh_13_no_prefetch_loop): 1752 movdqa 16(%eax, %edi), %xmm2 1753 sub $32, %ecx 1754 movdqa 32(%eax, %edi), %xmm3 1755 movdqa %xmm3, %xmm4 1756 palignr $13, %xmm2, %xmm3 1757 palignr $13, %xmm1, %xmm2 1758 lea 32(%edi), %edi 1759 movdqa %xmm2, -32(%edx, %edi) 1760 movdqa %xmm3, -16(%edx, %edi) 1761 jb L(sh_13_end_no_prefetch_loop) 1762 1763 movdqa 16(%eax, %edi), %xmm2 1764 sub $32, %ecx 1765 movdqa 32(%eax, %edi), %xmm3 1766 movdqa %xmm3, %xmm1 1767 palignr $13, %xmm2, %xmm3 1768 palignr $13, %xmm4, %xmm2 1769 lea 32(%edi), %edi 1770 movdqa %xmm2, -32(%edx, %edi) 1771 movdqa %xmm3, -16(%edx, %edi) 1772 jae L(sh_13_no_prefetch_loop) 1773 1774L(sh_13_end_no_prefetch_loop): 1775 lea 32(%ecx), %ecx 1776 add %ecx, %edi 1777 add %edi, %edx 1778 lea 13(%edi, %eax), %eax 1779 POP (%edi) 1780 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1781 1782 CFI_PUSH (%edi) 1783 1784 .p2align 4 1785L(shl_14): 1786#ifndef USE_AS_MEMMOVE 1787 movaps -14(%eax), %xmm1 1788#else 1789 movl DEST+4(%esp), %edi 1790 movaps -14(%eax), %xmm1 1791 movdqu %xmm0, (%edi) 1792#endif 1793#ifdef DATA_CACHE_SIZE_HALF 1794 cmp $DATA_CACHE_SIZE_HALF, %ecx 1795#else 1796# if (defined SHARED || defined __PIC__) 1797 SETUP_PIC_REG(bx) 1798 add $_GLOBAL_OFFSET_TABLE_, %ebx 1799 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1800# else 1801 cmp __x86_data_cache_size_half, %ecx 1802# endif 1803#endif 1804 jb L(sh_14_no_prefetch) 1805 1806 lea -64(%ecx), %ecx 1807 1808 .p2align 4 1809L(Shl14LoopStart): 1810 prefetcht0 0x1c0(%eax) 1811 prefetcht0 0x1c0(%edx) 1812 movaps 2(%eax), %xmm2 1813 movaps 18(%eax), %xmm3 1814 movaps 34(%eax), %xmm4 1815 movaps 50(%eax), %xmm5 1816 movaps %xmm5, %xmm7 1817 palignr $14, %xmm4, %xmm5 1818 palignr $14, %xmm3, %xmm4 1819 movaps %xmm5, 48(%edx) 1820 palignr $14, %xmm2, %xmm3 1821 lea 64(%eax), %eax 1822 palignr $14, %xmm1, %xmm2 1823 movaps %xmm4, 32(%edx) 1824 movaps %xmm3, 16(%edx) 1825 movaps %xmm7, %xmm1 1826 movaps %xmm2, (%edx) 1827 lea 64(%edx), %edx 1828 sub $64, %ecx 1829 ja L(Shl14LoopStart) 1830 1831L(Shl14LoopLeave): 1832 add $32, %ecx 1833 jle L(shl_end_0) 1834 1835 movaps 2(%eax), %xmm2 1836 movaps 18(%eax), %xmm3 1837 palignr $14, %xmm2, %xmm3 1838 palignr $14, %xmm1, %xmm2 1839 1840 movaps %xmm2, (%edx) 1841 movaps %xmm3, 16(%edx) 1842 lea 32(%edx, %ecx), %edx 1843 lea 32(%eax, %ecx), %eax 1844 POP (%edi) 1845 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1846 1847 CFI_PUSH (%edi) 1848 1849 .p2align 4 1850L(sh_14_no_prefetch): 1851 lea -32(%ecx), %ecx 1852 lea -14(%eax), %eax 1853 xor %edi, %edi 1854 1855 .p2align 4 1856L(sh_14_no_prefetch_loop): 1857 movdqa 16(%eax, %edi), %xmm2 1858 sub $32, %ecx 1859 movdqa 32(%eax, %edi), %xmm3 1860 movdqa %xmm3, %xmm4 1861 palignr $14, %xmm2, %xmm3 1862 palignr $14, %xmm1, %xmm2 1863 lea 32(%edi), %edi 1864 movdqa %xmm2, -32(%edx, %edi) 1865 movdqa %xmm3, -16(%edx, %edi) 1866 jb L(sh_14_end_no_prefetch_loop) 1867 1868 movdqa 16(%eax, %edi), %xmm2 1869 sub $32, %ecx 1870 movdqa 32(%eax, %edi), %xmm3 1871 movdqa %xmm3, %xmm1 1872 palignr $14, %xmm2, %xmm3 1873 palignr $14, %xmm4, %xmm2 1874 lea 32(%edi), %edi 1875 movdqa %xmm2, -32(%edx, %edi) 1876 movdqa %xmm3, -16(%edx, %edi) 1877 jae L(sh_14_no_prefetch_loop) 1878 1879L(sh_14_end_no_prefetch_loop): 1880 lea 32(%ecx), %ecx 1881 add %ecx, %edi 1882 add %edi, %edx 1883 lea 14(%edi, %eax), %eax 1884 POP (%edi) 1885 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1886 1887 CFI_PUSH (%edi) 
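	/* shl_15: the source is 15 bytes past a 16-byte boundary.  The
	   aligned loads start at 1(%eax), and palignr $15 stitches each
	   pair of vectors back into the original byte order.  */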
1888 1889 .p2align 4 1890L(shl_15): 1891#ifndef USE_AS_MEMMOVE 1892 movaps -15(%eax), %xmm1 1893#else 1894 movl DEST+4(%esp), %edi 1895 movaps -15(%eax), %xmm1 1896 movdqu %xmm0, (%edi) 1897#endif 1898#ifdef DATA_CACHE_SIZE_HALF 1899 cmp $DATA_CACHE_SIZE_HALF, %ecx 1900#else 1901# if (defined SHARED || defined __PIC__) 1902 SETUP_PIC_REG(bx) 1903 add $_GLOBAL_OFFSET_TABLE_, %ebx 1904 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1905# else 1906 cmp __x86_data_cache_size_half, %ecx 1907# endif 1908#endif 1909 jb L(sh_15_no_prefetch) 1910 1911 lea -64(%ecx), %ecx 1912 1913 .p2align 4 1914L(Shl15LoopStart): 1915 prefetcht0 0x1c0(%eax) 1916 prefetcht0 0x1c0(%edx) 1917 movaps 1(%eax), %xmm2 1918 movaps 17(%eax), %xmm3 1919 movaps 33(%eax), %xmm4 1920 movaps 49(%eax), %xmm5 1921 movaps %xmm5, %xmm7 1922 palignr $15, %xmm4, %xmm5 1923 palignr $15, %xmm3, %xmm4 1924 movaps %xmm5, 48(%edx) 1925 palignr $15, %xmm2, %xmm3 1926 lea 64(%eax), %eax 1927 palignr $15, %xmm1, %xmm2 1928 movaps %xmm4, 32(%edx) 1929 movaps %xmm3, 16(%edx) 1930 movaps %xmm7, %xmm1 1931 movaps %xmm2, (%edx) 1932 lea 64(%edx), %edx 1933 sub $64, %ecx 1934 ja L(Shl15LoopStart) 1935 1936L(Shl15LoopLeave): 1937 add $32, %ecx 1938 jle L(shl_end_0) 1939 1940 movaps 1(%eax), %xmm2 1941 movaps 17(%eax), %xmm3 1942 palignr $15, %xmm2, %xmm3 1943 palignr $15, %xmm1, %xmm2 1944 1945 movaps %xmm2, (%edx) 1946 movaps %xmm3, 16(%edx) 1947 lea 32(%edx, %ecx), %edx 1948 lea 32(%eax, %ecx), %eax 1949 POP (%edi) 1950 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1951 1952 CFI_PUSH (%edi) 1953 1954 .p2align 4 1955L(sh_15_no_prefetch): 1956 lea -32(%ecx), %ecx 1957 lea -15(%eax), %eax 1958 xor %edi, %edi 1959 1960 .p2align 4 1961L(sh_15_no_prefetch_loop): 1962 movdqa 16(%eax, %edi), %xmm2 1963 sub $32, %ecx 1964 movdqa 32(%eax, %edi), %xmm3 1965 movdqa %xmm3, %xmm4 1966 palignr $15, %xmm2, %xmm3 1967 palignr $15, %xmm1, %xmm2 1968 lea 32(%edi), %edi 1969 movdqa %xmm2, -32(%edx, %edi) 1970 movdqa %xmm3, -16(%edx, %edi) 1971 jb L(sh_15_end_no_prefetch_loop) 1972 1973 movdqa 16(%eax, %edi), %xmm2 1974 sub $32, %ecx 1975 movdqa 32(%eax, %edi), %xmm3 1976 movdqa %xmm3, %xmm1 1977 palignr $15, %xmm2, %xmm3 1978 palignr $15, %xmm4, %xmm2 1979 lea 32(%edi), %edi 1980 movdqa %xmm2, -32(%edx, %edi) 1981 movdqa %xmm3, -16(%edx, %edi) 1982 jae L(sh_15_no_prefetch_loop) 1983 1984L(sh_15_end_no_prefetch_loop): 1985 lea 32(%ecx), %ecx 1986 add %ecx, %edi 1987 add %edi, %edx 1988 lea 15(%edi, %eax), %eax 1989 POP (%edi) 1990 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1991 1992 CFI_PUSH (%edi) 1993 1994 .p2align 4 1995L(shl_end_0): 1996 lea 32(%ecx), %ecx 1997 lea (%edx, %ecx), %edx 1998 lea (%eax, %ecx), %eax 1999 POP (%edi) 2000 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 2001 2002 .p2align 4 2003L(fwd_write_44bytes): 2004 movq -44(%eax), %xmm0 2005 movq %xmm0, -44(%edx) 2006L(fwd_write_36bytes): 2007 movq -36(%eax), %xmm0 2008 movq %xmm0, -36(%edx) 2009L(fwd_write_28bytes): 2010 movq -28(%eax), %xmm0 2011 movq %xmm0, -28(%edx) 2012L(fwd_write_20bytes): 2013 movq -20(%eax), %xmm0 2014 movq %xmm0, -20(%edx) 2015L(fwd_write_12bytes): 2016 movq -12(%eax), %xmm0 2017 movq %xmm0, -12(%edx) 2018L(fwd_write_4bytes): 2019 movl -4(%eax), %ecx 2020 movl %ecx, -4(%edx) 2021#ifndef USE_AS_BCOPY 2022# ifdef USE_AS_MEMPCPY 2023 movl %edx, %eax 2024# else 2025 movl DEST(%esp), %eax 2026# endif 2027#endif 2028 RETURN 2029 2030 .p2align 4 2031L(fwd_write_40bytes): 2032 movq -40(%eax), %xmm0 2033 movq %xmm0, -40(%edx) 2034L(fwd_write_32bytes): 
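	/* Fall through: copy the final 32 bytes in 8-byte chunks,
	   addressed backwards from the end of the buffer.  */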
2035 movq -32(%eax), %xmm0 2036 movq %xmm0, -32(%edx) 2037L(fwd_write_24bytes): 2038 movq -24(%eax), %xmm0 2039 movq %xmm0, -24(%edx) 2040L(fwd_write_16bytes): 2041 movq -16(%eax), %xmm0 2042 movq %xmm0, -16(%edx) 2043L(fwd_write_8bytes): 2044 movq -8(%eax), %xmm0 2045 movq %xmm0, -8(%edx) 2046L(fwd_write_0bytes): 2047#ifndef USE_AS_BCOPY 2048# ifdef USE_AS_MEMPCPY 2049 movl %edx, %eax 2050# else 2051 movl DEST(%esp), %eax 2052# endif 2053#endif 2054 RETURN 2055 2056 .p2align 4 2057L(fwd_write_5bytes): 2058 movl -5(%eax), %ecx 2059 movl -4(%eax), %eax 2060 movl %ecx, -5(%edx) 2061 movl %eax, -4(%edx) 2062#ifndef USE_AS_BCOPY 2063# ifdef USE_AS_MEMPCPY 2064 movl %edx, %eax 2065# else 2066 movl DEST(%esp), %eax 2067# endif 2068#endif 2069 RETURN 2070 2071 .p2align 4 2072L(fwd_write_45bytes): 2073 movq -45(%eax), %xmm0 2074 movq %xmm0, -45(%edx) 2075L(fwd_write_37bytes): 2076 movq -37(%eax), %xmm0 2077 movq %xmm0, -37(%edx) 2078L(fwd_write_29bytes): 2079 movq -29(%eax), %xmm0 2080 movq %xmm0, -29(%edx) 2081L(fwd_write_21bytes): 2082 movq -21(%eax), %xmm0 2083 movq %xmm0, -21(%edx) 2084L(fwd_write_13bytes): 2085 movq -13(%eax), %xmm0 2086 movq %xmm0, -13(%edx) 2087 movl -5(%eax), %ecx 2088 movl %ecx, -5(%edx) 2089 movzbl -1(%eax), %ecx 2090 movb %cl, -1(%edx) 2091#ifndef USE_AS_BCOPY 2092# ifdef USE_AS_MEMPCPY 2093 movl %edx, %eax 2094# else 2095 movl DEST(%esp), %eax 2096# endif 2097#endif 2098 RETURN 2099 2100 .p2align 4 2101L(fwd_write_41bytes): 2102 movq -41(%eax), %xmm0 2103 movq %xmm0, -41(%edx) 2104L(fwd_write_33bytes): 2105 movq -33(%eax), %xmm0 2106 movq %xmm0, -33(%edx) 2107L(fwd_write_25bytes): 2108 movq -25(%eax), %xmm0 2109 movq %xmm0, -25(%edx) 2110L(fwd_write_17bytes): 2111 movq -17(%eax), %xmm0 2112 movq %xmm0, -17(%edx) 2113L(fwd_write_9bytes): 2114 movq -9(%eax), %xmm0 2115 movq %xmm0, -9(%edx) 2116L(fwd_write_1bytes): 2117 movzbl -1(%eax), %ecx 2118 movb %cl, -1(%edx) 2119#ifndef USE_AS_BCOPY 2120# ifdef USE_AS_MEMPCPY 2121 movl %edx, %eax 2122# else 2123 movl DEST(%esp), %eax 2124# endif 2125#endif 2126 RETURN 2127 2128 .p2align 4 2129L(fwd_write_46bytes): 2130 movq -46(%eax), %xmm0 2131 movq %xmm0, -46(%edx) 2132L(fwd_write_38bytes): 2133 movq -38(%eax), %xmm0 2134 movq %xmm0, -38(%edx) 2135L(fwd_write_30bytes): 2136 movq -30(%eax), %xmm0 2137 movq %xmm0, -30(%edx) 2138L(fwd_write_22bytes): 2139 movq -22(%eax), %xmm0 2140 movq %xmm0, -22(%edx) 2141L(fwd_write_14bytes): 2142 movq -14(%eax), %xmm0 2143 movq %xmm0, -14(%edx) 2144L(fwd_write_6bytes): 2145 movl -6(%eax), %ecx 2146 movl %ecx, -6(%edx) 2147 movzwl -2(%eax), %ecx 2148 movw %cx, -2(%edx) 2149#ifndef USE_AS_BCOPY 2150# ifdef USE_AS_MEMPCPY 2151 movl %edx, %eax 2152# else 2153 movl DEST(%esp), %eax 2154# endif 2155#endif 2156 RETURN 2157 2158 .p2align 4 2159L(fwd_write_42bytes): 2160 movq -42(%eax), %xmm0 2161 movq %xmm0, -42(%edx) 2162L(fwd_write_34bytes): 2163 movq -34(%eax), %xmm0 2164 movq %xmm0, -34(%edx) 2165L(fwd_write_26bytes): 2166 movq -26(%eax), %xmm0 2167 movq %xmm0, -26(%edx) 2168L(fwd_write_18bytes): 2169 movq -18(%eax), %xmm0 2170 movq %xmm0, -18(%edx) 2171L(fwd_write_10bytes): 2172 movq -10(%eax), %xmm0 2173 movq %xmm0, -10(%edx) 2174L(fwd_write_2bytes): 2175 movzwl -2(%eax), %ecx 2176 movw %cx, -2(%edx) 2177#ifndef USE_AS_BCOPY 2178# ifdef USE_AS_MEMPCPY 2179 movl %edx, %eax 2180# else 2181 movl DEST(%esp), %eax 2182# endif 2183#endif 2184 RETURN 2185 2186 .p2align 4 2187L(fwd_write_47bytes): 2188 movq -47(%eax), %xmm0 2189 movq %xmm0, -47(%edx) 2190L(fwd_write_39bytes): 2191 movq -39(%eax), %xmm0 
2192 movq %xmm0, -39(%edx) 2193L(fwd_write_31bytes): 2194 movq -31(%eax), %xmm0 2195 movq %xmm0, -31(%edx) 2196L(fwd_write_23bytes): 2197 movq -23(%eax), %xmm0 2198 movq %xmm0, -23(%edx) 2199L(fwd_write_15bytes): 2200 movq -15(%eax), %xmm0 2201 movq %xmm0, -15(%edx) 2202L(fwd_write_7bytes): 2203 movl -7(%eax), %ecx 2204 movl %ecx, -7(%edx) 2205 movzwl -3(%eax), %ecx 2206 movzbl -1(%eax), %eax 2207 movw %cx, -3(%edx) 2208 movb %al, -1(%edx) 2209#ifndef USE_AS_BCOPY 2210# ifdef USE_AS_MEMPCPY 2211 movl %edx, %eax 2212# else 2213 movl DEST(%esp), %eax 2214# endif 2215#endif 2216 RETURN 2217 2218 .p2align 4 2219L(fwd_write_43bytes): 2220 movq -43(%eax), %xmm0 2221 movq %xmm0, -43(%edx) 2222L(fwd_write_35bytes): 2223 movq -35(%eax), %xmm0 2224 movq %xmm0, -35(%edx) 2225L(fwd_write_27bytes): 2226 movq -27(%eax), %xmm0 2227 movq %xmm0, -27(%edx) 2228L(fwd_write_19bytes): 2229 movq -19(%eax), %xmm0 2230 movq %xmm0, -19(%edx) 2231L(fwd_write_11bytes): 2232 movq -11(%eax), %xmm0 2233 movq %xmm0, -11(%edx) 2234L(fwd_write_3bytes): 2235 movzwl -3(%eax), %ecx 2236 movzbl -1(%eax), %eax 2237 movw %cx, -3(%edx) 2238 movb %al, -1(%edx) 2239#ifndef USE_AS_BCOPY 2240# ifdef USE_AS_MEMPCPY 2241 movl %edx, %eax 2242# else 2243 movl DEST(%esp), %eax 2244# endif 2245#endif 2246 RETURN 2247 2248 .p2align 4 2249L(fwd_write_40bytes_align): 2250 movdqa -40(%eax), %xmm0 2251 movdqa %xmm0, -40(%edx) 2252L(fwd_write_24bytes_align): 2253 movdqa -24(%eax), %xmm0 2254 movdqa %xmm0, -24(%edx) 2255L(fwd_write_8bytes_align): 2256 movq -8(%eax), %xmm0 2257 movq %xmm0, -8(%edx) 2258L(fwd_write_0bytes_align): 2259#ifndef USE_AS_BCOPY 2260# ifdef USE_AS_MEMPCPY 2261 movl %edx, %eax 2262# else 2263 movl DEST(%esp), %eax 2264# endif 2265#endif 2266 RETURN 2267 2268 .p2align 4 2269L(fwd_write_32bytes_align): 2270 movdqa -32(%eax), %xmm0 2271 movdqa %xmm0, -32(%edx) 2272L(fwd_write_16bytes_align): 2273 movdqa -16(%eax), %xmm0 2274 movdqa %xmm0, -16(%edx) 2275#ifndef USE_AS_BCOPY 2276# ifdef USE_AS_MEMPCPY 2277 movl %edx, %eax 2278# else 2279 movl DEST(%esp), %eax 2280# endif 2281#endif 2282 RETURN 2283 2284 .p2align 4 2285L(fwd_write_5bytes_align): 2286 movl -5(%eax), %ecx 2287 movl -4(%eax), %eax 2288 movl %ecx, -5(%edx) 2289 movl %eax, -4(%edx) 2290#ifndef USE_AS_BCOPY 2291# ifdef USE_AS_MEMPCPY 2292 movl %edx, %eax 2293# else 2294 movl DEST(%esp), %eax 2295# endif 2296#endif 2297 RETURN 2298 2299 .p2align 4 2300L(fwd_write_45bytes_align): 2301 movdqa -45(%eax), %xmm0 2302 movdqa %xmm0, -45(%edx) 2303L(fwd_write_29bytes_align): 2304 movdqa -29(%eax), %xmm0 2305 movdqa %xmm0, -29(%edx) 2306L(fwd_write_13bytes_align): 2307 movq -13(%eax), %xmm0 2308 movq %xmm0, -13(%edx) 2309 movl -5(%eax), %ecx 2310 movl %ecx, -5(%edx) 2311 movzbl -1(%eax), %ecx 2312 movb %cl, -1(%edx) 2313#ifndef USE_AS_BCOPY 2314# ifdef USE_AS_MEMPCPY 2315 movl %edx, %eax 2316# else 2317 movl DEST(%esp), %eax 2318# endif 2319#endif 2320 RETURN 2321 2322 .p2align 4 2323L(fwd_write_37bytes_align): 2324 movdqa -37(%eax), %xmm0 2325 movdqa %xmm0, -37(%edx) 2326L(fwd_write_21bytes_align): 2327 movdqa -21(%eax), %xmm0 2328 movdqa %xmm0, -21(%edx) 2329 movl -5(%eax), %ecx 2330 movl %ecx, -5(%edx) 2331 movzbl -1(%eax), %ecx 2332 movb %cl, -1(%edx) 2333#ifndef USE_AS_BCOPY 2334# ifdef USE_AS_MEMPCPY 2335 movl %edx, %eax 2336# else 2337 movl DEST(%esp), %eax 2338# endif 2339#endif 2340 RETURN 2341 2342 .p2align 4 2343L(fwd_write_41bytes_align): 2344 movdqa -41(%eax), %xmm0 2345 movdqa %xmm0, -41(%edx) 2346L(fwd_write_25bytes_align): 2347 movdqa -25(%eax), %xmm0 2348 
movdqa %xmm0, -25(%edx) 2349L(fwd_write_9bytes_align): 2350 movq -9(%eax), %xmm0 2351 movq %xmm0, -9(%edx) 2352L(fwd_write_1bytes_align): 2353 movzbl -1(%eax), %ecx 2354 movb %cl, -1(%edx) 2355#ifndef USE_AS_BCOPY 2356# ifdef USE_AS_MEMPCPY 2357 movl %edx, %eax 2358# else 2359 movl DEST(%esp), %eax 2360# endif 2361#endif 2362 RETURN 2363 2364 .p2align 4 2365L(fwd_write_33bytes_align): 2366 movdqa -33(%eax), %xmm0 2367 movdqa %xmm0, -33(%edx) 2368L(fwd_write_17bytes_align): 2369 movdqa -17(%eax), %xmm0 2370 movdqa %xmm0, -17(%edx) 2371 movzbl -1(%eax), %ecx 2372 movb %cl, -1(%edx) 2373#ifndef USE_AS_BCOPY 2374# ifdef USE_AS_MEMPCPY 2375 movl %edx, %eax 2376# else 2377 movl DEST(%esp), %eax 2378# endif 2379#endif 2380 RETURN 2381 2382 .p2align 4 2383L(fwd_write_46bytes_align): 2384 movdqa -46(%eax), %xmm0 2385 movdqa %xmm0, -46(%edx) 2386L(fwd_write_30bytes_align): 2387 movdqa -30(%eax), %xmm0 2388 movdqa %xmm0, -30(%edx) 2389L(fwd_write_14bytes_align): 2390 movq -14(%eax), %xmm0 2391 movq %xmm0, -14(%edx) 2392L(fwd_write_6bytes_align): 2393 movl -6(%eax), %ecx 2394 movl %ecx, -6(%edx) 2395 movzwl -2(%eax), %ecx 2396 movw %cx, -2(%edx) 2397#ifndef USE_AS_BCOPY 2398# ifdef USE_AS_MEMPCPY 2399 movl %edx, %eax 2400# else 2401 movl DEST(%esp), %eax 2402# endif 2403#endif 2404 RETURN 2405 2406 .p2align 4 2407L(fwd_write_38bytes_align): 2408 movdqa -38(%eax), %xmm0 2409 movdqa %xmm0, -38(%edx) 2410L(fwd_write_22bytes_align): 2411 movdqa -22(%eax), %xmm0 2412 movdqa %xmm0, -22(%edx) 2413 movl -6(%eax), %ecx 2414 movl %ecx, -6(%edx) 2415 movzwl -2(%eax), %ecx 2416 movw %cx, -2(%edx) 2417#ifndef USE_AS_BCOPY 2418# ifdef USE_AS_MEMPCPY 2419 movl %edx, %eax 2420# else 2421 movl DEST(%esp), %eax 2422# endif 2423#endif 2424 RETURN 2425 2426 .p2align 4 2427L(fwd_write_42bytes_align): 2428 movdqa -42(%eax), %xmm0 2429 movdqa %xmm0, -42(%edx) 2430L(fwd_write_26bytes_align): 2431 movdqa -26(%eax), %xmm0 2432 movdqa %xmm0, -26(%edx) 2433L(fwd_write_10bytes_align): 2434 movq -10(%eax), %xmm0 2435 movq %xmm0, -10(%edx) 2436L(fwd_write_2bytes_align): 2437 movzwl -2(%eax), %ecx 2438 movw %cx, -2(%edx) 2439#ifndef USE_AS_BCOPY 2440# ifdef USE_AS_MEMPCPY 2441 movl %edx, %eax 2442# else 2443 movl DEST(%esp), %eax 2444# endif 2445#endif 2446 RETURN 2447 2448 .p2align 4 2449L(fwd_write_34bytes_align): 2450 movdqa -34(%eax), %xmm0 2451 movdqa %xmm0, -34(%edx) 2452L(fwd_write_18bytes_align): 2453 movdqa -18(%eax), %xmm0 2454 movdqa %xmm0, -18(%edx) 2455 movzwl -2(%eax), %ecx 2456 movw %cx, -2(%edx) 2457#ifndef USE_AS_BCOPY 2458# ifdef USE_AS_MEMPCPY 2459 movl %edx, %eax 2460# else 2461 movl DEST(%esp), %eax 2462# endif 2463#endif 2464 RETURN 2465 2466 .p2align 4 2467L(fwd_write_47bytes_align): 2468 movdqa -47(%eax), %xmm0 2469 movdqa %xmm0, -47(%edx) 2470L(fwd_write_31bytes_align): 2471 movdqa -31(%eax), %xmm0 2472 movdqa %xmm0, -31(%edx) 2473L(fwd_write_15bytes_align): 2474 movq -15(%eax), %xmm0 2475 movq %xmm0, -15(%edx) 2476L(fwd_write_7bytes_align): 2477 movl -7(%eax), %ecx 2478 movl %ecx, -7(%edx) 2479 movzwl -3(%eax), %ecx 2480 movzbl -1(%eax), %eax 2481 movw %cx, -3(%edx) 2482 movb %al, -1(%edx) 2483#ifndef USE_AS_BCOPY 2484# ifdef USE_AS_MEMPCPY 2485 movl %edx, %eax 2486# else 2487 movl DEST(%esp), %eax 2488# endif 2489#endif 2490 RETURN 2491 2492 .p2align 4 2493L(fwd_write_39bytes_align): 2494 movdqa -39(%eax), %xmm0 2495 movdqa %xmm0, -39(%edx) 2496L(fwd_write_23bytes_align): 2497 movdqa -23(%eax), %xmm0 2498 movdqa %xmm0, -23(%edx) 2499 movl -7(%eax), %ecx 2500 movl %ecx, -7(%edx) 2501 movzwl -3(%eax), %ecx 
	movzbl -1(%eax), %eax
	movw %cx, -3(%edx)
	movb %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl %edx, %eax
# else
	movl DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_43bytes_align):
	movdqa -43(%eax), %xmm0
	movdqa %xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa -27(%eax), %xmm0
	movdqa %xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq -11(%eax), %xmm0
	movq %xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl -3(%eax), %ecx
	movzbl -1(%eax), %eax
	movw %cx, -3(%edx)
	movb %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl %edx, %eax
# else
	movl DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_35bytes_align):
	movdqa -35(%eax), %xmm0
	movdqa %xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa -19(%eax), %xmm0
	movdqa %xmm0, -19(%edx)
	movzwl -3(%eax), %ecx
	movzbl -1(%eax), %eax
	movw %cx, -3(%edx)
	movb %al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl %edx, %eax
# else
	movl DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_44bytes_align):
	movdqa -44(%eax), %xmm0
	movdqa %xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa -28(%eax), %xmm0
	movdqa %xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq -12(%eax), %xmm0
	movq %xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl -4(%eax), %ecx
	movl %ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl %edx, %eax
# else
	movl DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_36bytes_align):
	movdqa -36(%eax), %xmm0
	movdqa %xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa -20(%eax), %xmm0
	movdqa %xmm0, -20(%edx)
	movl -4(%eax), %ecx
	movl %ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl %edx, %eax
# else
	movl DEST(%esp), %eax
# endif
#endif
	RETURN_END

	CFI_PUSH (%edi)

	.p2align 4
L(large_page):
	movdqu (%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	movl DEST+4(%esp), %edi
	movdqu %xmm0, (%edi)
#endif
	lea 16(%eax), %eax
	movntdq %xmm1, (%edx)
	lea 16(%edx), %edx
	lea -0x90(%ecx), %ecx
	POP (%edi)

	.p2align 4
L(large_page_loop):
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	movdqu 0x20(%eax), %xmm2
	movdqu 0x30(%eax), %xmm3
	movdqu 0x40(%eax), %xmm4
	movdqu 0x50(%eax), %xmm5
	movdqu 0x60(%eax), %xmm6
	movdqu 0x70(%eax), %xmm7
	lea 0x80(%eax), %eax

	sub $0x80, %ecx
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	movntdq %xmm2, 0x20(%edx)
	movntdq %xmm3, 0x30(%edx)
	movntdq %xmm4, 0x40(%edx)
	movntdq %xmm5, 0x50(%edx)
	movntdq %xmm6, 0x60(%edx)
	movntdq %xmm7, 0x70(%edx)
	lea 0x80(%edx), %edx
	jae L(large_page_loop)
	cmp $-0x40, %ecx
	lea 0x80(%ecx), %ecx
	jl L(large_page_less_64bytes)

	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	movdqu 0x20(%eax), %xmm2
	movdqu 0x30(%eax), %xmm3
	lea 0x40(%eax), %eax

	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	movntdq %xmm2, 0x20(%edx)
	movntdq %xmm3, 0x30(%edx)
	lea 0x40(%edx), %edx
	sub $0x40, %ecx
L(large_page_less_64bytes):
	cmp $32, %ecx
	jb L(large_page_less_32bytes)
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	lea 0x20(%eax), %eax
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	lea 0x20(%edx), %edx
	sub $0x20, %ecx
L(large_page_less_32bytes):
	add %ecx, %edx
	add %ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	.p2align 4
L(bk_write_44bytes):
	movq 36(%eax), %xmm0
	movq %xmm0, 36(%edx)
L(bk_write_36bytes):
	movq 28(%eax), %xmm0
	movq %xmm0, 28(%edx)
L(bk_write_28bytes):
	movq 20(%eax), %xmm0
	movq %xmm0, 20(%edx)
L(bk_write_20bytes):
	movq 12(%eax), %xmm0
	movq %xmm0, 12(%edx)
L(bk_write_12bytes):
	movq 4(%eax), %xmm0
	movq %xmm0, 4(%edx)
L(bk_write_4bytes):
	movl (%eax), %ecx
	movl %ecx, (%edx)
L(bk_write_0bytes):
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_40bytes):
	movq 32(%eax), %xmm0
	movq %xmm0, 32(%edx)
L(bk_write_32bytes):
	movq 24(%eax), %xmm0
	movq %xmm0, 24(%edx)
L(bk_write_24bytes):
	movq 16(%eax), %xmm0
	movq %xmm0, 16(%edx)
L(bk_write_16bytes):
	movq 8(%eax), %xmm0
	movq %xmm0, 8(%edx)
L(bk_write_8bytes):
	movq (%eax), %xmm0
	movq %xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_45bytes):
	movq 37(%eax), %xmm0
	movq %xmm0, 37(%edx)
L(bk_write_37bytes):
	movq 29(%eax), %xmm0
	movq %xmm0, 29(%edx)
L(bk_write_29bytes):
	movq 21(%eax), %xmm0
	movq %xmm0, 21(%edx)
L(bk_write_21bytes):
	movq 13(%eax), %xmm0
	movq %xmm0, 13(%edx)
L(bk_write_13bytes):
	movq 5(%eax), %xmm0
	movq %xmm0, 5(%edx)
L(bk_write_5bytes):
	movl 1(%eax), %ecx
	movl %ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl (%eax), %ecx
	movb %cl, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_41bytes):
	movq 33(%eax), %xmm0
	movq %xmm0, 33(%edx)
L(bk_write_33bytes):
	movq 25(%eax), %xmm0
	movq %xmm0, 25(%edx)
L(bk_write_25bytes):
	movq 17(%eax), %xmm0
	movq %xmm0, 17(%edx)
L(bk_write_17bytes):
	movq 9(%eax), %xmm0
	movq %xmm0, 9(%edx)
L(bk_write_9bytes):
	movq 1(%eax), %xmm0
	movq %xmm0, 1(%edx)
	movzbl (%eax), %ecx
	movb %cl, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_46bytes):
	movq 38(%eax), %xmm0
	movq %xmm0, 38(%edx)
L(bk_write_38bytes):
	movq 30(%eax), %xmm0
	movq %xmm0, 30(%edx)
L(bk_write_30bytes):
	movq 22(%eax), %xmm0
	movq %xmm0, 22(%edx)
L(bk_write_22bytes):
	movq 14(%eax), %xmm0
	movq %xmm0, 14(%edx)
L(bk_write_14bytes):
	movq 6(%eax), %xmm0
	movq %xmm0, 6(%edx)
L(bk_write_6bytes):
	movl 2(%eax), %ecx
	movl %ecx, 2(%edx)
	movzwl (%eax), %ecx
	movw %cx, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_42bytes):
	movq 34(%eax), %xmm0
	movq %xmm0, 34(%edx)
L(bk_write_34bytes):
	movq 26(%eax), %xmm0
	movq %xmm0, 26(%edx)
L(bk_write_26bytes):
	movq 18(%eax), %xmm0
	movq %xmm0, 18(%edx)
L(bk_write_18bytes):
	movq 10(%eax), %xmm0
	movq %xmm0, 10(%edx)
L(bk_write_10bytes):
	movq 2(%eax), %xmm0
	movq %xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl (%eax), %ecx
	movw %cx, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_47bytes):
	movq 39(%eax), %xmm0
	movq %xmm0, 39(%edx)
L(bk_write_39bytes):
	movq 31(%eax), %xmm0
	movq %xmm0, 31(%edx)
L(bk_write_31bytes):
	movq 23(%eax), %xmm0
	movq %xmm0, 23(%edx)
L(bk_write_23bytes):
	movq 15(%eax), %xmm0
	movq %xmm0, 15(%edx)
L(bk_write_15bytes):
	movq 7(%eax), %xmm0
	movq %xmm0, 7(%edx)
L(bk_write_7bytes):
	movl 3(%eax), %ecx
	movl %ecx, 3(%edx)
	movzwl 1(%eax), %ecx
	movw %cx, 1(%edx)
	movzbl (%eax), %eax
	movb %al, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_43bytes):
	movq 35(%eax), %xmm0
	movq %xmm0, 35(%edx)
L(bk_write_35bytes):
	movq 27(%eax), %xmm0
	movq %xmm0, 27(%edx)
L(bk_write_27bytes):
	movq 19(%eax), %xmm0
	movq %xmm0, 19(%edx)
L(bk_write_19bytes):
	movq 11(%eax), %xmm0
	movq %xmm0, 11(%edx)
L(bk_write_11bytes):
	movq 3(%eax), %xmm0
	movq %xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl 1(%eax), %ecx
	movw %cx, 1(%edx)
	movzbl (%eax), %eax
	movb %al, (%edx)
#ifndef USE_AS_BCOPY
	movl DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
#endif
	RETURN_END


	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
L(table_48bytes_fwd_align):
	.int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
L(shl_table):
	.int JMPTBL (L(shl_0), L(shl_table))
	.int JMPTBL (L(shl_1), L(shl_table))
	.int JMPTBL (L(shl_2), L(shl_table))
	.int JMPTBL (L(shl_3), L(shl_table))
	.int JMPTBL (L(shl_4), L(shl_table))
	.int JMPTBL (L(shl_5), L(shl_table))
	.int JMPTBL (L(shl_6), L(shl_table))
	.int JMPTBL (L(shl_7), L(shl_table))
	.int JMPTBL (L(shl_8), L(shl_table))
	.int JMPTBL (L(shl_9), L(shl_table))
	.int JMPTBL (L(shl_10), L(shl_table))
	.int JMPTBL (L(shl_11), L(shl_table))
	.int JMPTBL (L(shl_12), L(shl_table))
	.int JMPTBL (L(shl_13), L(shl_table))
	.int JMPTBL (L(shl_14), L(shl_table))
	.int JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
L(table_48_bytes_bwd):
	.int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl %eax, %edi
	lea (%ecx,%edx,1),%edx
	lea (%ecx,%edi,1),%edi
	testl $0x3, %edx
	jnz L(bk_align)

L(bk_aligned_4):
	cmp $64, %ecx
	jae L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp $32, %ecx
	jb L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time. */
	sub $32, %ecx
	movq -8(%edi), %xmm0
	movq %xmm0, -8(%edx)
	movq -16(%edi), %xmm0
	movq %xmm0, -16(%edx)
	movq -24(%edi), %xmm0
	movq %xmm0, -24(%edx)
	movq -32(%edi), %xmm0
	movq %xmm0, -32(%edx)
	sub $32, %edx
	sub $32, %edi

L(bk_write_less32bytes):
	movl %edi, %eax
	sub %ecx, %edx
	sub %ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	cmp $8, %ecx
	jbe L(bk_write_less32bytes)
	testl $1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
	   then (EDX & 2) must be != 0.  */
	jz L(bk_got2)
	sub $1, %edi
	sub $1, %ecx
	sub $1, %edx
	movzbl (%edi), %eax
	movb %al, (%edx)

	testl $2, %edx
	jz L(bk_aligned_4)

L(bk_got2):
	sub $2, %edi
	sub $2, %ecx
	sub $2, %edx
	movzwl (%edi), %eax
	movw %ax, (%edx)
	jmp L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte. */
	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  */
L(bk_ssse3_align):
	sub $4, %edi
	sub $4, %ecx
	sub $4, %edx
	movl (%edi), %eax
	movl %eax, (%edx)

	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

	sub $4, %edi
	sub $4, %ecx
	sub $4, %edx
	movl (%edi), %eax
	movl %eax, (%edx)

	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

	sub $4, %edi
	sub $4, %ecx
	sub $4, %edx
	movl (%edi), %eax
	movl %eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp $64, %ecx
	jb L(bk_write_more32bytes)

	.p2align 4
L(bk_ssse3_cpy):
	sub $64, %edi
	sub $64, %ecx
	sub $64, %edx
	movdqu 0x30(%edi), %xmm3
	movdqa %xmm3, 0x30(%edx)
	movdqu 0x20(%edi), %xmm2
	movdqa %xmm2, 0x20(%edx)
	movdqu 0x10(%edi), %xmm1
	movdqa %xmm1, 0x10(%edx)
	movdqu (%edi), %xmm0
	movdqa %xmm0, (%edx)
	cmp $64, %ecx
	jae L(bk_ssse3_cpy)
	jmp L(bk_write_64bytesless)

#endif

END (MEMCPY)
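
/* For reference: L(table_48bytes_fwd) and L(table_48bytes_fwd_align) dispatch,
   via BRANCH_TO_JMPTBL_ENTRY, to an entry that stores exactly the remaining
   ECX bytes using offsets back from the end (both pointers were advanced by
   ECX first); L(table_48_bytes_bwd) does the same with positive offsets from
   the start of the block.  A minimal C sketch of the forward case follows.
   The helper name is hypothetical and the loops stand in for the unrolled,
   straight-line per-length entries in the real code.

   #include <string.h>

   // Copy the last n (n < 48) bytes, mirroring the forward tail entries
   // after "add %ecx, %edx; add %ecx, %eax".
   static void copy_tail_fwd(unsigned char *dst, const unsigned char *src,
                             unsigned int n)
   {
       dst += n;
       src += n;
       while (n >= 8) {              // the movq/movdqa stores at -n(%edx)
           memcpy(dst - n, src - n, 8);
           n -= 8;
       }
       while (n != 0) {              // the movl/movzwl/movzbl remainder
           *(dst - n) = *(src - n);
           n--;
       }
   }  */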