1/* 2Copyright (c) 2010, Intel Corporation 3All rights reserved. 4 5Redistribution and use in source and binary forms, with or without 6modification, are permitted provided that the following conditions are met: 7 8 * Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 11 * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 15 * Neither the name of Intel Corporation nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29*/ 30 31#define FOR_ATOM 32#include "cache.h" 33 34#ifndef MEMCPY 35# define MEMCPY memcpy_atom 36#endif 37 38#ifndef L 39# define L(label) .L##label 40#endif 41 42#ifndef cfi_startproc 43# define cfi_startproc .cfi_startproc 44#endif 45 46#ifndef cfi_endproc 47# define cfi_endproc .cfi_endproc 48#endif 49 50#ifndef cfi_rel_offset 51# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off 52#endif 53 54#ifndef cfi_restore 55# define cfi_restore(reg) .cfi_restore reg 56#endif 57 58#ifndef cfi_adjust_cfa_offset 59# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off 60#endif 61 62#ifndef ENTRY 63# define ENTRY(name) \ 64 .type name, @function; \ 65 .globl name; \ 66 .p2align 4; \ 67name: \ 68 cfi_startproc 69#endif 70 71#ifndef END 72# define END(name) \ 73 cfi_endproc; \ 74 .size name, .-name 75#endif 76 77#define DEST PARMS 78#define SRC DEST+4 79#define LEN SRC+4 80 81#define CFI_PUSH(REG) \ 82 cfi_adjust_cfa_offset (4); \ 83 cfi_rel_offset (REG, 0) 84 85#define CFI_POP(REG) \ 86 cfi_adjust_cfa_offset (-4); \ 87 cfi_restore (REG) 88 89#define PUSH(REG) pushl REG; CFI_PUSH (REG) 90#define POP(REG) popl REG; CFI_POP (REG) 91 92#if (defined SHARED || defined __PIC__) 93# define PARMS 8 /* Preserve EBX. */ 94# define ENTRANCE PUSH (%ebx); 95# define RETURN_END POP (%ebx); ret 96# define RETURN RETURN_END; CFI_PUSH (%ebx) 97# define JMPTBL(I, B) I - B 98 99# define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x 100 101/* Load an entry in a jump table into EBX and branch to it. TABLE is a 102 jump table with relative offsets. INDEX is a register contains the 103 index into the jump table. SCALE is the scale of INDEX. */ 104 105# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 106 /* We first load PC into EBX. */ \ 107 SETUP_PIC_REG(bx); \ 108 /* Get the address of the jump table. */ \ 109 addl $(TABLE - .), %ebx; \ 110 /* Get the entry and convert the relative offset to the \ 111 absolute address. 
*/ \ 112 addl (%ebx, INDEX, SCALE), %ebx; \ 113 /* We loaded the jump table. Go. */ \ 114 jmp *%ebx 115#else 116 117# define PARMS 4 118# define ENTRANCE 119# define RETURN_END ret 120# define RETURN RETURN_END 121# define JMPTBL(I, B) I 122 123/* Branch to an entry in a jump table. TABLE is a jump table with 124 absolute offsets. INDEX is a register contains the index into the 125 jump table. SCALE is the scale of INDEX. */ 126 127# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 128 jmp *TABLE(, INDEX, SCALE) 129#endif 130 131 .section .text.ssse3,"ax",@progbits 132ENTRY (MEMCPY) 133 ENTRANCE 134 movl LEN(%esp), %ecx 135 movl SRC(%esp), %eax 136 movl DEST(%esp), %edx 137 138#ifdef USE_AS_MEMMOVE 139 cmp %eax, %edx 140 jb L(copy_forward) 141 je L(fwd_write_0bytes) 142 cmp $32, %ecx 143 jae L(memmove_bwd) 144 jmp L(bk_write_less32bytes_2) 145 146 .p2align 4 147L(memmove_bwd): 148 add %ecx, %eax 149 cmp %eax, %edx 150 movl SRC(%esp), %eax 151 jb L(copy_backward) 152 153L(copy_forward): 154#endif 155 cmp $48, %ecx 156 jae L(48bytesormore) 157 158L(fwd_write_less32bytes): 159#ifndef USE_AS_MEMMOVE 160 cmp %dl, %al 161 jb L(bk_write) 162#endif 163 add %ecx, %edx 164 add %ecx, %eax 165 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 166#ifndef USE_AS_MEMMOVE 167 .p2align 4 168L(bk_write): 169 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) 170#endif 171 172 .p2align 4 173L(48bytesormore): 174#ifndef USE_AS_MEMMOVE 175 movlpd (%eax), %xmm0 176 movlpd 8(%eax), %xmm1 177 movlpd %xmm0, (%edx) 178 movlpd %xmm1, 8(%edx) 179#else 180 movdqu (%eax), %xmm0 181#endif 182 PUSH (%edi) 183 movl %edx, %edi 184 and $-16, %edx 185 add $16, %edx 186 sub %edx, %edi 187 add %edi, %ecx 188 sub %edi, %eax 189 190#ifdef SHARED_CACHE_SIZE_HALF 191 cmp $SHARED_CACHE_SIZE_HALF, %ecx 192#else 193# if (defined SHARED || defined __PIC__) 194 SETUP_PIC_REG(bx) 195 add $_GLOBAL_OFFSET_TABLE_, %ebx 196 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx 197# else 198 cmp __x86_shared_cache_size_half, %ecx 199# endif 200#endif 201 202 mov %eax, %edi 203 jae L(large_page) 204 and $0xf, %edi 205 jz L(shl_0) 206 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) 207 208 .p2align 4 209L(shl_0): 210#ifdef USE_AS_MEMMOVE 211 movl DEST+4(%esp), %edi 212 movdqu %xmm0, (%edi) 213#endif 214 xor %edi, %edi 215 cmp $127, %ecx 216 ja L(shl_0_gobble) 217 lea -32(%ecx), %ecx 218 219 .p2align 4 220L(shl_0_loop): 221 movdqa (%eax, %edi), %xmm0 222 movdqa 16(%eax, %edi), %xmm1 223 sub $32, %ecx 224 movdqa %xmm0, (%edx, %edi) 225 movdqa %xmm1, 16(%edx, %edi) 226 lea 32(%edi), %edi 227 jb L(shl_0_end) 228 229 movdqa (%eax, %edi), %xmm0 230 movdqa 16(%eax, %edi), %xmm1 231 sub $32, %ecx 232 movdqa %xmm0, (%edx, %edi) 233 movdqa %xmm1, 16(%edx, %edi) 234 lea 32(%edi), %edi 235 jb L(shl_0_end) 236 237 movdqa (%eax, %edi), %xmm0 238 movdqa 16(%eax, %edi), %xmm1 239 sub $32, %ecx 240 movdqa %xmm0, (%edx, %edi) 241 movdqa %xmm1, 16(%edx, %edi) 242 lea 32(%edi), %edi 243 jb L(shl_0_end) 244 245 movdqa (%eax, %edi), %xmm0 246 movdqa 16(%eax, %edi), %xmm1 247 sub $32, %ecx 248 movdqa %xmm0, (%edx, %edi) 249 movdqa %xmm1, 16(%edx, %edi) 250 lea 32(%edi), %edi 251 252L(shl_0_end): 253 lea 32(%ecx), %ecx 254 add %ecx, %edi 255 add %edi, %edx 256 add %edi, %eax 257 POP (%edi) 258 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 259 260 CFI_PUSH (%edi) 261 262 .p2align 4 263L(shl_0_gobble): 264#ifdef DATA_CACHE_SIZE_HALF 265 cmp $DATA_CACHE_SIZE_HALF, %ecx 266#else 267# if (defined SHARED || defined __PIC__) 268 SETUP_PIC_REG(bx) 
269 add $_GLOBAL_OFFSET_TABLE_, %ebx 270 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 271# else 272 cmp __x86_data_cache_size_half, %ecx 273# endif 274#endif 275 POP (%edi) 276 lea -128(%ecx), %ecx 277 jae L(shl_0_gobble_mem_loop) 278 279 .p2align 4 280L(shl_0_gobble_cache_loop): 281 movdqa (%eax), %xmm0 282 movdqa 0x10(%eax), %xmm1 283 movdqa 0x20(%eax), %xmm2 284 movdqa 0x30(%eax), %xmm3 285 movdqa 0x40(%eax), %xmm4 286 movdqa 0x50(%eax), %xmm5 287 movdqa 0x60(%eax), %xmm6 288 movdqa 0x70(%eax), %xmm7 289 lea 0x80(%eax), %eax 290 sub $128, %ecx 291 movdqa %xmm0, (%edx) 292 movdqa %xmm1, 0x10(%edx) 293 movdqa %xmm2, 0x20(%edx) 294 movdqa %xmm3, 0x30(%edx) 295 movdqa %xmm4, 0x40(%edx) 296 movdqa %xmm5, 0x50(%edx) 297 movdqa %xmm6, 0x60(%edx) 298 movdqa %xmm7, 0x70(%edx) 299 lea 0x80(%edx), %edx 300 301 jae L(shl_0_gobble_cache_loop) 302 cmp $-0x40, %ecx 303 lea 0x80(%ecx), %ecx 304 jl L(shl_0_cache_less_64bytes) 305 306 movdqa (%eax), %xmm0 307 sub $0x40, %ecx 308 movdqa 0x10(%eax), %xmm1 309 movdqa %xmm0, (%edx) 310 movdqa %xmm1, 0x10(%edx) 311 movdqa 0x20(%eax), %xmm0 312 movdqa 0x30(%eax), %xmm1 313 add $0x40, %eax 314 movdqa %xmm0, 0x20(%edx) 315 movdqa %xmm1, 0x30(%edx) 316 add $0x40, %edx 317 318L(shl_0_cache_less_64bytes): 319 cmp $0x20, %ecx 320 jb L(shl_0_cache_less_32bytes) 321 movdqa (%eax), %xmm0 322 sub $0x20, %ecx 323 movdqa 0x10(%eax), %xmm1 324 add $0x20, %eax 325 movdqa %xmm0, (%edx) 326 movdqa %xmm1, 0x10(%edx) 327 add $0x20, %edx 328 329L(shl_0_cache_less_32bytes): 330 cmp $0x10, %ecx 331 jb L(shl_0_cache_less_16bytes) 332 sub $0x10, %ecx 333 movdqa (%eax), %xmm0 334 add $0x10, %eax 335 movdqa %xmm0, (%edx) 336 add $0x10, %edx 337 338L(shl_0_cache_less_16bytes): 339 add %ecx, %edx 340 add %ecx, %eax 341 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 342 343 .p2align 4 344L(shl_0_gobble_mem_loop): 345 prefetcht0 0x1c0(%eax) 346 prefetcht0 0x280(%eax) 347 prefetcht0 0x1c0(%edx) 348 349 movdqa (%eax), %xmm0 350 movdqa 0x10(%eax), %xmm1 351 movdqa 0x20(%eax), %xmm2 352 movdqa 0x30(%eax), %xmm3 353 movdqa 0x40(%eax), %xmm4 354 movdqa 0x50(%eax), %xmm5 355 movdqa 0x60(%eax), %xmm6 356 movdqa 0x70(%eax), %xmm7 357 lea 0x80(%eax), %eax 358 sub $0x80, %ecx 359 movdqa %xmm0, (%edx) 360 movdqa %xmm1, 0x10(%edx) 361 movdqa %xmm2, 0x20(%edx) 362 movdqa %xmm3, 0x30(%edx) 363 movdqa %xmm4, 0x40(%edx) 364 movdqa %xmm5, 0x50(%edx) 365 movdqa %xmm6, 0x60(%edx) 366 movdqa %xmm7, 0x70(%edx) 367 lea 0x80(%edx), %edx 368 369 jae L(shl_0_gobble_mem_loop) 370 cmp $-0x40, %ecx 371 lea 0x80(%ecx), %ecx 372 jl L(shl_0_mem_less_64bytes) 373 374 movdqa (%eax), %xmm0 375 sub $0x40, %ecx 376 movdqa 0x10(%eax), %xmm1 377 378 movdqa %xmm0, (%edx) 379 movdqa %xmm1, 0x10(%edx) 380 381 movdqa 0x20(%eax), %xmm0 382 movdqa 0x30(%eax), %xmm1 383 add $0x40, %eax 384 385 movdqa %xmm0, 0x20(%edx) 386 movdqa %xmm1, 0x30(%edx) 387 add $0x40, %edx 388 389L(shl_0_mem_less_64bytes): 390 cmp $0x20, %ecx 391 jb L(shl_0_mem_less_32bytes) 392 movdqa (%eax), %xmm0 393 sub $0x20, %ecx 394 movdqa 0x10(%eax), %xmm1 395 add $0x20, %eax 396 movdqa %xmm0, (%edx) 397 movdqa %xmm1, 0x10(%edx) 398 add $0x20, %edx 399 400L(shl_0_mem_less_32bytes): 401 cmp $0x10, %ecx 402 jb L(shl_0_mem_less_16bytes) 403 sub $0x10, %ecx 404 movdqa (%eax), %xmm0 405 add $0x10, %eax 406 movdqa %xmm0, (%edx) 407 add $0x10, %edx 408 409L(shl_0_mem_less_16bytes): 410 add %ecx, %edx 411 add %ecx, %eax 412 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) 413 414 .p2align 4 415L(shl_1): 416#ifndef USE_AS_MEMMOVE 417 movaps -1(%eax), 
%xmm1 418#else 419 movl DEST+4(%esp), %edi 420 movaps -1(%eax), %xmm1 421 movdqu %xmm0, (%edi) 422#endif 423#ifdef DATA_CACHE_SIZE_HALF 424 cmp $DATA_CACHE_SIZE_HALF, %ecx 425#else 426# if (defined SHARED || defined __PIC__) 427 SETUP_PIC_REG(bx) 428 add $_GLOBAL_OFFSET_TABLE_, %ebx 429 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 430# else 431 cmp __x86_data_cache_size_half, %ecx 432# endif 433#endif 434 jb L(sh_1_no_prefetch) 435 436 lea -64(%ecx), %ecx 437 438 .p2align 4 439L(Shl1LoopStart): 440 prefetcht0 0x1c0(%eax) 441 prefetcht0 0x1c0(%edx) 442 movaps 15(%eax), %xmm2 443 movaps 31(%eax), %xmm3 444 movaps 47(%eax), %xmm4 445 movaps 63(%eax), %xmm5 446 movaps %xmm5, %xmm7 447 palignr $1, %xmm4, %xmm5 448 palignr $1, %xmm3, %xmm4 449 movaps %xmm5, 48(%edx) 450 palignr $1, %xmm2, %xmm3 451 lea 64(%eax), %eax 452 palignr $1, %xmm1, %xmm2 453 movaps %xmm4, 32(%edx) 454 movaps %xmm3, 16(%edx) 455 movaps %xmm7, %xmm1 456 movaps %xmm2, (%edx) 457 lea 64(%edx), %edx 458 sub $64, %ecx 459 ja L(Shl1LoopStart) 460 461L(Shl1LoopLeave): 462 add $32, %ecx 463 jle L(shl_end_0) 464 465 movaps 15(%eax), %xmm2 466 movaps 31(%eax), %xmm3 467 palignr $1, %xmm2, %xmm3 468 palignr $1, %xmm1, %xmm2 469 movaps %xmm2, (%edx) 470 movaps %xmm3, 16(%edx) 471 lea 32(%edx, %ecx), %edx 472 lea 32(%eax, %ecx), %eax 473 POP (%edi) 474 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 475 476 CFI_PUSH (%edi) 477 478 .p2align 4 479L(sh_1_no_prefetch): 480 lea -32(%ecx), %ecx 481 lea -1(%eax), %eax 482 xor %edi, %edi 483 484 .p2align 4 485L(sh_1_no_prefetch_loop): 486 movdqa 16(%eax, %edi), %xmm2 487 sub $32, %ecx 488 movdqa 32(%eax, %edi), %xmm3 489 movdqa %xmm3, %xmm4 490 palignr $1, %xmm2, %xmm3 491 palignr $1, %xmm1, %xmm2 492 lea 32(%edi), %edi 493 movdqa %xmm2, -32(%edx, %edi) 494 movdqa %xmm3, -16(%edx, %edi) 495 jb L(sh_1_end_no_prefetch_loop) 496 497 movdqa 16(%eax, %edi), %xmm2 498 sub $32, %ecx 499 movdqa 32(%eax, %edi), %xmm3 500 movdqa %xmm3, %xmm1 501 palignr $1, %xmm2, %xmm3 502 palignr $1, %xmm4, %xmm2 503 lea 32(%edi), %edi 504 movdqa %xmm2, -32(%edx, %edi) 505 movdqa %xmm3, -16(%edx, %edi) 506 jae L(sh_1_no_prefetch_loop) 507 508L(sh_1_end_no_prefetch_loop): 509 lea 32(%ecx), %ecx 510 add %ecx, %edi 511 add %edi, %edx 512 lea 1(%edi, %eax), %eax 513 POP (%edi) 514 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 515 516 CFI_PUSH (%edi) 517 518 .p2align 4 519L(shl_2): 520#ifndef USE_AS_MEMMOVE 521 movaps -2(%eax), %xmm1 522#else 523 movl DEST+4(%esp), %edi 524 movaps -2(%eax), %xmm1 525 movdqu %xmm0, (%edi) 526#endif 527#ifdef DATA_CACHE_SIZE_HALF 528 cmp $DATA_CACHE_SIZE_HALF, %ecx 529#else 530# if (defined SHARED || defined __PIC__) 531 SETUP_PIC_REG(bx) 532 add $_GLOBAL_OFFSET_TABLE_, %ebx 533 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 534# else 535 cmp __x86_data_cache_size_half, %ecx 536# endif 537#endif 538 jb L(sh_2_no_prefetch) 539 540 lea -64(%ecx), %ecx 541 542 .p2align 4 543L(Shl2LoopStart): 544 prefetcht0 0x1c0(%eax) 545 prefetcht0 0x1c0(%edx) 546 movaps 14(%eax), %xmm2 547 movaps 30(%eax), %xmm3 548 movaps 46(%eax), %xmm4 549 movaps 62(%eax), %xmm5 550 movaps %xmm5, %xmm7 551 palignr $2, %xmm4, %xmm5 552 palignr $2, %xmm3, %xmm4 553 movaps %xmm5, 48(%edx) 554 palignr $2, %xmm2, %xmm3 555 lea 64(%eax), %eax 556 palignr $2, %xmm1, %xmm2 557 movaps %xmm4, 32(%edx) 558 movaps %xmm3, 16(%edx) 559 movaps %xmm7, %xmm1 560 movaps %xmm2, (%edx) 561 lea 64(%edx), %edx 562 sub $64, %ecx 563 ja L(Shl2LoopStart) 564 565L(Shl2LoopLeave): 566 add $32, %ecx 567 jle L(shl_end_0) 568 569 
movaps 14(%eax), %xmm2 570 movaps 30(%eax), %xmm3 571 palignr $2, %xmm2, %xmm3 572 palignr $2, %xmm1, %xmm2 573 movaps %xmm2, (%edx) 574 movaps %xmm3, 16(%edx) 575 lea 32(%edx, %ecx), %edx 576 lea 32(%eax, %ecx), %eax 577 POP (%edi) 578 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 579 580 CFI_PUSH (%edi) 581 582 .p2align 4 583L(sh_2_no_prefetch): 584 lea -32(%ecx), %ecx 585 lea -2(%eax), %eax 586 xor %edi, %edi 587 588 .p2align 4 589L(sh_2_no_prefetch_loop): 590 movdqa 16(%eax, %edi), %xmm2 591 sub $32, %ecx 592 movdqa 32(%eax, %edi), %xmm3 593 movdqa %xmm3, %xmm4 594 palignr $2, %xmm2, %xmm3 595 palignr $2, %xmm1, %xmm2 596 lea 32(%edi), %edi 597 movdqa %xmm2, -32(%edx, %edi) 598 movdqa %xmm3, -16(%edx, %edi) 599 jb L(sh_2_end_no_prefetch_loop) 600 601 movdqa 16(%eax, %edi), %xmm2 602 sub $32, %ecx 603 movdqa 32(%eax, %edi), %xmm3 604 movdqa %xmm3, %xmm1 605 palignr $2, %xmm2, %xmm3 606 palignr $2, %xmm4, %xmm2 607 lea 32(%edi), %edi 608 movdqa %xmm2, -32(%edx, %edi) 609 movdqa %xmm3, -16(%edx, %edi) 610 jae L(sh_2_no_prefetch_loop) 611 612L(sh_2_end_no_prefetch_loop): 613 lea 32(%ecx), %ecx 614 add %ecx, %edi 615 add %edi, %edx 616 lea 2(%edi, %eax), %eax 617 POP (%edi) 618 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 619 620 CFI_PUSH (%edi) 621 622 .p2align 4 623L(shl_3): 624#ifndef USE_AS_MEMMOVE 625 movaps -3(%eax), %xmm1 626#else 627 movl DEST+4(%esp), %edi 628 movaps -3(%eax), %xmm1 629 movdqu %xmm0, (%edi) 630#endif 631#ifdef DATA_CACHE_SIZE_HALF 632 cmp $DATA_CACHE_SIZE_HALF, %ecx 633#else 634# if (defined SHARED || defined __PIC__) 635 SETUP_PIC_REG(bx) 636 add $_GLOBAL_OFFSET_TABLE_, %ebx 637 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 638# else 639 cmp __x86_data_cache_size_half, %ecx 640# endif 641#endif 642 jb L(sh_3_no_prefetch) 643 644 lea -64(%ecx), %ecx 645 646 .p2align 4 647L(Shl3LoopStart): 648 prefetcht0 0x1c0(%eax) 649 prefetcht0 0x1c0(%edx) 650 movaps 13(%eax), %xmm2 651 movaps 29(%eax), %xmm3 652 movaps 45(%eax), %xmm4 653 movaps 61(%eax), %xmm5 654 movaps %xmm5, %xmm7 655 palignr $3, %xmm4, %xmm5 656 palignr $3, %xmm3, %xmm4 657 movaps %xmm5, 48(%edx) 658 palignr $3, %xmm2, %xmm3 659 lea 64(%eax), %eax 660 palignr $3, %xmm1, %xmm2 661 movaps %xmm4, 32(%edx) 662 movaps %xmm3, 16(%edx) 663 movaps %xmm7, %xmm1 664 movaps %xmm2, (%edx) 665 lea 64(%edx), %edx 666 sub $64, %ecx 667 ja L(Shl3LoopStart) 668 669L(Shl3LoopLeave): 670 add $32, %ecx 671 jle L(shl_end_0) 672 673 movaps 13(%eax), %xmm2 674 movaps 29(%eax), %xmm3 675 palignr $3, %xmm2, %xmm3 676 palignr $3, %xmm1, %xmm2 677 movaps %xmm2, (%edx) 678 movaps %xmm3, 16(%edx) 679 lea 32(%edx, %ecx), %edx 680 lea 32(%eax, %ecx), %eax 681 POP (%edi) 682 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 683 684 CFI_PUSH (%edi) 685 686 .p2align 4 687L(sh_3_no_prefetch): 688 lea -32(%ecx), %ecx 689 lea -3(%eax), %eax 690 xor %edi, %edi 691 692 .p2align 4 693L(sh_3_no_prefetch_loop): 694 movdqa 16(%eax, %edi), %xmm2 695 sub $32, %ecx 696 movdqa 32(%eax, %edi), %xmm3 697 movdqa %xmm3, %xmm4 698 palignr $3, %xmm2, %xmm3 699 palignr $3, %xmm1, %xmm2 700 lea 32(%edi), %edi 701 movdqa %xmm2, -32(%edx, %edi) 702 movdqa %xmm3, -16(%edx, %edi) 703 704 jb L(sh_3_end_no_prefetch_loop) 705 706 movdqa 16(%eax, %edi), %xmm2 707 sub $32, %ecx 708 movdqa 32(%eax, %edi), %xmm3 709 movdqa %xmm3, %xmm1 710 palignr $3, %xmm2, %xmm3 711 palignr $3, %xmm4, %xmm2 712 lea 32(%edi), %edi 713 movdqa %xmm2, -32(%edx, %edi) 714 movdqa %xmm3, -16(%edx, %edi) 715 716 jae L(sh_3_no_prefetch_loop) 717 
718L(sh_3_end_no_prefetch_loop): 719 lea 32(%ecx), %ecx 720 add %ecx, %edi 721 add %edi, %edx 722 lea 3(%edi, %eax), %eax 723 POP (%edi) 724 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 725 726 CFI_PUSH (%edi) 727 728 .p2align 4 729L(shl_4): 730#ifndef USE_AS_MEMMOVE 731 movaps -4(%eax), %xmm1 732#else 733 movl DEST+4(%esp), %edi 734 movaps -4(%eax), %xmm1 735 movdqu %xmm0, (%edi) 736#endif 737#ifdef DATA_CACHE_SIZE_HALF 738 cmp $DATA_CACHE_SIZE_HALF, %ecx 739#else 740# if (defined SHARED || defined __PIC__) 741 SETUP_PIC_REG(bx) 742 add $_GLOBAL_OFFSET_TABLE_, %ebx 743 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 744# else 745 cmp __x86_data_cache_size_half, %ecx 746# endif 747#endif 748 jb L(sh_4_no_prefetch) 749 750 lea -64(%ecx), %ecx 751 752 .p2align 4 753L(Shl4LoopStart): 754 prefetcht0 0x1c0(%eax) 755 prefetcht0 0x1c0(%edx) 756 movaps 12(%eax), %xmm2 757 movaps 28(%eax), %xmm3 758 movaps 44(%eax), %xmm4 759 movaps 60(%eax), %xmm5 760 movaps %xmm5, %xmm7 761 palignr $4, %xmm4, %xmm5 762 palignr $4, %xmm3, %xmm4 763 movaps %xmm5, 48(%edx) 764 palignr $4, %xmm2, %xmm3 765 lea 64(%eax), %eax 766 palignr $4, %xmm1, %xmm2 767 movaps %xmm4, 32(%edx) 768 movaps %xmm3, 16(%edx) 769 movaps %xmm7, %xmm1 770 movaps %xmm2, (%edx) 771 lea 64(%edx), %edx 772 sub $64, %ecx 773 ja L(Shl4LoopStart) 774 775L(Shl4LoopLeave): 776 add $32, %ecx 777 jle L(shl_end_0) 778 779 movaps 12(%eax), %xmm2 780 movaps 28(%eax), %xmm3 781 palignr $4, %xmm2, %xmm3 782 palignr $4, %xmm1, %xmm2 783 movaps %xmm2, (%edx) 784 movaps %xmm3, 16(%edx) 785 lea 32(%edx, %ecx), %edx 786 lea 32(%eax, %ecx), %eax 787 POP (%edi) 788 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 789 790 CFI_PUSH (%edi) 791 792 .p2align 4 793L(sh_4_no_prefetch): 794 lea -32(%ecx), %ecx 795 lea -4(%eax), %eax 796 xor %edi, %edi 797 798 .p2align 4 799L(sh_4_no_prefetch_loop): 800 movdqa 16(%eax, %edi), %xmm2 801 sub $32, %ecx 802 movdqa 32(%eax, %edi), %xmm3 803 movdqa %xmm3, %xmm4 804 palignr $4, %xmm2, %xmm3 805 palignr $4, %xmm1, %xmm2 806 lea 32(%edi), %edi 807 movdqa %xmm2, -32(%edx, %edi) 808 movdqa %xmm3, -16(%edx, %edi) 809 810 jb L(sh_4_end_no_prefetch_loop) 811 812 movdqa 16(%eax, %edi), %xmm2 813 sub $32, %ecx 814 movdqa 32(%eax, %edi), %xmm3 815 movdqa %xmm3, %xmm1 816 palignr $4, %xmm2, %xmm3 817 palignr $4, %xmm4, %xmm2 818 lea 32(%edi), %edi 819 movdqa %xmm2, -32(%edx, %edi) 820 movdqa %xmm3, -16(%edx, %edi) 821 822 jae L(sh_4_no_prefetch_loop) 823 824L(sh_4_end_no_prefetch_loop): 825 lea 32(%ecx), %ecx 826 add %ecx, %edi 827 add %edi, %edx 828 lea 4(%edi, %eax), %eax 829 POP (%edi) 830 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 831 832 CFI_PUSH (%edi) 833 834 .p2align 4 835L(shl_5): 836#ifndef USE_AS_MEMMOVE 837 movaps -5(%eax), %xmm1 838#else 839 movl DEST+4(%esp), %edi 840 movaps -5(%eax), %xmm1 841 movdqu %xmm0, (%edi) 842#endif 843#ifdef DATA_CACHE_SIZE_HALF 844 cmp $DATA_CACHE_SIZE_HALF, %ecx 845#else 846# if (defined SHARED || defined __PIC__) 847 SETUP_PIC_REG(bx) 848 add $_GLOBAL_OFFSET_TABLE_, %ebx 849 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 850# else 851 cmp __x86_data_cache_size_half, %ecx 852# endif 853#endif 854 jb L(sh_5_no_prefetch) 855 856 lea -64(%ecx), %ecx 857 858 .p2align 4 859L(Shl5LoopStart): 860 prefetcht0 0x1c0(%eax) 861 prefetcht0 0x1c0(%edx) 862 movaps 11(%eax), %xmm2 863 movaps 27(%eax), %xmm3 864 movaps 43(%eax), %xmm4 865 movaps 59(%eax), %xmm5 866 movaps %xmm5, %xmm7 867 palignr $5, %xmm4, %xmm5 868 palignr $5, %xmm3, %xmm4 869 movaps %xmm5, 48(%edx) 870 palignr 
$5, %xmm2, %xmm3 871 lea 64(%eax), %eax 872 palignr $5, %xmm1, %xmm2 873 movaps %xmm4, 32(%edx) 874 movaps %xmm3, 16(%edx) 875 movaps %xmm7, %xmm1 876 movaps %xmm2, (%edx) 877 lea 64(%edx), %edx 878 sub $64, %ecx 879 ja L(Shl5LoopStart) 880 881L(Shl5LoopLeave): 882 add $32, %ecx 883 jle L(shl_end_0) 884 885 movaps 11(%eax), %xmm2 886 movaps 27(%eax), %xmm3 887 palignr $5, %xmm2, %xmm3 888 palignr $5, %xmm1, %xmm2 889 movaps %xmm2, (%edx) 890 movaps %xmm3, 16(%edx) 891 lea 32(%edx, %ecx), %edx 892 lea 32(%eax, %ecx), %eax 893 POP (%edi) 894 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 895 896 CFI_PUSH (%edi) 897 898 .p2align 4 899L(sh_5_no_prefetch): 900 lea -32(%ecx), %ecx 901 lea -5(%eax), %eax 902 xor %edi, %edi 903 904 .p2align 4 905L(sh_5_no_prefetch_loop): 906 movdqa 16(%eax, %edi), %xmm2 907 sub $32, %ecx 908 movdqa 32(%eax, %edi), %xmm3 909 movdqa %xmm3, %xmm4 910 palignr $5, %xmm2, %xmm3 911 palignr $5, %xmm1, %xmm2 912 lea 32(%edi), %edi 913 movdqa %xmm2, -32(%edx, %edi) 914 movdqa %xmm3, -16(%edx, %edi) 915 916 jb L(sh_5_end_no_prefetch_loop) 917 918 movdqa 16(%eax, %edi), %xmm2 919 sub $32, %ecx 920 movdqa 32(%eax, %edi), %xmm3 921 movdqa %xmm3, %xmm1 922 palignr $5, %xmm2, %xmm3 923 palignr $5, %xmm4, %xmm2 924 lea 32(%edi), %edi 925 movdqa %xmm2, -32(%edx, %edi) 926 movdqa %xmm3, -16(%edx, %edi) 927 928 jae L(sh_5_no_prefetch_loop) 929 930L(sh_5_end_no_prefetch_loop): 931 lea 32(%ecx), %ecx 932 add %ecx, %edi 933 add %edi, %edx 934 lea 5(%edi, %eax), %eax 935 POP (%edi) 936 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 937 938 CFI_PUSH (%edi) 939 940 .p2align 4 941L(shl_6): 942#ifndef USE_AS_MEMMOVE 943 movaps -6(%eax), %xmm1 944#else 945 movl DEST+4(%esp), %edi 946 movaps -6(%eax), %xmm1 947 movdqu %xmm0, (%edi) 948#endif 949#ifdef DATA_CACHE_SIZE_HALF 950 cmp $DATA_CACHE_SIZE_HALF, %ecx 951#else 952# if (defined SHARED || defined __PIC__) 953 SETUP_PIC_REG(bx) 954 add $_GLOBAL_OFFSET_TABLE_, %ebx 955 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 956# else 957 cmp __x86_data_cache_size_half, %ecx 958# endif 959#endif 960 jb L(sh_6_no_prefetch) 961 962 lea -64(%ecx), %ecx 963 964 .p2align 4 965L(Shl6LoopStart): 966 prefetcht0 0x1c0(%eax) 967 prefetcht0 0x1c0(%edx) 968 movaps 10(%eax), %xmm2 969 movaps 26(%eax), %xmm3 970 movaps 42(%eax), %xmm4 971 movaps 58(%eax), %xmm5 972 movaps %xmm5, %xmm7 973 palignr $6, %xmm4, %xmm5 974 palignr $6, %xmm3, %xmm4 975 movaps %xmm5, 48(%edx) 976 palignr $6, %xmm2, %xmm3 977 lea 64(%eax), %eax 978 palignr $6, %xmm1, %xmm2 979 movaps %xmm4, 32(%edx) 980 movaps %xmm3, 16(%edx) 981 movaps %xmm7, %xmm1 982 movaps %xmm2, (%edx) 983 lea 64(%edx), %edx 984 sub $64, %ecx 985 ja L(Shl6LoopStart) 986 987L(Shl6LoopLeave): 988 add $32, %ecx 989 jle L(shl_end_0) 990 991 movaps 10(%eax), %xmm2 992 movaps 26(%eax), %xmm3 993 palignr $6, %xmm2, %xmm3 994 palignr $6, %xmm1, %xmm2 995 movaps %xmm2, (%edx) 996 movaps %xmm3, 16(%edx) 997 lea 32(%edx, %ecx), %edx 998 lea 32(%eax, %ecx), %eax 999 POP (%edi) 1000 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1001 1002 CFI_PUSH (%edi) 1003 1004 .p2align 4 1005L(sh_6_no_prefetch): 1006 lea -32(%ecx), %ecx 1007 lea -6(%eax), %eax 1008 xor %edi, %edi 1009 1010 .p2align 4 1011L(sh_6_no_prefetch_loop): 1012 movdqa 16(%eax, %edi), %xmm2 1013 sub $32, %ecx 1014 movdqa 32(%eax, %edi), %xmm3 1015 movdqa %xmm3, %xmm4 1016 palignr $6, %xmm2, %xmm3 1017 palignr $6, %xmm1, %xmm2 1018 lea 32(%edi), %edi 1019 movdqa %xmm2, -32(%edx, %edi) 1020 movdqa %xmm3, -16(%edx, %edi) 1021 1022 jb 
L(sh_6_end_no_prefetch_loop) 1023 1024 movdqa 16(%eax, %edi), %xmm2 1025 sub $32, %ecx 1026 movdqa 32(%eax, %edi), %xmm3 1027 movdqa %xmm3, %xmm1 1028 palignr $6, %xmm2, %xmm3 1029 palignr $6, %xmm4, %xmm2 1030 lea 32(%edi), %edi 1031 movdqa %xmm2, -32(%edx, %edi) 1032 movdqa %xmm3, -16(%edx, %edi) 1033 1034 jae L(sh_6_no_prefetch_loop) 1035 1036L(sh_6_end_no_prefetch_loop): 1037 lea 32(%ecx), %ecx 1038 add %ecx, %edi 1039 add %edi, %edx 1040 lea 6(%edi, %eax), %eax 1041 POP (%edi) 1042 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1043 1044 CFI_PUSH (%edi) 1045 1046 .p2align 4 1047L(shl_7): 1048#ifndef USE_AS_MEMMOVE 1049 movaps -7(%eax), %xmm1 1050#else 1051 movl DEST+4(%esp), %edi 1052 movaps -7(%eax), %xmm1 1053 movdqu %xmm0, (%edi) 1054#endif 1055#ifdef DATA_CACHE_SIZE_HALF 1056 cmp $DATA_CACHE_SIZE_HALF, %ecx 1057#else 1058# if (defined SHARED || defined __PIC__) 1059 SETUP_PIC_REG(bx) 1060 add $_GLOBAL_OFFSET_TABLE_, %ebx 1061 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1062# else 1063 cmp __x86_data_cache_size_half, %ecx 1064# endif 1065#endif 1066 jb L(sh_7_no_prefetch) 1067 1068 lea -64(%ecx), %ecx 1069 1070 .p2align 4 1071L(Shl7LoopStart): 1072 prefetcht0 0x1c0(%eax) 1073 prefetcht0 0x1c0(%edx) 1074 movaps 9(%eax), %xmm2 1075 movaps 25(%eax), %xmm3 1076 movaps 41(%eax), %xmm4 1077 movaps 57(%eax), %xmm5 1078 movaps %xmm5, %xmm7 1079 palignr $7, %xmm4, %xmm5 1080 palignr $7, %xmm3, %xmm4 1081 movaps %xmm5, 48(%edx) 1082 palignr $7, %xmm2, %xmm3 1083 lea 64(%eax), %eax 1084 palignr $7, %xmm1, %xmm2 1085 movaps %xmm4, 32(%edx) 1086 movaps %xmm3, 16(%edx) 1087 movaps %xmm7, %xmm1 1088 movaps %xmm2, (%edx) 1089 lea 64(%edx), %edx 1090 sub $64, %ecx 1091 ja L(Shl7LoopStart) 1092 1093L(Shl7LoopLeave): 1094 add $32, %ecx 1095 jle L(shl_end_0) 1096 1097 movaps 9(%eax), %xmm2 1098 movaps 25(%eax), %xmm3 1099 palignr $7, %xmm2, %xmm3 1100 palignr $7, %xmm1, %xmm2 1101 movaps %xmm2, (%edx) 1102 movaps %xmm3, 16(%edx) 1103 lea 32(%edx, %ecx), %edx 1104 lea 32(%eax, %ecx), %eax 1105 POP (%edi) 1106 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1107 1108 CFI_PUSH (%edi) 1109 1110 .p2align 4 1111L(sh_7_no_prefetch): 1112 lea -32(%ecx), %ecx 1113 lea -7(%eax), %eax 1114 xor %edi, %edi 1115 1116 .p2align 4 1117L(sh_7_no_prefetch_loop): 1118 movdqa 16(%eax, %edi), %xmm2 1119 sub $32, %ecx 1120 movdqa 32(%eax, %edi), %xmm3 1121 movdqa %xmm3, %xmm4 1122 palignr $7, %xmm2, %xmm3 1123 palignr $7, %xmm1, %xmm2 1124 lea 32(%edi), %edi 1125 movdqa %xmm2, -32(%edx, %edi) 1126 movdqa %xmm3, -16(%edx, %edi) 1127 jb L(sh_7_end_no_prefetch_loop) 1128 1129 movdqa 16(%eax, %edi), %xmm2 1130 sub $32, %ecx 1131 movdqa 32(%eax, %edi), %xmm3 1132 movdqa %xmm3, %xmm1 1133 palignr $7, %xmm2, %xmm3 1134 palignr $7, %xmm4, %xmm2 1135 lea 32(%edi), %edi 1136 movdqa %xmm2, -32(%edx, %edi) 1137 movdqa %xmm3, -16(%edx, %edi) 1138 jae L(sh_7_no_prefetch_loop) 1139 1140L(sh_7_end_no_prefetch_loop): 1141 lea 32(%ecx), %ecx 1142 add %ecx, %edi 1143 add %edi, %edx 1144 lea 7(%edi, %eax), %eax 1145 POP (%edi) 1146 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1147 1148 CFI_PUSH (%edi) 1149 1150 .p2align 4 1151L(shl_8): 1152#ifndef USE_AS_MEMMOVE 1153 movaps -8(%eax), %xmm1 1154#else 1155 movl DEST+4(%esp), %edi 1156 movaps -8(%eax), %xmm1 1157 movdqu %xmm0, (%edi) 1158#endif 1159#ifdef DATA_CACHE_SIZE_HALF 1160 cmp $DATA_CACHE_SIZE_HALF, %ecx 1161#else 1162# if (defined SHARED || defined __PIC__) 1163 SETUP_PIC_REG(bx) 1164 add $_GLOBAL_OFFSET_TABLE_, %ebx 1165 cmp 
__x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1166# else 1167 cmp __x86_data_cache_size_half, %ecx 1168# endif 1169#endif 1170 jb L(sh_8_no_prefetch) 1171 1172 lea -64(%ecx), %ecx 1173 1174 .p2align 4 1175L(Shl8LoopStart): 1176 prefetcht0 0x1c0(%eax) 1177 prefetcht0 0x1c0(%edx) 1178 movaps 8(%eax), %xmm2 1179 movaps 24(%eax), %xmm3 1180 movaps 40(%eax), %xmm4 1181 movaps 56(%eax), %xmm5 1182 movaps %xmm5, %xmm7 1183 palignr $8, %xmm4, %xmm5 1184 palignr $8, %xmm3, %xmm4 1185 movaps %xmm5, 48(%edx) 1186 palignr $8, %xmm2, %xmm3 1187 lea 64(%eax), %eax 1188 palignr $8, %xmm1, %xmm2 1189 movaps %xmm4, 32(%edx) 1190 movaps %xmm3, 16(%edx) 1191 movaps %xmm7, %xmm1 1192 movaps %xmm2, (%edx) 1193 lea 64(%edx), %edx 1194 sub $64, %ecx 1195 ja L(Shl8LoopStart) 1196 1197L(LoopLeave8): 1198 add $32, %ecx 1199 jle L(shl_end_0) 1200 1201 movaps 8(%eax), %xmm2 1202 movaps 24(%eax), %xmm3 1203 palignr $8, %xmm2, %xmm3 1204 palignr $8, %xmm1, %xmm2 1205 movaps %xmm2, (%edx) 1206 movaps %xmm3, 16(%edx) 1207 lea 32(%edx, %ecx), %edx 1208 lea 32(%eax, %ecx), %eax 1209 POP (%edi) 1210 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1211 1212 CFI_PUSH (%edi) 1213 1214 .p2align 4 1215L(sh_8_no_prefetch): 1216 lea -32(%ecx), %ecx 1217 lea -8(%eax), %eax 1218 xor %edi, %edi 1219 1220 .p2align 4 1221L(sh_8_no_prefetch_loop): 1222 movdqa 16(%eax, %edi), %xmm2 1223 sub $32, %ecx 1224 movdqa 32(%eax, %edi), %xmm3 1225 movdqa %xmm3, %xmm4 1226 palignr $8, %xmm2, %xmm3 1227 palignr $8, %xmm1, %xmm2 1228 lea 32(%edi), %edi 1229 movdqa %xmm2, -32(%edx, %edi) 1230 movdqa %xmm3, -16(%edx, %edi) 1231 jb L(sh_8_end_no_prefetch_loop) 1232 1233 movdqa 16(%eax, %edi), %xmm2 1234 sub $32, %ecx 1235 movdqa 32(%eax, %edi), %xmm3 1236 movdqa %xmm3, %xmm1 1237 palignr $8, %xmm2, %xmm3 1238 palignr $8, %xmm4, %xmm2 1239 lea 32(%edi), %edi 1240 movdqa %xmm2, -32(%edx, %edi) 1241 movdqa %xmm3, -16(%edx, %edi) 1242 jae L(sh_8_no_prefetch_loop) 1243 1244L(sh_8_end_no_prefetch_loop): 1245 lea 32(%ecx), %ecx 1246 add %ecx, %edi 1247 add %edi, %edx 1248 lea 8(%edi, %eax), %eax 1249 POP (%edi) 1250 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1251 1252 CFI_PUSH (%edi) 1253 1254 .p2align 4 1255L(shl_9): 1256#ifndef USE_AS_MEMMOVE 1257 movaps -9(%eax), %xmm1 1258#else 1259 movl DEST+4(%esp), %edi 1260 movaps -9(%eax), %xmm1 1261 movdqu %xmm0, (%edi) 1262#endif 1263#ifdef DATA_CACHE_SIZE_HALF 1264 cmp $DATA_CACHE_SIZE_HALF, %ecx 1265#else 1266# if (defined SHARED || defined __PIC__) 1267 SETUP_PIC_REG(bx) 1268 add $_GLOBAL_OFFSET_TABLE_, %ebx 1269 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1270# else 1271 cmp __x86_data_cache_size_half, %ecx 1272# endif 1273#endif 1274 jb L(sh_9_no_prefetch) 1275 1276 lea -64(%ecx), %ecx 1277 1278 .p2align 4 1279L(Shl9LoopStart): 1280 prefetcht0 0x1c0(%eax) 1281 prefetcht0 0x1c0(%edx) 1282 movaps 7(%eax), %xmm2 1283 movaps 23(%eax), %xmm3 1284 movaps 39(%eax), %xmm4 1285 movaps 55(%eax), %xmm5 1286 movaps %xmm5, %xmm7 1287 palignr $9, %xmm4, %xmm5 1288 palignr $9, %xmm3, %xmm4 1289 movaps %xmm5, 48(%edx) 1290 palignr $9, %xmm2, %xmm3 1291 lea 64(%eax), %eax 1292 palignr $9, %xmm1, %xmm2 1293 movaps %xmm4, 32(%edx) 1294 movaps %xmm3, 16(%edx) 1295 movaps %xmm7, %xmm1 1296 movaps %xmm2, (%edx) 1297 lea 64(%edx), %edx 1298 sub $64, %ecx 1299 ja L(Shl9LoopStart) 1300 1301L(Shl9LoopLeave): 1302 add $32, %ecx 1303 jle L(shl_end_0) 1304 1305 movaps 7(%eax), %xmm2 1306 movaps 23(%eax), %xmm3 1307 palignr $9, %xmm2, %xmm3 1308 palignr $9, %xmm1, %xmm2 1309 1310 movaps %xmm2, (%edx) 1311 movaps 
%xmm3, 16(%edx) 1312 lea 32(%edx, %ecx), %edx 1313 lea 32(%eax, %ecx), %eax 1314 POP (%edi) 1315 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1316 1317 CFI_PUSH (%edi) 1318 1319 .p2align 4 1320L(sh_9_no_prefetch): 1321 lea -32(%ecx), %ecx 1322 lea -9(%eax), %eax 1323 xor %edi, %edi 1324 1325 .p2align 4 1326L(sh_9_no_prefetch_loop): 1327 movdqa 16(%eax, %edi), %xmm2 1328 sub $32, %ecx 1329 movdqa 32(%eax, %edi), %xmm3 1330 movdqa %xmm3, %xmm4 1331 palignr $9, %xmm2, %xmm3 1332 palignr $9, %xmm1, %xmm2 1333 lea 32(%edi), %edi 1334 movdqa %xmm2, -32(%edx, %edi) 1335 movdqa %xmm3, -16(%edx, %edi) 1336 jb L(sh_9_end_no_prefetch_loop) 1337 1338 movdqa 16(%eax, %edi), %xmm2 1339 sub $32, %ecx 1340 movdqa 32(%eax, %edi), %xmm3 1341 movdqa %xmm3, %xmm1 1342 palignr $9, %xmm2, %xmm3 1343 palignr $9, %xmm4, %xmm2 1344 lea 32(%edi), %edi 1345 movdqa %xmm2, -32(%edx, %edi) 1346 movdqa %xmm3, -16(%edx, %edi) 1347 jae L(sh_9_no_prefetch_loop) 1348 1349L(sh_9_end_no_prefetch_loop): 1350 lea 32(%ecx), %ecx 1351 add %ecx, %edi 1352 add %edi, %edx 1353 lea 9(%edi, %eax), %eax 1354 POP (%edi) 1355 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1356 1357 CFI_PUSH (%edi) 1358 1359 .p2align 4 1360L(shl_10): 1361#ifndef USE_AS_MEMMOVE 1362 movaps -10(%eax), %xmm1 1363#else 1364 movl DEST+4(%esp), %edi 1365 movaps -10(%eax), %xmm1 1366 movdqu %xmm0, (%edi) 1367#endif 1368#ifdef DATA_CACHE_SIZE_HALF 1369 cmp $DATA_CACHE_SIZE_HALF, %ecx 1370#else 1371# if (defined SHARED || defined __PIC__) 1372 SETUP_PIC_REG(bx) 1373 add $_GLOBAL_OFFSET_TABLE_, %ebx 1374 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1375# else 1376 cmp __x86_data_cache_size_half, %ecx 1377# endif 1378#endif 1379 jb L(sh_10_no_prefetch) 1380 1381 lea -64(%ecx), %ecx 1382 1383 .p2align 4 1384L(Shl10LoopStart): 1385 prefetcht0 0x1c0(%eax) 1386 prefetcht0 0x1c0(%edx) 1387 movaps 6(%eax), %xmm2 1388 movaps 22(%eax), %xmm3 1389 movaps 38(%eax), %xmm4 1390 movaps 54(%eax), %xmm5 1391 movaps %xmm5, %xmm7 1392 palignr $10, %xmm4, %xmm5 1393 palignr $10, %xmm3, %xmm4 1394 movaps %xmm5, 48(%edx) 1395 palignr $10, %xmm2, %xmm3 1396 lea 64(%eax), %eax 1397 palignr $10, %xmm1, %xmm2 1398 movaps %xmm4, 32(%edx) 1399 movaps %xmm3, 16(%edx) 1400 movaps %xmm7, %xmm1 1401 movaps %xmm2, (%edx) 1402 lea 64(%edx), %edx 1403 sub $64, %ecx 1404 ja L(Shl10LoopStart) 1405 1406L(Shl10LoopLeave): 1407 add $32, %ecx 1408 jle L(shl_end_0) 1409 1410 movaps 6(%eax), %xmm2 1411 movaps 22(%eax), %xmm3 1412 palignr $10, %xmm2, %xmm3 1413 palignr $10, %xmm1, %xmm2 1414 1415 movaps %xmm2, (%edx) 1416 movaps %xmm3, 16(%edx) 1417 lea 32(%edx, %ecx), %edx 1418 lea 32(%eax, %ecx), %eax 1419 POP (%edi) 1420 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1421 1422 CFI_PUSH (%edi) 1423 1424 .p2align 4 1425L(sh_10_no_prefetch): 1426 lea -32(%ecx), %ecx 1427 lea -10(%eax), %eax 1428 xor %edi, %edi 1429 1430 .p2align 4 1431L(sh_10_no_prefetch_loop): 1432 movdqa 16(%eax, %edi), %xmm2 1433 sub $32, %ecx 1434 movdqa 32(%eax, %edi), %xmm3 1435 movdqa %xmm3, %xmm4 1436 palignr $10, %xmm2, %xmm3 1437 palignr $10, %xmm1, %xmm2 1438 lea 32(%edi), %edi 1439 movdqa %xmm2, -32(%edx, %edi) 1440 movdqa %xmm3, -16(%edx, %edi) 1441 jb L(sh_10_end_no_prefetch_loop) 1442 1443 movdqa 16(%eax, %edi), %xmm2 1444 sub $32, %ecx 1445 movdqa 32(%eax, %edi), %xmm3 1446 movdqa %xmm3, %xmm1 1447 palignr $10, %xmm2, %xmm3 1448 palignr $10, %xmm4, %xmm2 1449 lea 32(%edi), %edi 1450 movdqa %xmm2, -32(%edx, %edi) 1451 movdqa %xmm3, -16(%edx, %edi) 1452 jae L(sh_10_no_prefetch_loop) 1453 
1454L(sh_10_end_no_prefetch_loop): 1455 lea 32(%ecx), %ecx 1456 add %ecx, %edi 1457 add %edi, %edx 1458 lea 10(%edi, %eax), %eax 1459 POP (%edi) 1460 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1461 1462 CFI_PUSH (%edi) 1463 1464 .p2align 4 1465L(shl_11): 1466#ifndef USE_AS_MEMMOVE 1467 movaps -11(%eax), %xmm1 1468#else 1469 movl DEST+4(%esp), %edi 1470 movaps -11(%eax), %xmm1 1471 movdqu %xmm0, (%edi) 1472#endif 1473#ifdef DATA_CACHE_SIZE_HALF 1474 cmp $DATA_CACHE_SIZE_HALF, %ecx 1475#else 1476# if (defined SHARED || defined __PIC__) 1477 SETUP_PIC_REG(bx) 1478 add $_GLOBAL_OFFSET_TABLE_, %ebx 1479 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1480# else 1481 cmp __x86_data_cache_size_half, %ecx 1482# endif 1483#endif 1484 jb L(sh_11_no_prefetch) 1485 1486 lea -64(%ecx), %ecx 1487 1488 .p2align 4 1489L(Shl11LoopStart): 1490 prefetcht0 0x1c0(%eax) 1491 prefetcht0 0x1c0(%edx) 1492 movaps 5(%eax), %xmm2 1493 movaps 21(%eax), %xmm3 1494 movaps 37(%eax), %xmm4 1495 movaps 53(%eax), %xmm5 1496 movaps %xmm5, %xmm7 1497 palignr $11, %xmm4, %xmm5 1498 palignr $11, %xmm3, %xmm4 1499 movaps %xmm5, 48(%edx) 1500 palignr $11, %xmm2, %xmm3 1501 lea 64(%eax), %eax 1502 palignr $11, %xmm1, %xmm2 1503 movaps %xmm4, 32(%edx) 1504 movaps %xmm3, 16(%edx) 1505 movaps %xmm7, %xmm1 1506 movaps %xmm2, (%edx) 1507 lea 64(%edx), %edx 1508 sub $64, %ecx 1509 ja L(Shl11LoopStart) 1510 1511L(Shl11LoopLeave): 1512 add $32, %ecx 1513 jle L(shl_end_0) 1514 1515 movaps 5(%eax), %xmm2 1516 movaps 21(%eax), %xmm3 1517 palignr $11, %xmm2, %xmm3 1518 palignr $11, %xmm1, %xmm2 1519 1520 movaps %xmm2, (%edx) 1521 movaps %xmm3, 16(%edx) 1522 lea 32(%edx, %ecx), %edx 1523 lea 32(%eax, %ecx), %eax 1524 POP (%edi) 1525 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1526 1527 CFI_PUSH (%edi) 1528 1529 .p2align 4 1530L(sh_11_no_prefetch): 1531 lea -32(%ecx), %ecx 1532 lea -11(%eax), %eax 1533 xor %edi, %edi 1534 1535 .p2align 4 1536L(sh_11_no_prefetch_loop): 1537 movdqa 16(%eax, %edi), %xmm2 1538 sub $32, %ecx 1539 movdqa 32(%eax, %edi), %xmm3 1540 movdqa %xmm3, %xmm4 1541 palignr $11, %xmm2, %xmm3 1542 palignr $11, %xmm1, %xmm2 1543 lea 32(%edi), %edi 1544 movdqa %xmm2, -32(%edx, %edi) 1545 movdqa %xmm3, -16(%edx, %edi) 1546 jb L(sh_11_end_no_prefetch_loop) 1547 1548 movdqa 16(%eax, %edi), %xmm2 1549 sub $32, %ecx 1550 movdqa 32(%eax, %edi), %xmm3 1551 movdqa %xmm3, %xmm1 1552 palignr $11, %xmm2, %xmm3 1553 palignr $11, %xmm4, %xmm2 1554 lea 32(%edi), %edi 1555 movdqa %xmm2, -32(%edx, %edi) 1556 movdqa %xmm3, -16(%edx, %edi) 1557 jae L(sh_11_no_prefetch_loop) 1558 1559L(sh_11_end_no_prefetch_loop): 1560 lea 32(%ecx), %ecx 1561 add %ecx, %edi 1562 add %edi, %edx 1563 lea 11(%edi, %eax), %eax 1564 POP (%edi) 1565 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1566 1567 CFI_PUSH (%edi) 1568 1569 .p2align 4 1570L(shl_12): 1571#ifndef USE_AS_MEMMOVE 1572 movaps -12(%eax), %xmm1 1573#else 1574 movl DEST+4(%esp), %edi 1575 movaps -12(%eax), %xmm1 1576 movdqu %xmm0, (%edi) 1577#endif 1578#ifdef DATA_CACHE_SIZE_HALF 1579 cmp $DATA_CACHE_SIZE_HALF, %ecx 1580#else 1581# if (defined SHARED || defined __PIC__) 1582 SETUP_PIC_REG(bx) 1583 add $_GLOBAL_OFFSET_TABLE_, %ebx 1584 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1585# else 1586 cmp __x86_data_cache_size_half, %ecx 1587# endif 1588#endif 1589 jb L(sh_12_no_prefetch) 1590 1591 lea -64(%ecx), %ecx 1592 1593 .p2align 4 1594L(Shl12LoopStart): 1595 prefetcht0 0x1c0(%eax) 1596 prefetcht0 0x1c0(%edx) 1597 movaps 4(%eax), %xmm2 1598 movaps 20(%eax), %xmm3 1599 
movaps 36(%eax), %xmm4 1600 movaps 52(%eax), %xmm5 1601 movaps %xmm5, %xmm7 1602 palignr $12, %xmm4, %xmm5 1603 palignr $12, %xmm3, %xmm4 1604 movaps %xmm5, 48(%edx) 1605 palignr $12, %xmm2, %xmm3 1606 lea 64(%eax), %eax 1607 palignr $12, %xmm1, %xmm2 1608 movaps %xmm4, 32(%edx) 1609 movaps %xmm3, 16(%edx) 1610 movaps %xmm7, %xmm1 1611 movaps %xmm2, (%edx) 1612 lea 64(%edx), %edx 1613 sub $64, %ecx 1614 ja L(Shl12LoopStart) 1615 1616L(Shl12LoopLeave): 1617 add $32, %ecx 1618 jle L(shl_end_0) 1619 1620 movaps 4(%eax), %xmm2 1621 movaps 20(%eax), %xmm3 1622 palignr $12, %xmm2, %xmm3 1623 palignr $12, %xmm1, %xmm2 1624 1625 movaps %xmm2, (%edx) 1626 movaps %xmm3, 16(%edx) 1627 lea 32(%edx, %ecx), %edx 1628 lea 32(%eax, %ecx), %eax 1629 POP (%edi) 1630 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1631 1632 CFI_PUSH (%edi) 1633 1634 .p2align 4 1635L(sh_12_no_prefetch): 1636 lea -32(%ecx), %ecx 1637 lea -12(%eax), %eax 1638 xor %edi, %edi 1639 1640 .p2align 4 1641L(sh_12_no_prefetch_loop): 1642 movdqa 16(%eax, %edi), %xmm2 1643 sub $32, %ecx 1644 movdqa 32(%eax, %edi), %xmm3 1645 movdqa %xmm3, %xmm4 1646 palignr $12, %xmm2, %xmm3 1647 palignr $12, %xmm1, %xmm2 1648 lea 32(%edi), %edi 1649 movdqa %xmm2, -32(%edx, %edi) 1650 movdqa %xmm3, -16(%edx, %edi) 1651 jb L(sh_12_end_no_prefetch_loop) 1652 1653 movdqa 16(%eax, %edi), %xmm2 1654 sub $32, %ecx 1655 movdqa 32(%eax, %edi), %xmm3 1656 movdqa %xmm3, %xmm1 1657 palignr $12, %xmm2, %xmm3 1658 palignr $12, %xmm4, %xmm2 1659 lea 32(%edi), %edi 1660 movdqa %xmm2, -32(%edx, %edi) 1661 movdqa %xmm3, -16(%edx, %edi) 1662 jae L(sh_12_no_prefetch_loop) 1663 1664L(sh_12_end_no_prefetch_loop): 1665 lea 32(%ecx), %ecx 1666 add %ecx, %edi 1667 add %edi, %edx 1668 lea 12(%edi, %eax), %eax 1669 POP (%edi) 1670 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1671 1672 CFI_PUSH (%edi) 1673 1674 .p2align 4 1675L(shl_13): 1676#ifndef USE_AS_MEMMOVE 1677 movaps -13(%eax), %xmm1 1678#else 1679 movl DEST+4(%esp), %edi 1680 movaps -13(%eax), %xmm1 1681 movdqu %xmm0, (%edi) 1682#endif 1683#ifdef DATA_CACHE_SIZE_HALF 1684 cmp $DATA_CACHE_SIZE_HALF, %ecx 1685#else 1686# if (defined SHARED || defined __PIC__) 1687 SETUP_PIC_REG(bx) 1688 add $_GLOBAL_OFFSET_TABLE_, %ebx 1689 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1690# else 1691 cmp __x86_data_cache_size_half, %ecx 1692# endif 1693#endif 1694 jb L(sh_13_no_prefetch) 1695 1696 lea -64(%ecx), %ecx 1697 1698 .p2align 4 1699L(Shl13LoopStart): 1700 prefetcht0 0x1c0(%eax) 1701 prefetcht0 0x1c0(%edx) 1702 movaps 3(%eax), %xmm2 1703 movaps 19(%eax), %xmm3 1704 movaps 35(%eax), %xmm4 1705 movaps 51(%eax), %xmm5 1706 movaps %xmm5, %xmm7 1707 palignr $13, %xmm4, %xmm5 1708 palignr $13, %xmm3, %xmm4 1709 movaps %xmm5, 48(%edx) 1710 palignr $13, %xmm2, %xmm3 1711 lea 64(%eax), %eax 1712 palignr $13, %xmm1, %xmm2 1713 movaps %xmm4, 32(%edx) 1714 movaps %xmm3, 16(%edx) 1715 movaps %xmm7, %xmm1 1716 movaps %xmm2, (%edx) 1717 lea 64(%edx), %edx 1718 sub $64, %ecx 1719 ja L(Shl13LoopStart) 1720 1721L(Shl13LoopLeave): 1722 add $32, %ecx 1723 jle L(shl_end_0) 1724 1725 movaps 3(%eax), %xmm2 1726 movaps 19(%eax), %xmm3 1727 palignr $13, %xmm2, %xmm3 1728 palignr $13, %xmm1, %xmm2 1729 1730 movaps %xmm2, (%edx) 1731 movaps %xmm3, 16(%edx) 1732 lea 32(%edx, %ecx), %edx 1733 lea 32(%eax, %ecx), %eax 1734 POP (%edi) 1735 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1736 1737 CFI_PUSH (%edi) 1738 1739 .p2align 4 1740L(sh_13_no_prefetch): 1741 lea -32(%ecx), %ecx 1742 lea -13(%eax), %eax 1743 xor %edi, %edi 1744 1745 
.p2align 4 1746L(sh_13_no_prefetch_loop): 1747 movdqa 16(%eax, %edi), %xmm2 1748 sub $32, %ecx 1749 movdqa 32(%eax, %edi), %xmm3 1750 movdqa %xmm3, %xmm4 1751 palignr $13, %xmm2, %xmm3 1752 palignr $13, %xmm1, %xmm2 1753 lea 32(%edi), %edi 1754 movdqa %xmm2, -32(%edx, %edi) 1755 movdqa %xmm3, -16(%edx, %edi) 1756 jb L(sh_13_end_no_prefetch_loop) 1757 1758 movdqa 16(%eax, %edi), %xmm2 1759 sub $32, %ecx 1760 movdqa 32(%eax, %edi), %xmm3 1761 movdqa %xmm3, %xmm1 1762 palignr $13, %xmm2, %xmm3 1763 palignr $13, %xmm4, %xmm2 1764 lea 32(%edi), %edi 1765 movdqa %xmm2, -32(%edx, %edi) 1766 movdqa %xmm3, -16(%edx, %edi) 1767 jae L(sh_13_no_prefetch_loop) 1768 1769L(sh_13_end_no_prefetch_loop): 1770 lea 32(%ecx), %ecx 1771 add %ecx, %edi 1772 add %edi, %edx 1773 lea 13(%edi, %eax), %eax 1774 POP (%edi) 1775 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1776 1777 CFI_PUSH (%edi) 1778 1779 .p2align 4 1780L(shl_14): 1781#ifndef USE_AS_MEMMOVE 1782 movaps -14(%eax), %xmm1 1783#else 1784 movl DEST+4(%esp), %edi 1785 movaps -14(%eax), %xmm1 1786 movdqu %xmm0, (%edi) 1787#endif 1788#ifdef DATA_CACHE_SIZE_HALF 1789 cmp $DATA_CACHE_SIZE_HALF, %ecx 1790#else 1791# if (defined SHARED || defined __PIC__) 1792 SETUP_PIC_REG(bx) 1793 add $_GLOBAL_OFFSET_TABLE_, %ebx 1794 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1795# else 1796 cmp __x86_data_cache_size_half, %ecx 1797# endif 1798#endif 1799 jb L(sh_14_no_prefetch) 1800 1801 lea -64(%ecx), %ecx 1802 1803 .p2align 4 1804L(Shl14LoopStart): 1805 prefetcht0 0x1c0(%eax) 1806 prefetcht0 0x1c0(%edx) 1807 movaps 2(%eax), %xmm2 1808 movaps 18(%eax), %xmm3 1809 movaps 34(%eax), %xmm4 1810 movaps 50(%eax), %xmm5 1811 movaps %xmm5, %xmm7 1812 palignr $14, %xmm4, %xmm5 1813 palignr $14, %xmm3, %xmm4 1814 movaps %xmm5, 48(%edx) 1815 palignr $14, %xmm2, %xmm3 1816 lea 64(%eax), %eax 1817 palignr $14, %xmm1, %xmm2 1818 movaps %xmm4, 32(%edx) 1819 movaps %xmm3, 16(%edx) 1820 movaps %xmm7, %xmm1 1821 movaps %xmm2, (%edx) 1822 lea 64(%edx), %edx 1823 sub $64, %ecx 1824 ja L(Shl14LoopStart) 1825 1826L(Shl14LoopLeave): 1827 add $32, %ecx 1828 jle L(shl_end_0) 1829 1830 movaps 2(%eax), %xmm2 1831 movaps 18(%eax), %xmm3 1832 palignr $14, %xmm2, %xmm3 1833 palignr $14, %xmm1, %xmm2 1834 1835 movaps %xmm2, (%edx) 1836 movaps %xmm3, 16(%edx) 1837 lea 32(%edx, %ecx), %edx 1838 lea 32(%eax, %ecx), %eax 1839 POP (%edi) 1840 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1841 1842 CFI_PUSH (%edi) 1843 1844 .p2align 4 1845L(sh_14_no_prefetch): 1846 lea -32(%ecx), %ecx 1847 lea -14(%eax), %eax 1848 xor %edi, %edi 1849 1850 .p2align 4 1851L(sh_14_no_prefetch_loop): 1852 movdqa 16(%eax, %edi), %xmm2 1853 sub $32, %ecx 1854 movdqa 32(%eax, %edi), %xmm3 1855 movdqa %xmm3, %xmm4 1856 palignr $14, %xmm2, %xmm3 1857 palignr $14, %xmm1, %xmm2 1858 lea 32(%edi), %edi 1859 movdqa %xmm2, -32(%edx, %edi) 1860 movdqa %xmm3, -16(%edx, %edi) 1861 jb L(sh_14_end_no_prefetch_loop) 1862 1863 movdqa 16(%eax, %edi), %xmm2 1864 sub $32, %ecx 1865 movdqa 32(%eax, %edi), %xmm3 1866 movdqa %xmm3, %xmm1 1867 palignr $14, %xmm2, %xmm3 1868 palignr $14, %xmm4, %xmm2 1869 lea 32(%edi), %edi 1870 movdqa %xmm2, -32(%edx, %edi) 1871 movdqa %xmm3, -16(%edx, %edi) 1872 jae L(sh_14_no_prefetch_loop) 1873 1874L(sh_14_end_no_prefetch_loop): 1875 lea 32(%ecx), %ecx 1876 add %ecx, %edi 1877 add %edi, %edx 1878 lea 14(%edi, %eax), %eax 1879 POP (%edi) 1880 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1881 1882 CFI_PUSH (%edi) 1883 1884 .p2align 4 1885L(shl_15): 1886#ifndef USE_AS_MEMMOVE 1887 movaps 
-15(%eax), %xmm1 1888#else 1889 movl DEST+4(%esp), %edi 1890 movaps -15(%eax), %xmm1 1891 movdqu %xmm0, (%edi) 1892#endif 1893#ifdef DATA_CACHE_SIZE_HALF 1894 cmp $DATA_CACHE_SIZE_HALF, %ecx 1895#else 1896# if (defined SHARED || defined __PIC__) 1897 SETUP_PIC_REG(bx) 1898 add $_GLOBAL_OFFSET_TABLE_, %ebx 1899 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1900# else 1901 cmp __x86_data_cache_size_half, %ecx 1902# endif 1903#endif 1904 jb L(sh_15_no_prefetch) 1905 1906 lea -64(%ecx), %ecx 1907 1908 .p2align 4 1909L(Shl15LoopStart): 1910 prefetcht0 0x1c0(%eax) 1911 prefetcht0 0x1c0(%edx) 1912 movaps 1(%eax), %xmm2 1913 movaps 17(%eax), %xmm3 1914 movaps 33(%eax), %xmm4 1915 movaps 49(%eax), %xmm5 1916 movaps %xmm5, %xmm7 1917 palignr $15, %xmm4, %xmm5 1918 palignr $15, %xmm3, %xmm4 1919 movaps %xmm5, 48(%edx) 1920 palignr $15, %xmm2, %xmm3 1921 lea 64(%eax), %eax 1922 palignr $15, %xmm1, %xmm2 1923 movaps %xmm4, 32(%edx) 1924 movaps %xmm3, 16(%edx) 1925 movaps %xmm7, %xmm1 1926 movaps %xmm2, (%edx) 1927 lea 64(%edx), %edx 1928 sub $64, %ecx 1929 ja L(Shl15LoopStart) 1930 1931L(Shl15LoopLeave): 1932 add $32, %ecx 1933 jle L(shl_end_0) 1934 1935 movaps 1(%eax), %xmm2 1936 movaps 17(%eax), %xmm3 1937 palignr $15, %xmm2, %xmm3 1938 palignr $15, %xmm1, %xmm2 1939 1940 movaps %xmm2, (%edx) 1941 movaps %xmm3, 16(%edx) 1942 lea 32(%edx, %ecx), %edx 1943 lea 32(%eax, %ecx), %eax 1944 POP (%edi) 1945 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1946 1947 CFI_PUSH (%edi) 1948 1949 .p2align 4 1950L(sh_15_no_prefetch): 1951 lea -32(%ecx), %ecx 1952 lea -15(%eax), %eax 1953 xor %edi, %edi 1954 1955 .p2align 4 1956L(sh_15_no_prefetch_loop): 1957 movdqa 16(%eax, %edi), %xmm2 1958 sub $32, %ecx 1959 movdqa 32(%eax, %edi), %xmm3 1960 movdqa %xmm3, %xmm4 1961 palignr $15, %xmm2, %xmm3 1962 palignr $15, %xmm1, %xmm2 1963 lea 32(%edi), %edi 1964 movdqa %xmm2, -32(%edx, %edi) 1965 movdqa %xmm3, -16(%edx, %edi) 1966 jb L(sh_15_end_no_prefetch_loop) 1967 1968 movdqa 16(%eax, %edi), %xmm2 1969 sub $32, %ecx 1970 movdqa 32(%eax, %edi), %xmm3 1971 movdqa %xmm3, %xmm1 1972 palignr $15, %xmm2, %xmm3 1973 palignr $15, %xmm4, %xmm2 1974 lea 32(%edi), %edi 1975 movdqa %xmm2, -32(%edx, %edi) 1976 movdqa %xmm3, -16(%edx, %edi) 1977 jae L(sh_15_no_prefetch_loop) 1978 1979L(sh_15_end_no_prefetch_loop): 1980 lea 32(%ecx), %ecx 1981 add %ecx, %edi 1982 add %edi, %edx 1983 lea 15(%edi, %eax), %eax 1984 POP (%edi) 1985 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1986 1987 CFI_PUSH (%edi) 1988 1989 .p2align 4 1990L(shl_end_0): 1991 lea 32(%ecx), %ecx 1992 lea (%edx, %ecx), %edx 1993 lea (%eax, %ecx), %eax 1994 POP (%edi) 1995 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1996 1997 .p2align 4 1998L(fwd_write_44bytes): 1999 movq -44(%eax), %xmm0 2000 movq %xmm0, -44(%edx) 2001L(fwd_write_36bytes): 2002 movq -36(%eax), %xmm0 2003 movq %xmm0, -36(%edx) 2004L(fwd_write_28bytes): 2005 movq -28(%eax), %xmm0 2006 movq %xmm0, -28(%edx) 2007L(fwd_write_20bytes): 2008 movq -20(%eax), %xmm0 2009 movq %xmm0, -20(%edx) 2010L(fwd_write_12bytes): 2011 movq -12(%eax), %xmm0 2012 movq %xmm0, -12(%edx) 2013L(fwd_write_4bytes): 2014 movl -4(%eax), %ecx 2015 movl %ecx, -4(%edx) 2016#ifdef USE_AS_MEMPCPY 2017 movl %edx, %eax 2018#else 2019 movl DEST(%esp), %eax 2020#endif 2021 RETURN 2022 2023 .p2align 4 2024L(fwd_write_40bytes): 2025 movq -40(%eax), %xmm0 2026 movq %xmm0, -40(%edx) 2027L(fwd_write_32bytes): 2028 movq -32(%eax), %xmm0 2029 movq %xmm0, -32(%edx) 2030L(fwd_write_24bytes): 2031 movq -24(%eax), %xmm0 2032 movq 
%xmm0, -24(%edx) 2033L(fwd_write_16bytes): 2034 movq -16(%eax), %xmm0 2035 movq %xmm0, -16(%edx) 2036L(fwd_write_8bytes): 2037 movq -8(%eax), %xmm0 2038 movq %xmm0, -8(%edx) 2039L(fwd_write_0bytes): 2040#ifdef USE_AS_MEMPCPY 2041 movl %edx, %eax 2042#else 2043 movl DEST(%esp), %eax 2044#endif 2045 RETURN 2046 2047 .p2align 4 2048L(fwd_write_5bytes): 2049 movl -5(%eax), %ecx 2050 movl -4(%eax), %eax 2051 movl %ecx, -5(%edx) 2052 movl %eax, -4(%edx) 2053#ifdef USE_AS_MEMPCPY 2054 movl %edx, %eax 2055#else 2056 movl DEST(%esp), %eax 2057#endif 2058 RETURN 2059 2060 .p2align 4 2061L(fwd_write_45bytes): 2062 movq -45(%eax), %xmm0 2063 movq %xmm0, -45(%edx) 2064L(fwd_write_37bytes): 2065 movq -37(%eax), %xmm0 2066 movq %xmm0, -37(%edx) 2067L(fwd_write_29bytes): 2068 movq -29(%eax), %xmm0 2069 movq %xmm0, -29(%edx) 2070L(fwd_write_21bytes): 2071 movq -21(%eax), %xmm0 2072 movq %xmm0, -21(%edx) 2073L(fwd_write_13bytes): 2074 movq -13(%eax), %xmm0 2075 movq %xmm0, -13(%edx) 2076 movl -5(%eax), %ecx 2077 movl %ecx, -5(%edx) 2078 movzbl -1(%eax), %ecx 2079 movb %cl, -1(%edx) 2080#ifdef USE_AS_MEMPCPY 2081 movl %edx, %eax 2082#else 2083 movl DEST(%esp), %eax 2084#endif 2085 RETURN 2086 2087 .p2align 4 2088L(fwd_write_41bytes): 2089 movq -41(%eax), %xmm0 2090 movq %xmm0, -41(%edx) 2091L(fwd_write_33bytes): 2092 movq -33(%eax), %xmm0 2093 movq %xmm0, -33(%edx) 2094L(fwd_write_25bytes): 2095 movq -25(%eax), %xmm0 2096 movq %xmm0, -25(%edx) 2097L(fwd_write_17bytes): 2098 movq -17(%eax), %xmm0 2099 movq %xmm0, -17(%edx) 2100L(fwd_write_9bytes): 2101 movq -9(%eax), %xmm0 2102 movq %xmm0, -9(%edx) 2103L(fwd_write_1bytes): 2104 movzbl -1(%eax), %ecx 2105 movb %cl, -1(%edx) 2106#ifdef USE_AS_MEMPCPY 2107 movl %edx, %eax 2108#else 2109 movl DEST(%esp), %eax 2110#endif 2111 RETURN 2112 2113 .p2align 4 2114L(fwd_write_46bytes): 2115 movq -46(%eax), %xmm0 2116 movq %xmm0, -46(%edx) 2117L(fwd_write_38bytes): 2118 movq -38(%eax), %xmm0 2119 movq %xmm0, -38(%edx) 2120L(fwd_write_30bytes): 2121 movq -30(%eax), %xmm0 2122 movq %xmm0, -30(%edx) 2123L(fwd_write_22bytes): 2124 movq -22(%eax), %xmm0 2125 movq %xmm0, -22(%edx) 2126L(fwd_write_14bytes): 2127 movq -14(%eax), %xmm0 2128 movq %xmm0, -14(%edx) 2129L(fwd_write_6bytes): 2130 movl -6(%eax), %ecx 2131 movl %ecx, -6(%edx) 2132 movzwl -2(%eax), %ecx 2133 movw %cx, -2(%edx) 2134#ifdef USE_AS_MEMPCPY 2135 movl %edx, %eax 2136#else 2137 movl DEST(%esp), %eax 2138#endif 2139 RETURN 2140 2141 .p2align 4 2142L(fwd_write_42bytes): 2143 movq -42(%eax), %xmm0 2144 movq %xmm0, -42(%edx) 2145L(fwd_write_34bytes): 2146 movq -34(%eax), %xmm0 2147 movq %xmm0, -34(%edx) 2148L(fwd_write_26bytes): 2149 movq -26(%eax), %xmm0 2150 movq %xmm0, -26(%edx) 2151L(fwd_write_18bytes): 2152 movq -18(%eax), %xmm0 2153 movq %xmm0, -18(%edx) 2154L(fwd_write_10bytes): 2155 movq -10(%eax), %xmm0 2156 movq %xmm0, -10(%edx) 2157L(fwd_write_2bytes): 2158 movzwl -2(%eax), %ecx 2159 movw %cx, -2(%edx) 2160#ifdef USE_AS_MEMPCPY 2161 movl %edx, %eax 2162#else 2163 movl DEST(%esp), %eax 2164#endif 2165 RETURN 2166 2167 .p2align 4 2168L(fwd_write_47bytes): 2169 movq -47(%eax), %xmm0 2170 movq %xmm0, -47(%edx) 2171L(fwd_write_39bytes): 2172 movq -39(%eax), %xmm0 2173 movq %xmm0, -39(%edx) 2174L(fwd_write_31bytes): 2175 movq -31(%eax), %xmm0 2176 movq %xmm0, -31(%edx) 2177L(fwd_write_23bytes): 2178 movq -23(%eax), %xmm0 2179 movq %xmm0, -23(%edx) 2180L(fwd_write_15bytes): 2181 movq -15(%eax), %xmm0 2182 movq %xmm0, -15(%edx) 2183L(fwd_write_7bytes): 2184 movl -7(%eax), %ecx 2185 movl %ecx, -7(%edx) 2186 movzwl 
-3(%eax), %ecx 2187 movzbl -1(%eax), %eax 2188 movw %cx, -3(%edx) 2189 movb %al, -1(%edx) 2190#ifdef USE_AS_MEMPCPY 2191 movl %edx, %eax 2192#else 2193 movl DEST(%esp), %eax 2194#endif 2195 RETURN 2196 2197 .p2align 4 2198L(fwd_write_43bytes): 2199 movq -43(%eax), %xmm0 2200 movq %xmm0, -43(%edx) 2201L(fwd_write_35bytes): 2202 movq -35(%eax), %xmm0 2203 movq %xmm0, -35(%edx) 2204L(fwd_write_27bytes): 2205 movq -27(%eax), %xmm0 2206 movq %xmm0, -27(%edx) 2207L(fwd_write_19bytes): 2208 movq -19(%eax), %xmm0 2209 movq %xmm0, -19(%edx) 2210L(fwd_write_11bytes): 2211 movq -11(%eax), %xmm0 2212 movq %xmm0, -11(%edx) 2213L(fwd_write_3bytes): 2214 movzwl -3(%eax), %ecx 2215 movzbl -1(%eax), %eax 2216 movw %cx, -3(%edx) 2217 movb %al, -1(%edx) 2218#ifdef USE_AS_MEMPCPY 2219 movl %edx, %eax 2220#else 2221 movl DEST(%esp), %eax 2222#endif 2223 RETURN 2224 2225 .p2align 4 2226L(fwd_write_40bytes_align): 2227 movdqa -40(%eax), %xmm0 2228 movdqa %xmm0, -40(%edx) 2229L(fwd_write_24bytes_align): 2230 movdqa -24(%eax), %xmm0 2231 movdqa %xmm0, -24(%edx) 2232L(fwd_write_8bytes_align): 2233 movq -8(%eax), %xmm0 2234 movq %xmm0, -8(%edx) 2235L(fwd_write_0bytes_align): 2236#ifdef USE_AS_MEMPCPY 2237 movl %edx, %eax 2238#else 2239 movl DEST(%esp), %eax 2240#endif 2241 RETURN 2242 2243 .p2align 4 2244L(fwd_write_32bytes_align): 2245 movdqa -32(%eax), %xmm0 2246 movdqa %xmm0, -32(%edx) 2247L(fwd_write_16bytes_align): 2248 movdqa -16(%eax), %xmm0 2249 movdqa %xmm0, -16(%edx) 2250#ifdef USE_AS_MEMPCPY 2251 movl %edx, %eax 2252#else 2253 movl DEST(%esp), %eax 2254#endif 2255 RETURN 2256 2257 .p2align 4 2258L(fwd_write_5bytes_align): 2259 movl -5(%eax), %ecx 2260 movl -4(%eax), %eax 2261 movl %ecx, -5(%edx) 2262 movl %eax, -4(%edx) 2263#ifdef USE_AS_MEMPCPY 2264 movl %edx, %eax 2265#else 2266 movl DEST(%esp), %eax 2267#endif 2268 RETURN 2269 2270 .p2align 4 2271L(fwd_write_45bytes_align): 2272 movdqa -45(%eax), %xmm0 2273 movdqa %xmm0, -45(%edx) 2274L(fwd_write_29bytes_align): 2275 movdqa -29(%eax), %xmm0 2276 movdqa %xmm0, -29(%edx) 2277L(fwd_write_13bytes_align): 2278 movq -13(%eax), %xmm0 2279 movq %xmm0, -13(%edx) 2280 movl -5(%eax), %ecx 2281 movl %ecx, -5(%edx) 2282 movzbl -1(%eax), %ecx 2283 movb %cl, -1(%edx) 2284#ifdef USE_AS_MEMPCPY 2285 movl %edx, %eax 2286#else 2287 movl DEST(%esp), %eax 2288#endif 2289 RETURN 2290 2291 .p2align 4 2292L(fwd_write_37bytes_align): 2293 movdqa -37(%eax), %xmm0 2294 movdqa %xmm0, -37(%edx) 2295L(fwd_write_21bytes_align): 2296 movdqa -21(%eax), %xmm0 2297 movdqa %xmm0, -21(%edx) 2298 movl -5(%eax), %ecx 2299 movl %ecx, -5(%edx) 2300 movzbl -1(%eax), %ecx 2301 movb %cl, -1(%edx) 2302#ifdef USE_AS_MEMPCPY 2303 movl %edx, %eax 2304#else 2305 movl DEST(%esp), %eax 2306#endif 2307 RETURN 2308 2309 .p2align 4 2310L(fwd_write_41bytes_align): 2311 movdqa -41(%eax), %xmm0 2312 movdqa %xmm0, -41(%edx) 2313L(fwd_write_25bytes_align): 2314 movdqa -25(%eax), %xmm0 2315 movdqa %xmm0, -25(%edx) 2316L(fwd_write_9bytes_align): 2317 movq -9(%eax), %xmm0 2318 movq %xmm0, -9(%edx) 2319L(fwd_write_1bytes_align): 2320 movzbl -1(%eax), %ecx 2321 movb %cl, -1(%edx) 2322#ifdef USE_AS_MEMPCPY 2323 movl %edx, %eax 2324#else 2325 movl DEST(%esp), %eax 2326#endif 2327 RETURN 2328 2329 .p2align 4 2330L(fwd_write_33bytes_align): 2331 movdqa -33(%eax), %xmm0 2332 movdqa %xmm0, -33(%edx) 2333L(fwd_write_17bytes_align): 2334 movdqa -17(%eax), %xmm0 2335 movdqa %xmm0, -17(%edx) 2336 movzbl -1(%eax), %ecx 2337 movb %cl, -1(%edx) 2338#ifdef USE_AS_MEMPCPY 2339 movl %edx, %eax 2340#else 2341 movl DEST(%esp), %eax 
2342#endif 2343 RETURN 2344 2345 .p2align 4 2346L(fwd_write_46bytes_align): 2347 movdqa -46(%eax), %xmm0 2348 movdqa %xmm0, -46(%edx) 2349L(fwd_write_30bytes_align): 2350 movdqa -30(%eax), %xmm0 2351 movdqa %xmm0, -30(%edx) 2352L(fwd_write_14bytes_align): 2353 movq -14(%eax), %xmm0 2354 movq %xmm0, -14(%edx) 2355L(fwd_write_6bytes_align): 2356 movl -6(%eax), %ecx 2357 movl %ecx, -6(%edx) 2358 movzwl -2(%eax), %ecx 2359 movw %cx, -2(%edx) 2360#ifdef USE_AS_MEMPCPY 2361 movl %edx, %eax 2362#else 2363 movl DEST(%esp), %eax 2364#endif 2365 RETURN 2366 2367 .p2align 4 2368L(fwd_write_38bytes_align): 2369 movdqa -38(%eax), %xmm0 2370 movdqa %xmm0, -38(%edx) 2371L(fwd_write_22bytes_align): 2372 movdqa -22(%eax), %xmm0 2373 movdqa %xmm0, -22(%edx) 2374 movl -6(%eax), %ecx 2375 movl %ecx, -6(%edx) 2376 movzwl -2(%eax), %ecx 2377 movw %cx, -2(%edx) 2378#ifdef USE_AS_MEMPCPY 2379 movl %edx, %eax 2380#else 2381 movl DEST(%esp), %eax 2382#endif 2383 RETURN 2384 2385 .p2align 4 2386L(fwd_write_42bytes_align): 2387 movdqa -42(%eax), %xmm0 2388 movdqa %xmm0, -42(%edx) 2389L(fwd_write_26bytes_align): 2390 movdqa -26(%eax), %xmm0 2391 movdqa %xmm0, -26(%edx) 2392L(fwd_write_10bytes_align): 2393 movq -10(%eax), %xmm0 2394 movq %xmm0, -10(%edx) 2395L(fwd_write_2bytes_align): 2396 movzwl -2(%eax), %ecx 2397 movw %cx, -2(%edx) 2398#ifdef USE_AS_MEMPCPY 2399 movl %edx, %eax 2400#else 2401 movl DEST(%esp), %eax 2402#endif 2403 RETURN 2404 2405 .p2align 4 2406L(fwd_write_34bytes_align): 2407 movdqa -34(%eax), %xmm0 2408 movdqa %xmm0, -34(%edx) 2409L(fwd_write_18bytes_align): 2410 movdqa -18(%eax), %xmm0 2411 movdqa %xmm0, -18(%edx) 2412 movzwl -2(%eax), %ecx 2413 movw %cx, -2(%edx) 2414#ifdef USE_AS_MEMPCPY 2415 movl %edx, %eax 2416#else 2417 movl DEST(%esp), %eax 2418#endif 2419 RETURN 2420 2421 .p2align 4 2422L(fwd_write_47bytes_align): 2423 movdqa -47(%eax), %xmm0 2424 movdqa %xmm0, -47(%edx) 2425L(fwd_write_31bytes_align): 2426 movdqa -31(%eax), %xmm0 2427 movdqa %xmm0, -31(%edx) 2428L(fwd_write_15bytes_align): 2429 movq -15(%eax), %xmm0 2430 movq %xmm0, -15(%edx) 2431L(fwd_write_7bytes_align): 2432 movl -7(%eax), %ecx 2433 movl %ecx, -7(%edx) 2434 movzwl -3(%eax), %ecx 2435 movzbl -1(%eax), %eax 2436 movw %cx, -3(%edx) 2437 movb %al, -1(%edx) 2438#ifdef USE_AS_MEMPCPY 2439 movl %edx, %eax 2440#else 2441 movl DEST(%esp), %eax 2442#endif 2443 RETURN 2444 2445 .p2align 4 2446L(fwd_write_39bytes_align): 2447 movdqa -39(%eax), %xmm0 2448 movdqa %xmm0, -39(%edx) 2449L(fwd_write_23bytes_align): 2450 movdqa -23(%eax), %xmm0 2451 movdqa %xmm0, -23(%edx) 2452 movl -7(%eax), %ecx 2453 movl %ecx, -7(%edx) 2454 movzwl -3(%eax), %ecx 2455 movzbl -1(%eax), %eax 2456 movw %cx, -3(%edx) 2457 movb %al, -1(%edx) 2458#ifdef USE_AS_MEMPCPY 2459 movl %edx, %eax 2460#else 2461 movl DEST(%esp), %eax 2462#endif 2463 RETURN 2464 2465 .p2align 4 2466L(fwd_write_43bytes_align): 2467 movdqa -43(%eax), %xmm0 2468 movdqa %xmm0, -43(%edx) 2469L(fwd_write_27bytes_align): 2470 movdqa -27(%eax), %xmm0 2471 movdqa %xmm0, -27(%edx) 2472L(fwd_write_11bytes_align): 2473 movq -11(%eax), %xmm0 2474 movq %xmm0, -11(%edx) 2475L(fwd_write_3bytes_align): 2476 movzwl -3(%eax), %ecx 2477 movzbl -1(%eax), %eax 2478 movw %cx, -3(%edx) 2479 movb %al, -1(%edx) 2480#ifdef USE_AS_MEMPCPY 2481 movl %edx, %eax 2482#else 2483 movl DEST(%esp), %eax 2484#endif 2485 RETURN 2486 2487 .p2align 4 2488L(fwd_write_35bytes_align): 2489 movdqa -35(%eax), %xmm0 2490 movdqa %xmm0, -35(%edx) 2491L(fwd_write_19bytes_align): 2492 movdqa -19(%eax), %xmm0 2493 movdqa %xmm0, 
    CFI_PUSH (%edi)

    .p2align 4
L(large_page):
    movdqu (%eax), %xmm1
#ifdef USE_AS_MEMMOVE
    movl DEST+4(%esp), %edi
    movdqu %xmm0, (%edi)
#endif
    lea 16(%eax), %eax
    movntdq %xmm1, (%edx)
    lea 16(%edx), %edx
    lea -0x90(%ecx), %ecx
    POP (%edi)

    .p2align 4
L(large_page_loop):
    movdqu (%eax), %xmm0
    movdqu 0x10(%eax), %xmm1
    movdqu 0x20(%eax), %xmm2
    movdqu 0x30(%eax), %xmm3
    movdqu 0x40(%eax), %xmm4
    movdqu 0x50(%eax), %xmm5
    movdqu 0x60(%eax), %xmm6
    movdqu 0x70(%eax), %xmm7
    lea 0x80(%eax), %eax

    sub $0x80, %ecx
    movntdq %xmm0, (%edx)
    movntdq %xmm1, 0x10(%edx)
    movntdq %xmm2, 0x20(%edx)
    movntdq %xmm3, 0x30(%edx)
    movntdq %xmm4, 0x40(%edx)
    movntdq %xmm5, 0x50(%edx)
    movntdq %xmm6, 0x60(%edx)
    movntdq %xmm7, 0x70(%edx)
    lea 0x80(%edx), %edx
    jae L(large_page_loop)
    cmp $-0x40, %ecx
    lea 0x80(%ecx), %ecx
    jl L(large_page_less_64bytes)

    movdqu (%eax), %xmm0
    movdqu 0x10(%eax), %xmm1
    movdqu 0x20(%eax), %xmm2
    movdqu 0x30(%eax), %xmm3
    lea 0x40(%eax), %eax

    movntdq %xmm0, (%edx)
    movntdq %xmm1, 0x10(%edx)
    movntdq %xmm2, 0x20(%edx)
    movntdq %xmm3, 0x30(%edx)
    lea 0x40(%edx), %edx
    sub $0x40, %ecx
L(large_page_less_64bytes):
    cmp $32, %ecx
    jb L(large_page_less_32bytes)
    movdqu (%eax), %xmm0
    movdqu 0x10(%eax), %xmm1
    lea 0x20(%eax), %eax
    movntdq %xmm0, (%edx)
    movntdq %xmm1, 0x10(%edx)
    lea 0x20(%edx), %edx
    sub $0x20, %ecx
L(large_page_less_32bytes):
    add %ecx, %edx
    add %ecx, %eax
    sfence
    BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

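/* Backward tails: each L(bk_write_Nbytes) block below copies N bytes
   starting at (%eax)/(%edx) and is reached through L(table_48_bytes_bwd).
   All of them return the saved DEST (plus LEN when built as mempcpy).  */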
    .p2align 4
L(bk_write_44bytes):
    movq 36(%eax), %xmm0
    movq %xmm0, 36(%edx)
L(bk_write_36bytes):
    movq 28(%eax), %xmm0
    movq %xmm0, 28(%edx)
L(bk_write_28bytes):
    movq 20(%eax), %xmm0
    movq %xmm0, 20(%edx)
L(bk_write_20bytes):
    movq 12(%eax), %xmm0
    movq %xmm0, 12(%edx)
L(bk_write_12bytes):
    movq 4(%eax), %xmm0
    movq %xmm0, 4(%edx)
L(bk_write_4bytes):
    movl (%eax), %ecx
    movl %ecx, (%edx)
L(bk_write_0bytes):
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_40bytes):
    movq 32(%eax), %xmm0
    movq %xmm0, 32(%edx)
L(bk_write_32bytes):
    movq 24(%eax), %xmm0
    movq %xmm0, 24(%edx)
L(bk_write_24bytes):
    movq 16(%eax), %xmm0
    movq %xmm0, 16(%edx)
L(bk_write_16bytes):
    movq 8(%eax), %xmm0
    movq %xmm0, 8(%edx)
L(bk_write_8bytes):
    movq (%eax), %xmm0
    movq %xmm0, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_45bytes):
    movq 37(%eax), %xmm0
    movq %xmm0, 37(%edx)
L(bk_write_37bytes):
    movq 29(%eax), %xmm0
    movq %xmm0, 29(%edx)
L(bk_write_29bytes):
    movq 21(%eax), %xmm0
    movq %xmm0, 21(%edx)
L(bk_write_21bytes):
    movq 13(%eax), %xmm0
    movq %xmm0, 13(%edx)
L(bk_write_13bytes):
    movq 5(%eax), %xmm0
    movq %xmm0, 5(%edx)
L(bk_write_5bytes):
    movl 1(%eax), %ecx
    movl %ecx, 1(%edx)
L(bk_write_1bytes):
    movzbl (%eax), %ecx
    movb %cl, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_41bytes):
    movq 33(%eax), %xmm0
    movq %xmm0, 33(%edx)
L(bk_write_33bytes):
    movq 25(%eax), %xmm0
    movq %xmm0, 25(%edx)
L(bk_write_25bytes):
    movq 17(%eax), %xmm0
    movq %xmm0, 17(%edx)
L(bk_write_17bytes):
    movq 9(%eax), %xmm0
    movq %xmm0, 9(%edx)
L(bk_write_9bytes):
    movq 1(%eax), %xmm0
    movq %xmm0, 1(%edx)
    movzbl (%eax), %ecx
    movb %cl, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_46bytes):
    movq 38(%eax), %xmm0
    movq %xmm0, 38(%edx)
L(bk_write_38bytes):
    movq 30(%eax), %xmm0
    movq %xmm0, 30(%edx)
L(bk_write_30bytes):
    movq 22(%eax), %xmm0
    movq %xmm0, 22(%edx)
L(bk_write_22bytes):
    movq 14(%eax), %xmm0
    movq %xmm0, 14(%edx)
L(bk_write_14bytes):
    movq 6(%eax), %xmm0
    movq %xmm0, 6(%edx)
L(bk_write_6bytes):
    movl 2(%eax), %ecx
    movl %ecx, 2(%edx)
    movzwl (%eax), %ecx
    movw %cx, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_42bytes):
    movq 34(%eax), %xmm0
    movq %xmm0, 34(%edx)
L(bk_write_34bytes):
    movq 26(%eax), %xmm0
    movq %xmm0, 26(%edx)
L(bk_write_26bytes):
    movq 18(%eax), %xmm0
    movq %xmm0, 18(%edx)
L(bk_write_18bytes):
    movq 10(%eax), %xmm0
    movq %xmm0, 10(%edx)
L(bk_write_10bytes):
    movq 2(%eax), %xmm0
    movq %xmm0, 2(%edx)
L(bk_write_2bytes):
    movzwl (%eax), %ecx
    movw %cx, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_47bytes):
    movq 39(%eax), %xmm0
    movq %xmm0, 39(%edx)
L(bk_write_39bytes):
    movq 31(%eax), %xmm0
    movq %xmm0, 31(%edx)
L(bk_write_31bytes):
    movq 23(%eax), %xmm0
    movq %xmm0, 23(%edx)
L(bk_write_23bytes):
    movq 15(%eax), %xmm0
    movq %xmm0, 15(%edx)
L(bk_write_15bytes):
    movq 7(%eax), %xmm0
    movq %xmm0, 7(%edx)
L(bk_write_7bytes):
    movl 3(%eax), %ecx
    movl %ecx, 3(%edx)
    movzwl 1(%eax), %ecx
    movw %cx, 1(%edx)
    movzbl (%eax), %eax
    movb %al, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN

    .p2align 4
L(bk_write_43bytes):
    movq 35(%eax), %xmm0
    movq %xmm0, 35(%edx)
L(bk_write_35bytes):
    movq 27(%eax), %xmm0
    movq %xmm0, 27(%edx)
L(bk_write_27bytes):
    movq 19(%eax), %xmm0
    movq %xmm0, 19(%edx)
L(bk_write_19bytes):
    movq 11(%eax), %xmm0
    movq %xmm0, 11(%edx)
L(bk_write_11bytes):
    movq 3(%eax), %xmm0
    movq %xmm0, 3(%edx)
L(bk_write_3bytes):
    movzwl 1(%eax), %ecx
    movw %cx, 1(%edx)
    movzbl (%eax), %eax
    movb %al, (%edx)
    movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
    movl LEN(%esp), %ecx
    add %ecx, %eax
#endif
    RETURN_END

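/* Dispatch tables.  Every entry is a 4-byte .int produced by JMPTBL (an
   absolute address or a table-relative offset, depending on the build), and
   BRANCH_TO_JMPTBL_ENTRY indexes them with the residual byte count or shift
   amount scaled by 4.  */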
    .pushsection .rodata.ssse3,"a",@progbits
    .p2align 2
L(table_48bytes_fwd):
    .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
    .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

    .p2align 2
L(table_48bytes_fwd_align):
    .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
    .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

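/* L(shl_table): one entry for each of the sixteen L(shl_N) copy paths,
   selected by the source's offset within a 16-byte block.  */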
    .p2align 2
L(shl_table):
    .int JMPTBL (L(shl_0), L(shl_table))
    .int JMPTBL (L(shl_1), L(shl_table))
    .int JMPTBL (L(shl_2), L(shl_table))
    .int JMPTBL (L(shl_3), L(shl_table))
    .int JMPTBL (L(shl_4), L(shl_table))
    .int JMPTBL (L(shl_5), L(shl_table))
    .int JMPTBL (L(shl_6), L(shl_table))
    .int JMPTBL (L(shl_7), L(shl_table))
    .int JMPTBL (L(shl_8), L(shl_table))
    .int JMPTBL (L(shl_9), L(shl_table))
    .int JMPTBL (L(shl_10), L(shl_table))
    .int JMPTBL (L(shl_11), L(shl_table))
    .int JMPTBL (L(shl_12), L(shl_table))
    .int JMPTBL (L(shl_13), L(shl_table))
    .int JMPTBL (L(shl_14), L(shl_table))
    .int JMPTBL (L(shl_15), L(shl_table))

    .p2align 2
L(table_48_bytes_bwd):
    .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
    .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

    .popsection

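/* memmove only: backward copy for overlapping buffers.  EDX and EDI are
   first moved one past the end of the destination and source, the
   destination end is aligned to 4 and then to 16 bytes, the bulk is copied
   from high addresses down, and the last few bytes are dispatched through
   L(table_48_bytes_bwd).  */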
#ifdef USE_AS_MEMMOVE
    .p2align 4
L(copy_backward):
    PUSH (%edi)
    movl %eax, %edi
    lea (%ecx,%edx,1), %edx
    lea (%ecx,%edi,1), %edi
    testl $0x3, %edx
    jnz L(bk_align)

L(bk_aligned_4):
    cmp $64, %ecx
    jae L(bk_write_more64bytes)

L(bk_write_64bytesless):
    cmp $32, %ecx
    jb L(bk_write_less32bytes)

L(bk_write_more32bytes):
    /* Copy 32 bytes at a time.  */
    sub $32, %ecx
    movq -8(%edi), %xmm0
    movq %xmm0, -8(%edx)
    movq -16(%edi), %xmm0
    movq %xmm0, -16(%edx)
    movq -24(%edi), %xmm0
    movq %xmm0, -24(%edx)
    movq -32(%edi), %xmm0
    movq %xmm0, -32(%edx)
    sub $32, %edx
    sub $32, %edi

L(bk_write_less32bytes):
    movl %edi, %eax
    sub %ecx, %edx
    sub %ecx, %eax
    POP (%edi)
L(bk_write_less32bytes_2):
    BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

    CFI_PUSH (%edi)

    .p2align 4
L(bk_align):
    cmp $8, %ecx
    jbe L(bk_write_less32bytes)
    testl $1, %edx
    /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0 then
       (EDX & 2) must be != 0.  */
    jz L(bk_got2)
    sub $1, %edi
    sub $1, %ecx
    sub $1, %edx
    movzbl (%edi), %eax
    movb %al, (%edx)

    testl $2, %edx
    jz L(bk_aligned_4)

L(bk_got2):
    sub $2, %edi
    sub $2, %ecx
    sub $2, %edx
    movzwl (%edi), %eax
    movw %ax, (%edx)
    jmp L(bk_aligned_4)

    .p2align 4
L(bk_write_more64bytes):
    /* Check the alignment of the last byte.  */
    testl $15, %edx
    jz L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned but not 16-byte aligned.  */
L(bk_ssse3_align):
    sub $4, %edi
    sub $4, %ecx
    sub $4, %edx
    movl (%edi), %eax
    movl %eax, (%edx)

    testl $15, %edx
    jz L(bk_ssse3_cpy_pre)

    sub $4, %edi
    sub $4, %ecx
    sub $4, %edx
    movl (%edi), %eax
    movl %eax, (%edx)

    testl $15, %edx
    jz L(bk_ssse3_cpy_pre)

    sub $4, %edi
    sub $4, %ecx
    sub $4, %edx
    movl (%edi), %eax
    movl %eax, (%edx)

L(bk_ssse3_cpy_pre):
    cmp $64, %ecx
    jb L(bk_write_more32bytes)

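/* Main backward loop: copy 64 bytes per iteration from high addresses to
   low.  The destination end is 16-byte aligned by this point, so the stores
   can use movdqa, while the loads stay movdqu because the source may not be
   aligned.  */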
    .p2align 4
L(bk_ssse3_cpy):
    sub $64, %edi
    sub $64, %ecx
    sub $64, %edx
    movdqu 0x30(%edi), %xmm3
    movdqa %xmm3, 0x30(%edx)
    movdqu 0x20(%edi), %xmm2
    movdqa %xmm2, 0x20(%edx)
    movdqu 0x10(%edi), %xmm1
    movdqa %xmm1, 0x10(%edx)
    movdqu (%edi), %xmm0
    movdqa %xmm0, (%edx)
    cmp $64, %ecx
    jae L(bk_ssse3_cpy)
    jmp L(bk_write_64bytesless)

#endif

END (MEMCPY)