/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
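/* The short-length paths that follow all use the same overlapping head/tail
   trick: load the leading and the trailing chunk of the region into registers
   before storing anything, so one fixed pattern of unaligned loads and stores
   covers every length in the range, even when the two chunks overlap.  A
   rough C sketch of the 17..32-byte case handled just below (illustrative
   only; tmp_head/tmp_tail are hypothetical temporaries):

	char tmp_head[16], tmp_tail[16];
	memcpy(tmp_head, src, 16);
	memcpy(tmp_tail, src + len - 16, 16);
	memcpy(dst, tmp_head, 16);
	memcpy(dst + len - 16, tmp_tail, 16);
 */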
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address: copy the first 64 bytes with unaligned
	stores, then continue with 64-byte aligned stores from %edi.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to the %edi position in dst has been copied;
	%ecx now holds the number of bytes left to copy.
	Next, compute the matching source position in %esi.  */
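/* Note: since the alignment prologue above set %eax = src - dst, the address
   %edi + %eax is the source position that corresponds to the destination
   cursor %edi.  The next instruction materializes it in %esi so the tail code
   can address the remaining bytes as (%esi)/(%edi) pairs.  In rough C terms
   (illustrative only):

	src_pos = dst_pos + (src - dst);    // i.e. %esi = %edi + %eax
 */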
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

	CFI_POP (%edi)
	CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
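/* As in the forward path, each of the short backward copies loads all of its
   source data into registers before the first store, so the blocks below are
   safe for any overlap direction and the same head/tail pattern can be
   reused here.  These paths also serve as the tail handler once the aligned
   backward main loop re-enters via L(mm_recalc_len).  */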
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address. The last 64 bytes of the source are
	loaded first so that the aligned stores below cannot overwrite
	them.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
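/* The 0..16-byte paths below dispatch on individual bits of the length rather
   than a chain of compares: with at most 16 bytes left, bits 3-4 of %cl
   (testb $24) are set only for lengths 8..16, bit 2 only for 4..7 and bit 1
   only for 2..3, which leaves the single-byte and empty cases.  A rough C
   sketch of the same dispatch (illustrative only; the copy_* helpers are
   hypothetical):

	if (len & 24)       copy_8_to_16(dst, src, len);
	else if (len & 4)   copy_4_to_7(dst, src, len);
	else if (len == 0)  return;
	else if (len & 2)   copy_2_to_3(dst, src, len);
	else                dst[0] = src[0];
 */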
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy, forward part: used when the length is at least half the
	shared cache size (SHARED_CACHE_SIZE_HALF). Non-temporal stores avoid
	polluting the cache, and the sfence orders them before returning.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy, backward part: same non-temporal store strategy as the
	forward loop above.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)