/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC  PARMS
# define DEST SRC+4
# define LEN  DEST+4
#else
# define DEST PARMS
# define SRC  DEST+4
# define LEN  SRC+4
#endif

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG)  popl REG; CFI_POP (REG)

#define PARMS      8  /* Preserve EBX. */
#define ENTRANCE   PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN     RETURN_END; CFI_PUSH (%ebx)

        .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
        ENTRANCE
        movl    LEN(%esp), %ecx
        movl    SRC(%esp), %eax
        movl    DEST(%esp), %edx

/* Check whether we should copy backward or forward. */
        cmp     %eax, %edx
        je      L(mm_return)
        jg      L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp     $16, %ecx
        jbe     L(mm_len_0_16_bytes_forward)

        cmpl    $32, %ecx
        ja      L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
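/* The first and last 16 bytes are loaded and then stored with possibly
   overlapping writes, which covers every length in (16..32] without
   branching on the exact size. */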
        movdqu  (%eax), %xmm0
        movdqu  -16(%eax, %ecx), %xmm1
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_32_or_more_forward):
        cmpl    $64, %ecx
        ja      L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  -16(%eax, %ecx), %xmm2
        movdqu  -32(%eax, %ecx), %xmm3
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, -16(%edx, %ecx)
        movdqu  %xmm3, -32(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_64_or_more_forward):
        cmpl    $128, %ecx
        ja      L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3
        movdqu  -64(%eax, %ecx), %xmm4
        movdqu  -48(%eax, %ecx), %xmm5
        movdqu  -32(%eax, %ecx), %xmm6
        movdqu  -16(%eax, %ecx), %xmm7
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqu  %xmm4, -64(%edx, %ecx)
        movdqu  %xmm5, -48(%edx, %ecx)
        movdqu  %xmm6, -32(%edx, %ecx)
        movdqu  %xmm7, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_128_or_more_forward):
        PUSH (%esi)
        PUSH (%edi)

/* Aligning the address of destination. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3

        leal    64(%edx), %edi
        andl    $-64, %edi
        subl    %edx, %eax

        movdqu  (%eax, %edi), %xmm4
        movdqu  16(%eax, %edi), %xmm5
        movdqu  32(%eax, %edi), %xmm6
        movdqu  48(%eax, %edi), %xmm7

        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqa  %xmm4, (%edi)
        movaps  %xmm5, 16(%edi)
        movaps  %xmm6, 32(%edi)
        movaps  %xmm7, 48(%edi)
        addl    $64, %edi

        leal    (%edx, %ecx), %ebx
        andl    $-64, %ebx
        cmp     %edi, %ebx
        jbe     L(mm_copy_remaining_forward)

        cmp     $SHARED_CACHE_SIZE_HALF, %ecx
        jae     L(mm_large_page_loop_forward)

        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%eax, %edi)

        movdqu  (%eax, %edi), %xmm0
        movdqu  16(%eax, %edi), %xmm1
        movdqu  32(%eax, %edi), %xmm2
        movdqu  48(%eax, %edi), %xmm3
        movdqa  %xmm0, (%edi)
        movaps  %xmm1, 16(%edi)
        movaps  %xmm2, 32(%edi)
        movaps  %xmm3, 48(%edi)
        leal    64(%edi), %edi
        cmp     %edi, %ebx
        ja      L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        addl    %edx, %ecx
        subl    %edi, %ecx
/* Everything up to %edi in the destination has been copied.
   %ecx now holds the number of bytes left to copy.
   Advance %esi to the matching source position. */
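/* %eax was turned into (src - dst) earlier, so %edi + %eax is the source
   address that corresponds to the current destination pointer %edi. */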
        leal    (%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
        cmp     $32, %ecx
        ja      L(mm_remaining_33_64_bytes_forward)
        cmp     $16, %ecx
        ja      L(mm_remaining_17_32_bytes_forward)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(mm_return_pop_all)

        cmpb    $8, %cl
        ja      L(mm_remaining_9_16_bytes_forward)
        cmpb    $4, %cl
        .p2align 4,,5
        ja      L(mm_remaining_5_8_bytes_forward)
        cmpb    $2, %cl
        .p2align 4,,1
        ja      L(mm_remaining_3_4_bytes_forward)
        movzbl  -1(%esi,%ecx), %eax
        movzbl  (%esi), %ebx
        movb    %al, -1(%edi,%ecx)
        movb    %bl, (%edi)
        jmp     L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
        movdqu  (%esi), %xmm0
        movdqu  16(%esi), %xmm1
        movdqu  -32(%esi, %ecx), %xmm2
        movdqu  -16(%esi, %ecx), %xmm3
        movdqu  %xmm0, (%edi)
        movdqu  %xmm1, 16(%edi)
        movdqu  %xmm2, -32(%edi, %ecx)
        movdqu  %xmm3, -16(%edi, %ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
        movdqu  (%esi), %xmm0
        movdqu  -16(%esi, %ecx), %xmm1
        movdqu  %xmm0, (%edi)
        movdqu  %xmm1, -16(%edi, %ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
        movq    (%esi), %xmm0
        movq    -8(%esi, %ecx), %xmm1
        movq    %xmm0, (%edi)
        movq    %xmm1, -8(%edi, %ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
        movl    (%esi), %eax
        movl    -4(%esi,%ecx), %ebx
        movl    %eax, (%edi)
        movl    %ebx, -4(%edi,%ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
        movzwl  -2(%esi,%ecx), %eax
        movzwl  (%esi), %ebx
        movw    %ax, -2(%edi,%ecx)
        movw    %bx, (%edi)
        jmp     L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
        testb   $24, %cl
        jne     L(mm_len_9_16_bytes_forward)
        testb   $4, %cl
        .p2align 4,,5
        jne     L(mm_len_5_8_bytes_forward)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(mm_return)
        testb   $2, %cl
        .p2align 4,,1
        jne     L(mm_len_2_4_bytes_forward)
        movzbl  -1(%eax,%ecx), %ebx
        movzbl  (%eax), %eax
        movb    %bl, -1(%edx,%ecx)
        movb    %al, (%edx)
        jmp     L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl  -2(%eax,%ecx), %ebx
        movzwl  (%eax), %eax
        movw    %bx, -2(%edx,%ecx)
        movw    %ax, (%edx)
        jmp     L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl    (%eax), %ebx
        movl    -4(%eax,%ecx), %eax
        movl    %ebx, (%edx)
        movl    %eax, -4(%edx,%ecx)
        jmp     L(mm_return)

L(mm_len_9_16_bytes_forward):
        movq    (%eax), %xmm0
        movq    -8(%eax, %ecx), %xmm1
        movq    %xmm0, (%edx)
        movq    %xmm1, -8(%edx, %ecx)
        jmp     L(mm_return)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
   the main loop stops. */
        movl    %ebx, %ecx
        subl    %edx, %ecx
/* The code for copying backwards. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp     $16, %ecx
        jbe     L(mm_len_0_16_bytes_backward)

        cmpl    $32, %ecx
        jg      L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
        movdqu  (%eax), %xmm0
        movdqu  -16(%eax, %ecx), %xmm1
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_32_or_more_backward):
        cmpl    $64, %ecx
        jg      L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
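/* All loads are issued before any store, so this block is safe even when
   the source and destination buffers overlap on this backward path. */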
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  -16(%eax, %ecx), %xmm2
        movdqu  -32(%eax, %ecx), %xmm3
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, -16(%edx, %ecx)
        movdqu  %xmm3, -32(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_64_or_more_backward):
        cmpl    $128, %ecx
        jg      L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3
        movdqu  -64(%eax, %ecx), %xmm4
        movdqu  -48(%eax, %ecx), %xmm5
        movdqu  -32(%eax, %ecx), %xmm6
        movdqu  -16(%eax, %ecx), %xmm7
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqu  %xmm4, -64(%edx, %ecx)
        movdqu  %xmm5, -48(%edx, %ecx)
        movdqu  %xmm6, -32(%edx, %ecx)
        movdqu  %xmm7, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_128_or_more_backward):
        PUSH (%esi)
        PUSH (%edi)

/* Aligning the address of destination. We need to save
   64 bytes from the source in order not to overwrite them. */
        movdqu  -16(%eax, %ecx), %xmm0
        movdqu  -32(%eax, %ecx), %xmm1
        movdqu  -48(%eax, %ecx), %xmm2
        movdqu  -64(%eax, %ecx), %xmm3

        leal    (%edx, %ecx), %edi
        andl    $-64, %edi

        movl    %eax, %esi
        subl    %edx, %esi

        movdqu  -16(%edi, %esi), %xmm4
        movdqu  -32(%edi, %esi), %xmm5
        movdqu  -48(%edi, %esi), %xmm6
        movdqu  -64(%edi, %esi), %xmm7

        movdqu  %xmm0, -16(%edx, %ecx)
        movdqu  %xmm1, -32(%edx, %ecx)
        movdqu  %xmm2, -48(%edx, %ecx)
        movdqu  %xmm3, -64(%edx, %ecx)
        movdqa  %xmm4, -16(%edi)
        movdqa  %xmm5, -32(%edi)
        movdqa  %xmm6, -48(%edi)
        movdqa  %xmm7, -64(%edi)
        leal    -64(%edi), %edi

        leal    64(%edx), %ebx
        andl    $-64, %ebx

        cmp     %edi, %ebx
        jae     L(mm_main_loop_backward_end)

        cmp     $SHARED_CACHE_SIZE_HALF, %ecx
        jae     L(mm_large_page_loop_backward)

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%edi, %esi)

        movdqu  -64(%edi, %esi), %xmm0
        movdqu  -48(%edi, %esi), %xmm1
        movdqu  -32(%edi, %esi), %xmm2
        movdqu  -16(%edi, %esi), %xmm3
        movdqa  %xmm0, -64(%edi)
        movdqa  %xmm1, -48(%edi)
        movdqa  %xmm2, -32(%edi)
        movdqa  %xmm3, -16(%edi)
        leal    -64(%edi), %edi
        cmp     %edi, %ebx
        jb      L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
        POP (%edi)
        POP (%esi)
        jmp     L(mm_recalc_len)

/* Copy [0..16] and return. */
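/* For the at-most-16-byte tail, testb $24, %cl is non-zero exactly when the
   length is 8..16 (bit 3 or bit 4 set); that case copies the last eight
   bytes and then re-dispatches on the shortened length. */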
L(mm_len_0_16_bytes_backward):
        testb   $24, %cl
        jnz     L(mm_len_9_16_bytes_backward)
        testb   $4, %cl
        .p2align 4,,5
        jnz     L(mm_len_5_8_bytes_backward)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(mm_return)
        testb   $2, %cl
        .p2align 4,,1
        jne     L(mm_len_3_4_bytes_backward)
        movzbl  -1(%eax,%ecx), %ebx
        movzbl  (%eax), %eax
        movb    %bl, -1(%edx,%ecx)
        movb    %al, (%edx)
        jmp     L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl  -2(%eax,%ecx), %ebx
        movzwl  (%eax), %eax
        movw    %bx, -2(%edx,%ecx)
        movw    %ax, (%edx)
        jmp     L(mm_return)

L(mm_len_9_16_bytes_backward):
        PUSH (%esi)
        movl    -4(%eax,%ecx), %ebx
        movl    -8(%eax,%ecx), %esi
        movl    %ebx, -4(%edx,%ecx)
        movl    %esi, -8(%edx,%ecx)
        subl    $8, %ecx
        POP (%esi)
        jmp     L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl    (%eax), %ebx
        movl    -4(%eax,%ecx), %eax
        movl    %ebx, (%edx)
        movl    %eax, -4(%edx,%ecx)

L(mm_return):
        movl    %edx, %eax
        RETURN

L(mm_return_pop_all):
        movl    %edx, %eax
        POP (%edi)
        POP (%esi)
        RETURN

/* Big length copy forward part. */

        .p2align 4
L(mm_large_page_loop_forward):
        movdqu  (%eax, %edi), %xmm0
        movdqu  16(%eax, %edi), %xmm1
        movdqu  32(%eax, %edi), %xmm2
        movdqu  48(%eax, %edi), %xmm3
        movntdq %xmm0, (%edi)
        movntdq %xmm1, 16(%edi)
        movntdq %xmm2, 32(%edi)
        movntdq %xmm3, 48(%edi)
        leal    64(%edi), %edi
        cmp     %edi, %ebx
        ja      L(mm_large_page_loop_forward)
        sfence
        jmp     L(mm_copy_remaining_forward)

/* Big length copy backward part. */
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu  -64(%edi, %esi), %xmm0
        movdqu  -48(%edi, %esi), %xmm1
        movdqu  -32(%edi, %esi), %xmm2
        movdqu  -16(%edi, %esi), %xmm3
        movntdq %xmm0, -64(%edi)
        movntdq %xmm1, -48(%edi)
        movntdq %xmm2, -32(%edi)
        movntdq %xmm3, -16(%edi)
        leal    -64(%edi), %edi
        cmp     %edi, %ebx
        jb      L(mm_large_page_loop_backward)
        sfence
        POP (%edi)
        POP (%esi)
        jmp     L(mm_recalc_len)

END (MEMMOVE)