/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.

  * Neither the name of Intel Corporation nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG) push REG;
#define POP(REG) pop REG;

#define ENTRANCE PUSH (%rbx);
#define RETURN_END POP (%rbx); ret
#define RETURN RETURN_END;

        .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
        ENTRANCE
#ifdef USE_AS_BCOPY
        xchg %rsi, %rdi
#endif
        mov %rdi, %rax

/* Check whether we should copy backward or forward. */
        cmp %rsi, %rdi
        je L(mm_return)
        jg L(mm_len_0_or_more_backward)

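/* Overview: the copy direction is chosen from the pointer comparison above,
   so overlapping buffers are always handled safely.  Each direction then
   dispatches on length: up to 128 bytes is copied with a few overlapping
   loads and stores picked by size bucket, larger copies align the
   destination to 64 bytes and loop, and copies of at least
   SHARED_CACHE_SIZE_HALF bytes use the non-temporal loops at the end of
   the file.  A rough C-style sketch of the dispatch (illustrative only;
   copy_forward/copy_backward are placeholder names for the paths below):

        if (dst == src)  return dst;
        if (dst < src)   copy_forward(dst, src, len);
        else             copy_backward(dst, src, len);
        return dst;
*/
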
/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
   separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_forward)

        cmp $32, %rdx
        ja L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_forward):
        cmp $64, %rdx
        ja L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_forward):
        cmp $128, %rdx
        ja L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_forward):
/* Aligning the address of destination. */
/* Save the first unaligned 64 bytes. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3

        lea 64(%rdi), %r8
        and $-64, %r8  /* r8 is now aligned to the next 64-byte boundary */
        sub %rdi, %rsi /* rsi = src - dst = diff */

        movdqu (%r8, %rsi), %xmm4
        movdqu 16(%r8, %rsi), %xmm5
        movdqu 32(%r8, %rsi), %xmm6
        movdqu 48(%r8, %rsi), %xmm7

        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqa %xmm4, (%r8)
        movaps %xmm5, 16(%r8)
        movaps %xmm6, 32(%r8)
        movaps %xmm7, 48(%r8)
        add $64, %r8

        lea (%rdi, %rdx), %rbx
        and $-64, %rbx
        cmp %r8, %rbx
        jbe L(mm_copy_remaining_forward)

        cmp $SHARED_CACHE_SIZE_HALF, %rdx
        jae L(mm_large_page_loop_forward)

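/* Main forward loop below: %r8 walks the destination in 64-byte aligned
   steps, %rsi holds (src - dst) so that (%r8, %rsi) addresses the matching
   source bytes, and %rbx is the 64-byte aligned end of the destination.
   The prefetcht0 stays 128 bytes (two iterations) ahead of the loads. */
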
        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%r8, %rsi)

        movdqu (%r8, %rsi), %xmm0
        movdqu 16(%r8, %rsi), %xmm1
        movdqu 32(%r8, %rsi), %xmm2
        movdqu 48(%r8, %rsi), %xmm3
        movdqa %xmm0, (%r8)
        movaps %xmm1, 16(%r8)
        movaps %xmm2, 32(%r8)
        movaps %xmm3, 48(%r8)
        lea 64(%r8), %r8
        cmp %r8, %rbx
        ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        add %rdi, %rdx
        sub %r8, %rdx
/* Everything up to the current %r8 position in the dst has been copied.
   %rdx now holds how many bytes are left to copy.
   Compute the matching source position in %r9. */
        lea (%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
        cmp $32, %rdx
        ja L(mm_remaining_33_64_bytes_forward)
        cmp $16, %rdx
        ja L(mm_remaining_17_32_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)

        cmpb $8, %dl
        ja L(mm_remaining_9_16_bytes_forward)
        cmpb $4, %dl
        .p2align 4,,5
        ja L(mm_remaining_5_8_bytes_forward)
        cmpb $2, %dl
        .p2align 4,,1
        ja L(mm_remaining_3_4_bytes_forward)
        movzbl -1(%r9,%rdx), %esi
        movzbl (%r9), %ebx
        movb %sil, -1(%r8,%rdx)
        movb %bl, (%r8)
        jmp L(mm_return)

L(mm_remaining_33_64_bytes_forward):
        movdqu (%r9), %xmm0
        movdqu 16(%r9), %xmm1
        movdqu -32(%r9, %rdx), %xmm2
        movdqu -16(%r9, %rdx), %xmm3
        movdqu %xmm0, (%r8)
        movdqu %xmm1, 16(%r8)
        movdqu %xmm2, -32(%r8, %rdx)
        movdqu %xmm3, -16(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_17_32_bytes_forward):
        movdqu (%r9), %xmm0
        movdqu -16(%r9, %rdx), %xmm1
        movdqu %xmm0, (%r8)
        movdqu %xmm1, -16(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_5_8_bytes_forward):
        movl (%r9), %esi
        movl -4(%r9,%rdx), %ebx
        movl %esi, (%r8)
        movl %ebx, -4(%r8,%rdx)
        jmp L(mm_return)

L(mm_remaining_9_16_bytes_forward):
        mov (%r9), %rsi
        mov -8(%r9, %rdx), %rbx
        mov %rsi, (%r8)
        mov %rbx, -8(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_3_4_bytes_forward):
        movzwl -2(%r9,%rdx), %esi
        movzwl (%r9), %ebx
        movw %si, -2(%r8,%rdx)
        movw %bx, (%r8)
        jmp L(mm_return)

L(mm_len_0_16_bytes_forward):
        testb $24, %dl
        jne L(mm_len_9_16_bytes_forward)
        testb $4, %dl
        .p2align 4,,5
        jne L(mm_len_5_8_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_2_4_bytes_forward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %esi
        movb %bl, -1(%rdi,%rdx)
        movb %sil, (%rdi)
        jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %esi
        movw %bx, -2(%rdi,%rdx)
        movw %si, (%rdi)
        jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %esi
        movl %ebx, (%rdi)
        movl %esi, -4(%rdi,%rdx)
        jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
        mov (%rsi), %rbx
        mov -8(%rsi, %rdx), %rsi
        mov %rbx, (%rdi)
        mov %rsi, -8(%rdi, %rdx)
        jmp L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops. */
        mov %rbx, %rdx
        sub %rdi, %rdx
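
/* The backward paths branch back to L(mm_recalc_len) above once the
   64-byte aligned part of the destination has been handled: %rdx becomes
   %rbx - %rdi (at most 64 bytes, covering the not yet copied head of the
   buffer), and execution falls through to the backward length checks
   below to finish it. */
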
/* The code for copying backwards. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_backward)

        cmp $32, %rdx
        ja L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_backward):
        cmp $64, %rdx
        ja L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_backward):
        cmp $128, %rdx
        ja L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of destination. We need to save the last
   64 bytes of the source so that they are not overwritten. */
        movdqu -16(%rsi, %rdx), %xmm0
        movdqu -32(%rsi, %rdx), %xmm1
        movdqu -48(%rsi, %rdx), %xmm2
        movdqu -64(%rsi, %rdx), %xmm3

        lea (%rdi, %rdx), %r9
        and $-64, %r9 /* r9 = 64-byte aligned end of dst */

        mov %rsi, %r8
        sub %rdi, %r8 /* r8 = src - dst, diff */

        movdqu -16(%r9, %r8), %xmm4
        movdqu -32(%r9, %r8), %xmm5
        movdqu -48(%r9, %r8), %xmm6
        movdqu -64(%r9, %r8), %xmm7

        movdqu %xmm0, -16(%rdi, %rdx)
        movdqu %xmm1, -32(%rdi, %rdx)
        movdqu %xmm2, -48(%rdi, %rdx)
        movdqu %xmm3, -64(%rdi, %rdx)
        movdqa %xmm4, -16(%r9)
        movaps %xmm5, -32(%r9)
        movaps %xmm6, -48(%r9)
        movaps %xmm7, -64(%r9)
        lea -64(%r9), %r9

        lea 64(%rdi), %rbx
        and $-64, %rbx

        cmp %r9, %rbx
        jae L(mm_recalc_len)

        cmp $SHARED_CACHE_SIZE_HALF, %rdx
        jae L(mm_large_page_loop_backward)

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%r9, %r8)

        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movdqa %xmm0, -64(%r9)
        movaps %xmm1, -48(%r9)
        movaps %xmm2, -32(%r9)
        movaps %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_main_loop_backward)
        jmp L(mm_recalc_len)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
        testb $24, %dl
        jnz L(mm_len_9_16_bytes_backward)
        testb $4, %dl
        .p2align 4,,5
        jnz L(mm_len_5_8_bytes_backward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_3_4_bytes_backward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %ecx
        movb %bl, -1(%rdi,%rdx)
        movb %cl, (%rdi)
        jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %ecx
        movw %bx, -2(%rdi,%rdx)
        movw %cx, (%rdi)
        jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
        movl -4(%rsi,%rdx), %ebx
        movl -8(%rsi,%rdx), %ecx
        movl %ebx, -4(%rdi,%rdx)
        movl %ecx, -8(%rdi,%rdx)
        sub $8, %rdx
        jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %ecx
        movl %ebx, (%rdi)
        movl %ecx, -4(%rdi,%rdx)

L(mm_return):
        RETURN

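/* The two loops below handle copies of at least SHARED_CACHE_SIZE_HALF
   bytes (a threshold expected to be provided by cache.h).  The movntdq
   stores bypass the cache so a huge copy does not evict useful data; the
   destination is 16-byte aligned here because %r8/%r9 are kept 64-byte
   aligned, and the sfence after each loop makes the weakly ordered
   non-temporal stores globally visible before the remaining bytes are
   copied and the function returns. */
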
/* Big length copy forward part. */

        .p2align 4
L(mm_large_page_loop_forward):
        movdqu (%r8, %rsi), %xmm0
        movdqu 16(%r8, %rsi), %xmm1
        movdqu 32(%r8, %rsi), %xmm2
        movdqu 48(%r8, %rsi), %xmm3
        movntdq %xmm0, (%r8)
        movntdq %xmm1, 16(%r8)
        movntdq %xmm2, 32(%r8)
        movntdq %xmm3, 48(%r8)
        lea 64(%r8), %r8
        cmp %r8, %rbx
        ja L(mm_large_page_loop_forward)
        sfence
        jmp L(mm_copy_remaining_forward)

/* Big length copy backward part. */
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movntdq %xmm0, -64(%r9)
        movntdq %xmm1, -48(%r9)
        movntdq %xmm2, -32(%r9)
        movntdq %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_large_page_loop_backward)
        sfence
        jmp L(mm_recalc_len)

END (MEMMOVE)