/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.

  * Neither the name of Intel Corporation nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMCPY
# define MEMCPY		memcpy
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

	.section .text.sse2,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	cmp	%rsi, %rdi
	je	L(return)

	cmp	$16, %rdx
	jbe	L(len_0_16_bytes)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(large_page)

	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	cmp	$32, %rdx
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jbe	L(return)

	movdqu	16(%rsi), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	cmp	$64, %rdx
	movdqu	%xmm0, 16(%rdi)
	movdqu	%xmm1, -32(%rdi, %rdx)
	jbe	L(return)

	movdqu	32(%rsi), %xmm0
	movdqu	48(%rsi), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3
	cmp	$128, %rdx
	movdqu	%xmm0, 32(%rdi)
	movdqu	%xmm1, 48(%rdi)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	jbe	L(return)

/* Now the main loop: we align the address of the destination.  */
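/* For reference, the lea/and pair below computes
   r8 = (rdi + 64) & ~63, the first 64-byte boundary strictly above
   the start of the destination (e.g. rdi = 0x1007 gives
   r8 = 0x1047 & ~63 = 0x1040), and rdx becomes the last 64-byte
   boundary at or below the end of the destination.  Everything
   before r8 and after that boundary has already been written by the
   unaligned movdqu head/tail copies above, since this path is only
   reached for lengths greater than 128.  rsi is then rebased to the
   source-minus-destination delta, so (%r8, %rsi) addresses the
   source bytes corresponding to the aligned destination block at
   (%r8) while a single register advances through the loop.  */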
	lea	64(%rdi), %r8
	and	$-64, %r8

	add	%rdi, %rdx
	and	$-64, %rdx

	sub	%rdi, %rsi

/* Stop two iterations (128 bytes) before the end: the prefetcht0 in
   the loop body runs 128 bytes ahead, and stopping early keeps it
   from prefetching past the end of the source buffer.  */
	sub	$64, %rdx
	cmp	%r8, %rdx
	je	L(main_loop_just_one_iteration)

	sub	$64, %rdx
	cmp	%r8, %rdx
	je	L(main_loop_last_two_iterations)


	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rdx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqu	64(%r8, %rsi), %xmm4
	movdqu	80(%r8, %rsi), %xmm5
	movdqu	96(%r8, %rsi), %xmm6
	movdqu	112(%r8, %rsi), %xmm7
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	movdqa	%xmm4, 64(%r8)
	movdqa	%xmm5, 80(%r8)
	movdqa	%xmm6, 96(%r8)
	movdqa	%xmm7, 112(%r8)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	jmp	L(return)

L(large_page):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)

	movdqu	64(%rsi), %xmm0
	movdqu	80(%rsi), %xmm1
	movdqu	96(%rsi), %xmm2
	movdqu	112(%rsi), %xmm3
	movdqu	-128(%rsi, %rdx), %xmm4
	movdqu	-112(%rsi, %rdx), %xmm5
	movdqu	-96(%rsi, %rdx), %xmm6
	movdqu	-80(%rsi, %rdx), %xmm7
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm1, 80(%rdi)
	movdqu	%xmm2, 96(%rdi)
	movdqu	%xmm3, 112(%rdi)
	movdqu	%xmm4, -128(%rdi, %rdx)
	movdqu	%xmm5, -112(%rdi, %rdx)
	movdqu	%xmm6, -96(%rdi, %rdx)
	movdqu	%xmm7, -80(%rdi, %rdx)

/* Now the main loop with non-temporal stores.  We align the address
   of the destination.  */
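/* movntdq performs a non-temporal (streaming) store: the processor
   write-combines the data and sends it to memory without allocating
   the destination lines in the cache hierarchy.  This path is only
   taken when the length is at least SHARED_CACHE_SIZE_HALF (see the
   jae at the top of MEMCPY), where a cached copy would otherwise
   flush much of the shared cache for data that is unlikely to be
   reused.  Non-temporal stores are weakly ordered, which is why the
   loop below is followed by an sfence.  The setup mirrors the cached
   path, but uses this loop's 128-byte stride; the first and last 128
   bytes were already copied by the movdqu pairs above.  */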
	lea	128(%rdi), %r8
	and	$-128, %r8

	add	%rdi, %rdx
	and	$-128, %rdx

	sub	%rdi, %rsi

	.p2align 4
L(main_loop_large_page):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqu	64(%r8, %rsi), %xmm4
	movdqu	80(%r8, %rsi), %xmm5
	movdqu	96(%r8, %rsi), %xmm6
	movdqu	112(%r8, %rsi), %xmm7
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	movntdq	%xmm4, 64(%r8)
	movntdq	%xmm5, 80(%r8)
	movntdq	%xmm6, 96(%r8)
	movntdq	%xmm7, 112(%r8)
	lea	128(%r8), %r8
	cmp	%r8, %rdx
	jne	L(main_loop_large_page)
	sfence
	jmp	L(return)

L(len_0_16_bytes):
	testb	$24, %dl
	jne	L(len_9_16_bytes)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	movzbl	(%rsi), %ebx
	testb	$2, %dl
	movb	%bl, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %ebx
	movw	%bx, -2(%rdi,%rdx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%rsi), %xmm0
	movq	-8(%rsi, %rdx), %xmm1
	movq	%xmm0, (%rdi)
	movq	%xmm1, -8(%rdi, %rdx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%rsi), %ebx
	movl	%ebx, (%rdi)
	movl	-4(%rsi,%rdx), %ebx
	movl	%ebx, -4(%rdi,%rdx)
	jmp	L(return)

L(return):
	mov	%rdi, %rax
	RETURN

END (MEMCPY)
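/* Notes on the short-copy dispatch in L(len_0_16_bytes): testb $24,
   %dl tests length bits 3 and 4, so lengths 8..16 take the
   L(len_9_16_bytes) path (its two potentially overlapping 8-byte
   moves also handle a length of exactly 8), and testb $4, %dl then
   sends lengths 4..7 to L(len_5_8_bytes) (two potentially
   overlapping 4-byte moves, likewise correct for a length of 4).
   What remains is 0..3: return if zero, store one byte, and, if bit
   1 of the length is set, store an overlapping word at the tail.
   These paths use %ebx/%bl as scratch, which is why ENTRANCE/RETURN
   push and pop the callee-saved %rbx.  L(return) loads %rdi into
   %rax so the function returns the destination pointer, as memcpy
   requires.  SHARED_CACHE_SIZE_HALF is expected to be provided by
   the included cache.h.  */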