/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
#include "cache.h"

/* Exported symbol name.  Overridable so this file can be assembled under
   an alternate name (e.g. as one variant among several memcpy
   implementations selected at run time).  */
#ifndef MEMCPY
# define MEMCPY		memcpy
#endif

/* Local-label helper: L(foo) expands to .Lfoo, a non-exported GAS label.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Fallback wrappers for the DWARF call-frame-information directives.
   A hosting build environment may predefine these; otherwise they map
   straight onto the native .cfi_* directives.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
60
/* Define a global function symbol, aligned to 16 bytes, opening a CFI
   region.  END closes the region and records the symbol size.  The host
   build environment may provide its own ENTRY/END pair instead.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif
75
/* CFI bookkeeping for a push/pop of a 64-bit general-purpose register:
   a push moves the CFA by 8 bytes (the original value of 4 was a 32-bit
   leftover and produced wrong unwind tables) and stores REG at offset 0
   from the new stack pointer.  */
#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (8);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-8);		\
	cfi_restore (REG)

/* Emit the CFI alongside the actual stack operation so unwinding through
   the function body sees the saved register; previously CFI_PUSH/CFI_POP
   were defined but never used.  These directives only affect the DWARF
   unwind tables, not the executed instruction stream.  */
#define PUSH(REG)	push REG; CFI_PUSH (REG);
#define POP(REG)	pop REG; CFI_POP (REG);

/* %rbx is callee-saved in the SysV AMD64 ABI and is used as scratch in
   the small-size copy paths, so it is preserved across the call.  */
#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;
90
	.section .text.sse2,"ax",@progbits

/*-----------------------------------------------------------------------
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * ABI:   SysV AMD64
 * In:    rdi = dst, rsi = src, rdx = n
 * Out:   rax = dst
 * Uses:  r8, rbx (saved/restored), xmm0-xmm7, flags
 *
 * Strategy by size:
 *   n <= 16                     -> scalar / MMX-register moves
 *   16 < n <= 128               -> overlapping unaligned 16-byte moves
 *   128 < n < SHARED_CACHE_SIZE_HALF -> 64-byte loop, dst-aligned stores,
 *                                  software prefetch of the source
 *   n >= SHARED_CACHE_SIZE_HALF -> 128-byte loop with non-temporal
 *                                  (cache-bypassing) stores + sfence
 * SHARED_CACHE_SIZE_HALF comes from cache.h.
 *---------------------------------------------------------------------*/
ENTRY (MEMCPY)
	ENTRANCE			/* save callee-saved %rbx */
	cmp	%rsi, %rdi
	je	L(return)		/* src == dst: nothing to copy */

	cmp	$16, %rdx
	jbe	L(len_0_16_bytes)

	/* Very large copies bypass the cache entirely.  */
	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(large_page)

	/* 17..32 bytes: the first and last 16 bytes overlap in the middle
	   and together cover the whole range.  Loads are done before the
	   cmp so the stores can issue back to back.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	cmp	$32, %rdx
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jbe	L(return)

	/* 33..64 bytes: add the second-from-front and second-from-back
	   16-byte chunks (again overlapping in the middle).  */
	movdqu	16(%rsi), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	cmp	$64, %rdx
	movdqu	%xmm0, 16(%rdi)
	movdqu	%xmm1, -32(%rdi, %rdx)
	jbe	L(return)

	/* 65..128 bytes: same trick with the next two chunks from each
	   end; 8 overlapping 16-byte moves now cover the full range.  */
	movdqu	32(%rsi), %xmm0
	movdqu	48(%rsi), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3
	cmp	$128, %rdx
	movdqu	%xmm0, 32(%rdi)
	movdqu	%xmm1, 48(%rdi)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	jbe	L(return)

/* Now the main loop: we align the address of the destination.  The head
   (up to the first 64-byte boundary past dst) and the tail were already
   copied by the overlapping moves above, so the loop only has to handle
   whole aligned 64-byte lines in between.  */
	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 = first 64-aligned dst address */

	add	%rdi, %rdx
	and	$-64, %rdx		/* rdx = 64-aligned end of dst range */

	sub	%rdi, %rsi		/* rsi = src - dst, so (r8 + rsi)
					   always addresses the matching
					   source position */

/* We should stop two iterations before the termination
	(in order not to misprefetch).  */
	sub	$64, %rdx
	cmp	%r8, %rdx
	je	L(main_loop_just_one_iteration)	/* exactly 1 line left */

	sub	$64, %rdx
	cmp	%r8, %rdx
	je	L(main_loop_last_two_iterations)	/* exactly 2 left */


	.p2align 4
L(main_loop_cache):
	/* Copy one 64-byte line: unaligned loads, aligned stores, with a
	   prefetch two lines ahead of the current source position.  */
	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rdx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	/* Final two 64-byte lines, copied without prefetch so we never
	   prefetch past the end of the source buffer.  */
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqu	64(%r8, %rsi), %xmm4
	movdqu	80(%r8, %rsi), %xmm5
	movdqu	96(%r8, %rsi), %xmm6
	movdqu	112(%r8, %rsi), %xmm7
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	movdqa	%xmm4, 64(%r8)
	movdqa	%xmm5, 80(%r8)
	movdqa	%xmm6, 96(%r8)
	movdqa	%xmm7, 112(%r8)
	jmp	L(return)

L(main_loop_just_one_iteration):
	/* Single remaining aligned 64-byte line.  */
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	jmp	L(return)

L(large_page):
	/* n >= SHARED_CACHE_SIZE_HALF.  Copy the first and last 128 bytes
	   with ordinary unaligned moves (they may overlap the interior),
	   then stream the aligned middle with non-temporal stores.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)

	/* Second 64 bytes from each end, so a full 128 bytes of head and
	   tail are covered before the 128-byte-aligned loop below.  */
	movdqu	64(%rsi), %xmm0
	movdqu	80(%rsi), %xmm1
	movdqu	96(%rsi), %xmm2
	movdqu	112(%rsi), %xmm3
	movdqu	-128(%rsi, %rdx), %xmm4
	movdqu	-112(%rsi, %rdx), %xmm5
	movdqu	-96(%rsi, %rdx), %xmm6
	movdqu	-80(%rsi, %rdx), %xmm7
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm1, 80(%rdi)
	movdqu	%xmm2, 96(%rdi)
	movdqu	%xmm3, 112(%rdi)
	movdqu	%xmm4, -128(%rdi, %rdx)
	movdqu	%xmm5, -112(%rdi, %rdx)
	movdqu	%xmm6, -96(%rdi, %rdx)
	movdqu	%xmm7, -80(%rdi, %rdx)

/* Now the main loop with non temporal stores. We align
	the address of the destination.  Same register scheme as the cache
	loop: r8 walks aligned dst, rsi holds (src - dst), rdx is the
	aligned end.  */
	lea	128(%rdi), %r8
	and	$-128, %r8		/* r8 = first 128-aligned dst address */

	add	%rdi, %rdx
	and	$-128, %rdx		/* rdx = 128-aligned end of dst */

	sub	%rdi, %rsi		/* rsi = src - dst */

	.p2align 4
L(main_loop_large_page):
	/* 128 bytes per iteration; movntdq bypasses the cache so huge
	   copies do not evict the working set.  */
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqu	64(%r8, %rsi), %xmm4
	movdqu	80(%r8, %rsi), %xmm5
	movdqu	96(%r8, %rsi), %xmm6
	movdqu	112(%r8, %rsi), %xmm7
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	movntdq	%xmm4, 64(%r8)
	movntdq	%xmm5, 80(%r8)
	movntdq	%xmm6, 96(%r8)
	movntdq	%xmm7, 112(%r8)
	lea	128(%r8), %r8
	cmp	%r8, %rdx
	jne	L(main_loop_large_page)
	sfence				/* make non-temporal stores globally
					   visible before returning */
	jmp	L(return)

L(len_0_16_bytes):
	/* Classify n (0..16) by its low bits.  Bits 3..4 set => n is
	   8..16 (the "9_16" label name notwithstanding, n == 8 also takes
	   that path and is handled correctly by fully overlapping moves).  */
	testb	$24, %dl
	jne	L(len_9_16_bytes)
	testb	$4, %dl			/* bit 2 set (bits 3..4 clear)
					   => n is 4..7 */
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(return)		/* n == 0 */
	/* n is 1..3: copy the first byte, then, if n >= 2, the last two
	   bytes (which overlap the first byte when n == 2).  */
	movzbl	(%rsi), %ebx
	testb	$2, %dl
	movb	%bl, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %ebx
	movw	%bx, -2(%rdi,%rdx)
	jmp	L(return)

L(len_9_16_bytes):
	/* 8..16 bytes: first and last 8 bytes overlap in the middle.  */
	movq	(%rsi), %xmm0
	movq	-8(%rsi, %rdx), %xmm1
	movq	%xmm0, (%rdi)
	movq	%xmm1, -8(%rdi, %rdx)
	jmp	L(return)

L(len_5_8_bytes):
	/* 4..7 bytes: first and last 4 bytes overlap in the middle.  */
	movl	(%rsi), %ebx
	movl	%ebx, (%rdi)
	movl	-4(%rsi,%rdx), %ebx
	movl	%ebx, -4(%rdi,%rdx)
	jmp	L(return)

L(return):
	mov 	%rdi, %rax		/* memcpy returns the destination */
	RETURN

END (MEMCPY)
300