/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif
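
/* The offsets above are relative to %esp after ENTRANCE has pushed %ebx:
	PARMS (8) skips the return address and the saved %ebx.  When built with
	USE_AS_BCOPY the first two arguments are swapped, matching
	bcopy(src, dst, len) instead of memmove(dst, src, len).  */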

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)
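
/* RETURN is used on exit paths that are followed by more code in the same
	function: after the ret, CFI_PUSH (%ebx) emits unwind directives only, so
	the instructions that follow are still described as having %ebx saved on
	the stack.  */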

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

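/* From here on: %eax = src, %edx = dst, %ecx = len.  The destination pointer
	stays in %edx until L(mm_return) copies it into %eax as the return
	value.  */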
/* Decide the copy direction.  If dst == src there is nothing to do.  If
	dst > src, a forward copy could overwrite not-yet-read source bytes when
	the regions overlap, so copy backward; otherwise fall through to the
	forward copy.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now dispatch on the length.  The ranges [0..16], [17..32], [33..64] and
	[65..128] bytes are each handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

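/* All of the small-size paths below use the same trick: load both the first
	and the last chunk of the range before storing either, so that
	overlapping source bytes are read before they can be overwritten and no
	branch on the exact length is needed.  */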
/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination to a 64-byte boundary.  The first 64 bytes are
	copied with unaligned stores before the aligned loop takes over.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax

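/* %edi is now the first 64-byte-aligned destination address above %edx, and
	%eax holds (src - dst), so (%eax, %edi) addresses the source byte that
	corresponds to destination address %edi.  */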
	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)

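/* %ebx is the 64-byte boundary at or below dst + len where the main loop
	stops.  Copies of at least SHARED_CACHE_SIZE_HALF bytes (half of the
	shared cache size, provided via cache.h) take the non-temporal path at
	L(mm_large_page_loop_forward); smaller copies use the cached-store loop
	below.  */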
	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to destination address %edi has been copied.  %ecx now holds
	the number of bytes left to copy; set %esi to the matching source
	position.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

	CFI_POP (%edi)
	CFI_POP (%esi)

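/* The two CFI_POP macros above emit only unwind directives, no instructions:
	they record that %edi and %esi are no longer saved on the stack in the
	code that follows, since L(mm_recalc_len) is reached only after those
	registers have been popped.  */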
L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
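/* %ebx is the 64-byte-aligned boundary at which the backward main loop
	stopped, i.e. the lowest destination address it has already covered, so
	%ebx - %edx is the number of bytes still left at the start of the
	buffer.  */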
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now dispatch on the length.  The ranges [0..16], [17..32], [33..64] and
	[65..128] bytes are each handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the end of the destination to a 64-byte boundary.  The last 64 bytes
	of the source are loaded first so that they are not overwritten by the
	aligned stores.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi

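/* %edi is now the highest 64-byte-aligned destination address at or below
	dst + len, and %esi holds (src - dst), so (%edi, %esi) addresses the
	source byte that corresponds to destination address %edi.  */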
	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx

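/* %ebx is the first 64-byte boundary above dst; the backward main loop keeps
	running while %edi is still above it.  */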
	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

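/* The 9..16 byte case below stores the top eight bytes, shrinks %ecx by 8,
	and re-dispatches so that the remaining 1..8 bytes are handled by the
	cases above.  */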
L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

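/* memmove returns the destination pointer, so copy %edx into %eax before
	restoring %ebx.  (bcopy returns void, so the value is simply ignored.)  */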
L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Forward copy for large lengths (at least half the shared cache size).  */

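/* The two loops below use movntdq non-temporal stores, which bypass the
	cache to avoid evicting useful data on very large copies; the sfence
	after each loop orders those stores before the function returns.  */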
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Backward copy for large lengths (at least half the shared cache size).  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)