/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
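/* From here on %eax holds src, %edx holds dst and %ecx holds len (the
	SRC/DEST macros above account for bcopy's reversed argument order).  */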

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
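/* dst is below src here, so a forward (low-to-high) copy cannot clobber
	source bytes that have not been read yet.  */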

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address to a 64-byte boundary.  The first 64
	bytes are copied with unaligned stores, so everything below the
	aligned boundary is covered.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
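/* %eax now holds the constant offset src - dst, so (%eax, %edi)
	addresses the source byte that corresponds to destination %edi.  */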

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
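/* %ebx is the last 64-byte boundary not past the end of the destination;
	the main loop copies aligned 64-byte chunks until %edi reaches it.  */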
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)
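/* Copies of at least half the shared cache size use the non-temporal
	store loop further down to avoid evicting useful data from the
	cache.  */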

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything below %edi in the destination has been copied.  %ecx now
	holds the number of bytes still to copy; set %esi to the
	corresponding source position.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
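/* testb $24, %cl is non-zero exactly for lengths 8..16 here; the tests
	below then pick out 4..7, 0, 2..3 and finally 1.  */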
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
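/* Two 8-byte moves whose ranges overlap in the middle cover any length
	from 8 to 16 without further branching.  */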
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address.  The last 64 bytes of the source are
	loaded before any stores so that they cannot be overwritten when the
	regions overlap.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
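/* %esi now holds the constant offset src - dst, so (%edi, %esi)
	addresses the source byte that corresponds to destination %edi.  */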

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)
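/* As in the forward case, copies of at least half the shared cache size
	use the non-temporal store loop below.  */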

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
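/* Same bit-test dispatch as in the forward 0..16 byte path.  */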
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
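/* Copy the last 8 bytes, then re-dispatch on the remaining len - 8
	(0..8) bytes.  */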
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
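/* The sfence makes the weakly ordered non-temporal stores globally
	visible before the ordinary stores of the tail copy.  */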
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)