/* Copyright (c) 2012-2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <private/bionic_asm.h>

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define tmp1	x9
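
/* Note: E_l/E_h, F_l/F_h and tmp1 alias registers defined above (src, count,
   srcend, dst and B_h).  They are only written at points where the values
   they alias are no longer needed.  */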

#define L(l) .L ## l

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap; memmove tailcalls memcpy for these cases as well as
   for non-overlapping copies.
*/

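	/* Entry: compute one-past-the-end pointers for the source and the
	   destination, prefetch the first source cache line, then dispatch
	   on size.  Roughly, as a C-level sketch (illustrative only):

	       if (count <= 16) goto copy16;        // small
	       if (count > 96)  goto copy_long;     // large
	       // otherwise fall through: medium copies, 17..96 bytes
	 */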
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
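	/* Below, tmp1 = count - 1 lies in 16..95.  Bit 6 of tmp1 is set
	   only for counts of 65..96, which take the copy96 path.  Bit 5
	   then separates 17..32 (copy only A and D) from 33..64 (also copy
	   B and C).  All loads are issued before any store, so overlapping
	   buffers are handled correctly.  */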
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4

	/* Small copies: 0..16 bytes.  */
L(copy16):
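	/* For counts of 8..16, copy the first 8 and the last 8 bytes; the
	   two accesses overlap when count < 16, which simply writes some
	   bytes twice with the same value.  */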
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
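	/* Bit 2 of count separates 4..7 bytes (fall through: copy the first
	   and the last 4 bytes, possibly overlapping) from 0..3 bytes,
	   handled below.  */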
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
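	/* Equivalent C-level sketch (illustrative only):
	       if (count) {
	           size_t mid = count >> 1;
	           unsigned char a = src[0], b = src[mid], c = src[count - 1];
	           dst[0] = a;  dst[mid] = b;  dst[count - 1] = c;
	       }
	   count==1 writes byte 0 three times, count==2 writes byte 1 twice,
	   count==3 writes each byte exactly once.  */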
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
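	/* A_l/A_h already hold the first 16 bytes, loaded before the branch
	   from the medium-copy path.  The 64-byte block from the start and
	   the 32-byte block from the end overlap whenever count < 96.  */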
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align dst to a 16 byte boundary so that at least the stores never
	   cross a cache line boundary (the loads still may).  There are at
	   least 96 bytes to copy, so copy 16 bytes unaligned and then align.
	   The loop copies 64 bytes per iteration and loads the next
	   iteration's data one iteration ahead.  */
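	/* In effect (sketch only):  tmp1 = dstin & 15;  dst = dstin & ~15;
	   src -= tmp1;  count += tmp1.  The stores at [dst, 16] and beyond
	   are then 16-byte aligned, and since src is rewound by the same
	   amount, [src, 16] still maps to [dst, 16].  The first 16 source
	   bytes are loaded into D and stored unaligned at dstin before the
	   loop starts.  */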

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f
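	/* After the adjustment above, count equals dstend - dst, i.e. 16
	   more than the bytes left to copy from dst + 16.  Subtract that 16
	   plus 128: 64 for the data already loaded into A..D and 64 for the
	   tail copied from srcend at 2: below.  The loop is software
	   pipelined: each iteration stores the 64 bytes loaded on the
	   previous iteration while loading the next 64.  */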
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
2:
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret
