/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores, with or without
   unaligned access support.

   When building with GCC, this file should be guarded by the following
   preprocessor check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The copy is done in 5 steps:
   Step 1: Align the src/dst pointers; fall back to a misaligned copy if
	   both cannot be aligned.
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
   Step 4: Copy word by word.
   Step 5: Copy byte by byte.

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Must be 16, 32,
       or 64; defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Must be 8 or 16;
       defaults to 16.
 */
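
/* For orientation, steps 2-5 correspond roughly to the C sketch below
   (illustrative only, shown with the default block sizes; the name
   memcpy_sketch is hypothetical).  The assembly additionally keeps the
   remaining count biased by the current block size, so each loop needs
   only a single subs/branch per iteration:

     #include <stddef.h>

     void *memcpy_sketch (void *dst0, const void *src0, size_t len)
     {
       char *dst = dst0;
       const char *src = src0;
       size_t block, i;

       for (block = 64; block >= 4; block /= 4)   // 64, then 16, then 4
         for (; len >= block; len -= block)
           for (i = 0; i < block; i++)            // word loads in the asm
             *dst++ = *src++;
       while (len--)                              // step 5: tail bytes
         *dst++ = *src++;
       return dst0;
     }
*/
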
#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

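/* BEGIN_UNROLL_*: .irp repeats the instructions that follow, up to the
   matching .endr (END_UNROLL), once per listed value, substituting each
   value for \offset.  This unrolls the copy loop bodies at assembly
   time.  */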
#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr

	.syntax unified
	.text
	.align	2
	.global	memcpy
	.thumb
	.thumb_func
	.type	memcpy, %function
memcpy:
	@ r0: dst
	@ r1: src
	@ r2: len
#ifdef __ARM_FEATURE_UNALIGNED
	/* When unaligned access is supported, ip is not otherwise used in
	   the function body, so stash the return value (dst) there.  */
	mov	ip, r0
#else
	push	{r0}
#endif
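	/* r3 = (src | dst) & 3 is non-zero if either pointer is not
	   word-aligned.  */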
	orr	r3, r1, r0
	ands	r3, r3, #3
	bne	.Lmisaligned_copy

.Lbig_block:
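	/* While in this loop, r2 holds the remaining count biased by
	   -__OPT_BIG_BLOCK_SIZE, so a single subs/bhs per iteration both
	   advances and tests it.  The mid-block and word loops below use
	   the same pattern with their own bias.  */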
	subs	r2, __OPT_BIG_BLOCK_SIZE
	blo	.Lmid_block

	/* Kernel loop for big block copy */
	.align 2
.Lbig_block_loop:
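	/* The two variants below differ only in addressing style: v7E-M
	   (e.g. Cortex-M4) uses post-indexed ldr/str, while v7-M (e.g.
	   Cortex-M3) uses immediate offsets and bumps both pointers once
	   per block, presumably because each form schedules better on the
	   respective pipeline.  */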
	BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_BIG_BLOCK_SIZE
	adds	r1, __OPT_BIG_BLOCK_SIZE
#endif
	subs	r2, __OPT_BIG_BLOCK_SIZE
	bhs	.Lbig_block_loop

.Lmid_block:
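	/* Undo part of the bias: r2 becomes the remaining count biased by
	   -__OPT_MID_BLOCK_SIZE instead of -__OPT_BIG_BLOCK_SIZE.  */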
	adds	r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
	blo	.Lcopy_word_by_word

	/* Kernel loop for mid-block copy */
	.align 2
.Lmid_block_loop:
	BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_MID_BLOCK_SIZE
	adds	r1, __OPT_MID_BLOCK_SIZE
#endif
	subs	r2, __OPT_MID_BLOCK_SIZE
	bhs	.Lmid_block_loop

.Lcopy_word_by_word:
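	/* Re-bias once more: r2 is now the remaining count minus 4.  */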
	adds	r2, __OPT_MID_BLOCK_SIZE - 4
	blo	.Lcopy_less_than_4

	/* Kernel loop for word-by-word copy */
	.align 2
.Lcopy_word_by_word_loop:
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	subs	r2, #4
	bhs	.Lcopy_word_by_word_loop

.Lcopy_less_than_4:
	adds	r2, #4
	beq	.Ldone

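	/* Shift the two remaining count bits into the flags: after the
	   lsls, Z is clear if bit 0 was set (one trailing byte to copy)
	   and the carry holds bit 1 (a trailing halfword to copy).  */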
	lsls	r2, r2, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldone
#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1]
	strh	r3, [r0]
#else
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
#else
	pop	{r0}
#endif
	bx	lr

	.align 2
.Lmisaligned_copy:
#ifdef __ARM_FEATURE_UNALIGNED
	/* Alias the label Ldst_aligned to Lbig_block: once the destination
	   has been adjusted to alignment, control goes straight to the
	   aligned copy.  */
#define Ldst_aligned Lbig_block

	/* When the core supports unaligned accesses in hardware (on
	   ARMv7-M, CCR.UNALIGN_TRP is clear), LDR and STR may be used on
	   unaligned addresses, so we can copy word by word.  */

	cmp	r2, #8
	blo	.Lbyte_copy

	/* If src is word-aligned, just go to the big-block loop; any
	   unaligned dst accesses are handled by the hardware.  */
	lsls	r3, r1, #30
	beq	.Ldst_aligned
#else
	/* If len < 12, the misalignment adjustment costs more than a plain
	   byte-by-byte copy.  len must also be >= 8 for the code that
	   follows to work correctly.  */
	cmp	r2, #12
	blo	.Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Align dst only; do not try to align src.  Handling an aligned src
	   with a misaligned dst needs more overhead than the other way
	   round.  The worst case is an initially aligned src, where up to 3
	   additional bytes are copied the slow way, which is acceptable.  */

	ands	r3, r0, #3
	beq	.Ldst_aligned

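	/* Here r3 = dst & 3; the rsb turns it into 4 - r3, the 1 to 3
	   bytes needed to align dst.  The lsls #31 trick below then copies
	   them just like the tail code: bit 0 selects a single byte copy
	   and the carry (bit 1) selects a two-byte copy.  */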
	rsb	r3, #4
	subs	r2, r3

	lsls	r3, r3, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2
	b	.Ldst_aligned
#else
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	/* dst is aligned now.  */
.Ldst_aligned:
	/* If r1 is aligned as well, r0 and r1 had the same misalignment and
	   both are now aligned.  Go to the aligned copy.  */
	ands	r3, r1, #3
	beq	.Lbig_block

	/* dst is aligned, but src isn't.  Misaligned copy.  */

	push	{r4, r5}
	subs	r2, #4

	/* Step r1 back by its misaligned byte count (r3) so that it becomes
	   word-aligned.  Since r1 must be restored to an unaligned address
	   after the loop, keep the correction (4 - r3) in ip and subtract
	   it from r1 afterwards.  */
	subs	r1, r3
	rsb	ip, r3, #4

	/* Pre-load one word.  */
	ldr	r4, [r1], #4

	cmp	r3, #2
	beq	.Lmisaligned_copy_2_2
	cmp	r3, #3
	beq	.Lmisaligned_copy_3_1
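	/* r3 == 1 falls through (the .macro definition below emits no
	   code) to .Lmisaligned_copy_1_3.  */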

	.macro mis_src_copy shift
1:
	lsrs	r4, r4, \shift
	ldr	r3, [r1], #4
	lsls	r5, r3, 32-\shift
	orr	r4, r4, r5
	str	r4, [r0], #4
	mov	r4, r3
	subs	r2, #4
	bhs	1b
	.endm
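
	/* Each expansion of mis_src_copy builds one aligned output word
	   from two aligned source words; on a little-endian core this is
	   conceptually:

	     out = (prev >> shift) | (next << (32 - shift))

	   where shift is 8 * (src & 3), and prev/next are consecutive
	   word loads.  */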

.Lmisaligned_copy_1_3:
	mis_src_copy shift=8
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
	mis_src_copy shift=24
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
	mis_src_copy shift=16

.Lsrc_misaligned_tail:
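	/* Undo the -4 length bias, and step r1 back by the ip = 4 - r3
	   bytes of the last pre-loaded word that were not consumed, so
	   that r1 again points at the next unconsumed source byte.  */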
	adds	r2, #4
	subs	r1, ip
	pop	{r4, r5}

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
	subs	r2, #4
	blo	.Lcopy_less_than_4

.Lbyte_copy_loop:
	subs	r2, #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	bhs	.Lbyte_copy_loop

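	/* The loop above exits only once the biased count goes negative,
	   having copied len - 3 bytes; copy the final 3 bytes here.  */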
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
	ldrb	r3, [r1, #2]
	strb	r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
#else
	pop	{r0}
#endif
	bx	lr

	.size	memcpy, .-memcpy