/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */

#define dstin		x0
#ifdef BZERO
#define count		x1
#else
#define count		x2
#endif
#define val		w1
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

#ifdef BZERO
ENTRY(bzero)
#else
ENTRY(memset)
#endif

	mov	dst, dstin		/* Preserve return value.  */
#ifdef BZERO
	b	.Lzero_mem
#endif
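	/* If the fill byte is zero, take the (possibly DC ZVA based) zeroing
	 * path; otherwise replicate the byte across all 8 bytes of A_l.  */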
	ands	A_lw, val, #255
	b.eq	.Lzero_mem
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
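	/* Set the last 0-63 bytes: bits 5:4 of count select 16, 32 or 48
	 * bytes of stores; .Ltail15 then finishes the final 0-15 bytes
	 * (possibly overlapping data already set).  */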
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
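	/* Main loop: set 64 bytes per iteration.  count was pre-biased by 64,
	 * so the loop continues while at least 64 bytes remain.  */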
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
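	/* count is now negative; its low six bits hold the 0-63 bytes still
	 * to set.  Undo the 16-byte pre-bias and branch to .Ltail63 if
	 * anything remains.  */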
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
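	/* Align dst to 16 bytes.  count > 63 here, so a full 16-byte store
	 * is always safe.  */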
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
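	/* DCZID_EL0 layout: bit 4 (DZP) set means DC ZVA is prohibited;
	 * bits 3:0 hold log2 of the block size in words.  The code below
	 * derives the block length in bytes, i.e. 4 << (DCZID_EL0 & 15).  */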
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to copy after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
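	/* If tmp1 >= 64, compare tmp1 against the ZVA length; otherwise force
	 * the flags to "less than" so we fall back to .Lnot_short below.  */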
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
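	/* Zero in 64-byte chunks to cover the tmp2 bytes needed for ZVA
	 * alignment; the overrun is corrected after the loop.  */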
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
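	/* Zero one ZVA block per iteration.  count was pre-biased by one
	 * block length, so the loop runs while a full block remains.  */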
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
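	/* count is negative here; masking with zva_bits_x recovers the
	 * remaining 0 to zva_len-1 bytes still to zero.  */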
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
#ifdef BZERO
END(bzero)
#else
END(memset)
#endif

#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif