/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
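
/* With MAYBE_VIRT, the word at .Lcache_clear caches the probe result from
   dczid_el0: zero means "not probed yet", a negative value records that
   DC ZVA is unavailable, and a positive value is the ZVA block length in
   bytes (see the probing code at .Lzero_mem below).  */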

#define dstin		x0
#define val		w1
#define count		x2
#define dst_count x3 /* for __memset_chk */
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

#define QA_l		q0

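/* __memset_chk is the FORTIFY entry point: the compiler passes the known
   size of the destination in dst_count.  A rough C sketch of the intended
   behaviour, assuming bionic's usual prototype:

       void* __memset_chk(void* dst, int byte, size_t count, size_t dst_len) {
           if (count <= dst_len) return memset(dst, byte, count);
           __memset_chk_fail();   // expected not to return
       }
 */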
ENTRY(__memset_chk)
  cmp count, dst_count
  bls memset

  // Preserve for accurate backtrace.
  stp x29, x30, [sp, -16]!
  .cfi_def_cfa_offset 16
  .cfi_rel_offset x29, 0
  .cfi_rel_offset x30, 8

  bl __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
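	/* A_l now holds the fill byte replicated into all eight bytes, so
	   every 64-bit (or 128-bit) store writes the pattern; for example,
	   val = 0xAB gives A_l = 0xABABABABABABABAB.  */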
.Ltail_maybe_long:
	cmp	count, #256
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
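	/* Set 64, 128, or 192 bytes according to bits 7:6 of count, without
	   assuming earlier memory has been set: fall through for 192 bytes,
	   enter at 1: for 128, or at 2: for 64.  */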
.Ltail255:
	ands	tmp1, count, #0xC0
	b.eq	.Ltail63
	dup	v0.4s, A_lw
	cmp	tmp1w, #0x80
	b.eq	1f
	b.lt	2f
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
1:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
2:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
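	/* Set 16, 32, or 48 bytes according to bits 5:4 of count.  dst is
	   advanced past the region first, so the stores use negative
	   offsets: all three for 48 bytes, the last two for 32, the last
	   one for 16.  */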
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
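	/* Each tbz tests one bit of count and emits the matching store:
	   bit 3 -> 8 bytes, bit 2 -> 4, bit 1 -> 2, bit 0 -> 1.  */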
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	dup	v0.4s, A_lw
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 255 bytes to go now.  */
	cmp	count, #255
	b.le	.Ltail255
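	/* Main loop: each iteration stores 256 bytes with eight q-register
	   stp pairs.  Sizes above 2 MiB (2097152 bytes) take the
	   non-temporal path at 3: instead.  */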
2:
	cmp	count, #2097152
	b.gt	3f
1:
	sub	count, count, #256
2:
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	stp	QA_l, QA_l, [dst], #32
	subs	count, count, #256
	b.ge	2b
	tst	count, #0xff
	b.ne	.Ltail255
	ret
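	/* Large-size path: stnp issues non-temporal store pairs, 64 bytes
	   per iteration, presumably so that very large memsets do not
	   displace useful data from the caches.  */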
3:
	sub	count, count, #64
4:
	subs	count, count, #64
	stnp	QA_l, QA_l, [dst]
	stnp	QA_l, QA_l, [dst, #32]
	add	dst, dst, #64
	b.ge	4b
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
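	/* dczid_el0 layout: bit 4 (DZP) set means DC ZVA is prohibited;
	   bits 3:0 hold log2 of the block size in words, so the block
	   length in bytes is 4 << dczid_el0[3:0].  */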
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to copy after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
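	/* The ccmp compares tmp1 with zva_len_x only when the previous
	   comparison gave tmp1 >= 64; otherwise it sets NZCV to 0b1000
	   (N set), so the b.lt below falls back to .Lnot_short whenever
	   tmp1 < 64 or tmp1 < zva_len_x.  */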
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
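	/* DC ZVA loop: count is pre-decremented by one block so the loop
	   exits once fewer than a full block remains; each dc zva zeroes
	   zva_len_x bytes.  The final ands recovers the residual byte
	   count (count mod zva_len) for the ordinary tail code.  */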
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(memset)

#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif
#endif /* DONT_USE_DC */