/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6
#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9
#define tmp4		x10

/* Size threshold in KiB above which the non-temporal store path is used.  */
#define SMALL_BUFFER_SIZE	96

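/* __memset_aarch64_nt: memset with a non-temporal path for large fills.
 *
 * dstin (x0) = destination, val (w1) = fill byte, count (x2) = byte count;
 * the original dstin is returned in x0.  Non-zero fills of at most
 * SMALL_BUFFER_SIZE KiB use ordinary stp stores, larger ones use
 * non-temporal stnp stores; zero fills large enough to cover a DC ZVA
 * block are cleared with DC ZVA.  */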
ENTRY(__memset_aarch64_nt)
    mov	dst, dstin		/* Preserve return value.  */
    ands	A_lw, val, #255
    b.eq	.Lzero_mem		/* Use DC ZVA if val == 0.  */
    /* Replicate the low byte of val across all eight bytes of A_l.  */
    orr	A_lw, A_lw, A_lw, lsl #8
    orr	A_lw, A_lw, A_lw, lsl #16
    orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
    cmp	count, #64
    b.ge	.Lnot_short
.Ltail_maybe_tiny:
    cmp	count, #15
    b.le	.Ltail15tiny
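    /* Store the remaining (at most 63) bytes.  The 16-byte stores below may
     * overlap bytes that were already set to the same value; this is safe
     * because every path into .Ltail63 has a buffer of at least 16 bytes.  */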
.Ltail63:
    ands	tmp1, count, #0x30
    b.eq	.Ltail15
    add	dst, dst, tmp1
    cmp	tmp1w, #0x20
    b.eq	1f
    b.lt	2f
    stp	A_l, A_l, [dst, #-48]
1:
    stp	A_l, A_l, [dst, #-32]
2:
    stp	A_l, A_l, [dst, #-16]
.Ltail15:
    and	count, count, #15
    add	dst, dst, count
    stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
    ret
.Ltail15tiny:
    /* Set up to 15 bytes.  Does not assume earlier memory
       being set.  */
    tbz	count, #3, 1f
    str	A_l, [dst], #8
1:
    tbz	count, #2, 1f
    str	A_lw, [dst], #4
1:
    tbz	count, #1, 1f
    strh	A_lw, [dst], #2
1:
    tbz	count, #0, 1f
    strb	A_lw, [dst]
1:
    ret
    /* Critical loop.  Start at a new cache line boundary.  Assuming
     * 64 bytes per line, this ensures the entire loop is in one line.  */
    .p2align 6
.Lnot_short:
    mov	tmp4, #SMALL_BUFFER_SIZE
    cmp	count, tmp4, lsl #10
    /* Use non-temporal stores if count > SMALL_BUFFER_SIZE KiB.  */
    b.gt	.Lnot_short_nt
    neg	tmp2, dst
    ands	tmp2, tmp2, #15
    b.eq	2f
    /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
     * more than that to set, so we simply store 16 bytes and advance by
     * the amount required to reach alignment.  */
    sub	count, count, tmp2
    stp	A_l, A_l, [dst]
    add	dst, dst, tmp2
    /* There may be fewer than 64 bytes to go now.  */
    cmp	count, #63
    b.le	.Ltail63
2:
    sub	dst, dst, #16		/* Pre-bias.  */
    sub	count, count, #64
1:
    stp	A_l, A_l, [dst, #16]
    stp	A_l, A_l, [dst, #32]
    stp	A_l, A_l, [dst, #48]
    stp	A_l, A_l, [dst, #64]!
    subs	count, count, #64
    b.ge	1b
    tst	count, #0x3f
    add	dst, dst, #16
    b.ne	.Ltail63
    ret
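    /* Non-temporal variant of .Lnot_short for fills above SMALL_BUFFER_SIZE
     * KiB.  stnp carries a non-temporal hint so the data is less likely to
     * be kept in the caches.  stnp has no writeback addressing mode, so dst
     * is advanced with a separate add in the main loop.  */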
.Lnot_short_nt:
    neg	tmp2, dst
    ands	tmp2, tmp2, #15
    b.eq	2f
    /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
     * more than that to set, so we simply store 16 bytes and advance by
     * the amount required to reach alignment.  */
    sub	count, count, tmp2
    stnp	A_l, A_l, [dst]
    add	dst, dst, tmp2
    /* There may be fewer than 64 bytes to go now.  */
    cmp	count, #63
    b.le	.Ltail63
2:
    sub	dst, dst, #16		/* Pre-bias.  */
    sub	count, count, #64
1:
    stnp	A_l, A_l, [dst, #16]
    stnp	A_l, A_l, [dst, #32]
    stnp	A_l, A_l, [dst, #48]
    stnp	A_l, A_l, [dst, #64]
    add	dst, dst, #64
    subs	count, count, #64
    b.ge	1b
    tst	count, #0x3f
    add	dst, dst, #16
    b.ne	.Ltail63
    ret
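    /* Zero fill.  Once the destination is aligned to the DC ZVA block size,
     * whole blocks are cleared with DC ZVA (data cache zero by VA); regions
     * too small to use DC ZVA fall back to the stp/stnp loops above.  */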
.Lzero_mem:
    mov	A_l, #0
    cmp	count, #63
    b.le	.Ltail_maybe_tiny
    neg	tmp2, dst
    ands	tmp2, tmp2, #15
    b.eq	1f
    sub	count, count, tmp2
    stp	A_l, A_l, [dst]
    add	dst, dst, tmp2
    cmp	count, #63
    b.le	.Ltail63
1:
    /* For zeroing small amounts of memory, it's not worth setting up
     * the line-clear code.  */
    cmp	count, #128
    b.lt	.Lnot_short
    mrs	tmp1, dczid_el0
    tbnz	tmp1, #4, .Lnot_short	/* DCZID_EL0.DZP set: DC ZVA prohibited.  */
    mov	tmp3w, #4
    and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
    lsl	zva_len, tmp3w, zva_len	/* Block size is 4 << DCZID_EL0.BS bytes.  */
.Lzero_by_line:
    /* Compute how far we need to go to become suitably aligned.  We're
     * already at quad-word alignment.  */
    cmp	count, zva_len_x
    b.lt	.Lnot_short		/* Not enough to reach alignment.  */
    sub	zva_bits_x, zva_len_x, #1
    neg	tmp2, dst
    ands	tmp2, tmp2, zva_bits_x
    b.eq	1f			/* Already aligned.  */
    /* Not aligned, check that there's enough to zero after alignment.  */
    sub	tmp1, count, tmp2
    cmp	tmp1, #64
    ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 ("lt") if tmp1 < 64.  */
    b.lt	.Lnot_short
    /* We know that there's at least 64 bytes to zero and that it's safe
     * to overrun by 64 bytes.  */
    mov	count, tmp1
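    /* Store 64 bytes per iteration until dst reaches the next ZVA block
     * boundary; tmp2 holds the bytes still needed, and the overrun is
     * corrected by adding the (negative) remainder back to dst.  */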
2:
    stp	A_l, A_l, [dst]
    stp	A_l, A_l, [dst, #16]
    stp	A_l, A_l, [dst, #32]
    subs	tmp2, tmp2, #64
    stp	A_l, A_l, [dst, #48]
    add	dst, dst, #64
    b.ge	2b
    /* We've overrun a bit, so adjust dst downwards.  */
    add	dst, dst, tmp2
1:
    sub	count, count, zva_len_x
3:
    dc	zva, dst
    add	dst, dst, zva_len_x
    subs	count, count, zva_len_x
    b.ge	3b
    ands	count, count, zva_bits_x
    b.ne	.Ltail_maybe_long
    ret
END(__memset_aarch64_nt)