/*
 * Copyright (C) 2013 The Android Open Source Project
 * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         *
         * The directives below select Cortex-A15 as the target CPU, enable
         * NEON instructions, and switch the assembler to unified syntax for
         * everything that follows in this file.
         *
         * NOTE(review): ENTRY()/END() are not defined in this file; they are
         * presumably macros supplied by <private/bionic_asm.h> -- confirm.
         */

        .cpu        cortex-a15
        .fpu        neon
        .syntax     unified

/*
 * __memset_chk: size-checked entry point placed in front of memset.
 *
 * In:    r0, r1 = not read or written here; they reach memset unchanged
 *        r2     = byte count
 *        r3     = upper bound the byte count is checked against
 *                 (NOTE(review): presumably the size of the destination
 *                 object as supplied by the caller -- confirm against the
 *                 C prototype)
 * Out:   count <= bound: branches (not calls) to memset, so memset returns
 *        directly to our caller.
 *        count >  bound: calls __memset_chk_fail.  No instruction follows
 *        that call, so __memset_chk_fail is expected not to return.
 * Stack: 4 bytes (lr) on the failure path only.
 */
ENTRY(__memset_chk)
        // Unsigned comparison: "lower or same" means the count fits.
        cmp         r2, r3
        bls         memset

        // Preserve lr for backtrace.
        push        {lr}
        // Describe the push to the unwinder: CFA is now sp + 4 and the
        // saved lr sits at offset 0 from the current sp.
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        bl          __memset_chk_fail
END(__memset_chk)

/*
 * memset: store the low byte of r1 into r2 consecutive bytes starting at r0.
 *
 * In:    r0 = destination pointer
 *        r1 = fill value (only bits 7:0 are used)
 *        r2 = byte count
 * Out:   r0 = destination pointer; r0 is never written in this function
 * Clobb: r1, r2, r3, ip, q0-q3, condition flags
 * Stack: not used; both exits return with bx lr
 *
 * Strategy:
 *   count <  16  scalar stores only, no alignment work
 *   count >= 16  scalar stores up to the next 16-byte boundary, then a
 *                128-byte-per-iteration NEON loop, then a tail that peels
 *                64/32/16/8/4/2/1 bytes according to the bits of the
 *                remaining count
 *
 * The tails use "movs ip, rN, lsl #k", which leaves C = bit (32 - k) of rN,
 * N = bit (31 - k) of rN, and Z set when bits (31 - k)..0 of rN are all
 * clear.  One shift therefore drives up to three decisions.  None of the
 * scalar or NEON stores placed between a shift and the branches that
 * consume its flags modify the flags.
 */
ENTRY(memset)
        // Hint that the start of the buffer is about to be written.
        pldw        [r0]
        // r3 is the advancing store pointer so that r0 survives as the
        // return value.
        mov         r3, r0

        // Duplicate the low byte of r1 into all four bytes of r1:
        //   0x??????ab -> 0xab000000 -> 0xabab0000 -> 0xabababab
        mov         r1, r1, lsl #24
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

        // Unsigned compare; short requests take the scalar-only path.
        cmp         r2, #16
        blo         .L_less_than_16

        // This section handles regions 16 bytes or larger
        //
        // Use aligned vst1.8 and vstm when possible.  Register values will be:
        //   ip is scratch
        //   q0, q1, and r1 contain the memset value
        //   r2 is the number of bytes to set
        //   r3 is the advancing destination pointer
        vdup.32     q0, r1

        // ip = misalignment of the destination within a 16-byte block.
        ands        ip, r3, #0xF
        beq         .L_memset_aligned

        // Align dest pointer to 16-byte boundary.
        // ip becomes the number of bytes (1..15) needed to reach it.
        pldw        [r0, #64]
        rsb         ip, ip, #16

        // Pre-adjust the byte count to reflect post-alignment value.  Expecting
        // 8-byte alignment to be rather common so we special case that one.
        // r2 >= 16 and ip <= 15 here, so r2 stays at least 1.
        sub         r2, r2, ip

        // Stores go smallest-first so each one is naturally aligned for its
        // size by the time it executes.
        /* set 1 byte */
        tst         ip, #1
        it          ne
        strbne      r1, [r3], #1
        /* set 2 bytes */
        tst         ip, #2
        it          ne
        strhne      r1, [r3], #2
        /* set 4 bytes */
        // After the shift: N = bit 2 of ip (4 bytes), C = bit 3 of ip (8 bytes).
        movs        ip, ip, lsl #29
        it          mi
        strmi       r1, [r3], #4
        /* set 8 bytes */
        itt         cs
        strcs       r1, [r3], #4
        strcs       r1, [r3], #4

.L_memset_aligned:
        // Destination is now 16-byte aligned.  Determine how to handle
        // remaining bytes.
        vmov        q1, q0
        cmp         r2, #128
        blo         .L_less_than_128

        // We need to set a larger block of memory.  Use four Q regs to
        // set a full cache line in one instruction.  Pre-decrement
        // r2 to simplify end-of-loop detection
        vmov        q2, q0
        vmov        q3, q0
        pldw        [r0, #128]
        sub         r2, r2, #128
        .align 4
.L_memset_loop_128:
        // Each iteration stores 2 x 64 = 128 bytes.  r2 holds
        // (bytes remaining - 128) on entry, so the subs below produces a
        // borrow (C clear) exactly when fewer than 128 bytes are left after
        // this iteration's stores.
        // NOTE(review): the prefetch address can lie beyond the end of the
        // buffer; this is presumably harmless because pldw is only a hint.
        pldw        [r3, #192]
        vstm        r3!, {q0, q1, q2, q3}
        vstm        r3!, {q0, q1, q2, q3}
        subs        r2, r2, #128
        bhs         .L_memset_loop_128

        // Un-bias r2 so it contains the number of bytes left.  Early
        // exit if we are done.
        adds        r2, r2, #128
        beq         2f

        .align 4
.L_less_than_128:
        // Here r2 < 128 and r3 is 16-byte aligned.
        // set 64 bytes
        // C = bit 6 of r2 (64), N = bit 5 (32), Z = bits 5..0 all clear.
        movs        ip, r2, lsl #26
        bcc         1f
        vst1.8      {q0, q1}, [r3, :128]!
        vst1.8      {q0, q1}, [r3, :128]!
        // Z still comes from the movs above: nothing below 64 remains.
        beq         2f
1:
        // set 32 bytes
        bpl         1f
        vst1.8      {q0, q1}, [r3, :128]!
1:
        // set 16 bytes
        // C = bit 4 of r2 (16), N = bit 3 (8), Z = bits 3..0 all clear.
        movs        ip, r2, lsl #28
        bcc         1f
        vst1.8      {q0}, [r3, :128]!
        beq         2f
1:
        // set 8 bytes
        bpl         1f
        vst1.8      {d0}, [r3, :64]!
1:
        // set 4 bytes
        tst         r2, #4
        it          ne
        strne       r1, [r3], #4
        // set 2 bytes
        // C = bit 1 of r2 (2), N = bit 0 (1).
        movs        ip, r2, lsl #31
        it          cs
        strhcs      r1, [r3], #2
        // set 1 byte
        it          mi
        strbmi      r1, [r3]
2:
        bx          lr

.L_less_than_16:
        // Store up to 15 bytes without worrying about byte alignment
        // NOTE(review): the word stores below may hit unaligned addresses,
        // so this path assumes the core permits unaligned str -- confirm for
        // every target this file is built for.
        // C = bit 3 of r2 (8), N = bit 2 (4), Z = bits 2..0 all clear.
        movs        ip, r2, lsl #29
        bcc         1f
        str         r1, [r3], #4
        str         r1, [r3], #4
        // Z still comes from the movs above: count was exactly 8.
        beq         2f
1:
        it          mi
        strmi       r1, [r3], #4
        // N = bit 0 of r2 (1 byte), C = bit 1 of r2 (2 bytes, stored as two
        // single bytes because r3 may be odd here).
        movs        ip, r2, lsl #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]
2:
        bx          lr
END(memset)
