/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
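
/* Rough control-flow map (added commentary, not original Linaro text):
   non-zero fill values use ordinary stores throughout -- .Ltail15tiny for
   15 bytes or fewer, .Ltail63 for fewer than 64, and the 64-byte store
   loop at .Lnot_short otherwise.  A zero fill value (or the BZERO entry
   point) goes through .Lzero_mem, which uses the same paths for small
   and medium counts and switches to the DC ZVA line-clearing loop at
   .Lzero_by_line once the count reaches 128 bytes and a usable ZVA block
   length is known.  */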

#define dstin           x0
#ifdef BZERO
#define count           x1
#else
#define count           x2
#endif
#define val             w1
#define tmp1            x3
#define tmp1w           w3
#define tmp2            x4
#define tmp2w           w4
#define zva_len_x       x5
#define zva_len         w5
#define zva_bits_x      x6

#define A_l             x7
#define A_lw            w7
#define dst             x8
#define tmp3w           w9

#ifdef BZERO
ENTRY(bzero)
#else
ENTRY(memset)
#endif

        mov     dst, dstin              /* Preserve return value.  */
#ifdef BZERO
        b       .Lzero_mem
#endif
        ands    A_lw, val, #255
        b.eq    .Lzero_mem
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
        cmp     count, #64
        b.ge    .Lnot_short
.Ltail_maybe_tiny:
        cmp     count, #15
        b.le    .Ltail15tiny
.Ltail63:
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

.Ltail15:
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        ret

.Ltail15tiny:
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
        .p2align 6
.Lnot_short:
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63
2:
        sub     dst, dst, #16           /* Pre-bias.  */
        sub     count, count, #64
1:
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        stp     A_l, A_l, [dst, #48]
        stp     A_l, A_l, [dst, #64]!
        subs    count, count, #64
        b.ge    1b
        tst     count, #0x3f
        add     dst, dst, #16
        b.ne    .Ltail63
        ret

        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
.Lzero_mem:
        mov     A_l, #0
        cmp     count, #63
        b.le    .Ltail_maybe_tiny
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    .Ltail63
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    .Lnot_short
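
        /* The block that follows reads DCZID_EL0 to decide whether DC ZVA
           may be used and how large its block is.  As an illustrative
           sketch only (not assembled here; the helper name is invented for
           this comment), the same decode in C looks like:

               #include <stdint.h>

               // Returns the DC ZVA block size in bytes, or 0 if DC ZVA is
               // prohibited (DZP, bit 4, is set).
               static inline uint64_t dc_zva_block_size(void) {
                   uint64_t dczid;
                   __asm__("mrs %0, dczid_el0" : "=r"(dczid));
                   if (dczid & (1u << 4)) return 0;
                   return 4u << (dczid & 0xf);  // BS field: log2(words).
               }

           which matches the mov/and/lsl sequence below: the block is 2^BS
           words of 4 bytes, i.e. 4 << BS bytes.  */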
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
        adrp    tmp2, .Lcache_clear
        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
        tbnz    zva_len, #31, .Lnot_short
        cbnz    zva_len, .Lzero_by_line
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
        b       .Lnot_short
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, .Lnot_short
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
        cmp     count, zva_len_x
        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to zero after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000; if tmp1 < 64 this
                                           forces 'lt' so we fall back.  */
        b.lt    .Lnot_short
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
1:
        sub     count, count, zva_len_x
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
#ifdef BZERO
END(bzero)
#else
END(memset)
#endif

#ifdef MAYBE_VIRT
        .bss
        .p2align 2
.Lcache_clear:
        .space 4
#endif
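
/* Note on the MAYBE_VIRT cache above (added commentary): .Lcache_clear
   holds zero while the DC ZVA capability has not yet been probed, ~0
   (detected via bit 31) once DC ZVA has been found to be prohibited, and
   otherwise the ZVA block length in bytes; the tbnz/cbnz pair at the top
   of the MAYBE_VIRT block handles these three cases.  */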