/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */

#define dstin       x0
#define val         w1
#define count       x2
#define dst_count   x3 /* for __memset_chk */
#define tmp1        x3
#define tmp1w       w3
#define tmp2        x4
#define tmp2w       w4
#define zva_len_x   x5
#define zva_len     w5
#define zva_bits_x  x6

#define A_l         x7
#define A_lw        w7
#define dst         x8
#define tmp3w       w9

#define QA_l        q0

ENTRY(__memset_chk)
        cmp     count, dst_count
        bls     memset

        // Preserve for accurate backtrace.
        stp     x29, x30, [sp, -16]!
        .cfi_def_cfa_offset 16
        .cfi_rel_offset x29, 0
        .cfi_rel_offset x30, 8

        bl      __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

        mov     dst, dstin              /* Preserve return value.  */
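        /* If the fill byte is zero (and DONT_USE_DC is not defined), the
           DC ZVA zeroing path below is taken; otherwise the byte is
           replicated across A_l, and later across v0, for the store-pair
           loops.  */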
        ands    A_lw, val, #255
#ifndef DONT_USE_DC
        b.eq    .Lzero_mem
#endif
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
        cmp     count, #256
        b.ge    .Lnot_short
.Ltail_maybe_tiny:
        cmp     count, #15
        b.le    .Ltail15tiny
.Ltail255:
        ands    tmp1, count, #0xC0
        b.eq    .Ltail63
        dup     v0.4s, A_lw
        cmp     tmp1w, #0x80
        b.eq    1f
        b.lt    2f
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
1:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
.Ltail63:
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

.Ltail15:
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        ret

.Ltail15tiny:
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
        .p2align 6
.Lnot_short:
        dup     v0.4s, A_lw
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be less than 256 bytes to go now.  */
        cmp     count, #255
        b.le    .Ltail255
2:
        cmp     count, #2097152
        b.gt    3f
1:
        sub     count, count, #256
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        subs    count, count, #256
        b.ge    2b
        tst     count, #0xff
        b.ne    .Ltail255
        ret
3:
        sub     count, count, #64
4:
        subs    count, count, #64
        stnp    QA_l, QA_l, [dst]
        stnp    QA_l, QA_l, [dst, #32]
        add     dst, dst, #64
        b.ge    4b
        tst     count, #0x3f
        b.ne    .Ltail63
        ret

#ifndef DONT_USE_DC
        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
.Lzero_mem:
        mov     A_l, #0
        cmp     count, #63
        b.le    .Ltail_maybe_tiny
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    .Ltail63
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    .Lnot_short
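        /* DCZID_EL0 decode used below: bit 4 (DZP) set means DC ZVA is
           prohibited, and bits 3:0 (BS) hold log2 of the block size in
           words, so the ZVA length in bytes is 4 << BS.  */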
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
        adrp    tmp2, .Lcache_clear
        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
        tbnz    zva_len, #31, .Lnot_short
        cbnz    zva_len, .Lzero_by_line
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
        b       .Lnot_short
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, .Lnot_short
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
        cmp     count, zva_len_x
        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to copy after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 (N set) so b.lt below
                                           is taken when tmp1 < 64.  */
        b.lt    .Lnot_short
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
1:
        sub     count, count, zva_len_x
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
END(memset)

#ifdef MAYBE_VIRT
        .bss
        .p2align 2
.Lcache_clear:
        .space 4
#endif
#endif /* DONT_USE_DC */