/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#include "../asmdefs.h"

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path.  This option is not intended for production use.  */

/* Arguments and results.  */
#define dstin     x0
#define srcin     x1

/* Locals and temporaries.  */
#define src       x2
#define dst       x3
#define data1     x4
#define data1w    w4
#define data2     x5
#define data2w    w5
#define has_nul1  x6
#define has_nul2  x7
#define tmp1      x8
#define tmp2      x9
#define tmp3      x10
#define tmp4      x11
#define zeroones  x12
#define data1a    x13
#define data2a    x14
#define pos       x15
#define len       x16
#define to_align  x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif

        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
           can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

        /* AArch64 systems have a minimum page size of 4k.  We can do a quick
           page size check for crossing this boundary on entry and if we
           do not, then we can short-circuit much of the entry code.  We
           expect early page-crossing strings to be rare (probability of
           16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
           predictable, even with random strings.

           We don't bother checking for larger page sizes; the cost of setting
           up the correct page size is just not worth the extra gain from
           a small reduction in the cases taking the slow path.  Note that
           we only care about whether the first fetch, which may be
           misaligned, crosses a page boundary - after that we move to aligned
           fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
        /* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
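
/* A minimal, illustrative C sketch of the per-word NUL test used throughout
   this file (not part of the build; has_nul_sketch and x are placeholder
   names).  The expression is non-zero iff some byte of x is zero; REP8_01
   and REP8_7f above supply the replicated 0x01 and 0x7f bytes.

        #include <stdint.h>

        static inline uint64_t has_nul_sketch (uint64_t x)
        {
          // (x - REP8_01) & ~(x | REP8_7f) == (x - REP8_01) & ~x & REP8_80
          return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
        }

   Borrow propagation can spuriously flag lanes above the first zero byte,
   which is why the big-endian paths below recompute the syndrome after a
   byte swap.  */
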
ENTRY (STRCPY)
        /* For moderately short strings, the fastest way to do the copy is to
           calculate the length of the string in the same way as strlen, then
           essentially do a memcpy of the result.  This avoids the need for
           multiple byte copies and further means that by the time we
           reach the bulk copy loop we know we can always use DWord
           accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
           with the same source string, so branch prediction is likely to
           always be difficult - we mitigate this by preferring
           conditional select operations over branches whenever this is
           feasible.  */
        and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
        mov     zeroones, #REP8_01
        and     to_align, srcin, #15
        cmp     tmp2, #(MIN_PAGE_SIZE - 16)
        neg     tmp1, to_align
        /* The first fetch will straddle a (possible) page boundary iff
           srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
           aligned string will never fail the page align check, so will
           always take the fast path.  */
        b.gt    L(page_cross)

L(page_cross_ok):
        ldp     data1, data2, [srcin]
#ifdef __AARCH64EB__
        /* Because we expect the end to be found within 16 characters
           (profiling shows this is the most common case), it's worth
           swapping the bytes now to save having to recalculate the
           termination syndrome later.  We preserve data1 and data2
           so that we can re-use the values later on.  */
        rev     tmp2, data1
        sub     tmp1, tmp2, zeroones
        orr     tmp2, tmp2, #REP8_7f
        bics    has_nul1, tmp1, tmp2
        b.ne    L(fp_le8)
        rev     tmp4, data2
        sub     tmp3, tmp4, zeroones
        orr     tmp4, tmp4, #REP8_7f
#else
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        bics    has_nul1, tmp1, tmp2
        b.ne    L(fp_le8)
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
#endif
        bics    has_nul2, tmp3, tmp4
        b.eq    L(bulk_entry)

        /* The string is short (<=16 bytes).  We don't yet know exactly how
           short, though.  Work out the exact length so that we can
           quickly select the optimal copy strategy.  */
L(fp_gt8):
        rev     has_nul2, has_nul2
        clz     pos, has_nul2
        mov     tmp2, #56
        add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
        sub     pos, tmp2, pos
#ifdef __AARCH64EB__
        lsr     data2, data2, pos
#else
        lsl     data2, data2, pos
#endif
        str     data2, [dst, #1]
        str     data1, [dstin]
#ifdef BUILD_STPCPY
        add     dstin, dst, #8
#endif
        ret

L(fp_le8):
        rev     has_nul1, has_nul1
        clz     pos, has_nul1
        add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
        subs    tmp2, pos, #24                  /* Pos in bits.  */
        b.lt    L(fp_lt4)
#ifdef __AARCH64EB__
        mov     tmp2, #56
        sub     pos, tmp2, pos
        lsr     data2, data1, pos
        lsr     data1, data1, #32
#else
        lsr     data2, data1, tmp2
#endif
        /* 4->7 bytes to copy.  */
        str     data2w, [dst, #-3]
        str     data1w, [dstin]
#ifdef BUILD_STPCPY
        mov     dstin, dst
#endif
        ret
L(fp_lt4):
        cbz     pos, L(fp_lt2)
        /* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
        lsr     data1, data1, #48
#endif
        strh    data1w, [dstin]
        /* Fall-through, one byte (max) to go.  */
L(fp_lt2):
        /* Null-terminated string.  Last character must be zero!  */
        strb    wzr, [dst]
#ifdef BUILD_STPCPY
        mov     dstin, dst
#endif
        ret

        .p2align 6
        /* Aligning here ensures that the entry code and main loop all lie
           within one 64-byte cache line.  */
L(bulk_entry):
        sub     to_align, to_align, #16
        stp     data1, data2, [dstin]
        sub     src, srcin, to_align
        sub     dst, dstin, to_align
        b       L(entry_no_page_cross)

        /* The inner loop deals with two Dwords at a time.  This has a
           slightly higher start-up cost, but we should win quite quickly,
           especially on cores with a high number of issue slots per
           cycle, as we get much better parallelism out of the operations.  */
L(main_loop):
        stp     data1, data2, [dst], #16
L(entry_no_page_cross):
        ldp     data1, data2, [src], #16
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
        bic     has_nul1, tmp1, tmp2
        bics    has_nul2, tmp3, tmp4
        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
        b.eq    L(main_loop)

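        /* Illustrative only: roughly what the loop above does, written as a
           C sketch (bulk_copy_sketch and its locals are placeholders, not
           symbols used by this file).  In the real code the 16-byte loads
           are aligned, so they cannot fault even when they read past the
           terminating NUL.

                #include <stdint.h>
                #include <string.h>

                static void bulk_copy_sketch (char *dst, const char *src)
                {
                  uint64_t d1, d2;
                  for (;;)
                    {
                      memcpy (&d1, src, 8);
                      memcpy (&d2, src + 8, 8);
                      if (((d1 - 0x0101010101010101ULL) & ~(d1 | 0x7f7f7f7f7f7f7f7fULL))
                          | ((d2 - 0x0101010101010101ULL) & ~(d2 | 0x7f7f7f7f7f7f7f7fULL)))
                        break;          // a NUL lies within these 16 bytes
                      memcpy (dst, &d1, 8);
                      memcpy (dst + 8, &d2, 8);
                      src += 16;
                      dst += 16;
                    }
                }
           */
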
        /* Since we know we are copying at least 16 bytes, the fastest way
           to deal with the tail is to determine the location of the
           trailing NUL, then (re)copy the 16 bytes leading up to that.  */
        cmp     has_nul1, #0
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul directly.  The
           easiest way to get the correct byte is to byte-swap the data
           and calculate the syndrome a second time.  */
        csel    data1, data1, data2, ne
        rev     data1, data1
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        bic     has_nul1, tmp1, tmp2
#else
        csel    has_nul1, has_nul1, has_nul2, ne
#endif
        rev     has_nul1, has_nul1
        clz     pos, has_nul1
        add     tmp1, pos, #72
        add     pos, pos, #8
        csel    pos, pos, tmp1, ne
        add     src, src, pos, lsr #3
        add     dst, dst, pos, lsr #3
        ldp     data1, data2, [src, #-32]
        stp     data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
        sub     dstin, dst, #1
#endif
        ret

L(page_cross):
        bic     src, srcin, #15
        /* Start by loading two words at [srcin & ~15], then forcing the
           bytes that precede srcin to 0xff.  This means they never look
           like termination bytes.  */
        ldp     data1, data2, [src]
        lsl     tmp1, tmp1, #3  /* Bytes beyond alignment -> bits.  */
        tst     to_align, #7
        csetm   tmp2, ne
#ifdef __AARCH64EB__
        lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#else
        lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#endif
        orr     data1, data1, tmp2
        orr     data2a, data2, tmp2
        cmp     to_align, #8
        csinv   data1, data1, xzr, lt
        csel    data2, data2, data2a, lt
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
        bic     has_nul1, tmp1, tmp2
        bics    has_nul2, tmp3, tmp4
        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
        b.eq    L(page_cross_ok)
        /* We now need to make data1 and data2 look like they've been
           loaded directly from srcin.  Do a rotate on the 128-bit value.  */
        lsl     tmp1, to_align, #3      /* Bytes->bits.  */
        neg     tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
        lsl     data1a, data1, tmp1
        lsr     tmp4, data2, tmp2
        lsl     data2, data2, tmp1
        orr     tmp4, tmp4, data1a
        cmp     to_align, #8
        csel    data1, tmp4, data2, lt
        rev     tmp2, data1
        rev     tmp4, data2
        sub     tmp1, tmp2, zeroones
        orr     tmp2, tmp2, #REP8_7f
        sub     tmp3, tmp4, zeroones
        orr     tmp4, tmp4, #REP8_7f
#else
        lsr     data1a, data1, tmp1
        lsl     tmp4, data2, tmp2
        lsr     data2, data2, tmp1
        orr     tmp4, tmp4, data1a
        cmp     to_align, #8
        csel    data1, tmp4, data2, lt
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
#endif
        bic     has_nul1, tmp1, tmp2
        cbnz    has_nul1, L(fp_le8)
        bic     has_nul2, tmp3, tmp4
        b       L(fp_gt8)

END (STRCPY)
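
/* Illustrative only: how the routine above might be exercised from C in a
   quick, hypothetical smoke test (smoke_test is not part of this repo, and
   this assumes both variants have been assembled - once with and once
   without BUILD_STPCPY - so that both __strcpy_aarch64 and __stpcpy_aarch64
   are available).

        #include <assert.h>
        #include <string.h>

        char *__strcpy_aarch64 (char *dst, const char *src);
        char *__stpcpy_aarch64 (char *dst, const char *src);

        static void smoke_test (void)
        {
          char buf[32];
          assert (__strcpy_aarch64 (buf, "hello") == buf);      // returns start
          assert (strcmp (buf, "hello") == 0);
          assert (__stpcpy_aarch64 (buf, "hello") == buf + 5);  // returns end (the NUL)
        }
   */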