/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>
#include <private/libc_events.h>

        .text
        .syntax unified
        .fpu    neon

/* Cache geometry and the size thresholds that select a prefetch strategy. */
#define CACHE_LINE_SIZE (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID (1048576)
#define PREFETCH_DISTANCE_NEAR (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR (CACHE_LINE_SIZE*16)

/*
 * void *memmove(void *dst, const void *src, size_t n)
 *
 * ABI:      AAPCS (32-bit ARM), unified syntax, NEON required.
 * In:       r0 = dst, r1 = src, r2 = n (bytes)
 * Out:      r0 = dst (original value)
 * Clobbers: r3, ip, q0-q3, q8-q11, flags.
 *
 * When dst <= src, or when the regions do not overlap, a forward copy is
 * safe and the work is tail-called to memcpy.  Otherwise dst lies inside
 * [src, src+n), so the copy is done backwards (highest address first) so
 * that every source byte is read before it can be overwritten.
 */
ENTRY(memmove)
        /* Nothing to do for n == 0 or dst == src. */
        cmp     r2, #0
        cmpne   r0, r1
        bxeq    lr
        /* r3 = dst - src.  dst <= src (unsigned): forward copy is safe. */
        subs    r3, r0, r1
        bls     .L_jump_to_memcpy
        /* dst > src: the regions overlap only if n > dst - src. */
        cmp     r2, r3
        bhi     .L_reversed_memcpy

.L_jump_to_memcpy:
        b       memcpy

.L_reversed_memcpy:
        /* Save the original dst so it can be returned in r0, and lr. */
        push    {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Copying backwards: point both pointers one past the end. */
        add     r0, r0, r2
        add     r1, r1, r2

        /* preload next cache line */
        pld     [r1, #-CACHE_LINE_SIZE]
        pld     [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32bytes) asap */
        cmp     r2, #32
        blo     .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp     r2, #128
        blo     .L_reversed_memcpy_lt_128bytes
        /* align destination to 64 bytes (1 cache line) */
        ands    r3, r0, #0x3f
        beq     .L_reversed_memcpy_dispatch
        /* r3 = tail-byte count needed to reach a 64-byte-aligned dst.
         * Each MOVS below exposes two of its bits in the flags
         * (N = shifted-in sign bit, C = last bit shifted out), which
         * gate the conditional 1/2/4/8/16/32-byte copy steps. */
        sub     r2, r2, r3
0:      /* copy 1 byte */
        movs    ip, r3, lsl #31         /* N = bit 0, C = bit 1 */
        ldrbmi  ip, [r1, #-1]!
        strbmi  ip, [r0, #-1]!
1:      /* copy 2 bytes */
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
2:      /* copy 4 bytes */
        movs    ip, r3, lsl #29         /* N = bit 2, C = bit 3 */
        bpl     3f
        sub     r1, r1, #4
        sub     r0, r0, #4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes */
        bcc     4f
        sub     r1, r1, #8
        sub     r0, r0, #8
        vld1.8  {d0}, [r1]
        vst1.8  {d0}, [r0, :64]
4:      /* copy 16 bytes */
        movs    ip, r3, lsl #27         /* N = bit 4, C = bit 5 */
        bpl     5f
        sub     r1, r1, #16
        sub     r0, r0, #16
        vld1.8  {q0}, [r1]
        vst1.8  {q0}, [r0, :128]
5:      /* copy 32 bytes */
        bcc     .L_reversed_memcpy_dispatch
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0, :256]

.L_reversed_memcpy_dispatch:
        /* preload more cache lines */
        pld     [r1, #-CACHE_LINE_SIZE*3]
        pld     [r1, #-CACHE_LINE_SIZE*4]

        /* Pick the main loop by total size: each variant uses a prefetch
         * distance tuned for that block-size range. */
        cmp     r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo     .L_reversed_memcpy_neon_pld_near
        cmp     r2, #MEMCPY_BLOCK_SIZE_MID
        blo     .L_reversed_memcpy_neon_pld_mid
        b       .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        subs    r2, r2, #128            /* r2 stays biased by -128 in the loop */
        blo     1f
        sub     r1, r1, #32
        sub     r0, r0, #32
        mov     r3, #-32                /* post-index step: walk downwards */
        .align 4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        bhs     0b                      /* loop while the biased count has not borrowed */
        add     r1, r1, #32             /* undo the pre-loop -32 adjustment */
        add     r0, r0, #32
1:
        adds    r2, r2, #128            /* undo the bias; Z set => nothing left */
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}                /* return original dst */

.L_reversed_memcpy_neon_pld_mid:
        /* r2 >= MEMCPY_BLOCK_SIZE_SMALL here, so the first bias cannot borrow. */
        subs    r2, r2, #128
        sub     r1, r1, #32
        sub     r0, r0, #32
        mov     r3, #-32                /* post-index step: walk downwards */
        .align 4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        bhs     0b
        add     r1, r1, #32
        add     r0, r0, #32
1:
        adds    r2, r2, #128            /* undo the bias; Z set => nothing left */
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}                /* return original dst */

.L_reversed_memcpy_neon_pld_far:
        /* r2 >= MEMCPY_BLOCK_SIZE_MID here, so the bias cannot borrow. */
        sub     r2, r2, #128
        sub     r0, r0, #128
        sub     r1, r1, #128
        .align 4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld     [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read: four post-incremented 32-byte loads advance r1 by +128 */
        vld1.8  {q0, q1}, [r1]!
        vld1.8  {q2, q3}, [r1]!
        vld1.8  {q8, q9}, [r1]!
        vld1.8  {q10, q11}, [r1]!
        /* write */
        vst1.8  {q0, q1}, [r0, :256]!
        vst1.8  {q2, q3}, [r0, :256]!
        vst1.8  {q8, q9}, [r0, :256]!
        vst1.8  {q10, q11}, [r0, :256]!

        /* net movement per iteration: +128 - 256 = -128 bytes */
        sub     r0, r0, #256
        sub     r1, r1, #256
        bhs     0b
        add     r0, r0, #128            /* undo the pre-loop -128 adjustment */
        add     r1, r1, #128
1:
        adds    r2, r2, #128            /* undo the bias; Z set => nothing left */
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}                /* return original dst */

.L_reversed_memcpy_lt_128bytes:
        /* r2 < 128 here; its bits select the 64/32/16/8/4/2/1-byte steps.
         * Destination alignment is no longer guaranteed, so the vector
         * stores below carry no alignment qualifier. */
6:      /* copy 64 bytes */
        movs    ip, r2, lsl #26         /* N = bit 5 (32B), C = bit 6 (64B) */
        bcc     5f
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
5:      /* copy 32 bytes */
        bpl     4f
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes */
        movs    ip, r2, lsl #28         /* N = bit 3 (8B), C = bit 4 (16B) */
        bcc     3f
        sub     r1, r1, #16
        sub     r0, r0, #16
        vld1.8  {q0}, [r1]
        vst1.8  {q0}, [r0]
3:      /* copy 8 bytes */
        bpl     2f
        sub     r1, r1, #8
        sub     r0, r0, #8
        vld1.8  {d0}, [r1]
        vst1.8  {d0}, [r0]
2:      /* copy 4 bytes */
        ands    ip, r2, #0x4
        beq     1f
        sub     r1, r1, #4
        sub     r0, r0, #4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes */
        movs    ip, r2, lsl #31         /* N = bit 0 (1B), C = bit 1 (2B) */
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
0:      /* copy 1 byte */
        ldrbmi  ip, [r1, #-1]!
        strbmi  ip, [r0, #-1]!

        pop     {r0, pc}                /* return original dst */

END(memmove)