/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
#include <private/libc_events.h>

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

        .syntax unified

ENTRY(__memcpy_chk)
        cmp     r2, r3
        bhi     __memcpy_chk_fail

        // Fall through to memcpy...
END(__memcpy_chk)

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         */
        stmfd   sp!, {r0, r4, lr}
        .cfi_def_cfa_offset 12
        .cfi_rel_offset r0, 0
        .cfi_rel_offset r4, 4
        .cfi_rel_offset lr, 8
        /* Making room for r5-r11 which will be spilled later */
        sub     sp, sp, #28
        .cfi_adjust_cfa_offset 28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        pld     [r0, #0]
        pld     [r1, #0]
        pld     [r1, #32]

        /* it simplifies things to take care of len<4 early */
        cmp     r2, #4
        blo     .Lcopy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     .Lsrc_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrbmi  r3, [r1], #1
        ldrbcs  r4, [r1], #1
        ldrbcs  r12, [r1], #1
        strbmi  r3, [r0], #1
        strbcs  r4, [r0], #1
        strbcs  r12, [r0], #1

.Lsrc_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     .Lnon_congruent

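        /* Quick illustration of the congruence test above (a side note only,
         * not part of the routine): at this point src has just been word
         * aligned, so in C terms
         *
         *      ((dst ^ src) & 3) == (dst & 3)
         *
         * and the branch simply asks whether dst is word aligned as well.
         * E.g. src = 0x1000, dst = 0x2004: the low two bits of the XOR are 0,
         * so both pointers advance through words together and the word-copy
         * path below is used; with dst = 0x2005 the low bits differ and we
         * fall into .Lnon_congruent instead.
         */
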
        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     .Lcongruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs    r12, r3, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi   r1!, {r8, r9}           /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /*  4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

.Lcongruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

.Lcached_aligned32:
        subs    r2, r2, #32
        blo     .Lless_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic     r12, r1, #0x1F
        add     r12, r12, #64

1:      ldmia   r1!, {r4-r11}
        pld     [r12, #64]
        subs    r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is if the caller overstates the length.
        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, {r4-r11}
        bhs     1b

        add     r2, r2, #32

.Lless_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
        movs    r12, r2, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi   r1!, {r8, r9}           /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrhmi  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strhmi  r4, [r0], #2
        tst     r2, #0x1
        ldrbne  r3, [r1]                /* last byte */
        strbne  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /*********************************************************************/

.Lnon_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     .Lcopy_last_3_and_return

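        /* How the "shift queue" mentioned above works, in rough C terms (an
         * illustrative sketch only; the register names refer to the code
         * below):
         *
         *      right = 8 * (bytes stored to word-align dst);   // r12
         *      left  = 32 - right;                             // lr
         *      carry = first_word >> right;                    // r3
         *      while (words left) {
         *          next   = *src++;
         *          *dst++ = carry | (next << left);
         *          carry  = next >> right;
         *      }
         *
         * i.e. every word written combines the leftover bytes of the previous
         * load with the low bytes of the next one.
         */
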
        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left */

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strbmi  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     .Lpartial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     .Lpartial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     .Lless_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to a 50%
         * performance hit.
         */

        cmp     r12, #24
        beq     .Lloop24
        cmp     r12, #8
        beq     .Lloop8

.Lloop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       .Lless_than_thirtytwo

.Lloop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       .Lless_than_thirtytwo

.Lloop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #24
        bhs     1b

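        /* The three loops above differ only in the immediate shift amounts.
         * For the 16-bit case (.Lloop16), each output word is, in rough C
         * terms (illustrative only):
         *
         *      out   = carry | (next << 16);
         *      carry = next >> 16;
         *
         * unrolled eight times so that a full 32-byte line is read and
         * written per iteration. .Lloop24 falls straight through to the
         * tail handling below.
         */
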
.Lless_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     .Lpartial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

.Lpartial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strbmi  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

.Lcopy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrbmi  r2, [r1], #1
        ldrbcs  r3, [r1], #1
        ldrbcs  r12, [r1]
        strbmi  r2, [r0], #1
        strbcs  r3, [r0], #1
        strbcs  r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)

        // Only reached when the __memcpy_chk check fails.
ENTRY_PRIVATE(__memcpy_chk_fail)
        // Preserve lr for backtrace.
        push    {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        ldr     r0, error_message
        ldr     r1, error_code
        // In ARM state pc reads as the address of label 1 plus 8, so adding
        // it to the pc-relative offset stored at error_message yields the
        // absolute address of error_string.
1:
        add     r0, pc
        bl      __fortify_chk_fail
error_code:
        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
        .word   error_string-(1b+8)
END(__memcpy_chk_fail)

        .data
error_string:
        .string "memcpy: prevented write past end of buffer"
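
/*
 * For reference, the __memcpy_chk entry point above behaves roughly like the
 * following C (an illustrative sketch only; the parameter names are made up
 * here, and the real FORTIFY plumbing lives in the bionic headers):
 *
 *      void* __memcpy_chk(void* dst, const void* src,
 *                         size_t count, size_t dst_len) {
 *          if (count > dst_len)
 *              __memcpy_chk_fail();    // passes the message and event code
 *                                      // to __fortify_chk_fail(), which aborts
 *          return memcpy(dst, src, count);
 *      }
 *
 * In the assembly the successful case simply falls through from
 * __memcpy_chk into memcpy instead of making a call.
 */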