/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */

.syntax unified

ENTRY(memcmp)
        pld     [r0, #(CACHE_LINE_SIZE * 0)]
        pld     [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same
         * (the length == 0 case is handled at 10 below)
         */
        cmp     r0, r1
        moveq   r0, #0
        bxeq    lr

        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     10f
/*
 * NEON optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs    r2, r2, #32
        blo     3f

        /* preload all the cache lines we need. */
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8  {d0 - d3}, [r0]!
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8  {d4 - d7}, [r1]!
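        /* Both 32-byte blocks are now in q0-q3.  The byte-wise subtracts
         * below leave a non-zero lane wherever the inputs differ; two vorr
         * reductions then funnel all 32 lanes into d4, so a single orrs on
         * the core registers can test the whole block at once.
         */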
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8 q0, q2
        vsub.i8 q1, q3
        vorr    q2, q0, q1
        vorr    d4, d5
        vmov    r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs    r3, ip
        bne     2f
        subs    r2, r2, #32
        bhs     1b
        b       3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub     r0, #32
        vorr    d0, d1
        sub     r1, #32
        vmov    r3, ip, d0
        orrs    r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt    eq
        subeq   r2, #16
        addeq   r0, #16
        addeq   r1, #16

3:      /* fix-up the remaining count */
        add     r2, r2, #32

        cmp     r2, #(8+4)
        bmi     10f
#endif

        /* save registers */
        stmfd   sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov     r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align first pointer */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     1f

        /* 8 words per iteration; ip and lr alternate as the word read
         * one step ahead, hiding the load latency.
         */
0:      pld     [r4, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eorseq  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eorseq  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eorseq  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eorseq  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eorseq  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eorseq  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eorseq  r0, r0, lr
        bne     2f
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b
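        /* the word loops above keep r2 biased down by 4 (one word is
         * read ahead on entry); the adds below restores the exact
         * remaining byte count
         */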
        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different, restart them */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return */
        ldmfd   sp!, {r4, lr}
        bx      lr

10:     /* process less than 12 bytes */
        cmp     r2, #0
        moveq   r0, #0
        bxeq    lr
        mov     r3, r0
11:
        ldrb    r0, [r3], #1
        ldrb    ip, [r1], #1
        subs    r0, ip
        bxne    lr
        subs    r2, r2, #1
        bne     11b
        bx      lr

5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

        /* lr always holds the last aligned word loaded from r1; each
         * step rebuilds the unaligned word in ip from two half-words.
         */
6:      pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r4, #(CACHE_LINE_SIZE * 2)]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eorseq  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eorseq  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eorseq  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = right-shift amount
        // r6 = left-shift amount
        // r7 = scratch

        mov     r5, r0, lsl #3          /* r5 = right shift */
        rsb     r6, r5, #32             /* r6 = left shift */

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8

6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eorseq  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b
END(memcmp)
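/*
 * For reference, a minimal C model of the shift-and-merge trick used in
 * the offset-1/3 path above (illustrative only, not part of the build;
 * function and variable names are hypothetical; assumes a little-endian
 * target, as this file does):
 *
 *     #include <stdint.h>
 *
 *     // Read the 32-bit word starting 'off' bytes (1..3) into 'aligned',
 *     // using only word-aligned loads -- the C equivalent of the r5/r6
 *     // shift amounts and the r7 read-ahead word in the loop at 6 above.
 *     static uint32_t load_shifted(const uint32_t *aligned, unsigned off)
 *     {
 *         uint32_t lo = aligned[0];          // like r7 before the loop
 *         uint32_t hi = aligned[1];          // like the next ldr into r7
 *         unsigned rshift = 8 * off;         // r5
 *         return (lo >> rshift) | (hi << (32 - rshift));  // ip
 *     }
 */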