/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */
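/*
 * On entry (per the AAPCS calling convention):
 *   r0 = lhs pointer
 *   r1 = rhs pointer
 *   r2 = byte count
 * and the result is returned in r0, as for the C prototype
 * int memcmp(const void* lhs, const void* rhs, size_t n).
 */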

.syntax unified

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
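/*
 * Rough C equivalent of one iteration of the NEON loop below
 * (an illustrative sketch only; the variable names are hypothetical):
 * load 32 bytes from each side, subtract bytewise, then OR everything
 * together so a single scalar test tells us whether any byte differed:
 *
 *   uint8x16_t a0 = vld1q_u8(lhs),  a1 = vld1q_u8(lhs + 16);
 *   uint8x16_t b0 = vld1q_u8(rhs),  b1 = vld1q_u8(rhs + 16);
 *   uint8x16_t d  = vorrq_u8(vsubq_u8(a0, b0), vsubq_u8(a1, b1));
 *   // any non-zero lane of d means some byte differed
 */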
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16
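        /* Note: the ittt above makes the three instructions that follow
         * it conditional when this file is assembled as Thumb-2; in ARM
         * state it emits no code and the -eq suffixes stand on their own.
         */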

3:      /* fix up the remaining count */
        add         r2, r2, #32

        cmp        r2, #(8+4)
        bmi        10f
#endif

        /* save registers */
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f
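        /* e.g. for r4 = 0x1001, -r4 & 3 = 3: three byte compares bring
         * the lhs pointer up to the next word boundary (0x1004).
         */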

        /* align first pointer  */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f
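        /* r4 is word-aligned here, so (r4 ^ r1) & 3 is non-zero exactly
         * when r1 still has an offset within its word: that is the
         * non-congruent case handled at 5: below.
         */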

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f
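        /* The unrolled loop below software-pipelines the rhs loads:
         * one rhs word is always in flight, alternating between ip and
         * lr, which is why ip is pre-loaded above and the loop uses the
         * pre-increment [r1, #4]! addressing mode.
         */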

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4
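        /* memcmp() must return the signed difference of the first pair
         * of bytes that differ, so the mismatching word found above is
         * redone one byte at a time by the loop below.
         */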

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, pc}

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bit aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
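        /* Each iteration below reconstructs one unaligned rhs word from
         * two aligned loads, roughly (in C, names hypothetical, assuming
         * the little-endian byte order Android uses on ARM):
         *
         *   rhs_word = (prev >> 16) | (next << 16);
         *
         * with "next" kept in lr for the following iteration.
         */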

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fall through... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b

4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount, in bits
        // r6 = left shift amount, in bits
        // r7 = rhs read-ahead scratch word

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
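        /* for an offset of k bytes (k = 1 or 3), each rhs word is
         * rebuilt as (prev >> 8*k) | (next << (32 - 8*k)); r5 and r6
         * hold those two shift amounts in bits.
         */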

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fall through... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)