/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif
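
/* CACHE_LINE_SIZE is only used as the stride of the pld prefetch hints
 * below; a wrong value costs performance, not correctness.
 */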

/*
 * Optimized memcmp() for Cortex-A9.
 */

.syntax unified

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the trivial case where the buffers are the same
         * (the length == 0 case is handled at label 10 below)
         */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f
/*
 * NEON optimization:
 * compare 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

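        /* Reduction trick used below: VSUB.I8 yields a zero lane exactly
         * where the source bytes are equal, so OR-folding the four
         * difference registers down to one word and testing that word
         * against zero detects a mismatch anywhere in the 32 bytes.
         */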
1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16
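        /* fall through to 3f with r0/r1 rewound to the 16-byte half that
         * contains the difference; the scalar code below then pinpoints
         * the exact byte
         */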

3:      /* fix-up the remaining count */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f
#endif

        /* save registers */
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same offset modulo 4) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

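        /* Software-pipelined: ip and lr alternate as the read-ahead word
         * from r1, so each compare uses the word loaded on the previous
         * step while the next loads are already in flight. Everything
         * after the first eors is conditional on eq, so the unrolled
         * body falls through as soon as a difference is found.
         */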
0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

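        /* r2 was biased by -4 up front (the subs of 32+4), so its sign
         * bit now answers "is at least one full word left?"; label 4
         * removes the bias again
         */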

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f
2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process fewer than (8+4) bytes, byte by byte */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */
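        /* Technique: read aligned words from r1 and synthesize each
         * misaligned word as (prev >> 16) | (next << 16), so the rhs
         * is only ever accessed on word boundaries.
         */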

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
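        /* r1 has read one aligned word ahead; r1 - 4 + 2 is the first
         * byte not yet compared, hence the rewind by 2 below
         */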
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the last word byte by byte */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right-shift amount in bits (offset * 8)
        // r6 = left-shift amount in bits (32 - r5)
        // r7 = scratch (read-ahead word from r1)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

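        /* same merge technique as the offset-2 case, generalized:
         * each misaligned rhs word is (prev >> r5) | (next << r6)
         */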
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

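        /* rewind r1 past the read-ahead to the first uncompared byte:
         * r6 >> 3 is (32 - offset*8)/8 = 4 - offset
         */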
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the last word byte by byte */
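        /* total rewind on r1: 4 for the read-ahead word plus 4 - offset
         * for the partially consumed one
         */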
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)