/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Include guard: the body of this file is only processed once. */
#ifndef ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
#define ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_

/*
 * This file uses the ARM_ENTRY and END macros, which are not defined here;
 * presumably they are provided by this include (confirm).
 */
#include "asm_support_arm.S"

/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of pld will be needed.
 * The 2 major optimizations here are
 * (1) The main loops compare 16 half-words (32 bytes) per iteration when
 *     both pointers have the same word alignment, and 8 half-words
 *     (16 bytes) per iteration when they do not
 * (2) The loads are scheduled in a way they won't stall
 */

/*
 * __memcmp16: compare two buffers of 16-bit units (ARM mode).
 *
 * In:    r0 = first buffer
 *        r1 = second buffer
 *        r2 = length, counted in half-words (16-bit units)
 *        Both pointers are accessed with ldrh and only bit 1 of the address
 *        is ever tested, so they are assumed to be at least 2-byte aligned
 *        (TODO confirm against callers).
 * Out:   r0 = 0 if the pointers are equal, the length is zero, or all
 *             half-words match; otherwise the first differing half-word of
 *             the first buffer minus the corresponding half-word of the
 *             second buffer.
 * Uses:  r3 (first pointer), ip and lr (scratch), flags.
 *        r4 and lr are pushed on the large-block path and popped before
 *        every return from it; r4 is not otherwise referenced here.
 * Note:  the non-congruent path rebuilds words with lsr #16 / lsl #16, which
 *        places the lower-addressed half-word in the low 16 bits, i.e. it
 *        assumes little-endian data.
 */
ARM_ENTRY __memcmp16
        pld         [r0, #0]
        pld         [r1, #0]

        /* return 0 right away if the buffers are the same or the length is zero */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* r0 is going to hold the result, so keep the first source
         * pointer in r3 from here on
         */

        mov         r3, r0

        /* blocks of at least 12 half-words take the optimized paths below;
         * this simplifies them and avoids their overhead for small blocks
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 half-words): plain half-word loop */
        pld         [r0, #32]
        pld         [r1, #32]

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip          /* r0 = difference, also the result */
        bxne        lr                  /* mismatch: return the difference */
        subs        r2, r2, #1
        bne         1b
        bx          lr                  /* all equal: r0 is 0 here */


        /* large blocks: save registers (lr is used as scratch below) */
0:      stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* align first pointer to word boundary */
        tst         r3, #2
        beq         0f

        /* first pointer is only half-word aligned: compare one half-word */
        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* on mismatch, restore registers and return the difference */
        ldmnefd     sp!, {r4, lr}
        bxne        lr


0:      /* here the first pointer is word aligned, and at least
         * 11 half-words remain to be processed.
         */

        /* see if the pointers are congruent (same alignment within a word) */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one 32-bit word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]            /* read-ahead: first word of second buffer */
        subs        r2, r2, #(16 + 2)
        bmi         1f

        /* Each step loads the next word of both buffers while comparing the
         * previous pair; ip and lr alternate as the read-ahead register for
         * the second buffer. r1 is pre-incremented, so it always points at
         * the word most recently loaded from the second buffer.
         */
0:
        pld         [r3, #64]
        pld         [r1, #64]
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f                  /* some word differed: resolve at 2 */
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * r2 is (remaining - 18) here; adding 16 leaves (remaining - 2).
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one 32-bit word) at a time;
         * the word r1 points at is simply loaded again here
         */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done? adding 2 back gives the true remaining count */
4:      adds        r2, r2, #2
        bne         8f
        /* nothing left: result is 0, restore registers and return */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last compared 32-bit word (2 half-words) differs: compare it
         * again half-word by half-word to compute the result. Both pointers
         * are 4 bytes past the start of that word here.
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words one at a time (r2 is non-zero here) */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return; r0 already holds the result */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* The first pointer (r3) is word aligned, the second (r1) is not.
         * Round r1 down to a word boundary and preload that word into lr;
         * its upper half-word is the first half-word still to be compared.
         * r2 is biased by one iteration (8 half-words); at least 11
         * half-words remain here, so the first iteration is always safe.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

        /* 8 half-words per iteration: each step combines the upper half of
         * the previous second-buffer word with the lower half of the next
         * one in ip, and compares that with one word of the first buffer.
         */
6:
        pld         [r3, #64]
        pld         [r1, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 is one half-word ahead of the data actually compared;
         * move it back to the next half-word to compare
         */
        sub         r1, r1, #2
        /* are we done? adding 8 back gives the true remaining count */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* mismatch: fix up the second pointer so that it is 4 bytes past
         * the differing data, as the code at 2 expects, and resolve there
         */
        sub         r1, r1, #2
        b           2b
END __memcmp16


#endif  // ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
