1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
18#define ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
19
20#include "asm_support_arm.S"
21
/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of pld will be needed.
 * The 2 major optimizations here are
 * (1) The main loops compare 32 bytes (congruent pointers) or 16 bytes
 *     (non-congruent pointers) per iteration
 * (2) The loads are scheduled in a way they won't stall
 */
30
/*
 * __memcmp16: compare two buffers made of 16-bit units ("halfwords").
 *
 * In:    r0 = first buffer
 *        r1 = second buffer
 *        r2 = number of halfwords to compare (NOT a byte count)
 * Out:   r0 = 0 when the buffers are equal, when r2 is 0, or when r0 == r1;
 *             otherwise (halfword from the first buffer) - (halfword from
 *             the second buffer) at the first position where they differ,
 *             both zero-extended by ldrh.
 * Clobb: r1, r2, r3, ip, flags. r4 and lr are pushed and popped on the
 *        large-block path; lr is used as a scratch data register there.
 *        r4 is never used otherwise -- presumably it is only pushed to keep
 *        the stack 8-byte aligned (NOTE(review): confirm).
 *
 * NOTE(review): only bit 1 of the pointers is examined before switching to
 * word loads, so both pointers are presumably required to be 2-byte aligned;
 * the non-congruent path also extracts halfwords from words assuming
 * little-endian byte order. Confirm both against the callers/target.
 */
ARM_ENTRY __memcmp16
        pld         [r0, #0]
        pld         [r1, #0]

        /* take care of the case where the length is zero or the buffers
         * are the same: both yield a result of 0
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

         /* make sure we have at least 12 halfwords, this simplifies things
          * below and avoids some overhead for small blocks
          */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 halfwords): plain halfword loop.
         * Nothing has been pushed yet, so we can return with bx lr directly.
         * r0 still equals the first pointer here (r3 is a copy of it).
         */
        pld         [r0, #32]
        pld         [r1, #32]

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         1b
        /* r0 is 0 here: the last subs produced zero */
        bx          lr


        /* save registers: lr is needed as a scratch register below */
0:      push        {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* align first pointer to word boundary */
        tst         r3, #2
        beq         0f

        /* compare one halfword so that r3 becomes word aligned */
        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* on a mismatch, restore registers and return.
         * This epilogue is conditional, so it cannot be described with CFI
         * directives without mis-describing the fall-through path.
         */
        popne       {r4, lr}
        bxne        lr


0:      /* here the first pointer is word aligned, and at least 11 halfwords
         * remain to be processed.
         */

        /* see if the pointers are congruent (same value in bit 1) */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 halfwords left
         * because we effectively read ahead one long word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        /* ip = read-ahead word from the second buffer; r2 = remaining - 18 */
        ldr         ip, [r1]
        subs        r2, r2, #(16 + 2)
        bmi         1f

        /* Unrolled 8 words per iteration. ip and lr alternate as the
         * read-ahead register for the second buffer; once a word differs the
         * remaining conditional (eq) instructions are skipped, so r3 and r1
         * both stop 4 bytes past the differing word.
         */
0:
        pld         [r3, #64]
        pld         [r1, #64]
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        bne         2f
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 halfwords left?
         * r2 was (remaining - 18); it becomes (remaining - 2).
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 halfwords (one word) at a time */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done? r2 becomes the number of halfwords left (0 or 1) */
4:      adds        r2, r2, #2
        bne         8f
        /* restore registers and return 0.
         * The CFI state is remembered before the epilogue and restored after
         * the return, because the code that follows still runs with
         * {r4, lr} on the stack.
         */
        mov         r0, #0
        .cfi_remember_state
        pop         {r4, lr}
        .cfi_restore r4
        .cfi_restore lr
        .cfi_adjust_cfa_offset -8
        bx          lr
        .cfi_restore_state

2:      /* the last 2 halfwords (one word) are different, restart them:
         * both pointers are 4 bytes past the differing word. Return the
         * difference of the first halfword, or of the second one if the
         * first halfwords are equal.
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldrheq      r0, [r3, #-2]
        ldrheq      ip, [r1, #-2]
        subseq      r0, r0, ip
        /* restore registers and return (frame is still live afterwards) */
        .cfi_remember_state
        pop         {r4, lr}
        .cfi_restore r4
        .cfi_restore lr
        .cfi_adjust_cfa_offset -8
        bx          lr
        .cfi_restore_state

        /* process the last few halfwords one at a time (r2 > 0 on entry) */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return; r0 already holds the result */
        .cfi_remember_state
        pop         {r4, lr}
        .cfi_restore r4
        .cfi_restore lr
        .cfi_adjust_cfa_offset -8
        bx          lr
        .cfi_restore_state


5:      /*************** non-congruent case ***************/

        /* r3 is word aligned but r1 is not (bit 1 differs).
         * Round r1 down to a word boundary and preload the word containing
         * the first halfword to compare; r2 becomes (remaining - 8).
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

        /* Unrolled 4 words (8 halfwords) per iteration. Each word of the
         * second buffer is rebuilt in ip from the upper halfword of the
         * previously loaded word (lr, lsr #16) and the lower halfword of the
         * newly loaded one (lr, lsl #16), then compared with a word from r3.
         */
6:
        pld         [r3, #64]
        pld         [r1, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 is one halfword ahead because of the read-ahead word:
         * step it back so it points at the next halfword to compare
         */
        sub         r1, r1, #2
        /* are we done? r2 becomes the number of halfwords left (0..7) */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining halfwords */
        b           8b

7:      /* a word differed: undo the read-ahead offset on r1 so that both
         * pointers are 4 bytes past the differing word, then reuse the
         * mismatch handler of the congruent case
         */
        sub         r1, r1, #2
        b           2b
END __memcmp16
234
235
236#endif  // ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
237