/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>
#include <private/libc_events.h>

        /* ARM unified-syntax code; NEON instructions are used below. */
        .text
        .syntax unified
        .fpu    neon

/*
 * Tuning constants for the backward-copy path of memmove. All values are
 * byte counts.
 *
 * CACHE_LINE_SIZE         stride used for the pld offsets.
 * MEMCPY_BLOCK_SIZE_SMALL remaining counts below this use the "near" loop.
 * MEMCPY_BLOCK_SIZE_MID   remaining counts below this (and >= SMALL) use the
 *                         "mid" loop; anything larger uses the "far" loop.
 * PREFETCH_DISTANCE_*     base prefetch distance below the source cursor
 *                         used by the pld instructions of each loop.
 */
#define CACHE_LINE_SIZE         (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID   (1048576)
#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)

/*
 * memmove: copy r2 bytes from [r1] to [r0]; the regions may overlap.
 *
 * In:     r0 = destination, r1 = source, r2 = byte count
 * Out:    r0 = destination as passed in
 * Stack:  backward path pushes {r0, lr} (8 bytes) and returns via pop.
 * Clobb:  backward path: r1-r3, ip, flags, q0-q3, q8-q11.
 *         forward path: whatever memcpy clobbers.
 *
 * Strategy:
 *   - count == 0 or dst == src: return at once.
 *   - dst <= src, or dst at least `count` bytes above src: branch to memcpy.
 *     NOTE(review): for dst < src with overlapping regions this relies on
 *     memcpy copying in ascending address order - confirm for the memcpy
 *     linked on this target.
 *   - otherwise (src < dst < src + count): copy backwards, highest address
 *     first, so no source byte is overwritten before it has been read.
 *
 * Backward path register roles:
 *   r0, r1  one-past-the-end cursors for dst / src, moving downwards
 *   r2      bytes still to copy
 *   r3      alignment byte count, later the -32 post-index stride
 *   ip      scratch for byte copies and for flag-setting shifts
 *
 * Size bits are tested with `movs ip, rX, lsl #n`: that puts bit (32-n) of
 * rX in C and bit (31-n) in N. The sub/vld/vst/ldrb/strb/pld instructions
 * in between do not update the flags, so one shift drives two tests.
 */
ENTRY(memmove)
        cmp         r2, #0
        cmpne       r0, r1
        bxeq        lr                  /* nothing to do */
        subs        r3, r0, r1          /* r3 = dst - src */
        bls         .L_jump_to_memcpy   /* dst <= src (unsigned) */
        cmp         r2, r3
        bhi         .L_reversed_memcpy  /* count > dst - src: regions overlap */

.L_jump_to_memcpy:
        b           memcpy

.L_reversed_memcpy:
        push        {r0, lr}            /* keep original dst for the return value */
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* move both cursors to one past the last byte */
        add         r0, r0, r2
        add         r1, r1, r2

        /* preload next cache line */
        pld         [r1, #-CACHE_LINE_SIZE]
        pld         [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32bytes) asap */
        cmp         r2, #32
        blo         .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_reversed_memcpy_lt_128bytes
        /* align destination to 64 bytes (1 cache line) */
        ands        r3, r0, #0x3f       /* r3 = bytes above the 64-byte boundary */
        beq         .L_reversed_memcpy_dispatch
        sub         r2, r2, r3          /* account for the r3 bytes copied below */
0:      /* copy 1 byte */
        movs        ip, r3, lsl #31     /* N = r3 bit 0, C = r3 bit 1 */
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!
1:      /* copy 2 bytes */
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
2:      /* copy 4 bytes */
        movs        ip, r3, lsl #29     /* N = r3 bit 2, C = r3 bit 3 */
        bpl         3f
        sub         r1, r1, #4
        sub         r0, r0, #4
        /* lane 0 of d0-d3 carries the 4 bytes; src may be unaligned */
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes */
        bcc         4f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0, :64]
4:      /* copy 16 bytes */
        movs        ip, r3, lsl #27     /* N = r3 bit 4, C = r3 bit 5 */
        bpl         5f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0, :128]
5:      /* copy 32 bytes */
        bcc         .L_reversed_memcpy_dispatch
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0, :256]

.L_reversed_memcpy_dispatch:
        /* here the dst cursor (r0) is 64-byte aligned */
        /* preload more cache lines */
        pld         [r1, #-CACHE_LINE_SIZE*3]
        pld         [r1, #-CACHE_LINE_SIZE*4]

        /* pick the loop by remaining size */
        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo         .L_reversed_memcpy_neon_pld_near
        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
        blo         .L_reversed_memcpy_neon_pld_mid
        b           .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        subs        r2, r2, #128        /* r2 is biased by -128 from here on */
        blo         1f
        /* bias cursors by -32 so each access covers [cursor, cursor+32) */
        sub         r1, r1, #32
        sub         r0, r0, #32
        mov         r3, #-32            /* post-index stride: step down 32 bytes */
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128        /* flags consumed by the bhs below */

        /* preload to cache */
        /* the +32 cancels the -32 cursor bias */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b                  /* no borrow: another 128-byte block remains */
        /* undo the -32 cursor bias */
        add         r1, r1, #32
        add         r0, r0, #32
1:
        adds        r2, r2, #128        /* undo the count bias; r2 = bytes left */
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_mid:
        /* r2 >= MEMCPY_BLOCK_SIZE_SMALL here, so no short-count check */
        subs        r2, r2, #128        /* r2 is biased by -128 from here on */
        /* bias cursors by -32 so each access covers [cursor, cursor+32) */
        sub         r1, r1, #32
        sub         r0, r0, #32
        mov         r3, #-32            /* post-index stride: step down 32 bytes */
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128        /* flags consumed by the bhs below */

        /* preload to cache */
        /* the +32 cancels the -32 cursor bias */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b                  /* no borrow: another 128-byte block remains */
        /* undo the -32 cursor bias */
        add         r1, r1, #32
        add         r0, r0, #32
1:
        adds        r2, r2, #128        /* undo the count bias; r2 = bytes left */
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_far:
        /* r2 >= MEMCPY_BLOCK_SIZE_MID here, so no short-count check */
        sub         r2, r2, #128        /* r2 is biased by -128 from here on */
        /* bias cursors by -128: each block is read/written in ascending order */
        sub         r0, r0, #128
        sub         r1, r1, #128
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128        /* flags consumed by the bhs below */

        /* preload to cache */
        /* the +128 cancels the -128 cursor bias */
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read */
        /* the whole 128-byte block is loaded before any of it is stored */
        vld1.8      {q0, q1}, [r1]!
        vld1.8      {q2, q3}, [r1]!
        vld1.8      {q8, q9}, [r1]!
        vld1.8      {q10, q11}, [r1]!
        /* write */
        vst1.8      {q0, q1}, [r0, :256]!
        vst1.8      {q2, q3}, [r0, :256]!
        vst1.8      {q8, q9}, [r0, :256]!
        vst1.8      {q10, q11}, [r0, :256]!

        /* writeback advanced both cursors by +128; step to the next lower block */
        sub         r0, r0, #256
        sub         r1, r1, #256
        bhs         0b                  /* no borrow: another 128-byte block remains */
        /* undo the -128 cursor bias */
        add         r0, r0, #128
        add         r1, r1, #128
1:
        adds        r2, r2, #128        /* undo the count bias; r2 = bytes left */
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_lt_128bytes:
        /*
         * Tail: r2 < 128 on every path into here. The dst cursor may be
         * unaligned (direct entry from the size checks), so the stores
         * below carry no alignment qualifier.
         */
6:      /* copy 64 bytes */
        movs        ip, r2, lsl #26     /* C = r2 bit 6, N = r2 bit 5 */
        bcc         5f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
5:      /* copy 32 bytes */
        bpl         4f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes */
        movs        ip, r2, lsl #28     /* C = r2 bit 4, N = r2 bit 3 */
        bcc         3f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0]
3:      /* copy 8 bytes */
        bpl         2f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0]
2:      /* copy 4 bytes */
        ands        ip, r2, #0x4
        beq         1f
        sub         r1, r1, #4
        sub         r0, r0, #4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes */
        movs        ip, r2, lsl #31     /* C = r2 bit 1, N = r2 bit 0 */
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
0:      /* copy 1 byte */
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!

        pop         {r0, pc}            /* r0 = original dst, return */

END(memmove)