1/*************************************************************************** 2 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of The Linux Foundation nor the names of its contributors may 12 be used to endorse or promote products derived from this software 13 without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 POSSIBILITY OF SUCH DAMAGE. 26 ***************************************************************************/ 27 28/* Assumes neon instructions and a cache line size of 64 bytes. 
*/

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * Tuning constants. Block counts below are in units of 64 bytes
 * (the byte count shifted right by 6).
 */
#define PLDOFFS	(10)		/* look-ahead distance, in 64-byte blocks */
#define PLDTHRESH (PLDOFFS)	/* block counts <= this skip all look-ahead */
#define BBTHRESH (4096/64)	/* block counts <= this skip the variable-distance setup */
#define PLDSIZE	(64)		/* bytes per look-ahead step */

#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

	.text
	.fpu	neon

/*
 * .L_memcpy_base -- NEON copy body (ARM/Thumb-2 unified syntax).
 *
 * In:    r0 = destination pointer (advanced by the stores)
 *        r1 = source pointer (advanced by the loads)
 *        r2 = byte count, treated as unsigned
 * Out:   leaves through "pop {r0, pc}", so two words must already be on
 *        the stack at entry.
 *        NOTE(review): presumably the including file pushed {r0, lr} so
 *        that the original destination is returned -- confirm there.
 * Uses:  r3, r12, lr, q0-q3, q8-q11 and the flags as scratch.
 *        r9 and r10 are pushed/popped around the look-ahead paths.
 *        lr is overwritten, so the return address must not live in lr.
 *
 * Layout: a size-class dispatch, three 64-byte block loops that touch the
 * source ahead of the copy, a plain 64-byte block loop, then a tail that
 * copies 32/16/8/4/2/1 bytes according to the low bits of r2.
 */
.L_memcpy_base:
	/*
	 * Size-class dispatch. The count is unsigned, so the branches use
	 * "blo"; a signed "blt" would treat counts >= 2 GiB as negative and
	 * copy only (r2 & 3) bytes.
	 */
	cmp	r2, #4
	blo	.L_neon_lt4
	cmp	r2, #16
	blo	.L_neon_lt16
	cmp	r2, #32
	blo	.L_neon_16		/* 16 <= r2 < 32: N is set by this cmp */
	cmp	r2, #64
	blo	.L_neon_copy_32_a

	mov	r12, r2, lsr #6		/* r12 = number of 64-byte blocks */
	cmp	r12, #PLDTHRESH
	ble	.L_neon_copy_64_loop_nopld

	push	{r9, r10}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset r9, 0
	.cfi_rel_offset r10, 4

	cmp	r12, #BBTHRESH
	ble	.L_neon_prime_pump

	/*
	 * lr = ((r0 + 0x400 - (r1 + PLDOFFS*PLDSIZE)) & 0x7ff) + PLDOFFS*PLDSIZE
	 * The lsl/lsr #21 pair keeps only the low 11 bits of the difference.
	 * lr is the look-ahead distance in bytes used by this path.
	 * NOTE(review): the rationale for the 0x400 bias and the 2 KiB
	 * modulus is not visible in this file -- confirm before retuning.
	 */
	add	lr, r0, #0x400
	add	r9, r1, #(PLDOFFS*PLDSIZE)
	sub	lr, lr, r9
	lsl	lr, lr, #21
	lsr	lr, lr, #21
	add	lr, lr, #(PLDOFFS*PLDSIZE)
	cmp	r12, lr, lsr #6		/* need more blocks than the look-ahead spans */
	ble	.L_neon_prime_pump

	/* "gt" always holds here: the ble above fell through. */
	itt	gt
	movgt	r9, #(PLDOFFS)
	rsbsgt	r9, r9, lr, lsr #6	/* r9 = (lr >> 6) - PLDOFFS */
	ble	.L_neon_prime_pump

	add	r10, r1, lr		/* r10 = look-ahead address ... */
	bic	r10, #0x3F		/* ... rounded down to a multiple of 64 */

	sub	r12, r12, lr, lsr #6	/* keep lr >> 6 blocks for the no-look-ahead loop */

	/*
	 * Split the remaining blocks: r9 blocks go to the double look-ahead
	 * loop, r12 blocks to the single look-ahead loops.
	 *   r9 <= r12:  r12 -= r9
	 *   r9 >  r12:  r9 = r12, r12 = 0
	 */
	cmp	r9, r12
	itee	le
	suble	r12, r12, r9
	movgt	r9, r12
	movgt	r12, #0

	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
	/* One block per pass: pld near the source pointer, load at r10. */
	pld	[r1, #((PLDOFFS)*PLDSIZE)]
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]		/* loaded value is never read; touches the line at r10 */
	subs	r9, r9, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer_doublepld
	cmp	r12, #0
	beq	.L_neon_pop_before_nopld

	cmp	r12, #(512*1024/64)	/* at least 512 KiB of blocks left? */
	blt	.L_neon_copy_64_loop_outer

.L_neon_copy_64_loop_ddr:
	/* Same as the loop below, but looks ahead with pld instead of ldr. */
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	pld	[r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_ddr
	b	.L_neon_pop_before_nopld

.L_neon_prime_pump:
	/*
	 * Fixed look-ahead of PLDOFFS blocks. The last PLDOFFS blocks are
	 * left for the no-look-ahead loop; r12 > PLDOFFS on every path here,
	 * so r12 stays >= 1 after the subtraction.
	 */
	mov	lr, #(PLDOFFS*PLDSIZE)
	add	r10, r1, #(PLDOFFS*PLDSIZE)
	bic	r10, #0x3F
	sub	r12, r12, #PLDOFFS
	ldr	r3, [r10, #(-1*PLDSIZE)]	/* value unused; touches the preceding line */

.L_neon_copy_64_loop_outer:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]		/* loaded value is never read; touches the line at r10 */
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
	mov	r12, lr, lsr #6		/* blocks held back for the loop below */
	pop	{r9, r10}
	.cfi_adjust_cfa_offset -8
	.cfi_restore r9
	.cfi_restore r10

.L_neon_copy_64_loop_nopld:
	/* Plain 64-byte block copy; r12 >= 1 on entry. */
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q8, q9}, [r0]!
	vst1.32	{q10, q11}, [r0]!
	bne	.L_neon_copy_64_loop_nopld
	ands	r2, r2, #0x3f		/* bytes left after the whole blocks */
	beq	.L_neon_exit

.L_neon_copy_32_a:
	movs	r3, r2, lsl #27		/* C = bit 5 of r2 (32), N = bit 4 (16) */
	bcc	.L_neon_16
	vld1.32	{q0,q1}, [r1]!
	vst1.32	{q0,q1}, [r0]!

.L_neon_16:
	/* N comes from the movs above, or from "cmp r2, #32" on direct entry. */
	bpl	.L_neon_lt16
	vld1.32	{q8}, [r1]!
	vst1.32	{q8}, [r0]!
	ands	r2, r2, #0x0f
	beq	.L_neon_exit

.L_neon_lt16:
	movs	r3, r2, lsl #29		/* C = bit 3 of r2 (8), N = bit 2 (4) */
	bcc	1f
	vld1.8	{d0}, [r1]!
	vst1.8	{d0}, [r0]!
1:
	/*
	 * Test N only. A shifted movs leaves V unchanged, so "bge" (N == V)
	 * would depend on whatever an earlier instruction left in V.
	 */
	bpl	.L_neon_lt4
	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!	/* four bytes, one per lane 0 */
	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
	movs	r2, r2, lsl #31		/* C = bit 1 of r2 (2), N = bit 0 (1) */
	itt	cs
	ldrhcs	r3, [r1], #2
	strhcs	r3, [r0], #2
	itt	mi
	ldrbmi	r3, [r1]
	strbmi	r3, [r0]

.L_neon_exit:
	pop	{r0, pc}		/* restore r0 and return via the stacked pc value */