/*
 * Copyright (C) 2013 The Android Open Source Project
 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         */

        .cpu        cortex-a15
        .fpu        neon
        .syntax     unified

/*
 * __memset_chk: bounds-checked front end for memset.
 *
 *   In:  r0, r1, r2 = the memset arguments (passed through untouched)
 *        r3         = limit that r2 is compared against
 *                     (presumably the destination object size - confirm
 *                     against the caller)
 *
 * When r2 <= r3 (unsigned) control transfers straight to memset with all
 * argument registers intact.  Otherwise __memset_chk_fail is called; no
 * code follows that call, so it is expected not to return.
 */
ENTRY(__memset_chk)
        cmp         r2, r3
        bls         memset                  // count fits: plain memset

        // Keep lr on the stack so the failure path can be unwound.
        push        {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        bl          __memset_chk_fail
END(__memset_chk)

/*
 * memset: fill r2 bytes starting at r0 with the low byte of r1.
 *
 *   In:   r0 = destination, r1 = fill value (low byte used), r2 = count
 *   Out:  r0 = destination (r0 is never written)
 *   Uses: r1, r2, r3, ip, q0-q3, flags
 *
 * Register roles once the fill word has been built:
 *   r1 = fill byte replicated into all four byte lanes
 *   r2 = bytes still to be written
 *   r3 = advancing store pointer
 *   ip = scratch
 */
ENTRY(memset)
        pldw        [r0]
        mov         r3, r0

        // Replicate the low byte of r1 across the whole word.
        lsl         r1, r1, #24
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

        cmp         r2, #16
        blo         .L_less_than_16

        // ---- 16 bytes or more -------------------------------------------
        // q0 gets the fill pattern here; q1 (and q2/q3 for the bulk loop)
        // are copied from it further down.
        vdup.32     q0, r1

        ands        ip, r3, #15             // ip = misalignment within 16
        beq         .L_memset_aligned

        // Bring the store pointer up to a 16-byte boundary.
        pldw        [r0, #64]
        rsb         ip, ip, #16             // ip = bytes to boundary (1..15)

        // Charge the alignment bytes to the count up front.  r2 >= 16 on
        // this path, so the result stays positive.
        sub         r2, r2, ip

        // bit 0 of ip: one byte
        tst         ip, #1
        it          ne
        strbne      r1, [r3], #1
        // bit 1 of ip: one halfword
        tst         ip, #2
        it          ne
        strhne      r1, [r3], #2
        // Shift so that bit 2 of ip lands in N and bit 3 in C.
        lsls        ip, ip, #29
        // N set: one word
        it          mi
        strmi       r1, [r3], #4
        // C set: two words
        itt         cs
        strcs       r1, [r3], #4
        strcs       r1, [r3], #4

.L_memset_aligned:
        // r3 is 16-byte aligned from here on.  Pick bulk loop or tail.
        vmov        q1, q0
        cmp         r2, #128
        blo         .L_less_than_128

        // Bulk path: four Q registers give 64 bytes per store instruction.
        // r2 is biased by -128 so the loop can exit on the borrow from
        // subs.
        vmov        q2, q0
        vmov        q3, q0
        pldw        [r0, #128]
        sub         r2, r2, #128
        .p2align    4
.L_memset_loop_128:
        pldw        [r3, #192]
        vstm        r3!, {q0, q1, q2, q3}
        vstm        r3!, {q0, q1, q2, q3}
        subs        r2, r2, #128
        bhs         .L_memset_loop_128

        // Remove the bias; r2 is again the number of bytes left (< 128).
        // Nothing left means we are finished.
        adds        r2, r2, #128
        beq         .L_memset_done

        .p2align    4
.L_less_than_128:
        // Tail of fewer than 128 bytes, pointer 16-byte aligned.  Each
        // lsls moves two bits of r2 into C and N and sets Z when no lower
        // bits remain; the conditional branches that follow the vector
        // stores rely on those flags still being intact.

        // C = bit 6 (64 bytes), N = bit 5 (32 bytes)
        lsls        ip, r2, #26
        bcc         .L_tail_32
        vst1.8      {q0, q1}, [r3, :128]!
        vst1.8      {q0, q1}, [r3, :128]!
        beq         .L_memset_done          // bits 5..0 all clear
.L_tail_32:
        bpl         .L_tail_16
        vst1.8      {q0, q1}, [r3, :128]!
.L_tail_16:
        // C = bit 4 (16 bytes), N = bit 3 (8 bytes)
        lsls        ip, r2, #28
        bcc         .L_tail_8
        vst1.8      {q0}, [r3, :128]!
        beq         .L_memset_done          // bits 3..0 all clear
.L_tail_8:
        bpl         .L_tail_4
        vst1.8      {d0}, [r3, :64]!
.L_tail_4:
        // bit 2: one word
        tst         r2, #4
        it          ne
        strne       r1, [r3], #4
        // C = bit 1 (halfword), N = bit 0 (byte)
        lsls        ip, r2, #31
        it          cs
        strhcs      r1, [r3], #2
        it          mi
        strbmi      r1, [r3]
.L_memset_done:
        bx          lr

.L_less_than_16:
        // ---- fewer than 16 bytes ----------------------------------------
        // No attempt is made to align the pointer first.
        // C = bit 3 (8 bytes), N = bit 2 (4 bytes)
        lsls        ip, r2, #29
        bcc         .L_small_4
        str         r1, [r3], #4
        str         r1, [r3], #4
        beq         .L_small_done           // bits 2..0 all clear
.L_small_4:
        it          mi
        strmi       r1, [r3], #4
        // N = bit 0 (one byte), C = bit 1 (two bytes, as two byte stores)
        lsls        ip, r2, #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]
.L_small_done:
        bx          lr
END(memset)