@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ Contains a function for WebRtcIsacfix_CalculateResidualEnergyNeon() in
@ iSAC codec, optimized for ARM Neon platform. Reference code in
@ lpc_masking_model.c.

.arch armv7-a
.fpu neon
.global WebRtcIsacfix_CalculateResidualEnergyNeon
.align 2

@ Cumulative saturation flag (QC), bit 27 of FPSCR. NEON saturating
@ instructions such as vqadd report saturation ONLY through this sticky bit;
@ they never modify the APSR N/Z/C/V flags.
.equ FPSCR_QC, 0x08000000

@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
@                                                   int32_t q_val_corr,
@                                                   int q_val_polynomial,
@                                                   int16_t* a_polynomial,
@                                                   int32_t* corr_coeffs,
@                                                   int* q_val_residual_energy);
@
@ Accumulates, for 0 <= i <= j <= lpc_order,
@   a_polynomial[j] * a_polynomial[j - i] * corr_coeffs[i]   (doubled if i != 0)
@ in 64-bit lanes, normalizes the total, returns its upper 32 bits and stores
@ the matching Q value through q_val_residual_energy.
@
@ Arguments on entry:
@   r0       lpc_order
@   r1       q_val_corr          (spilled to [sp, #8])
@   r2       q_val_polynomial    (spilled to [sp, #12])
@   r3       a_polynomial
@   [sp]     corr_coeffs             ([sp, #48] after the prologue)
@   [sp, #4] q_val_residual_energy   ([sp, #52] after the prologue)
@ Result:
@   r0       residual_energy; *q_val_residual_energy is written.
@
@ Register roles inside the loops:
@   r1   j                              r6   i
@   r4   &a_polynomial[j - i]           r7   &a_polynomial[i]
@   r12  &a_polynomial[j]               r9   &a_polynomial[lpc_order - i]
@   r10  lpc_order - 1 - i              r11  &corr_coeffs[i] (post-incremented)
@   r8   scratch (loaded data / FPSCR copy)
@   q10  {1, 1}                         q11  {-shift_internal, -shift_internal}
@   q13  sum64 (two partial lanes)      q15  sum64_tmp for the current i
@   d25  corr_coeffs[i] in both 32-bit lanes
@
@ Local stack frame (16 bytes):
@   [sp, #4]  &a_polynomial[lpc_order]
@   [sp, #8]  q_val_corr
@   [sp, #12] q_val_polynomial
@
@ Overflow handling: each saturating add is followed by a read of FPSCR.QC.
@ When saturation occurred, every running sum is halved, the add is redone
@ without saturation and shift_internal is bumped (q11 is decremented because
@ vshl with a negative count shifts right). Earlier revisions tested the APSR
@ V flag with "bvc", which NEON never sets, so this path could never execute
@ and large sums were silently clamped.
@
@ FPSCR.QC is cleared on entry and after every detected saturation; the flag
@ is a status bit that callers must not rely on across a call.

WebRtcIsacfix_CalculateResidualEnergyNeon:
.fnstart
.save {r4-r11}
  push {r4-r11}

  sub r13, r13, #16
  str r1, [r13, #8]            @ Keep q_val_corr for the final Q computation.
  str r2, [r13, #12]           @ Keep q_val_polynomial likewise.

  vmrs r8, fpscr
  bic r8, r8, #FPSCR_QC
  vmsr fpscr, r8               @ Start with a clean saturation flag.

  mov r4, #1
  vmov.s64 q11, #0             @ Initialize shift_internal.
  vmov.s64 q13, #0             @ Initialize sum64.
  vmov.s64 q10, #0
  vmov.u8 d20[0], r4           @ Set both 64-bit lanes of q10 to 1 so that
  vmov.u8 d21[0], r4           @ both lanes of q11 track shift_internal.

  cmp r0, #0
  blt POST_LOOP_I

  add r9, r3, r0, asl #1       @ &a_polynomial[lpc_order]
  mov r6, #0                   @ Loop counter i.
  ldr r11, [r13, #48]          @ corr_coeffs (5th argument).
  sub r10, r0, #1
  mov r7, r3                   @ &a_polynomial[0]
  str r9, [r13, #4]

LOOP_I:
  ldr r2, [r11], #4            @ corr_coeffs[i]
  vmov.s64 q15, #0             @ Initialize the sum64_tmp.
  vdup.s32 d25, r2

  cmp r0, r6                   @ Compare lpc_order to i.
  movle r2, r6                 @ No pair to process: next j is i itself.
  ble POST_LOOP_J

  mov r1, r6                   @ j = i;
  mov r12, r7                  @ &a_polynomial[i]
  mov r4, r3                   @ &a_polynomial[j - i]

LOOP_J:
  @ Two int16 samples are fetched per 32-bit load, so each iteration
  @ handles j and j + 1 in the two 64-bit lanes.
  ldr r8, [r12], #4
  ldr r5, [r4], #4
  vmov.u32 d0[0], r8
  vmov.u32 d1[0], r5
  vmull.s16 q0, d0, d1         @ d0 = the two wanted 32-bit products.
  vmull.s32 q0, d0, d25        @ Multiply both by corr_coeffs[i].
  cmp r6, #0                   @ i == 0?
  vshl.s64 q0, q11             @ Apply shift_internal (negative = right).
  beq SUM1
  vshl.s64 q0, #1              @ Off-diagonal terms count twice.

SUM1:
  vqadd.s64 q14, q0, q15       @ Sum and test overflow.
  add r1, r1, #2
  vmrs r8, fpscr
  tst r8, #FPSCR_QC
  beq MOV1                     @ Skip the shift if there's no overflow.
  bic r8, r8, #FPSCR_QC
  vmsr fpscr, r8               @ Re-arm the sticky flag.
  vshr.s64 q0, #1
  vshr.s64 q15, #1
  vshr.s64 q13, #1             @ sum64 shares the scale of sum64_tmp.
  vadd.s64 q14, q0, q15
  vsub.s64 q11, q10

MOV1:
  cmp r0, r1                   @ Compare lpc_order to j.
  vmov.s64 q15, q14
  bgt LOOP_J

  @ r2 = first j not covered by the pairwise loop.
  bic r1, r10, #1
  add r2, r6, #2
  add r2, r1, r2

POST_LOOP_J:
  vqadd.s64 q0, q13, q15       @ Sum and test overflow.
  vmrs r8, fpscr
  tst r8, #FPSCR_QC
  beq MOV2                     @ Skip the shift if there's no overflow.
  bic r8, r8, #FPSCR_QC
  vmsr fpscr, r8
  vshr.s64 q13, #1
  vshr.s64 q15, #1
  vadd.s64 q0, q13, q15
  vsub.s64 q11, q10

MOV2:
  vmov.s64 q13, q0             @ update sum64.
  cmp r2, r0
  bne CHECK_LOOP_CONDITION     @ Even number of terms: nothing left over.

  @ Last sample in the inner loop.
  ldr r4, [r13, #4]
  ldrsh r8, [r4]               @ a_polynomial[lpc_order]
  ldrsh r12, [r9]              @ a_polynomial[lpc_order - i]
  mul r8, r8, r12
  vmov.s32 d0[0], r8
  vmull.s32 q0, d0, d25        @ Only d0 is meaningful from here on.
  cmp r6, #0                   @ i == 0?
  vshl.s64 q0, q11
  beq SUM2
  vshl.s64 q0, #1

SUM2:
  vqadd.s64 d1, d0, d26        @ Sum and test overflow.
  vmrs r8, fpscr
  tst r8, #FPSCR_QC
  beq MOV3                     @ Skip the shift if there's no overflow.
  bic r8, r8, #FPSCR_QC
  vmsr fpscr, r8
  vshr.s64 q13, #1
  vshr.s64 d0, #1
  vadd.s64 d1, d0, d26
  vsub.s64 q11, q10

MOV3:
  vmov.s64 d26, d1             @ update sum64.

CHECK_LOOP_CONDITION:
  add r6, r6, #1
  sub r9, r9, #2
  cmp r0, r6                   @ Compare i to lpc_order.
  sub r10, r10, #1
  add r7, r7, #2
  bge LOOP_I

POST_LOOP_I:
  @ Fold the two partial lanes of sum64 into a single 64-bit total in d0.
  vqadd.s64 d0, d26, d27       @ Sum and test overflow.
  vmrs r8, fpscr
  tst r8, #FPSCR_QC
  beq GET_SHIFT_NORM           @ Skip the shift if there's no overflow.
  bic r8, r8, #FPSCR_QC
  vmsr fpscr, r8
  vshr.s64 q13, #1
  vadd.s64 d0, d26, d27
  vsub.s64 q11, q10

GET_SHIFT_NORM:
  @ Normalize in two passes, each shifting left by the number of redundant
  @ sign bits found in the upper 32-bit word.
  vcls.s32 d1, d0              @ Count leading extra sign bits.
  vmov.32 r2, d1[1]            @ Store # of sign bits of only the 32 MSBs.
  vmovl.s32 q1, d1
  vshl.s64 d0, d3              @ d3 contains # of sign bits of the 32 MSBs.

  vcls.s32 d1, d0              @ Count again the leading extra sign bits.
  vmov.s32 r1, d1[1]           @ Store # of sign bits of only the 32 MSBs.
  vmovl.s32 q1, d1
  vshl.s64 d0, d3              @ d3 contains # of sign bits of the 32 MSBs.

  vmov.s32 r0, d0[1]           @ residual_energy
  vmov.s32 r3, d22[0]          @ -shift_internal

  @ Calculate the value for q_val_residual_energy:
  @   q_val_corr - 32 + 2 * q_val_polynomial
  @     + (both normalization shifts) - shift_internal
  ldr r4, [r13, #8]            @ q_val_corr
  ldr r5, [r13, #12]           @ q_val_polynomial
  sub r12, r4, #32
  add r12, r12, r5, asl #1
  add r1, r12, r1              @ add second normalization shift.
  add r12, r1, r2              @ add first normalization shift.
  ldr r2, [r13, #52]           @ q_val_residual_energy (6th argument).
  add r3, r12, r3              @ value for q_val_residual_energy.
  str r3, [r2, #0]

  add r13, r13, #16
  pop {r4-r11}
  bx r14

.fnend