@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ lattice_neon.s
@
@ Contains a function for the core loop in the normalized lattice MA
@ filter routine for iSAC codec, optimized for ARM Neon platform.
@
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@                                     int16_t input1,
@                                     int32_t input2,
@                                     int32_t* ptr0,
@                                     int32_t* ptr1,
@                                     int32_t* __restrict ptr2);
@
@ For each of the (HALF_SUBFRAMELEN - 1) samples it calculates
@   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ in Q15 domain.
@
@ Reference code in lattice.c.
@ Output is not bit-exact with the reference C code, due to the replacement
@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@ instructions, smulwb, and smull. Testing with speech and tone vectors
@ showed no degradation in speech quality.
@
@ Register usage:
@   r0  = input0          r1  = input1          r2 = input2
@   r3  = ptr0  (read, post-incremented)
@   r4  = ptr1  (written, post-incremented; loaded from the stack)
@   r12 = ptr2  (read and written, post-incremented; loaded from the stack)
@   r5  = number of 4-sample loop iterations (scratch in the scalar tail)
@   r6  = number of leftover samples (scratch in the scalar tail)
@   r7, r8 = scratch in the scalar tail
@   d28 / d29 / d30 = input0 / input1 / input2 replicated in both lanes
@ r4-r8 are saved on entry and restored on exit.
@
@ NOTE(review): HALF_SUBFRAMELEN is not defined in this file; presumably it
@ comes from settings.h - confirm.

.arch armv7-a
.fpu neon

#include "settings.h"

.global WebRtcIsacfix_FilterMaLoopNeon

.align  2

WebRtcIsacfix_FilterMaLoopNeon:
.fnstart

.save {r4-r8}
  push        {r4-r8}

  vdup.32     d28, r0             @ Initialize Neon register with input0
  vdup.32     d29, r1             @ Initialize Neon register with input1
  vdup.32     d30, r2             @ Initialize Neon register with input2
  ldr         r4, [sp, #20]       @ ptr1 (first stack argument, above the
                                  @ 20 bytes pushed for r4-r8)
  ldr         r12, [sp, #24]      @ ptr2 (second stack argument)

  @ Number of loop iterations after unrolling:
  @    r5 = (HALF_SUBFRAMELEN - 1) >> 2
  @ Leftover samples after the loop, in r6:
  @    r6 = (HALF_SUBFRAMELEN - 1) - (((HALF_SUBFRAMELEN - 1) >> 2) << 2)
  mov         r6, #HALF_SUBFRAMELEN
  sub         r6, #1
  lsr         r5, r6, #2
  sub         r6, r5, lsl #2

  @ The loop below tests its counter only after the body has run, so skip it
  @ entirely when there is no complete group of four samples.
  cmp         r5, #0
  beq         TAIL_SAMPLES

  @ First r5 iterations in a loop, four samples per iteration.

LOOP:
  vld1.32     {d0, d1}, [r3]!     @ *ptr0

  vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
  vmull.s32   q13, d1, d29        @ input1 * (*ptr0)

  vrshrn.i64  d4, q10, #15        @ Round and narrow tmp32a back to 32 bits
  vrshrn.i64  d5, q11, #15

  vld1.32     {d2, d3}, [r12]     @ *ptr2
  vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a

  vrshrn.i64  d0, q12, #15        @ (input1 * (*ptr0)) >> 15, samples 0-1

  vmull.s32   q10, d6, d30        @ input2 * tmp32b
  vmull.s32   q11, d7, d30        @ input2 * tmp32b

  vrshrn.i64  d16, q10, #16       @ New *ptr2 values
  vrshrn.i64  d17, q11, #16

  vmull.s32   q10, d16, d28       @ input0 * (*ptr2)
  vmull.s32   q11, d17, d28       @ input0 * (*ptr2)

  vrshrn.i64  d1, q13, #15        @ (input1 * (*ptr0)) >> 15, samples 2-3
  vrshrn.i64  d18, q10, #15
  vrshrn.i64  d19, q11, #15

  vst1.32     {d16, d17}, [r12]!  @ *ptr2

  vadd.i32    q9, q0, q9          @ input1 * (*ptr0) + input0 * (*ptr2)
  subs        r5, #1
  vst1.32     {d18, d19}, [r4]!   @ *ptr1

  bgt         LOOP

TAIL_SAMPLES:
  @ Check how many samples still need to be processed.
  @ After the subtraction r6 = leftover - 2, i.e. one of -2, -1, 0, 1.
  subs        r6, #2
  blt         LAST_SAMPLE

  @ Process two more samples:
  vld1.32     d0, [r3]!           @ *ptr0

  vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q13, d0, d29        @ input1 * (*ptr0)

  vld1.32     d18, [r12]          @ *ptr2
  vrshrn.i64  d4, q11, #15

  vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
  vmull.s32   q11, d7, d30        @ input2 * tmp32b
  vrshrn.i64  d16, q11, #16       @ New *ptr2 values

  vmull.s32   q11, d16, d28       @ input0 * (*ptr2)
  vst1.32     d16, [r12]!         @ *ptr2

  vrshrn.i64  d0, q13, #15
  vrshrn.i64  d19, q11, #15
  vadd.i32    d19, d0, d19        @ input1 * (*ptr0) + input0 * (*ptr2)

  vst1.32     d19, [r4]!          @ *ptr1

  @ If there's still one more sample, process it here.
  @ r6 = leftover - 2, so bit 0 is set exactly when the leftover count was
  @ odd (1 or 3). Testing that bit, rather than comparing r6 with 1, also
  @ handles the case of a single leftover sample (r6 == -1).
LAST_SAMPLE:
  tst         r6, #1
  beq         END

  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));

  ldr         r7, [r3]            @ *ptr0
  ldr         r8, [r12]           @ *ptr2

  smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
  add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1)
  smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits
  lsl         r6, #16
  add         r6, r5, lsr #16     @ Only take the middle 32 bits
  str         r6, [r12]           @ Output (*ptr2, as 32 bits)

  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);

  smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
  smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
  lsl         r5, r5, #1
  add         r5, r6, lsl #1
  str         r5, [r4]            @ Output (*ptr1)

END:
  pop         {r4-r8}
  bx          lr

.fnend