1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
15 
16 // Contains a function for the core loop in the normalized lattice MA
17 // filter routine for iSAC codec, optimized for ARM Neon platform.
18 // It does:
19 //  for 0 <= n < HALF_SUBFRAMELEN - 1:
20 //    *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
21 //    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
22 // Output is not bit-exact with the reference C code, due to the replacement
23 // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
24 // instructions. The difference should not be bigger than 1.
WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,int16_t input1,int32_t input2,int32_t * ptr0,int32_t * ptr1,int32_t * ptr2)25 void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
26                                     int16_t input1,  // Filter coefficient
27                                     int32_t input2,  // Inverse coefficient
28                                     int32_t* ptr0,   // Sample buffer
29                                     int32_t* ptr1,   // Sample buffer
30                                     int32_t* ptr2)   // Sample buffer
31 {
32   int n = 0;
33   int loop = (HALF_SUBFRAMELEN - 1) >> 3;
34   int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;
35 
36   int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
37   int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
38   int32x4_t input2_v = vdupq_n_s32(input2);
39   int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
40   int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
41   int32x4_t ptr0va, ptr1va, ptr2va;
42   int32x4_t ptr0vb, ptr1vb, ptr2vb;
43 
44   int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high;
45   // Unroll to process 8 samples at once.
46   for (n = 0; n < loop; n++) {
47     ptr0va = vld1q_s32(ptr0);
48     ptr0vb = vld1q_s32(ptr0 + 4);
49     ptr0 += 8;
50 
51     ptr2va = vld1q_s32(ptr2);
52     ptr2vb = vld1q_s32(ptr2 + 4);
53 
54     // Calculate tmp0 = (*ptr0) * input0.
55     tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
56     tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);
57 
58     // Calculate tmp1 = (*ptr0) * input1.
59     tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
60     tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);
61 
62     // Calculate tmp2 = tmp0 + *(ptr2).
63     tmp2a = vaddq_s32(tmp0a, ptr2va);
64     tmp2b = vaddq_s32(tmp0b, ptr2vb);
65 
66     // Calculate *ptr2 = input2 * tmp2.
67     tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
68 #if defined(WEBRTC_ARCH_ARM64)
69     tmp2al_high = vmull_high_s32(tmp2a, input2_v);
70 #else
71     tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
72 #endif
73     ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
74                           vrshrn_n_s64(tmp2al_high, 16));
75 
76     tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v));
77 #if defined(WEBRTC_ARCH_ARM64)
78     tmp2bl_high = vmull_high_s32(tmp2b, input2_v);
79 #else
80     tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v));
81 #endif
82     ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16),
83                           vrshrn_n_s64(tmp2bl_high, 16));
84 
85     vst1q_s32(ptr2, ptr2va);
86     vst1q_s32(ptr2 + 4, ptr2vb);
87     ptr2 += 8;
88 
89     // Calculate tmp3 = ptr2v * input0.
90     tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
91     tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);
92 
93     // Calculate *ptr1 = tmp1 + tmp3.
94     ptr1va = vaddq_s32(tmp1a, tmp3a);
95     ptr1vb = vaddq_s32(tmp1b, tmp3b);
96 
97     vst1q_s32(ptr1, ptr1va);
98     vst1q_s32(ptr1 + 4, ptr1vb);
99     ptr1 += 8;
100   }
101 
102   // Process four more samples.
103   if (loop_tail & 0x4) {
104     ptr0va = vld1q_s32(ptr0);
105     ptr2va = vld1q_s32(ptr2);
106     ptr0 += 4;
107 
108     // Calculate tmp0 = (*ptr0) * input0.
109     tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
110 
111     // Calculate tmp1 = (*ptr0) * input1.
112     tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
113 
114     // Calculate tmp2 = tmp0 + *(ptr2).
115     tmp2a = vaddq_s32(tmp0a, ptr2va);
116 
117     // Calculate *ptr2 = input2 * tmp2.
118     tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
119 
120 #if defined(WEBRTC_ARCH_ARM64)
121     tmp2al_high = vmull_high_s32(tmp2a, input2_v);
122 #else
123     tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
124 #endif
125     ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
126                           vrshrn_n_s64(tmp2al_high, 16));
127 
128     vst1q_s32(ptr2, ptr2va);
129     ptr2 += 4;
130 
131     // Calculate tmp3 = *(ptr2) * input0.
132     tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
133 
134     // Calculate *ptr1 = tmp1 + tmp3.
135     ptr1va = vaddq_s32(tmp1a, tmp3a);
136 
137     vst1q_s32(ptr1, ptr1va);
138     ptr1 += 4;
139   }
140 
141   // Process two more samples.
142   if (loop_tail & 0x2) {
143     int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
144     int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
145     int64x2_t tmp2l_tail;
146     ptr0v_tail = vld1_s32(ptr0);
147     ptr2v_tail = vld1_s32(ptr2);
148     ptr0 += 2;
149 
150     // Calculate tmp0 = (*ptr0) * input0.
151     tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));
152 
153     // Calculate tmp1 = (*ptr0) * input1.
154     tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));
155 
156     // Calculate tmp2 = tmp0 + *(ptr2).
157     tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
158 
159     // Calculate *ptr2 = input2 * tmp2.
160     tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v));
161     ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16);
162 
163     vst1_s32(ptr2, ptr2v_tail);
164     ptr2 += 2;
165 
166     // Calculate tmp3 = *(ptr2) * input0.
167     tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));
168 
169     // Calculate *ptr1 = tmp1 + tmp3.
170     ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);
171 
172     vst1_s32(ptr1, ptr1v_tail);
173     ptr1 += 2;
174   }
175 
176   // Process one more sample.
177   if (loop_tail & 0x1) {
178     int16_t t16a = (int16_t)(input2 >> 16);
179     int16_t t16b = (int16_t)input2;
180     if (t16b < 0) t16a++;
181     int32_t tmp32a;
182     int32_t tmp32b;
183 
184     // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
185     tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
186     tmp32b = *ptr2 + tmp32a;
187     *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
188                        (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));
189 
190     // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
191     tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
192     tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
193     *ptr1 = tmp32a + tmp32b;
194   }
195 }
196