1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
// Contains the function WebRtcIsacfix_AllpassFilter2FixDec16Neon() of the
// iSAC codec, optimized for the ARM NEON platform. It is bit exact with the
// function WebRtcIsacfix_AllpassFilter2FixDec16C() in filterbanks.c. The
// prototype C code is at the end of this file.
15 
16 #include <arm_neon.h>
17 #include <assert.h>
18 
// Filters two channels in place through a cascade of two first-order
// sections per channel, processing both channels and both sections in one
// 4-lane NEON vector:
//   lane 0: channel 1, section 0 (factor_ch1[0], filter_state_ch1[0])
//   lane 1: channel 1, section 1 (factor_ch1[1], filter_state_ch1[1])
//   lane 2: channel 2, section 0 (factor_ch2[0], filter_state_ch2[0])
//   lane 3: channel 2, section 1 (factor_ch2[1], filter_state_ch2[1])
//
// Each step computes, per lane (all additions/subtractions saturating):
//   a     = state + 2 * in * factor
//   out   = a >> 16
//   state = (in << 16) - 2 * out * factor
// Section 1 runs one step behind section 0: its input is the output that
// section 0 produced in the previous step. vrev32_s16() swaps the two 16-bit
// lanes of each channel, which moves section 0's latest output into the
// section 1 lane and the freshly loaded sample into the section 0 lane.
//
// |length| must be even. The code reads factor_ch*[0..1] and reads/writes
// data_ch*[0..length-1] and filter_state_ch*[0..1]. When |length| is smaller
// than 2 the function returns without touching any buffer.
void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
    int16_t* data_ch1,  // Input and output in channel 1, in Q0
    int16_t* data_ch2,  // Input and output in channel 2, in Q0
    const int16_t* factor_ch1,  // Scaling factor for channel 1, in Q15
    const int16_t* factor_ch2,  // Scaling factor for channel 2, in Q15
    const int length,  // Length of the data buffers
    int32_t* filter_state_ch1,  // Filter state for channel 1, in Q16
    int32_t* filter_state_ch2) {  // Filter state for channel 2, in Q16
  assert(length % 2 == 0);

  // The unrolled prologue and epilogue below always consume and produce two
  // samples per channel, so there is nothing safe to do for shorter input.
  if (length < 2) {
    return;
  }

  int n = 0;
  int16x4_t factorv;
  int16x4_t datav;
  int32x4_t statev;

  // Load factor_ch1 and factor_ch2.
  factorv = vld1_dup_s16(factor_ch1);
  factorv = vld1_lane_s16(factor_ch1 + 1, factorv, 1);
  factorv = vld1_lane_s16(factor_ch2, factorv, 2);
  factorv = vld1_lane_s16(factor_ch2 + 1, factorv, 3);

  // Load filter_state_ch1[0] and filter_state_ch2[0].
  // Lanes 1 and 3 hold duplicated filler here; they are overwritten with the
  // real section 1 states before they are first used.
  statev = vld1q_dup_s32(filter_state_ch1);
  statev = vld1q_lane_s32(filter_state_ch2, statev, 2);

  // Loop unrolling preprocessing.
  int32x4_t a;
  int16x4_t tmp1, tmp2;

  // Load data_ch1[0] and data_ch2[0].
  datav = vld1_dup_s16(data_ch1);
  datav = vld1_lane_s16(data_ch2, datav, 2);

  // Section 0 only (lanes 0 and 2): a = state + 2 * data * factor.
  a = vqdmlal_s16(statev, datav, factorv);
  tmp1 = vshrn_n_s32(a, 16);

  // Update filter_state_ch1[0] and filter_state_ch2[0].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);

  // Load filter_state_ch1[1] and filter_state_ch2[1].
  statev = vld1q_lane_s32(filter_state_ch1 + 1, statev, 1);
  statev = vld1q_lane_s32(filter_state_ch2 + 1, statev, 3);

  // Load data_ch1[1] and data_ch2[1].
  // After the swap, lanes 0/2 hold the new input samples and lanes 1/3 hold
  // the section 0 outputs computed above.
  tmp1 = vld1_lane_s16(data_ch1 + 1, tmp1, 1);
  tmp1 = vld1_lane_s16(data_ch2 + 1, tmp1, 3);
  datav = vrev32_s16(tmp1);

  // Loop unrolling processing.
  for (n = 0; n < length - 2; n += 2) {
    a = vqdmlal_s16(statev, datav, factorv);
    tmp1 = vshrn_n_s32(a, 16);
    // Store data_ch1[n] and data_ch2[n].
    // Lanes 1 and 3 carry the section 1 outputs, i.e. the final samples.
    vst1_lane_s16(data_ch1 + n, tmp1, 1);
    vst1_lane_s16(data_ch2 + n, tmp1, 3);

    // Update filter_state_ch1[0], filter_state_ch1[1]
    // and filter_state_ch2[0], filter_state_ch2[1].
    statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);

    // Load data_ch1[n + 2] and data_ch2[n + 2].
    tmp1 = vld1_lane_s16(data_ch1 + n + 2, tmp1, 1);
    tmp1 = vld1_lane_s16(data_ch2 + n + 2, tmp1, 3);
    datav = vrev32_s16(tmp1);

    a = vqdmlal_s16(statev, datav, factorv);
    tmp2 = vshrn_n_s32(a, 16);
    // Store data_ch1[n + 1] and data_ch2[n + 1].
    vst1_lane_s16(data_ch1 + n + 1, tmp2, 1);
    vst1_lane_s16(data_ch2 + n + 1, tmp2, 3);

    // Update filter_state_ch1[0], filter_state_ch1[1]
    // and filter_state_ch2[0], filter_state_ch2[1].
    statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv);

    // Load data_ch1[n + 3] and data_ch2[n + 3].
    tmp2 = vld1_lane_s16(data_ch1 + n + 3, tmp2, 1);
    tmp2 = vld1_lane_s16(data_ch2 + n + 3, tmp2, 3);
    datav = vrev32_s16(tmp2);
  }

  // Loop unrolling post-processing.
  a = vqdmlal_s16(statev, datav, factorv);
  tmp1 = vshrn_n_s32(a, 16);
  // Store data_ch1[n] and data_ch2[n].
  vst1_lane_s16(data_ch1 + n, tmp1, 1);
  vst1_lane_s16(data_ch2 + n, tmp1, 3);

  // Update filter_state_ch1[0], filter_state_ch1[1]
  // and filter_state_ch2[0], filter_state_ch2[1].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);
  // Store filter_state_ch1[0] and filter_state_ch2[0].
  // Section 0 has now consumed the last input sample, so its state is final.
  vst1q_lane_s32(filter_state_ch1, statev, 0);
  vst1q_lane_s32(filter_state_ch2, statev, 2);

  // Section 1 still has to consume the last section 0 output; only lanes 1
  // and 3 of the results below are meaningful.
  datav = vrev32_s16(tmp1);
  a = vqdmlal_s16(statev, datav, factorv);
  tmp2 = vshrn_n_s32(a, 16);
  // Store data_ch1[n + 1] and data_ch2[n + 1].
  vst1_lane_s16(data_ch1 + n + 1, tmp2, 1);
  vst1_lane_s16(data_ch2 + n + 1, tmp2, 3);

  // Update filter_state_ch1[1] and filter_state_ch2[1].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv);
  // Store filter_state_ch1[1] and filter_state_ch2[1].
  vst1q_lane_s32(filter_state_ch1 + 1, statev, 1);
  vst1q_lane_s32(filter_state_ch2 + 1, statev, 3);
}
126 
// This is the C prototype of the NEON-optimized function above.
128 //void AllpassFilter2FixDec16BothChannels(
129 //    int16_t *data_ch1,  // Input and output in channel 1, in Q0
130 //    int16_t *data_ch2,  // Input and output in channel 2, in Q0
131 //    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
132 //    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
133 //    const int length,  // Length of the data buffers
134 //    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
135 //    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
136 //  int n = 0;
137 //  int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
138 //  int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
139 //  int16_t sample0_ch1 = 0, sample0_ch2 = 0;
140 //  int16_t sample1_ch1 = 0, sample1_ch2  = 0;
141 //  int32_t a0_ch1 = 0, a0_ch2 = 0;
142 //  int32_t b0_ch1 = 0, b0_ch2 = 0;
143 //
144 //  int32_t a1_ch1 = 0, a1_ch2 = 0;
145 //  int32_t b1_ch1 = 0, b1_ch2 = 0;
146 //  int32_t b2_ch1  = 0, b2_ch2 = 0;
147 //
148 //  // Loop unrolling preprocessing.
149 //
150 //  sample0_ch1 = data_ch1[n];
151 //  sample0_ch2 = data_ch2[n];
152 //
153 //  a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
154 //  a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
155 //
156 //  b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
157 //  b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
158 //
159 //  a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
160 //  a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
161 //
162 //  state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
163 //  state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
164 //
165 //  sample1_ch1 = data_ch1[n + 1];
166 //  sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
167 //  sample1_ch2  = data_ch2[n + 1];
168 //  sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
169 //
170 //
171 //  for (n = 0; n < length - 2; n += 2) {
172 //    a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
173 //    a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
174 //    a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
175 //    a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
176 //
177 //    b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
178 //    b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
179 //    b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
180 //    b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
181 //
182 //    a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
183 //    a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
184 //    a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
185 //    a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
186 //
187 //    state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
188 //    state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
189 //    state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
190 //    state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
191 //
192 //    sample0_ch1 = data_ch1[n + 2];
193 //    sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
194 //    sample0_ch2 = data_ch2[n + 2];
195 //    sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
196 //
197 //    a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
198 //    a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
199 //    a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
200 //    a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
201 //
202 //    b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
203 //    b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
204 //    b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
205 //    b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
206 //
207 //    a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
208 //    a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
209 //    a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
210 //    a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
211 //
212 //    state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
213 //    state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
214 //    state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
215 //    state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
216 //
217 //
218 //    sample1_ch1 = data_ch1[n + 3];
219 //    sample0_ch1 = (int16_t) (b2_ch1  >> 16); //Save as Q0
220 //    sample1_ch2 = data_ch2[n + 3];
221 //    sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
222 //
223 //    data_ch1[n]     = (int16_t) (b0_ch1 >> 16); //Save as Q0
224 //    data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
225 //    data_ch2[n]     = (int16_t) (b0_ch2 >> 16);
226 //    data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
227 //  }
228 //
229 //  // Loop unrolling post-processing.
230 //
231 //  a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
232 //  a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
233 //  a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
234 //  a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
235 //
236 //  b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
237 //  b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
238 //  b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
239 //  b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
240 //
241 //  a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
242 //  a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
243 //  a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
244 //  a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
245 //
246 //  state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
247 //  state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
248 //  state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
249 //  state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
250 //
251 //  data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
252 //  data_ch2[n] = (int16_t) (b0_ch2 >> 16);
253 //
254 //  sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
255 //  sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
256 //
257 //  a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
258 //  a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
259 //
260 //  b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
261 //  b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
262 //
263 //  a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
264 //  a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
265 //
266 //  state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
267 //  state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
268 //
269 //  data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
270 //  data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
271 //
272 //  filter_state_ch1[0] = state0_ch1;
273 //  filter_state_ch1[1] = state1_ch1;
274 //  filter_state_ch2[0] = state0_ch2;
275 //  filter_state_ch2[1] = state1_ch2;
276 //}
277