1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 #include <stdlib.h>
14 
15 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
16 
// Maximum absolute value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Scans the |length| samples starting at |vector| and returns the largest
// magnitude found. The result is limited to WEBRTC_SPL_WORD16_MAX so that the
// magnitude of -32768 still fits the int16_t return type.
// |length| is asserted to be greater than zero.
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
  assert(length > 0);

  // Split the input into a multiple-of-8 prefix (vector path) and a short
  // tail of up to 7 samples (scalar path).
  const size_t tail_count = length & 7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + length - tail_count;
  const int16_t* const vector_end = vector + length;

  // Vector path: fold eight samples per iteration into a running maximum.
  uint16x8_t running_max = vdupq_n_u16(0);
  for (; cursor < simd_end; cursor += 8) {
    // vabsq_s16 leaves -32768 unchanged; comparing the lanes as u16 keeps
    // that value as the largest magnitude instead of losing it.
    const int16x8_t magnitudes = vabsq_s16(vld1q_s16(cursor));
    running_max = vmaxq_u16(running_max, vreinterpretq_u16_s16(magnitudes));
  }

  // Reduce the eight lanes to a single scalar.
#ifdef WEBRTC_ARCH_ARM64
  int largest = (int)vmaxvq_u16(running_max);
#else
  uint16x4_t half_max =
      vmax_u16(vget_low_u16(running_max), vget_high_u16(running_max));
  half_max = vpmax_u16(half_max, half_max);
  half_max = vpmax_u16(half_max, half_max);
  int largest = (int)vget_lane_u16(half_max, 0);
#endif

  // Scalar path: handle the samples that did not fill a whole vector.
  for (; cursor < vector_end; ++cursor) {
    const int magnitude = abs((int)(*cursor));
    if (magnitude > largest) {
      largest = magnitude;
    }
  }

  // Guard the case for abs(-32768), which exceeds the int16_t range.
  return (int16_t)(largest > WEBRTC_SPL_WORD16_MAX ? WEBRTC_SPL_WORD16_MAX
                                                   : largest);
}
68 
// Maximum absolute value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the largest magnitude among the |length| elements starting at
// |vector|. The result is limited with WEBRTC_SPL_MIN to
// WEBRTC_SPL_WORD32_MAX so that the magnitude of 0x80000000 still fits the
// int32_t return type. |length| is asserted to be greater than zero.
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
  // Use uint32_t for the local variables, so the magnitude of 0x80000000
  // (which does not fit in int32_t) stays representable until the final
  // clamp.
  uint32_t absolute = 0, maximum = 0;
  size_t i = 0;
  size_t residual = length & 0x7;

  assert(length > 0);

  const int32_t* p_start = vector;
  uint32x4_t max32x4_0 = vdupq_n_u32(0);
  uint32x4_t max32x4_1 = vdupq_n_u32(0);

  // First part, unroll the loop 8 times (two 4-lane accumulators).
  for (i = 0; i < length - residual; i += 8) {
    int32x4_t in32x4_0 = vld1q_s32(p_start);
    p_start += 4;
    int32x4_t in32x4_1 = vld1q_s32(p_start);
    p_start += 4;
    in32x4_0 = vabsq_s32(in32x4_0);
    in32x4_1 = vabsq_s32(in32x4_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
  }

  // Merge the two accumulators, then reduce the four lanes to one scalar.
  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
#if defined(WEBRTC_ARCH_ARM64)
  maximum = vmaxvq_u32(max32x4);
#else
  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
  max32x2 = vpmax_u32(max32x2, max32x2);

  maximum = vget_lane_u32(max32x2, 0);
#endif

  // Second part, do the remaining iterations (if any).
  for (i = residual; i > 0; i--) {
    // abs() on the most negative int is undefined behavior because the
    // result is not representable. Negating in unsigned arithmetic is well
    // defined and yields 0x80000000 for that input.
    const int32_t value = *p_start;
    absolute = (value < 0) ? (uint32_t)(0u - (uint32_t)value)
                           : (uint32_t)value;
    if (absolute > maximum) {
      maximum = absolute;
    }
    p_start++;
  }

  // Guard against the case for 0x80000000.
  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);

  return (int32_t)maximum;
}
123 
// Maximum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the largest element among the |length| samples starting at
// |vector|. |length| is asserted to be greater than zero.
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
  assert(length > 0);

  // Samples beyond the last full group of eight are handled one at a time.
  const size_t tail_count = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail_count);

  // Vector path: eight samples per iteration, with every lane seeded to
  // WEBRTC_SPL_WORD16_MIN.
  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
  while (cursor < simd_end) {
    lane_max = vmaxq_s16(lane_max, vld1q_s16(cursor));
    cursor += 8;
  }

  // Reduce the eight lanes to a single scalar.
#if defined(WEBRTC_ARCH_ARM64)
  int16_t largest = vmaxvq_s16(lane_max);
#else
  int16x4_t half_max =
      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
  half_max = vpmax_s16(half_max, half_max);
  half_max = vpmax_s16(half_max, half_max);
  int16_t largest = vget_lane_s16(half_max, 0);
#endif

  // Scalar path: the remaining samples (if any).
  const int16_t* const vector_end = cursor + tail_count;
  for (; cursor < vector_end; ++cursor) {
    if (*cursor > largest) {
      largest = *cursor;
    }
  }
  return largest;
}
161 
// Maximum value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the largest element among the |length| values starting at
// |vector|. |length| is asserted to be greater than zero.
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
  assert(length > 0);

  // Values beyond the last full group of eight are handled one at a time.
  const size_t tail_count = length & 0x7;
  const int32_t* cursor = vector;
  const int32_t* const simd_end = vector + (length - tail_count);

  // Vector path: eight values per iteration, split across two 4-lane
  // accumulators that are both seeded with WEBRTC_SPL_WORD32_MIN.
  int32x4_t acc_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  int32x4_t acc_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
  while (cursor < simd_end) {
    const int32x4_t first_four = vld1q_s32(cursor);
    const int32x4_t next_four = vld1q_s32(cursor + 4);
    acc_lo = vmaxq_s32(acc_lo, first_four);
    acc_hi = vmaxq_s32(acc_hi, next_four);
    cursor += 8;
  }

  // Merge the accumulators, then reduce the four lanes to one scalar.
  const int32x4_t lane_max = vmaxq_s32(acc_lo, acc_hi);
#if defined(WEBRTC_ARCH_ARM64)
  int32_t largest = vmaxvq_s32(lane_max);
#else
  int32x2_t half_max =
      vmax_s32(vget_low_s32(lane_max), vget_high_s32(lane_max));
  half_max = vpmax_s32(half_max, half_max);
  int32_t largest = vget_lane_s32(half_max, 0);
#endif

  // Scalar path: the remaining values (if any).
  const int32_t* const vector_end = cursor + tail_count;
  for (; cursor < vector_end; ++cursor) {
    if (*cursor > largest) {
      largest = *cursor;
    }
  }
  return largest;
}
203 
// Minimum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the smallest element among the |length| samples starting at
// |vector|. |length| is asserted to be greater than zero.
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
  assert(length > 0);

  // Samples beyond the last full group of eight are handled one at a time.
  const size_t tail_count = length & 0x7;
  const int16_t* cursor = vector;
  const int16_t* const simd_end = vector + (length - tail_count);

  // Vector path: eight samples per iteration, with every lane seeded to
  // WEBRTC_SPL_WORD16_MAX.
  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
  while (cursor < simd_end) {
    lane_min = vminq_s16(lane_min, vld1q_s16(cursor));
    cursor += 8;
  }

  // Reduce the eight lanes to a single scalar.
#if defined(WEBRTC_ARCH_ARM64)
  int16_t smallest = vminvq_s16(lane_min);
#else
  int16x4_t half_min =
      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
  half_min = vpmin_s16(half_min, half_min);
  half_min = vpmin_s16(half_min, half_min);
  int16_t smallest = vget_lane_s16(half_min, 0);
#endif

  // Scalar path: the remaining samples (if any).
  const int16_t* const vector_end = cursor + tail_count;
  for (; cursor < vector_end; ++cursor) {
    if (*cursor < smallest) {
      smallest = *cursor;
    }
  }
  return smallest;
}
241 
// Minimum value of word32 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
//
// Returns the smallest element among the |length| values starting at
// |vector|. |length| is asserted to be greater than zero.
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
  assert(length > 0);

  // Values beyond the last full group of eight are handled one at a time.
  const size_t tail_count = length & 0x7;
  const int32_t* cursor = vector;
  const int32_t* const simd_end = vector + (length - tail_count);

  // Vector path: eight values per iteration, split across two 4-lane
  // accumulators that are both seeded with WEBRTC_SPL_WORD32_MAX.
  int32x4_t acc_lo = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  int32x4_t acc_hi = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
  while (cursor < simd_end) {
    const int32x4_t first_four = vld1q_s32(cursor);
    const int32x4_t next_four = vld1q_s32(cursor + 4);
    acc_lo = vminq_s32(acc_lo, first_four);
    acc_hi = vminq_s32(acc_hi, next_four);
    cursor += 8;
  }

  // Merge the accumulators, then reduce the four lanes to one scalar.
  const int32x4_t lane_min = vminq_s32(acc_lo, acc_hi);
#if defined(WEBRTC_ARCH_ARM64)
  int32_t smallest = vminvq_s32(lane_min);
#else
  int32x2_t half_min =
      vmin_s32(vget_low_s32(lane_min), vget_high_s32(lane_min));
  half_min = vpmin_s32(half_min, half_min);
  int32_t smallest = vget_lane_s32(half_min, 0);
#endif

  // Scalar path: the remaining values (if any).
  const int32_t* const vector_end = cursor + tail_count;
  for (; cursor < vector_end; ++cursor) {
    if (*cursor < smallest) {
      smallest = *cursor;
    }
  }
  return smallest;
}
283 
284