1 /***********************************************************************
2 Copyright (c) 2017 Google Inc.
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <arm_neon.h>
33 #include "pitch.h"
34 
35 #ifdef FIXED_POINT
36 
celt_inner_prod_neon(const opus_val16 * x,const opus_val16 * y,int N)37 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
38 {
39     int i;
40     opus_val32 xy;
41     int16x8_t x_s16x8, y_s16x8;
42     int32x4_t xy_s32x4 = vdupq_n_s32(0);
43     int64x2_t xy_s64x2;
44     int64x1_t xy_s64x1;
45 
46     for (i = 0; i < N - 7; i += 8) {
47         x_s16x8  = vld1q_s16(&x[i]);
48         y_s16x8  = vld1q_s16(&y[i]);
49         xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
50         xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
51     }
52 
53     if (N - i >= 4) {
54         const int16x4_t x_s16x4 = vld1_s16(&x[i]);
55         const int16x4_t y_s16x4 = vld1_s16(&y[i]);
56         xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
57         i += 4;
58     }
59 
60     xy_s64x2 = vpaddlq_s32(xy_s32x4);
61     xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
62     xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
63 
64     for (; i < N; i++) {
65         xy = MAC16_16(xy, x[i], y[i]);
66     }
67 
68 #ifdef OPUS_CHECK_ASM
69     celt_assert(celt_inner_prod_c(x, y, N) == xy);
70 #endif
71 
72     return xy;
73 }
74 
dual_inner_prod_neon(const opus_val16 * x,const opus_val16 * y01,const opus_val16 * y02,int N,opus_val32 * xy1,opus_val32 * xy2)75 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
76         int N, opus_val32 *xy1, opus_val32 *xy2)
77 {
78     int i;
79     opus_val32 xy01, xy02;
80     int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
81     int32x4_t xy01_s32x4 = vdupq_n_s32(0);
82     int32x4_t xy02_s32x4 = vdupq_n_s32(0);
83     int64x2_t xy01_s64x2, xy02_s64x2;
84     int64x1_t xy01_s64x1, xy02_s64x1;
85 
86     for (i = 0; i < N - 7; i += 8) {
87         x_s16x8    = vld1q_s16(&x[i]);
88         y01_s16x8  = vld1q_s16(&y01[i]);
89         y02_s16x8  = vld1q_s16(&y02[i]);
90         xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
91         xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
92         xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
93         xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
94     }
95 
96     if (N - i >= 4) {
97         const int16x4_t x_s16x4   = vld1_s16(&x[i]);
98         const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
99         const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
100         xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
101         xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
102         i += 4;
103     }
104 
105     xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
106     xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
107     xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
108     xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
109     xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
110     xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
111 
112     for (; i < N; i++) {
113         xy01 = MAC16_16(xy01, x[i], y01[i]);
114         xy02 = MAC16_16(xy02, x[i], y02[i]);
115     }
116     *xy1 = xy01;
117     *xy2 = xy02;
118 
119 #ifdef OPUS_CHECK_ASM
120     {
121         opus_val32 xy1_c, xy2_c;
122         dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
123         celt_assert(xy1_c == *xy1);
124         celt_assert(xy2_c == *xy2);
125     }
126 #endif
127 }
128 
129 #else /* !FIXED_POINT */
130 
131 /* ========================================================================== */
132 
133 #ifdef OPUS_CHECK_ASM
134 
135 /* This part of code simulates floating-point NEON operations. */
136 
137 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
138 /* operations of celt_inner_prod_neon(), and both functions should have bit */
139 /* exact output.                                                            */
celt_inner_prod_neon_float_c_simulation(const opus_val16 * x,const opus_val16 * y,int N)140 static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
141 {
142    int i;
143    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
144    for (i = 0; i < N - 3; i += 4) {
145       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
146       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
147       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
148       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
149    }
150    xy0 += xy2;
151    xy1 += xy3;
152    xy = xy0 + xy1;
153    for (; i < N; i++) {
154       xy = MAC16_16(xy, x[i], y[i]);
155    }
156    return xy;
157 }
158 
159 /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
160 /* operations of dual_inner_prod_neon(), and both functions should have bit */
161 /* exact output.                                                            */
dual_inner_prod_neon_float_c_simulation(const opus_val16 * x,const opus_val16 * y01,const opus_val16 * y02,int N,opus_val32 * xy1,opus_val32 * xy2)162 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
163       int N, opus_val32 *xy1, opus_val32 *xy2)
164 {
165    int i;
166    opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
167    for (i = 0; i < N - 3; i += 4) {
168       xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
169       xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
170       xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
171       xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
172       xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
173       xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
174       xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
175       xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
176    }
177    xy01_0 += xy01_2;
178    xy02_0 += xy02_2;
179    xy01_1 += xy01_3;
180    xy02_1 += xy02_3;
181    xy01 = xy01_0 + xy01_1;
182    xy02 = xy02_0 + xy02_1;
183    for (; i < N; i++) {
184       xy01 = MAC16_16(xy01, x[i], y01[i]);
185       xy02 = MAC16_16(xy02, x[i], y02[i]);
186    }
187    *xy1 = xy01;
188    *xy2 = xy02;
189 }
190 
191 #endif /* OPUS_CHECK_ASM */
192 
193 /* ========================================================================== */
194 
celt_inner_prod_neon(const opus_val16 * x,const opus_val16 * y,int N)195 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
196 {
197     int i;
198     opus_val32 xy;
199     float32x4_t xy_f32x4 = vdupq_n_f32(0);
200     float32x2_t xy_f32x2;
201 
202     for (i = 0; i < N - 7; i += 8) {
203         float32x4_t x_f32x4, y_f32x4;
204         x_f32x4  = vld1q_f32(&x[i]);
205         y_f32x4  = vld1q_f32(&y[i]);
206         xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
207         x_f32x4  = vld1q_f32(&x[i + 4]);
208         y_f32x4  = vld1q_f32(&y[i + 4]);
209         xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
210     }
211 
212     if (N - i >= 4) {
213         const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
214         const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
215         xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
216         i += 4;
217     }
218 
219     xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
220     xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
221     xy       = vget_lane_f32(xy_f32x2, 0);
222 
223     for (; i < N; i++) {
224         xy = MAC16_16(xy, x[i], y[i]);
225     }
226 
227 #ifdef OPUS_CHECK_ASM
228     celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
229 #endif
230 
231     return xy;
232 }
233 
dual_inner_prod_neon(const opus_val16 * x,const opus_val16 * y01,const opus_val16 * y02,int N,opus_val32 * xy1,opus_val32 * xy2)234 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
235         int N, opus_val32 *xy1, opus_val32 *xy2)
236 {
237     int i;
238     opus_val32 xy01, xy02;
239     float32x4_t xy01_f32x4 = vdupq_n_f32(0);
240     float32x4_t xy02_f32x4 = vdupq_n_f32(0);
241     float32x2_t xy01_f32x2, xy02_f32x2;
242 
243     for (i = 0; i < N - 7; i += 8) {
244         float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
245         x_f32x4    = vld1q_f32(&x[i]);
246         y01_f32x4  = vld1q_f32(&y01[i]);
247         y02_f32x4  = vld1q_f32(&y02[i]);
248         xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
249         xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
250         x_f32x4    = vld1q_f32(&x[i + 4]);
251         y01_f32x4  = vld1q_f32(&y01[i + 4]);
252         y02_f32x4  = vld1q_f32(&y02[i + 4]);
253         xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
254         xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
255     }
256 
257     if (N - i >= 4) {
258         const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
259         const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
260         const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
261         xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
262         xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
263         i += 4;
264     }
265 
266     xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
267     xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
268     xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
269     xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
270     xy01       = vget_lane_f32(xy01_f32x2, 0);
271     xy02       = vget_lane_f32(xy02_f32x2, 0);
272 
273     for (; i < N; i++) {
274         xy01 = MAC16_16(xy01, x[i], y01[i]);
275         xy02 = MAC16_16(xy02, x[i], y02[i]);
276     }
277     *xy1 = xy01;
278     *xy2 = xy02;
279 
280 #ifdef OPUS_CHECK_ASM
281     {
282         opus_val32 xy1_c, xy2_c;
283         dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
284         celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
285         celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
286     }
287 #endif
288 }
289 
290 #endif /* FIXED_POINT */
291