1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13
14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
15
// Adds the two 64-bit lanes of |v| together and returns the scalar result.
static inline int64_t SumLanesS64(int64x2_t v) {
#ifdef WEBRTC_ARCH_ARM64
  return vaddvq_s64(v);
#else
  return vget_lane_s64(vadd_s64(vget_low_s64(v), vget_high_s64(v)), 0);
#endif
}

// Autocorrelation function in fixed point, implemented with NEON intrinsics.
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// r      - output; r[0] .. r[order] receive the lag sums, each right-shifted
//          by the amount stored in |*scale|.
// x      - input samples, x[0] .. x[n - 1].
// n      - number of samples; asserted to be a multiple of 4 and at least 8.
// order  - highest lag that is computed.
// scale  - output; receives the right-shift that was applied to every r[k].
//
// Returns order + 1.
int WebRtcIsacfix_AutocorrNeon(int32_t* __restrict r,
                               const int16_t* x,
                               int16_t n,
                               int16_t order,
                               int16_t* __restrict scale) {
  assert(n % 4 == 0);
  assert(n >= 8);

  // Lag 0: sum of squares, four samples per iteration (n is a multiple of 4).
  int64x2_t acc = vdupq_n_s64(0);
  const int16_t* const x_stop = x + n;
  for (const int16_t* p = x; p < x_stop; p += 4) {
    const int16x4_t s = vld1_s16(p);
    acc = vpadalq_s32(acc, vmull_s16(s, s));
  }
  const int64_t energy = SumLanesS64(acc);

  // Derive the shift from whatever part of the lag-0 sum lies above bit 30;
  // no shift at all when nothing does. The same shift is used for every lag.
  const uint32_t energy_hi = (uint32_t)(energy >> 31);
  int16_t shift = 0;
  if (energy_hi != 0) {
    shift = 32 - WebRtcSpl_NormU32(energy_hi);
  }
  r[0] = (int32_t)(energy >> shift);

  // Lags 1 .. order: correlate x[k] with x[k + lag] over the n - lag
  // overlapping samples.
  for (int lag = 1; lag <= order; ++lag) {
    // Samples left over once the overlap is consumed in groups of eight.
    const int leftover = (n - lag) % 8;
    const int16_t* a = x;
    const int16_t* b = x + lag;
    const int16_t* const wide_stop = x + n - lag - leftover;
    const int16_t* const overlap_stop = x + n - lag;

    acc = vdupq_n_s64(0);

    // Eight products per iteration, widened to 32 bits and accumulated
    // pairwise into the two 64-bit lanes.
    for (; a < wide_stop; a += 8, b += 8) {
      const int16x8_t va = vld1q_s16(a);
      const int16x8_t vb = vld1q_s16(b);
      const int32x4_t lo = vmull_s16(vget_low_s16(va), vget_low_s16(vb));
#ifdef WEBRTC_ARCH_ARM64
      const int32x4_t hi = vmull_high_s16(va, vb);
#else
      const int32x4_t hi = vmull_s16(vget_high_s16(va), vget_high_s16(vb));
#endif
      acc = vpadalq_s32(acc, lo);
      acc = vpadalq_s32(acc, hi);
    }

    // One more group of four if at least four samples remain.
    if (leftover >= 4) {
      acc = vpadalq_s32(acc, vmull_s16(vld1_s16(a), vld1_s16(b)));
      a += 4;
      b += 4;
    }

    // Fold the vector lanes, then finish the last (fewer than four) products
    // one at a time.
    int64_t sum = SumLanesS64(acc);
    for (; a < overlap_stop; ++a, ++b) {
      sum += *a * *b;
    }

    r[lag] = (int32_t)(sum >> shift);
  }

  *scale = shift;

  return order + 1;
}
114
115