1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
15 
// Returns the sum of the two 64-bit lanes of |v| as a scalar.
static inline int64_t SumLanesS64(int64x2_t v) {
#ifdef WEBRTC_ARCH_ARM64
  return vaddvq_s64(v);
#else
  return vget_lane_s64(vadd_s64(vget_low_s64(v), vget_high_s64(v)), 0);
#endif
}

// Fixed-point autocorrelation of x[0..n-1] for lags 0..order (NEON version).
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// Input:
//   - x      : Input samples; n values are read.
//   - n      : Number of samples. Asserted to be a multiple of 4 and >= 8.
//   - order  : Highest lag to compute.
//
// Output:
//   - r      : r[0]..r[order] are written, so room for order + 1 values is
//              needed. r[k] is the 64-bit sum of x[j] * x[j + k] over
//              j = 0..n-k-1, shifted right by |*scale| bits.
//   - scale  : The right shift that was applied to every r[k].
//
// Return value: order + 1.
int WebRtcIsacfix_AutocorrNeon(int32_t* __restrict r,
                               const int16_t* x,
                               int16_t n,
                               int16_t order,
                               int16_t* __restrict scale) {
  assert(n % 4 == 0);
  assert(n >= 8);

  // Lag 0: sum of squares, four samples per iteration. vpadalq_s32 adds
  // adjacent pairs of the 32-bit products into the two 64-bit accumulators.
  int64x2_t acc_v = vdupq_n_s64(0);
  for (const int16_t* p = x; p < x + n; p += 4) {
    const int16x4_t s_v = vld1_s16(p);
    acc_v = vpadalq_s32(acc_v, vmull_s16(s_v, s_v));
  }
  const int64_t energy = SumLanesS64(acc_v);

  // Derive the shift from whatever part of the lag-0 sum lies above bit 30.
  // NOTE(review): WebRtcSpl_NormU32 is declared elsewhere; presumably it
  // returns the number of leading zero bits, making |shift| the bit count of
  // |high_bits| - confirm against its definition.
  const uint32_t high_bits = (uint32_t)(energy >> 31);
  int16_t shift = 0;
  if (high_bits != 0) {
    shift = 32 - WebRtcSpl_NormU32(high_bits);
  }
  r[0] = (int32_t)(energy >> shift);

  // Lags 1..order, all scaled with the shift chosen for lag 0.
  for (int lag = 1; lag <= order; ++lag) {
    const int pairs = n - lag;   // Number of products at this lag.
    const int leftover = pairs % 8;  // Products not covered by 8-wide steps.
    const int16_t* a = x;
    const int16_t* b = x + lag;
    const int16_t* const wide_end = x + pairs - leftover;
    const int16_t* const all_end = x + pairs;

    // Eight products per iteration.
    acc_v = vdupq_n_s64(0);
    for (; a < wide_end; a += 8, b += 8) {
      const int16x8_t a_v = vld1q_s16(a);
      const int16x8_t b_v = vld1q_s16(b);
      const int32x4_t lo_v = vmull_s16(vget_low_s16(a_v), vget_low_s16(b_v));
#ifdef WEBRTC_ARCH_ARM64
      const int32x4_t hi_v = vmull_high_s16(a_v, b_v);
#else
      const int32x4_t hi_v =
          vmull_s16(vget_high_s16(a_v), vget_high_s16(b_v));
#endif
      acc_v = vpadalq_s32(acc_v, lo_v);
      acc_v = vpadalq_s32(acc_v, hi_v);
    }

    // One more 4-wide step when at least four products are left.
    if (leftover >= 4) {
      acc_v = vpadalq_s32(acc_v, vmull_s16(vld1_s16(a), vld1_s16(b)));
      a += 4;
      b += 4;
    }

    // Fold the vector lanes, then finish the last (up to three) products in
    // scalar code.
    int64_t sum = SumLanesS64(acc_v);
    for (; a < all_end; ++a, ++b) {
      sum += *a * *b;
    }

    r[lag] = (int32_t)(sum >> shift);
  }

  *scale = shift;

  return order + 1;
}
114 
115