/*
 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * filters_neon.c
 *
 * This file contains the function WebRtcIsacfix_AutocorrNeon, optimized for
 * the ARM Neon platform.
 *
 */

#include <arm_neon.h>
#include <assert.h>

#include "codec.h"

// Autocorrelation function in fixed point.
// NOTE! Different from the SPLIB version in how it scales the signal.
int WebRtcIsacfix_AutocorrNeon(
    WebRtc_Word32* __restrict r,
    const WebRtc_Word16* __restrict x,
    WebRtc_Word16 N,
    WebRtc_Word16 order,
    WebRtc_Word16* __restrict scale) {

  // The first for-loop below assumes N % 4 == 0.
  assert(N % 4 == 0);

  int i = 0;
  int zeros_low = 0;
  int zeros_high = 0;
  int16_t scaling = 0;
  int32_t sum = 0;

  // Step 1, calculate r[0] and how much scaling is needed.

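  /* Roughly equivalent plain C for this step, as a reference sketch (the
   * scaling rule mirrors the Neon code below: shift right just enough for
   * the energy sum to fit in a positive 32-bit word):
   *
   *   int64_t prod = 0;
   *   for (i = 0; i < N; i++) {
   *     prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
   *   }
   *   scaling = <bits needed for prod to fit in a positive 32-bit word>;
   *   r[0] = (int32_t)(prod >> scaling);
   */
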
  int16x4_t reg16x4;
  int64x1_t reg64x1a;
  int64x1_t reg64x1b;
  int32x4_t reg32x4;
  int64x2_t reg64x2 = vdupq_n_s64(0);  // zeros

  // Loop over the samples and do:
  // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
  for (i = 0; i < N; i += 4) {
    reg16x4 = vld1_s16(&x[i]);
    reg32x4 = vmull_s16(reg16x4, reg16x4);
    reg64x2 = vpadalq_s32(reg64x2, reg32x4);
  }
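  // Add the two 64-bit lanes of the accumulator to get the total energy sum.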
  reg64x1a = vget_low_s64(reg64x2);
  reg64x1b = vget_high_s64(reg64x2);
  reg64x1a = vadd_s64(reg64x1a, reg64x1b);

  // Calculate the value of shifting (scaling).
  __asm__ __volatile__(
    "vmov %[z_l], %[z_h], %P[reg]\n\t"
    "clz %[z_l], %[z_l]\n\t"
    "clz %[z_h], %[z_h]\n\t"
    :[z_l]"+r"(zeros_low),
     [z_h]"+r"(zeros_high)
    :[reg]"w"(reg64x1a)
  );
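  // zeros_high and zeros_low now hold the leading-zero counts of the high
  // and low words of the 64-bit energy sum; derive from them how far the
  // sum must be shifted right to fit in a positive 32-bit word.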
  if (zeros_high != 32) {
    scaling = (32 - zeros_high + 1);
  } else if (zeros_low == 0) {
    scaling = 1;
  }
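  // Note: vshl with a negative shift count performs an arithmetic shift to
  // the right by |scaling| bits.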
  reg64x1b = vdup_n_s64(-scaling);
  reg64x1a = vshl_s64(reg64x1a, reg64x1b);

  // Record the result.
  r[0] = (int32_t)vget_lane_s64(reg64x1a, 0);


  // Step 2, perform the actual correlation calculation.

  /* Original C code (for the rest of the function):
  for (i = 1; i < order + 1; i++) {
    prod = 0;
    for (j = 0; j < N - i; j++) {
      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
    }
    sum = (int32_t)(prod >> scaling);
    r[i] = sum;
  }
  */

  for (i = 1; i < order + 1; i++) {
    int32_t prod_lower = 0;
    int32_t prod_upper = 0;
    const int16_t* ptr0 = &x[0];
    const int16_t* ptr1 = &x[i];
    int32_t tmp = 0;

    // Initialize the sum (q9) to zero.
    __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9");

    // Calculate the major block of the samples (a multiple of 8).
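    // Each iteration loads eight 16-bit samples from both pointers
    // (advancing the pointers), multiplies them element-wise into 32-bit
    // products, and accumulates pairs of products into the two 64-bit
    // lanes of q9.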
    for (; ptr0 < &x[N - i - 7];) {
      __asm__ __volatile__(
        "vld1.16 {d20, d21}, [%[ptr0]]!\n\t"
        "vld1.16 {d22, d23}, [%[ptr1]]!\n\t"
        "vmull.s16 q12, d20, d22\n\t"
        "vmull.s16 q13, d21, d23\n\t"
        "vpadal.s32 q9, q12\n\t"
        "vpadal.s32 q9, q13\n\t"

        // Specify constraints.
        :[ptr0]"+r"(ptr0),
         [ptr1]"+r"(ptr1)
        :
        :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27"
      );
    }

    // Calculate the rest of the samples.
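    // smulbb forms one 32-bit product per remaining sample pair; adds/adc
    // accumulate it, sign-extended, into the 64-bit value held in
    // prod_upper:prod_lower.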
    for (; ptr0 < &x[N - i]; ptr0++, ptr1++) {
      __asm__ __volatile__(
        "smulbb %[tmp], %[ptr0], %[ptr1]\n\t"
        "adds %[prod_lower], %[prod_lower], %[tmp]\n\t"
        "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t"

        // Specify constraints.
        :[prod_lower]"+r"(prod_lower),
         [prod_upper]"+r"(prod_upper),
         [tmp]"+r"(tmp)
        :[ptr0]"r"(*ptr0),
         [ptr1]"r"(*ptr1)
      );
    }

    // Sum the results up, and do shift.
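    // Add the two 64-bit lanes of q9 and the scalar prod_upper:prod_lower
    // sum, shift the total right by 'scaling' bits (vshl with a negative
    // count), and extract the low 32 bits as the result.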
    __asm__ __volatile__(
      "vadd.i64 d18, d19\n\t"
      "vmov.32 d17[0], %[prod_lower]\n\t"
      "vmov.32 d17[1], %[prod_upper]\n\t"
      "vadd.i64 d17, d18\n\t"
      "mov %[tmp], %[scaling], asr #31\n\t"
      "vmov.32 d16, %[scaling], %[tmp]\n\t"
      "vshl.s64 d17, d16\n\t"
      "vmov.32 %[sum], d17[0]\n\t"

      // Specify constraints.
      :[sum]"=r"(sum),
       [tmp]"+r"(tmp)
      :[prod_upper]"r"(prod_upper),
       [prod_lower]"r"(prod_lower),
       [scaling]"r"(-scaling)
      :"d16", "d17", "d18", "d19"
    );

    // Record the result.
    r[i] = sum;
  }

  // Record the result.
  *scale = scaling;

  return(order + 1);
}
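
// Example usage (illustrative sketch; the frame length and LPC order below
// are hypothetical values, not taken from this file):
//
//   WebRtc_Word16 frame[240];   // Input samples; the length must satisfy N % 4 == 0.
//   WebRtc_Word32 corr[13];     // Room for order + 1 autocorrelation lags.
//   WebRtc_Word16 scale = 0;
//   int lags = WebRtcIsacfix_AutocorrNeon(corr, frame, 240, 12, &scale);
//   // corr[0..lags - 1] now hold the autocorrelation values, each shifted
//   // right by 'scale' bits; the return value equals order + 1.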