/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
 * WebRtcIsacfix_MatrixProduct2Neon() for the ARM Neon platform. APIs are in
 * entropy_coding.c. Results are bit-exact with the C code for
 * generic platforms.
 */

#include "entropy_coding.h"

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>

#include "signal_processing_library.h"

void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 = vdupq_n_s32(0);
  int32x2_t sum_32x2 = vdup_n_s32(0);

  assert(inner_loop_count % 2 == 0);
  assert(mid_loop_count % 2 == 0);

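  // The NEON paths below all use the same fixed-point trick: a 16-bit
  // matrix0 element is shifted left by 15 and multiplied with a 32-bit
  // matrix1 element using the saturating doubling multiply-high
  // instruction (vqdmulh), which yields (matrix0[i] * matrix1[j]) >> 16.
  // This matches WEBRTC_SPL_MUL_16_32_RSFT16 used in the scalar fallback
  // at the end of this function. Each branch chooses a SIMD layout based
  // on how the indices advance.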
  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
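    // matrix1 is indexed by the mid-loop counter with a unit factor, so
    // four (or two) consecutive output elements are computed at once: each
    // matrix0 element is broadcast and multiplied with a vector of
    // consecutive matrix1 elements.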
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = k;
        k += 2;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
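    // matrix0 is indexed by the mid-loop counter with a unit factor, so
    // the roles are swapped: each matrix1 element is broadcast and
    // multiplied with four (or two) consecutive matrix0 elements widened
    // from 16 to 32 bits.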
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
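    // Both matrices are read contiguously in the inner loop, so the inner
    // loop itself is vectorized: four products are accumulated per
    // iteration and the partial sums are folded into a single output with
    // vqadd/vpadd.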
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
  else {
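    // Generic scalar fallback for index patterns that do not match any of
    // the vectorized cases above.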
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
              matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}

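// Computes two output elements per subframe: each 16-bit matrix0 element is
// broadcast and multiplied with a pair of consecutive matrix1 elements using
// the same << 15 + vqdmulh combination as above, the products are accumulated
// with saturation, and the pair of sums is shifted right by 3 before storing.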
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int j = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int32x2_t sum_32x2 = vdup_n_s32(0);
  for (j = 0; j < SUBFRAMES; j++) {
    sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
    matrix1_index = 0;
    matrix0_index = matrix0_index_factor * j;
    for (n = SUBFRAMES; n > 0; n--) {
      int32x2_t matrix0_32x2 =
          vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
      int32x2_t matrix1_32x2 = vld1_s32(&matrix1[matrix1_index]);
      int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
      sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
      matrix1_index += 2;
      matrix0_index += matrix0_index_step;
    }
    sum_32x2 = vshr_n_s32(sum_32x2, 3);
    vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
    matrix_prod_index += 2;
  }
}