1 /***********************************************************************
2 Copyright (c) 2017 Google Inc., Jean-Marc Valin
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <arm_neon.h>
33 #ifdef OPUS_CHECK_ASM
34 # include <string.h>
35 #endif
36 #include "stack_alloc.h"
37 #include "main_FIX.h"
38 
calc_corr(const opus_int32 * const input_QS,opus_int64 * const corr_QC,const opus_int offset,const int32x4_t state_QS_s32x4)39 static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 )
40 {
41     int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ];
42     const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset );
43     corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 );
44     corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 );
45     t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) );
46     t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) );
47     corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC );
48     corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC );
49     vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] );
50     vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] );
51 }
52 
calc_state(const int32x4_t state_QS0_s32x4,const int32x4_t state_QS0_1_s32x4,const int32x4_t state_QS1_1_s32x4,const int32x4_t warping_Q16_s32x4)53 static OPUS_INLINE int32x4_t calc_state( const int32x4_t state_QS0_s32x4, const int32x4_t state_QS0_1_s32x4, const int32x4_t state_QS1_1_s32x4, const int32x4_t warping_Q16_s32x4 )
54 {
55     int32x4_t t_s32x4 = vsubq_s32( state_QS0_s32x4, state_QS0_1_s32x4 );
56     t_s32x4 = vqdmulhq_s32( t_s32x4, warping_Q16_s32x4 );
57     return vaddq_s32( state_QS1_1_s32x4, t_s32x4 );
58 }
59 
silk_warped_autocorrelation_FIX_neon(opus_int32 * corr,opus_int * scale,const opus_int16 * input,const opus_int warping_Q16,const opus_int length,const opus_int order)60 void silk_warped_autocorrelation_FIX_neon(
61           opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
62           opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
63     const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
64     const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
65     const opus_int                  length,                                 /* I    Length of input                                                             */
66     const opus_int                  order                                   /* I    Correlation order (even)                                                    */
67 )
68 {
69     if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
70         silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
71     } else {
72         opus_int       n, i, lsh;
73         opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
74         opus_int64     corr_QC_orderT;
75         int64x2_t      lsh_s64x2;
76         const opus_int orderT = ( order + 3 ) & ~3;
77         opus_int64     *corr_QCT;
78         opus_int32     *input_QS;
79         VARDECL( opus_int32, input_QST );
80         VARDECL( opus_int32, state );
81         SAVE_STACK;
82 
83         /* Order must be even */
84         silk_assert( ( order & 1 ) == 0 );
85         silk_assert( 2 * QS - QC >= 0 );
86 
87         ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );
88 
89         input_QS = input_QST;
90         /* input_QS has zero paddings in the beginning and end. */
91         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
92         input_QS += 4;
93         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
94         input_QS += 4;
95         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
96         input_QS += 4;
97         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
98         input_QS += 4;
99         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
100         input_QS += 4;
101         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
102         input_QS += 4;
103 
104         /* Loop over samples */
105         for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
106             const int16x8_t t0_s16x4 = vld1q_s16( input + n );
107             vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
108             vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
109         }
110         for( ; n < length; n++, input_QS++ ) {
111             input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
112         }
113         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
114         input_QS += 4;
115         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
116         input_QS += 4;
117         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
118         input_QS += 4;
119         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
120         input_QS += 4;
121         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
122         input_QS += 4;
123         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
124         input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;
125 
126         /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
127         /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues.       */
128         /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */
129 
130         /* Keep the C code here to help understand the intrinsics optimization. */
131         /*
132         {
133             opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
134             opus_int32 *state_QST[ 3 ];
135             state_QST[ 0 ] = state_QS[ 0 ];
136             state_QST[ 1 ] = state_QS[ 1 ];
137             for( n = 0; n < length + order; n++, input_QS++ ) {
138                 state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
139                 for( i = 0; i < orderT; i++ ) {
140                     corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
141                     state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
142                 }
143                 state_QST[ 2 ] = state_QST[ 0 ];
144                 state_QST[ 0 ] = state_QST[ 1 ];
145                 state_QST[ 1 ] = state_QST[ 2 ];
146             }
147         }
148         */
149 
150         {
151             const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
152             const opus_int32 *in = input_QS + orderT;
153             opus_int o = orderT;
154             int32x4_t state_QS_s32x4[ 3 ][ 2 ];
155 
156             ALLOC( state, length + orderT, opus_int32 );
157             state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );
158 
159             /* Calculate 8 taps of all inputs in each loop. */
160             do {
161                 state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
162                 state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
163                 n = 0;
164                 do {
165                     calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
166                     calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
167                     state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
168                     vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
169                     state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
170                     state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
171                     state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
172                     state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
173                     state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
174                     state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
175                 } while( ++n < ( length + order - 3) );
176                 in = state;
177                 o -= 8;
178             } while( o > 4 );
179 
180             if( o ) {
181                 /* Calculate the last 4 taps of all inputs. */
182                 opus_int32 *stateT = state;
183                 silk_assert( o == 4 );
184                 state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
185                 n = length + order;
186                 do {
187                     calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
188                     state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
189                     vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
190                     state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
191                     state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
192                     state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
193                     input_QS++;
194                     stateT++;
195                 } while( --n );
196             }
197         }
198 
199         {
200             const opus_int16 *inputT = input;
201             int32x4_t t_s32x4;
202             int64x1_t t_s64x1;
203             int64x2_t t_s64x2 = vdupq_n_s64( 0 );
204             for( n = 0; n <= length - 8; n += 8 ) {
205                 int16x8_t input_s16x8 = vld1q_s16( inputT );
206                 t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
207                 t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
208                 t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
209                 t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
210                 inputT += 8;
211             }
212             t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
213             corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
214             for( ; n < length; n++ ) {
215                 corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
216             }
217             corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
218             corr_QC[ orderT ] = corr_QC_orderT;
219         }
220 
221         corr_QCT = corr_QC + orderT - order;
222         lsh = silk_CLZ64( corr_QC_orderT ) - 35;
223         lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
224         *scale = -( QC + lsh );
225         silk_assert( *scale >= -30 && *scale <= 12 );
226         lsh_s64x2 = vdupq_n_s64( lsh );
227         for( i = 0; i <= order - 3; i += 4 ) {
228             int32x4_t corr_s32x4;
229             int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
230             corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
231             corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
232             corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
233             corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
234             corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
235             corr_s32x4     = vrev64q_s32( corr_s32x4 );
236             vst1q_s32( corr + order - i - 3, corr_s32x4 );
237         }
238         if( lsh >= 0 ) {
239             for( ; i < order + 1; i++ ) {
240                 corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
241             }
242         } else {
243             for( ; i < order + 1; i++ ) {
244                 corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
245             }
246         }
247         silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/
248         RESTORE_STACK;
249     }
250 
251 #ifdef OPUS_CHECK_ASM
252     {
253         opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
254         opus_int   scale_c;
255         silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
256         silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
257         silk_assert( scale_c == *scale );
258     }
259 #endif
260 }
261