1 /* Copyright (c) 2014, Cisco Systems, INC
2    Written by XiangMingZhu WeiZhou MinPeng YanWang
3 
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
34 
35 #include "macros.h"
36 #include "celt_lpc.h"
37 #include "stack_alloc.h"
38 #include "mathops.h"
39 #include "pitch.h"
40 
41 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
42 #include <smmintrin.h>
43 #include "x86cpu.h"
44 
celt_inner_prod_sse4_1(const opus_val16 * x,const opus_val16 * y,int N)45 opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
46       int N)
47 {
48     opus_int  i, dataSize16;
49     opus_int32 sum;
50     __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
51     __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
52     __m128i inVec1_3210, inVec2_3210;
53 
54     sum = 0;
55     dataSize16 = N & ~15;
56 
57     acc1 = _mm_setzero_si128();
58     acc2 = _mm_setzero_si128();
59 
60     for (i=0;i<dataSize16;i+=16) {
61         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
62         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
63 
64         inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
65         inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
66 
67         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
68         inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
69 
70         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
71         acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
72     }
73 
74     acc1 = _mm_add_epi32(acc1, acc2);
75 
76     if (N - i >= 8)
77     {
78         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
79         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
80 
81         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
82 
83         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
84         i += 8;
85     }
86 
87     if (N - i >= 4)
88     {
89         inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
90         inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
91 
92         inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
93 
94         acc1 = _mm_add_epi32(acc1, inVec1_3210);
95         i += 4;
96     }
97 
98     acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
99     acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
100 
101     sum += _mm_cvtsi128_si32(acc1);
102 
103     for (;i<N;i++)
104     {
105         sum = silk_SMLABB(sum, x[i], y[i]);
106     }
107 
108     return sum;
109 }
110 
/* Accumulates four consecutive cross-correlation lags of x against y:
 *
 *    sum[k] += x[0]*y[k] + x[1]*y[k+1] + ... + x[len-1]*y[len-1+k],  k = 0..3
 *
 * x   : input, exactly len samples are read (x[0] .. x[len-1]).
 * y   : input, len+3 samples are read (y[0] .. y[len+2]).
 * sum : four 32-bit accumulators, read and updated in place.
 * len : number of samples of x to process; asserted to be >= 3.
 *
 * The work is split into blocks of 8 samples, then blocks of 4, then a
 * per-sample tail.
 */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Blocks of 8: sumK collects partial sums for lag K, spread over four
       32-bit lanes. The y loads are offset by 0..3 samples, one per lag. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((const __m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((const __m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((const __m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((const __m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((const __m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Horizontal reduction: fold each accumulator's four lanes into lane 0. */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    /* Pack the four per-lag totals as { lag0, lag1, lag2, lag3 }. From here
       on, lane K of vecSum is the running total for lag K. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* Blocks of 4: each x sample is broadcast to all lanes and multiplied by
       the four y samples starting at the same index. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Tail of up to 3 samples. Only x[j] is needed here, so it is broadcast
       from a scalar load. A four-sample vector load starting at &x[j] would
       read up to three samples past x[len-1]. The y load stays within
       y[0..len+2], which this kernel already reads. */
    for (;j<len;j++)
    {
        vecX0 = _mm_set1_epi32(x[j]);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    /* Add the computed correlations onto the caller-provided accumulators. */
    initSum = _mm_loadu_si128((const __m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);
}
195 #endif
196