1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/vpx_dsp_common.h"
16 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
17 
// DC-only 4x4 forward transform: output[0] = 2 * (sum of the 16 input
// samples). Only the DC coefficient is produced; output[1..15] are untouched.
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  __m128i lo, hi, acc;

  // Pack the four 4-sample rows into two full registers and add them,
  // giving eight 16-bit partial sums.
  acc = _mm_add_epi16(_mm_unpacklo_epi64(row0, row3),
                      _mm_unpacklo_epi64(row1, row2));

  // Sign-extend the eight 16-bit lanes to 32 bits (unpack into the high
  // half-word, then arithmetic-shift right by 16) and fold to four lanes.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  acc = _mm_add_epi32(_mm_unpacklo_epi32(acc, zero),
                      _mm_unpackhi_epi32(acc, zero));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  // The DC coefficient of the 4x4 fDCT is twice the pixel sum.
  output[0] = (tran_low_t)_mm_cvtsi128_si32(_mm_slli_epi32(acc, 1));
}
46 
// DC-only 8x8 forward transform: output[0] = sum of the 64 input samples.
// Only the DC coefficient is produced; output[1..63] are untouched.
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  __m128i lo, hi;
  int r;

  // Accumulate all eight rows in 16-bit lanes. Wrap-around 16-bit addition
  // is associative and commutative, so the summation order is irrelevant.
  for (r = 0; r < 8; ++r) {
    acc = _mm_add_epi16(acc,
                        _mm_load_si128((const __m128i *)(input + r * stride)));
  }

  // Sign-extend the eight 16-bit lanes to 32 bits and fold to four lanes.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  acc = _mm_add_epi32(_mm_unpacklo_epi32(acc, zero),
                      _mm_unpackhi_epi32(acc, zero));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  output[0] = (tran_low_t)_mm_cvtsi128_si32(acc);
}
86 
// DC-only 16x16 forward transform: output[0] = (sum of the 256 input
// samples) >> 1. Only the DC coefficient is produced.
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  __m128i lo, hi;
  int r;

  // Each row is 16 samples = two 128-bit loads. Accumulate all 16 rows in
  // 16-bit lanes; wrap-around addition makes the order irrelevant, and the
  // sign-extension below recovers the correct 32-bit per-lane totals
  // exactly as the unrolled original does.
  for (r = 0; r < 16; ++r) {
    lo = _mm_load_si128((const __m128i *)(input + 0));
    hi = _mm_load_si128((const __m128i *)(input + 8));
    acc = _mm_add_epi16(acc, _mm_add_epi16(lo, hi));
    input += stride;
  }

  // Sign-extend the eight 16-bit lanes to 32 bits and fold to four lanes.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  acc = _mm_add_epi32(_mm_unpacklo_epi32(acc, zero),
                      _mm_unpackhi_epi32(acc, zero));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  // Arithmetic shift: the 16x16 DC coefficient is the pixel sum halved.
  acc = _mm_srai_epi32(acc, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(acc);
}
155 
// DC-only 32x32 forward transform: output[0] = (sum of the 1024 input
// samples) >> 3. Only the DC coefficient is produced.
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  __m128i lo, hi;
  int r;

  // Each row is 32 samples = four 128-bit loads. Accumulate all 32 rows in
  // 16-bit lanes; 16-bit wrap-around addition is order-independent, so this
  // single loop matches the original 8x4 unrolled accumulation exactly.
  for (r = 0; r < 32; ++r) {
    const __m128i c0 = _mm_load_si128((const __m128i *)(input + 0));
    const __m128i c1 = _mm_load_si128((const __m128i *)(input + 8));
    const __m128i c2 = _mm_load_si128((const __m128i *)(input + 16));
    const __m128i c3 = _mm_load_si128((const __m128i *)(input + 24));
    acc = _mm_add_epi16(acc, _mm_add_epi16(_mm_add_epi16(c0, c1),
                                           _mm_add_epi16(c2, c3)));
    input += stride;
  }

  // Sign-extend the eight 16-bit lanes to 32 bits and fold to four lanes.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  acc = _mm_add_epi32(_mm_unpacklo_epi32(acc, zero),
                      _mm_unpackhi_epi32(acc, zero));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  // Arithmetic shift: the 32x32 DC coefficient is the pixel sum / 8.
  acc = _mm_srai_epi32(acc, 3);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(acc);
}
227 
228 #define DCT_HIGH_BIT_DEPTH 0
229 #define FDCT4x4_2D vpx_fdct4x4_sse2
230 #define FDCT8x8_2D vpx_fdct8x8_sse2
231 #define FDCT16x16_2D vpx_fdct16x16_sse2
232 #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
233 #undef FDCT4x4_2D
234 #undef FDCT8x8_2D
235 #undef FDCT16x16_2D
236 
237 #define FDCT32x32_2D vpx_fdct32x32_rd_sse2
238 #define FDCT32x32_HIGH_PRECISION 0
239 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
240 #undef FDCT32x32_2D
241 #undef FDCT32x32_HIGH_PRECISION
242 
243 #define FDCT32x32_2D vpx_fdct32x32_sse2
244 #define FDCT32x32_HIGH_PRECISION 1
245 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
246 #undef FDCT32x32_2D
247 #undef FDCT32x32_HIGH_PRECISION
248 #undef DCT_HIGH_BIT_DEPTH
249 
250 #if CONFIG_VP9_HIGHBITDEPTH
251 #define DCT_HIGH_BIT_DEPTH 1
252 #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
253 #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
254 #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
255 #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
256 #undef FDCT4x4_2D
257 #undef FDCT8x8_2D
258 #undef FDCT16x16_2D
259 
260 #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
261 #define FDCT32x32_HIGH_PRECISION 0
262 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
263 #undef FDCT32x32_2D
264 #undef FDCT32x32_HIGH_PRECISION
265 
266 #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
267 #define FDCT32x32_HIGH_PRECISION 1
268 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
269 #undef FDCT32x32_2D
270 #undef FDCT32x32_HIGH_PRECISION
271 #undef DCT_HIGH_BIT_DEPTH
272 #endif  // CONFIG_VP9_HIGHBITDEPTH
273