1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/vpx_dsp_common.h"
16 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
17
// DC-only 4x4 forward DCT: writes output[0] = 2 * (sum of the 16 input
// samples); no other coefficient is touched.  Rows are read with 64-bit
// (unaligned-tolerant) loads.
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i rows03, rows12, pair, lo, hi, acc;

  // Pack rows 0|3 into one register and rows 1|2 into another
  // (pairing order is irrelevant to the final sum).
  rows03 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  rows03 = _mm_unpacklo_epi64(
      rows03, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
  rows12 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  rows12 = _mm_unpacklo_epi64(
      rows12, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));

  pair = _mm_add_epi16(rows03, rows12);

  // Sign-extend the eight 16-bit partial sums to 32 bits: move each value
  // into the high half of a 32-bit lane, then arithmetic-shift it back down.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, pair), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, pair), 16);

  // Horizontal reduction: 8 lanes -> 4 -> 2 -> 1.
  acc = _mm_add_epi32(lo, hi);
  lo = _mm_unpacklo_epi32(acc, zero);
  hi = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo, hi);
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  acc = _mm_slli_epi32(acc, 1);  // DC scale for the 4x4 transform: sum * 2.
  output[0] = (tran_low_t)_mm_cvtsi128_si32(acc);
}
46
// DC-only 8x8 forward DCT: writes output[0] = sum of the 64 input samples
// (no extra scaling for this size); no other coefficient is touched.
// Rows are read with _mm_load_si128, so each row must be 16-byte aligned.
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i total, lo, hi;
  int row;

  // Accumulate all eight rows into eight 16-bit lane sums.  epi16 addition
  // wraps identically regardless of order, so a loop matches the unrolled
  // form bit-for-bit.
  total = _mm_load_si128((const __m128i *)(input + 0 * stride));
  for (row = 1; row < 8; ++row) {
    total = _mm_add_epi16(total,
                          _mm_load_si128((const __m128i *)(input + row * stride)));
  }

  // Sign-extend the lane sums to 32 bits (value in high half, then
  // arithmetic shift right by 16).
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, total), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, total), 16);

  // Horizontal reduction: 8 lanes -> 4 -> 2 -> 1.
  total = _mm_add_epi32(lo, hi);
  lo = _mm_unpacklo_epi32(total, zero);
  hi = _mm_unpackhi_epi32(total, zero);
  total = _mm_add_epi32(lo, hi);
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));

  output[0] = (tran_low_t)_mm_cvtsi128_si32(total);
}
86
// DC-only 16x16 forward DCT: writes output[0] = (sum of the 256 input
// samples) >> 1; no other coefficient is touched.  Rows are read with
// _mm_load_si128, so each 8-sample chunk must be 16-byte aligned.
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  // Accumulate in eight 16-bit lanes, 8 rows per iteration (loads are
  // interleaved with adds to hide latency).  Each lane sums 32 int16
  // values; NOTE(review): relies on the transform input being small enough
  // that the 16-bit lanes never wrap -- confirm the caller's input range.
  for (i = 0; i < 2; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
    input += 8 * stride;
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits: place each value in
  // the high half of a 32-bit lane, then arithmetic-shift right by 16.
  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Horizontal reduction: 8 partial sums -> 4 -> 2 -> 1.
  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 1);  // DC scale for the 16x16 transform: sum / 2.
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
155
// DC-only 32x32 forward DCT: writes output[0] = (sum of the 1024 input
// samples) >> 3; no other coefficient is touched.  Rows are read with
// _mm_load_si128, so each 8-sample chunk must be 16-byte aligned.
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  // Accumulate in eight 16-bit lanes, 4 rows (4 x 32 samples) per
  // iteration.  Each lane ends up summing 128 int16 values;
  // NOTE(review): relies on the transform input being small enough that
  // the 16-bit lanes never wrap -- confirm the caller's input range.
  for (i = 0; i < 8; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits: place each value in
  // the high half of a 32-bit lane, then arithmetic-shift right by 16.
  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Horizontal reduction: 8 partial sums -> 4 -> 2 -> 1.
  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 3);  // DC scale for the 32x32 transform: sum / 8.
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
227
// Instantiate the full 2-D forward transforms from the shared implementation
// templates.  Each #include below expands the template once, with the macros
// selecting the emitted function name and (presumably) the precision-handling
// code paths inside the template headers -- see fwd_txfm_impl_sse2.h and
// fwd_dct32x32_impl_sse2.h for the gated details.

// Low-bit-depth instantiations: vpx_fdct{4x4,8x8,16x16}_sse2.
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

// The 32x32 template is expanded twice: once with reduced precision
// (the "_rd" rate-distortion variant) ...
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

// ... and once with full precision.
#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

#if CONFIG_VP9_HIGHBITDEPTH
// High-bit-depth instantiations of the same templates, producing the
// vpx_highbd_* entry points.
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH
273