1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/txfm_common_sse2.h"
14
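// RECON_AND_STORE4X4 adds one row of four 16-bit residuals (in_x) to the
// corresponding destination pixels and stores the clipped result; the
// unsigned saturation in _mm_packus_epi16 provides the clamp to [0, 255].
// Rough scalar equivalent (a sketch, not code from this file):
//   for (j = 0; j < 4; ++j) dest[j] = clip_pixel(dest[j] + residual[j]);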
15 #define RECON_AND_STORE4X4(dest, in_x) \
16 { \
17 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
18 d0 = _mm_unpacklo_epi8(d0, zero); \
19 d0 = _mm_add_epi16(in_x, d0); \
20 d0 = _mm_packus_epi16(d0, d0); \
21 *(int *)(dest) = _mm_cvtsi128_si32(d0); \
22 }
23
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
26 const __m128i zero = _mm_setzero_si128();
27 const __m128i eight = _mm_set1_epi16(8);
28 const __m128i cst = _mm_setr_epi16(
29 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
30 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
31 (int16_t)cospi_8_64, (int16_t)cospi_24_64);
32 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
33 __m128i input0, input1, input2, input3;
34
35 // Rows
36 input0 = load_input_data(input);
37 input2 = load_input_data(input + 8);
38
39 // Construct i3, i1, i3, i1, i2, i0, i2, i0
40 input0 = _mm_shufflelo_epi16(input0, 0xd8);
41 input0 = _mm_shufflehi_epi16(input0, 0xd8);
42 input2 = _mm_shufflelo_epi16(input2, 0xd8);
43 input2 = _mm_shufflehi_epi16(input2, 0xd8);
44
45 input1 = _mm_unpackhi_epi32(input0, input0);
46 input0 = _mm_unpacklo_epi32(input0, input0);
47 input3 = _mm_unpackhi_epi32(input2, input2);
48 input2 = _mm_unpacklo_epi32(input2, input2);
49
50 // Stage 1
51 input0 = _mm_madd_epi16(input0, cst);
52 input1 = _mm_madd_epi16(input1, cst);
53 input2 = _mm_madd_epi16(input2, cst);
54 input3 = _mm_madd_epi16(input3, cst);
55
56 input0 = _mm_add_epi32(input0, rounding);
57 input1 = _mm_add_epi32(input1, rounding);
58 input2 = _mm_add_epi32(input2, rounding);
59 input3 = _mm_add_epi32(input3, rounding);
60
61 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
62 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
63 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
64 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
65
66 // Stage 2
67 input0 = _mm_packs_epi32(input0, input1);
68 input1 = _mm_packs_epi32(input2, input3);
69
70 // Transpose
71 input2 = _mm_unpacklo_epi16(input0, input1);
72 input3 = _mm_unpackhi_epi16(input0, input1);
73 input0 = _mm_unpacklo_epi32(input2, input3);
74 input1 = _mm_unpackhi_epi32(input2, input3);
75
  // Swap columns 2 and 3, and then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
78 input1 = _mm_shuffle_epi32(input1, 0x4e);
79 input2 = _mm_add_epi16(input0, input1);
80 input3 = _mm_sub_epi16(input0, input1);
81
82 // Columns
83 // Construct i3, i1, i3, i1, i2, i0, i2, i0
84 input0 = _mm_unpacklo_epi32(input2, input2);
85 input1 = _mm_unpackhi_epi32(input2, input2);
86 input2 = _mm_unpackhi_epi32(input3, input3);
87 input3 = _mm_unpacklo_epi32(input3, input3);
88
89 // Stage 1
90 input0 = _mm_madd_epi16(input0, cst);
91 input1 = _mm_madd_epi16(input1, cst);
92 input2 = _mm_madd_epi16(input2, cst);
93 input3 = _mm_madd_epi16(input3, cst);
94
95 input0 = _mm_add_epi32(input0, rounding);
96 input1 = _mm_add_epi32(input1, rounding);
97 input2 = _mm_add_epi32(input2, rounding);
98 input3 = _mm_add_epi32(input3, rounding);
99
100 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
101 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
102 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
103 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
104
105 // Stage 2
106 input0 = _mm_packs_epi32(input0, input2);
107 input1 = _mm_packs_epi32(input1, input3);
108
109 // Transpose
110 input2 = _mm_unpacklo_epi16(input0, input1);
111 input3 = _mm_unpackhi_epi16(input0, input1);
112 input0 = _mm_unpacklo_epi32(input2, input3);
113 input1 = _mm_unpackhi_epi32(input2, input3);
114
  // Swap columns 2 and 3, and then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
117 input1 = _mm_shuffle_epi32(input1, 0x4e);
118 input2 = _mm_add_epi16(input0, input1);
119 input3 = _mm_sub_epi16(input0, input1);
120
121 // Final round and shift
122 input2 = _mm_add_epi16(input2, eight);
123 input3 = _mm_add_epi16(input3, eight);
124
125 input2 = _mm_srai_epi16(input2, 4);
126 input3 = _mm_srai_epi16(input3, 4);
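  // Adding 8 and shifting right by 4 above is ROUND_POWER_OF_TWO(x, 4), the
  // final down-shift of the 4x4 inverse transform.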
127
128 // Reconstruction and Store
129 {
130 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
131 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
132 d0 = _mm_unpacklo_epi32(d0,
133 _mm_cvtsi32_si128(*(const int *)(dest + stride)));
134 d2 = _mm_unpacklo_epi32(
135 _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
136 d0 = _mm_unpacklo_epi8(d0, zero);
137 d2 = _mm_unpacklo_epi8(d2, zero);
138 d0 = _mm_add_epi16(d0, input2);
139 d2 = _mm_add_epi16(d2, input3);
140 d0 = _mm_packus_epi16(d0, d2);
141 // store input0
142 *(int *)dest = _mm_cvtsi128_si32(d0);
143 // store input1
144 d0 = _mm_srli_si128(d0, 4);
145 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
146 // store input2
147 d0 = _mm_srli_si128(d0, 4);
148 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
149 // store input3
150 d0 = _mm_srli_si128(d0, 4);
151 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
152 }
153 }
154
void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
157 __m128i dc_value;
158 const __m128i zero = _mm_setzero_si128();
159 int a;
160
161 a = dct_const_round_shift(input[0] * cospi_16_64);
162 a = dct_const_round_shift(a * cospi_16_64);
163 a = ROUND_POWER_OF_TWO(a, 4);
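  // The three assignments above form the DC-only path: the single nonzero
  // coefficient is scaled by cospi_16_64 once per 1-D pass (with
  // dct_const_round_shift() rounding and shifting by DCT_CONST_BITS), and
  // the final ROUND_POWER_OF_TWO(a, 4) matches the down-shift at the end of
  // the full 4x4 inverse transform.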
164
165 dc_value = _mm_set1_epi16(a);
166
167 RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
168 RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
169 RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
170 RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
171 }
172
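// Transposes a 4x4 block of 16-bit values held two rows per register: on
// input res[0] = rows 0-1 and res[1] = rows 2-3, on output res[0] =
// columns 0-1 and res[1] = columns 2-3.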
static INLINE void transpose_4x4(__m128i *res) {
174 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
175 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
176
177 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
178 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
179 }
180
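// One 4-point inverse DCT pass over a 4x4 block held two rows per register
// (in[0] = rows 0-1, in[1] = rows 2-3).  The block is transposed first, so
// calling this twice performs the full 2-D transform.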
void idct4_sse2(__m128i *in) {
182 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
183 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
184 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
185 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
186 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
187 __m128i u[8], v[8];
188
189 transpose_4x4(in);
190 // stage 1
191 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
192 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
193 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
194 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
195 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
196 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
197
198 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
199 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
200 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
202
203 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
204 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
206 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
207
208 u[0] = _mm_packs_epi32(v[0], v[1]);
209 u[1] = _mm_packs_epi32(v[3], v[2]);
210
211 // stage 2
212 in[0] = _mm_add_epi16(u[0], u[1]);
213 in[1] = _mm_sub_epi16(u[0], u[1]);
214 in[1] = _mm_shuffle_epi32(in[1], 0x4E);
215 }
216
void iadst4_sse2(__m128i *in) {
218 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
219 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
220 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
221 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
222 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
223 const __m128i kZero = _mm_set1_epi16(0);
224 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
225 __m128i u[8], v[8], in7;
226
227 transpose_4x4(in);
228 in7 = _mm_srli_si128(in[1], 8);
229 in7 = _mm_add_epi16(in7, in[0]);
230 in7 = _mm_sub_epi16(in7, in[1]);
231
232 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
233 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
234 u[2] = _mm_unpacklo_epi16(in7, kZero);
235 u[3] = _mm_unpackhi_epi16(in[0], kZero);
236
237 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
238 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
239 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
240 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
241 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
242 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
243
244 u[0] = _mm_add_epi32(v[0], v[1]);
245 u[1] = _mm_add_epi32(v[3], v[4]);
246 u[2] = v[2];
247 u[3] = _mm_add_epi32(u[0], u[1]);
248 u[4] = _mm_slli_epi32(v[5], 2);
249 u[5] = _mm_add_epi32(u[3], v[5]);
250 u[6] = _mm_sub_epi32(u[5], u[4]);
251
252 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
253 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
254 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
255 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
256
257 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
258 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
259 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
260 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
261
262 in[0] = _mm_packs_epi32(u[0], u[1]);
263 in[1] = _mm_packs_epi32(u[2], u[3]);
264 }
265
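// TRANSPOSE_8X8 is the usual three-level unpack transpose: 16-bit elements,
// then 32-bit pairs, then 64-bit halves, transposing eight rows of eight
// int16 values in 24 register-to-register unpack operations.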
266 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
267 out0, out1, out2, out3, out4, out5, out6, out7) \
268 { \
269 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
270 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
271 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
272 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
273 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
274 const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
275 const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
276 const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
277 \
278 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
279 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
280 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
281 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
282 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
283 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
284 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
285 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
286 \
287 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
288 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
289 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
290 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
291 out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
292 out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
293 out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
294 out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
295 }
296
297 #define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
298 out0, out1, out2, out3) \
299 { \
300 const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
301 const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
302 const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
303 const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
304 \
305 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
306 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
307 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
308 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
309 \
310 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
311 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
312 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
313 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
314 }
315
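// TRANSPOSE_8X8_10 transposes only the top-left 4x4 corner: the low four
// 16-bit values of in0..in3 come out as columns, two per output register
// (out0 = columns 0-1, out1 = columns 2-3).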
316 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
317 { \
318 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
319 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
320 out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
321 out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
322 }
323
// Macro for multiplying elements by constants and adding them together.
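// With the inputs interleaved as (x, y) 16-bit pairs, _mm_madd_epi16 against
// a pair_set_epi16(c0, c1) register computes x * c0 + y * c1 in each 32-bit
// lane, i.e. half of a butterfly rotation.  Adding `rounding` and shifting
// right by DCT_CONST_BITS is the vector form of dct_const_round_shift(), and
// _mm_packs_epi32 narrows the results back to 16 bits with signed
// saturation.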
325 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
326 cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
327 { \
328 tmp0 = _mm_madd_epi16(lo_0, cst0); \
329 tmp1 = _mm_madd_epi16(hi_0, cst0); \
330 tmp2 = _mm_madd_epi16(lo_0, cst1); \
331 tmp3 = _mm_madd_epi16(hi_0, cst1); \
332 tmp4 = _mm_madd_epi16(lo_1, cst2); \
333 tmp5 = _mm_madd_epi16(hi_1, cst2); \
334 tmp6 = _mm_madd_epi16(lo_1, cst3); \
335 tmp7 = _mm_madd_epi16(hi_1, cst3); \
336 \
337 tmp0 = _mm_add_epi32(tmp0, rounding); \
338 tmp1 = _mm_add_epi32(tmp1, rounding); \
339 tmp2 = _mm_add_epi32(tmp2, rounding); \
340 tmp3 = _mm_add_epi32(tmp3, rounding); \
341 tmp4 = _mm_add_epi32(tmp4, rounding); \
342 tmp5 = _mm_add_epi32(tmp5, rounding); \
343 tmp6 = _mm_add_epi32(tmp6, rounding); \
344 tmp7 = _mm_add_epi32(tmp7, rounding); \
345 \
346 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
347 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
348 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
349 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
350 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
351 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
352 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
353 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
354 \
355 res0 = _mm_packs_epi32(tmp0, tmp1); \
356 res1 = _mm_packs_epi32(tmp2, tmp3); \
357 res2 = _mm_packs_epi32(tmp4, tmp5); \
358 res3 = _mm_packs_epi32(tmp6, tmp7); \
359 }
360
361 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
362 { \
363 tmp0 = _mm_madd_epi16(lo_0, cst0); \
364 tmp1 = _mm_madd_epi16(hi_0, cst0); \
365 tmp2 = _mm_madd_epi16(lo_0, cst1); \
366 tmp3 = _mm_madd_epi16(hi_0, cst1); \
367 \
368 tmp0 = _mm_add_epi32(tmp0, rounding); \
369 tmp1 = _mm_add_epi32(tmp1, rounding); \
370 tmp2 = _mm_add_epi32(tmp2, rounding); \
371 tmp3 = _mm_add_epi32(tmp3, rounding); \
372 \
373 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
374 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
375 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
376 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
377 \
378 res0 = _mm_packs_epi32(tmp0, tmp1); \
379 res1 = _mm_packs_epi32(tmp2, tmp3); \
380 }
381
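// IDCT8 is the 4-stage 8-point inverse DCT butterfly network: stages 1 and 2
// rotate the odd (1,7 / 3,5) and even (0,4 / 2,6) input pairs, stage 3
// recombines them (including the cospi_16_64 rotation of the 5/6 terms), and
// stage 4 forms the outputs as sums and differences.  Saturating adds and
// subtracts are used so 16-bit overflow clamps instead of wrapping.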
382 #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
383 out0, out1, out2, out3, out4, out5, out6, out7) \
384 { \
385 /* Stage1 */ \
386 { \
387 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
388 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
389 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
390 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
391 \
392 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
393 stg1_1, stg1_2, stg1_3, stp1_4, \
394 stp1_7, stp1_5, stp1_6) \
395 } \
396 \
397 /* Stage2 */ \
398 { \
399 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
400 const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
401 const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
402 const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
403 \
404 MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
405 stg2_1, stg2_2, stg2_3, stp2_0, \
406 stp2_1, stp2_2, stp2_3) \
407 \
408 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
409 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
410 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
411 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
412 } \
413 \
414 /* Stage3 */ \
415 { \
416 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
417 const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
418 \
419 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
420 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
421 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
422 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
423 \
424 tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
425 tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
426 tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
427 tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
428 \
429 tmp0 = _mm_add_epi32(tmp0, rounding); \
430 tmp1 = _mm_add_epi32(tmp1, rounding); \
431 tmp2 = _mm_add_epi32(tmp2, rounding); \
432 tmp3 = _mm_add_epi32(tmp3, rounding); \
433 \
434 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
435 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
436 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
437 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
438 \
439 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
440 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
441 } \
442 \
443 /* Stage4 */ \
444 out0 = _mm_adds_epi16(stp1_0, stp2_7); \
445 out1 = _mm_adds_epi16(stp1_1, stp1_6); \
446 out2 = _mm_adds_epi16(stp1_2, stp1_5); \
447 out3 = _mm_adds_epi16(stp1_3, stp2_4); \
448 out4 = _mm_subs_epi16(stp1_3, stp2_4); \
449 out5 = _mm_subs_epi16(stp1_2, stp1_5); \
450 out6 = _mm_subs_epi16(stp1_1, stp1_6); \
451 out7 = _mm_subs_epi16(stp1_0, stp2_7); \
452 }
453
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
456 const __m128i zero = _mm_setzero_si128();
457 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
458 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
459 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
460 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
461 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
462 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
463 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
464 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
465 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
466 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
467
468 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
469 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
470 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
471 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
472 int i;
473
474 // Load input data.
475 in0 = load_input_data(input);
476 in1 = load_input_data(input + 8 * 1);
477 in2 = load_input_data(input + 8 * 2);
478 in3 = load_input_data(input + 8 * 3);
479 in4 = load_input_data(input + 8 * 4);
480 in5 = load_input_data(input + 8 * 5);
481 in6 = load_input_data(input + 8 * 6);
482 in7 = load_input_data(input + 8 * 7);
483
  // 2-D: two passes of 8x8 transpose followed by the 1-D IDCT8.
485 for (i = 0; i < 2; i++) {
486 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
487 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
488 in0, in1, in2, in3, in4, in5, in6, in7);
489
490 // 4-stage 1D idct8x8
491 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
492 in0, in1, in2, in3, in4, in5, in6, in7);
493 }
494
495 // Final rounding and shift
496 in0 = _mm_adds_epi16(in0, final_rounding);
497 in1 = _mm_adds_epi16(in1, final_rounding);
498 in2 = _mm_adds_epi16(in2, final_rounding);
499 in3 = _mm_adds_epi16(in3, final_rounding);
500 in4 = _mm_adds_epi16(in4, final_rounding);
501 in5 = _mm_adds_epi16(in5, final_rounding);
502 in6 = _mm_adds_epi16(in6, final_rounding);
503 in7 = _mm_adds_epi16(in7, final_rounding);
504
505 in0 = _mm_srai_epi16(in0, 5);
506 in1 = _mm_srai_epi16(in1, 5);
507 in2 = _mm_srai_epi16(in2, 5);
508 in3 = _mm_srai_epi16(in3, 5);
509 in4 = _mm_srai_epi16(in4, 5);
510 in5 = _mm_srai_epi16(in5, 5);
511 in6 = _mm_srai_epi16(in6, 5);
512 in7 = _mm_srai_epi16(in7, 5);
513
514 RECON_AND_STORE(dest + 0 * stride, in0);
515 RECON_AND_STORE(dest + 1 * stride, in1);
516 RECON_AND_STORE(dest + 2 * stride, in2);
517 RECON_AND_STORE(dest + 3 * stride, in3);
518 RECON_AND_STORE(dest + 4 * stride, in4);
519 RECON_AND_STORE(dest + 5 * stride, in5);
520 RECON_AND_STORE(dest + 6 * stride, in6);
521 RECON_AND_STORE(dest + 7 * stride, in7);
522 }
523
void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
526 __m128i dc_value;
527 const __m128i zero = _mm_setzero_si128();
528 int a;
529
530 a = dct_const_round_shift(input[0] * cospi_16_64);
531 a = dct_const_round_shift(a * cospi_16_64);
532 a = ROUND_POWER_OF_TWO(a, 5);
533
534 dc_value = _mm_set1_epi16(a);
535
536 RECON_AND_STORE(dest + 0 * stride, dc_value);
537 RECON_AND_STORE(dest + 1 * stride, dc_value);
538 RECON_AND_STORE(dest + 2 * stride, dc_value);
539 RECON_AND_STORE(dest + 3 * stride, dc_value);
540 RECON_AND_STORE(dest + 4 * stride, dc_value);
541 RECON_AND_STORE(dest + 5 * stride, dc_value);
542 RECON_AND_STORE(dest + 6 * stride, dc_value);
543 RECON_AND_STORE(dest + 7 * stride, dc_value);
544 }
545
void idct8_sse2(__m128i *in) {
547 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
548 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
549 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
550 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
551 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
552 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
553 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
554 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
555 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
556
557 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
558 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
559 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
560 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
561
562 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
563 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
564 in0, in1, in2, in3, in4, in5, in6, in7);
565
566 // 4-stage 1D idct8x8
567 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
568 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
569 }
570
void iadst8_sse2(__m128i *in) {
572 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
573 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
574 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
575 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
576 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
577 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
578 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
579 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
580 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
581 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
582 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
583 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
584 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
585 const __m128i k__const_0 = _mm_set1_epi16(0);
586 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
587
588 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
589 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
590 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
591 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
592 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
593
594 // transpose
595 array_transpose_8x8(in, in);
596
597 // properly aligned for butterfly input
598 in0 = in[7];
599 in1 = in[0];
600 in2 = in[5];
601 in3 = in[2];
602 in4 = in[3];
603 in5 = in[4];
604 in6 = in[1];
605 in7 = in[6];
606
607 // column transformation
608 // stage 1
609 // interleave and multiply/add into 32-bit integer
610 s0 = _mm_unpacklo_epi16(in0, in1);
611 s1 = _mm_unpackhi_epi16(in0, in1);
612 s2 = _mm_unpacklo_epi16(in2, in3);
613 s3 = _mm_unpackhi_epi16(in2, in3);
614 s4 = _mm_unpacklo_epi16(in4, in5);
615 s5 = _mm_unpackhi_epi16(in4, in5);
616 s6 = _mm_unpacklo_epi16(in6, in7);
617 s7 = _mm_unpackhi_epi16(in6, in7);
618
619 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
620 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
621 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
622 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
623 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
624 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
625 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
626 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
627 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
628 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
629 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
630 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
631 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
632 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
633 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
634 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
635
636 // addition
637 w0 = _mm_add_epi32(u0, u8);
638 w1 = _mm_add_epi32(u1, u9);
639 w2 = _mm_add_epi32(u2, u10);
640 w3 = _mm_add_epi32(u3, u11);
641 w4 = _mm_add_epi32(u4, u12);
642 w5 = _mm_add_epi32(u5, u13);
643 w6 = _mm_add_epi32(u6, u14);
644 w7 = _mm_add_epi32(u7, u15);
645 w8 = _mm_sub_epi32(u0, u8);
646 w9 = _mm_sub_epi32(u1, u9);
647 w10 = _mm_sub_epi32(u2, u10);
648 w11 = _mm_sub_epi32(u3, u11);
649 w12 = _mm_sub_epi32(u4, u12);
650 w13 = _mm_sub_epi32(u5, u13);
651 w14 = _mm_sub_epi32(u6, u14);
652 w15 = _mm_sub_epi32(u7, u15);
653
654 // shift and rounding
655 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
656 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
657 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
658 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
659 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
660 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
661 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
662 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
663 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
664 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
665 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
666 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
667 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
668 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
669 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
670 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
671
672 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
673 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
674 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
675 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
676 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
677 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
678 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
679 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
680 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
681 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
682 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
683 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
684 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
685 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
686 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
687 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
688
689 // back to 16-bit and pack 8 integers into __m128i
690 in[0] = _mm_packs_epi32(u0, u1);
691 in[1] = _mm_packs_epi32(u2, u3);
692 in[2] = _mm_packs_epi32(u4, u5);
693 in[3] = _mm_packs_epi32(u6, u7);
694 in[4] = _mm_packs_epi32(u8, u9);
695 in[5] = _mm_packs_epi32(u10, u11);
696 in[6] = _mm_packs_epi32(u12, u13);
697 in[7] = _mm_packs_epi32(u14, u15);
698
699 // stage 2
700 s0 = _mm_add_epi16(in[0], in[2]);
701 s1 = _mm_add_epi16(in[1], in[3]);
702 s2 = _mm_sub_epi16(in[0], in[2]);
703 s3 = _mm_sub_epi16(in[1], in[3]);
704 u0 = _mm_unpacklo_epi16(in[4], in[5]);
705 u1 = _mm_unpackhi_epi16(in[4], in[5]);
706 u2 = _mm_unpacklo_epi16(in[6], in[7]);
707 u3 = _mm_unpackhi_epi16(in[6], in[7]);
708
709 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
710 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
711 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
712 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
713 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
714 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
715 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
716 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
717
718 w0 = _mm_add_epi32(v0, v4);
719 w1 = _mm_add_epi32(v1, v5);
720 w2 = _mm_add_epi32(v2, v6);
721 w3 = _mm_add_epi32(v3, v7);
722 w4 = _mm_sub_epi32(v0, v4);
723 w5 = _mm_sub_epi32(v1, v5);
724 w6 = _mm_sub_epi32(v2, v6);
725 w7 = _mm_sub_epi32(v3, v7);
726
727 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
728 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
729 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
730 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
731 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
732 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
733 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
734 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
735
736 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
737 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
738 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
739 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
740 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
741 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
742 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
743 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
744
  // back to 16-bit integers
746 s4 = _mm_packs_epi32(u0, u1);
747 s5 = _mm_packs_epi32(u2, u3);
748 s6 = _mm_packs_epi32(u4, u5);
749 s7 = _mm_packs_epi32(u6, u7);
750
751 // stage 3
752 u0 = _mm_unpacklo_epi16(s2, s3);
753 u1 = _mm_unpackhi_epi16(s2, s3);
754 u2 = _mm_unpacklo_epi16(s6, s7);
755 u3 = _mm_unpackhi_epi16(s6, s7);
756
757 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
758 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
759 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
760 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
761 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
762 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
763 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
764 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
765
766 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
767 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
768 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
769 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
770 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
771 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
772 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
773 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
774
775 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
776 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
777 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
778 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
779 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
780 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
781 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
782 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
783
784 s2 = _mm_packs_epi32(v0, v1);
785 s3 = _mm_packs_epi32(v2, v3);
786 s6 = _mm_packs_epi32(v4, v5);
787 s7 = _mm_packs_epi32(v6, v7);
788
789 in[0] = s0;
790 in[1] = _mm_sub_epi16(k__const_0, s4);
791 in[2] = s6;
792 in[3] = _mm_sub_epi16(k__const_0, s2);
793 in[4] = s3;
794 in[5] = _mm_sub_epi16(k__const_0, s7);
795 in[6] = s5;
796 in[7] = _mm_sub_epi16(k__const_0, s1);
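  // The assignments above fold in the iadst8 output permutation and sign
  // flips (x0, -x4, x6, -x2, x3, -x7, x5, -x1 in the scalar version); the
  // negations are done as subtractions from zero.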
797 }
798
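// Handles 8x8 blocks with at most 12 nonzero coefficients, all of which lie
// in the top-left 4x4 corner: only four input rows are loaded, the row pass
// operates on the reduced 4x4 data, and the column pass is fed zeros for the
// missing high-frequency rows.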
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
801 const __m128i zero = _mm_setzero_si128();
802 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
803 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
804 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
805 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
806 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
807 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
808 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
809 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
810 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
811 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
812 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
813
814 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
815 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
816 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
817 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
818
819 // Rows. Load 4-row input data.
820 in0 = load_input_data(input);
821 in1 = load_input_data(input + 8 * 1);
822 in2 = load_input_data(input + 8 * 2);
823 in3 = load_input_data(input + 8 * 3);
824
825 // 8x4 Transpose
826 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
827 // Stage1
828 {
829 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
830 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
831
832 tmp0 = _mm_madd_epi16(lo_17, stg1_0);
833 tmp2 = _mm_madd_epi16(lo_17, stg1_1);
834 tmp4 = _mm_madd_epi16(lo_35, stg1_2);
835 tmp6 = _mm_madd_epi16(lo_35, stg1_3);
836
837 tmp0 = _mm_add_epi32(tmp0, rounding);
838 tmp2 = _mm_add_epi32(tmp2, rounding);
839 tmp4 = _mm_add_epi32(tmp4, rounding);
840 tmp6 = _mm_add_epi32(tmp6, rounding);
841 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
842 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
843 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
844 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
845
846 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
847 stp1_5 = _mm_packs_epi32(tmp4, tmp6);
848 }
849
850 // Stage2
851 {
852 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
853 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
854
855 tmp0 = _mm_madd_epi16(lo_04, stg2_0);
856 tmp2 = _mm_madd_epi16(lo_04, stg2_1);
857 tmp4 = _mm_madd_epi16(lo_26, stg2_2);
858 tmp6 = _mm_madd_epi16(lo_26, stg2_3);
859
860 tmp0 = _mm_add_epi32(tmp0, rounding);
861 tmp2 = _mm_add_epi32(tmp2, rounding);
862 tmp4 = _mm_add_epi32(tmp4, rounding);
863 tmp6 = _mm_add_epi32(tmp6, rounding);
864 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
865 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
866 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
867 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
868
869 stp2_0 = _mm_packs_epi32(tmp0, tmp2);
870 stp2_2 = _mm_packs_epi32(tmp6, tmp4);
871
872 tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
873 tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
874
875 stp2_4 = tmp0;
876 stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
877 stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
878 }
879
880 // Stage3
881 {
882 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
883
884 tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
885 tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
886
887 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
888 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
889
890 tmp0 = _mm_madd_epi16(lo_56, stg3_0);
891 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
892
893 tmp0 = _mm_add_epi32(tmp0, rounding);
894 tmp2 = _mm_add_epi32(tmp2, rounding);
895 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
896 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
897
898 stp1_5 = _mm_packs_epi32(tmp0, tmp2);
899 }
900
901 // Stage4
902 tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
903 tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
904 tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
905 tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
906
907 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
908
909 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
910 in0, in1, in2, in3, in4, in5, in6, in7);
911 // Final rounding and shift
912 in0 = _mm_adds_epi16(in0, final_rounding);
913 in1 = _mm_adds_epi16(in1, final_rounding);
914 in2 = _mm_adds_epi16(in2, final_rounding);
915 in3 = _mm_adds_epi16(in3, final_rounding);
916 in4 = _mm_adds_epi16(in4, final_rounding);
917 in5 = _mm_adds_epi16(in5, final_rounding);
918 in6 = _mm_adds_epi16(in6, final_rounding);
919 in7 = _mm_adds_epi16(in7, final_rounding);
920
921 in0 = _mm_srai_epi16(in0, 5);
922 in1 = _mm_srai_epi16(in1, 5);
923 in2 = _mm_srai_epi16(in2, 5);
924 in3 = _mm_srai_epi16(in3, 5);
925 in4 = _mm_srai_epi16(in4, 5);
926 in5 = _mm_srai_epi16(in5, 5);
927 in6 = _mm_srai_epi16(in6, 5);
928 in7 = _mm_srai_epi16(in7, 5);
929
930 RECON_AND_STORE(dest + 0 * stride, in0);
931 RECON_AND_STORE(dest + 1 * stride, in1);
932 RECON_AND_STORE(dest + 2 * stride, in2);
933 RECON_AND_STORE(dest + 3 * stride, in3);
934 RECON_AND_STORE(dest + 4 * stride, in4);
935 RECON_AND_STORE(dest + 5 * stride, in5);
936 RECON_AND_STORE(dest + 6 * stride, in6);
937 RECON_AND_STORE(dest + 7 * stride, in7);
938 }
939
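// IDCT16 expands to stages 2-6 of the 16-point inverse DCT for eight columns
// of 16-bit coefficients at a time; the caller performs the final stage 7
// sums and differences so it can place the results where it needs them.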
940 #define IDCT16 \
941 /* Stage2 */ \
942 { \
943 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
944 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
945 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
946 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
947 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
948 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
949 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
950 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
951 \
952 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
953 stg2_0, stg2_1, stg2_2, stg2_3, \
954 stp2_8, stp2_15, stp2_9, stp2_14) \
955 \
956 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
957 stg2_4, stg2_5, stg2_6, stg2_7, \
958 stp2_10, stp2_13, stp2_11, stp2_12) \
959 } \
960 \
961 /* Stage3 */ \
962 { \
963 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
964 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
965 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
966 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
967 \
968 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
969 stg3_0, stg3_1, stg3_2, stg3_3, \
970 stp1_4, stp1_7, stp1_5, stp1_6) \
971 \
972 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
973 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
974 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
975 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
976 \
977 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
978 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
979 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
980 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
981 } \
982 \
983 /* Stage4 */ \
984 { \
985 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
986 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
987 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
988 const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
989 \
990 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
991 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
992 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
993 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
994 \
995 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
996 stg4_0, stg4_1, stg4_2, stg4_3, \
997 stp2_0, stp2_1, stp2_2, stp2_3) \
998 \
999 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1000 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1001 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1002 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1003 \
1004 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1005 stg4_4, stg4_5, stg4_6, stg4_7, \
1006 stp2_9, stp2_14, stp2_10, stp2_13) \
1007 } \
1008 \
1009 /* Stage5 */ \
1010 { \
1011 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1012 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1013 \
1014 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1015 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1016 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1017 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1018 \
1019 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1020 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1021 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1022 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1023 \
1024 tmp0 = _mm_add_epi32(tmp0, rounding); \
1025 tmp1 = _mm_add_epi32(tmp1, rounding); \
1026 tmp2 = _mm_add_epi32(tmp2, rounding); \
1027 tmp3 = _mm_add_epi32(tmp3, rounding); \
1028 \
1029 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1030 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1031 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1032 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1033 \
1034 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1035 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1036 \
1037 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
1038 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
1039 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
1040 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1041 \
1042 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1043 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
1044 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
1045 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1046 } \
1047 \
1048 /* Stage6 */ \
1049 { \
1050 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1051 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1052 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1053 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1054 \
1055 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1056 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1057 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1058 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1059 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1060 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1061 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1062 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1063 \
1064 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1065 stg6_0, stg4_0, stg6_0, stg4_0, \
1066 stp2_10, stp2_13, stp2_11, stp2_12) \
1067 }
1068
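// IDCT16_10 is the reduced form of IDCT16 for blocks whose nonzero
// coefficients all lie in the top-left 4x4 corner: terms known to be zero
// are dropped and several stage outputs become plain copies.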
1069 #define IDCT16_10 \
1070 /* Stage2 */ \
1071 { \
1072 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1073 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1074 const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1075 const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1076 \
1077 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
1078 stg2_0, stg2_1, stg2_6, stg2_7, \
1079 stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
1080 } \
1081 \
1082 /* Stage3 */ \
1083 { \
1084 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1085 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1086 \
1087 MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
1088 stg3_0, stg3_1, \
1089 stp2_4, stp2_7) \
1090 \
1091 stp1_9 = stp1_8_0; \
1092 stp1_10 = stp1_11; \
1093 \
1094 stp1_13 = stp1_12_0; \
1095 stp1_14 = stp1_15; \
1096 } \
1097 \
1098 /* Stage4 */ \
1099 { \
1100 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1101 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1102 \
1103 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1104 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1105 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1106 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1107 \
1108 MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
1109 stg4_0, stg4_1, \
1110 stp1_0, stp1_1) \
1111 stp2_5 = stp2_4; \
1112 stp2_6 = stp2_7; \
1113 \
1114 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1115 stg4_4, stg4_5, stg4_6, stg4_7, \
1116 stp2_9, stp2_14, stp2_10, stp2_13) \
1117 } \
1118 \
1119 /* Stage5 */ \
1120 { \
1121 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1122 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1123 \
1124 stp1_2 = stp1_1; \
1125 stp1_3 = stp1_0; \
1126 \
1127 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1128 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1129 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1130 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1131 \
1132 tmp0 = _mm_add_epi32(tmp0, rounding); \
1133 tmp1 = _mm_add_epi32(tmp1, rounding); \
1134 tmp2 = _mm_add_epi32(tmp2, rounding); \
1135 tmp3 = _mm_add_epi32(tmp3, rounding); \
1136 \
1137 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1138 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1139 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1140 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1141 \
1142 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1143 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1144 \
1145 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
1146 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
1147 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
1148 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1149 \
1150 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1151 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
1152 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
1153 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1154 } \
1155 \
1156 /* Stage6 */ \
1157 { \
1158 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1159 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1160 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1161 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1162 \
1163 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1164 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1165 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1166 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1167 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1168 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1169 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1170 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1171 \
1172 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1173 stg6_0, stg4_0, stg6_0, stg4_0, \
1174 stp2_10, stp2_13, stp2_11, stp2_12) \
1175 }
1176
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
1179 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1180 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1181 const __m128i zero = _mm_setzero_si128();
1182
1183 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1184 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1185 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1186 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1187 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1188 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1189 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1190 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1191
1192 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1193 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1194 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1195 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1196
1197 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1198 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1199 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1200 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1201 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1202 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1203 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1204 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1205
1206 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1207
1208 __m128i in[16], l[16], r[16], *curr1;
1209 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1210 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1211 stp1_8_0, stp1_12_0;
1212 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1213 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1214 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1215 int i;
1216
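  // The 16x16 transform runs as two passes over two 8-column halves: the
  // first loop performs the row transform and stores its results in l[]
  // (left half) and r[] (right half); the second loop performs the column
  // transform on those results and reconstructs eight output columns per
  // iteration.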
1217 curr1 = l;
1218 for (i = 0; i < 2; i++) {
1219 // 1-D idct
1220
1221 // Load input data.
1222 in[0] = load_input_data(input);
1223 in[8] = load_input_data(input + 8 * 1);
1224 in[1] = load_input_data(input + 8 * 2);
1225 in[9] = load_input_data(input + 8 * 3);
1226 in[2] = load_input_data(input + 8 * 4);
1227 in[10] = load_input_data(input + 8 * 5);
1228 in[3] = load_input_data(input + 8 * 6);
1229 in[11] = load_input_data(input + 8 * 7);
1230 in[4] = load_input_data(input + 8 * 8);
1231 in[12] = load_input_data(input + 8 * 9);
1232 in[5] = load_input_data(input + 8 * 10);
1233 in[13] = load_input_data(input + 8 * 11);
1234 in[6] = load_input_data(input + 8 * 12);
1235 in[14] = load_input_data(input + 8 * 13);
1236 in[7] = load_input_data(input + 8 * 14);
1237 in[15] = load_input_data(input + 8 * 15);
1238
1239 array_transpose_8x8(in, in);
1240 array_transpose_8x8(in + 8, in + 8);
1241
1242 IDCT16
1243
1244 // Stage7
1245 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1246 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1247 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1248 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1249 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1250 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1251 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1252 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1253 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1254 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1255 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1256 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1257 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1258 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1259 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1260 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1261
1262 curr1 = r;
1263 input += 128;
1264 }
1265 for (i = 0; i < 2; i++) {
1266 int j;
1267 // 1-D idct
1268 array_transpose_8x8(l + i * 8, in);
1269 array_transpose_8x8(r + i * 8, in + 8);
1270
1271 IDCT16
1272
1273 // 2-D
1274 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1275 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1276 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1277 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1278 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1279 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1280 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1281 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1282 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1283 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1284 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1285 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1286 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1287 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1288 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1289 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1290
1291 for (j = 0; j < 16; ++j) {
1292 // Final rounding and shift
1293 in[j] = _mm_adds_epi16(in[j], final_rounding);
1294 in[j] = _mm_srai_epi16(in[j], 6);
1295 RECON_AND_STORE(dest + j * stride, in[j]);
1296 }
1297
1298 dest += 8;
1299 }
1300 }
1301
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
1304 __m128i dc_value;
1305 const __m128i zero = _mm_setzero_si128();
1306 int a, i;
1307
1308 a = dct_const_round_shift(input[0] * cospi_16_64);
1309 a = dct_const_round_shift(a * cospi_16_64);
1310 a = ROUND_POWER_OF_TWO(a, 6);
1311
1312 dc_value = _mm_set1_epi16(a);
1313
1314 for (i = 0; i < 2; ++i) {
1315 RECON_AND_STORE(dest + 0 * stride, dc_value);
1316 RECON_AND_STORE(dest + 1 * stride, dc_value);
1317 RECON_AND_STORE(dest + 2 * stride, dc_value);
1318 RECON_AND_STORE(dest + 3 * stride, dc_value);
1319 RECON_AND_STORE(dest + 4 * stride, dc_value);
1320 RECON_AND_STORE(dest + 5 * stride, dc_value);
1321 RECON_AND_STORE(dest + 6 * stride, dc_value);
1322 RECON_AND_STORE(dest + 7 * stride, dc_value);
1323 RECON_AND_STORE(dest + 8 * stride, dc_value);
1324 RECON_AND_STORE(dest + 9 * stride, dc_value);
1325 RECON_AND_STORE(dest + 10 * stride, dc_value);
1326 RECON_AND_STORE(dest + 11 * stride, dc_value);
1327 RECON_AND_STORE(dest + 12 * stride, dc_value);
1328 RECON_AND_STORE(dest + 13 * stride, dc_value);
1329 RECON_AND_STORE(dest + 14 * stride, dc_value);
1330 RECON_AND_STORE(dest + 15 * stride, dc_value);
1331 dest += 8;
1332 }
1333 }
1334
static void iadst16_8col(__m128i *in) {
1336 // perform 16x16 1-D ADST for 8 columns
1337 __m128i s[16], x[16], u[32], v[32];
1338 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1339 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1340 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1341 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1342 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1343 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1344 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1345 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1346 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1347 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1348 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1349 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1350 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1351 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1352 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1353 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1354 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1355 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1356 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1357 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1358 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1359 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1360 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1361 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1362 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1363 const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1364 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1365 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1366 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1367 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1368 const __m128i kZero = _mm_set1_epi16(0);
1369
1370 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1371 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1372 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1373 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1374 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1375 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1376 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1377 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1378 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1379 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1380 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1381 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1382 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1383 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1384 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1385 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1386
1387 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1388 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1389 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1390 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1391 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1392 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1393 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1394 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1395 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1396 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1397 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1398 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1399 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1400 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1401 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1402 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1403 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1404 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1405 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1406 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1407 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1408 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1409 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1410 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1411 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1412 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1413 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1414 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1415 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1416 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1417 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1418 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1419
1420 u[0] = _mm_add_epi32(v[0], v[16]);
1421 u[1] = _mm_add_epi32(v[1], v[17]);
1422 u[2] = _mm_add_epi32(v[2], v[18]);
1423 u[3] = _mm_add_epi32(v[3], v[19]);
1424 u[4] = _mm_add_epi32(v[4], v[20]);
1425 u[5] = _mm_add_epi32(v[5], v[21]);
1426 u[6] = _mm_add_epi32(v[6], v[22]);
1427 u[7] = _mm_add_epi32(v[7], v[23]);
1428 u[8] = _mm_add_epi32(v[8], v[24]);
1429 u[9] = _mm_add_epi32(v[9], v[25]);
1430 u[10] = _mm_add_epi32(v[10], v[26]);
1431 u[11] = _mm_add_epi32(v[11], v[27]);
1432 u[12] = _mm_add_epi32(v[12], v[28]);
1433 u[13] = _mm_add_epi32(v[13], v[29]);
1434 u[14] = _mm_add_epi32(v[14], v[30]);
1435 u[15] = _mm_add_epi32(v[15], v[31]);
1436 u[16] = _mm_sub_epi32(v[0], v[16]);
1437 u[17] = _mm_sub_epi32(v[1], v[17]);
1438 u[18] = _mm_sub_epi32(v[2], v[18]);
1439 u[19] = _mm_sub_epi32(v[3], v[19]);
1440 u[20] = _mm_sub_epi32(v[4], v[20]);
1441 u[21] = _mm_sub_epi32(v[5], v[21]);
1442 u[22] = _mm_sub_epi32(v[6], v[22]);
1443 u[23] = _mm_sub_epi32(v[7], v[23]);
1444 u[24] = _mm_sub_epi32(v[8], v[24]);
1445 u[25] = _mm_sub_epi32(v[9], v[25]);
1446 u[26] = _mm_sub_epi32(v[10], v[26]);
1447 u[27] = _mm_sub_epi32(v[11], v[27]);
1448 u[28] = _mm_sub_epi32(v[12], v[28]);
1449 u[29] = _mm_sub_epi32(v[13], v[29]);
1450 u[30] = _mm_sub_epi32(v[14], v[30]);
1451 u[31] = _mm_sub_epi32(v[15], v[31]);
1452
1453 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1454 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1455 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1456 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1457 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1458 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1459 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1460 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1461 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1462 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1463 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1464 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1465 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1466 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1467 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1468 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1469 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1470 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1471 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1472 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1473 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1474 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1475 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1476 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1477 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1478 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1479 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1480 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1481 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1482 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1483 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1484 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1485
1486 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1487 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1488 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1489 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1490 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1491 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1492 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1493 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1494 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1495 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1496 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1497 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1498 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1499 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1500 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1501 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1502 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1503 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1504 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1505 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1506 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1507 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1508 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1509 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1510 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1511 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1512 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1513 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1514 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1515 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1516 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1517 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1518
1519 s[0] = _mm_packs_epi32(u[0], u[1]);
1520 s[1] = _mm_packs_epi32(u[2], u[3]);
1521 s[2] = _mm_packs_epi32(u[4], u[5]);
1522 s[3] = _mm_packs_epi32(u[6], u[7]);
1523 s[4] = _mm_packs_epi32(u[8], u[9]);
1524 s[5] = _mm_packs_epi32(u[10], u[11]);
1525 s[6] = _mm_packs_epi32(u[12], u[13]);
1526 s[7] = _mm_packs_epi32(u[14], u[15]);
1527 s[8] = _mm_packs_epi32(u[16], u[17]);
1528 s[9] = _mm_packs_epi32(u[18], u[19]);
1529 s[10] = _mm_packs_epi32(u[20], u[21]);
1530 s[11] = _mm_packs_epi32(u[22], u[23]);
1531 s[12] = _mm_packs_epi32(u[24], u[25]);
1532 s[13] = _mm_packs_epi32(u[26], u[27]);
1533 s[14] = _mm_packs_epi32(u[28], u[29]);
1534 s[15] = _mm_packs_epi32(u[30], u[31]);
1535
1536 // stage 2
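// s[8..15] are rotated by the +/-(cospi_4, cospi_28) and +/-(cospi_12,
// cospi_20) pairs in 32-bit precision; s[0..7] only need the 16-bit add/sub
// combinations computed into x[0..7] below.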
1537 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1538 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1539 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1540 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1541 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1542 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1543 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1544 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1545
1546 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1547 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1548 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1549 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1550 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1551 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1552 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1553 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1554 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1555 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1556 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1557 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1558 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1559 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1560 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1561 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1562
1563 u[0] = _mm_add_epi32(v[0], v[8]);
1564 u[1] = _mm_add_epi32(v[1], v[9]);
1565 u[2] = _mm_add_epi32(v[2], v[10]);
1566 u[3] = _mm_add_epi32(v[3], v[11]);
1567 u[4] = _mm_add_epi32(v[4], v[12]);
1568 u[5] = _mm_add_epi32(v[5], v[13]);
1569 u[6] = _mm_add_epi32(v[6], v[14]);
1570 u[7] = _mm_add_epi32(v[7], v[15]);
1571 u[8] = _mm_sub_epi32(v[0], v[8]);
1572 u[9] = _mm_sub_epi32(v[1], v[9]);
1573 u[10] = _mm_sub_epi32(v[2], v[10]);
1574 u[11] = _mm_sub_epi32(v[3], v[11]);
1575 u[12] = _mm_sub_epi32(v[4], v[12]);
1576 u[13] = _mm_sub_epi32(v[5], v[13]);
1577 u[14] = _mm_sub_epi32(v[6], v[14]);
1578 u[15] = _mm_sub_epi32(v[7], v[15]);
1579
1580 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1581 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1582 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1583 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1584 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1585 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1586 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1587 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1588 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1589 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1590 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1591 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1592 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1593 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1594 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1595 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1596
1597 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1598 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1599 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1600 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1601 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1602 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1603 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1604 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1605 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1606 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1607 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1608 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1609 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1610 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1611 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1612 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1613
1614 x[0] = _mm_add_epi16(s[0], s[4]);
1615 x[1] = _mm_add_epi16(s[1], s[5]);
1616 x[2] = _mm_add_epi16(s[2], s[6]);
1617 x[3] = _mm_add_epi16(s[3], s[7]);
1618 x[4] = _mm_sub_epi16(s[0], s[4]);
1619 x[5] = _mm_sub_epi16(s[1], s[5]);
1620 x[6] = _mm_sub_epi16(s[2], s[6]);
1621 x[7] = _mm_sub_epi16(s[3], s[7]);
1622 x[8] = _mm_packs_epi32(u[0], u[1]);
1623 x[9] = _mm_packs_epi32(u[2], u[3]);
1624 x[10] = _mm_packs_epi32(u[4], u[5]);
1625 x[11] = _mm_packs_epi32(u[6], u[7]);
1626 x[12] = _mm_packs_epi32(u[8], u[9]);
1627 x[13] = _mm_packs_epi32(u[10], u[11]);
1628 x[14] = _mm_packs_epi32(u[12], u[13]);
1629 x[15] = _mm_packs_epi32(u[14], u[15]);
1630
1631 // stage 3
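// Only x[4..7] and x[12..15] go through the +/-(cospi_8, cospi_24) rotations;
// the remaining terms are folded with plain 16-bit add/sub.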
1632 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1633 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1634 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1635 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1636 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1637 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1638 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1639 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1640
1641 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1642 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1643 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1644 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1645 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1646 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1647 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1648 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1649 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1650 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1651 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1652 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1653 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1654 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1655 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1656 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1657
1658 u[0] = _mm_add_epi32(v[0], v[4]);
1659 u[1] = _mm_add_epi32(v[1], v[5]);
1660 u[2] = _mm_add_epi32(v[2], v[6]);
1661 u[3] = _mm_add_epi32(v[3], v[7]);
1662 u[4] = _mm_sub_epi32(v[0], v[4]);
1663 u[5] = _mm_sub_epi32(v[1], v[5]);
1664 u[6] = _mm_sub_epi32(v[2], v[6]);
1665 u[7] = _mm_sub_epi32(v[3], v[7]);
1666 u[8] = _mm_add_epi32(v[8], v[12]);
1667 u[9] = _mm_add_epi32(v[9], v[13]);
1668 u[10] = _mm_add_epi32(v[10], v[14]);
1669 u[11] = _mm_add_epi32(v[11], v[15]);
1670 u[12] = _mm_sub_epi32(v[8], v[12]);
1671 u[13] = _mm_sub_epi32(v[9], v[13]);
1672 u[14] = _mm_sub_epi32(v[10], v[14]);
1673 u[15] = _mm_sub_epi32(v[11], v[15]);
1674
1675 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1676 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1677 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1678 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1679 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1680 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1681 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1682 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1683 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1684 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1685 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1686 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1687 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1688 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1689 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1690 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1691
1692 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1693 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1694 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1695 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1696 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1697 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1698 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1699 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1700 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1701 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1702 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1703 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1704 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1705 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1706 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1707 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1708
1709 s[0] = _mm_add_epi16(x[0], x[2]);
1710 s[1] = _mm_add_epi16(x[1], x[3]);
1711 s[2] = _mm_sub_epi16(x[0], x[2]);
1712 s[3] = _mm_sub_epi16(x[1], x[3]);
1713 s[4] = _mm_packs_epi32(v[0], v[1]);
1714 s[5] = _mm_packs_epi32(v[2], v[3]);
1715 s[6] = _mm_packs_epi32(v[4], v[5]);
1716 s[7] = _mm_packs_epi32(v[6], v[7]);
1717 s[8] = _mm_add_epi16(x[8], x[10]);
1718 s[9] = _mm_add_epi16(x[9], x[11]);
1719 s[10] = _mm_sub_epi16(x[8], x[10]);
1720 s[11] = _mm_sub_epi16(x[9], x[11]);
1721 s[12] = _mm_packs_epi32(v[8], v[9]);
1722 s[13] = _mm_packs_epi32(v[10], v[11]);
1723 s[14] = _mm_packs_epi32(v[12], v[13]);
1724 s[15] = _mm_packs_epi32(v[14], v[15]);
1725
1726 // stage 4
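// Final +/-cospi_16 rotations; the results are written back to in[] with the
// sign flips (kZero - ...) required by the ADST output ordering.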
1727 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1728 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1729 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1730 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1731 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1732 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1733 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1734 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1735
1736 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1737 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1738 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1739 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1740 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1741 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1742 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1743 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1744 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1745 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1746 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1747 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1748 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1749 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1750 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1751 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1752
1753 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1754 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1755 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1756 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1757 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1758 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1759 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1760 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1761 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1762 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1763 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1764 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1765 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1766 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1767 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1768 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1769
1770 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1771 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1772 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1773 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1774 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1775 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1776 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1777 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1778 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1779 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1780 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1781 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1782 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1783 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1784 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1785 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1786
1787 in[0] = s[0];
1788 in[1] = _mm_sub_epi16(kZero, s[8]);
1789 in[2] = s[12];
1790 in[3] = _mm_sub_epi16(kZero, s[4]);
1791 in[4] = _mm_packs_epi32(v[4], v[5]);
1792 in[5] = _mm_packs_epi32(v[12], v[13]);
1793 in[6] = _mm_packs_epi32(v[8], v[9]);
1794 in[7] = _mm_packs_epi32(v[0], v[1]);
1795 in[8] = _mm_packs_epi32(v[2], v[3]);
1796 in[9] = _mm_packs_epi32(v[10], v[11]);
1797 in[10] = _mm_packs_epi32(v[14], v[15]);
1798 in[11] = _mm_packs_epi32(v[6], v[7]);
1799 in[12] = s[5];
1800 in[13] = _mm_sub_epi16(kZero, s[13]);
1801 in[14] = s[9];
1802 in[15] = _mm_sub_epi16(kZero, s[1]);
1803 }
1804
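// In-place 16-point inverse DCT over one 8-column half: in[0..15] holds the
// sixteen rows as 16-bit coefficients. Each rotation widens to 32 bits with
// _mm_madd_epi16, adds DCT_CONST_ROUNDING, shifts by DCT_CONST_BITS, and
// packs back to 16 bits with saturation.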
1805 static void idct16_8col(__m128i *in) {
1806 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1807 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1808 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1809 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1810 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1811 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1812 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1813 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1814 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1815 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1816 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1817 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1818 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1819 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1820 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1821 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1822 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1823 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1824 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1825 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1826 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1827 __m128i v[16], u[16], s[16], t[16];
1828
1829 // stage 1
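// Stage 1 only reorders the inputs: even-indexed rows feed the 8-point even
// half (s[0..7]) and odd-indexed rows feed the odd half (s[8..15]).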
1830 s[0] = in[0];
1831 s[1] = in[8];
1832 s[2] = in[4];
1833 s[3] = in[12];
1834 s[4] = in[2];
1835 s[5] = in[10];
1836 s[6] = in[6];
1837 s[7] = in[14];
1838 s[8] = in[1];
1839 s[9] = in[9];
1840 s[10] = in[5];
1841 s[11] = in[13];
1842 s[12] = in[3];
1843 s[13] = in[11];
1844 s[14] = in[7];
1845 s[15] = in[15];
1846
1847 // stage 2
1848 u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1849 u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1850 u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1851 u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1852 u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1853 u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1854 u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1855 u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1856
1857 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1858 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1859 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1860 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1861 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1862 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1863 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1864 v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1865 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1866 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1867 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1868 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1869 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1870 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1871 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1872 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1873
1874 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1875 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1876 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1877 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1878 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1879 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1880 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1881 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1882 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1883 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1884 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1885 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1886 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1887 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1888 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1889 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1890
1891 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1892 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1893 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1894 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1895 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1896 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1897 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1898 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1899 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1900 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1901 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1902 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1903 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1904 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1905 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1906 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1907
1908 s[8] = _mm_packs_epi32(u[0], u[1]);
1909 s[15] = _mm_packs_epi32(u[2], u[3]);
1910 s[9] = _mm_packs_epi32(u[4], u[5]);
1911 s[14] = _mm_packs_epi32(u[6], u[7]);
1912 s[10] = _mm_packs_epi32(u[8], u[9]);
1913 s[13] = _mm_packs_epi32(u[10], u[11]);
1914 s[11] = _mm_packs_epi32(u[12], u[13]);
1915 s[12] = _mm_packs_epi32(u[14], u[15]);
1916
1917 // stage 3
1918 t[0] = s[0];
1919 t[1] = s[1];
1920 t[2] = s[2];
1921 t[3] = s[3];
1922 u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1923 u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1924 u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1925 u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1926
1927 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1928 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1929 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1930 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1931 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1932 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1933 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1934 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1935
1936 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1937 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1938 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1939 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1940 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1941 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1942 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1943 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1944
1945 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1946 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1947 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1948 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1949 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1950 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1951 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1952 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1953
1954 t[4] = _mm_packs_epi32(u[0], u[1]);
1955 t[7] = _mm_packs_epi32(u[2], u[3]);
1956 t[5] = _mm_packs_epi32(u[4], u[5]);
1957 t[6] = _mm_packs_epi32(u[6], u[7]);
1958 t[8] = _mm_add_epi16(s[8], s[9]);
1959 t[9] = _mm_sub_epi16(s[8], s[9]);
1960 t[10] = _mm_sub_epi16(s[11], s[10]);
1961 t[11] = _mm_add_epi16(s[10], s[11]);
1962 t[12] = _mm_add_epi16(s[12], s[13]);
1963 t[13] = _mm_sub_epi16(s[12], s[13]);
1964 t[14] = _mm_sub_epi16(s[15], s[14]);
1965 t[15] = _mm_add_epi16(s[14], s[15]);
1966
1967 // stage 4
1968 u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1969 u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1970 u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1971 u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1972 u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1973 u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1974 u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1975 u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1976
1977 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1978 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1979 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1980 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1981 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1982 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1983 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1984 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1985 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1986 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1987 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1988 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1989 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1990 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1991 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1992 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1993
1994 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1995 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1996 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1997 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1998 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1999 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2000 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2001 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2002 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2003 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2004 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2005 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2006 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2007 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2008 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2009 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2010
2011 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2012 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2013 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2014 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2015 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2016 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2017 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2018 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2019 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2020 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2021 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2022 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2023 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2024 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2025 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2026 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2027
2028 s[0] = _mm_packs_epi32(u[0], u[1]);
2029 s[1] = _mm_packs_epi32(u[2], u[3]);
2030 s[2] = _mm_packs_epi32(u[4], u[5]);
2031 s[3] = _mm_packs_epi32(u[6], u[7]);
2032 s[4] = _mm_add_epi16(t[4], t[5]);
2033 s[5] = _mm_sub_epi16(t[4], t[5]);
2034 s[6] = _mm_sub_epi16(t[7], t[6]);
2035 s[7] = _mm_add_epi16(t[6], t[7]);
2036 s[8] = t[8];
2037 s[15] = t[15];
2038 s[9] = _mm_packs_epi32(u[8], u[9]);
2039 s[14] = _mm_packs_epi32(u[10], u[11]);
2040 s[10] = _mm_packs_epi32(u[12], u[13]);
2041 s[13] = _mm_packs_epi32(u[14], u[15]);
2042 s[11] = t[11];
2043 s[12] = t[12];
2044
2045 // stage 5
2046 t[0] = _mm_add_epi16(s[0], s[3]);
2047 t[1] = _mm_add_epi16(s[1], s[2]);
2048 t[2] = _mm_sub_epi16(s[1], s[2]);
2049 t[3] = _mm_sub_epi16(s[0], s[3]);
2050 t[4] = s[4];
2051 t[7] = s[7];
2052
2053 u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2054 u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2055 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2056 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2057 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2058 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2059 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2060 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2061 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2062 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2063 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2064 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2065 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2066 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2067 t[5] = _mm_packs_epi32(u[0], u[1]);
2068 t[6] = _mm_packs_epi32(u[2], u[3]);
2069
2070 t[8] = _mm_add_epi16(s[8], s[11]);
2071 t[9] = _mm_add_epi16(s[9], s[10]);
2072 t[10] = _mm_sub_epi16(s[9], s[10]);
2073 t[11] = _mm_sub_epi16(s[8], s[11]);
2074 t[12] = _mm_sub_epi16(s[15], s[12]);
2075 t[13] = _mm_sub_epi16(s[14], s[13]);
2076 t[14] = _mm_add_epi16(s[13], s[14]);
2077 t[15] = _mm_add_epi16(s[12], s[15]);
2078
2079 // stage 6
2080 s[0] = _mm_add_epi16(t[0], t[7]);
2081 s[1] = _mm_add_epi16(t[1], t[6]);
2082 s[2] = _mm_add_epi16(t[2], t[5]);
2083 s[3] = _mm_add_epi16(t[3], t[4]);
2084 s[4] = _mm_sub_epi16(t[3], t[4]);
2085 s[5] = _mm_sub_epi16(t[2], t[5]);
2086 s[6] = _mm_sub_epi16(t[1], t[6]);
2087 s[7] = _mm_sub_epi16(t[0], t[7]);
2088 s[8] = t[8];
2089 s[9] = t[9];
2090
2091 u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2092 u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2093 u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2094 u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2095
2096 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2097 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2098 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2099 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2100 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2101 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2102 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2103 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2104
2105 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2106 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2107 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2108 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2109 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2110 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2111 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2112 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2113
2114 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2115 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2116 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2117 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2118 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2119 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2120 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2121 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2122
2123 s[10] = _mm_packs_epi32(u[0], u[1]);
2124 s[13] = _mm_packs_epi32(u[2], u[3]);
2125 s[11] = _mm_packs_epi32(u[4], u[5]);
2126 s[12] = _mm_packs_epi32(u[6], u[7]);
2127 s[14] = t[14];
2128 s[15] = t[15];
2129
2130 // stage 7
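// Outputs are the sums and differences of the even half (s[0..7]) and the
// odd half (s[8..15]), written back in place.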
2131 in[0] = _mm_add_epi16(s[0], s[15]);
2132 in[1] = _mm_add_epi16(s[1], s[14]);
2133 in[2] = _mm_add_epi16(s[2], s[13]);
2134 in[3] = _mm_add_epi16(s[3], s[12]);
2135 in[4] = _mm_add_epi16(s[4], s[11]);
2136 in[5] = _mm_add_epi16(s[5], s[10]);
2137 in[6] = _mm_add_epi16(s[6], s[9]);
2138 in[7] = _mm_add_epi16(s[7], s[8]);
2139 in[8] = _mm_sub_epi16(s[7], s[8]);
2140 in[9] = _mm_sub_epi16(s[6], s[9]);
2141 in[10] = _mm_sub_epi16(s[5], s[10]);
2142 in[11] = _mm_sub_epi16(s[4], s[11]);
2143 in[12] = _mm_sub_epi16(s[3], s[12]);
2144 in[13] = _mm_sub_epi16(s[2], s[13]);
2145 in[14] = _mm_sub_epi16(s[1], s[14]);
2146 in[15] = _mm_sub_epi16(s[0], s[15]);
2147 }
2148
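// Transpose the two 8x16 halves, then run the 16-point IDCT on each half
// independently.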
2149 void idct16_sse2(__m128i *in0, __m128i *in1) {
2150 array_transpose_16x16(in0, in1);
2151 idct16_8col(in0);
2152 idct16_8col(in1);
2153 }
2154
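// Same driver for the ADST variant: transpose, then a 16-point inverse ADST
// per 8x16 half.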
2155 void iadst16_sse2(__m128i *in0, __m128i *in1) {
2156 array_transpose_16x16(in0, in1);
2157 iadst16_8col(in0);
2158 iadst16_8col(in1);
2159 }
2160
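// Fast path for sparse blocks: the "_10" variant assumes at most the first 10
// coefficients (all inside the top-left 4x4 in scan order) are nonzero, so the
// first 1-D transform only has to cover that corner and the second pass can
// use the reduced IDCT16_10 macro.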
2161 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
2162 int stride) {
2163 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2164 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2165 const __m128i zero = _mm_setzero_si128();
2166
2167 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2168 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2169 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2170 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2171
2172 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2173 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2174
2175 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2176 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2177 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2178 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2179 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2180 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2181
2182 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2183 __m128i in[16], l[16];
2184 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2185 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2186 stp1_8_0, stp1_12_0;
2187 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2188 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2189 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2190 int i;
2191 // First 1-D inverse DCT
2192 // Load input data.
2193 in[0] = load_input_data(input);
2194 in[1] = load_input_data(input + 8 * 2);
2195 in[2] = load_input_data(input + 8 * 4);
2196 in[3] = load_input_data(input + 8 * 6);
2197
2198 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2199
2200 // Stage2
2201 {
2202 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2203 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2204
2205 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2206 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2207 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2208 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2209
2210 tmp0 = _mm_add_epi32(tmp0, rounding);
2211 tmp2 = _mm_add_epi32(tmp2, rounding);
2212 tmp5 = _mm_add_epi32(tmp5, rounding);
2213 tmp7 = _mm_add_epi32(tmp7, rounding);
2214
2215 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2216 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2217 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2218 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2219
2220 stp2_8 = _mm_packs_epi32(tmp0, tmp2);
2221 stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2222 }
2223
2224 // Stage3
2225 {
2226 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2227
2228 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2229 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2230
2231 tmp0 = _mm_add_epi32(tmp0, rounding);
2232 tmp2 = _mm_add_epi32(tmp2, rounding);
2233 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2234 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2235
2236 stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2237 stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2238
2239 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2240 }
2241
2242 // Stage4
2243 {
2244 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2245 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2246 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2247
2248 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2249 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2250 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2251 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2252 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2253 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2254
2255 tmp0 = _mm_add_epi32(tmp0, rounding);
2256 tmp2 = _mm_add_epi32(tmp2, rounding);
2257 tmp1 = _mm_add_epi32(tmp1, rounding);
2258 tmp3 = _mm_add_epi32(tmp3, rounding);
2259 tmp5 = _mm_add_epi32(tmp5, rounding);
2260 tmp7 = _mm_add_epi32(tmp7, rounding);
2261
2262 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2263 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2264 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2265 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2266 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2267 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2268
2269 stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2270 stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2271 stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2272 stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2273
2274 stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2275 }
2276
2277 // Stage5 and Stage6
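// Each register below carries two 4-wide stage values (low/high 64 bits);
// the epi64 unpacks split them back apart after the 16-bit add/sub
// butterflies.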
2278 {
2279 tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2280 tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2281 tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2282 tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2283
2284 stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
2285 stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2286 stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
2287 stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2288
2289 stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2290 stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2291 stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2292 stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2293 }
2294
2295 // Stage6
2296 {
2297 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2298 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2299 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2300
2301 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2302 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2303 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2304 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2305 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2306 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2307
2308 tmp1 = _mm_add_epi32(tmp1, rounding);
2309 tmp3 = _mm_add_epi32(tmp3, rounding);
2310 tmp0 = _mm_add_epi32(tmp0, rounding);
2311 tmp2 = _mm_add_epi32(tmp2, rounding);
2312 tmp4 = _mm_add_epi32(tmp4, rounding);
2313 tmp6 = _mm_add_epi32(tmp6, rounding);
2314
2315 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2316 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2317 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2318 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2319 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2320 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2321
2322 stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2323
2324 stp2_10 = _mm_packs_epi32(tmp0, zero);
2325 stp2_13 = _mm_packs_epi32(tmp2, zero);
2326 stp2_11 = _mm_packs_epi32(tmp4, zero);
2327 stp2_12 = _mm_packs_epi32(tmp6, zero);
2328
2329 tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2330 tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2331 tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2332 tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2333
2334 stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2335 stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2336 stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2337 stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2338 stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2339 stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2340 stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2341 stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2342 }
2343
2344 // Stage7. Left 8x16 only.
2345 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2346 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2347 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2348 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2349 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2350 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2351 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2352 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2353 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2354 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2355 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2356 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2357 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2358 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2359 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2360 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2361
2362 // Second 1-D inverse transform, performed per 8x16 block
2363 for (i = 0; i < 2; i++) {
2364 int j;
2365 array_transpose_4X8(l + 8 * i, in);
2366
2367 IDCT16_10
2368
2369 // Stage7
2370 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2371 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2372 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2373 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2374 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2375 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2376 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2377 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2378 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2379 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2380 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2381 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2382 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2383 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2384 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2385 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2386
2387 for (j = 0; j < 16; ++j) {
2388 // Final rounding and shift
2389 in[j] = _mm_adds_epi16(in[j], final_rounding);
2390 in[j] = _mm_srai_epi16(in[j], 6);
2391 RECON_AND_STORE(dest + j * stride, in[j]);
2392 }
2393
2394 dest += 8;
2395 }
2396 }
2397
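// Loads eight 16-bit coefficients into reg and advances the input pointer.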
2398 #define LOAD_DQCOEFF(reg, input) \
2399 { \
2400 reg = load_input_data(input); \
2401 input += 8; \
2402 } \
2403
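// 32-point IDCT specialized for the sparse (34-coefficient) case where only
// in[0..7] can be nonzero: rotations against a known-zero partner unpack with
// a zero register, and several stage outputs reduce to straight copies.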
2404 #define IDCT32_34 \
2405 /* Stage1 */ \
2406 { \
2407 const __m128i zero = _mm_setzero_si128(); \
2408 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2409 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2410 \
2411 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2412 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2413 \
2414 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2415 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2416 \
2417 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2418 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2419 \
2420 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2421 stg1_1, stp1_16, stp1_31); \
2422 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2423 stg1_7, stp1_19, stp1_28); \
2424 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2425 stg1_9, stp1_20, stp1_27); \
2426 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2427 stg1_15, stp1_23, stp1_24); \
2428 } \
2429 \
2430 /* Stage2 */ \
2431 { \
2432 const __m128i zero = _mm_setzero_si128(); \
2433 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2434 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2435 \
2436 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2437 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2438 \
2439 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2440 stg2_1, stp2_8, stp2_15); \
2441 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2442 stg2_7, stp2_11, stp2_12); \
2443 \
2444 stp2_16 = stp1_16; \
2445 stp2_19 = stp1_19; \
2446 \
2447 stp2_20 = stp1_20; \
2448 stp2_23 = stp1_23; \
2449 \
2450 stp2_24 = stp1_24; \
2451 stp2_27 = stp1_27; \
2452 \
2453 stp2_28 = stp1_28; \
2454 stp2_31 = stp1_31; \
2455 } \
2456 \
2457 /* Stage3 */ \
2458 { \
2459 const __m128i zero = _mm_setzero_si128(); \
2460 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2461 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2462 \
2463 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2464 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2465 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2466 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2467 \
2468 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2469 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2470 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2471 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2472 \
2473 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2474 stg3_1, stp1_4, stp1_7); \
2475 \
2476 stp1_8 = stp2_8; \
2477 stp1_11 = stp2_11; \
2478 stp1_12 = stp2_12; \
2479 stp1_15 = stp2_15; \
2480 \
2481 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2482 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2483 stp1_18, stp1_29) \
2484 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2485 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2486 stp1_22, stp1_25) \
2487 \
2488 stp1_16 = stp2_16; \
2489 stp1_31 = stp2_31; \
2490 stp1_19 = stp2_19; \
2491 stp1_20 = stp2_20; \
2492 stp1_23 = stp2_23; \
2493 stp1_24 = stp2_24; \
2494 stp1_27 = stp2_27; \
2495 stp1_28 = stp2_28; \
2496 } \
2497 \
2498 /* Stage4 */ \
2499 { \
2500 const __m128i zero = _mm_setzero_si128(); \
2501 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2502 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2503 \
2504 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2505 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2506 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2507 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2508 \
2509 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2510 stg4_1, stp2_0, stp2_1); \
2511 \
2512 stp2_4 = stp1_4; \
2513 stp2_5 = stp1_4; \
2514 stp2_6 = stp1_7; \
2515 stp2_7 = stp1_7; \
2516 \
2517 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2518 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2519 stp2_10, stp2_13) \
2520 \
2521 stp2_8 = stp1_8; \
2522 stp2_15 = stp1_15; \
2523 stp2_11 = stp1_11; \
2524 stp2_12 = stp1_12; \
2525 \
2526 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2527 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2528 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2529 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2530 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2531 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2532 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2533 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2534 \
2535 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2536 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2537 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2538 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2539 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2540 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2541 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2542 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2543 } \
2544 \
2545 /* Stage5 */ \
2546 { \
2547 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2548 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2549 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2550 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2551 \
2552 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2553 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2554 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2555 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2556 \
2557 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2558 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2559 \
2560 stp1_0 = stp2_0; \
2561 stp1_1 = stp2_1; \
2562 stp1_2 = stp2_1; \
2563 stp1_3 = stp2_0; \
2564 \
2565 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2566 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2567 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2568 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2569 \
2570 tmp0 = _mm_add_epi32(tmp0, rounding); \
2571 tmp1 = _mm_add_epi32(tmp1, rounding); \
2572 tmp2 = _mm_add_epi32(tmp2, rounding); \
2573 tmp3 = _mm_add_epi32(tmp3, rounding); \
2574 \
2575 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2576 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2577 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2578 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2579 \
2580 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2581 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2582 \
2583 stp1_4 = stp2_4; \
2584 stp1_7 = stp2_7; \
2585 \
2586 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2587 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2588 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2589 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2590 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2591 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2592 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2593 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2594 \
2595 stp1_16 = stp2_16; \
2596 stp1_17 = stp2_17; \
2597 \
2598 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2599 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2600 stp1_19, stp1_28) \
2601 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2602 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2603 stp1_21, stp1_26) \
2604 \
2605 stp1_22 = stp2_22; \
2606 stp1_23 = stp2_23; \
2607 stp1_24 = stp2_24; \
2608 stp1_25 = stp2_25; \
2609 stp1_30 = stp2_30; \
2610 stp1_31 = stp2_31; \
2611 } \
2612 \
2613 /* Stage6 */ \
2614 { \
2615 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2616 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2617 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2618 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2619 \
2620 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2621 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2622 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2623 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2624 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2625 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2626 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2627 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2628 \
2629 stp2_8 = stp1_8; \
2630 stp2_9 = stp1_9; \
2631 stp2_14 = stp1_14; \
2632 stp2_15 = stp1_15; \
2633 \
2634 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2635 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2636 stp2_13, stp2_11, stp2_12) \
2637 \
2638 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2639 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2640 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2641 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2642 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2643 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2644 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2645 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2646 \
2647 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2648 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2649 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2650 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2651 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2652 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2653 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2654 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2655 } \
2656 \
2657 /* Stage7 */ \
2658 { \
2659 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2660 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2661 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2662 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2663 \
2664 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2665 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2666 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2667 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2668 \
2669 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2670 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2671 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2672 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2673 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2674 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2675 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2676 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2677 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2678 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2679 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2680 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2681 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2682 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2683 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2684 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2685 \
2686 stp1_16 = stp2_16; \
2687 stp1_17 = stp2_17; \
2688 stp1_18 = stp2_18; \
2689 stp1_19 = stp2_19; \
2690 \
2691 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2692 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2693 stp1_21, stp1_26) \
2694 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2695 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2696 stp1_23, stp1_24) \
2697 \
2698 stp1_28 = stp2_28; \
2699 stp1_29 = stp2_29; \
2700 stp1_30 = stp2_30; \
2701 stp1_31 = stp2_31; \
2702 }
2703
2704
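// Full 32-point IDCT over in[0..31]: interleaved 16-bit pairs are rotated via
// MULTIPLICATION_AND_ADD (the madd / round / shift / pack sequence), and the
// remaining butterflies are plain 16-bit adds and subtracts.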
2705 #define IDCT32 \
2706 /* Stage1 */ \
2707 { \
2708 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2709 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2710 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2711 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2712 \
2713 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2714 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2715 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2716 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2717 \
2718 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2719 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2720 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2721 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2722 \
2723 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2724 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2725 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2726 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2727 \
2728 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2729 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2730 stp1_17, stp1_30) \
2731 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2732 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2733 stp1_19, stp1_28) \
2734 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2735 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2736 stp1_21, stp1_26) \
2737 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2738 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2739 stp1_23, stp1_24) \
2740 } \
2741 \
2742 /* Stage2 */ \
2743 { \
2744 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2745 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2746 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2747 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2748 \
2749 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2750 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2751 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2752 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2753 \
2754 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2755 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2756 stp2_14) \
2757 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2758 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2759 stp2_11, stp2_12) \
2760 \
2761 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2762 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2763 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2764 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2765 \
2766 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2767 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2768 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2769 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2770 \
2771 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2772 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2773 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2774 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2775 \
2776 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2777 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2778 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2779 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2780 } \
2781 \
2782 /* Stage3 */ \
2783 { \
2784 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2785 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2786 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2787 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2788 \
2789 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2790 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2791 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2792 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2793 \
2794 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2795 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2796 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2797 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2798 \
2799 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2800 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2801 stp1_6) \
2802 \
2803 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2804 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2805 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2806 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2807 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2808 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2809 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2810 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2811 \
2812 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2813 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2814 stp1_18, stp1_29) \
2815 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2816 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2817 stp1_22, stp1_25) \
2818 \
2819 stp1_16 = stp2_16; \
2820 stp1_31 = stp2_31; \
2821 stp1_19 = stp2_19; \
2822 stp1_20 = stp2_20; \
2823 stp1_23 = stp2_23; \
2824 stp1_24 = stp2_24; \
2825 stp1_27 = stp2_27; \
2826 stp1_28 = stp2_28; \
2827 } \
2828 \
2829 /* Stage4 */ \
2830 { \
2831 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2832 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2833 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2834 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2835 \
2836 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2837 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2838 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2839 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2840 \
2841 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2842 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2843 stp2_2, stp2_3) \
2844 \
2845 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2846 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2847 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2848 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2849 \
2850 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2851 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2852 stp2_10, stp2_13) \
2853 \
2854 stp2_8 = stp1_8; \
2855 stp2_15 = stp1_15; \
2856 stp2_11 = stp1_11; \
2857 stp2_12 = stp1_12; \
2858 \
2859 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2860 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2861 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2862 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2863 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2864 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2865 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2866 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2867 \
2868 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2869 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2870 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2871 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2872 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2873 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2874 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2875 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2876 } \
2877 \
2878 /* Stage5 */ \
2879 { \
2880 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2881 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2882 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2883 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2884 \
2885 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2886 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2887 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2888 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2889 \
2890 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2891 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2892 \
2893 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2894 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2895 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2896 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2897 \
2898 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2899 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2900 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2901 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2902 \
2903 tmp0 = _mm_add_epi32(tmp0, rounding); \
2904 tmp1 = _mm_add_epi32(tmp1, rounding); \
2905 tmp2 = _mm_add_epi32(tmp2, rounding); \
2906 tmp3 = _mm_add_epi32(tmp3, rounding); \
2907 \
2908 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2909 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2910 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2911 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2912 \
2913 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2914 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2915 \
2916 stp1_4 = stp2_4; \
2917 stp1_7 = stp2_7; \
2918 \
2919 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2920 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2921 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2922 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2923 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2924 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2925 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2926 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2927 \
2928 stp1_16 = stp2_16; \
2929 stp1_17 = stp2_17; \
2930 \
2931 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2932 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2933 stp1_19, stp1_28) \
2934 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2935 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2936 stp1_21, stp1_26) \
2937 \
2938 stp1_22 = stp2_22; \
2939 stp1_23 = stp2_23; \
2940 stp1_24 = stp2_24; \
2941 stp1_25 = stp2_25; \
2942 stp1_30 = stp2_30; \
2943 stp1_31 = stp2_31; \
2944 } \
2945 \
2946 /* Stage6 */ \
2947 { \
2948 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2949 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2950 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2951 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2952 \
2953 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2954 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2955 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2956 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2957 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2958 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2959 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2960 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2961 \
2962 stp2_8 = stp1_8; \
2963 stp2_9 = stp1_9; \
2964 stp2_14 = stp1_14; \
2965 stp2_15 = stp1_15; \
2966 \
2967 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2968 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2969 stp2_13, stp2_11, stp2_12) \
2970 \
2971 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2972 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2973 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2974 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2975 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2976 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2977 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2978 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2979 \
2980 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2981 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2982 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2983 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2984 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2985 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2986 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2987 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2988 } \
2989 \
2990 /* Stage7 */ \
2991 { \
2992 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2993 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2994 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2995 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2996 \
2997 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2998 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2999 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3000 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3001 \
3002 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3003 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3004 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3005 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3006 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3007 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3008 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3009 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3010 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3011 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3012 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3013 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3014 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3015 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3016 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3017 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3018 \
3019 stp1_16 = stp2_16; \
3020 stp1_17 = stp2_17; \
3021 stp1_18 = stp2_18; \
3022 stp1_19 = stp2_19; \
3023 \
3024 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3025 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3026 stp1_21, stp1_26) \
3027 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3028 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3029 stp1_23, stp1_24) \
3030 \
3031 stp1_28 = stp2_28; \
3032 stp1_29 = stp2_29; \
3033 stp1_30 = stp2_30; \
3034 stp1_31 = stp2_31; \
3035 }
3036
3037 // Only the upper-left 8x8 block has non-zero coefficients.
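// Only in[0..7] are loaded from the coefficient buffer; in[8..31] are
// cleared, and the reduced IDCT32_34 macro is used for both the row pass
// and the column pass.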
3038 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
3039 int stride) {
3040 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3041 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3042
3043 // idct constants for each stage
3044 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3045 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3046 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3047 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3048 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3049 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3050 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3051 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3052
3053 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3054 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3055 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3056 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3057
3058 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3059 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3060 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3061 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3062 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3063 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3064 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3065 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3066
3067 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3068 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3069 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3070 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3071 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3072
3073 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3074
3075 __m128i in[32], col[32];
3076 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3077 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3078 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3079 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3080 stp1_30, stp1_31;
3081 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3082 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3083 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3084 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3085 stp2_30, stp2_31;
3086 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3087 int i;
3088
3089 // Load input data. Only the top-left 8x8 block needs to be loaded.
3090 in[0] = load_input_data(input);
3091 in[1] = load_input_data(input + 32);
3092 in[2] = load_input_data(input + 64);
3093 in[3] = load_input_data(input + 96);
3094 in[4] = load_input_data(input + 128);
3095 in[5] = load_input_data(input + 160);
3096 in[6] = load_input_data(input + 192);
3097 in[7] = load_input_data(input + 224);
3098
3099 for (i = 8; i < 32; ++i) {
3100 in[i] = _mm_setzero_si128();
3101 }
3102
3103 array_transpose_8x8(in, in);
3104 // TODO(hkuang): The following transposes are unnecessary, but removing them
3105 // leads to a performance drop on some devices.
3106 array_transpose_8x8(in + 8, in + 8);
3107 array_transpose_8x8(in + 16, in + 16);
3108 array_transpose_8x8(in + 24, in + 24);
3109
3110 IDCT32_34
3111
3112 // 1_D: Store 32 intermediate results for each 8x32 block.
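// Output n of this pass is stp1_n + stp1_(31-n) for n < 16 and
// stp1_(31-n) - stp1_n for n >= 16, i.e. the final butterfly of the
// transform.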
3113 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3114 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3115 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3116 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3117 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3118 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3119 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3120 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3121 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3122 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3123 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3124 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3125 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3126 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3127 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3128 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3129 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3130 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3131 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3132 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3133 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3134 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3135 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3136 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3137 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3138 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3139 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3140 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3141 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3142 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3143 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3144 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3145 for (i = 0; i < 4; i++) {
3146 int j;
3147 const __m128i zero = _mm_setzero_si128();
3148 // Transpose 32x8 block to 8x32 block
3149 array_transpose_8x8(col + i * 8, in);
3150 IDCT32_34
3151
3152 // 2_D: Calculate the results and store them to destination.
3153 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3154 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3155 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3156 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3157 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3158 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3159 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3160 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3161 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3162 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3163 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3164 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3165 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3166 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3167 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3168 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3169 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3170 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3171 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3172 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3173 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3174 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3175 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3176 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3177 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3178 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3179 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3180 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3181 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3182 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3183 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3184 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3185
3186 for (j = 0; j < 32; ++j) {
3187 // Final rounding and shift
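// Adding 1 << 5 and shifting right by 6 implements ROUND_POWER_OF_TWO(x, 6),
// the final scaling of the 32x32 inverse transform.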
3188 in[j] = _mm_adds_epi16(in[j], final_rounding);
3189 in[j] = _mm_srai_epi16(in[j], 6);
3190 RECON_AND_STORE(dest + j * stride, in[j]);
3191 }
3192
3193 dest += 8;
3194 }
3195 }
3196
3197 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
3198 int stride) {
3199 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3200 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3201 const __m128i zero = _mm_setzero_si128();
3202
3203 // idct constants for each stage
3204 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3205 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3206 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3207 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3208 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3209 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3210 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3211 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3212 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3213 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3214 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3215 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3216 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3217 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3218 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3219 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3220
3221 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3222 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3223 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3224 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3225 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3226 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3227 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3228 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3229
3230 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3231 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3232 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3233 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3234 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3235 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3236 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3237 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3238 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3239 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3240
3241 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3242 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3243 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3244 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3245 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3246 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3247 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3248
3249 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3250
3251 __m128i in[32], col[128], zero_idx[16];
3252 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3253 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3254 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3255 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3256 stp1_30, stp1_31;
3257 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3258 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3259 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3260 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3261 stp2_30, stp2_31;
3262 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3263 int i, j, i32;
3264
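// The transform is processed as four 8x32 slices: the first pass below runs
// the 1-D idct on eight rows at a time and stores the results in col[]; the
// second pass gathers eight columns at a time from col[], runs the 1-D idct
// again and reconstructs the output pixels.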
3265 for (i = 0; i < 4; i++) {
3266 i32 = (i << 5);
3267 // First 1-D idct
3268 // Load input data.
3269 LOAD_DQCOEFF(in[0], input);
3270 LOAD_DQCOEFF(in[8], input);
3271 LOAD_DQCOEFF(in[16], input);
3272 LOAD_DQCOEFF(in[24], input);
3273 LOAD_DQCOEFF(in[1], input);
3274 LOAD_DQCOEFF(in[9], input);
3275 LOAD_DQCOEFF(in[17], input);
3276 LOAD_DQCOEFF(in[25], input);
3277 LOAD_DQCOEFF(in[2], input);
3278 LOAD_DQCOEFF(in[10], input);
3279 LOAD_DQCOEFF(in[18], input);
3280 LOAD_DQCOEFF(in[26], input);
3281 LOAD_DQCOEFF(in[3], input);
3282 LOAD_DQCOEFF(in[11], input);
3283 LOAD_DQCOEFF(in[19], input);
3284 LOAD_DQCOEFF(in[27], input);
3285
3286 LOAD_DQCOEFF(in[4], input);
3287 LOAD_DQCOEFF(in[12], input);
3288 LOAD_DQCOEFF(in[20], input);
3289 LOAD_DQCOEFF(in[28], input);
3290 LOAD_DQCOEFF(in[5], input);
3291 LOAD_DQCOEFF(in[13], input);
3292 LOAD_DQCOEFF(in[21], input);
3293 LOAD_DQCOEFF(in[29], input);
3294 LOAD_DQCOEFF(in[6], input);
3295 LOAD_DQCOEFF(in[14], input);
3296 LOAD_DQCOEFF(in[22], input);
3297 LOAD_DQCOEFF(in[30], input);
3298 LOAD_DQCOEFF(in[7], input);
3299 LOAD_DQCOEFF(in[15], input);
3300 LOAD_DQCOEFF(in[23], input);
3301 LOAD_DQCOEFF(in[31], input);
3302
3303 // Check whether all entries in this slice are zero.
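// OR the 32 input vectors together in a balanced tree so that a single
// vector (zero_idx[14]) is zero iff every coefficient in this slice is zero.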
3304 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3305 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3306 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3307 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3308 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3309 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3310 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3311 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3312 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3313 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3314 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3315 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3316 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3317 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3318 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3319 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3320
3321 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3322 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3323 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3324 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3325 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3326 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3327 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3328 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3329
3330 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3331 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3332 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3333 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3334 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3335 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3336 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3337
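// If every 32-bit lane of zero_idx[14] is zero, the comparison yields all
// ones and the movemask is 0xFFFF, so this 8x32 slice can be skipped.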
3338 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3339 col[i32 + 0] = _mm_setzero_si128();
3340 col[i32 + 1] = _mm_setzero_si128();
3341 col[i32 + 2] = _mm_setzero_si128();
3342 col[i32 + 3] = _mm_setzero_si128();
3343 col[i32 + 4] = _mm_setzero_si128();
3344 col[i32 + 5] = _mm_setzero_si128();
3345 col[i32 + 6] = _mm_setzero_si128();
3346 col[i32 + 7] = _mm_setzero_si128();
3347 col[i32 + 8] = _mm_setzero_si128();
3348 col[i32 + 9] = _mm_setzero_si128();
3349 col[i32 + 10] = _mm_setzero_si128();
3350 col[i32 + 11] = _mm_setzero_si128();
3351 col[i32 + 12] = _mm_setzero_si128();
3352 col[i32 + 13] = _mm_setzero_si128();
3353 col[i32 + 14] = _mm_setzero_si128();
3354 col[i32 + 15] = _mm_setzero_si128();
3355 col[i32 + 16] = _mm_setzero_si128();
3356 col[i32 + 17] = _mm_setzero_si128();
3357 col[i32 + 18] = _mm_setzero_si128();
3358 col[i32 + 19] = _mm_setzero_si128();
3359 col[i32 + 20] = _mm_setzero_si128();
3360 col[i32 + 21] = _mm_setzero_si128();
3361 col[i32 + 22] = _mm_setzero_si128();
3362 col[i32 + 23] = _mm_setzero_si128();
3363 col[i32 + 24] = _mm_setzero_si128();
3364 col[i32 + 25] = _mm_setzero_si128();
3365 col[i32 + 26] = _mm_setzero_si128();
3366 col[i32 + 27] = _mm_setzero_si128();
3367 col[i32 + 28] = _mm_setzero_si128();
3368 col[i32 + 29] = _mm_setzero_si128();
3369 col[i32 + 30] = _mm_setzero_si128();
3370 col[i32 + 31] = _mm_setzero_si128();
3371 continue;
3372 }
3373
3374 // Transpose 32x8 block to 8x32 block
3375 array_transpose_8x8(in, in);
3376 array_transpose_8x8(in + 8, in + 8);
3377 array_transpose_8x8(in + 16, in + 16);
3378 array_transpose_8x8(in + 24, in + 24);
3379
3380 IDCT32
3381
3382 // 1_D: Store 32 intermediate results for each 8x32 block.
3383 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3384 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3385 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3386 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3387 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3388 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3389 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3390 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3391 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3392 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3393 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3394 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3395 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3396 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3397 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3398 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3399 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3400 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3401 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3402 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3403 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3404 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3405 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3406 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3407 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3408 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3409 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3410 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3411 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3412 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3413 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3414 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3415 }
3416 for (i = 0; i < 4; i++) {
3417 // Second 1-D idct
3418 j = i << 3;
3419
3420 // Transpose 32x8 block to 8x32 block
3421 array_transpose_8x8(col + j, in);
3422 array_transpose_8x8(col + j + 32, in + 8);
3423 array_transpose_8x8(col + j + 64, in + 16);
3424 array_transpose_8x8(col + j + 96, in + 24);
3425
3426 IDCT32
3427
3428 // 2_D: Calculate the results and store them to destination.
3429 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3430 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3431 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3432 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3433 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3434 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3435 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3436 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3437 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3438 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3439 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3440 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3441 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3442 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3443 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3444 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3445 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3446 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3447 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3448 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3449 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3450 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3451 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3452 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3453 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3454 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3455 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3456 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3457 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3458 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3459 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3460 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3461
3462 for (j = 0; j < 32; ++j) {
3463 // Final rounding and shift
3464 in[j] = _mm_adds_epi16(in[j], final_rounding);
3465 in[j] = _mm_srai_epi16(in[j], 6);
3466 RECON_AND_STORE(dest + j * stride, in[j]);
3467 }
3468
3469 dest += 8;
3470 }
3471 }
3472
3473 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
3474 int stride) {
3475 __m128i dc_value;
3476 const __m128i zero = _mm_setzero_si128();
3477 int a, j;
3478
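// DC-only shortcut: apply the cospi_16_64 scaling once per 1-D pass plus the
// final rounding shift by 6, then add the single resulting value to every
// pixel of the 32x32 block.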
3479 a = dct_const_round_shift(input[0] * cospi_16_64);
3480 a = dct_const_round_shift(a * cospi_16_64);
3481 a = ROUND_POWER_OF_TWO(a, 6);
3482
3483 dc_value = _mm_set1_epi16(a);
3484
3485 for (j = 0; j < 32; ++j) {
3486 RECON_AND_STORE(dest + 0 + j * stride, dc_value);
3487 RECON_AND_STORE(dest + 8 + j * stride, dc_value);
3488 RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3489 RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3490 }
3491 }
3492
3493 #if CONFIG_VP9_HIGHBITDEPTH
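// Clamp eight 16-bit pixel values to the valid high-bitdepth range
// [0, (1 << bd) - 1].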
3494 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3495 __m128i ubounded, retval;
3496 const __m128i zero = _mm_set1_epi16(0);
3497 const __m128i one = _mm_set1_epi16(1);
3498 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3499 ubounded = _mm_cmpgt_epi16(value, max);
3500 retval = _mm_andnot_si128(ubounded, value);
3501 ubounded = _mm_and_si128(ubounded, max);
3502 retval = _mm_or_si128(retval, ubounded);
3503 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
3504 return retval;
3505 }
3506
3507 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3508 int stride, int bd) {
3509 tran_low_t out[4 * 4];
3510 tran_low_t *outptr = out;
3511 int i, j;
3512 __m128i inptr[4];
3513 __m128i sign_bits[2];
3514 __m128i temp_mm, min_input, max_input;
3515 int test;
3516 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3517 int optimised_cols = 0;
3518 const __m128i zero = _mm_set1_epi16(0);
3519 const __m128i eight = _mm_set1_epi16(8);
3520 const __m128i max = _mm_set1_epi16(12043);
3521 const __m128i min = _mm_set1_epi16(-12043);
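// Thresholds for the packed 16-bit fast path: if any coefficient lies outside
// [-12043, 12043], the code below falls back to the vpx_highbd_idct4_c path.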
3522 // Load input into __m128i
3523 inptr[0] = _mm_loadu_si128((const __m128i *)input);
3524 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3525 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3526 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
3527
3528 // Pack to 16 bits
3529 inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3530 inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
3531
3532 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3533 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3534 max_input = _mm_cmpgt_epi16(max_input, max);
3535 min_input = _mm_cmplt_epi16(min_input, min);
3536 temp_mm = _mm_or_si128(max_input, min_input);
3537 test = _mm_movemask_epi8(temp_mm);
3538
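// Three possible paths: if both the input and the row-transform output stay
// within the 16-bit bounds, the optimised SSE2 column transform is used; if
// only the input is in range, the SSE2 row result is written back to out[]
// and the C column transform finishes; otherwise both passes use the C
// implementation.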
3539 if (!test) {
3540 // Do the row transform
3541 idct4_sse2(inptr);
3542
3543 // Check the min & max values
3544 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3545 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3546 max_input = _mm_cmpgt_epi16(max_input, max);
3547 min_input = _mm_cmplt_epi16(min_input, min);
3548 temp_mm = _mm_or_si128(max_input, min_input);
3549 test = _mm_movemask_epi8(temp_mm);
3550
3551 if (test) {
3552 transpose_4x4(inptr);
3553 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3554 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3555 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3556 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3557 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3558 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3559 _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3560 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3561 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3562 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3563 } else {
3564 // Set to use the optimised transform for the column
3565 optimised_cols = 1;
3566 }
3567 } else {
3568 // Run the un-optimised row transform
3569 for (i = 0; i < 4; ++i) {
3570 vpx_highbd_idct4_c(input, outptr, bd);
3571 input += 4;
3572 outptr += 4;
3573 }
3574 }
3575
3576 if (optimised_cols) {
3577 idct4_sse2(inptr);
3578
3579 // Final round and shift
3580 inptr[0] = _mm_add_epi16(inptr[0], eight);
3581 inptr[1] = _mm_add_epi16(inptr[1], eight);
3582
3583 inptr[0] = _mm_srai_epi16(inptr[0], 4);
3584 inptr[1] = _mm_srai_epi16(inptr[1], 4);
3585
3586 // Reconstruction and Store
3587 {
3588 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3589 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3590 d0 = _mm_unpacklo_epi64(
3591 d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3592 d2 = _mm_unpacklo_epi64(
3593 d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3594 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3595 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
3596 // store input0
3597 _mm_storel_epi64((__m128i *)dest, d0);
3598 // store input1
3599 d0 = _mm_srli_si128(d0, 8);
3600 _mm_storel_epi64((__m128i *)(dest + stride), d0);
3601 // store input2
3602 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3603 // store input3
3604 d2 = _mm_srli_si128(d2, 8);
3605 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3606 }
3607 } else {
3608 // Run the un-optimised column transform
3609 tran_low_t temp_in[4], temp_out[4];
3610 // Columns
3611 for (i = 0; i < 4; ++i) {
3612 for (j = 0; j < 4; ++j)
3613 temp_in[j] = out[j * 4 + i];
3614 vpx_highbd_idct4_c(temp_in, temp_out, bd);
3615 for (j = 0; j < 4; ++j) {
3616 dest[j * stride + i] = highbd_clip_pixel_add(
3617 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3618 }
3619 }
3620 }
3621 }
3622
3623 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3624 int stride, int bd) {
3625 tran_low_t out[8 * 8];
3626 tran_low_t *outptr = out;
3627 int i, j, test;
3628 __m128i inptr[8];
3629 __m128i min_input, max_input, temp1, temp2, sign_bits;
3630 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3631 const __m128i zero = _mm_set1_epi16(0);
3632 const __m128i sixteen = _mm_set1_epi16(16);
3633 const __m128i max = _mm_set1_epi16(6201);
3634 const __m128i min = _mm_set1_epi16(-6201);
3635 int optimised_cols = 0;
3636
3637 // Load input into __m128i & pack to 16 bits
3638 for (i = 0; i < 8; i++) {
3639 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3640 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3641 inptr[i] = _mm_packs_epi32(temp1, temp2);
3642 }
3643
3644 // Find the min & max for the row transform
3645 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3646 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3647 for (i = 2; i < 8; i++) {
3648 max_input = _mm_max_epi16(max_input, inptr[i]);
3649 min_input = _mm_min_epi16(min_input, inptr[i]);
3650 }
3651 max_input = _mm_cmpgt_epi16(max_input, max);
3652 min_input = _mm_cmplt_epi16(min_input, min);
3653 temp1 = _mm_or_si128(max_input, min_input);
3654 test = _mm_movemask_epi8(temp1);
3655
3656 if (!test) {
3657 // Do the row transform
3658 idct8_sse2(inptr);
3659
3660 // Find the min & max for the column transform
3661 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3662 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3663 for (i = 2; i < 8; i++) {
3664 max_input = _mm_max_epi16(max_input, inptr[i]);
3665 min_input = _mm_min_epi16(min_input, inptr[i]);
3666 }
3667 max_input = _mm_cmpgt_epi16(max_input, max);
3668 min_input = _mm_cmplt_epi16(min_input, min);
3669 temp1 = _mm_or_si128(max_input, min_input);
3670 test = _mm_movemask_epi8(temp1);
3671
3672 if (test) {
3673 array_transpose_8x8(inptr, inptr);
3674 for (i = 0; i < 8; i++) {
3675 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3676 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3677 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3678 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3679 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3680 }
3681 } else {
3682 // Set to use the optimised transform for the column
3683 optimised_cols = 1;
3684 }
3685 } else {
3686 // Run the un-optimised row transform
3687 for (i = 0; i < 8; ++i) {
3688 vpx_highbd_idct8_c(input, outptr, bd);
3689 input += 8;
3690 outptr += 8;
3691 }
3692 }
3693
3694 if (optimised_cols) {
3695 idct8_sse2(inptr);
3696
3697 // Final rounding and shift, then reconstruction and store
3698 {
3699 __m128i d[8];
3700 for (i = 0; i < 8; i++) {
3701 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3702 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3703 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3704 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3705 // Store
3706 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
3707 }
3708 }
3709 } else {
3710 // Run the un-optimised column transform
3711 tran_low_t temp_in[8], temp_out[8];
3712 for (i = 0; i < 8; ++i) {
3713 for (j = 0; j < 8; ++j)
3714 temp_in[j] = out[j * 8 + i];
3715 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3716 for (j = 0; j < 8; ++j) {
3717 dest[j * stride + i] = highbd_clip_pixel_add(
3718 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3719 }
3720 }
3721 }
3722 }
3723
3724 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3725 int stride, int bd) {
3726 tran_low_t out[8 * 8] = { 0 };
3727 tran_low_t *outptr = out;
3728 int i, j, test;
3729 __m128i inptr[8];
3730 __m128i min_input, max_input, temp1, temp2, sign_bits;
3731 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3732 const __m128i zero = _mm_set1_epi16(0);
3733 const __m128i sixteen = _mm_set1_epi16(16);
3734 const __m128i max = _mm_set1_epi16(6201);
3735 const __m128i min = _mm_set1_epi16(-6201);
3736 int optimised_cols = 0;
3737
3738 // Load input into __m128i & pack to 16 bits
3739 for (i = 0; i < 8; i++) {
3740 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3741 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3742 inptr[i] = _mm_packs_epi32(temp1, temp2);
3743 }
3744
3745 // Find the min & max for the row transform
3746 // Only the first 4 rows have non-zero coeffs.
3747 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3748 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3749 for (i = 2; i < 4; i++) {
3750 max_input = _mm_max_epi16(max_input, inptr[i]);
3751 min_input = _mm_min_epi16(min_input, inptr[i]);
3752 }
3753 max_input = _mm_cmpgt_epi16(max_input, max);
3754 min_input = _mm_cmplt_epi16(min_input, min);
3755 temp1 = _mm_or_si128(max_input, min_input);
3756 test = _mm_movemask_epi8(temp1);
3757
3758 if (!test) {
3759 // Do the row transform
3760 idct8_sse2(inptr);
3761
3762 // Find the min & max for the column transform
3763 // N.B. Only first 4 cols contain non-zero coeffs
3764 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3765 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3766 for (i = 2; i < 8; i++) {
3767 max_input = _mm_max_epi16(max_input, inptr[i]);
3768 min_input = _mm_min_epi16(min_input, inptr[i]);
3769 }
3770 max_input = _mm_cmpgt_epi16(max_input, max);
3771 min_input = _mm_cmplt_epi16(min_input, min);
3772 temp1 = _mm_or_si128(max_input, min_input);
3773 test = _mm_movemask_epi8(temp1);
3774
3775 if (test) {
3776 // Use the fact that only the first 4 rows contain non-zero coeffs.
3777 array_transpose_4X8(inptr, inptr);
3778 for (i = 0; i < 4; i++) {
3779 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3780 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3781 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3782 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3783 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3784 }
3785 } else {
3786 // Set to use the optimised transform for the column
3787 optimised_cols = 1;
3788 }
3789 } else {
3790 // Run the un-optimised row transform
3791 for (i = 0; i < 4; ++i) {
3792 vpx_highbd_idct8_c(input, outptr, bd);
3793 input += 8;
3794 outptr += 8;
3795 }
3796 }
3797
3798 if (optimised_cols) {
3799 idct8_sse2(inptr);
3800
3801 // Final rounding and shift, then reconstruction and store
3802 {
3803 __m128i d[8];
3804 for (i = 0; i < 8; i++) {
3805 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3806 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3807 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3808 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3809 // Store
3810 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
3811 }
3812 }
3813 } else {
3814 // Run the un-optimised column transform
3815 tran_low_t temp_in[8], temp_out[8];
3816 for (i = 0; i < 8; ++i) {
3817 for (j = 0; j < 8; ++j)
3818 temp_in[j] = out[j * 8 + i];
3819 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3820 for (j = 0; j < 8; ++j) {
3821 dest[j * stride + i] = highbd_clip_pixel_add(
3822 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3823 }
3824 }
3825 }
3826 }
3827
3828 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3829 int stride, int bd) {
3830 tran_low_t out[16 * 16];
3831 tran_low_t *outptr = out;
3832 int i, j, test;
3833 __m128i inptr[32];
3834 __m128i min_input, max_input, temp1, temp2, sign_bits;
3835 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3836 const __m128i zero = _mm_set1_epi16(0);
3837 const __m128i rounding = _mm_set1_epi16(32);
3838 const __m128i max = _mm_set1_epi16(3155);
3839 const __m128i min = _mm_set1_epi16(-3155);
3840 int optimised_cols = 0;
3841
3842 // Load input into __m128i & pack to 16 bits
3843 for (i = 0; i < 16; i++) {
3844 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3845 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3846 inptr[i] = _mm_packs_epi32(temp1, temp2);
3847 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3848 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3849 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3850 }
3851
3852 // Find the min & max for the row transform
3853 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3854 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3855 for (i = 2; i < 32; i++) {
3856 max_input = _mm_max_epi16(max_input, inptr[i]);
3857 min_input = _mm_min_epi16(min_input, inptr[i]);
3858 }
3859 max_input = _mm_cmpgt_epi16(max_input, max);
3860 min_input = _mm_cmplt_epi16(min_input, min);
3861 temp1 = _mm_or_si128(max_input, min_input);
3862 test = _mm_movemask_epi8(temp1);
3863
3864 if (!test) {
3865 // Do the row transform
3866 idct16_sse2(inptr, inptr + 16);
3867
3868 // Find the min & max for the column transform
3869 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3870 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3871 for (i = 2; i < 32; i++) {
3872 max_input = _mm_max_epi16(max_input, inptr[i]);
3873 min_input = _mm_min_epi16(min_input, inptr[i]);
3874 }
3875 max_input = _mm_cmpgt_epi16(max_input, max);
3876 min_input = _mm_cmplt_epi16(min_input, min);
3877 temp1 = _mm_or_si128(max_input, min_input);
3878 test = _mm_movemask_epi8(temp1);
3879
3880 if (test) {
3881 array_transpose_16x16(inptr, inptr + 16);
3882 for (i = 0; i < 16; i++) {
3883 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3884 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3885 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3886 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3887 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3888 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3889 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3890 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3891 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3892 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3893 }
3894 } else {
3895 // Set to use the optimised transform for the column
3896 optimised_cols = 1;
3897 }
3898 } else {
3899 // Run the un-optimised row transform
3900 for (i = 0; i < 16; ++i) {
3901 vpx_highbd_idct16_c(input, outptr, bd);
3902 input += 16;
3903 outptr += 16;
3904 }
3905 }
3906
3907 if (optimised_cols) {
3908 idct16_sse2(inptr, inptr + 16);
3909
3910 // Final rounding and shift, then reconstruction and store
3911 {
3912 __m128i d[2];
3913 for (i = 0; i < 16; i++) {
3914 inptr[i] = _mm_add_epi16(inptr[i], rounding);
3915 inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
3916 d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3917 d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
3918 inptr[i] = _mm_srai_epi16(inptr[i], 6);
3919 inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
3920 d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
3921 d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
3922 // Store
3923 _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
3924 _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
3925 }
3926 }
3927 } else {
3928 // Run the un-optimised column transform
3929 tran_low_t temp_in[16], temp_out[16];
3930 for (i = 0; i < 16; ++i) {
3931 for (j = 0; j < 16; ++j)
3932 temp_in[j] = out[j * 16 + i];
3933 vpx_highbd_idct16_c(temp_in, temp_out, bd);
3934 for (j = 0; j < 16; ++j) {
3935 dest[j * stride + i] = highbd_clip_pixel_add(
3936 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
3937 }
3938 }
3939 }
3940 }
3941
3942 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3943 int stride, int bd) {
3944 tran_low_t out[16 * 16] = { 0 };
3945 tran_low_t *outptr = out;
3946 int i, j, test;
3947 __m128i inptr[32];
3948 __m128i min_input, max_input, temp1, temp2, sign_bits;
3949 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3950 const __m128i zero = _mm_set1_epi16(0);
3951 const __m128i rounding = _mm_set1_epi16(32);
3952 const __m128i max = _mm_set1_epi16(3155);
3953 const __m128i min = _mm_set1_epi16(-3155);
3954 int optimised_cols = 0;
3955
3956 // Load input into __m128i & pack to 16 bits
3957 for (i = 0; i < 16; i++) {
3958 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3959 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3960 inptr[i] = _mm_packs_epi32(temp1, temp2);
3961 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3962 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3963 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3964 }
3965
3966 // Find the min & max for the row transform
3967 // Since all non-zero dct coefficients are in the upper-left 4x4 area,
3968 // we only need to consider the first 4 rows here.
3969 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3970 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3971 for (i = 2; i < 4; i++) {
3972 max_input = _mm_max_epi16(max_input, inptr[i]);
3973 min_input = _mm_min_epi16(min_input, inptr[i]);
3974 }
3975 max_input = _mm_cmpgt_epi16(max_input, max);
3976 min_input = _mm_cmplt_epi16(min_input, min);
3977 temp1 = _mm_or_si128(max_input, min_input);
3978 test = _mm_movemask_epi8(temp1);
3979
3980 if (!test) {
3981 // Do the row transform (N.B. This transposes inptr)
3982 idct16_sse2(inptr, inptr + 16);
3983
3984 // Find the min & max for the column transform
3985 // N.B. Only first 4 cols contain non-zero coeffs
3986 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3987 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3988 for (i = 2; i < 16; i++) {
3989 max_input = _mm_max_epi16(max_input, inptr[i]);
3990 min_input = _mm_min_epi16(min_input, inptr[i]);
3991 }
3992 max_input = _mm_cmpgt_epi16(max_input, max);
3993 min_input = _mm_cmplt_epi16(min_input, min);
3994 temp1 = _mm_or_si128(max_input, min_input);
3995 test = _mm_movemask_epi8(temp1);
3996
3997 if (test) {
3998 // Use the fact that only the first 4 rows contain non-zero coeffs.
3999 array_transpose_8x8(inptr, inptr);
4000 array_transpose_8x8(inptr + 8, inptr + 16);
4001 for (i = 0; i < 4; i++) {
4002 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
4003 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
4004 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
4005 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
4006 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
4007 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
4008 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
4009 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
4010 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
4011 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
4012 }
4013 } else {
4014 // Set to use the optimised transform for the column
4015 optimised_cols = 1;
4016 }
4017 } else {
4018 // Run the un-optimised row transform
4019 for (i = 0; i < 4; ++i) {
4020 vpx_highbd_idct16_c(input, outptr, bd);
4021 input += 16;
4022 outptr += 16;
4023 }
4024 }
4025
4026 if (optimised_cols) {
4027 idct16_sse2(inptr, inptr + 16);
4028
4029 // Final rounding and shift, then reconstruction and store
4030 {
4031 __m128i d[2];
4032 for (i = 0; i < 16; i++) {
4033 inptr[i] = _mm_add_epi16(inptr[i], rounding);
4034 inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
4035 d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
4036 d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
4037 inptr[i] = _mm_srai_epi16(inptr[i], 6);
4038 inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
4039 d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
4040 d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
4041 // Store
4042 _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
4043 _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
4044 }
4045 }
4046 } else {
4047 // Run the un-optimised column transform
4048 tran_low_t temp_in[16], temp_out[16];
4049 for (i = 0; i < 16; ++i) {
4050 for (j = 0; j < 16; ++j)
4051 temp_in[j] = out[j * 16 + i];
4052 vpx_highbd_idct16_c(temp_in, temp_out, bd);
4053 for (j = 0; j < 16; ++j) {
4054 dest[j * stride + i] = highbd_clip_pixel_add(
4055 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4056 }
4057 }
4058 }
4059 }
4060 #endif // CONFIG_VP9_HIGHBITDEPTH
4061