/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

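// RECON_AND_STORE4X4 reconstructs one 4-pixel row: load 4 destination
// bytes, widen them to 16 bits against `zero`, add the residual row held in
// the low four lanes of in_x, saturate back to 8 bits, store the low 32
// bits, and advance dest by one row. It relies on `zero` and `stride` being
// in scope at the expansion site.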
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

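// 4x4 inverse DCT with all 16 coefficients. Each 1-D pass multiplies the
// shuffled inputs against the interleaved constant vector `cst` with
// _mm_madd_epi16, rounds with DCT_CONST_ROUNDING, shifts by DCT_CONST_BITS,
// and packs back to 16 bits; the result is bias-rounded by 8 and shifted
// right by 4 before being added to the prediction.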
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3; after the add/sub below we get
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3; after the add/sub below we get
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

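// DC-only 4x4 path: with just input[0] nonzero every residual sample is the
// same value, so it is computed once in scalar code (two cospi_16_64
// rotations with dct_const_round_shift, then the final round/shift by 4)
// and broadcast to all lanes.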
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

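// In-register 4x4 transpose for the two-rows-per-register layout
// (res[0] = rows 0-1, res[1] = rows 2-3); afterwards res[0] holds columns
// 0-1 and res[1] holds columns 2-3.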
static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

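// One 1-D pass of the 4-point inverse DCT on the layout above. It
// transposes first, so calling it twice performs the full 2-D transform.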
static void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

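// One 1-D pass of the 4-point inverse ADST, built from the sinpi_?_9
// constants; like idct4_sse2 it transposes first, so two calls give the
// full 2-D hybrid transform.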
static void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

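// 4x4 inverse hybrid transform: tx_type picks DCT or ADST for each
// direction, and the final round/shift and reconstruction match the pure
// DCT path above.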
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d0 = _mm_packus_epi16(d0, d2);
    // store result[0]
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store result[1]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store result[2]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    // store result[3]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}

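// Full 8x8 transpose via three rounds of 16-, 32- and 64-bit interleaves.
// All intermediates are computed before the outputs are written, so outN
// may alias inN.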
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
  }

#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Multiply pairs of elements by constants (the butterfly rotations via
// _mm_madd_epi16) and add them together, then round, shift, and pack the
// results back to 16 bits.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }

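// IDCT8 is one 1-D pass of the 8-point inverse DCT as a 4-stage butterfly:
// stages 1-2 rotate the odd and even halves, stage 3 recombines them, and
// stage 4 forms the final add/subtract pairs. It expects the stg*_*
// constants, `rounding`, and tmp0..tmp7 to exist at the expansion site.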
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                             stg1_1, stg1_2, stg1_3, stp1_4, \
                             stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                             stg2_1, stg2_2, stg2_3, stp2_0, \
                             stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }

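// 8-wide counterpart of RECON_AND_STORE4X4: add one 8-lane residual row to
// 8 destination pixels with unsigned saturation, then step to the next row.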
#define RECON_AND_STORE(dest, in_x) \
{ \
  __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  _mm_storel_epi64((__m128i *)(dest), d0); \
  dest += stride; \
}

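// Full 8x8 inverse DCT (all 64 coefficients): two transpose+IDCT8 passes,
// then a final rounding by 16 (1 << 4) and arithmetic shift right by 5
// before reconstruction.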
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

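// DC-only 8x8 path, analogous to vp9_idct4x4_1_add_sse2 but with a final
// round/shift by 5 and eight 8-wide stores.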
void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);

  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
}

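// One 1-D 8-point inverse DCT pass over an array of row registers:
// transpose, run the IDCT8 butterfly, and write the results back to in[].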
static void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}

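// One 1-D 8-point inverse ADST pass. The inputs are consumed in the
// permuted order set up below (in[7], in[0], in[5], ...), run through three
// stages of cospi rotations, and alternate outputs are negated at the end.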
static void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

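// 8x8 inverse hybrid transform: dispatch DCT vs ADST per direction on
// tx_type, then apply the same final round/shift and reconstruction as the
// pure DCT path.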
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1<<4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_sse2(in);
      idct8_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_sse2(in);
      iadst8_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_sse2(in);
      idct8_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_sse2(in);
      iadst8_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

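// 8x8 inverse DCT for the sparse (low-eob) case: only the top-left 4x4
// block of coefficients can be nonzero, so the first pass works on the four
// loaded rows with the *_10 transpose helpers and the second pass feeds
// zero for inputs 4-7.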
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

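// IDCT16 is one 1-D pass of the 16-point inverse DCT, covering stages 2-6
// of the butterfly; the caller performs the stage-7 adds/subtracts. The
// stg*_* constants and stp1_*/stp2_* temporaries must exist at the
// expansion site.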
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

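// Reduced 16-point pass for the low-eob case: only in[0..3] contribute, so
// several butterflies degenerate into plain copies of earlier stage
// outputs.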
#define IDCT16_10 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                           stg2_0, stg2_1, stg2_6, stg2_7, \
                           stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                             stg3_0, stg3_1, \
                             stp2_4, stp2_7) \
    \
    stp1_9 = stp1_8_0; \
    stp1_10 = stp1_11; \
    \
    stp1_13 = stp1_12_0; \
    stp1_14 = stp1_15; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                             stg4_0, stg4_1, \
                             stp1_0, stp1_1) \
    stp2_5 = stp2_4; \
    stp2_6 = stp2_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_2 = stp1_1; \
    stp1_3 = stp1_0; \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

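// Full 16x16 inverse DCT: the first loop runs the row transform over the
// two 8-row halves into l[] and r[]; the second loop runs the column
// transform 8 columns at a time, rounds by 32 (1 << 5), shifts right by 6,
// and reconstructs into dest.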
vp9_idct16x16_256_add_sse2(const int16_t * input,uint8_t * dest,int stride)1375 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1376 int stride) {
1377 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = _mm_load_si128((const __m128i *)input);
    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // Stage7: the final butterflies complete the 2-D result.
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);

    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);

    dest += 8 - (stride * 16);
  }
}

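// vp9_idct16x16_1_add_sse2(): DC-only shortcut.  When only the DC
// coefficient is non-zero, both 1-D passes reduce to a multiply by
// cospi_16_64, so the whole 16x16 inverse transform collapses to a single
// value added to every predicted pixel.  A scalar sketch of the same
// arithmetic (dct_const_round_shift, ROUND_POWER_OF_TWO and clip_pixel are
// the helpers from vp9/common/vp9_idct.h and vp9/common/vp9_common.h):
//
//   int dc = dct_const_round_shift(input[0] * cospi_16_64);  // row pass
//   dc = dct_const_round_shift(dc * cospi_16_64);            // column pass
//   dc = ROUND_POWER_OF_TWO(dc, 6);                          // final shift
//   for (r = 0; r < 16; ++r)
//     for (c = 0; c < 16; ++c)
//       dest[r * stride + c] = clip_pixel(dest[r * stride + c] + dc);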
void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 16);
  }
}

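// Transpose a 16x16 block stored as two 8x16 halves (res0 = left eight
// columns, res1 = right eight columns).  The two diagonal 8x8 quadrants
// are transposed in place while the off-diagonal quadrants swap sides;
// tbuf keeps the transposed upper-right quadrant alive until res0's lower
// half can be overwritten with it.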
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];
  res0[11] = tbuf[3];
  res0[12] = tbuf[4];
  res0[13] = tbuf[5];
  res0[14] = tbuf[6];
  res0[15] = tbuf[7];
}

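// iadst16_8col() runs a 16-point 1-D inverse ADST down 8 columns at once;
// in[k] holds row k of the 8x16 working block (eight 16-bit columns).
// Every butterfly below uses the same SSE2 idiom: interleave two rows with
// _mm_unpacklo/hi_epi16, multiply-accumulate against a pair_set_epi16()
// constant with _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by
// DCT_CONST_BITS, then pack back to 16 bits with saturation.  Per lane
// this is equivalent to the scalar helper from vp9_idct.h:
//
//   out = dct_const_round_shift(a * c0 + b * c1);
//
// where (c0, c1) is the constant pair and (a, b) the interleaved inputs.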
static void iadst16_8col(__m128i *in) {
  // perform a 16-point 1-D ADST on each of 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  // stage 1
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

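// idct16_8col() is the matching 16-point 1-D inverse DCT for 8 columns.
// The seven stages below follow the scalar idct16 butterfly network from
// vp9/common/vp9_idct.c: stage 1 reorders the inputs into bit-reversed
// order, stages 2-6 apply the rotation and butterfly pairs, and stage 7
// forms the final sums and differences of the half-length results.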
static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

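// The 2-D transforms below are built from the 1-D kernels above: each
// call transposes the 16x16 block (held as two 8x16 halves in0/in1) and
// then applies the 1-D transform down the columns of both halves, so two
// calls in sequence yield the full row-column transform.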
static void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

static void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}

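// Load one 8x16 half of a 16x16 coefficient block (pass input + 8 for the
// right half); the coefficients are laid out row-major with a stride of 16
// and must be 16-byte aligned for _mm_load_si128().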
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));

  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
  in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
  in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
  in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
  in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
  in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
  in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
  in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}

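// Round, shift and reconstruct one 8x16 half.  The two 1-D passes leave
// the residual scaled by 2^6, so each value is rounded with +(1 << 5)
// (using a saturating add to avoid wrap-around) and arithmetically shifted
// right by 6 before RECON_AND_STORE() adds it to the prediction with
// unsigned saturation.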
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
  RECON_AND_STORE(dest, in[8]);
  RECON_AND_STORE(dest, in[9]);
  RECON_AND_STORE(dest, in[10]);
  RECON_AND_STORE(dest, in[11]);
  RECON_AND_STORE(dest, in[12]);
  RECON_AND_STORE(dest, in[13]);
  RECON_AND_STORE(dest, in[14]);
  RECON_AND_STORE(dest, in[15]);
}

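// Hybrid 16x16 inverse transform.  tx_type selects the transform pair:
// 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.  Because each
// *_sse2 kernel transposes before transforming, the first call processes
// one dimension and the second call the other.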
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_sse2(in0, in1);
      idct16_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_sse2(in0, in1);
      iadst16_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}

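// 16x16 inverse DCT for sparse blocks: the "_10" variant may assume that
// only the first few (at most 10) coefficients, all inside the top-left
// 4x4 corner, are non-zero.  The first pass therefore loads just four
// 8-coefficient rows and uses a 4-wide transpose, and the simplified
// IDCT16_10 stages skip butterflies whose inputs are known to be zero.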
vp9_idct16x16_10_add_sse2(const int16_t * input,uint8_t * dest,int stride)2530 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2531 int stride) {
2532 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2533 const __m128i final_rounding = _mm_set1_epi16(1<<5);
2534 const __m128i zero = _mm_setzero_si128();
2535
2536 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2537 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2538 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2539 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2540
2541 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2542 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2543
2544 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2545 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2546 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2547 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2548 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2549 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2550
2551 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2552 __m128i in[16], l[16];
2553 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2554 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2555 stp1_8_0, stp1_12_0;
2556 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2557 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2558 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2559 int i;
2560 // First 1-D inverse DCT
2561 // Load input data.
2562 in[0] = _mm_load_si128((const __m128i *)input);
2563 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2564 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2565 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2566
2567 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2568
2569 // Stage2
2570 {
2571 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2572 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2573
2574 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2575 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2576 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2577 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2578
2579 tmp0 = _mm_add_epi32(tmp0, rounding);
2580 tmp2 = _mm_add_epi32(tmp2, rounding);
2581 tmp5 = _mm_add_epi32(tmp5, rounding);
2582 tmp7 = _mm_add_epi32(tmp7, rounding);
2583
2584 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2585 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2586 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2587 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2588
2589 stp2_8 = _mm_packs_epi32(tmp0, tmp2);
2590 stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2591 }
2592
2593 // Stage3
2594 {
2595 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2596
2597 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2598 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2599
2600 tmp0 = _mm_add_epi32(tmp0, rounding);
2601 tmp2 = _mm_add_epi32(tmp2, rounding);
2602 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2603 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2604
2605 stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2606 stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2607
2608 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2609 }
2610
2611 // Stage4
2612 {
2613 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2614 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2615 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2616
2617 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2618 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2619 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2620 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2621 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2622 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2623
2624 tmp0 = _mm_add_epi32(tmp0, rounding);
2625 tmp2 = _mm_add_epi32(tmp2, rounding);
2626 tmp1 = _mm_add_epi32(tmp1, rounding);
2627 tmp3 = _mm_add_epi32(tmp3, rounding);
2628 tmp5 = _mm_add_epi32(tmp5, rounding);
2629 tmp7 = _mm_add_epi32(tmp7, rounding);
2630
2631 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2632 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2633 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2634 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2635 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2636 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2637
2638 stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2639 stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2640 stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2641 stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2642
2643 stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2644 }
2645
2646 // Stage5 and Stage6
2647 {
2648 tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2649 tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2650 tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2651 tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2652
2653 stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
2654 stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2655 stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
2656 stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2657
2658 stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2659 stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2660 stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2661 stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2662 }
2663
2664 // Stage6
2665 {
2666 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2667 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2668 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2669
2670 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2671 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2672 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2673 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2674 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2675 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2676
2677 tmp1 = _mm_add_epi32(tmp1, rounding);
2678 tmp3 = _mm_add_epi32(tmp3, rounding);
2679 tmp0 = _mm_add_epi32(tmp0, rounding);
2680 tmp2 = _mm_add_epi32(tmp2, rounding);
2681 tmp4 = _mm_add_epi32(tmp4, rounding);
2682 tmp6 = _mm_add_epi32(tmp6, rounding);
2683
2684 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2685 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2686 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2687 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2688 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2689 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2690
2691 stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2692
2693 stp2_10 = _mm_packs_epi32(tmp0, zero);
2694 stp2_13 = _mm_packs_epi32(tmp2, zero);
2695 stp2_11 = _mm_packs_epi32(tmp4, zero);
2696 stp2_12 = _mm_packs_epi32(tmp6, zero);
2697
2698 tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2699 tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2700 tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2701 tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2702
2703 stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2704 stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2705 stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2706 stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2707 stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2708 stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2709 stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2710 stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2711 }
2712
2713 // Stage7. Left 8x16 only.
2714 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2715 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2716 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2717 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2718 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2719 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2720 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2721 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2722 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2723 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2724 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2725 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2726 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2727 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2728 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2729 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2730
2731 // Second 1-D inverse transform, performed per 8x16 block
2732 for (i = 0; i < 2; i++) {
2733 array_transpose_4X8(l + 8*i, in);
2734
2735 IDCT16_10
2736
2737 // Stage7
2738 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2739 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2740 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2741 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2742 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2743 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2744 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2745 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2746 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2747 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2748 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2749 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2750 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2751 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2752 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2753 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2754
2755 // Final rounding and shift
2756 in[0] = _mm_adds_epi16(in[0], final_rounding);
2757 in[1] = _mm_adds_epi16(in[1], final_rounding);
2758 in[2] = _mm_adds_epi16(in[2], final_rounding);
2759 in[3] = _mm_adds_epi16(in[3], final_rounding);
2760 in[4] = _mm_adds_epi16(in[4], final_rounding);
2761 in[5] = _mm_adds_epi16(in[5], final_rounding);
2762 in[6] = _mm_adds_epi16(in[6], final_rounding);
2763 in[7] = _mm_adds_epi16(in[7], final_rounding);
2764 in[8] = _mm_adds_epi16(in[8], final_rounding);
2765 in[9] = _mm_adds_epi16(in[9], final_rounding);
2766 in[10] = _mm_adds_epi16(in[10], final_rounding);
2767 in[11] = _mm_adds_epi16(in[11], final_rounding);
2768 in[12] = _mm_adds_epi16(in[12], final_rounding);
2769 in[13] = _mm_adds_epi16(in[13], final_rounding);
2770 in[14] = _mm_adds_epi16(in[14], final_rounding);
2771 in[15] = _mm_adds_epi16(in[15], final_rounding);
2772
2773 in[0] = _mm_srai_epi16(in[0], 6);
2774 in[1] = _mm_srai_epi16(in[1], 6);
2775 in[2] = _mm_srai_epi16(in[2], 6);
2776 in[3] = _mm_srai_epi16(in[3], 6);
2777 in[4] = _mm_srai_epi16(in[4], 6);
2778 in[5] = _mm_srai_epi16(in[5], 6);
2779 in[6] = _mm_srai_epi16(in[6], 6);
2780 in[7] = _mm_srai_epi16(in[7], 6);
2781 in[8] = _mm_srai_epi16(in[8], 6);
2782 in[9] = _mm_srai_epi16(in[9], 6);
2783 in[10] = _mm_srai_epi16(in[10], 6);
2784 in[11] = _mm_srai_epi16(in[11], 6);
2785 in[12] = _mm_srai_epi16(in[12], 6);
2786 in[13] = _mm_srai_epi16(in[13], 6);
2787 in[14] = _mm_srai_epi16(in[14], 6);
2788 in[15] = _mm_srai_epi16(in[15], 6);
2789
2790 RECON_AND_STORE(dest, in[0]);
2791 RECON_AND_STORE(dest, in[1]);
2792 RECON_AND_STORE(dest, in[2]);
2793 RECON_AND_STORE(dest, in[3]);
2794 RECON_AND_STORE(dest, in[4]);
2795 RECON_AND_STORE(dest, in[5]);
2796 RECON_AND_STORE(dest, in[6]);
2797 RECON_AND_STORE(dest, in[7]);
2798 RECON_AND_STORE(dest, in[8]);
2799 RECON_AND_STORE(dest, in[9]);
2800 RECON_AND_STORE(dest, in[10]);
2801 RECON_AND_STORE(dest, in[11]);
2802 RECON_AND_STORE(dest, in[12]);
2803 RECON_AND_STORE(dest, in[13]);
2804 RECON_AND_STORE(dest, in[14]);
2805 RECON_AND_STORE(dest, in[15]);
2806
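// Rewind 16 rows and step 8 pixels right so the next iteration
// reconstructs the adjacent 8-column strip.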
2807 dest += 8 - (stride * 16);
2808 }
2809 }
2810
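// LOAD_DQCOEFF reads eight 16-bit dequantized coefficients into `reg` and
// advances `input` past them, so consecutive invocations walk the dqcoeff
// buffer in 8-coefficient steps.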
2811 #define LOAD_DQCOEFF(reg, input) \
2812 { \
2813 reg = _mm_load_si128((const __m128i *) input); \
2814 input += 8; \
2815 }
2816
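// IDCT32_34 is the 32-point 1-D inverse transform specialized for blocks
// where only the first 34 coefficients (the upper-left 8x8) can be
// non-zero: known-zero partners are unpacked against a zero register, and
// MULTIPLICATION_AND_ADD_2 replaces the full butterfly where only two
// outputs are live.  For reference, each butterfly output is, per 16-bit
// lane (a sketch):
//   out = saturate16((x * c0 + y * c1 + DCT_CONST_ROUNDING)
//                    >> DCT_CONST_BITS);
// where saturate16() stands for the signed saturation done by
// _mm_packs_epi32 and the (c0, c1) pairs come from the stg*_* constants.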
2817 #define IDCT32_34 \
2818 /* Stage1 */ \
2819 { \
2820 const __m128i zero = _mm_setzero_si128(); \
2821 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2822 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2823 \
2824 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2825 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2826 \
2827 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2828 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2829 \
2830 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2831 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2832 \
2833 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2834 stg1_1, stp1_16, stp1_31); \
2835 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2836 stg1_7, stp1_19, stp1_28); \
2837 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2838 stg1_9, stp1_20, stp1_27); \
2839 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2840 stg1_15, stp1_23, stp1_24); \
2841 } \
2842 \
2843 /* Stage2 */ \
2844 { \
2845 const __m128i zero = _mm_setzero_si128(); \
2846 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2847 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2848 \
2849 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2850 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2851 \
2852 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2853 stg2_1, stp2_8, stp2_15); \
2854 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2855 stg2_7, stp2_11, stp2_12); \
2856 \
2857 stp2_16 = stp1_16; \
2858 stp2_19 = stp1_19; \
2859 \
2860 stp2_20 = stp1_20; \
2861 stp2_23 = stp1_23; \
2862 \
2863 stp2_24 = stp1_24; \
2864 stp2_27 = stp1_27; \
2865 \
2866 stp2_28 = stp1_28; \
2867 stp2_31 = stp1_31; \
2868 } \
2869 \
2870 /* Stage3 */ \
2871 { \
2872 const __m128i zero = _mm_setzero_si128(); \
2873 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2874 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2875 \
2876 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2877 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2878 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2879 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2880 \
2881 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2882 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2883 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2884 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
2885 \
2886 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2887 stg3_1, stp1_4, stp1_7); \
2888 \
2889 stp1_8 = stp2_8; \
2890 stp1_11 = stp2_11; \
2891 stp1_12 = stp2_12; \
2892 stp1_15 = stp2_15; \
2893 \
2894 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2895 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2896 stp1_18, stp1_29) \
2897 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2898 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2899 stp1_22, stp1_25) \
2900 \
2901 stp1_16 = stp2_16; \
2902 stp1_31 = stp2_31; \
2903 stp1_19 = stp2_19; \
2904 stp1_20 = stp2_20; \
2905 stp1_23 = stp2_23; \
2906 stp1_24 = stp2_24; \
2907 stp1_27 = stp2_27; \
2908 stp1_28 = stp2_28; \
2909 } \
2910 \
2911 /* Stage4 */ \
2912 { \
2913 const __m128i zero = _mm_setzero_si128(); \
2914 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2915 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2916 \
2917 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2918 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2919 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2920 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2921 \
2922 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2923 stg4_1, stp2_0, stp2_1); \
2924 \
2925 stp2_4 = stp1_4; \
2926 stp2_5 = stp1_4; \
2927 stp2_6 = stp1_7; \
2928 stp2_7 = stp1_7; \
2929 \
2930 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2931 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2932 stp2_10, stp2_13) \
2933 \
2934 stp2_8 = stp1_8; \
2935 stp2_15 = stp1_15; \
2936 stp2_11 = stp1_11; \
2937 stp2_12 = stp1_12; \
2938 \
2939 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2940 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2941 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2942 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2943 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2944 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2945 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2946 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2947 \
2948 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2949 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2950 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2951 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2952 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2953 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2954 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2955 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2956 } \
2957 \
2958 /* Stage5 */ \
2959 { \
2960 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2961 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2962 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2963 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2964 \
2965 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2966 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2967 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2968 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2969 \
2970 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2971 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2972 \
2973 stp1_0 = stp2_0; \
2974 stp1_1 = stp2_1; \
2975 stp1_2 = stp2_1; \
2976 stp1_3 = stp2_0; \
2977 \
2978 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2979 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2980 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2981 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2982 \
2983 tmp0 = _mm_add_epi32(tmp0, rounding); \
2984 tmp1 = _mm_add_epi32(tmp1, rounding); \
2985 tmp2 = _mm_add_epi32(tmp2, rounding); \
2986 tmp3 = _mm_add_epi32(tmp3, rounding); \
2987 \
2988 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2989 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2990 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2991 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2992 \
2993 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2994 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2995 \
2996 stp1_4 = stp2_4; \
2997 stp1_7 = stp2_7; \
2998 \
2999 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3000 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3001 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3002 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3003 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3004 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3005 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3006 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3007 \
3008 stp1_16 = stp2_16; \
3009 stp1_17 = stp2_17; \
3010 \
3011 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3012 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3013 stp1_19, stp1_28) \
3014 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3015 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3016 stp1_21, stp1_26) \
3017 \
3018 stp1_22 = stp2_22; \
3019 stp1_23 = stp2_23; \
3020 stp1_24 = stp2_24; \
3021 stp1_25 = stp2_25; \
3022 stp1_30 = stp2_30; \
3023 stp1_31 = stp2_31; \
3024 } \
3025 \
3026 /* Stage6 */ \
3027 { \
3028 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3029 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3030 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3031 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3032 \
3033 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3034 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3035 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3036 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3037 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3038 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3039 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3040 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3041 \
3042 stp2_8 = stp1_8; \
3043 stp2_9 = stp1_9; \
3044 stp2_14 = stp1_14; \
3045 stp2_15 = stp1_15; \
3046 \
3047 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3048 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3049 stp2_13, stp2_11, stp2_12) \
3050 \
3051 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3052 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3053 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3054 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3055 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3056 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3057 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3058 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3059 \
3060 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3061 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3062 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3063 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3064 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3065 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3066 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3067 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3068 } \
3069 \
3070 /* Stage7 */ \
3071 { \
3072 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3073 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3074 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3075 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3076 \
3077 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3078 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3079 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3080 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3081 \
3082 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3083 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3084 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3085 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3086 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3087 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3088 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3089 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3090 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3091 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3092 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3093 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3094 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3095 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3096 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3097 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3098 \
3099 stp1_16 = stp2_16; \
3100 stp1_17 = stp2_17; \
3101 stp1_18 = stp2_18; \
3102 stp1_19 = stp2_19; \
3103 \
3104 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3105 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3106 stp1_21, stp1_26) \
3107 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3108 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3109 stp1_23, stp1_24) \
3110 \
3111 stp1_28 = stp2_28; \
3112 stp1_29 = stp2_29; \
3113 stp1_30 = stp2_30; \
3114 stp1_31 = stp2_31; \
3115 }
3116
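// IDCT32 is the general 32-point 1-D inverse transform over seven butterfly
// stages; unlike IDCT32_34 it makes no zero-coefficient assumptions, so all
// 32 input vectors participate in every stage.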
3118 #define IDCT32 \
3119 /* Stage1 */ \
3120 { \
3121 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
3122 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
3123 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
3124 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
3125 \
3126 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
3127 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
3128 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
3129 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
3130 \
3131 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
3132 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
3133 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
3134 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
3135 \
3136 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
3137 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
3138 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
3139 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
3140 \
3141 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
3142 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
3143 stp1_17, stp1_30) \
3144 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
3145 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
3146 stp1_19, stp1_28) \
3147 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
3148 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
3149 stp1_21, stp1_26) \
3150 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
3151 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
3152 stp1_23, stp1_24) \
3153 } \
3154 \
3155 /* Stage2 */ \
3156 { \
3157 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
3158 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
3159 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
3160 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
3161 \
3162 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
3163 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
3164 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
3165 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
3166 \
3167 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
3168 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
3169 stp2_14) \
3170 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
3171 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
3172 stp2_11, stp2_12) \
3173 \
3174 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
3175 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
3176 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
3177 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
3178 \
3179 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
3180 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
3181 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
3182 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
3183 \
3184 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
3185 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
3186 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
3187 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
3188 \
3189 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
3190 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
3191 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
3192 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
3193 } \
3194 \
3195 /* Stage3 */ \
3196 { \
3197 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
3198 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
3199 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
3200 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
3201 \
3202 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
3203 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
3204 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3205 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3206 \
3207 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3208 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3209 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3210 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3211 \
3212 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
3213 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
3214 stp1_6) \
3215 \
3216 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
3217 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
3218 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
3219 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
3220 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
3221 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
3222 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
3223 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
3224 \
3225 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
3226 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
3227 stp1_18, stp1_29) \
3228 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
3229 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
3230 stp1_22, stp1_25) \
3231 \
3232 stp1_16 = stp2_16; \
3233 stp1_31 = stp2_31; \
3234 stp1_19 = stp2_19; \
3235 stp1_20 = stp2_20; \
3236 stp1_23 = stp2_23; \
3237 stp1_24 = stp2_24; \
3238 stp1_27 = stp2_27; \
3239 stp1_28 = stp2_28; \
3240 } \
3241 \
3242 /* Stage4 */ \
3243 { \
3244 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
3245 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
3246 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
3247 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
3248 \
3249 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
3250 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
3251 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3252 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3253 \
3254 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
3255 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
3256 stp2_2, stp2_3) \
3257 \
3258 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
3259 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
3260 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
3261 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
3262 \
3263 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
3264 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
3265 stp2_10, stp2_13) \
3266 \
3267 stp2_8 = stp1_8; \
3268 stp2_15 = stp1_15; \
3269 stp2_11 = stp1_11; \
3270 stp2_12 = stp1_12; \
3271 \
3272 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
3273 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
3274 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
3275 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
3276 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
3277 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
3278 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
3279 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
3280 \
3281 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
3282 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
3283 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
3284 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
3285 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
3286 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
3287 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
3288 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
3289 } \
3290 \
3291 /* Stage5 */ \
3292 { \
3293 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
3294 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
3295 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
3296 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
3297 \
3298 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
3299 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
3300 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3301 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3302 \
3303 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3304 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3305 \
3306 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
3307 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
3308 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
3309 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
3310 \
3311 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
3312 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
3313 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
3314 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
3315 \
3316 tmp0 = _mm_add_epi32(tmp0, rounding); \
3317 tmp1 = _mm_add_epi32(tmp1, rounding); \
3318 tmp2 = _mm_add_epi32(tmp2, rounding); \
3319 tmp3 = _mm_add_epi32(tmp3, rounding); \
3320 \
3321 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
3322 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
3323 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
3324 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
3325 \
3326 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
3327 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
3328 \
3329 stp1_4 = stp2_4; \
3330 stp1_7 = stp2_7; \
3331 \
3332 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3333 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3334 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3335 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3336 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3337 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3338 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3339 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3340 \
3341 stp1_16 = stp2_16; \
3342 stp1_17 = stp2_17; \
3343 \
3344 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3345 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3346 stp1_19, stp1_28) \
3347 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3348 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3349 stp1_21, stp1_26) \
3350 \
3351 stp1_22 = stp2_22; \
3352 stp1_23 = stp2_23; \
3353 stp1_24 = stp2_24; \
3354 stp1_25 = stp2_25; \
3355 stp1_30 = stp2_30; \
3356 stp1_31 = stp2_31; \
3357 } \
3358 \
3359 /* Stage6 */ \
3360 { \
3361 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3362 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3363 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3364 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3365 \
3366 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3367 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3368 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3369 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3370 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3371 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3372 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3373 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3374 \
3375 stp2_8 = stp1_8; \
3376 stp2_9 = stp1_9; \
3377 stp2_14 = stp1_14; \
3378 stp2_15 = stp1_15; \
3379 \
3380 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3381 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3382 stp2_13, stp2_11, stp2_12) \
3383 \
3384 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3385 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3386 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3387 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3388 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3389 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3390 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3391 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3392 \
3393 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3394 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3395 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3396 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3397 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3398 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3399 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3400 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3401 } \
3402 \
3403 /* Stage7 */ \
3404 { \
3405 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3406 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3407 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3408 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3409 \
3410 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3411 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3412 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3413 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3414 \
3415 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3416 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3417 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3418 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3419 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3420 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3421 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3422 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3423 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3424 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3425 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3426 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3427 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3428 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3429 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3430 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3431 \
3432 stp1_16 = stp2_16; \
3433 stp1_17 = stp2_17; \
3434 stp1_18 = stp2_18; \
3435 stp1_19 = stp2_19; \
3436 \
3437 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3438 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3439 stp1_21, stp1_26) \
3440 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3441 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3442 stp1_23, stp1_24) \
3443 \
3444 stp1_28 = stp2_28; \
3445 stp1_29 = stp2_29; \
3446 stp1_30 = stp2_30; \
3447 stp1_31 = stp2_31; \
3448 }
3449
3450 // Only the upper-left 8x8 block has non-zero coefficients.
3451 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3452 int stride) {
3453 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3454 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3455
3456 // idct constants for each stage
3457 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3458 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3459 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3460 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3461 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3462 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3463 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3464 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3465 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3466 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3467 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3468 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3469 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3470 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3471 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3472 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3473
3474 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3475 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3476 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3477 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3478 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3479 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3480 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3481 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3482
3483 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3484 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3485 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3486 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3487 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3488 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3489 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3490 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3491 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3492 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3493
3494 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3495 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3496 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3497 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3498 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3499 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3500 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3501
3502 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3503
3504 __m128i in[32], col[32];
3505 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3506 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3507 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3508 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3509 stp1_30, stp1_31;
3510 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3511 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3512 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3513 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3514 stp2_30, stp2_31;
3515 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3516 int i;
3517 // Load input data.
3518 LOAD_DQCOEFF(in[0], input);
3519 LOAD_DQCOEFF(in[8], input);
3520 LOAD_DQCOEFF(in[16], input);
3521 LOAD_DQCOEFF(in[24], input);
3522 LOAD_DQCOEFF(in[1], input);
3523 LOAD_DQCOEFF(in[9], input);
3524 LOAD_DQCOEFF(in[17], input);
3525 LOAD_DQCOEFF(in[25], input);
3526 LOAD_DQCOEFF(in[2], input);
3527 LOAD_DQCOEFF(in[10], input);
3528 LOAD_DQCOEFF(in[18], input);
3529 LOAD_DQCOEFF(in[26], input);
3530 LOAD_DQCOEFF(in[3], input);
3531 LOAD_DQCOEFF(in[11], input);
3532 LOAD_DQCOEFF(in[19], input);
3533 LOAD_DQCOEFF(in[27], input);
3534
3535 LOAD_DQCOEFF(in[4], input);
3536 LOAD_DQCOEFF(in[12], input);
3537 LOAD_DQCOEFF(in[20], input);
3538 LOAD_DQCOEFF(in[28], input);
3539 LOAD_DQCOEFF(in[5], input);
3540 LOAD_DQCOEFF(in[13], input);
3541 LOAD_DQCOEFF(in[21], input);
3542 LOAD_DQCOEFF(in[29], input);
3543 LOAD_DQCOEFF(in[6], input);
3544 LOAD_DQCOEFF(in[14], input);
3545 LOAD_DQCOEFF(in[22], input);
3546 LOAD_DQCOEFF(in[30], input);
3547 LOAD_DQCOEFF(in[7], input);
3548 LOAD_DQCOEFF(in[15], input);
3549 LOAD_DQCOEFF(in[23], input);
3550 LOAD_DQCOEFF(in[31], input);
3551
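// The loads above walk the row-major dqcoeff buffer linearly, so row k of
// the 32x32 block is split across in[k], in[k + 8], in[k + 16], and
// in[k + 24] (columns 0-7, 8-15, 16-23, 24-31); each of the four 8x8
// sub-blocks can therefore be transposed independently below.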
3552 array_transpose_8x8(in, in);
3553 array_transpose_8x8(in + 8, in + 8);
3554 array_transpose_8x8(in + 16, in + 16);
3555 array_transpose_8x8(in + 24, in + 24);
3556
3557 IDCT32
3558
3559 // First pass: store the 32 intermediate results for each 8x32 block.
3560 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3561 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3562 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3563 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3564 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3565 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3566 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3567 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3568 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3569 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3570 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3571 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3572 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3573 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3574 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3575 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3576 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3577 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3578 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3579 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3580 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3581 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3582 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3583 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3584 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3585 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3586 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3587 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3588 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3589 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3590 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3591 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3592 for (i = 0; i < 4; i++) {
3593 const __m128i zero = _mm_setzero_si128();
3594 // Transpose 32x8 block to 8x32 block
3595 array_transpose_8x8(col + i * 8, in);
3596 IDCT32_34
3597
3598 // Second pass: calculate the results and store them to the destination.
3599 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3600 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3601 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3602 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3603 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3604 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3605 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3606 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3607 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3608 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3609 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3610 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3611 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3612 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3613 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3614 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3615 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3616 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3617 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3618 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3619 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3620 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3621 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3622 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3623 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3624 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3625 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3626 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3627 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3628 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3629 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3630 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3631
3632 // Final rounding and shift
3633 in[0] = _mm_adds_epi16(in[0], final_rounding);
3634 in[1] = _mm_adds_epi16(in[1], final_rounding);
3635 in[2] = _mm_adds_epi16(in[2], final_rounding);
3636 in[3] = _mm_adds_epi16(in[3], final_rounding);
3637 in[4] = _mm_adds_epi16(in[4], final_rounding);
3638 in[5] = _mm_adds_epi16(in[5], final_rounding);
3639 in[6] = _mm_adds_epi16(in[6], final_rounding);
3640 in[7] = _mm_adds_epi16(in[7], final_rounding);
3641 in[8] = _mm_adds_epi16(in[8], final_rounding);
3642 in[9] = _mm_adds_epi16(in[9], final_rounding);
3643 in[10] = _mm_adds_epi16(in[10], final_rounding);
3644 in[11] = _mm_adds_epi16(in[11], final_rounding);
3645 in[12] = _mm_adds_epi16(in[12], final_rounding);
3646 in[13] = _mm_adds_epi16(in[13], final_rounding);
3647 in[14] = _mm_adds_epi16(in[14], final_rounding);
3648 in[15] = _mm_adds_epi16(in[15], final_rounding);
3649 in[16] = _mm_adds_epi16(in[16], final_rounding);
3650 in[17] = _mm_adds_epi16(in[17], final_rounding);
3651 in[18] = _mm_adds_epi16(in[18], final_rounding);
3652 in[19] = _mm_adds_epi16(in[19], final_rounding);
3653 in[20] = _mm_adds_epi16(in[20], final_rounding);
3654 in[21] = _mm_adds_epi16(in[21], final_rounding);
3655 in[22] = _mm_adds_epi16(in[22], final_rounding);
3656 in[23] = _mm_adds_epi16(in[23], final_rounding);
3657 in[24] = _mm_adds_epi16(in[24], final_rounding);
3658 in[25] = _mm_adds_epi16(in[25], final_rounding);
3659 in[26] = _mm_adds_epi16(in[26], final_rounding);
3660 in[27] = _mm_adds_epi16(in[27], final_rounding);
3661 in[28] = _mm_adds_epi16(in[28], final_rounding);
3662 in[29] = _mm_adds_epi16(in[29], final_rounding);
3663 in[30] = _mm_adds_epi16(in[30], final_rounding);
3664 in[31] = _mm_adds_epi16(in[31], final_rounding);
3665
3666 in[0] = _mm_srai_epi16(in[0], 6);
3667 in[1] = _mm_srai_epi16(in[1], 6);
3668 in[2] = _mm_srai_epi16(in[2], 6);
3669 in[3] = _mm_srai_epi16(in[3], 6);
3670 in[4] = _mm_srai_epi16(in[4], 6);
3671 in[5] = _mm_srai_epi16(in[5], 6);
3672 in[6] = _mm_srai_epi16(in[6], 6);
3673 in[7] = _mm_srai_epi16(in[7], 6);
3674 in[8] = _mm_srai_epi16(in[8], 6);
3675 in[9] = _mm_srai_epi16(in[9], 6);
3676 in[10] = _mm_srai_epi16(in[10], 6);
3677 in[11] = _mm_srai_epi16(in[11], 6);
3678 in[12] = _mm_srai_epi16(in[12], 6);
3679 in[13] = _mm_srai_epi16(in[13], 6);
3680 in[14] = _mm_srai_epi16(in[14], 6);
3681 in[15] = _mm_srai_epi16(in[15], 6);
3682 in[16] = _mm_srai_epi16(in[16], 6);
3683 in[17] = _mm_srai_epi16(in[17], 6);
3684 in[18] = _mm_srai_epi16(in[18], 6);
3685 in[19] = _mm_srai_epi16(in[19], 6);
3686 in[20] = _mm_srai_epi16(in[20], 6);
3687 in[21] = _mm_srai_epi16(in[21], 6);
3688 in[22] = _mm_srai_epi16(in[22], 6);
3689 in[23] = _mm_srai_epi16(in[23], 6);
3690 in[24] = _mm_srai_epi16(in[24], 6);
3691 in[25] = _mm_srai_epi16(in[25], 6);
3692 in[26] = _mm_srai_epi16(in[26], 6);
3693 in[27] = _mm_srai_epi16(in[27], 6);
3694 in[28] = _mm_srai_epi16(in[28], 6);
3695 in[29] = _mm_srai_epi16(in[29], 6);
3696 in[30] = _mm_srai_epi16(in[30], 6);
3697 in[31] = _mm_srai_epi16(in[31], 6);
3698
3699 RECON_AND_STORE(dest, in[0]);
3700 RECON_AND_STORE(dest, in[1]);
3701 RECON_AND_STORE(dest, in[2]);
3702 RECON_AND_STORE(dest, in[3]);
3703 RECON_AND_STORE(dest, in[4]);
3704 RECON_AND_STORE(dest, in[5]);
3705 RECON_AND_STORE(dest, in[6]);
3706 RECON_AND_STORE(dest, in[7]);
3707 RECON_AND_STORE(dest, in[8]);
3708 RECON_AND_STORE(dest, in[9]);
3709 RECON_AND_STORE(dest, in[10]);
3710 RECON_AND_STORE(dest, in[11]);
3711 RECON_AND_STORE(dest, in[12]);
3712 RECON_AND_STORE(dest, in[13]);
3713 RECON_AND_STORE(dest, in[14]);
3714 RECON_AND_STORE(dest, in[15]);
3715 RECON_AND_STORE(dest, in[16]);
3716 RECON_AND_STORE(dest, in[17]);
3717 RECON_AND_STORE(dest, in[18]);
3718 RECON_AND_STORE(dest, in[19]);
3719 RECON_AND_STORE(dest, in[20]);
3720 RECON_AND_STORE(dest, in[21]);
3721 RECON_AND_STORE(dest, in[22]);
3722 RECON_AND_STORE(dest, in[23]);
3723 RECON_AND_STORE(dest, in[24]);
3724 RECON_AND_STORE(dest, in[25]);
3725 RECON_AND_STORE(dest, in[26]);
3726 RECON_AND_STORE(dest, in[27]);
3727 RECON_AND_STORE(dest, in[28]);
3728 RECON_AND_STORE(dest, in[29]);
3729 RECON_AND_STORE(dest, in[30]);
3730 RECON_AND_STORE(dest, in[31]);
3731
3732 dest += 8 - (stride * 32);
3733 }
3734 }
3735
3736 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3737 int stride) {
3738 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3739 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3740
3741 // idct constants for each stage
3742 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3743 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3744 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3745 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3746 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3747 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3748 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3749 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3750 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3751 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3752 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3753 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3754 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3755 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3756 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3757 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3758
3759 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3760 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3761 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3762 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3763 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3764 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3765 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3766 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3767
3768 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3769 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3770 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3771 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3772 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3773 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3774 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3775 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3776 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3777 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3778
3779 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3780 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3781 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3782 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3783 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3784 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3785 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3786
3787 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3788
3789 __m128i in[32], col[128], zero_idx[16];
3790 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3791 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3792 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3793 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3794 stp1_30, stp1_31;
3795 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3796 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3797 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3798 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3799 stp2_30, stp2_31;
3800 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3801 int i, j, i32;
3802 int zero_flag[2];
3803
3804 for (i = 0; i < 4; i++) {
3805 i32 = (i << 5);
3806 // First 1-D idct
3807 // Load input data.
3808 LOAD_DQCOEFF(in[0], input);
3809 LOAD_DQCOEFF(in[8], input);
3810 LOAD_DQCOEFF(in[16], input);
3811 LOAD_DQCOEFF(in[24], input);
3812 LOAD_DQCOEFF(in[1], input);
3813 LOAD_DQCOEFF(in[9], input);
3814 LOAD_DQCOEFF(in[17], input);
3815 LOAD_DQCOEFF(in[25], input);
3816 LOAD_DQCOEFF(in[2], input);
3817 LOAD_DQCOEFF(in[10], input);
3818 LOAD_DQCOEFF(in[18], input);
3819 LOAD_DQCOEFF(in[26], input);
3820 LOAD_DQCOEFF(in[3], input);
3821 LOAD_DQCOEFF(in[11], input);
3822 LOAD_DQCOEFF(in[19], input);
3823 LOAD_DQCOEFF(in[27], input);
3824
3825 LOAD_DQCOEFF(in[4], input);
3826 LOAD_DQCOEFF(in[12], input);
3827 LOAD_DQCOEFF(in[20], input);
3828 LOAD_DQCOEFF(in[28], input);
3829 LOAD_DQCOEFF(in[5], input);
3830 LOAD_DQCOEFF(in[13], input);
3831 LOAD_DQCOEFF(in[21], input);
3832 LOAD_DQCOEFF(in[29], input);
3833 LOAD_DQCOEFF(in[6], input);
3834 LOAD_DQCOEFF(in[14], input);
3835 LOAD_DQCOEFF(in[22], input);
3836 LOAD_DQCOEFF(in[30], input);
3837 LOAD_DQCOEFF(in[7], input);
3838 LOAD_DQCOEFF(in[15], input);
3839 LOAD_DQCOEFF(in[23], input);
3840 LOAD_DQCOEFF(in[31], input);
3841
3842 // Check whether every coefficient in this 8x32 band is zero.
3843 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3844 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3845 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3846 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3847 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3848 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3849 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3850 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3851 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3852 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3853 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3854 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3855 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3856 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3857 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3858 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3859
3860 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3861 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3862 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3863 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3864 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3865 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3866 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3867 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3868
3869 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3870 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3871 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3872 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3873 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3874 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3875 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3876
3877 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
3878 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3879 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3880 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3881 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
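// In scalar terms, the OR-tree above reduces to (a sketch; in_lo64() and
// in_hi64() are hypothetical 64-bit lane extractors):
//   uint64_t acc = 0;
//   for (j = 0; j < 32; ++j) acc |= in_lo64(in[j]) | in_hi64(in[j]);
//   zero_flag[0] = (int)(acc & 0xffffffff);
//   zero_flag[1] = (int)(acc >> 32);
// so the band is all-zero iff both 32-bit halves of the combined OR are 0.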
3882
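// If the whole 8x32 band is zero, its first-pass output is zero as well:
// record zeros and skip the transform.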
3883 if (!zero_flag[0] && !zero_flag[1]) {
3884 col[i32 + 0] = _mm_setzero_si128();
3885 col[i32 + 1] = _mm_setzero_si128();
3886 col[i32 + 2] = _mm_setzero_si128();
3887 col[i32 + 3] = _mm_setzero_si128();
3888 col[i32 + 4] = _mm_setzero_si128();
3889 col[i32 + 5] = _mm_setzero_si128();
3890 col[i32 + 6] = _mm_setzero_si128();
3891 col[i32 + 7] = _mm_setzero_si128();
3892 col[i32 + 8] = _mm_setzero_si128();
3893 col[i32 + 9] = _mm_setzero_si128();
3894 col[i32 + 10] = _mm_setzero_si128();
3895 col[i32 + 11] = _mm_setzero_si128();
3896 col[i32 + 12] = _mm_setzero_si128();
3897 col[i32 + 13] = _mm_setzero_si128();
3898 col[i32 + 14] = _mm_setzero_si128();
3899 col[i32 + 15] = _mm_setzero_si128();
3900 col[i32 + 16] = _mm_setzero_si128();
3901 col[i32 + 17] = _mm_setzero_si128();
3902 col[i32 + 18] = _mm_setzero_si128();
3903 col[i32 + 19] = _mm_setzero_si128();
3904 col[i32 + 20] = _mm_setzero_si128();
3905 col[i32 + 21] = _mm_setzero_si128();
3906 col[i32 + 22] = _mm_setzero_si128();
3907 col[i32 + 23] = _mm_setzero_si128();
3908 col[i32 + 24] = _mm_setzero_si128();
3909 col[i32 + 25] = _mm_setzero_si128();
3910 col[i32 + 26] = _mm_setzero_si128();
3911 col[i32 + 27] = _mm_setzero_si128();
3912 col[i32 + 28] = _mm_setzero_si128();
3913 col[i32 + 29] = _mm_setzero_si128();
3914 col[i32 + 30] = _mm_setzero_si128();
3915 col[i32 + 31] = _mm_setzero_si128();
3916 continue;
3917 }
3918
3919 // Transpose 32x8 block to 8x32 block
3920 array_transpose_8x8(in, in);
3921 array_transpose_8x8(in + 8, in + 8);
3922 array_transpose_8x8(in + 16, in + 16);
3923 array_transpose_8x8(in + 24, in + 24);
3924
3925 IDCT32
3926
3927 // First pass: store the 32 intermediate results for each 8x32 block.
3928 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3929 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3930 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3931 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3932 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3933 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3934 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3935 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3936 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3937 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3938 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3939 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3940 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3941 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3942 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3943 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3944 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3945 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3946 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3947 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3948 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3949 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3950 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3951 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3952 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3953 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3954 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3955 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3956 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3957 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3958 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3959 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3960 }
3961 for (i = 0; i < 4; i++) {
3962 const __m128i zero = _mm_setzero_si128();
3963 // Second 1-D idct
3964 j = i << 3;
3965
3966 // Transpose 32x8 block to 8x32 block
3967 array_transpose_8x8(col + j, in);
3968 array_transpose_8x8(col + j + 32, in + 8);
3969 array_transpose_8x8(col + j + 64, in + 16);
3970 array_transpose_8x8(col + j + 96, in + 24);
3971
3972 IDCT32
3973
3974 // Second pass: calculate the results and store them to the destination.
3975 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3976 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3977 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3978 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3979 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3980 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3981 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3982 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3983 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3984 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3985 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3986 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3987 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3988 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3989 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3990 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3991 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3992 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3993 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3994 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3995 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3996 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3997 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3998 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3999 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
4000 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
4001 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
4002 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
4003 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
4004 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
4005 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
4006 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
4007
4008 // Final rounding and shift
4009 in[0] = _mm_adds_epi16(in[0], final_rounding);
4010 in[1] = _mm_adds_epi16(in[1], final_rounding);
4011 in[2] = _mm_adds_epi16(in[2], final_rounding);
4012 in[3] = _mm_adds_epi16(in[3], final_rounding);
4013 in[4] = _mm_adds_epi16(in[4], final_rounding);
4014 in[5] = _mm_adds_epi16(in[5], final_rounding);
4015 in[6] = _mm_adds_epi16(in[6], final_rounding);
4016 in[7] = _mm_adds_epi16(in[7], final_rounding);
4017 in[8] = _mm_adds_epi16(in[8], final_rounding);
4018 in[9] = _mm_adds_epi16(in[9], final_rounding);
4019 in[10] = _mm_adds_epi16(in[10], final_rounding);
4020 in[11] = _mm_adds_epi16(in[11], final_rounding);
4021 in[12] = _mm_adds_epi16(in[12], final_rounding);
4022 in[13] = _mm_adds_epi16(in[13], final_rounding);
4023 in[14] = _mm_adds_epi16(in[14], final_rounding);
4024 in[15] = _mm_adds_epi16(in[15], final_rounding);
4025 in[16] = _mm_adds_epi16(in[16], final_rounding);
4026 in[17] = _mm_adds_epi16(in[17], final_rounding);
4027 in[18] = _mm_adds_epi16(in[18], final_rounding);
4028 in[19] = _mm_adds_epi16(in[19], final_rounding);
4029 in[20] = _mm_adds_epi16(in[20], final_rounding);
4030 in[21] = _mm_adds_epi16(in[21], final_rounding);
4031 in[22] = _mm_adds_epi16(in[22], final_rounding);
4032 in[23] = _mm_adds_epi16(in[23], final_rounding);
4033 in[24] = _mm_adds_epi16(in[24], final_rounding);
4034 in[25] = _mm_adds_epi16(in[25], final_rounding);
4035 in[26] = _mm_adds_epi16(in[26], final_rounding);
4036 in[27] = _mm_adds_epi16(in[27], final_rounding);
4037 in[28] = _mm_adds_epi16(in[28], final_rounding);
4038 in[29] = _mm_adds_epi16(in[29], final_rounding);
4039 in[30] = _mm_adds_epi16(in[30], final_rounding);
4040 in[31] = _mm_adds_epi16(in[31], final_rounding);
4041
4042 in[0] = _mm_srai_epi16(in[0], 6);
4043 in[1] = _mm_srai_epi16(in[1], 6);
4044 in[2] = _mm_srai_epi16(in[2], 6);
4045 in[3] = _mm_srai_epi16(in[3], 6);
4046 in[4] = _mm_srai_epi16(in[4], 6);
4047 in[5] = _mm_srai_epi16(in[5], 6);
4048 in[6] = _mm_srai_epi16(in[6], 6);
4049 in[7] = _mm_srai_epi16(in[7], 6);
4050 in[8] = _mm_srai_epi16(in[8], 6);
4051 in[9] = _mm_srai_epi16(in[9], 6);
4052 in[10] = _mm_srai_epi16(in[10], 6);
4053 in[11] = _mm_srai_epi16(in[11], 6);
4054 in[12] = _mm_srai_epi16(in[12], 6);
4055 in[13] = _mm_srai_epi16(in[13], 6);
4056 in[14] = _mm_srai_epi16(in[14], 6);
4057 in[15] = _mm_srai_epi16(in[15], 6);
4058 in[16] = _mm_srai_epi16(in[16], 6);
4059 in[17] = _mm_srai_epi16(in[17], 6);
4060 in[18] = _mm_srai_epi16(in[18], 6);
4061 in[19] = _mm_srai_epi16(in[19], 6);
4062 in[20] = _mm_srai_epi16(in[20], 6);
4063 in[21] = _mm_srai_epi16(in[21], 6);
4064 in[22] = _mm_srai_epi16(in[22], 6);
4065 in[23] = _mm_srai_epi16(in[23], 6);
4066 in[24] = _mm_srai_epi16(in[24], 6);
4067 in[25] = _mm_srai_epi16(in[25], 6);
4068 in[26] = _mm_srai_epi16(in[26], 6);
4069 in[27] = _mm_srai_epi16(in[27], 6);
4070 in[28] = _mm_srai_epi16(in[28], 6);
4071 in[29] = _mm_srai_epi16(in[29], 6);
4072 in[30] = _mm_srai_epi16(in[30], 6);
4073 in[31] = _mm_srai_epi16(in[31], 6);
4074
4075 RECON_AND_STORE(dest, in[0]);
4076 RECON_AND_STORE(dest, in[1]);
4077 RECON_AND_STORE(dest, in[2]);
4078 RECON_AND_STORE(dest, in[3]);
4079 RECON_AND_STORE(dest, in[4]);
4080 RECON_AND_STORE(dest, in[5]);
4081 RECON_AND_STORE(dest, in[6]);
4082 RECON_AND_STORE(dest, in[7]);
4083 RECON_AND_STORE(dest, in[8]);
4084 RECON_AND_STORE(dest, in[9]);
4085 RECON_AND_STORE(dest, in[10]);
4086 RECON_AND_STORE(dest, in[11]);
4087 RECON_AND_STORE(dest, in[12]);
4088 RECON_AND_STORE(dest, in[13]);
4089 RECON_AND_STORE(dest, in[14]);
4090 RECON_AND_STORE(dest, in[15]);
4091 RECON_AND_STORE(dest, in[16]);
4092 RECON_AND_STORE(dest, in[17]);
4093 RECON_AND_STORE(dest, in[18]);
4094 RECON_AND_STORE(dest, in[19]);
4095 RECON_AND_STORE(dest, in[20]);
4096 RECON_AND_STORE(dest, in[21]);
4097 RECON_AND_STORE(dest, in[22]);
4098 RECON_AND_STORE(dest, in[23]);
4099 RECON_AND_STORE(dest, in[24]);
4100 RECON_AND_STORE(dest, in[25]);
4101 RECON_AND_STORE(dest, in[26]);
4102 RECON_AND_STORE(dest, in[27]);
4103 RECON_AND_STORE(dest, in[28]);
4104 RECON_AND_STORE(dest, in[29]);
4105 RECON_AND_STORE(dest, in[30]);
4106 RECON_AND_STORE(dest, in[31]);
4107
4108 dest += 8 - (stride * 32);
4109 }
4110 }  // NOLINT
4111
4112 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
4113 __m128i dc_value;
4114 const __m128i zero = _mm_setzero_si128();
4115 int a, i;
4116
4117 a = dct_const_round_shift(input[0] * cospi_16_64);
4118 a = dct_const_round_shift(a * cospi_16_64);
4119 a = ROUND_POWER_OF_TWO(a, 6);
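// Scalar view of the DC-only path: cospi_16_64 with rounding is applied
// once per 1-D pass, then the final bit-depth shift, i.e.
//   a = (input[0] * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
//   a = (a * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
//   a = (a + 32) >> 6;
// which matches the dct_const_round_shift()/ROUND_POWER_OF_TWO() calls.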
4120
4121 dc_value = _mm_set1_epi16(a);
4122
4123 for (i = 0; i < 4; ++i) {
4124 RECON_AND_STORE(dest, dc_value);
4125 RECON_AND_STORE(dest, dc_value);
4126 RECON_AND_STORE(dest, dc_value);
4127 RECON_AND_STORE(dest, dc_value);
4128 RECON_AND_STORE(dest, dc_value);
4129 RECON_AND_STORE(dest, dc_value);
4130 RECON_AND_STORE(dest, dc_value);
4131 RECON_AND_STORE(dest, dc_value);
4132 RECON_AND_STORE(dest, dc_value);
4133 RECON_AND_STORE(dest, dc_value);
4134 RECON_AND_STORE(dest, dc_value);
4135 RECON_AND_STORE(dest, dc_value);
4136 RECON_AND_STORE(dest, dc_value);
4137 RECON_AND_STORE(dest, dc_value);
4138 RECON_AND_STORE(dest, dc_value);
4139 RECON_AND_STORE(dest, dc_value);
4140 RECON_AND_STORE(dest, dc_value);
4141 RECON_AND_STORE(dest, dc_value);
4142 RECON_AND_STORE(dest, dc_value);
4143 RECON_AND_STORE(dest, dc_value);
4144 RECON_AND_STORE(dest, dc_value);
4145 RECON_AND_STORE(dest, dc_value);
4146 RECON_AND_STORE(dest, dc_value);
4147 RECON_AND_STORE(dest, dc_value);
4148 RECON_AND_STORE(dest, dc_value);
4149 RECON_AND_STORE(dest, dc_value);
4150 RECON_AND_STORE(dest, dc_value);
4151 RECON_AND_STORE(dest, dc_value);
4152 RECON_AND_STORE(dest, dc_value);
4153 RECON_AND_STORE(dest, dc_value);
4154 RECON_AND_STORE(dest, dc_value);
4155 RECON_AND_STORE(dest, dc_value);
4156 dest += 8 - (stride * 32);
4157 }
4158 }
4159