1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
15 
vpx_idct4x4_16_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)16 void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
17                              int stride) {
18   const __m128i eight = _mm_set1_epi16(8);
19   __m128i in[2];
20 
21   // Rows
22   in[0] = load_input_data(input);
23   in[1] = load_input_data(input + 8);
24   idct4_sse2(in);
25 
26   // Columns
27   idct4_sse2(in);
28 
29   // Final round and shift
30   in[0] = _mm_add_epi16(in[0], eight);
31   in[1] = _mm_add_epi16(in[1], eight);
32   in[0] = _mm_srai_epi16(in[0], 4);
33   in[1] = _mm_srai_epi16(in[1], 4);
34 
35   recon_and_store4x4_sse2(in, dest, stride);
36 }
37 
vpx_idct4x4_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)38 void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
39                             int stride) {
40   const __m128i zero = _mm_setzero_si128();
41   int a;
42   __m128i dc_value, d[2];
43 
44   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
45   a = (int)dct_const_round_shift(a * cospi_16_64);
46   a = ROUND_POWER_OF_TWO(a, 4);
47 
48   dc_value = _mm_set1_epi16(a);
49 
50   // Reconstruction and Store
51   d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
52   d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
53   d[0] = _mm_unpacklo_epi32(d[0],
54                             _mm_cvtsi32_si128(*(const int *)(dest + stride)));
55   d[1] = _mm_unpacklo_epi32(
56       _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
57   d[0] = _mm_unpacklo_epi8(d[0], zero);
58   d[1] = _mm_unpacklo_epi8(d[1], zero);
59   d[0] = _mm_add_epi16(d[0], dc_value);
60   d[1] = _mm_add_epi16(d[1], dc_value);
61   d[0] = _mm_packus_epi16(d[0], d[1]);
62 
63   *(int *)dest = _mm_cvtsi128_si32(d[0]);
64   d[0] = _mm_srli_si128(d[0], 4);
65   *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
66   d[0] = _mm_srli_si128(d[0], 4);
67   *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
68   d[0] = _mm_srli_si128(d[0], 4);
69   *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
70 }
71 
idct4_sse2(__m128i * in)72 void idct4_sse2(__m128i *in) {
73   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
74   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
75   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
76   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
77   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
78   __m128i u[8], v[8];
79 
80   transpose_16bit_4x4(in);
81   // stage 1
82   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
83   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
84   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
85   v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
86   v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
87   v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
88 
89   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
90   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
91   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
92   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
93 
94   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
95   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
96   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
97   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
98 
99   u[0] = _mm_packs_epi32(v[0], v[1]);
100   u[1] = _mm_packs_epi32(v[3], v[2]);
101 
102   // stage 2
103   in[0] = _mm_add_epi16(u[0], u[1]);
104   in[1] = _mm_sub_epi16(u[0], u[1]);
105   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
106 }
107 
iadst4_sse2(__m128i * in)108 void iadst4_sse2(__m128i *in) {
109   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
110   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
111   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
112   const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
113   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
114   const __m128i kZero = _mm_set1_epi16(0);
115   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
116   __m128i u[8], v[8], in7;
117 
118   transpose_16bit_4x4(in);
119   in7 = _mm_srli_si128(in[1], 8);
120   in7 = _mm_add_epi16(in7, in[0]);
121   in7 = _mm_sub_epi16(in7, in[1]);
122 
123   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
124   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
125   u[2] = _mm_unpacklo_epi16(in7, kZero);
126   u[3] = _mm_unpackhi_epi16(in[0], kZero);
127 
128   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
129   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
130   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
131   v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
132   v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
133   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
134 
135   u[0] = _mm_add_epi32(v[0], v[1]);
136   u[1] = _mm_add_epi32(v[3], v[4]);
137   u[2] = v[2];
138   u[3] = _mm_add_epi32(u[0], u[1]);
139   u[4] = _mm_slli_epi32(v[5], 2);
140   u[5] = _mm_add_epi32(u[3], v[5]);
141   u[6] = _mm_sub_epi32(u[5], u[4]);
142 
143   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
144   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
145   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
146   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
147 
148   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
149   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
150   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
151   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
152 
153   in[0] = _mm_packs_epi32(u[0], u[1]);
154   in[1] = _mm_packs_epi32(u[2], u[3]);
155 }
156 
// Multiplies the interleaved 16-bit pairs in lo_0 / hi_0 by cst0 and by cst1
// with _mm_madd_epi16, rounds every 32-bit sum (add `rounding`, arithmetic
// shift right by DCT_CONST_BITS) and packs with signed saturation:
//   res0 = pack(lo_0 x cst0, hi_0 x cst0)
//   res1 = pack(lo_0 x cst1, hi_0 x cst1)
// Expands in the caller's scope and uses tmp0..tmp3 and rounding from there.
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1)       \
  {                                                                        \
    tmp0 = _mm_madd_epi16(lo_0, cst0);                                     \
    tmp1 = _mm_madd_epi16(hi_0, cst0);                                     \
    tmp2 = _mm_madd_epi16(lo_0, cst1);                                     \
    tmp3 = _mm_madd_epi16(hi_0, cst1);                                     \
                                                                           \
    tmp0 = _mm_srai_epi32(_mm_add_epi32(tmp0, rounding), DCT_CONST_BITS);  \
    tmp1 = _mm_srai_epi32(_mm_add_epi32(tmp1, rounding), DCT_CONST_BITS);  \
    tmp2 = _mm_srai_epi32(_mm_add_epi32(tmp2, rounding), DCT_CONST_BITS);  \
    tmp3 = _mm_srai_epi32(_mm_add_epi32(tmp3, rounding), DCT_CONST_BITS);  \
                                                                           \
    res0 = _mm_packs_epi32(tmp0, tmp1);                                    \
    res1 = _mm_packs_epi32(tmp2, tmp3);                                    \
  }
177 
// One 8-point inverse DCT pass over eight vectors, in four stages.
//
// Expands in the caller's scope and relies on names declared there:
// stg1_0..stg1_3 and stg2_0..stg2_3 (multiplier constants), rounding,
// stp1_0..stp1_7 and stp2_0..stp2_7 (stage results), tmp0..tmp3, plus
// whatever MULTIPLICATION_AND_ADD itself references.
// All inN arguments are read in stages 1 and 2, and the outN arguments are
// only written in stage 4 from stage results, so an output may name the same
// variable as an input.
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
              out4, out5, out6, out7)                                         \
  {                                                                           \
    /* Stage 1: inputs 1/7 and 3/5 -> stp1_4..stp1_7 */                       \
    {                                                                         \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
    }                                                                         \
                                                                              \
    /* Stage 2: inputs 0/4 and 2/6 -> stp2_0..stp2_3, then butterflies */     \
    {                                                                         \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
                                                                              \
      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
    }                                                                         \
                                                                              \
    /* Stage 3: butterflies on stp2_0..3, rotate the stp2_6/stp2_5 pair */    \
    {                                                                         \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
                                                                              \
      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
                                                                              \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
                                                                              \
      /* Round: add `rounding`, arithmetic shift right by DCT_CONST_BITS */   \
      tmp0 = _mm_srai_epi32(_mm_add_epi32(tmp0, rounding), DCT_CONST_BITS);   \
      tmp1 = _mm_srai_epi32(_mm_add_epi32(tmp1, rounding), DCT_CONST_BITS);   \
      tmp2 = _mm_srai_epi32(_mm_add_epi32(tmp2, rounding), DCT_CONST_BITS);   \
      tmp3 = _mm_srai_epi32(_mm_add_epi32(tmp3, rounding), DCT_CONST_BITS);   \
                                                                              \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
    }                                                                         \
                                                                              \
    /* Stage 4: outK = a + b for K = 0..3, out(7-K) = a - b */                \
    out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \
    out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
    out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
    out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \
    out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \
    out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
    out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
    out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \
  }
247 
vpx_idct8x8_64_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)248 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
249                              int stride) {
250   const __m128i zero = _mm_setzero_si128();
251   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
252   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
253   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
254   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
255   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
256   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
257   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
258   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
259   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
260   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
261 
262   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
263   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
264   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
265   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
266   int i;
267 
268   // Load input data.
269   in0 = load_input_data(input);
270   in1 = load_input_data(input + 8 * 1);
271   in2 = load_input_data(input + 8 * 2);
272   in3 = load_input_data(input + 8 * 3);
273   in4 = load_input_data(input + 8 * 4);
274   in5 = load_input_data(input + 8 * 5);
275   in6 = load_input_data(input + 8 * 6);
276   in7 = load_input_data(input + 8 * 7);
277 
278   // 2-D
279   for (i = 0; i < 2; i++) {
280     // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
281     TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
282                   in4, in5, in6, in7);
283 
284     // 4-stage 1D idct8x8
285     IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
286           in6, in7);
287   }
288 
289   // Final rounding and shift
290   in0 = _mm_adds_epi16(in0, final_rounding);
291   in1 = _mm_adds_epi16(in1, final_rounding);
292   in2 = _mm_adds_epi16(in2, final_rounding);
293   in3 = _mm_adds_epi16(in3, final_rounding);
294   in4 = _mm_adds_epi16(in4, final_rounding);
295   in5 = _mm_adds_epi16(in5, final_rounding);
296   in6 = _mm_adds_epi16(in6, final_rounding);
297   in7 = _mm_adds_epi16(in7, final_rounding);
298 
299   in0 = _mm_srai_epi16(in0, 5);
300   in1 = _mm_srai_epi16(in1, 5);
301   in2 = _mm_srai_epi16(in2, 5);
302   in3 = _mm_srai_epi16(in3, 5);
303   in4 = _mm_srai_epi16(in4, 5);
304   in5 = _mm_srai_epi16(in5, 5);
305   in6 = _mm_srai_epi16(in6, 5);
306   in7 = _mm_srai_epi16(in7, 5);
307 
308   RECON_AND_STORE(dest + 0 * stride, in0);
309   RECON_AND_STORE(dest + 1 * stride, in1);
310   RECON_AND_STORE(dest + 2 * stride, in2);
311   RECON_AND_STORE(dest + 3 * stride, in3);
312   RECON_AND_STORE(dest + 4 * stride, in4);
313   RECON_AND_STORE(dest + 5 * stride, in5);
314   RECON_AND_STORE(dest + 6 * stride, in6);
315   RECON_AND_STORE(dest + 7 * stride, in7);
316 }
317 
vpx_idct8x8_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)318 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
319                             int stride) {
320   __m128i dc_value;
321   const __m128i zero = _mm_setzero_si128();
322   int a;
323 
324   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
325   a = (int)dct_const_round_shift(a * cospi_16_64);
326   a = ROUND_POWER_OF_TWO(a, 5);
327 
328   dc_value = _mm_set1_epi16(a);
329 
330   RECON_AND_STORE(dest + 0 * stride, dc_value);
331   RECON_AND_STORE(dest + 1 * stride, dc_value);
332   RECON_AND_STORE(dest + 2 * stride, dc_value);
333   RECON_AND_STORE(dest + 3 * stride, dc_value);
334   RECON_AND_STORE(dest + 4 * stride, dc_value);
335   RECON_AND_STORE(dest + 5 * stride, dc_value);
336   RECON_AND_STORE(dest + 6 * stride, dc_value);
337   RECON_AND_STORE(dest + 7 * stride, dc_value);
338 }
339 
idct8_sse2(__m128i * in)340 void idct8_sse2(__m128i *in) {
341   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
342   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
343   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
344   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
345   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
346   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
347   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
348   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
349   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
350 
351   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
352   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
353   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
354   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
355 
356   // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
357   TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
358                 in1, in2, in3, in4, in5, in6, in7);
359 
360   // 4-stage 1D idct8x8
361   IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
362         in[4], in[5], in[6], in[7]);
363 }
364 
iadst8_sse2(__m128i * in)365 void iadst8_sse2(__m128i *in) {
366   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
367   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
368   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
369   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
370   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
371   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
372   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
373   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
374   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
375   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
376   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
377   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
378   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
379   const __m128i k__const_0 = _mm_set1_epi16(0);
380   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
381 
382   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
383   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
384   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
385   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
386   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
387 
388   // transpose
389   array_transpose_8x8(in, in);
390 
391   // properly aligned for butterfly input
392   in0 = in[7];
393   in1 = in[0];
394   in2 = in[5];
395   in3 = in[2];
396   in4 = in[3];
397   in5 = in[4];
398   in6 = in[1];
399   in7 = in[6];
400 
401   // column transformation
402   // stage 1
403   // interleave and multiply/add into 32-bit integer
404   s0 = _mm_unpacklo_epi16(in0, in1);
405   s1 = _mm_unpackhi_epi16(in0, in1);
406   s2 = _mm_unpacklo_epi16(in2, in3);
407   s3 = _mm_unpackhi_epi16(in2, in3);
408   s4 = _mm_unpacklo_epi16(in4, in5);
409   s5 = _mm_unpackhi_epi16(in4, in5);
410   s6 = _mm_unpacklo_epi16(in6, in7);
411   s7 = _mm_unpackhi_epi16(in6, in7);
412 
413   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
414   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
415   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
416   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
417   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
418   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
419   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
420   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
421   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
422   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
423   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
424   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
425   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
426   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
427   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
428   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
429 
430   // addition
431   w0 = _mm_add_epi32(u0, u8);
432   w1 = _mm_add_epi32(u1, u9);
433   w2 = _mm_add_epi32(u2, u10);
434   w3 = _mm_add_epi32(u3, u11);
435   w4 = _mm_add_epi32(u4, u12);
436   w5 = _mm_add_epi32(u5, u13);
437   w6 = _mm_add_epi32(u6, u14);
438   w7 = _mm_add_epi32(u7, u15);
439   w8 = _mm_sub_epi32(u0, u8);
440   w9 = _mm_sub_epi32(u1, u9);
441   w10 = _mm_sub_epi32(u2, u10);
442   w11 = _mm_sub_epi32(u3, u11);
443   w12 = _mm_sub_epi32(u4, u12);
444   w13 = _mm_sub_epi32(u5, u13);
445   w14 = _mm_sub_epi32(u6, u14);
446   w15 = _mm_sub_epi32(u7, u15);
447 
448   // shift and rounding
449   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
450   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
451   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
452   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
453   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
454   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
455   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
456   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
457   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
458   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
459   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
460   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
461   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
462   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
463   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
464   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
465 
466   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
467   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
468   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
469   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
470   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
471   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
472   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
473   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
474   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
475   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
476   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
477   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
478   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
479   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
480   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
481   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
482 
483   // back to 16-bit and pack 8 integers into __m128i
484   in[0] = _mm_packs_epi32(u0, u1);
485   in[1] = _mm_packs_epi32(u2, u3);
486   in[2] = _mm_packs_epi32(u4, u5);
487   in[3] = _mm_packs_epi32(u6, u7);
488   in[4] = _mm_packs_epi32(u8, u9);
489   in[5] = _mm_packs_epi32(u10, u11);
490   in[6] = _mm_packs_epi32(u12, u13);
491   in[7] = _mm_packs_epi32(u14, u15);
492 
493   // stage 2
494   s0 = _mm_add_epi16(in[0], in[2]);
495   s1 = _mm_add_epi16(in[1], in[3]);
496   s2 = _mm_sub_epi16(in[0], in[2]);
497   s3 = _mm_sub_epi16(in[1], in[3]);
498   u0 = _mm_unpacklo_epi16(in[4], in[5]);
499   u1 = _mm_unpackhi_epi16(in[4], in[5]);
500   u2 = _mm_unpacklo_epi16(in[6], in[7]);
501   u3 = _mm_unpackhi_epi16(in[6], in[7]);
502 
503   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
504   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
505   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
506   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
507   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
508   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
509   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
510   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
511 
512   w0 = _mm_add_epi32(v0, v4);
513   w1 = _mm_add_epi32(v1, v5);
514   w2 = _mm_add_epi32(v2, v6);
515   w3 = _mm_add_epi32(v3, v7);
516   w4 = _mm_sub_epi32(v0, v4);
517   w5 = _mm_sub_epi32(v1, v5);
518   w6 = _mm_sub_epi32(v2, v6);
519   w7 = _mm_sub_epi32(v3, v7);
520 
521   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
522   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
523   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
524   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
525   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
526   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
527   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
528   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
529 
530   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
531   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
532   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
533   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
534   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
535   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
536   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
537   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
538 
539   // back to 16-bit intergers
540   s4 = _mm_packs_epi32(u0, u1);
541   s5 = _mm_packs_epi32(u2, u3);
542   s6 = _mm_packs_epi32(u4, u5);
543   s7 = _mm_packs_epi32(u6, u7);
544 
545   // stage 3
546   u0 = _mm_unpacklo_epi16(s2, s3);
547   u1 = _mm_unpackhi_epi16(s2, s3);
548   u2 = _mm_unpacklo_epi16(s6, s7);
549   u3 = _mm_unpackhi_epi16(s6, s7);
550 
551   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
552   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
553   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
554   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
555   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
556   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
557   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
558   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
559 
560   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
561   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
562   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
563   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
564   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
565   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
566   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
567   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
568 
569   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
570   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
571   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
572   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
573   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
574   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
575   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
576   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
577 
578   s2 = _mm_packs_epi32(v0, v1);
579   s3 = _mm_packs_epi32(v2, v3);
580   s6 = _mm_packs_epi32(v4, v5);
581   s7 = _mm_packs_epi32(v6, v7);
582 
583   in[0] = s0;
584   in[1] = _mm_sub_epi16(k__const_0, s4);
585   in[2] = s6;
586   in[3] = _mm_sub_epi16(k__const_0, s2);
587   in[4] = s3;
588   in[5] = _mm_sub_epi16(k__const_0, s7);
589   in[6] = s5;
590   in[7] = _mm_sub_epi16(k__const_0, s1);
591 }
592 
vpx_idct8x8_12_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)593 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
594                              int stride) {
595   const __m128i zero = _mm_setzero_si128();
596   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
597   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
598   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
599   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
600   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
601   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
602   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
603   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
604   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
605   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
606   const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
607 
608   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
609   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
610   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
611   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
612 
613   // Rows. Load 4-row input data.
614   in0 = load_input_data(input);
615   in1 = load_input_data(input + 8 * 1);
616   in2 = load_input_data(input + 8 * 2);
617   in3 = load_input_data(input + 8 * 3);
618 
619   // 8x4 Transpose
620   TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
621   // Stage1
622   {
623     const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
624     const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
625 
626     tmp0 = _mm_madd_epi16(lo_17, stg1_0);
627     tmp2 = _mm_madd_epi16(lo_17, stg1_1);
628     tmp4 = _mm_madd_epi16(lo_35, stg1_2);
629     tmp6 = _mm_madd_epi16(lo_35, stg1_3);
630 
631     tmp0 = _mm_add_epi32(tmp0, rounding);
632     tmp2 = _mm_add_epi32(tmp2, rounding);
633     tmp4 = _mm_add_epi32(tmp4, rounding);
634     tmp6 = _mm_add_epi32(tmp6, rounding);
635     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
636     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
637     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
638     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
639 
640     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
641     stp1_5 = _mm_packs_epi32(tmp4, tmp6);
642   }
643 
644   // Stage2
645   {
646     const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
647     const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
648 
649     tmp0 = _mm_madd_epi16(lo_04, stg2_0);
650     tmp2 = _mm_madd_epi16(lo_04, stg2_1);
651     tmp4 = _mm_madd_epi16(lo_26, stg2_2);
652     tmp6 = _mm_madd_epi16(lo_26, stg2_3);
653 
654     tmp0 = _mm_add_epi32(tmp0, rounding);
655     tmp2 = _mm_add_epi32(tmp2, rounding);
656     tmp4 = _mm_add_epi32(tmp4, rounding);
657     tmp6 = _mm_add_epi32(tmp6, rounding);
658     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
659     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
660     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
661     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
662 
663     stp2_0 = _mm_packs_epi32(tmp0, tmp2);
664     stp2_2 = _mm_packs_epi32(tmp6, tmp4);
665 
666     tmp0 = _mm_add_epi16(stp1_4, stp1_5);
667     tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
668 
669     stp2_4 = tmp0;
670     stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
671     stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
672   }
673 
674   // Stage3
675   {
676     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
677 
678     tmp4 = _mm_add_epi16(stp2_0, stp2_2);
679     tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
680 
681     stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
682     stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
683 
684     tmp0 = _mm_madd_epi16(lo_56, stg3_0);
685     tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
686 
687     tmp0 = _mm_add_epi32(tmp0, rounding);
688     tmp2 = _mm_add_epi32(tmp2, rounding);
689     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
690     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
691 
692     stp1_5 = _mm_packs_epi32(tmp0, tmp2);
693   }
694 
695   // Stage4
696   tmp0 = _mm_add_epi16(stp1_3, stp2_4);
697   tmp1 = _mm_add_epi16(stp1_2, stp1_5);
698   tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
699   tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
700 
701   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
702 
703   IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
704         in5, in6, in7);
705   // Final rounding and shift
706   in0 = _mm_adds_epi16(in0, final_rounding);
707   in1 = _mm_adds_epi16(in1, final_rounding);
708   in2 = _mm_adds_epi16(in2, final_rounding);
709   in3 = _mm_adds_epi16(in3, final_rounding);
710   in4 = _mm_adds_epi16(in4, final_rounding);
711   in5 = _mm_adds_epi16(in5, final_rounding);
712   in6 = _mm_adds_epi16(in6, final_rounding);
713   in7 = _mm_adds_epi16(in7, final_rounding);
714 
715   in0 = _mm_srai_epi16(in0, 5);
716   in1 = _mm_srai_epi16(in1, 5);
717   in2 = _mm_srai_epi16(in2, 5);
718   in3 = _mm_srai_epi16(in3, 5);
719   in4 = _mm_srai_epi16(in4, 5);
720   in5 = _mm_srai_epi16(in5, 5);
721   in6 = _mm_srai_epi16(in6, 5);
722   in7 = _mm_srai_epi16(in7, 5);
723 
724   RECON_AND_STORE(dest + 0 * stride, in0);
725   RECON_AND_STORE(dest + 1 * stride, in1);
726   RECON_AND_STORE(dest + 2 * stride, in2);
727   RECON_AND_STORE(dest + 3 * stride, in3);
728   RECON_AND_STORE(dest + 4 * stride, in4);
729   RECON_AND_STORE(dest + 5 * stride, in5);
730   RECON_AND_STORE(dest + 6 * stride, in6);
731   RECON_AND_STORE(dest + 7 * stride, in7);
732 }
733 
// IDCT16: stages 2 through 6 of a 1-D 16-point inverse DCT over the sixteen
// __m128i registers in[0..15].
//
// This is a statement macro that works entirely on names taken from the
// scope it is expanded in. The expanding function must declare:
//   - in[16]: the inputs (only read here);
//   - stp1_0..stp1_15, stp1_8_0, stp1_12_0 and stp2_0..stp2_15: intermediates;
//   - tmp0..tmp3: 32-bit scratch used directly in Stage5;
//   - rounding: added before the DCT_CONST_BITS right shift;
//   - stg2_0..stg2_7, stg3_0..stg3_3, stg4_0..stg4_7, stg6_0: constant pairs;
// plus whatever MULTIPLICATION_AND_ADD itself references.
//
// Inputs are consumed by stage: the odd-indexed in[] entries in Stage2,
// in[2], in[14], in[10], in[6] in Stage3 and in[0], in[8], in[4], in[12] in
// Stage4.
//
// After expansion, the values combined by the final add/sub stage in
// vpx_idct16x16_256_add_sse2 are held in
//   stp2_0..stp2_7, stp1_8, stp1_9, stp2_10..stp2_13, stp1_14, stp1_15.
// Several variables are overwritten in place, so statement order matters.
#define IDCT16                                                                 \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
                                                                               \
    /* stp1_8_0 and stp1_12_0 are kept unmodified until Stage5. */             \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
                                                                               \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
                                                                               \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
                                                                               \
    /* stp1_5, stp1_6: madd of (stp2_6, stp2_5), round, shift, pack. */        \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
                                                                               \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
                                                                               \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
                                                                               \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    /* Sums first: stp2_4 and stp2_7 are overwritten by the differences. */    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }
858 
// IDCT16_10: reduced form of the IDCT16 stages (2 through 6) that reads only
// in[0], in[1], in[2] and in[3]; every other input position is fed `zero`.
// NOTE(review): presumably intended for blocks where only the first few
// coefficients are nonzero -- confirm against the caller.
//
// Like IDCT16, this is a statement macro operating on names from the scope
// it is expanded in: in[], the stp1_* / stp2_* intermediates, tmp0..tmp3,
// rounding, stg2_0, stg2_1, stg2_6, stg2_7, stg3_0, stg3_1, stg4_0, stg4_1,
// stg4_4..stg4_7 and stg6_0, plus `zero`. It uses MULTIPLICATION_AND_ADD_2 in
// addition to MULTIPLICATION_AND_ADD.
//
// After expansion, Stage6 has written stp2_0..stp2_7 and stp2_10..stp2_13,
// and Stage5 has written stp1_8, stp1_9, stp1_14 and stp1_15.
#define IDCT16_10                                                              \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
                                                                               \
    /* Outputs are written directly to the names Stage5 reads. */              \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
                           stp1_12_0)                                          \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
                                                                               \
    /* With the partner terms absent, the IDCT16 add/sub become copies. */     \
    stp1_9 = stp1_8_0;                                                         \
    stp1_10 = stp1_11;                                                         \
                                                                               \
    stp1_13 = stp1_12_0;                                                       \
    stp1_14 = stp1_15;                                                         \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
    stp2_5 = stp2_4;                                                           \
    stp2_6 = stp2_7;                                                           \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_2 = stp1_1;                                                           \
    stp1_3 = stp1_0;                                                           \
                                                                               \
    /* stp1_5, stp1_6: madd of (stp2_6, stp2_5), round, shift, pack. */        \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
                                                                               \
    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
                                                                               \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
                                                                               \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    /* Sums first: stp2_4 and stp2_7 are overwritten by the differences. */    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }
962 
vpx_idct16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)963 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
964                                 int stride) {
965   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
966   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
967   const __m128i zero = _mm_setzero_si128();
968 
969   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
970   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
971   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
972   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
973   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
974   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
975   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
976   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
977 
978   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
979   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
980   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
981   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
982 
983   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
984   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
985   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
986   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
987   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
988   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
989   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
990   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
991 
992   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
993 
994   __m128i in[16], l[16], r[16], *curr1;
995   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
996       stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
997       stp1_8_0, stp1_12_0;
998   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
999       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1000   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1001   int i;
1002 
1003   curr1 = l;
1004   for (i = 0; i < 2; i++) {
1005     // 1-D idct
1006 
1007     // Load input data.
1008     in[0] = load_input_data(input);
1009     in[8] = load_input_data(input + 8 * 1);
1010     in[1] = load_input_data(input + 8 * 2);
1011     in[9] = load_input_data(input + 8 * 3);
1012     in[2] = load_input_data(input + 8 * 4);
1013     in[10] = load_input_data(input + 8 * 5);
1014     in[3] = load_input_data(input + 8 * 6);
1015     in[11] = load_input_data(input + 8 * 7);
1016     in[4] = load_input_data(input + 8 * 8);
1017     in[12] = load_input_data(input + 8 * 9);
1018     in[5] = load_input_data(input + 8 * 10);
1019     in[13] = load_input_data(input + 8 * 11);
1020     in[6] = load_input_data(input + 8 * 12);
1021     in[14] = load_input_data(input + 8 * 13);
1022     in[7] = load_input_data(input + 8 * 14);
1023     in[15] = load_input_data(input + 8 * 15);
1024 
1025     array_transpose_8x8(in, in);
1026     array_transpose_8x8(in + 8, in + 8);
1027 
1028     IDCT16
1029 
1030     // Stage7
1031     curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1032     curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1033     curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1034     curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1035     curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1036     curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1037     curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1038     curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1039     curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1040     curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1041     curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1042     curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1043     curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1044     curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1045     curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1046     curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1047 
1048     curr1 = r;
1049     input += 128;
1050   }
1051   for (i = 0; i < 2; i++) {
1052     int j;
1053     // 1-D idct
1054     array_transpose_8x8(l + i * 8, in);
1055     array_transpose_8x8(r + i * 8, in + 8);
1056 
1057     IDCT16
1058 
1059     // 2-D
1060     in[0] = _mm_add_epi16(stp2_0, stp1_15);
1061     in[1] = _mm_add_epi16(stp2_1, stp1_14);
1062     in[2] = _mm_add_epi16(stp2_2, stp2_13);
1063     in[3] = _mm_add_epi16(stp2_3, stp2_12);
1064     in[4] = _mm_add_epi16(stp2_4, stp2_11);
1065     in[5] = _mm_add_epi16(stp2_5, stp2_10);
1066     in[6] = _mm_add_epi16(stp2_6, stp1_9);
1067     in[7] = _mm_add_epi16(stp2_7, stp1_8);
1068     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1069     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1070     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1071     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1072     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1073     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1074     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1075     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1076 
1077     for (j = 0; j < 16; ++j) {
1078       // Final rounding and shift
1079       in[j] = _mm_adds_epi16(in[j], final_rounding);
1080       in[j] = _mm_srai_epi16(in[j], 6);
1081       RECON_AND_STORE(dest + j * stride, in[j]);
1082     }
1083 
1084     dest += 8;
1085   }
1086 }
1087 
vpx_idct16x16_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1088 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1089                               int stride) {
1090   __m128i dc_value;
1091   const __m128i zero = _mm_setzero_si128();
1092   int a, i;
1093 
1094   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
1095   a = (int)dct_const_round_shift(a * cospi_16_64);
1096   a = ROUND_POWER_OF_TWO(a, 6);
1097 
1098   dc_value = _mm_set1_epi16(a);
1099 
1100   for (i = 0; i < 16; ++i) {
1101     RECON_AND_STORE(dest + 0, dc_value);
1102     RECON_AND_STORE(dest + 8, dc_value);
1103     dest += stride;
1104   }
1105 }
1106 
iadst16_8col(__m128i * in)1107 static void iadst16_8col(__m128i *in) {
1108   // perform 16x16 1-D ADST for 8 columns
1109   __m128i s[16], x[16], u[32], v[32];
1110   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1111   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1112   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1113   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1114   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1115   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1116   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1117   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1118   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1119   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1120   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1121   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1122   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1123   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1124   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1125   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1126   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1127   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1128   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1129   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1130   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1131   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1132   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1133   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1134   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1135   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1136   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1137   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1138   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1139   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1140   const __m128i kZero = _mm_set1_epi16(0);
1141 
1142   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1143   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1144   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1145   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1146   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1147   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1148   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1149   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1150   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1151   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1152   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1153   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1154   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1155   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1156   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1157   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1158 
1159   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1160   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1161   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1162   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1163   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1164   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1165   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1166   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1167   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1168   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1169   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1170   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1171   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1172   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1173   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1174   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1175   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1176   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1177   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1178   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1179   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1180   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1181   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1182   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1183   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1184   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1185   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1186   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1187   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1188   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1189   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1190   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1191 
1192   u[0] = _mm_add_epi32(v[0], v[16]);
1193   u[1] = _mm_add_epi32(v[1], v[17]);
1194   u[2] = _mm_add_epi32(v[2], v[18]);
1195   u[3] = _mm_add_epi32(v[3], v[19]);
1196   u[4] = _mm_add_epi32(v[4], v[20]);
1197   u[5] = _mm_add_epi32(v[5], v[21]);
1198   u[6] = _mm_add_epi32(v[6], v[22]);
1199   u[7] = _mm_add_epi32(v[7], v[23]);
1200   u[8] = _mm_add_epi32(v[8], v[24]);
1201   u[9] = _mm_add_epi32(v[9], v[25]);
1202   u[10] = _mm_add_epi32(v[10], v[26]);
1203   u[11] = _mm_add_epi32(v[11], v[27]);
1204   u[12] = _mm_add_epi32(v[12], v[28]);
1205   u[13] = _mm_add_epi32(v[13], v[29]);
1206   u[14] = _mm_add_epi32(v[14], v[30]);
1207   u[15] = _mm_add_epi32(v[15], v[31]);
1208   u[16] = _mm_sub_epi32(v[0], v[16]);
1209   u[17] = _mm_sub_epi32(v[1], v[17]);
1210   u[18] = _mm_sub_epi32(v[2], v[18]);
1211   u[19] = _mm_sub_epi32(v[3], v[19]);
1212   u[20] = _mm_sub_epi32(v[4], v[20]);
1213   u[21] = _mm_sub_epi32(v[5], v[21]);
1214   u[22] = _mm_sub_epi32(v[6], v[22]);
1215   u[23] = _mm_sub_epi32(v[7], v[23]);
1216   u[24] = _mm_sub_epi32(v[8], v[24]);
1217   u[25] = _mm_sub_epi32(v[9], v[25]);
1218   u[26] = _mm_sub_epi32(v[10], v[26]);
1219   u[27] = _mm_sub_epi32(v[11], v[27]);
1220   u[28] = _mm_sub_epi32(v[12], v[28]);
1221   u[29] = _mm_sub_epi32(v[13], v[29]);
1222   u[30] = _mm_sub_epi32(v[14], v[30]);
1223   u[31] = _mm_sub_epi32(v[15], v[31]);
1224 
1225   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1226   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1227   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1228   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1229   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1230   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1231   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1232   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1233   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1234   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1235   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1236   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1237   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1238   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1239   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1240   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1241   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1242   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1243   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1244   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1245   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1246   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1247   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1248   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1249   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1250   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1251   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1252   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1253   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1254   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1255   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1256   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1257 
1258   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1259   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1260   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1261   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1262   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1263   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1264   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1265   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1266   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1267   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1268   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1269   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1270   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1271   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1272   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1273   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1274   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1275   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1276   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1277   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1278   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1279   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1280   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1281   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1282   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1283   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1284   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1285   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1286   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1287   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1288   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1289   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1290 
1291   s[0] = _mm_packs_epi32(u[0], u[1]);
1292   s[1] = _mm_packs_epi32(u[2], u[3]);
1293   s[2] = _mm_packs_epi32(u[4], u[5]);
1294   s[3] = _mm_packs_epi32(u[6], u[7]);
1295   s[4] = _mm_packs_epi32(u[8], u[9]);
1296   s[5] = _mm_packs_epi32(u[10], u[11]);
1297   s[6] = _mm_packs_epi32(u[12], u[13]);
1298   s[7] = _mm_packs_epi32(u[14], u[15]);
1299   s[8] = _mm_packs_epi32(u[16], u[17]);
1300   s[9] = _mm_packs_epi32(u[18], u[19]);
1301   s[10] = _mm_packs_epi32(u[20], u[21]);
1302   s[11] = _mm_packs_epi32(u[22], u[23]);
1303   s[12] = _mm_packs_epi32(u[24], u[25]);
1304   s[13] = _mm_packs_epi32(u[26], u[27]);
1305   s[14] = _mm_packs_epi32(u[28], u[29]);
1306   s[15] = _mm_packs_epi32(u[30], u[31]);
1307 
1308   // stage 2
1309   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1310   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1311   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1312   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1313   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1314   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1315   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1316   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1317 
1318   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1319   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1320   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1321   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1322   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1323   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1324   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1325   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1326   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1327   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1328   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1329   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1330   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1331   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1332   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1333   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1334 
1335   u[0] = _mm_add_epi32(v[0], v[8]);
1336   u[1] = _mm_add_epi32(v[1], v[9]);
1337   u[2] = _mm_add_epi32(v[2], v[10]);
1338   u[3] = _mm_add_epi32(v[3], v[11]);
1339   u[4] = _mm_add_epi32(v[4], v[12]);
1340   u[5] = _mm_add_epi32(v[5], v[13]);
1341   u[6] = _mm_add_epi32(v[6], v[14]);
1342   u[7] = _mm_add_epi32(v[7], v[15]);
1343   u[8] = _mm_sub_epi32(v[0], v[8]);
1344   u[9] = _mm_sub_epi32(v[1], v[9]);
1345   u[10] = _mm_sub_epi32(v[2], v[10]);
1346   u[11] = _mm_sub_epi32(v[3], v[11]);
1347   u[12] = _mm_sub_epi32(v[4], v[12]);
1348   u[13] = _mm_sub_epi32(v[5], v[13]);
1349   u[14] = _mm_sub_epi32(v[6], v[14]);
1350   u[15] = _mm_sub_epi32(v[7], v[15]);
1351 
1352   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1353   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1354   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1355   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1356   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1357   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1358   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1359   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1360   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1361   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1362   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1363   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1364   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1365   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1366   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1367   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1368 
1369   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1370   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1371   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1372   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1373   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1374   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1375   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1376   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1377   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1378   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1379   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1380   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1381   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1382   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1383   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1384   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1385 
1386   x[0] = _mm_add_epi16(s[0], s[4]);
1387   x[1] = _mm_add_epi16(s[1], s[5]);
1388   x[2] = _mm_add_epi16(s[2], s[6]);
1389   x[3] = _mm_add_epi16(s[3], s[7]);
1390   x[4] = _mm_sub_epi16(s[0], s[4]);
1391   x[5] = _mm_sub_epi16(s[1], s[5]);
1392   x[6] = _mm_sub_epi16(s[2], s[6]);
1393   x[7] = _mm_sub_epi16(s[3], s[7]);
1394   x[8] = _mm_packs_epi32(u[0], u[1]);
1395   x[9] = _mm_packs_epi32(u[2], u[3]);
1396   x[10] = _mm_packs_epi32(u[4], u[5]);
1397   x[11] = _mm_packs_epi32(u[6], u[7]);
1398   x[12] = _mm_packs_epi32(u[8], u[9]);
1399   x[13] = _mm_packs_epi32(u[10], u[11]);
1400   x[14] = _mm_packs_epi32(u[12], u[13]);
1401   x[15] = _mm_packs_epi32(u[14], u[15]);
1402 
1403   // stage 3
1404   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1405   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1406   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1407   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1408   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1409   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1410   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1411   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1412 
1413   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1414   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1415   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1416   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1417   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1418   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1419   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1420   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1421   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1422   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1423   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1424   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1425   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1426   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1427   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1428   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1429 
1430   u[0] = _mm_add_epi32(v[0], v[4]);
1431   u[1] = _mm_add_epi32(v[1], v[5]);
1432   u[2] = _mm_add_epi32(v[2], v[6]);
1433   u[3] = _mm_add_epi32(v[3], v[7]);
1434   u[4] = _mm_sub_epi32(v[0], v[4]);
1435   u[5] = _mm_sub_epi32(v[1], v[5]);
1436   u[6] = _mm_sub_epi32(v[2], v[6]);
1437   u[7] = _mm_sub_epi32(v[3], v[7]);
1438   u[8] = _mm_add_epi32(v[8], v[12]);
1439   u[9] = _mm_add_epi32(v[9], v[13]);
1440   u[10] = _mm_add_epi32(v[10], v[14]);
1441   u[11] = _mm_add_epi32(v[11], v[15]);
1442   u[12] = _mm_sub_epi32(v[8], v[12]);
1443   u[13] = _mm_sub_epi32(v[9], v[13]);
1444   u[14] = _mm_sub_epi32(v[10], v[14]);
1445   u[15] = _mm_sub_epi32(v[11], v[15]);
1446 
1447   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1448   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1449   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1450   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1451   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1452   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1453   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1454   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1455   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1456   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1457   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1458   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1459   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1460   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1461   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1462   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1463 
1464   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1465   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1466   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1467   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1468   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1469   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1470   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1471   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1472   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1473   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1474   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1475   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1476   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1477   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1478   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1479   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1480 
1481   s[0] = _mm_add_epi16(x[0], x[2]);
1482   s[1] = _mm_add_epi16(x[1], x[3]);
1483   s[2] = _mm_sub_epi16(x[0], x[2]);
1484   s[3] = _mm_sub_epi16(x[1], x[3]);
1485   s[4] = _mm_packs_epi32(v[0], v[1]);
1486   s[5] = _mm_packs_epi32(v[2], v[3]);
1487   s[6] = _mm_packs_epi32(v[4], v[5]);
1488   s[7] = _mm_packs_epi32(v[6], v[7]);
1489   s[8] = _mm_add_epi16(x[8], x[10]);
1490   s[9] = _mm_add_epi16(x[9], x[11]);
1491   s[10] = _mm_sub_epi16(x[8], x[10]);
1492   s[11] = _mm_sub_epi16(x[9], x[11]);
1493   s[12] = _mm_packs_epi32(v[8], v[9]);
1494   s[13] = _mm_packs_epi32(v[10], v[11]);
1495   s[14] = _mm_packs_epi32(v[12], v[13]);
1496   s[15] = _mm_packs_epi32(v[14], v[15]);
1497 
1498   // stage 4
1499   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1500   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1501   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1502   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1503   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1504   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1505   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1506   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1507 
1508   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1509   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1510   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1511   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1512   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1513   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1514   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1515   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1516   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1517   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1518   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1519   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1520   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1521   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1522   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1523   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1524 
1525   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1526   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1527   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1528   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1529   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1530   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1531   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1532   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1533   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1534   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1535   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1536   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1537   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1538   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1539   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1540   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1541 
1542   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1543   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1544   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1545   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1546   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1547   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1548   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1549   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1550   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1551   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1552   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1553   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1554   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1555   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1556   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1557   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1558 
1559   in[0] = s[0];
1560   in[1] = _mm_sub_epi16(kZero, s[8]);
1561   in[2] = s[12];
1562   in[3] = _mm_sub_epi16(kZero, s[4]);
1563   in[4] = _mm_packs_epi32(v[4], v[5]);
1564   in[5] = _mm_packs_epi32(v[12], v[13]);
1565   in[6] = _mm_packs_epi32(v[8], v[9]);
1566   in[7] = _mm_packs_epi32(v[0], v[1]);
1567   in[8] = _mm_packs_epi32(v[2], v[3]);
1568   in[9] = _mm_packs_epi32(v[10], v[11]);
1569   in[10] = _mm_packs_epi32(v[14], v[15]);
1570   in[11] = _mm_packs_epi32(v[6], v[7]);
1571   in[12] = s[5];
1572   in[13] = _mm_sub_epi16(kZero, s[13]);
1573   in[14] = s[9];
1574   in[15] = _mm_sub_epi16(kZero, s[1]);
1575 }
1576 
idct16_8col(__m128i * in)1577 static void idct16_8col(__m128i *in) {
1578   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1579   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1580   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1581   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1582   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1583   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1584   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1585   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1586   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1587   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1588   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1589   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1590   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1591   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1592   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1593   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1594   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1595   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1596   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1597   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1598   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1599   __m128i v[16], u[16], s[16], t[16];
1600 
1601   // stage 1
1602   s[0] = in[0];
1603   s[1] = in[8];
1604   s[2] = in[4];
1605   s[3] = in[12];
1606   s[4] = in[2];
1607   s[5] = in[10];
1608   s[6] = in[6];
1609   s[7] = in[14];
1610   s[8] = in[1];
1611   s[9] = in[9];
1612   s[10] = in[5];
1613   s[11] = in[13];
1614   s[12] = in[3];
1615   s[13] = in[11];
1616   s[14] = in[7];
1617   s[15] = in[15];
1618 
1619   // stage 2
1620   u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1621   u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1622   u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1623   u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1624   u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1625   u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1626   u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1627   u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1628 
1629   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1630   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1631   v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1632   v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1633   v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1634   v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1635   v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1636   v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1637   v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1638   v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1639   v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1640   v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1641   v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1642   v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1643   v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1644   v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1645 
1646   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1647   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1648   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1649   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1650   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1651   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1652   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1653   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1654   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1655   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1656   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1657   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1658   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1659   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1660   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1661   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1662 
1663   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1664   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1665   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1666   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1667   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1668   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1669   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1670   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1671   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1672   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1673   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1674   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1675   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1676   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1677   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1678   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1679 
1680   s[8] = _mm_packs_epi32(u[0], u[1]);
1681   s[15] = _mm_packs_epi32(u[2], u[3]);
1682   s[9] = _mm_packs_epi32(u[4], u[5]);
1683   s[14] = _mm_packs_epi32(u[6], u[7]);
1684   s[10] = _mm_packs_epi32(u[8], u[9]);
1685   s[13] = _mm_packs_epi32(u[10], u[11]);
1686   s[11] = _mm_packs_epi32(u[12], u[13]);
1687   s[12] = _mm_packs_epi32(u[14], u[15]);
1688 
1689   // stage 3
1690   t[0] = s[0];
1691   t[1] = s[1];
1692   t[2] = s[2];
1693   t[3] = s[3];
1694   u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1695   u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1696   u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1697   u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1698 
1699   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1700   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1701   v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1702   v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1703   v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1704   v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1705   v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1706   v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1707 
1708   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1709   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1710   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1711   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1712   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1713   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1714   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1715   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1716 
1717   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1718   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1719   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1720   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1721   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1722   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1723   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1724   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1725 
1726   t[4] = _mm_packs_epi32(u[0], u[1]);
1727   t[7] = _mm_packs_epi32(u[2], u[3]);
1728   t[5] = _mm_packs_epi32(u[4], u[5]);
1729   t[6] = _mm_packs_epi32(u[6], u[7]);
1730   t[8] = _mm_add_epi16(s[8], s[9]);
1731   t[9] = _mm_sub_epi16(s[8], s[9]);
1732   t[10] = _mm_sub_epi16(s[11], s[10]);
1733   t[11] = _mm_add_epi16(s[10], s[11]);
1734   t[12] = _mm_add_epi16(s[12], s[13]);
1735   t[13] = _mm_sub_epi16(s[12], s[13]);
1736   t[14] = _mm_sub_epi16(s[15], s[14]);
1737   t[15] = _mm_add_epi16(s[14], s[15]);
1738 
1739   // stage 4
1740   u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1741   u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1742   u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1743   u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1744   u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1745   u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1746   u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1747   u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1748 
1749   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1750   v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1751   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1752   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1753   v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1754   v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1755   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1756   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1757   v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1758   v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1759   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1760   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1761   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1762   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1763   v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1764   v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1765 
1766   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1767   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1768   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1769   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1770   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1771   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1772   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1773   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1774   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1775   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1776   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1777   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1778   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1779   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1780   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1781   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1782 
1783   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1784   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1785   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1786   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1787   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1788   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1789   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1790   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1791   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1792   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1793   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1794   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1795   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1796   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1797   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1798   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1799 
1800   s[0] = _mm_packs_epi32(u[0], u[1]);
1801   s[1] = _mm_packs_epi32(u[2], u[3]);
1802   s[2] = _mm_packs_epi32(u[4], u[5]);
1803   s[3] = _mm_packs_epi32(u[6], u[7]);
1804   s[4] = _mm_add_epi16(t[4], t[5]);
1805   s[5] = _mm_sub_epi16(t[4], t[5]);
1806   s[6] = _mm_sub_epi16(t[7], t[6]);
1807   s[7] = _mm_add_epi16(t[6], t[7]);
1808   s[8] = t[8];
1809   s[15] = t[15];
1810   s[9] = _mm_packs_epi32(u[8], u[9]);
1811   s[14] = _mm_packs_epi32(u[10], u[11]);
1812   s[10] = _mm_packs_epi32(u[12], u[13]);
1813   s[13] = _mm_packs_epi32(u[14], u[15]);
1814   s[11] = t[11];
1815   s[12] = t[12];
1816 
1817   // stage 5
1818   t[0] = _mm_add_epi16(s[0], s[3]);
1819   t[1] = _mm_add_epi16(s[1], s[2]);
1820   t[2] = _mm_sub_epi16(s[1], s[2]);
1821   t[3] = _mm_sub_epi16(s[0], s[3]);
1822   t[4] = s[4];
1823   t[7] = s[7];
1824 
1825   u[0] = _mm_unpacklo_epi16(s[5], s[6]);
1826   u[1] = _mm_unpackhi_epi16(s[5], s[6]);
1827   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1828   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1829   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1830   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1831   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1832   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1833   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1834   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1835   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1836   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1837   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1838   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1839   t[5] = _mm_packs_epi32(u[0], u[1]);
1840   t[6] = _mm_packs_epi32(u[2], u[3]);
1841 
1842   t[8] = _mm_add_epi16(s[8], s[11]);
1843   t[9] = _mm_add_epi16(s[9], s[10]);
1844   t[10] = _mm_sub_epi16(s[9], s[10]);
1845   t[11] = _mm_sub_epi16(s[8], s[11]);
1846   t[12] = _mm_sub_epi16(s[15], s[12]);
1847   t[13] = _mm_sub_epi16(s[14], s[13]);
1848   t[14] = _mm_add_epi16(s[13], s[14]);
1849   t[15] = _mm_add_epi16(s[12], s[15]);
1850 
1851   // stage 6
1852   s[0] = _mm_add_epi16(t[0], t[7]);
1853   s[1] = _mm_add_epi16(t[1], t[6]);
1854   s[2] = _mm_add_epi16(t[2], t[5]);
1855   s[3] = _mm_add_epi16(t[3], t[4]);
1856   s[4] = _mm_sub_epi16(t[3], t[4]);
1857   s[5] = _mm_sub_epi16(t[2], t[5]);
1858   s[6] = _mm_sub_epi16(t[1], t[6]);
1859   s[7] = _mm_sub_epi16(t[0], t[7]);
1860   s[8] = t[8];
1861   s[9] = t[9];
1862 
1863   u[0] = _mm_unpacklo_epi16(t[10], t[13]);
1864   u[1] = _mm_unpackhi_epi16(t[10], t[13]);
1865   u[2] = _mm_unpacklo_epi16(t[11], t[12]);
1866   u[3] = _mm_unpackhi_epi16(t[11], t[12]);
1867 
1868   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1869   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1870   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1871   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1872   v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1873   v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1874   v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1875   v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1876 
1877   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1878   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1879   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1880   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1881   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1882   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1883   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1884   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1885 
1886   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1887   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1888   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1889   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1890   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1891   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1892   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1893   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1894 
1895   s[10] = _mm_packs_epi32(u[0], u[1]);
1896   s[13] = _mm_packs_epi32(u[2], u[3]);
1897   s[11] = _mm_packs_epi32(u[4], u[5]);
1898   s[12] = _mm_packs_epi32(u[6], u[7]);
1899   s[14] = t[14];
1900   s[15] = t[15];
1901 
1902   // stage 7
1903   in[0] = _mm_add_epi16(s[0], s[15]);
1904   in[1] = _mm_add_epi16(s[1], s[14]);
1905   in[2] = _mm_add_epi16(s[2], s[13]);
1906   in[3] = _mm_add_epi16(s[3], s[12]);
1907   in[4] = _mm_add_epi16(s[4], s[11]);
1908   in[5] = _mm_add_epi16(s[5], s[10]);
1909   in[6] = _mm_add_epi16(s[6], s[9]);
1910   in[7] = _mm_add_epi16(s[7], s[8]);
1911   in[8] = _mm_sub_epi16(s[7], s[8]);
1912   in[9] = _mm_sub_epi16(s[6], s[9]);
1913   in[10] = _mm_sub_epi16(s[5], s[10]);
1914   in[11] = _mm_sub_epi16(s[4], s[11]);
1915   in[12] = _mm_sub_epi16(s[3], s[12]);
1916   in[13] = _mm_sub_epi16(s[2], s[13]);
1917   in[14] = _mm_sub_epi16(s[1], s[14]);
1918   in[15] = _mm_sub_epi16(s[0], s[15]);
1919 }
1920 
// One 1-D pass of the 16x16 inverse DCT, performed in place.
//
// The block is passed as two register arrays, in0 and in1. They are first
// transposed together by array_transpose_16x16, then idct16_8col is applied
// to each array in turn.
// NOTE(review): each array presumably holds 16 __m128i rows of eight 16-bit
// values (one 8-column half of the block) -- confirm against idct16_8col.
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}
1926 
// One 1-D pass of the 16x16 inverse ADST, performed in place.
//
// Mirrors idct16_sse2: in0 and in1 are transposed together by
// array_transpose_16x16, then iadst16_8col is applied to each array in turn.
// NOTE(review): each array presumably holds 16 __m128i rows of eight 16-bit
// values (one 8-column half of the block) -- confirm against iadst16_8col.
void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}
1932 
vpx_idct16x16_10_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1933 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
1934                                int stride) {
1935   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1936   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1937   const __m128i zero = _mm_setzero_si128();
1938 
1939   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1940   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1941   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1942   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1943 
1944   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1945   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1946 
1947   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1948   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1949   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1950   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1951   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1952   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1953 
1954   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1955   __m128i in[16], l[16];
1956   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
1957       stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
1958       stp1_12_0;
1959   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1960       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
1961   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1962   int i;
1963   // First 1-D inverse DCT
1964   // Load input data.
1965   in[0] = load_input_data(input);
1966   in[1] = load_input_data(input + 8 * 2);
1967   in[2] = load_input_data(input + 8 * 4);
1968   in[3] = load_input_data(input + 8 * 6);
1969 
1970   TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
1971 
1972   // Stage2
1973   {
1974     const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
1975     const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
1976 
1977     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
1978     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
1979     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
1980     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
1981 
1982     tmp0 = _mm_add_epi32(tmp0, rounding);
1983     tmp2 = _mm_add_epi32(tmp2, rounding);
1984     tmp5 = _mm_add_epi32(tmp5, rounding);
1985     tmp7 = _mm_add_epi32(tmp7, rounding);
1986 
1987     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1988     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1989     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
1990     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
1991 
1992     stp2_8 = _mm_packs_epi32(tmp0, tmp2);
1993     stp2_11 = _mm_packs_epi32(tmp5, tmp7);
1994   }
1995 
1996   // Stage3
1997   {
1998     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
1999 
2000     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2001     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2002 
2003     tmp0 = _mm_add_epi32(tmp0, rounding);
2004     tmp2 = _mm_add_epi32(tmp2, rounding);
2005     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2006     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2007 
2008     stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2009     stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2010 
2011     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2012   }
2013 
2014   // Stage4
2015   {
2016     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2017     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2018     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2019 
2020     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2021     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2022     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2023     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2024     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2025     tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2026 
2027     tmp0 = _mm_add_epi32(tmp0, rounding);
2028     tmp2 = _mm_add_epi32(tmp2, rounding);
2029     tmp1 = _mm_add_epi32(tmp1, rounding);
2030     tmp3 = _mm_add_epi32(tmp3, rounding);
2031     tmp5 = _mm_add_epi32(tmp5, rounding);
2032     tmp7 = _mm_add_epi32(tmp7, rounding);
2033 
2034     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2035     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2036     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2037     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2038     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2039     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2040 
2041     stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2042     stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2043     stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2044     stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2045 
2046     stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2047   }
2048 
2049   // Stage5 and Stage6
2050   {
2051     tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2052     tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2053     tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2054     tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2055 
2056     stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
2057     stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2058     stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
2059     stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2060 
2061     stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2062     stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2063     stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2064     stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2065   }
2066 
2067   // Stage6
2068   {
2069     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2070     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2071     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2072 
2073     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2074     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2075     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2076     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2077     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2078     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2079 
2080     tmp1 = _mm_add_epi32(tmp1, rounding);
2081     tmp3 = _mm_add_epi32(tmp3, rounding);
2082     tmp0 = _mm_add_epi32(tmp0, rounding);
2083     tmp2 = _mm_add_epi32(tmp2, rounding);
2084     tmp4 = _mm_add_epi32(tmp4, rounding);
2085     tmp6 = _mm_add_epi32(tmp6, rounding);
2086 
2087     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2088     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2089     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2090     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2091     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2092     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2093 
2094     stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2095 
2096     stp2_10 = _mm_packs_epi32(tmp0, zero);
2097     stp2_13 = _mm_packs_epi32(tmp2, zero);
2098     stp2_11 = _mm_packs_epi32(tmp4, zero);
2099     stp2_12 = _mm_packs_epi32(tmp6, zero);
2100 
2101     tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2102     tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2103     tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2104     tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2105 
2106     stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2107     stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2108     stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2109     stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2110     stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2111     stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2112     stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2113     stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2114   }
2115 
2116   // Stage7. Left 8x16 only.
2117   l[0] = _mm_add_epi16(stp2_0, stp1_15);
2118   l[1] = _mm_add_epi16(stp2_1, stp1_14);
2119   l[2] = _mm_add_epi16(stp2_2, stp2_13);
2120   l[3] = _mm_add_epi16(stp2_3, stp2_12);
2121   l[4] = _mm_add_epi16(stp2_4, stp2_11);
2122   l[5] = _mm_add_epi16(stp2_5, stp2_10);
2123   l[6] = _mm_add_epi16(stp2_6, stp1_9);
2124   l[7] = _mm_add_epi16(stp2_7, stp1_8);
2125   l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2126   l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2127   l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2128   l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2129   l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2130   l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2131   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2132   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2133 
2134   // Second 1-D inverse transform, performed per 8x16 block
2135   for (i = 0; i < 2; i++) {
2136     int j;
2137     array_transpose_4X8(l + 8 * i, in);
2138 
2139     IDCT16_10
2140 
2141     // Stage7
2142     in[0] = _mm_add_epi16(stp2_0, stp1_15);
2143     in[1] = _mm_add_epi16(stp2_1, stp1_14);
2144     in[2] = _mm_add_epi16(stp2_2, stp2_13);
2145     in[3] = _mm_add_epi16(stp2_3, stp2_12);
2146     in[4] = _mm_add_epi16(stp2_4, stp2_11);
2147     in[5] = _mm_add_epi16(stp2_5, stp2_10);
2148     in[6] = _mm_add_epi16(stp2_6, stp1_9);
2149     in[7] = _mm_add_epi16(stp2_7, stp1_8);
2150     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2151     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2152     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2153     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2154     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2155     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2156     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2157     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2158 
2159     for (j = 0; j < 16; ++j) {
2160       // Final rounding and shift
2161       in[j] = _mm_adds_epi16(in[j], final_rounding);
2162       in[j] = _mm_srai_epi16(in[j], 6);
2163       RECON_AND_STORE(dest + j * stride, in[j]);
2164     }
2165 
2166     dest += 8;
2167   }
2168 }
2169 
// Loads one register's worth of coefficients from `input` into `reg` via
// load_input_data(), then advances `input` by 8 elements.
// Side effect: `input` is modified, so it must be a modifiable pointer
// lvalue, and it is evaluated more than once -- pass a plain variable.
#define LOAD_DQCOEFF(reg, input)  \
  {                               \
    reg = load_input_data(input); \
    input += 8;                   \
  }
2175 
2176 #define IDCT32_34                                                              \
2177   /* Stage1 */                                                                 \
2178   {                                                                            \
2179     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
2180     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
2181                                                                                \
2182     const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
2183     const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
2184                                                                                \
2185     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
2186     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
2187                                                                                \
2188     const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
2189     const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
2190                                                                                \
2191     MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
2192                              stp1_31);                                         \
2193     MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
2194                              stp1_28);                                         \
2195     MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
2196                              stp1_27);                                         \
2197     MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
2198                              stp1_24);                                         \
2199   }                                                                            \
2200                                                                                \
2201   /* Stage2 */                                                                 \
2202   {                                                                            \
2203     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
2204     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
2205                                                                                \
2206     const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
2207     const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
2208                                                                                \
2209     MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
2210                              stp2_15);                                         \
2211     MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
2212                              stp2_12);                                         \
2213                                                                                \
2214     stp2_16 = stp1_16;                                                         \
2215     stp2_19 = stp1_19;                                                         \
2216                                                                                \
2217     stp2_20 = stp1_20;                                                         \
2218     stp2_23 = stp1_23;                                                         \
2219                                                                                \
2220     stp2_24 = stp1_24;                                                         \
2221     stp2_27 = stp1_27;                                                         \
2222                                                                                \
2223     stp2_28 = stp1_28;                                                         \
2224     stp2_31 = stp1_31;                                                         \
2225   }                                                                            \
2226                                                                                \
2227   /* Stage3 */                                                                 \
2228   {                                                                            \
2229     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
2230     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
2231                                                                                \
2232     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
2233     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
2234     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
2235     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
2236                                                                                \
2237     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
2238     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
2239     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
2240     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24);             \
2241                                                                                \
2242     MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
2243                              stp1_7);                                          \
2244                                                                                \
2245     stp1_8 = stp2_8;                                                           \
2246     stp1_11 = stp2_11;                                                         \
2247     stp1_12 = stp2_12;                                                         \
2248     stp1_15 = stp2_15;                                                         \
2249                                                                                \
2250     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
2251                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
2252                            stp1_29)                                            \
2253     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
2254                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2255                            stp1_25)                                            \
2256                                                                                \
2257     stp1_16 = stp2_16;                                                         \
2258     stp1_31 = stp2_31;                                                         \
2259     stp1_19 = stp2_19;                                                         \
2260     stp1_20 = stp2_20;                                                         \
2261     stp1_23 = stp2_23;                                                         \
2262     stp1_24 = stp2_24;                                                         \
2263     stp1_27 = stp2_27;                                                         \
2264     stp1_28 = stp2_28;                                                         \
2265   }                                                                            \
2266                                                                                \
2267   /* Stage4 */                                                                 \
2268   {                                                                            \
2269     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
2270     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
2271                                                                                \
2272     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
2273     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
2274     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
2275     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
2276                                                                                \
2277     MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
2278                              stp2_1);                                          \
2279                                                                                \
2280     stp2_4 = stp1_4;                                                           \
2281     stp2_5 = stp1_4;                                                           \
2282     stp2_6 = stp1_7;                                                           \
2283     stp2_7 = stp1_7;                                                           \
2284                                                                                \
2285     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
2286                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
2287                            stp2_13)                                            \
2288                                                                                \
2289     stp2_8 = stp1_8;                                                           \
2290     stp2_15 = stp1_15;                                                         \
2291     stp2_11 = stp1_11;                                                         \
2292     stp2_12 = stp1_12;                                                         \
2293                                                                                \
2294     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
2295     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
2296     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
2297     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
2298     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
2299     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
2300     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
2301     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
2302                                                                                \
2303     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
2304     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
2305     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
2306     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
2307     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
2308     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
2309     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
2310     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
2311   }                                                                            \
2312                                                                                \
2313   /* Stage5 */                                                                 \
2314   {                                                                            \
2315     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
2316     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
2317     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
2318     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
2319                                                                                \
2320     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
2321     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
2322     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
2323     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
2324                                                                                \
2325     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2326     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2327                                                                                \
2328     stp1_0 = stp2_0;                                                           \
2329     stp1_1 = stp2_1;                                                           \
2330     stp1_2 = stp2_1;                                                           \
2331     stp1_3 = stp2_0;                                                           \
2332                                                                                \
2333     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
2334     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
2335     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
2336     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
2337                                                                                \
2338     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
2339     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
2340     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
2341     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
2342                                                                                \
2343     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
2344     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
2345     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
2346     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
2347                                                                                \
2348     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
2349     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
2350                                                                                \
2351     stp1_4 = stp2_4;                                                           \
2352     stp1_7 = stp2_7;                                                           \
2353                                                                                \
2354     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
2355     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
2356     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
2357     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
2358     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
2359     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
2360     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
2361     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
2362                                                                                \
2363     stp1_16 = stp2_16;                                                         \
2364     stp1_17 = stp2_17;                                                         \
2365                                                                                \
2366     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
2367                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
2368                            stp1_28)                                            \
2369     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
2370                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
2371                            stp1_26)                                            \
2372                                                                                \
2373     stp1_22 = stp2_22;                                                         \
2374     stp1_23 = stp2_23;                                                         \
2375     stp1_24 = stp2_24;                                                         \
2376     stp1_25 = stp2_25;                                                         \
2377     stp1_30 = stp2_30;                                                         \
2378     stp1_31 = stp2_31;                                                         \
2379   }                                                                            \
2380                                                                                \
2381   /* Stage6 */                                                                 \
2382   {                                                                            \
2383     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
2384     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
2385     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
2386     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
2387                                                                                \
2388     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
2389     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
2390     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
2391     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
2392     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
2393     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
2394     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
2395     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
2396                                                                                \
2397     stp2_8 = stp1_8;                                                           \
2398     stp2_9 = stp1_9;                                                           \
2399     stp2_14 = stp1_14;                                                         \
2400     stp2_15 = stp1_15;                                                         \
2401                                                                                \
2402     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
2403                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
2404                            stp2_12)                                            \
2405                                                                                \
2406     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
2407     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
2408     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
2409     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
2410     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
2411     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
2412     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
2413     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
2414                                                                                \
2415     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
2416     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
2417     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
2418     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
2419     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
2420     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
2421     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
2422     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
2423   }                                                                            \
2424                                                                                \
2425   /* Stage7 */                                                                 \
2426   {                                                                            \
2427     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
2428     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
2429     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2430     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2431                                                                                \
2432     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
2433     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
2434     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
2435     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
2436                                                                                \
2437     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
2438     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
2439     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
2440     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
2441     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
2442     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
2443     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
2444     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
2445     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
2446     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
2447     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
2448     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
2449     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
2450     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
2451     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
2452     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
2453                                                                                \
2454     stp1_16 = stp2_16;                                                         \
2455     stp1_17 = stp2_17;                                                         \
2456     stp1_18 = stp2_18;                                                         \
2457     stp1_19 = stp2_19;                                                         \
2458                                                                                \
2459     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
2460                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
2461                            stp1_26)                                            \
2462     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
2463                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
2464                            stp1_24)                                            \
2465                                                                                \
2466     stp1_28 = stp2_28;                                                         \
2467     stp1_29 = stp2_29;                                                         \
2468     stp1_30 = stp2_30;                                                         \
2469     stp1_31 = stp2_31;                                                         \
2470   }
2471 
2472 #define IDCT32                                                                 \
2473   /* Stage1 */                                                                 \
2474   {                                                                            \
2475     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);                 \
2476     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);                 \
2477     const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);               \
2478     const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);               \
2479                                                                                \
2480     const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);                 \
2481     const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);                 \
2482     const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);                 \
2483     const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);                 \
2484                                                                                \
2485     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);                 \
2486     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);                 \
2487     const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);               \
2488     const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);               \
2489                                                                                \
2490     const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);               \
2491     const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);               \
2492     const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);                 \
2493     const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);                 \
2494                                                                                \
2495     MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
2496                            stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
2497                            stp1_30)                                            \
2498     MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
2499                            stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
2500     MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
2501                            stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
2502                            stp1_21, stp1_26)                                   \
2503     MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
2504                            stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
2505                            stp1_23, stp1_24)                                   \
2506   }                                                                            \
2507                                                                                \
2508   /* Stage2 */                                                                 \
2509   {                                                                            \
2510     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);                 \
2511     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);                 \
2512     const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);               \
2513     const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);               \
2514                                                                                \
2515     const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);               \
2516     const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);               \
2517     const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);                 \
2518     const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);                 \
2519                                                                                \
2520     MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
2521                            stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
2522                            stp2_14)                                            \
2523     MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
2524                            stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
2525                            stp2_12)                                            \
2526                                                                                \
2527     stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
2528     stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
2529     stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
2530     stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
2531                                                                                \
2532     stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
2533     stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
2534     stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
2535     stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
2536                                                                                \
2537     stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
2538     stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
2539     stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
2540     stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
2541                                                                                \
2542     stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
2543     stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
2544     stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
2545     stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
2546   }                                                                            \
2547                                                                                \
2548   /* Stage3 */                                                                 \
2549   {                                                                            \
2550     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);                 \
2551     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);                 \
2552     const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);               \
2553     const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);               \
2554                                                                                \
2555     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
2556     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
2557     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
2558     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
2559                                                                                \
2560     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2561     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2562     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
2563     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
2564                                                                                \
2565     MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
2566                            stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
2567                            stp1_6)                                             \
2568                                                                                \
2569     stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
2570     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
2571     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
2572     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
2573     stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
2574     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
2575     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
2576     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
2577                                                                                \
2578     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
2579                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
2580                            stp1_29)                                            \
2581     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
2582                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2583                            stp1_25)                                            \
2584                                                                                \
2585     stp1_16 = stp2_16;                                                         \
2586     stp1_31 = stp2_31;                                                         \
2587     stp1_19 = stp2_19;                                                         \
2588     stp1_20 = stp2_20;                                                         \
2589     stp1_23 = stp2_23;                                                         \
2590     stp1_24 = stp2_24;                                                         \
2591     stp1_27 = stp2_27;                                                         \
2592     stp1_28 = stp2_28;                                                         \
2593   }                                                                            \
2594                                                                                \
2595   /* Stage4 */                                                                 \
2596   {                                                                            \
2597     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);                 \
2598     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);                 \
2599     const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);                 \
2600     const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);                 \
2601                                                                                \
2602     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
2603     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
2604     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
2605     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
2606                                                                                \
2607     MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
2608                            stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
2609                                                                                \
2610     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
2611     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
2612     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
2613     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
2614                                                                                \
2615     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
2616                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
2617                            stp2_13)                                            \
2618                                                                                \
2619     stp2_8 = stp1_8;                                                           \
2620     stp2_15 = stp1_15;                                                         \
2621     stp2_11 = stp1_11;                                                         \
2622     stp2_12 = stp1_12;                                                         \
2623                                                                                \
2624     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
2625     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
2626     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
2627     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
2628     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
2629     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
2630     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
2631     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
2632                                                                                \
2633     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
2634     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
2635     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
2636     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
2637     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
2638     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
2639     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
2640     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
2641   }                                                                            \
2642                                                                                \
2643   /* Stage5 */                                                                 \
2644   {                                                                            \
2645     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
2646     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
2647     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
2648     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
2649                                                                                \
2650     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
2651     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
2652     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
2653     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
2654                                                                                \
2655     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2656     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2657                                                                                \
2658     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
2659     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
2660     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
2661     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
2662                                                                                \
2663     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
2664     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
2665     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
2666     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
2667                                                                                \
2668     tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
2669     tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
2670     tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
2671     tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
2672                                                                                \
2673     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
2674     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
2675     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
2676     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
2677                                                                                \
2678     stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
2679     stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
2680                                                                                \
2681     stp1_4 = stp2_4;                                                           \
2682     stp1_7 = stp2_7;                                                           \
2683                                                                                \
2684     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
2685     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
2686     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
2687     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
2688     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
2689     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
2690     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
2691     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
2692                                                                                \
2693     stp1_16 = stp2_16;                                                         \
2694     stp1_17 = stp2_17;                                                         \
2695                                                                                \
2696     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
2697                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
2698                            stp1_28)                                            \
2699     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
2700                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
2701                            stp1_26)                                            \
2702                                                                                \
2703     stp1_22 = stp2_22;                                                         \
2704     stp1_23 = stp2_23;                                                         \
2705     stp1_24 = stp2_24;                                                         \
2706     stp1_25 = stp2_25;                                                         \
2707     stp1_30 = stp2_30;                                                         \
2708     stp1_31 = stp2_31;                                                         \
2709   }                                                                            \
2710                                                                                \
2711   /* Stage6 */                                                                 \
2712   {                                                                            \
2713     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
2714     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
2715     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
2716     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
2717                                                                                \
2718     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
2719     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
2720     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
2721     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
2722     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
2723     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
2724     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
2725     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
2726                                                                                \
2727     stp2_8 = stp1_8;                                                           \
2728     stp2_9 = stp1_9;                                                           \
2729     stp2_14 = stp1_14;                                                         \
2730     stp2_15 = stp1_15;                                                         \
2731                                                                                \
2732     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
2733                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
2734                            stp2_12)                                            \
2735                                                                                \
2736     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
2737     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
2738     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
2739     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
2740     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
2741     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
2742     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
2743     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
2744                                                                                \
2745     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
2746     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
2747     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
2748     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
2749     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
2750     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
2751     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
2752     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
2753   }                                                                            \
2754                                                                                \
2755   /* Stage7 */                                                                 \
2756   {                                                                            \
2757     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
2758     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
2759     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2760     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2761                                                                                \
2762     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
2763     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
2764     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
2765     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
2766                                                                                \
2767     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
2768     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
2769     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
2770     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
2771     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
2772     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
2773     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
2774     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
2775     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
2776     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
2777     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
2778     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
2779     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
2780     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
2781     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
2782     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
2783                                                                                \
2784     stp1_16 = stp2_16;                                                         \
2785     stp1_17 = stp2_17;                                                         \
2786     stp1_18 = stp2_18;                                                         \
2787     stp1_19 = stp2_19;                                                         \
2788                                                                                \
2789     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
2790                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
2791                            stp1_26)                                            \
2792     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
2793                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
2794                            stp1_24)                                            \
2795                                                                                \
2796     stp1_28 = stp2_28;                                                         \
2797     stp1_29 = stp2_29;                                                         \
2798     stp1_30 = stp2_30;                                                         \
2799     stp1_31 = stp2_31;                                                         \
2800   }
2801 
2802 // Only upper-left 8x8 has non-zero coeff
vpx_idct32x32_34_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)2803 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
2804                                int stride) {
2805   const __m128i zero = _mm_setzero_si128();
2806   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2807   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2808 
2809   // idct constants for each stage
2810   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
2811   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
2812   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
2813   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
2814   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
2815   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
2816   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
2817   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
2818 
2819   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2820   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2821   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2822   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2823 
2824   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2825   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2826   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
2827   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
2828   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
2829   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
2830   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
2831   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
2832 
2833   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2834   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2835   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2836   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2837   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2838 
2839   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2840 
2841   __m128i in[32], col[32];
2842   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2843       stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2844       stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
2845       stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
2846   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2847       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
2848       stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
2849       stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
2850   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2851   int i;
2852 
2853   // Load input data. Only need to load the top left 8x8 block.
2854   in[0] = load_input_data(input);
2855   in[1] = load_input_data(input + 32);
2856   in[2] = load_input_data(input + 64);
2857   in[3] = load_input_data(input + 96);
2858   in[4] = load_input_data(input + 128);
2859   in[5] = load_input_data(input + 160);
2860   in[6] = load_input_data(input + 192);
2861   in[7] = load_input_data(input + 224);
2862 
2863   array_transpose_8x8(in, in);
2864   IDCT32_34
2865 
2866   // 1_D: Store 32 intermediate results for each 8x32 block.
2867   col[0] = _mm_add_epi16(stp1_0, stp1_31);
2868   col[1] = _mm_add_epi16(stp1_1, stp1_30);
2869   col[2] = _mm_add_epi16(stp1_2, stp1_29);
2870   col[3] = _mm_add_epi16(stp1_3, stp1_28);
2871   col[4] = _mm_add_epi16(stp1_4, stp1_27);
2872   col[5] = _mm_add_epi16(stp1_5, stp1_26);
2873   col[6] = _mm_add_epi16(stp1_6, stp1_25);
2874   col[7] = _mm_add_epi16(stp1_7, stp1_24);
2875   col[8] = _mm_add_epi16(stp1_8, stp1_23);
2876   col[9] = _mm_add_epi16(stp1_9, stp1_22);
2877   col[10] = _mm_add_epi16(stp1_10, stp1_21);
2878   col[11] = _mm_add_epi16(stp1_11, stp1_20);
2879   col[12] = _mm_add_epi16(stp1_12, stp1_19);
2880   col[13] = _mm_add_epi16(stp1_13, stp1_18);
2881   col[14] = _mm_add_epi16(stp1_14, stp1_17);
2882   col[15] = _mm_add_epi16(stp1_15, stp1_16);
2883   col[16] = _mm_sub_epi16(stp1_15, stp1_16);
2884   col[17] = _mm_sub_epi16(stp1_14, stp1_17);
2885   col[18] = _mm_sub_epi16(stp1_13, stp1_18);
2886   col[19] = _mm_sub_epi16(stp1_12, stp1_19);
2887   col[20] = _mm_sub_epi16(stp1_11, stp1_20);
2888   col[21] = _mm_sub_epi16(stp1_10, stp1_21);
2889   col[22] = _mm_sub_epi16(stp1_9, stp1_22);
2890   col[23] = _mm_sub_epi16(stp1_8, stp1_23);
2891   col[24] = _mm_sub_epi16(stp1_7, stp1_24);
2892   col[25] = _mm_sub_epi16(stp1_6, stp1_25);
2893   col[26] = _mm_sub_epi16(stp1_5, stp1_26);
2894   col[27] = _mm_sub_epi16(stp1_4, stp1_27);
2895   col[28] = _mm_sub_epi16(stp1_3, stp1_28);
2896   col[29] = _mm_sub_epi16(stp1_2, stp1_29);
2897   col[30] = _mm_sub_epi16(stp1_1, stp1_30);
2898   col[31] = _mm_sub_epi16(stp1_0, stp1_31);
2899   for (i = 0; i < 4; i++) {
2900     int j;
2901     // Transpose 32x8 block to 8x32 block
2902     array_transpose_8x8(col + i * 8, in);
2903     IDCT32_34
2904 
2905     // 2_D: Calculate the results and store them to destination.
2906     in[0] = _mm_add_epi16(stp1_0, stp1_31);
2907     in[1] = _mm_add_epi16(stp1_1, stp1_30);
2908     in[2] = _mm_add_epi16(stp1_2, stp1_29);
2909     in[3] = _mm_add_epi16(stp1_3, stp1_28);
2910     in[4] = _mm_add_epi16(stp1_4, stp1_27);
2911     in[5] = _mm_add_epi16(stp1_5, stp1_26);
2912     in[6] = _mm_add_epi16(stp1_6, stp1_25);
2913     in[7] = _mm_add_epi16(stp1_7, stp1_24);
2914     in[8] = _mm_add_epi16(stp1_8, stp1_23);
2915     in[9] = _mm_add_epi16(stp1_9, stp1_22);
2916     in[10] = _mm_add_epi16(stp1_10, stp1_21);
2917     in[11] = _mm_add_epi16(stp1_11, stp1_20);
2918     in[12] = _mm_add_epi16(stp1_12, stp1_19);
2919     in[13] = _mm_add_epi16(stp1_13, stp1_18);
2920     in[14] = _mm_add_epi16(stp1_14, stp1_17);
2921     in[15] = _mm_add_epi16(stp1_15, stp1_16);
2922     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
2923     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
2924     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
2925     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
2926     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
2927     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
2928     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
2929     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
2930     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
2931     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
2932     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
2933     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
2934     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
2935     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
2936     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
2937     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
2938 
2939     for (j = 0; j < 32; ++j) {
2940       // Final rounding and shift
2941       in[j] = _mm_adds_epi16(in[j], final_rounding);
2942       in[j] = _mm_srai_epi16(in[j], 6);
2943       RECON_AND_STORE(dest + j * stride, in[j]);
2944     }
2945 
2946     dest += 8;
2947   }
2948 }
2949 
vpx_idct32x32_1024_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)2950 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
2951                                  int stride) {
2952   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2953   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2954   const __m128i zero = _mm_setzero_si128();
2955 
2956   // idct constants for each stage
2957   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
2958   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
2959   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
2960   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
2961   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
2962   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
2963   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
2964   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
2965   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
2966   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
2967   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
2968   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
2969   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
2970   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
2971   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
2972   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
2973 
2974   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2975   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2976   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2977   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2978   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2979   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2980   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2981   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2982 
2983   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2984   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2985   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2986   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2987   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
2988   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
2989   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
2990   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
2991   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
2992   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
2993 
2994   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2995   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2996   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2997   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2998   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2999   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3000   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3001 
3002   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3003 
3004   __m128i in[32], col[128], zero_idx[16];
3005   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3006       stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3007       stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
3008       stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
3009   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3010       stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3011       stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
3012       stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
3013   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3014   int i, j, i32;
3015 
3016   for (i = 0; i < 4; i++) {
3017     i32 = (i << 5);
3018     // First 1-D idct
3019     // Load input data.
3020     LOAD_DQCOEFF(in[0], input);
3021     LOAD_DQCOEFF(in[8], input);
3022     LOAD_DQCOEFF(in[16], input);
3023     LOAD_DQCOEFF(in[24], input);
3024     LOAD_DQCOEFF(in[1], input);
3025     LOAD_DQCOEFF(in[9], input);
3026     LOAD_DQCOEFF(in[17], input);
3027     LOAD_DQCOEFF(in[25], input);
3028     LOAD_DQCOEFF(in[2], input);
3029     LOAD_DQCOEFF(in[10], input);
3030     LOAD_DQCOEFF(in[18], input);
3031     LOAD_DQCOEFF(in[26], input);
3032     LOAD_DQCOEFF(in[3], input);
3033     LOAD_DQCOEFF(in[11], input);
3034     LOAD_DQCOEFF(in[19], input);
3035     LOAD_DQCOEFF(in[27], input);
3036 
3037     LOAD_DQCOEFF(in[4], input);
3038     LOAD_DQCOEFF(in[12], input);
3039     LOAD_DQCOEFF(in[20], input);
3040     LOAD_DQCOEFF(in[28], input);
3041     LOAD_DQCOEFF(in[5], input);
3042     LOAD_DQCOEFF(in[13], input);
3043     LOAD_DQCOEFF(in[21], input);
3044     LOAD_DQCOEFF(in[29], input);
3045     LOAD_DQCOEFF(in[6], input);
3046     LOAD_DQCOEFF(in[14], input);
3047     LOAD_DQCOEFF(in[22], input);
3048     LOAD_DQCOEFF(in[30], input);
3049     LOAD_DQCOEFF(in[7], input);
3050     LOAD_DQCOEFF(in[15], input);
3051     LOAD_DQCOEFF(in[23], input);
3052     LOAD_DQCOEFF(in[31], input);
3053 
3054     // checking if all entries are zero
3055     zero_idx[0] = _mm_or_si128(in[0], in[1]);
3056     zero_idx[1] = _mm_or_si128(in[2], in[3]);
3057     zero_idx[2] = _mm_or_si128(in[4], in[5]);
3058     zero_idx[3] = _mm_or_si128(in[6], in[7]);
3059     zero_idx[4] = _mm_or_si128(in[8], in[9]);
3060     zero_idx[5] = _mm_or_si128(in[10], in[11]);
3061     zero_idx[6] = _mm_or_si128(in[12], in[13]);
3062     zero_idx[7] = _mm_or_si128(in[14], in[15]);
3063     zero_idx[8] = _mm_or_si128(in[16], in[17]);
3064     zero_idx[9] = _mm_or_si128(in[18], in[19]);
3065     zero_idx[10] = _mm_or_si128(in[20], in[21]);
3066     zero_idx[11] = _mm_or_si128(in[22], in[23]);
3067     zero_idx[12] = _mm_or_si128(in[24], in[25]);
3068     zero_idx[13] = _mm_or_si128(in[26], in[27]);
3069     zero_idx[14] = _mm_or_si128(in[28], in[29]);
3070     zero_idx[15] = _mm_or_si128(in[30], in[31]);
3071 
3072     zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3073     zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3074     zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3075     zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3076     zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3077     zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3078     zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3079     zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3080 
3081     zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3082     zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3083     zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3084     zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3085     zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3086     zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3087     zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3088 
3089     if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3090       col[i32 + 0] = _mm_setzero_si128();
3091       col[i32 + 1] = _mm_setzero_si128();
3092       col[i32 + 2] = _mm_setzero_si128();
3093       col[i32 + 3] = _mm_setzero_si128();
3094       col[i32 + 4] = _mm_setzero_si128();
3095       col[i32 + 5] = _mm_setzero_si128();
3096       col[i32 + 6] = _mm_setzero_si128();
3097       col[i32 + 7] = _mm_setzero_si128();
3098       col[i32 + 8] = _mm_setzero_si128();
3099       col[i32 + 9] = _mm_setzero_si128();
3100       col[i32 + 10] = _mm_setzero_si128();
3101       col[i32 + 11] = _mm_setzero_si128();
3102       col[i32 + 12] = _mm_setzero_si128();
3103       col[i32 + 13] = _mm_setzero_si128();
3104       col[i32 + 14] = _mm_setzero_si128();
3105       col[i32 + 15] = _mm_setzero_si128();
3106       col[i32 + 16] = _mm_setzero_si128();
3107       col[i32 + 17] = _mm_setzero_si128();
3108       col[i32 + 18] = _mm_setzero_si128();
3109       col[i32 + 19] = _mm_setzero_si128();
3110       col[i32 + 20] = _mm_setzero_si128();
3111       col[i32 + 21] = _mm_setzero_si128();
3112       col[i32 + 22] = _mm_setzero_si128();
3113       col[i32 + 23] = _mm_setzero_si128();
3114       col[i32 + 24] = _mm_setzero_si128();
3115       col[i32 + 25] = _mm_setzero_si128();
3116       col[i32 + 26] = _mm_setzero_si128();
3117       col[i32 + 27] = _mm_setzero_si128();
3118       col[i32 + 28] = _mm_setzero_si128();
3119       col[i32 + 29] = _mm_setzero_si128();
3120       col[i32 + 30] = _mm_setzero_si128();
3121       col[i32 + 31] = _mm_setzero_si128();
3122       continue;
3123     }
3124 
3125     // Transpose 32x8 block to 8x32 block
3126     array_transpose_8x8(in, in);
3127     array_transpose_8x8(in + 8, in + 8);
3128     array_transpose_8x8(in + 16, in + 16);
3129     array_transpose_8x8(in + 24, in + 24);
3130 
3131     IDCT32
3132 
3133     // 1_D: Store 32 intermediate results for each 8x32 block.
3134     col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3135     col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3136     col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3137     col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3138     col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3139     col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3140     col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3141     col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3142     col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3143     col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3144     col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3145     col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3146     col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3147     col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3148     col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3149     col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3150     col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3151     col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3152     col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3153     col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3154     col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3155     col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3156     col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3157     col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3158     col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3159     col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3160     col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3161     col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3162     col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3163     col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3164     col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3165     col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3166   }
3167   for (i = 0; i < 4; i++) {
3168     // Second 1-D idct
3169     j = i << 3;
3170 
3171     // Transpose 32x8 block to 8x32 block
3172     array_transpose_8x8(col + j, in);
3173     array_transpose_8x8(col + j + 32, in + 8);
3174     array_transpose_8x8(col + j + 64, in + 16);
3175     array_transpose_8x8(col + j + 96, in + 24);
3176 
3177     IDCT32
3178 
3179     // 2_D: Calculate the results and store them to destination.
3180     in[0] = _mm_add_epi16(stp1_0, stp1_31);
3181     in[1] = _mm_add_epi16(stp1_1, stp1_30);
3182     in[2] = _mm_add_epi16(stp1_2, stp1_29);
3183     in[3] = _mm_add_epi16(stp1_3, stp1_28);
3184     in[4] = _mm_add_epi16(stp1_4, stp1_27);
3185     in[5] = _mm_add_epi16(stp1_5, stp1_26);
3186     in[6] = _mm_add_epi16(stp1_6, stp1_25);
3187     in[7] = _mm_add_epi16(stp1_7, stp1_24);
3188     in[8] = _mm_add_epi16(stp1_8, stp1_23);
3189     in[9] = _mm_add_epi16(stp1_9, stp1_22);
3190     in[10] = _mm_add_epi16(stp1_10, stp1_21);
3191     in[11] = _mm_add_epi16(stp1_11, stp1_20);
3192     in[12] = _mm_add_epi16(stp1_12, stp1_19);
3193     in[13] = _mm_add_epi16(stp1_13, stp1_18);
3194     in[14] = _mm_add_epi16(stp1_14, stp1_17);
3195     in[15] = _mm_add_epi16(stp1_15, stp1_16);
3196     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3197     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3198     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3199     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3200     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3201     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3202     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3203     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3204     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3205     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3206     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3207     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3208     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3209     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3210     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3211     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3212 
3213     for (j = 0; j < 32; ++j) {
3214       // Final rounding and shift
3215       in[j] = _mm_adds_epi16(in[j], final_rounding);
3216       in[j] = _mm_srai_epi16(in[j], 6);
3217       RECON_AND_STORE(dest + j * stride, in[j]);
3218     }
3219 
3220     dest += 8;
3221   }
3222 }
3223 
vpx_idct32x32_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)3224 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
3225                               int stride) {
3226   __m128i dc_value;
3227   const __m128i zero = _mm_setzero_si128();
3228   int a, j;
3229 
3230   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
3231   a = (int)dct_const_round_shift(a * cospi_16_64);
3232   a = ROUND_POWER_OF_TWO(a, 6);
3233 
3234   dc_value = _mm_set1_epi16(a);
3235 
3236   for (j = 0; j < 32; ++j) {
3237     RECON_AND_STORE(dest + 0 + j * stride, dc_value);
3238     RECON_AND_STORE(dest + 8 + j * stride, dc_value);
3239     RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3240     RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3241   }
3242 }
3243