1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/txfm_common_sse2.h"
14 
15 #define RECON_AND_STORE4X4(dest, in_x) \
16 {                                                     \
17   __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
18   d0 = _mm_unpacklo_epi8(d0, zero); \
19   d0 = _mm_add_epi16(in_x, d0); \
20   d0 = _mm_packus_epi16(d0, d0); \
21   *(int *)(dest) = _mm_cvtsi128_si32(d0); \
22 }
23 
vpx_idct4x4_16_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)24 void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
25                              int stride) {
26   const __m128i zero = _mm_setzero_si128();
27   const __m128i eight = _mm_set1_epi16(8);
28   const __m128i cst = _mm_setr_epi16(
29       (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
30       (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
31       (int16_t)cospi_8_64, (int16_t)cospi_24_64);
32   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
33   __m128i input0, input1, input2, input3;
34 
35   // Rows
36   input0 = load_input_data(input);
37   input2 = load_input_data(input + 8);
38 
39   // Construct i3, i1, i3, i1, i2, i0, i2, i0
40   input0 = _mm_shufflelo_epi16(input0, 0xd8);
41   input0 = _mm_shufflehi_epi16(input0, 0xd8);
42   input2 = _mm_shufflelo_epi16(input2, 0xd8);
43   input2 = _mm_shufflehi_epi16(input2, 0xd8);
44 
45   input1 = _mm_unpackhi_epi32(input0, input0);
46   input0 = _mm_unpacklo_epi32(input0, input0);
47   input3 = _mm_unpackhi_epi32(input2, input2);
48   input2 = _mm_unpacklo_epi32(input2, input2);
49 
50   // Stage 1
51   input0 = _mm_madd_epi16(input0, cst);
52   input1 = _mm_madd_epi16(input1, cst);
53   input2 = _mm_madd_epi16(input2, cst);
54   input3 = _mm_madd_epi16(input3, cst);
55 
56   input0 = _mm_add_epi32(input0, rounding);
57   input1 = _mm_add_epi32(input1, rounding);
58   input2 = _mm_add_epi32(input2, rounding);
59   input3 = _mm_add_epi32(input3, rounding);
60 
61   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
62   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
63   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
64   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
65 
66   // Stage 2
67   input0 = _mm_packs_epi32(input0, input1);
68   input1 = _mm_packs_epi32(input2, input3);
69 
70   // Transpose
71   input2 = _mm_unpacklo_epi16(input0, input1);
72   input3 = _mm_unpackhi_epi16(input0, input1);
73   input0 = _mm_unpacklo_epi32(input2, input3);
74   input1 = _mm_unpackhi_epi32(input2, input3);
75 
76   // Switch column2, column 3, and then, we got:
77   // input2: column1, column 0;  input3: column2, column 3.
78   input1 = _mm_shuffle_epi32(input1, 0x4e);
79   input2 = _mm_add_epi16(input0, input1);
80   input3 = _mm_sub_epi16(input0, input1);
81 
82   // Columns
83   // Construct i3, i1, i3, i1, i2, i0, i2, i0
84   input0 = _mm_unpacklo_epi32(input2, input2);
85   input1 = _mm_unpackhi_epi32(input2, input2);
86   input2 = _mm_unpackhi_epi32(input3, input3);
87   input3 = _mm_unpacklo_epi32(input3, input3);
88 
89   // Stage 1
90   input0 = _mm_madd_epi16(input0, cst);
91   input1 = _mm_madd_epi16(input1, cst);
92   input2 = _mm_madd_epi16(input2, cst);
93   input3 = _mm_madd_epi16(input3, cst);
94 
95   input0 = _mm_add_epi32(input0, rounding);
96   input1 = _mm_add_epi32(input1, rounding);
97   input2 = _mm_add_epi32(input2, rounding);
98   input3 = _mm_add_epi32(input3, rounding);
99 
100   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
101   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
102   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
103   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
104 
105   // Stage 2
106   input0 = _mm_packs_epi32(input0, input2);
107   input1 = _mm_packs_epi32(input1, input3);
108 
109   // Transpose
110   input2 = _mm_unpacklo_epi16(input0, input1);
111   input3 = _mm_unpackhi_epi16(input0, input1);
112   input0 = _mm_unpacklo_epi32(input2, input3);
113   input1 = _mm_unpackhi_epi32(input2, input3);
114 
115   // Switch column2, column 3, and then, we got:
116   // input2: column1, column 0;  input3: column2, column 3.
117   input1 = _mm_shuffle_epi32(input1, 0x4e);
118   input2 = _mm_add_epi16(input0, input1);
119   input3 = _mm_sub_epi16(input0, input1);
120 
121   // Final round and shift
122   input2 = _mm_add_epi16(input2, eight);
123   input3 = _mm_add_epi16(input3, eight);
124 
125   input2 = _mm_srai_epi16(input2, 4);
126   input3 = _mm_srai_epi16(input3, 4);
127 
128   // Reconstruction and Store
129   {
130     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
131     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
132     d0 = _mm_unpacklo_epi32(d0,
133                             _mm_cvtsi32_si128(*(const int *)(dest + stride)));
134     d2 = _mm_unpacklo_epi32(
135         _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
136     d0 = _mm_unpacklo_epi8(d0, zero);
137     d2 = _mm_unpacklo_epi8(d2, zero);
138     d0 = _mm_add_epi16(d0, input2);
139     d2 = _mm_add_epi16(d2, input3);
140     d0 = _mm_packus_epi16(d0, d2);
141     // store input0
142     *(int *)dest = _mm_cvtsi128_si32(d0);
143     // store input1
144     d0 = _mm_srli_si128(d0, 4);
145     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
146     // store input2
147     d0 = _mm_srli_si128(d0, 4);
148     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
149     // store input3
150     d0 = _mm_srli_si128(d0, 4);
151     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
152   }
153 }
154 
vpx_idct4x4_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)155 void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
156                             int stride) {
157   __m128i dc_value;
158   const __m128i zero = _mm_setzero_si128();
159   int a;
160 
161   a = dct_const_round_shift(input[0] * cospi_16_64);
162   a = dct_const_round_shift(a * cospi_16_64);
163   a = ROUND_POWER_OF_TWO(a, 4);
164 
165   dc_value = _mm_set1_epi16(a);
166 
167   RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
168   RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
169   RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
170   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
171 }
172 
transpose_4x4(__m128i * res)173 static INLINE void transpose_4x4(__m128i *res) {
174   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
175   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
176 
177   res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
178   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
179 }
180 
idct4_sse2(__m128i * in)181 void idct4_sse2(__m128i *in) {
182   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
183   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
184   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
185   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
186   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
187   __m128i u[8], v[8];
188 
189   transpose_4x4(in);
190   // stage 1
191   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
192   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
193   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
194   v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
195   v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
196   v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
197 
198   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
199   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
200   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
202 
203   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
204   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
206   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
207 
208   u[0] = _mm_packs_epi32(v[0], v[1]);
209   u[1] = _mm_packs_epi32(v[3], v[2]);
210 
211   // stage 2
212   in[0] = _mm_add_epi16(u[0], u[1]);
213   in[1] = _mm_sub_epi16(u[0], u[1]);
214   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
215 }
216 
iadst4_sse2(__m128i * in)217 void iadst4_sse2(__m128i *in) {
218   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
219   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
220   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
221   const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
222   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
223   const __m128i kZero = _mm_set1_epi16(0);
224   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
225   __m128i u[8], v[8], in7;
226 
227   transpose_4x4(in);
228   in7 = _mm_srli_si128(in[1], 8);
229   in7 = _mm_add_epi16(in7, in[0]);
230   in7 = _mm_sub_epi16(in7, in[1]);
231 
232   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
233   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
234   u[2] = _mm_unpacklo_epi16(in7, kZero);
235   u[3] = _mm_unpackhi_epi16(in[0], kZero);
236 
237   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
238   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
239   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
240   v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
241   v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
242   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
243 
244   u[0] = _mm_add_epi32(v[0], v[1]);
245   u[1] = _mm_add_epi32(v[3], v[4]);
246   u[2] = v[2];
247   u[3] = _mm_add_epi32(u[0], u[1]);
248   u[4] = _mm_slli_epi32(v[5], 2);
249   u[5] = _mm_add_epi32(u[3], v[5]);
250   u[6] = _mm_sub_epi32(u[5], u[4]);
251 
252   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
253   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
254   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
255   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
256 
257   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
258   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
259   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
260   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
261 
262   in[0] = _mm_packs_epi32(u[0], u[1]);
263   in[1] = _mm_packs_epi32(u[2], u[3]);
264 }
265 
266 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
267                       out0, out1, out2, out3, out4, out5, out6, out7) \
268   {                                                     \
269     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
270     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
271     const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
272     const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
273     const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
274     const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
275     const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
276     const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
277                                                         \
278     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
279     const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
280     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
281     const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
282     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
283     const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
284     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
285     const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
286                                                             \
287     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
288     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
289     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
290     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
291     out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
292     out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
293     out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
294     out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
295   }
296 
297 #define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
298                          out0, out1, out2, out3) \
299   {                                              \
300     const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
301     const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
302     const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
303     const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
304     \
305     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
306     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
307     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
308     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
309     \
310     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
311     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
312     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
313     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
314   }
315 
316 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
317   {                                            \
318     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
319     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
320     out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
321     out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
322   }
323 
324 // Define Macro for multiplying elements by constants and adding them together.
325 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
326                                cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
327   {   \
328       tmp0 = _mm_madd_epi16(lo_0, cst0); \
329       tmp1 = _mm_madd_epi16(hi_0, cst0); \
330       tmp2 = _mm_madd_epi16(lo_0, cst1); \
331       tmp3 = _mm_madd_epi16(hi_0, cst1); \
332       tmp4 = _mm_madd_epi16(lo_1, cst2); \
333       tmp5 = _mm_madd_epi16(hi_1, cst2); \
334       tmp6 = _mm_madd_epi16(lo_1, cst3); \
335       tmp7 = _mm_madd_epi16(hi_1, cst3); \
336       \
337       tmp0 = _mm_add_epi32(tmp0, rounding); \
338       tmp1 = _mm_add_epi32(tmp1, rounding); \
339       tmp2 = _mm_add_epi32(tmp2, rounding); \
340       tmp3 = _mm_add_epi32(tmp3, rounding); \
341       tmp4 = _mm_add_epi32(tmp4, rounding); \
342       tmp5 = _mm_add_epi32(tmp5, rounding); \
343       tmp6 = _mm_add_epi32(tmp6, rounding); \
344       tmp7 = _mm_add_epi32(tmp7, rounding); \
345       \
346       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
347       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
348       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
349       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
350       tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
351       tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
352       tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
353       tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
354       \
355       res0 = _mm_packs_epi32(tmp0, tmp1); \
356       res1 = _mm_packs_epi32(tmp2, tmp3); \
357       res2 = _mm_packs_epi32(tmp4, tmp5); \
358       res3 = _mm_packs_epi32(tmp6, tmp7); \
359   }
360 
361 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
362   {   \
363       tmp0 = _mm_madd_epi16(lo_0, cst0); \
364       tmp1 = _mm_madd_epi16(hi_0, cst0); \
365       tmp2 = _mm_madd_epi16(lo_0, cst1); \
366       tmp3 = _mm_madd_epi16(hi_0, cst1); \
367       \
368       tmp0 = _mm_add_epi32(tmp0, rounding); \
369       tmp1 = _mm_add_epi32(tmp1, rounding); \
370       tmp2 = _mm_add_epi32(tmp2, rounding); \
371       tmp3 = _mm_add_epi32(tmp3, rounding); \
372       \
373       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
374       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
375       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
376       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
377       \
378       res0 = _mm_packs_epi32(tmp0, tmp1); \
379       res1 = _mm_packs_epi32(tmp2, tmp3); \
380   }
381 
382 #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
383               out0, out1, out2, out3, out4, out5, out6, out7)  \
384   { \
385   /* Stage1 */      \
386   { \
387     const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
388     const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
389     const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
390     const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
391     \
392     MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
393                           stg1_1, stg1_2, stg1_3, stp1_4,      \
394                           stp1_7, stp1_5, stp1_6)              \
395   } \
396     \
397   /* Stage2 */ \
398   { \
399     const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
400     const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
401     const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
402     const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
403     \
404     MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
405                            stg2_1, stg2_2, stg2_3, stp2_0,     \
406                            stp2_1, stp2_2, stp2_3)             \
407     \
408     stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
409     stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
410     stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
411     stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
412   } \
413     \
414   /* Stage3 */ \
415   { \
416     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
417     const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
418     \
419     stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
420     stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
421     stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
422     stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
423     \
424     tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
425     tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
426     tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
427     tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
428     \
429     tmp0 = _mm_add_epi32(tmp0, rounding); \
430     tmp1 = _mm_add_epi32(tmp1, rounding); \
431     tmp2 = _mm_add_epi32(tmp2, rounding); \
432     tmp3 = _mm_add_epi32(tmp3, rounding); \
433     \
434     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
435     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
436     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
437     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
438     \
439     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
440     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
441   } \
442   \
443   /* Stage4  */ \
444   out0 = _mm_adds_epi16(stp1_0, stp2_7); \
445   out1 = _mm_adds_epi16(stp1_1, stp1_6); \
446   out2 = _mm_adds_epi16(stp1_2, stp1_5); \
447   out3 = _mm_adds_epi16(stp1_3, stp2_4); \
448   out4 = _mm_subs_epi16(stp1_3, stp2_4); \
449   out5 = _mm_subs_epi16(stp1_2, stp1_5); \
450   out6 = _mm_subs_epi16(stp1_1, stp1_6); \
451   out7 = _mm_subs_epi16(stp1_0, stp2_7); \
452   }
453 
vpx_idct8x8_64_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)454 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
455                              int stride) {
456   const __m128i zero = _mm_setzero_si128();
457   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
458   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
459   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
460   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
461   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
462   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
463   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
464   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
465   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
466   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
467 
468   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
469   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
470   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
471   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
472   int i;
473 
474   // Load input data.
475   in0 = load_input_data(input);
476   in1 = load_input_data(input + 8 * 1);
477   in2 = load_input_data(input + 8 * 2);
478   in3 = load_input_data(input + 8 * 3);
479   in4 = load_input_data(input + 8 * 4);
480   in5 = load_input_data(input + 8 * 5);
481   in6 = load_input_data(input + 8 * 6);
482   in7 = load_input_data(input + 8 * 7);
483 
484   // 2-D
485   for (i = 0; i < 2; i++) {
486     // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
487     TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
488                   in0, in1, in2, in3, in4, in5, in6, in7);
489 
490     // 4-stage 1D idct8x8
491     IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
492           in0, in1, in2, in3, in4, in5, in6, in7);
493   }
494 
495   // Final rounding and shift
496   in0 = _mm_adds_epi16(in0, final_rounding);
497   in1 = _mm_adds_epi16(in1, final_rounding);
498   in2 = _mm_adds_epi16(in2, final_rounding);
499   in3 = _mm_adds_epi16(in3, final_rounding);
500   in4 = _mm_adds_epi16(in4, final_rounding);
501   in5 = _mm_adds_epi16(in5, final_rounding);
502   in6 = _mm_adds_epi16(in6, final_rounding);
503   in7 = _mm_adds_epi16(in7, final_rounding);
504 
505   in0 = _mm_srai_epi16(in0, 5);
506   in1 = _mm_srai_epi16(in1, 5);
507   in2 = _mm_srai_epi16(in2, 5);
508   in3 = _mm_srai_epi16(in3, 5);
509   in4 = _mm_srai_epi16(in4, 5);
510   in5 = _mm_srai_epi16(in5, 5);
511   in6 = _mm_srai_epi16(in6, 5);
512   in7 = _mm_srai_epi16(in7, 5);
513 
514   RECON_AND_STORE(dest + 0 * stride, in0);
515   RECON_AND_STORE(dest + 1 * stride, in1);
516   RECON_AND_STORE(dest + 2 * stride, in2);
517   RECON_AND_STORE(dest + 3 * stride, in3);
518   RECON_AND_STORE(dest + 4 * stride, in4);
519   RECON_AND_STORE(dest + 5 * stride, in5);
520   RECON_AND_STORE(dest + 6 * stride, in6);
521   RECON_AND_STORE(dest + 7 * stride, in7);
522 }
523 
vpx_idct8x8_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)524 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
525                             int stride) {
526   __m128i dc_value;
527   const __m128i zero = _mm_setzero_si128();
528   int a;
529 
530   a = dct_const_round_shift(input[0] * cospi_16_64);
531   a = dct_const_round_shift(a * cospi_16_64);
532   a = ROUND_POWER_OF_TWO(a, 5);
533 
534   dc_value = _mm_set1_epi16(a);
535 
536   RECON_AND_STORE(dest + 0 * stride, dc_value);
537   RECON_AND_STORE(dest + 1 * stride, dc_value);
538   RECON_AND_STORE(dest + 2 * stride, dc_value);
539   RECON_AND_STORE(dest + 3 * stride, dc_value);
540   RECON_AND_STORE(dest + 4 * stride, dc_value);
541   RECON_AND_STORE(dest + 5 * stride, dc_value);
542   RECON_AND_STORE(dest + 6 * stride, dc_value);
543   RECON_AND_STORE(dest + 7 * stride, dc_value);
544 }
545 
idct8_sse2(__m128i * in)546 void idct8_sse2(__m128i *in) {
547   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
548   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
549   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
550   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
551   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
552   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
553   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
554   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
555   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
556 
557   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
558   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
559   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
560   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
561 
562   // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
563   TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
564                 in0, in1, in2, in3, in4, in5, in6, in7);
565 
566   // 4-stage 1D idct8x8
567   IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
568         in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
569 }
570 
iadst8_sse2(__m128i * in)571 void iadst8_sse2(__m128i *in) {
572   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
573   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
574   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
575   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
576   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
577   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
578   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
579   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
580   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
581   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
582   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
583   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
584   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
585   const __m128i k__const_0 = _mm_set1_epi16(0);
586   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
587 
588   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
589   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
590   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
591   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
592   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
593 
594   // transpose
595   array_transpose_8x8(in, in);
596 
597   // properly aligned for butterfly input
598   in0 = in[7];
599   in1 = in[0];
600   in2 = in[5];
601   in3 = in[2];
602   in4 = in[3];
603   in5 = in[4];
604   in6 = in[1];
605   in7 = in[6];
606 
607   // column transformation
608   // stage 1
609   // interleave and multiply/add into 32-bit integer
610   s0 = _mm_unpacklo_epi16(in0, in1);
611   s1 = _mm_unpackhi_epi16(in0, in1);
612   s2 = _mm_unpacklo_epi16(in2, in3);
613   s3 = _mm_unpackhi_epi16(in2, in3);
614   s4 = _mm_unpacklo_epi16(in4, in5);
615   s5 = _mm_unpackhi_epi16(in4, in5);
616   s6 = _mm_unpacklo_epi16(in6, in7);
617   s7 = _mm_unpackhi_epi16(in6, in7);
618 
619   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
620   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
621   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
622   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
623   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
624   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
625   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
626   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
627   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
628   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
629   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
630   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
631   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
632   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
633   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
634   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
635 
636   // addition
637   w0 = _mm_add_epi32(u0, u8);
638   w1 = _mm_add_epi32(u1, u9);
639   w2 = _mm_add_epi32(u2, u10);
640   w3 = _mm_add_epi32(u3, u11);
641   w4 = _mm_add_epi32(u4, u12);
642   w5 = _mm_add_epi32(u5, u13);
643   w6 = _mm_add_epi32(u6, u14);
644   w7 = _mm_add_epi32(u7, u15);
645   w8 = _mm_sub_epi32(u0, u8);
646   w9 = _mm_sub_epi32(u1, u9);
647   w10 = _mm_sub_epi32(u2, u10);
648   w11 = _mm_sub_epi32(u3, u11);
649   w12 = _mm_sub_epi32(u4, u12);
650   w13 = _mm_sub_epi32(u5, u13);
651   w14 = _mm_sub_epi32(u6, u14);
652   w15 = _mm_sub_epi32(u7, u15);
653 
654   // shift and rounding
655   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
656   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
657   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
658   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
659   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
660   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
661   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
662   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
663   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
664   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
665   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
666   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
667   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
668   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
669   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
670   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
671 
672   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
673   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
674   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
675   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
676   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
677   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
678   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
679   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
680   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
681   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
682   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
683   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
684   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
685   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
686   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
687   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
688 
689   // back to 16-bit and pack 8 integers into __m128i
690   in[0] = _mm_packs_epi32(u0, u1);
691   in[1] = _mm_packs_epi32(u2, u3);
692   in[2] = _mm_packs_epi32(u4, u5);
693   in[3] = _mm_packs_epi32(u6, u7);
694   in[4] = _mm_packs_epi32(u8, u9);
695   in[5] = _mm_packs_epi32(u10, u11);
696   in[6] = _mm_packs_epi32(u12, u13);
697   in[7] = _mm_packs_epi32(u14, u15);
698 
699   // stage 2
700   s0 = _mm_add_epi16(in[0], in[2]);
701   s1 = _mm_add_epi16(in[1], in[3]);
702   s2 = _mm_sub_epi16(in[0], in[2]);
703   s3 = _mm_sub_epi16(in[1], in[3]);
704   u0 = _mm_unpacklo_epi16(in[4], in[5]);
705   u1 = _mm_unpackhi_epi16(in[4], in[5]);
706   u2 = _mm_unpacklo_epi16(in[6], in[7]);
707   u3 = _mm_unpackhi_epi16(in[6], in[7]);
708 
709   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
710   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
711   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
712   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
713   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
714   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
715   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
716   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
717 
718   w0 = _mm_add_epi32(v0, v4);
719   w1 = _mm_add_epi32(v1, v5);
720   w2 = _mm_add_epi32(v2, v6);
721   w3 = _mm_add_epi32(v3, v7);
722   w4 = _mm_sub_epi32(v0, v4);
723   w5 = _mm_sub_epi32(v1, v5);
724   w6 = _mm_sub_epi32(v2, v6);
725   w7 = _mm_sub_epi32(v3, v7);
726 
727   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
728   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
729   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
730   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
731   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
732   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
733   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
734   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
735 
736   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
737   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
738   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
739   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
740   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
741   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
742   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
743   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
744 
745   // back to 16-bit intergers
746   s4 = _mm_packs_epi32(u0, u1);
747   s5 = _mm_packs_epi32(u2, u3);
748   s6 = _mm_packs_epi32(u4, u5);
749   s7 = _mm_packs_epi32(u6, u7);
750 
751   // stage 3
752   u0 = _mm_unpacklo_epi16(s2, s3);
753   u1 = _mm_unpackhi_epi16(s2, s3);
754   u2 = _mm_unpacklo_epi16(s6, s7);
755   u3 = _mm_unpackhi_epi16(s6, s7);
756 
757   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
758   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
759   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
760   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
761   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
762   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
763   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
764   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
765 
766   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
767   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
768   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
769   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
770   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
771   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
772   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
773   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
774 
775   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
776   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
777   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
778   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
779   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
780   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
781   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
782   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
783 
784   s2 = _mm_packs_epi32(v0, v1);
785   s3 = _mm_packs_epi32(v2, v3);
786   s6 = _mm_packs_epi32(v4, v5);
787   s7 = _mm_packs_epi32(v6, v7);
788 
789   in[0] = s0;
790   in[1] = _mm_sub_epi16(k__const_0, s4);
791   in[2] = s6;
792   in[3] = _mm_sub_epi16(k__const_0, s2);
793   in[4] = s3;
794   in[5] = _mm_sub_epi16(k__const_0, s7);
795   in[6] = s5;
796   in[7] = _mm_sub_epi16(k__const_0, s1);
797 }
798 
vpx_idct8x8_12_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)799 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
800                              int stride) {
801   const __m128i zero = _mm_setzero_si128();
802   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
803   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
804   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
805   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
806   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
807   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
808   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
809   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
810   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
811   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
812   const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
813 
814   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
815   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
816   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
817   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
818 
819   // Rows. Load 4-row input data.
820   in0 = load_input_data(input);
821   in1 = load_input_data(input + 8 * 1);
822   in2 = load_input_data(input + 8 * 2);
823   in3 = load_input_data(input + 8 * 3);
824 
825   // 8x4 Transpose
826   TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
827   // Stage1
828   {
829     const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
830     const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
831 
832     tmp0 = _mm_madd_epi16(lo_17, stg1_0);
833     tmp2 = _mm_madd_epi16(lo_17, stg1_1);
834     tmp4 = _mm_madd_epi16(lo_35, stg1_2);
835     tmp6 = _mm_madd_epi16(lo_35, stg1_3);
836 
837     tmp0 = _mm_add_epi32(tmp0, rounding);
838     tmp2 = _mm_add_epi32(tmp2, rounding);
839     tmp4 = _mm_add_epi32(tmp4, rounding);
840     tmp6 = _mm_add_epi32(tmp6, rounding);
841     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
842     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
843     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
844     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
845 
846     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
847     stp1_5 = _mm_packs_epi32(tmp4, tmp6);
848   }
849 
850   // Stage2
851   {
852     const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
853     const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
854 
855     tmp0 = _mm_madd_epi16(lo_04, stg2_0);
856     tmp2 = _mm_madd_epi16(lo_04, stg2_1);
857     tmp4 = _mm_madd_epi16(lo_26, stg2_2);
858     tmp6 = _mm_madd_epi16(lo_26, stg2_3);
859 
860     tmp0 = _mm_add_epi32(tmp0, rounding);
861     tmp2 = _mm_add_epi32(tmp2, rounding);
862     tmp4 = _mm_add_epi32(tmp4, rounding);
863     tmp6 = _mm_add_epi32(tmp6, rounding);
864     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
865     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
866     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
867     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
868 
869     stp2_0 = _mm_packs_epi32(tmp0, tmp2);
870     stp2_2 = _mm_packs_epi32(tmp6, tmp4);
871 
872     tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
873     tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
874 
875     stp2_4 = tmp0;
876     stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
877     stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
878   }
879 
880   // Stage3
881   {
882     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
883 
884     tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
885     tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
886 
887     stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
888     stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
889 
890     tmp0 = _mm_madd_epi16(lo_56, stg3_0);
891     tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
892 
893     tmp0 = _mm_add_epi32(tmp0, rounding);
894     tmp2 = _mm_add_epi32(tmp2, rounding);
895     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
896     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
897 
898     stp1_5 = _mm_packs_epi32(tmp0, tmp2);
899   }
900 
901   // Stage4
902   tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
903   tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
904   tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
905   tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
906 
907   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
908 
909   IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
910         in0, in1, in2, in3, in4, in5, in6, in7);
911   // Final rounding and shift
912   in0 = _mm_adds_epi16(in0, final_rounding);
913   in1 = _mm_adds_epi16(in1, final_rounding);
914   in2 = _mm_adds_epi16(in2, final_rounding);
915   in3 = _mm_adds_epi16(in3, final_rounding);
916   in4 = _mm_adds_epi16(in4, final_rounding);
917   in5 = _mm_adds_epi16(in5, final_rounding);
918   in6 = _mm_adds_epi16(in6, final_rounding);
919   in7 = _mm_adds_epi16(in7, final_rounding);
920 
921   in0 = _mm_srai_epi16(in0, 5);
922   in1 = _mm_srai_epi16(in1, 5);
923   in2 = _mm_srai_epi16(in2, 5);
924   in3 = _mm_srai_epi16(in3, 5);
925   in4 = _mm_srai_epi16(in4, 5);
926   in5 = _mm_srai_epi16(in5, 5);
927   in6 = _mm_srai_epi16(in6, 5);
928   in7 = _mm_srai_epi16(in7, 5);
929 
930   RECON_AND_STORE(dest + 0 * stride, in0);
931   RECON_AND_STORE(dest + 1 * stride, in1);
932   RECON_AND_STORE(dest + 2 * stride, in2);
933   RECON_AND_STORE(dest + 3 * stride, in3);
934   RECON_AND_STORE(dest + 4 * stride, in4);
935   RECON_AND_STORE(dest + 5 * stride, in5);
936   RECON_AND_STORE(dest + 6 * stride, in6);
937   RECON_AND_STORE(dest + 7 * stride, in7);
938 }
939 
940 #define IDCT16 \
941   /* Stage2 */ \
942   { \
943     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
944     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
945     const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
946     const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
947     const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
948     const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
949     const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
950     const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
951     \
952     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
953                            stg2_0, stg2_1, stg2_2, stg2_3, \
954                            stp2_8, stp2_15, stp2_9, stp2_14) \
955     \
956     MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
957                            stg2_4, stg2_5, stg2_6, stg2_7, \
958                            stp2_10, stp2_13, stp2_11, stp2_12) \
959   } \
960     \
961   /* Stage3 */ \
962   { \
963     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
964     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
965     const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
966     const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
967     \
968     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
969                            stg3_0, stg3_1, stg3_2, stg3_3, \
970                            stp1_4, stp1_7, stp1_5, stp1_6) \
971     \
972     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
973     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
974     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
975     stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
976     \
977     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
978     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
979     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
980     stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
981   } \
982   \
983   /* Stage4 */ \
984   { \
985     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
986     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
987     const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
988     const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
989     \
990     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
991     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
992     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
993     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
994     \
995     MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
996                            stg4_0, stg4_1, stg4_2, stg4_3, \
997                            stp2_0, stp2_1, stp2_2, stp2_3) \
998     \
999     stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1000     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1001     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1002     stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1003     \
1004     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1005                            stg4_4, stg4_5, stg4_6, stg4_7, \
1006                            stp2_9, stp2_14, stp2_10, stp2_13) \
1007   } \
1008     \
1009   /* Stage5 */ \
1010   { \
1011     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1012     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1013     \
1014     stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1015     stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1016     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1017     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1018     \
1019     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1020     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1021     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1022     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1023     \
1024     tmp0 = _mm_add_epi32(tmp0, rounding); \
1025     tmp1 = _mm_add_epi32(tmp1, rounding); \
1026     tmp2 = _mm_add_epi32(tmp2, rounding); \
1027     tmp3 = _mm_add_epi32(tmp3, rounding); \
1028     \
1029     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1030     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1031     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1032     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1033     \
1034     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1035     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1036     \
1037     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1038     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1039     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1040     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1041     \
1042     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1043     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1044     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1045     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1046   } \
1047     \
1048   /* Stage6 */ \
1049   { \
1050     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1051     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1052     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1053     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1054     \
1055     stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1056     stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1057     stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1058     stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1059     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1060     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1061     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1062     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1063     \
1064     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1065                            stg6_0, stg4_0, stg6_0, stg4_0, \
1066                            stp2_10, stp2_13, stp2_11, stp2_12) \
1067   }
1068 
1069 #define IDCT16_10 \
1070     /* Stage2 */ \
1071     { \
1072       const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1073       const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1074       const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1075       const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1076       \
1077       MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
1078                              stg2_0, stg2_1, stg2_6, stg2_7, \
1079                              stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
1080     } \
1081       \
1082     /* Stage3 */ \
1083     { \
1084       const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1085       const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1086       \
1087       MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
1088                                stg3_0, stg3_1,  \
1089                                stp2_4, stp2_7) \
1090       \
1091       stp1_9  =  stp1_8_0; \
1092       stp1_10 =  stp1_11;  \
1093       \
1094       stp1_13 = stp1_12_0; \
1095       stp1_14 = stp1_15;   \
1096     } \
1097     \
1098     /* Stage4 */ \
1099     { \
1100       const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1101       const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1102       \
1103       const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1104       const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1105       const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1106       const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1107       \
1108       MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
1109                                stg4_0, stg4_1, \
1110                                stp1_0, stp1_1) \
1111       stp2_5 = stp2_4; \
1112       stp2_6 = stp2_7; \
1113       \
1114       MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1115                              stg4_4, stg4_5, stg4_6, stg4_7, \
1116                              stp2_9, stp2_14, stp2_10, stp2_13) \
1117     } \
1118       \
1119     /* Stage5 */ \
1120     { \
1121       const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1122       const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1123       \
1124       stp1_2 = stp1_1; \
1125       stp1_3 = stp1_0; \
1126       \
1127       tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1128       tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1129       tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1130       tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1131       \
1132       tmp0 = _mm_add_epi32(tmp0, rounding); \
1133       tmp1 = _mm_add_epi32(tmp1, rounding); \
1134       tmp2 = _mm_add_epi32(tmp2, rounding); \
1135       tmp3 = _mm_add_epi32(tmp3, rounding); \
1136       \
1137       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1138       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1139       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1140       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1141       \
1142       stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1143       stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1144       \
1145       stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1146       stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1147       stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1148       stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1149       \
1150       stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1151       stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1152       stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1153       stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1154     } \
1155       \
1156     /* Stage6 */ \
1157     { \
1158       const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1159       const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1160       const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1161       const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1162       \
1163       stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1164       stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1165       stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1166       stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1167       stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1168       stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1169       stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1170       stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1171       \
1172       MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1173                              stg6_0, stg4_0, stg6_0, stg4_0, \
1174                              stp2_10, stp2_13, stp2_11, stp2_12) \
1175     }
1176 
vpx_idct16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1177 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
1178                                 int stride) {
1179   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1180   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1181   const __m128i zero = _mm_setzero_si128();
1182 
1183   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1184   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1185   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1186   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1187   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1188   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1189   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1190   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1191 
1192   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1193   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1194   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1195   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1196 
1197   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1198   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1199   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1200   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1201   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1202   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1203   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1204   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1205 
1206   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1207 
1208   __m128i in[16], l[16], r[16], *curr1;
1209   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1210           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1211           stp1_8_0, stp1_12_0;
1212   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1213           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1214   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1215   int i;
1216 
1217   curr1 = l;
1218   for (i = 0; i < 2; i++) {
1219     // 1-D idct
1220 
1221     // Load input data.
1222     in[0] = load_input_data(input);
1223     in[8] = load_input_data(input + 8 * 1);
1224     in[1] = load_input_data(input + 8 * 2);
1225     in[9] = load_input_data(input + 8 * 3);
1226     in[2] = load_input_data(input + 8 * 4);
1227     in[10] = load_input_data(input + 8 * 5);
1228     in[3] = load_input_data(input + 8 * 6);
1229     in[11] = load_input_data(input + 8 * 7);
1230     in[4] = load_input_data(input + 8 * 8);
1231     in[12] = load_input_data(input + 8 * 9);
1232     in[5] = load_input_data(input + 8 * 10);
1233     in[13] = load_input_data(input + 8 * 11);
1234     in[6] = load_input_data(input + 8 * 12);
1235     in[14] = load_input_data(input + 8 * 13);
1236     in[7] = load_input_data(input + 8 * 14);
1237     in[15] = load_input_data(input + 8 * 15);
1238 
1239     array_transpose_8x8(in, in);
1240     array_transpose_8x8(in + 8, in + 8);
1241 
1242     IDCT16
1243 
1244     // Stage7
1245     curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1246     curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1247     curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1248     curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1249     curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1250     curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1251     curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1252     curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1253     curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1254     curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1255     curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1256     curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1257     curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1258     curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1259     curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1260     curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1261 
1262     curr1 = r;
1263     input += 128;
1264   }
1265   for (i = 0; i < 2; i++) {
1266     int j;
1267     // 1-D idct
1268     array_transpose_8x8(l + i * 8, in);
1269     array_transpose_8x8(r + i * 8, in + 8);
1270 
1271     IDCT16
1272 
1273     // 2-D
1274     in[0] = _mm_add_epi16(stp2_0, stp1_15);
1275     in[1] = _mm_add_epi16(stp2_1, stp1_14);
1276     in[2] = _mm_add_epi16(stp2_2, stp2_13);
1277     in[3] = _mm_add_epi16(stp2_3, stp2_12);
1278     in[4] = _mm_add_epi16(stp2_4, stp2_11);
1279     in[5] = _mm_add_epi16(stp2_5, stp2_10);
1280     in[6] = _mm_add_epi16(stp2_6, stp1_9);
1281     in[7] = _mm_add_epi16(stp2_7, stp1_8);
1282     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1283     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1284     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1285     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1286     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1287     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1288     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1289     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1290 
1291     for (j = 0; j < 16; ++j) {
1292       // Final rounding and shift
1293       in[j] = _mm_adds_epi16(in[j], final_rounding);
1294       in[j] = _mm_srai_epi16(in[j], 6);
1295       RECON_AND_STORE(dest + j * stride, in[j]);
1296     }
1297 
1298     dest += 8;
1299   }
1300 }
1301 
vpx_idct16x16_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1302 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1303                               int stride) {
1304   __m128i dc_value;
1305   const __m128i zero = _mm_setzero_si128();
1306   int a, i;
1307 
1308   a = dct_const_round_shift(input[0] * cospi_16_64);
1309   a = dct_const_round_shift(a * cospi_16_64);
1310   a = ROUND_POWER_OF_TWO(a, 6);
1311 
1312   dc_value = _mm_set1_epi16(a);
1313 
1314   for (i = 0; i < 2; ++i) {
1315     RECON_AND_STORE(dest +  0 * stride, dc_value);
1316     RECON_AND_STORE(dest +  1 * stride, dc_value);
1317     RECON_AND_STORE(dest +  2 * stride, dc_value);
1318     RECON_AND_STORE(dest +  3 * stride, dc_value);
1319     RECON_AND_STORE(dest +  4 * stride, dc_value);
1320     RECON_AND_STORE(dest +  5 * stride, dc_value);
1321     RECON_AND_STORE(dest +  6 * stride, dc_value);
1322     RECON_AND_STORE(dest +  7 * stride, dc_value);
1323     RECON_AND_STORE(dest +  8 * stride, dc_value);
1324     RECON_AND_STORE(dest +  9 * stride, dc_value);
1325     RECON_AND_STORE(dest + 10 * stride, dc_value);
1326     RECON_AND_STORE(dest + 11 * stride, dc_value);
1327     RECON_AND_STORE(dest + 12 * stride, dc_value);
1328     RECON_AND_STORE(dest + 13 * stride, dc_value);
1329     RECON_AND_STORE(dest + 14 * stride, dc_value);
1330     RECON_AND_STORE(dest + 15 * stride, dc_value);
1331     dest += 8;
1332   }
1333 }
1334 
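// 16x16 1-D inverse ADST applied to 8 columns at a time; in[0]..in[15]
// each hold one row of eight 16-bit samples.  Every butterfly follows the
// same SSE2 recipe: interleave two rows with unpacklo/unpackhi_epi16,
// multiply-accumulate against a (cos_a, +/-cos_b) constant pair using
// _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS,
// and pack the 32-bit results back to 16 bits.  Per output lane this is,
// roughly (illustrative sketch):
//   out = ROUND_POWER_OF_TWO(x * cos_a +/- y * cos_b, DCT_CONST_BITS)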
1335 static void iadst16_8col(__m128i *in) {
1336   // perform 16x16 1-D ADST for 8 columns
1337   __m128i s[16], x[16], u[32], v[32];
1338   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1339   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1340   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1341   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1342   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1343   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1344   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1345   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1346   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1347   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1348   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1349   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1350   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1351   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1352   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1353   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1354   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1355   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1356   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1357   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1358   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1359   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1360   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1361   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1362   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1363   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1364   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1365   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1366   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1367   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1368   const __m128i kZero = _mm_set1_epi16(0);
1369 
1370   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1371   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1372   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1373   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1374   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1375   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1376   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1377   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1378   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1379   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1380   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1381   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1382   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1383   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1384   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1385   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1386 
1387   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1388   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1389   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1390   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1391   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1392   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1393   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1394   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1395   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1396   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1397   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1398   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1399   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1400   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1401   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1402   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1403   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1404   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1405   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1406   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1407   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1408   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1409   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1410   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1411   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1412   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1413   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1414   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1415   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1416   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1417   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1418   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1419 
1420   u[0] = _mm_add_epi32(v[0], v[16]);
1421   u[1] = _mm_add_epi32(v[1], v[17]);
1422   u[2] = _mm_add_epi32(v[2], v[18]);
1423   u[3] = _mm_add_epi32(v[3], v[19]);
1424   u[4] = _mm_add_epi32(v[4], v[20]);
1425   u[5] = _mm_add_epi32(v[5], v[21]);
1426   u[6] = _mm_add_epi32(v[6], v[22]);
1427   u[7] = _mm_add_epi32(v[7], v[23]);
1428   u[8] = _mm_add_epi32(v[8], v[24]);
1429   u[9] = _mm_add_epi32(v[9], v[25]);
1430   u[10] = _mm_add_epi32(v[10], v[26]);
1431   u[11] = _mm_add_epi32(v[11], v[27]);
1432   u[12] = _mm_add_epi32(v[12], v[28]);
1433   u[13] = _mm_add_epi32(v[13], v[29]);
1434   u[14] = _mm_add_epi32(v[14], v[30]);
1435   u[15] = _mm_add_epi32(v[15], v[31]);
1436   u[16] = _mm_sub_epi32(v[0], v[16]);
1437   u[17] = _mm_sub_epi32(v[1], v[17]);
1438   u[18] = _mm_sub_epi32(v[2], v[18]);
1439   u[19] = _mm_sub_epi32(v[3], v[19]);
1440   u[20] = _mm_sub_epi32(v[4], v[20]);
1441   u[21] = _mm_sub_epi32(v[5], v[21]);
1442   u[22] = _mm_sub_epi32(v[6], v[22]);
1443   u[23] = _mm_sub_epi32(v[7], v[23]);
1444   u[24] = _mm_sub_epi32(v[8], v[24]);
1445   u[25] = _mm_sub_epi32(v[9], v[25]);
1446   u[26] = _mm_sub_epi32(v[10], v[26]);
1447   u[27] = _mm_sub_epi32(v[11], v[27]);
1448   u[28] = _mm_sub_epi32(v[12], v[28]);
1449   u[29] = _mm_sub_epi32(v[13], v[29]);
1450   u[30] = _mm_sub_epi32(v[14], v[30]);
1451   u[31] = _mm_sub_epi32(v[15], v[31]);
1452 
1453   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1454   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1455   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1456   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1457   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1458   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1459   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1460   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1461   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1462   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1463   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1464   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1465   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1466   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1467   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1468   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1469   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1470   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1471   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1472   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1473   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1474   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1475   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1476   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1477   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1478   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1479   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1480   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1481   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1482   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1483   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1484   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1485 
1486   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1487   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1488   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1489   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1490   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1491   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1492   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1493   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1494   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1495   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1496   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1497   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1498   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1499   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1500   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1501   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1502   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1503   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1504   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1505   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1506   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1507   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1508   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1509   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1510   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1511   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1512   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1513   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1514   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1515   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1516   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1517   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1518 
1519   s[0] = _mm_packs_epi32(u[0], u[1]);
1520   s[1] = _mm_packs_epi32(u[2], u[3]);
1521   s[2] = _mm_packs_epi32(u[4], u[5]);
1522   s[3] = _mm_packs_epi32(u[6], u[7]);
1523   s[4] = _mm_packs_epi32(u[8], u[9]);
1524   s[5] = _mm_packs_epi32(u[10], u[11]);
1525   s[6] = _mm_packs_epi32(u[12], u[13]);
1526   s[7] = _mm_packs_epi32(u[14], u[15]);
1527   s[8] = _mm_packs_epi32(u[16], u[17]);
1528   s[9] = _mm_packs_epi32(u[18], u[19]);
1529   s[10] = _mm_packs_epi32(u[20], u[21]);
1530   s[11] = _mm_packs_epi32(u[22], u[23]);
1531   s[12] = _mm_packs_epi32(u[24], u[25]);
1532   s[13] = _mm_packs_epi32(u[26], u[27]);
1533   s[14] = _mm_packs_epi32(u[28], u[29]);
1534   s[15] = _mm_packs_epi32(u[30], u[31]);
1535 
1536   // stage 2
1537   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1538   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1539   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1540   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1541   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1542   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1543   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1544   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1545 
1546   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1547   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1548   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1549   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1550   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1551   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1552   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1553   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1554   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1555   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1556   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1557   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1558   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1559   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1560   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1561   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1562 
1563   u[0] = _mm_add_epi32(v[0], v[8]);
1564   u[1] = _mm_add_epi32(v[1], v[9]);
1565   u[2] = _mm_add_epi32(v[2], v[10]);
1566   u[3] = _mm_add_epi32(v[3], v[11]);
1567   u[4] = _mm_add_epi32(v[4], v[12]);
1568   u[5] = _mm_add_epi32(v[5], v[13]);
1569   u[6] = _mm_add_epi32(v[6], v[14]);
1570   u[7] = _mm_add_epi32(v[7], v[15]);
1571   u[8] = _mm_sub_epi32(v[0], v[8]);
1572   u[9] = _mm_sub_epi32(v[1], v[9]);
1573   u[10] = _mm_sub_epi32(v[2], v[10]);
1574   u[11] = _mm_sub_epi32(v[3], v[11]);
1575   u[12] = _mm_sub_epi32(v[4], v[12]);
1576   u[13] = _mm_sub_epi32(v[5], v[13]);
1577   u[14] = _mm_sub_epi32(v[6], v[14]);
1578   u[15] = _mm_sub_epi32(v[7], v[15]);
1579 
1580   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1581   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1582   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1583   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1584   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1585   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1586   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1587   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1588   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1589   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1590   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1591   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1592   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1593   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1594   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1595   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1596 
1597   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1598   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1599   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1600   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1601   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1602   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1603   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1604   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1605   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1606   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1607   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1608   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1609   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1610   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1611   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1612   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1613 
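  // Stage-2 outputs: x[0..7] come straight from 16-bit adds/subs of the
  // stage-1 results, while x[8..15] are repacked from the 32-bit rounded
  // butterfly outputs computed above.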
1614   x[0] = _mm_add_epi16(s[0], s[4]);
1615   x[1] = _mm_add_epi16(s[1], s[5]);
1616   x[2] = _mm_add_epi16(s[2], s[6]);
1617   x[3] = _mm_add_epi16(s[3], s[7]);
1618   x[4] = _mm_sub_epi16(s[0], s[4]);
1619   x[5] = _mm_sub_epi16(s[1], s[5]);
1620   x[6] = _mm_sub_epi16(s[2], s[6]);
1621   x[7] = _mm_sub_epi16(s[3], s[7]);
1622   x[8] = _mm_packs_epi32(u[0], u[1]);
1623   x[9] = _mm_packs_epi32(u[2], u[3]);
1624   x[10] = _mm_packs_epi32(u[4], u[5]);
1625   x[11] = _mm_packs_epi32(u[6], u[7]);
1626   x[12] = _mm_packs_epi32(u[8], u[9]);
1627   x[13] = _mm_packs_epi32(u[10], u[11]);
1628   x[14] = _mm_packs_epi32(u[12], u[13]);
1629   x[15] = _mm_packs_epi32(u[14], u[15]);
1630 
1631   // stage 3
1632   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1633   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1634   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1635   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1636   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1637   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1638   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1639   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1640 
1641   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1642   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1643   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1644   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1645   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1646   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1647   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1648   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1649   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1650   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1651   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1652   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1653   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1654   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1655   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1656   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1657 
1658   u[0] = _mm_add_epi32(v[0], v[4]);
1659   u[1] = _mm_add_epi32(v[1], v[5]);
1660   u[2] = _mm_add_epi32(v[2], v[6]);
1661   u[3] = _mm_add_epi32(v[3], v[7]);
1662   u[4] = _mm_sub_epi32(v[0], v[4]);
1663   u[5] = _mm_sub_epi32(v[1], v[5]);
1664   u[6] = _mm_sub_epi32(v[2], v[6]);
1665   u[7] = _mm_sub_epi32(v[3], v[7]);
1666   u[8] = _mm_add_epi32(v[8], v[12]);
1667   u[9] = _mm_add_epi32(v[9], v[13]);
1668   u[10] = _mm_add_epi32(v[10], v[14]);
1669   u[11] = _mm_add_epi32(v[11], v[15]);
1670   u[12] = _mm_sub_epi32(v[8], v[12]);
1671   u[13] = _mm_sub_epi32(v[9], v[13]);
1672   u[14] = _mm_sub_epi32(v[10], v[14]);
1673   u[15] = _mm_sub_epi32(v[11], v[15]);
1674 
1675   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1676   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1677   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1678   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1679   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1680   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1681   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1682   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1683   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1684   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1685   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1686   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1687   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1688   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1689   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1690   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1691 
1692   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1693   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1694   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1695   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1696   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1697   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1698   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1699   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1700   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1701   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1702   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1703   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1704   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1705   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1706   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1707   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1708 
1709   s[0] = _mm_add_epi16(x[0], x[2]);
1710   s[1] = _mm_add_epi16(x[1], x[3]);
1711   s[2] = _mm_sub_epi16(x[0], x[2]);
1712   s[3] = _mm_sub_epi16(x[1], x[3]);
1713   s[4] = _mm_packs_epi32(v[0], v[1]);
1714   s[5] = _mm_packs_epi32(v[2], v[3]);
1715   s[6] = _mm_packs_epi32(v[4], v[5]);
1716   s[7] = _mm_packs_epi32(v[6], v[7]);
1717   s[8] = _mm_add_epi16(x[8], x[10]);
1718   s[9] = _mm_add_epi16(x[9], x[11]);
1719   s[10] = _mm_sub_epi16(x[8], x[10]);
1720   s[11] = _mm_sub_epi16(x[9], x[11]);
1721   s[12] = _mm_packs_epi32(v[8], v[9]);
1722   s[13] = _mm_packs_epi32(v[10], v[11]);
1723   s[14] = _mm_packs_epi32(v[12], v[13]);
1724   s[15] = _mm_packs_epi32(v[14], v[15]);
1725 
1726   // stage 4
1727   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1728   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1729   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1730   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1731   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1732   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1733   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1734   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1735 
1736   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1737   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1738   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1739   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1740   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1741   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1742   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1743   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1744   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1745   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1746   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1747   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1748   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1749   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1750   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1751   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1752 
1753   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1754   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1755   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1756   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1757   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1758   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1759   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1760   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1761   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1762   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1763   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1764   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1765   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1766   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1767   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1768   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1769 
1770   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1771   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1772   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1773   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1774   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1775   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1776   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1777   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1778   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1779   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1780   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1781   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1782   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1783   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1784   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1785   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1786 
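  // Final stage: write the results back in natural order; outputs 1, 3,
  // 13 and 15 are negated, the sign pattern of the 16-point inverse ADST.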
1787   in[0] = s[0];
1788   in[1] = _mm_sub_epi16(kZero, s[8]);
1789   in[2] = s[12];
1790   in[3] = _mm_sub_epi16(kZero, s[4]);
1791   in[4] = _mm_packs_epi32(v[4], v[5]);
1792   in[5] = _mm_packs_epi32(v[12], v[13]);
1793   in[6] = _mm_packs_epi32(v[8], v[9]);
1794   in[7] = _mm_packs_epi32(v[0], v[1]);
1795   in[8] = _mm_packs_epi32(v[2], v[3]);
1796   in[9] = _mm_packs_epi32(v[10], v[11]);
1797   in[10] = _mm_packs_epi32(v[14], v[15]);
1798   in[11] = _mm_packs_epi32(v[6], v[7]);
1799   in[12] = s[5];
1800   in[13] = _mm_sub_epi16(kZero, s[13]);
1801   in[14] = s[9];
1802   in[15] = _mm_sub_epi16(kZero, s[1]);
1803 }
1804 
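// 16x16 1-D inverse DCT applied to 8 columns at a time, using the usual
// seven-stage flow graph: stage 1 reorders the inputs, stages 2-6 are
// rotation/butterfly pairs built from _mm_madd_epi16 on interleaved rows,
// and stage 7 forms output pair (k, 15-k) as the sum and difference of the
// corresponding stage-6 terms.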
1805 static void idct16_8col(__m128i *in) {
1806   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1807   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1808   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1809   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1810   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1811   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1812   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1813   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1814   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1815   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1816   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1817   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1818   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1819   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1820   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1821   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1822   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1823   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1824   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1825   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1826   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1827   __m128i v[16], u[16], s[16], t[16];
1828 
1829   // stage 1
1830   s[0] = in[0];
1831   s[1] = in[8];
1832   s[2] = in[4];
1833   s[3] = in[12];
1834   s[4] = in[2];
1835   s[5] = in[10];
1836   s[6] = in[6];
1837   s[7] = in[14];
1838   s[8] = in[1];
1839   s[9] = in[9];
1840   s[10] = in[5];
1841   s[11] = in[13];
1842   s[12] = in[3];
1843   s[13] = in[11];
1844   s[14] = in[7];
1845   s[15] = in[15];
1846 
1847   // stage 2
1848   u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1849   u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1850   u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1851   u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1852   u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1853   u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1854   u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1855   u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1856 
1857   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1858   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1859   v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1860   v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1861   v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1862   v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1863   v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1864   v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1865   v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1866   v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1867   v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1868   v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1869   v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1870   v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1871   v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1872   v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1873 
1874   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1875   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1876   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1877   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1878   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1879   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1880   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1881   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1882   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1883   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1884   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1885   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1886   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1887   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1888   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1889   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1890 
1891   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1892   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1893   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1894   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1895   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1896   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1897   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1898   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1899   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1900   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1901   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1902   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1903   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1904   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1905   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1906   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1907 
1908   s[8]  = _mm_packs_epi32(u[0], u[1]);
1909   s[15] = _mm_packs_epi32(u[2], u[3]);
1910   s[9]  = _mm_packs_epi32(u[4], u[5]);
1911   s[14] = _mm_packs_epi32(u[6], u[7]);
1912   s[10] = _mm_packs_epi32(u[8], u[9]);
1913   s[13] = _mm_packs_epi32(u[10], u[11]);
1914   s[11] = _mm_packs_epi32(u[12], u[13]);
1915   s[12] = _mm_packs_epi32(u[14], u[15]);
1916 
1917   // stage 3
1918   t[0] = s[0];
1919   t[1] = s[1];
1920   t[2] = s[2];
1921   t[3] = s[3];
1922   u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1923   u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1924   u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1925   u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1926 
1927   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1928   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1929   v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1930   v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1931   v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1932   v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1933   v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1934   v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1935 
1936   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1937   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1938   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1939   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1940   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1941   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1942   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1943   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1944 
1945   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1946   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1947   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1948   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1949   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1950   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1951   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1952   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1953 
1954   t[4] = _mm_packs_epi32(u[0], u[1]);
1955   t[7] = _mm_packs_epi32(u[2], u[3]);
1956   t[5] = _mm_packs_epi32(u[4], u[5]);
1957   t[6] = _mm_packs_epi32(u[6], u[7]);
1958   t[8] = _mm_add_epi16(s[8], s[9]);
1959   t[9] = _mm_sub_epi16(s[8], s[9]);
1960   t[10] = _mm_sub_epi16(s[11], s[10]);
1961   t[11] = _mm_add_epi16(s[10], s[11]);
1962   t[12] = _mm_add_epi16(s[12], s[13]);
1963   t[13] = _mm_sub_epi16(s[12], s[13]);
1964   t[14] = _mm_sub_epi16(s[15], s[14]);
1965   t[15] = _mm_add_epi16(s[14], s[15]);
1966 
1967   // stage 4
1968   u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1969   u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1970   u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1971   u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1972   u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1973   u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1974   u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1975   u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1976 
1977   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1978   v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1979   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1980   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1981   v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1982   v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1983   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1984   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1985   v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1986   v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1987   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1988   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1989   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1990   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1991   v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1992   v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1993 
1994   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1995   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1996   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1997   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1998   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1999   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2000   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2001   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2002   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2003   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2004   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2005   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2006   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2007   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2008   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2009   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2010 
2011   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2012   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2013   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2014   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2015   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2016   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2017   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2018   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2019   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2020   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2021   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2022   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2023   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2024   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2025   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2026   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2027 
2028   s[0] = _mm_packs_epi32(u[0], u[1]);
2029   s[1] = _mm_packs_epi32(u[2], u[3]);
2030   s[2] = _mm_packs_epi32(u[4], u[5]);
2031   s[3] = _mm_packs_epi32(u[6], u[7]);
2032   s[4] = _mm_add_epi16(t[4], t[5]);
2033   s[5] = _mm_sub_epi16(t[4], t[5]);
2034   s[6] = _mm_sub_epi16(t[7], t[6]);
2035   s[7] = _mm_add_epi16(t[6], t[7]);
2036   s[8] = t[8];
2037   s[15] = t[15];
2038   s[9]  = _mm_packs_epi32(u[8], u[9]);
2039   s[14] = _mm_packs_epi32(u[10], u[11]);
2040   s[10] = _mm_packs_epi32(u[12], u[13]);
2041   s[13] = _mm_packs_epi32(u[14], u[15]);
2042   s[11] = t[11];
2043   s[12] = t[12];
2044 
2045   // stage 5
2046   t[0] = _mm_add_epi16(s[0], s[3]);
2047   t[1] = _mm_add_epi16(s[1], s[2]);
2048   t[2] = _mm_sub_epi16(s[1], s[2]);
2049   t[3] = _mm_sub_epi16(s[0], s[3]);
2050   t[4] = s[4];
2051   t[7] = s[7];
2052 
2053   u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2054   u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2055   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2056   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2057   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2058   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2059   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2060   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2061   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2062   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2063   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2064   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2065   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2066   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2067   t[5] = _mm_packs_epi32(u[0], u[1]);
2068   t[6] = _mm_packs_epi32(u[2], u[3]);
2069 
2070   t[8] = _mm_add_epi16(s[8], s[11]);
2071   t[9] = _mm_add_epi16(s[9], s[10]);
2072   t[10] = _mm_sub_epi16(s[9], s[10]);
2073   t[11] = _mm_sub_epi16(s[8], s[11]);
2074   t[12] = _mm_sub_epi16(s[15], s[12]);
2075   t[13] = _mm_sub_epi16(s[14], s[13]);
2076   t[14] = _mm_add_epi16(s[13], s[14]);
2077   t[15] = _mm_add_epi16(s[12], s[15]);
2078 
2079   // stage 6
2080   s[0] = _mm_add_epi16(t[0], t[7]);
2081   s[1] = _mm_add_epi16(t[1], t[6]);
2082   s[2] = _mm_add_epi16(t[2], t[5]);
2083   s[3] = _mm_add_epi16(t[3], t[4]);
2084   s[4] = _mm_sub_epi16(t[3], t[4]);
2085   s[5] = _mm_sub_epi16(t[2], t[5]);
2086   s[6] = _mm_sub_epi16(t[1], t[6]);
2087   s[7] = _mm_sub_epi16(t[0], t[7]);
2088   s[8] = t[8];
2089   s[9] = t[9];
2090 
2091   u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2092   u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2093   u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2094   u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2095 
2096   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2097   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2098   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2099   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2100   v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2101   v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2102   v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2103   v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2104 
2105   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2106   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2107   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2108   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2109   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2110   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2111   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2112   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2113 
2114   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2115   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2116   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2117   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2118   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2119   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2120   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2121   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2122 
2123   s[10] = _mm_packs_epi32(u[0], u[1]);
2124   s[13] = _mm_packs_epi32(u[2], u[3]);
2125   s[11] = _mm_packs_epi32(u[4], u[5]);
2126   s[12] = _mm_packs_epi32(u[6], u[7]);
2127   s[14] = t[14];
2128   s[15] = t[15];
2129 
2130   // stage 7
2131   in[0] = _mm_add_epi16(s[0], s[15]);
2132   in[1] = _mm_add_epi16(s[1], s[14]);
2133   in[2] = _mm_add_epi16(s[2], s[13]);
2134   in[3] = _mm_add_epi16(s[3], s[12]);
2135   in[4] = _mm_add_epi16(s[4], s[11]);
2136   in[5] = _mm_add_epi16(s[5], s[10]);
2137   in[6] = _mm_add_epi16(s[6], s[9]);
2138   in[7] = _mm_add_epi16(s[7], s[8]);
2139   in[8] = _mm_sub_epi16(s[7], s[8]);
2140   in[9] = _mm_sub_epi16(s[6], s[9]);
2141   in[10] = _mm_sub_epi16(s[5], s[10]);
2142   in[11] = _mm_sub_epi16(s[4], s[11]);
2143   in[12] = _mm_sub_epi16(s[3], s[12]);
2144   in[13] = _mm_sub_epi16(s[2], s[13]);
2145   in[14] = _mm_sub_epi16(s[1], s[14]);
2146   in[15] = _mm_sub_epi16(s[0], s[15]);
2147 }
2148 
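// Full 16x16 1-D transforms: in0 and in1 each hold an 8x16 half of the
// block.  The 16x16 block is transposed across the two halves first, then
// the 8-column 1-D transform runs on each half independently.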
2149 void idct16_sse2(__m128i *in0, __m128i *in1) {
2150   array_transpose_16x16(in0, in1);
2151   idct16_8col(in0);
2152   idct16_8col(in1);
2153 }
2154 
2155 void iadst16_sse2(__m128i *in0, __m128i *in1) {
2156   array_transpose_16x16(in0, in1);
2157   iadst16_8col(in0);
2158   iadst16_8col(in1);
2159 }
2160 
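// 16x16 inverse DCT for sparse blocks: only the top-left 4x4 corner of the
// coefficient block is assumed nonzero (the "_10" variant, presumably
// covering eob <= 10).  The first pass loads just four coefficient rows and
// keeps only the left 8x16 intermediate; the column pass then uses the
// pruned IDCT16_10 macro.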
2161 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
2162                                int stride) {
2163   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2164   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2165   const __m128i zero = _mm_setzero_si128();
2166 
2167   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2168   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2169   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2170   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2171 
2172   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2173   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2174 
2175   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2176   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2177   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2178   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2179   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2180   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2181 
2182   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2183   __m128i in[16], l[16];
2184   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2185           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2186           stp1_8_0, stp1_12_0;
2187   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2188           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2189   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2190   int i;
2191   // First 1-D inverse DCT
2192   // Load input data.
2193   in[0] = load_input_data(input);
2194   in[1] = load_input_data(input + 8 * 2);
2195   in[2] = load_input_data(input + 8 * 4);
2196   in[3] = load_input_data(input + 8 * 6);
2197 
2198   TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2199 
2200   // Stage2
2201   {
2202     const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2203     const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2204 
2205     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2206     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2207     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2208     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2209 
2210     tmp0 = _mm_add_epi32(tmp0, rounding);
2211     tmp2 = _mm_add_epi32(tmp2, rounding);
2212     tmp5 = _mm_add_epi32(tmp5, rounding);
2213     tmp7 = _mm_add_epi32(tmp7, rounding);
2214 
2215     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2216     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2217     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2218     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2219 
2220     stp2_8  = _mm_packs_epi32(tmp0, tmp2);
2221     stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2222   }
2223 
2224   // Stage3
2225   {
2226     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2227 
2228     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2229     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2230 
2231     tmp0 = _mm_add_epi32(tmp0, rounding);
2232     tmp2 = _mm_add_epi32(tmp2, rounding);
2233     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2234     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2235 
2236     stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2237     stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2238 
2239     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2240   }
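  // In this pruned path some registers carry two intermediate values, one
  // in each 64-bit half; later stages separate them again with
  // _mm_unpacklo_epi64 / _mm_unpackhi_epi64.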
2241 
2242   // Stage4
2243   {
2244     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2245     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2246     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2247 
2248     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2249     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2250     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2251     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2252     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2253     tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2254 
2255     tmp0 = _mm_add_epi32(tmp0, rounding);
2256     tmp2 = _mm_add_epi32(tmp2, rounding);
2257     tmp1 = _mm_add_epi32(tmp1, rounding);
2258     tmp3 = _mm_add_epi32(tmp3, rounding);
2259     tmp5 = _mm_add_epi32(tmp5, rounding);
2260     tmp7 = _mm_add_epi32(tmp7, rounding);
2261 
2262     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2263     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2264     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2265     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2266     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2267     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2268 
2269     stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2270     stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2271     stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2272     stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2273 
2274     stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2275   }
2276 
2277   // Stage5 and Stage6
2278   {
2279     tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2280     tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2281     tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2282     tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2283 
2284     stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
2285     stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2286     stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
2287     stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2288 
2289     stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2290     stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2291     stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2292     stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2293   }
2294 
2295   // Stage6
2296   {
2297     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2298     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2299     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2300 
2301     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2302     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2303     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2304     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2305     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2306     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2307 
2308     tmp1 = _mm_add_epi32(tmp1, rounding);
2309     tmp3 = _mm_add_epi32(tmp3, rounding);
2310     tmp0 = _mm_add_epi32(tmp0, rounding);
2311     tmp2 = _mm_add_epi32(tmp2, rounding);
2312     tmp4 = _mm_add_epi32(tmp4, rounding);
2313     tmp6 = _mm_add_epi32(tmp6, rounding);
2314 
2315     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2316     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2317     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2318     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2319     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2320     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2321 
2322     stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2323 
2324     stp2_10 = _mm_packs_epi32(tmp0, zero);
2325     stp2_13 = _mm_packs_epi32(tmp2, zero);
2326     stp2_11 = _mm_packs_epi32(tmp4, zero);
2327     stp2_12 = _mm_packs_epi32(tmp6, zero);
2328 
2329     tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2330     tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2331     tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2332     tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2333 
2334     stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2335     stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2336     stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2337     stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2338     stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2339     stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2340     stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2341     stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2342   }
2343 
2344   // Stage7. Left 8x16 only.
2345   l[0] = _mm_add_epi16(stp2_0, stp1_15);
2346   l[1] = _mm_add_epi16(stp2_1, stp1_14);
2347   l[2] = _mm_add_epi16(stp2_2, stp2_13);
2348   l[3] = _mm_add_epi16(stp2_3, stp2_12);
2349   l[4] = _mm_add_epi16(stp2_4, stp2_11);
2350   l[5] = _mm_add_epi16(stp2_5, stp2_10);
2351   l[6] = _mm_add_epi16(stp2_6, stp1_9);
2352   l[7] = _mm_add_epi16(stp2_7, stp1_8);
2353   l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2354   l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2355   l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2356   l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2357   l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2358   l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2359   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2360   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2361 
2362   // Second 1-D inverse transform, performed per 8x16 block
2363   for (i = 0; i < 2; i++) {
2364     int j;
2365     array_transpose_4X8(l + 8 * i, in);
2366 
2367     IDCT16_10
2368 
2369     // Stage7
2370     in[0] = _mm_add_epi16(stp2_0, stp1_15);
2371     in[1] = _mm_add_epi16(stp2_1, stp1_14);
2372     in[2] = _mm_add_epi16(stp2_2, stp2_13);
2373     in[3] = _mm_add_epi16(stp2_3, stp2_12);
2374     in[4] = _mm_add_epi16(stp2_4, stp2_11);
2375     in[5] = _mm_add_epi16(stp2_5, stp2_10);
2376     in[6] = _mm_add_epi16(stp2_6, stp1_9);
2377     in[7] = _mm_add_epi16(stp2_7, stp1_8);
2378     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2379     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2380     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2381     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2382     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2383     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2384     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2385     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2386 
2387     for (j = 0; j < 16; ++j) {
2388       // Final rounding and shift
2389       in[j] = _mm_adds_epi16(in[j], final_rounding);
2390       in[j] = _mm_srai_epi16(in[j], 6);
2391       RECON_AND_STORE(dest + j * stride, in[j]);
2392     }
2393 
2394     dest += 8;
2395   }
2396 }
2397 
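// Load eight dequantized coefficients into 'reg' and advance the input
// pointer past them.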
2398 #define LOAD_DQCOEFF(reg, input) \
2399   {  \
2400     reg = load_input_data(input); \
2401     input += 8; \
2402   }
2403 
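// 32-point IDCT macro specialized for sparse blocks: only in[0]..in[7]
// carry data and every other input is treated as zero (the "_34" variant,
// presumably covering eob <= 34), so each stage computes far fewer
// butterflies than the full 32-point transform.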
2404 #define IDCT32_34 \
2405 /* Stage1 */ \
2406 { \
2407   const __m128i zero = _mm_setzero_si128();\
2408   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2409   const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2410   \
2411   const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2412   const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2413   \
2414   const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2415   const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2416   \
2417   const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2418   const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2419   \
2420   MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2421                          stg1_1, stp1_16, stp1_31); \
2422   MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2423                          stg1_7, stp1_19, stp1_28); \
2424   MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2425                          stg1_9, stp1_20, stp1_27); \
2426   MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2427                          stg1_15, stp1_23, stp1_24); \
2428 } \
2429 \
2430 /* Stage2 */ \
2431 { \
2432   const __m128i zero = _mm_setzero_si128();\
2433   const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2434   const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2435   \
2436   const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2437   const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2438   \
2439   MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2440                          stg2_1, stp2_8, stp2_15); \
2441   MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2442                          stg2_7, stp2_11, stp2_12); \
2443   \
2444   stp2_16 = stp1_16; \
2445   stp2_19 = stp1_19; \
2446   \
2447   stp2_20 = stp1_20; \
2448   stp2_23 = stp1_23; \
2449   \
2450   stp2_24 = stp1_24; \
2451   stp2_27 = stp1_27; \
2452   \
2453   stp2_28 = stp1_28; \
2454   stp2_31 = stp1_31; \
2455 } \
2456 \
2457 /* Stage3 */ \
2458 { \
2459   const __m128i zero = _mm_setzero_si128();\
2460   const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2461   const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2462   \
2463   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2464   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2465   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2466   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2467   \
2468   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2469   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2470   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2471   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
2472   \
2473   MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2474                          stg3_1, stp1_4, stp1_7); \
2475   \
2476   stp1_8 = stp2_8; \
2477   stp1_11 = stp2_11; \
2478   stp1_12 = stp2_12; \
2479   stp1_15 = stp2_15; \
2480   \
2481   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2482                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2483                          stp1_18, stp1_29) \
2484   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2485                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2486                          stp1_22, stp1_25) \
2487   \
2488   stp1_16 = stp2_16; \
2489   stp1_31 = stp2_31; \
2490   stp1_19 = stp2_19; \
2491   stp1_20 = stp2_20; \
2492   stp1_23 = stp2_23; \
2493   stp1_24 = stp2_24; \
2494   stp1_27 = stp2_27; \
2495   stp1_28 = stp2_28; \
2496 } \
2497 \
2498 /* Stage4 */ \
2499 { \
2500   const __m128i zero = _mm_setzero_si128();\
2501   const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2502   const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2503   \
2504   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2505   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2506   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2507   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2508   \
2509   MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2510                          stg4_1, stp2_0, stp2_1); \
2511   \
2512   stp2_4 = stp1_4; \
2513   stp2_5 = stp1_4; \
2514   stp2_6 = stp1_7; \
2515   stp2_7 = stp1_7; \
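  /* The other inputs to these stage-4 butterflies come from coefficients \
     that are zero in this pruned path (assumption based on the _34 \
     specialization), so the sums and differences collapse into plain \
     copies of stp1_4 and stp1_7. */ \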
2516   \
2517   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2518                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2519                          stp2_10, stp2_13) \
2520   \
2521   stp2_8 = stp1_8; \
2522   stp2_15 = stp1_15; \
2523   stp2_11 = stp1_11; \
2524   stp2_12 = stp1_12; \
2525   \
2526   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2527   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2528   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2529   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2530   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2531   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2532   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2533   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2534   \
2535   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2536   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2537   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2538   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2539   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2540   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2541   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2542   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2543 } \
2544 \
2545 /* Stage5 */ \
2546 { \
2547   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2548   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2549   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2550   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2551   \
2552   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2553   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2554   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2555   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2556   \
2557   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2558   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2559   \
2560   stp1_0 = stp2_0; \
2561   stp1_1 = stp2_1; \
2562   stp1_2 = stp2_1; \
2563   stp1_3 = stp2_0; \
2564   \
2565   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2566   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2567   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2568   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2569   \
2570   tmp0 = _mm_add_epi32(tmp0, rounding); \
2571   tmp1 = _mm_add_epi32(tmp1, rounding); \
2572   tmp2 = _mm_add_epi32(tmp2, rounding); \
2573   tmp3 = _mm_add_epi32(tmp3, rounding); \
2574   \
2575   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2576   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2577   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2578   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2579   \
2580   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2581   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2582   \
2583   stp1_4 = stp2_4; \
2584   stp1_7 = stp2_7; \
2585   \
2586   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2587   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2588   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2589   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2590   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2591   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2592   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2593   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2594   \
2595   stp1_16 = stp2_16; \
2596   stp1_17 = stp2_17; \
2597   \
2598   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2599                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2600                          stp1_19, stp1_28) \
2601   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2602                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2603                          stp1_21, stp1_26) \
2604   \
2605   stp1_22 = stp2_22; \
2606   stp1_23 = stp2_23; \
2607   stp1_24 = stp2_24; \
2608   stp1_25 = stp2_25; \
2609   stp1_30 = stp2_30; \
2610   stp1_31 = stp2_31; \
2611 } \
2612 \
2613 /* Stage6 */ \
2614 { \
2615   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2616   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2617   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2618   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2619   \
2620   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2621   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2622   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2623   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2624   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2625   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2626   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2627   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2628   \
2629   stp2_8 = stp1_8; \
2630   stp2_9 = stp1_9; \
2631   stp2_14 = stp1_14; \
2632   stp2_15 = stp1_15; \
2633   \
2634   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2635                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2636                          stp2_13, stp2_11, stp2_12) \
2637   \
2638   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2639   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2640   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2641   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2642   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2643   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2644   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2645   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2646   \
2647   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2648   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2649   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2650   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2651   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2652   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2653   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2654   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2655 } \
2656 \
2657 /* Stage7 */ \
2658 { \
2659   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2660   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2661   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2662   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2663   \
2664   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2665   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2666   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2667   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2668   \
2669   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2670   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2671   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2672   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2673   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2674   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2675   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2676   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2677   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2678   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2679   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2680   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2681   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2682   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2683   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2684   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2685   \
2686   stp1_16 = stp2_16; \
2687   stp1_17 = stp2_17; \
2688   stp1_18 = stp2_18; \
2689   stp1_19 = stp2_19; \
2690   \
2691   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2692                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2693                          stp1_21, stp1_26) \
2694   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2695                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2696                          stp1_23, stp1_24) \
2697   \
2698   stp1_28 = stp2_28; \
2699   stp1_29 = stp2_29; \
2700   stp1_30 = stp2_30; \
2701   stp1_31 = stp2_31; \
2702 }
2703 
2704 
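// IDCT32 runs eight independent 32-point 1-D inverse transforms in parallel,
// one per 16-bit lane of the in[] vectors, covering stages 1-7 of the
// transform (the final stage-8 butterfly is applied by the callers) and
// ping-ponging intermediate values between the stp1_* and stp2_* registers.
//
// A scalar sketch of one MULTIPLICATION_AND_ADD butterfly, assuming the macro
// defined earlier in this file follows the usual madd/round/shift/pack
// pattern with DCT_CONST_BITS == 14.  For the first Stage1 call below, with
// stg1_0 = (cospi_31_64, -cospi_1_64) and stg1_1 = (cospi_1_64, cospi_31_64):
//   stp1_16 = ROUND_POWER_OF_TWO(in[1] * cospi_31_64 - in[31] * cospi_1_64, 14);
//   stp1_31 = ROUND_POWER_OF_TWO(in[1] * cospi_1_64 + in[31] * cospi_31_64, 14);
// _mm_madd_epi16 evaluates these dot products for all eight lanes at once on
// the interleaved lo/hi halves produced by _mm_unpacklo/hi_epi16.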
2705 #define IDCT32 \
2706 /* Stage1 */ \
2707 { \
2708   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2709   const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2710   const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2711   const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2712   \
2713   const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2714   const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2716   const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2717   \
2718   const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2719   const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2720   const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2721   const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2722   \
2723   const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2724   const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2725   const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2726   const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2727   \
2728   MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2729                          stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2730                          stp1_17, stp1_30) \
2731   MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2732                          stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2733                          stp1_19, stp1_28) \
2734   MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2735                          stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2736                          stp1_21, stp1_26) \
2737   MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2738                          stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2739                          stp1_23, stp1_24) \
2740 } \
2741 \
2742 /* Stage2 */ \
2743 { \
2744   const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2745   const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2746   const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2747   const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2748   \
2749   const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2750   const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2751   const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2752   const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2753   \
2754   MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2755                          stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2756                          stp2_14) \
2757   MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2758                          stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2759                          stp2_11, stp2_12) \
2760   \
2761   stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2762   stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2763   stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2764   stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2765   \
2766   stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2767   stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2768   stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2769   stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2770   \
2771   stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2772   stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2773   stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2774   stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2775   \
2776   stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2777   stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2778   stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2779   stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2780 } \
2781 \
2782 /* Stage3 */ \
2783 { \
2784   const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2785   const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2786   const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2787   const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2788   \
2789   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2790   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2791   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2792   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2793   \
2794   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2795   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2796   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2797   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2798   \
2799   MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2800                          stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2801                          stp1_6) \
2802   \
2803   stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2804   stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2805   stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2806   stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2807   stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2808   stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2809   stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2810   stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2811   \
2812   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2813                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2814                          stp1_18, stp1_29) \
2815   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2816                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2817                          stp1_22, stp1_25) \
2818   \
2819   stp1_16 = stp2_16; \
2820   stp1_31 = stp2_31; \
2821   stp1_19 = stp2_19; \
2822   stp1_20 = stp2_20; \
2823   stp1_23 = stp2_23; \
2824   stp1_24 = stp2_24; \
2825   stp1_27 = stp2_27; \
2826   stp1_28 = stp2_28; \
2827 } \
2828 \
2829 /* Stage4 */ \
2830 { \
2831   const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2832   const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2833   const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2834   const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2835   \
2836   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2837   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2838   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2839   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2840   \
2841   MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2842                          stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2843                          stp2_2, stp2_3) \
2844   \
2845   stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2846   stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2847   stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2848   stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2849   \
2850   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2851                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2852                          stp2_10, stp2_13) \
2853   \
2854   stp2_8 = stp1_8; \
2855   stp2_15 = stp1_15; \
2856   stp2_11 = stp1_11; \
2857   stp2_12 = stp1_12; \
2858   \
2859   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2860   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2861   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2862   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2863   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2864   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2865   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2866   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2867   \
2868   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2869   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2870   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2871   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2872   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2873   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2874   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2875   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2876 } \
2877 \
2878 /* Stage5 */ \
2879 { \
2880   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2881   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2882   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2883   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2884   \
2885   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2886   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2887   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2888   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2889   \
2890   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2891   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2892   \
2893   stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2894   stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2895   stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2896   stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2897   \
2898   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2899   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2900   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2901   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2902   \
2903   tmp0 = _mm_add_epi32(tmp0, rounding); \
2904   tmp1 = _mm_add_epi32(tmp1, rounding); \
2905   tmp2 = _mm_add_epi32(tmp2, rounding); \
2906   tmp3 = _mm_add_epi32(tmp3, rounding); \
2907   \
2908   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2909   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2910   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2911   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2912   \
2913   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2914   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2915   \
2916   stp1_4 = stp2_4; \
2917   stp1_7 = stp2_7; \
2918   \
2919   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2920   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2921   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2922   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2923   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2924   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2925   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2926   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2927   \
2928   stp1_16 = stp2_16; \
2929   stp1_17 = stp2_17; \
2930   \
2931   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2932                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2933                          stp1_19, stp1_28) \
2934   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2935                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2936                          stp1_21, stp1_26) \
2937   \
2938   stp1_22 = stp2_22; \
2939   stp1_23 = stp2_23; \
2940   stp1_24 = stp2_24; \
2941   stp1_25 = stp2_25; \
2942   stp1_30 = stp2_30; \
2943   stp1_31 = stp2_31; \
2944 } \
2945 \
2946 /* Stage6 */ \
2947 { \
2948   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2949   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2950   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2951   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2952   \
2953   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2954   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2955   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2956   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2957   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2958   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2959   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2960   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2961   \
2962   stp2_8 = stp1_8; \
2963   stp2_9 = stp1_9; \
2964   stp2_14 = stp1_14; \
2965   stp2_15 = stp1_15; \
2966   \
2967   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2968                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2969                          stp2_13, stp2_11, stp2_12) \
2970   \
2971   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2972   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2973   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2974   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2975   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2976   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2977   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2978   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2979   \
2980   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2981   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2982   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2983   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2984   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2985   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2986   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2987   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2988 } \
2989 \
2990 /* Stage7 */ \
2991 { \
2992   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2993   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2994   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2995   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2996   \
2997   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2998   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2999   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3000   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3001   \
3002   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3003   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3004   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3005   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3006   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3007   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3008   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3009   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3010   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3011   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3012   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3013   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3014   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3015   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3016   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3017   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3018   \
3019   stp1_16 = stp2_16; \
3020   stp1_17 = stp2_17; \
3021   stp1_18 = stp2_18; \
3022   stp1_19 = stp2_19; \
3023   \
3024   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3025                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3026                          stp1_21, stp1_26) \
3027   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3028                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3029                          stp1_23, stp1_24) \
3030   \
3031   stp1_28 = stp2_28; \
3032   stp1_29 = stp2_29; \
3033   stp1_30 = stp2_30; \
3034   stp1_31 = stp2_31; \
3035 }
3036 
3037 // Only upper-left 8x8 has non-zero coeff
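// With at most 34 non-zero coefficients they all lie in the upper-left 8x8
// corner of the 32x32 block, so only the first eight rows are loaded and the
// reduced IDCT32_34 macro above can drop terms that are known to be zero.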
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
3040   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
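  // The 2-D output is shifted right by 6 before reconstruction, so adding
  // 1 << 5 gives round-to-nearest behaviour for that shift.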
3042 
3043   // idct constants for each stage
3044   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3045   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3046   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3047   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3048   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3049   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3050   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3051   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3052 
3053   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3054   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3055   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3056   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3057 
3058   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3059   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3060   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3061   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3062   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3063   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3064   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3065   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3066 
3067   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3068   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3069   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3070   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3071   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3072 
3073   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3074 
3075   __m128i in[32], col[32];
3076   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3077           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3078           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3079           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3080           stp1_30, stp1_31;
3081   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3082           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3083           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3084           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3085           stp2_30, stp2_31;
3086   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3087   int i;
3088 
3089   // Load input data. Only need to load the top left 8x8 block.
3090   in[0] = load_input_data(input);
3091   in[1] = load_input_data(input + 32);
3092   in[2] = load_input_data(input + 64);
3093   in[3] = load_input_data(input + 96);
3094   in[4] = load_input_data(input + 128);
3095   in[5] = load_input_data(input + 160);
3096   in[6] = load_input_data(input + 192);
3097   in[7] = load_input_data(input + 224);
3098 
3099   for (i = 8; i < 32; ++i) {
3100     in[i] = _mm_setzero_si128();
3101   }
3102 
3103   array_transpose_8x8(in, in);
  // TODO(hkuang): The following transposes are unnecessary, but removing them
  // leads to a performance drop on some devices.
3106   array_transpose_8x8(in + 8, in + 8);
3107   array_transpose_8x8(in + 16, in + 16);
3108   array_transpose_8x8(in + 24, in + 24);
3109 
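  // First pass (rows): after the transposes each 16-bit lane holds one of the
  // eight loaded rows, so a single IDCT32_34 invocation transforms 8 rows.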
3110   IDCT32_34
3111 
3112   // 1_D: Store 32 intermediate results for each 8x32 block.
3113   col[0] = _mm_add_epi16(stp1_0, stp1_31);
3114   col[1] = _mm_add_epi16(stp1_1, stp1_30);
3115   col[2] = _mm_add_epi16(stp1_2, stp1_29);
3116   col[3] = _mm_add_epi16(stp1_3, stp1_28);
3117   col[4] = _mm_add_epi16(stp1_4, stp1_27);
3118   col[5] = _mm_add_epi16(stp1_5, stp1_26);
3119   col[6] = _mm_add_epi16(stp1_6, stp1_25);
3120   col[7] = _mm_add_epi16(stp1_7, stp1_24);
3121   col[8] = _mm_add_epi16(stp1_8, stp1_23);
3122   col[9] = _mm_add_epi16(stp1_9, stp1_22);
3123   col[10] = _mm_add_epi16(stp1_10, stp1_21);
3124   col[11] = _mm_add_epi16(stp1_11, stp1_20);
3125   col[12] = _mm_add_epi16(stp1_12, stp1_19);
3126   col[13] = _mm_add_epi16(stp1_13, stp1_18);
3127   col[14] = _mm_add_epi16(stp1_14, stp1_17);
3128   col[15] = _mm_add_epi16(stp1_15, stp1_16);
3129   col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3130   col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3131   col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3132   col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3133   col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3134   col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3135   col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3136   col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3137   col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3138   col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3139   col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3140   col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3141   col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3142   col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3143   col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3144   col[31] = _mm_sub_epi16(stp1_0, stp1_31);
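  // Second pass (columns): each group of 8 columns is transposed back into
  // vector form, transformed again, then rounded, shifted right by 6 and
  // added to the destination with RECON_AND_STORE.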
3145   for (i = 0; i < 4; i++) {
3146     int j;
3147     const __m128i zero = _mm_setzero_si128();
3148     // Transpose 32x8 block to 8x32 block
3149     array_transpose_8x8(col + i * 8, in);
3150     IDCT32_34
3151 
3152     // 2_D: Calculate the results and store them to destination.
3153     in[0] = _mm_add_epi16(stp1_0, stp1_31);
3154     in[1] = _mm_add_epi16(stp1_1, stp1_30);
3155     in[2] = _mm_add_epi16(stp1_2, stp1_29);
3156     in[3] = _mm_add_epi16(stp1_3, stp1_28);
3157     in[4] = _mm_add_epi16(stp1_4, stp1_27);
3158     in[5] = _mm_add_epi16(stp1_5, stp1_26);
3159     in[6] = _mm_add_epi16(stp1_6, stp1_25);
3160     in[7] = _mm_add_epi16(stp1_7, stp1_24);
3161     in[8] = _mm_add_epi16(stp1_8, stp1_23);
3162     in[9] = _mm_add_epi16(stp1_9, stp1_22);
3163     in[10] = _mm_add_epi16(stp1_10, stp1_21);
3164     in[11] = _mm_add_epi16(stp1_11, stp1_20);
3165     in[12] = _mm_add_epi16(stp1_12, stp1_19);
3166     in[13] = _mm_add_epi16(stp1_13, stp1_18);
3167     in[14] = _mm_add_epi16(stp1_14, stp1_17);
3168     in[15] = _mm_add_epi16(stp1_15, stp1_16);
3169     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3170     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3171     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3172     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3173     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3174     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3175     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3176     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3177     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3178     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3179     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3180     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3181     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3182     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3183     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3184     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3185 
3186     for (j = 0; j < 32; ++j) {
3187       // Final rounding and shift
3188       in[j] = _mm_adds_epi16(in[j], final_rounding);
3189       in[j] = _mm_srai_epi16(in[j], 6);
3190       RECON_AND_STORE(dest + j * stride, in[j]);
3191     }
3192 
3193     dest += 8;
3194   }
3195 }
3196 
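// Full 32x32 inverse transform (up to 1024 non-zero coefficients). The first
// pass works on the coefficient block in four strips of eight rows; a strip
// whose coefficients are all zero is skipped entirely.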
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
3199   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3200   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3201   const __m128i zero = _mm_setzero_si128();
3202 
3203   // idct constants for each stage
3204   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3205   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3206   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3207   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3208   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3209   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3210   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3211   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3212   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3213   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3214   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3215   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3216   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3217   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3218   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3219   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3220 
3221   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3222   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3223   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3224   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3225   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3226   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3227   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3228   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3229 
3230   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3231   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3232   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3233   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3234   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3235   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3236   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3237   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3238   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3239   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3240 
3241   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3242   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3243   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3244   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3245   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3246   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3247   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3248 
3249   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3250 
3251   __m128i in[32], col[128], zero_idx[16];
3252   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3253           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3254           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3255           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3256           stp1_30, stp1_31;
3257   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3258           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3259           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3260           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3261           stp2_30, stp2_31;
3262   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3263   int i, j, i32;
3264 
3265   for (i = 0; i < 4; i++) {
3266     i32 = (i << 5);
3267     // First 1-D idct
3268     // Load input data.
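    // Each LOAD_DQCOEFF reads eight coefficients and advances `input`, so the
    // interleaved order below walks each 32-wide row left to right while
    // filling the four 8-column groups in[0..7], in[8..15], in[16..23] and
    // in[24..31] expected by the 8x8 transposes further down.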
3269     LOAD_DQCOEFF(in[0], input);
3270     LOAD_DQCOEFF(in[8], input);
3271     LOAD_DQCOEFF(in[16], input);
3272     LOAD_DQCOEFF(in[24], input);
3273     LOAD_DQCOEFF(in[1], input);
3274     LOAD_DQCOEFF(in[9], input);
3275     LOAD_DQCOEFF(in[17], input);
3276     LOAD_DQCOEFF(in[25], input);
3277     LOAD_DQCOEFF(in[2], input);
3278     LOAD_DQCOEFF(in[10], input);
3279     LOAD_DQCOEFF(in[18], input);
3280     LOAD_DQCOEFF(in[26], input);
3281     LOAD_DQCOEFF(in[3], input);
3282     LOAD_DQCOEFF(in[11], input);
3283     LOAD_DQCOEFF(in[19], input);
3284     LOAD_DQCOEFF(in[27], input);
3285 
3286     LOAD_DQCOEFF(in[4], input);
3287     LOAD_DQCOEFF(in[12], input);
3288     LOAD_DQCOEFF(in[20], input);
3289     LOAD_DQCOEFF(in[28], input);
3290     LOAD_DQCOEFF(in[5], input);
3291     LOAD_DQCOEFF(in[13], input);
3292     LOAD_DQCOEFF(in[21], input);
3293     LOAD_DQCOEFF(in[29], input);
3294     LOAD_DQCOEFF(in[6], input);
3295     LOAD_DQCOEFF(in[14], input);
3296     LOAD_DQCOEFF(in[22], input);
3297     LOAD_DQCOEFF(in[30], input);
3298     LOAD_DQCOEFF(in[7], input);
3299     LOAD_DQCOEFF(in[15], input);
3300     LOAD_DQCOEFF(in[23], input);
3301     LOAD_DQCOEFF(in[31], input);
3302 
3303     // checking if all entries are zero
3304     zero_idx[0] = _mm_or_si128(in[0], in[1]);
3305     zero_idx[1] = _mm_or_si128(in[2], in[3]);
3306     zero_idx[2] = _mm_or_si128(in[4], in[5]);
3307     zero_idx[3] = _mm_or_si128(in[6], in[7]);
3308     zero_idx[4] = _mm_or_si128(in[8], in[9]);
3309     zero_idx[5] = _mm_or_si128(in[10], in[11]);
3310     zero_idx[6] = _mm_or_si128(in[12], in[13]);
3311     zero_idx[7] = _mm_or_si128(in[14], in[15]);
3312     zero_idx[8] = _mm_or_si128(in[16], in[17]);
3313     zero_idx[9] = _mm_or_si128(in[18], in[19]);
3314     zero_idx[10] = _mm_or_si128(in[20], in[21]);
3315     zero_idx[11] = _mm_or_si128(in[22], in[23]);
3316     zero_idx[12] = _mm_or_si128(in[24], in[25]);
3317     zero_idx[13] = _mm_or_si128(in[26], in[27]);
3318     zero_idx[14] = _mm_or_si128(in[28], in[29]);
3319     zero_idx[15] = _mm_or_si128(in[30], in[31]);
3320 
3321     zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3322     zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3323     zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3324     zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3325     zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3326     zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3327     zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3328     zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3329 
3330     zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3331     zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3332     zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3333     zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3334     zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3335     zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3336     zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3337 
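    // zero_idx[14] is the bitwise OR of all 32 coefficient vectors; if every
    // byte compares equal to zero the whole strip is zero, its transform
    // output would be zero as well, and the strip can be skipped.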
3338     if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3339       col[i32 + 0] = _mm_setzero_si128();
3340       col[i32 + 1] = _mm_setzero_si128();
3341       col[i32 + 2] = _mm_setzero_si128();
3342       col[i32 + 3] = _mm_setzero_si128();
3343       col[i32 + 4] = _mm_setzero_si128();
3344       col[i32 + 5] = _mm_setzero_si128();
3345       col[i32 + 6] = _mm_setzero_si128();
3346       col[i32 + 7] = _mm_setzero_si128();
3347       col[i32 + 8] = _mm_setzero_si128();
3348       col[i32 + 9] = _mm_setzero_si128();
3349       col[i32 + 10] = _mm_setzero_si128();
3350       col[i32 + 11] = _mm_setzero_si128();
3351       col[i32 + 12] = _mm_setzero_si128();
3352       col[i32 + 13] = _mm_setzero_si128();
3353       col[i32 + 14] = _mm_setzero_si128();
3354       col[i32 + 15] = _mm_setzero_si128();
3355       col[i32 + 16] = _mm_setzero_si128();
3356       col[i32 + 17] = _mm_setzero_si128();
3357       col[i32 + 18] = _mm_setzero_si128();
3358       col[i32 + 19] = _mm_setzero_si128();
3359       col[i32 + 20] = _mm_setzero_si128();
3360       col[i32 + 21] = _mm_setzero_si128();
3361       col[i32 + 22] = _mm_setzero_si128();
3362       col[i32 + 23] = _mm_setzero_si128();
3363       col[i32 + 24] = _mm_setzero_si128();
3364       col[i32 + 25] = _mm_setzero_si128();
3365       col[i32 + 26] = _mm_setzero_si128();
3366       col[i32 + 27] = _mm_setzero_si128();
3367       col[i32 + 28] = _mm_setzero_si128();
3368       col[i32 + 29] = _mm_setzero_si128();
3369       col[i32 + 30] = _mm_setzero_si128();
3370       col[i32 + 31] = _mm_setzero_si128();
3371       continue;
3372     }
3373 
3374     // Transpose 32x8 block to 8x32 block
3375     array_transpose_8x8(in, in);
3376     array_transpose_8x8(in + 8, in + 8);
3377     array_transpose_8x8(in + 16, in + 16);
3378     array_transpose_8x8(in + 24, in + 24);
3379 
3380     IDCT32
3381 
3382     // 1_D: Store 32 intermediate results for each 8x32 block.
3383     col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3384     col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3385     col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3386     col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3387     col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3388     col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3389     col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3390     col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3391     col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3392     col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3393     col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3394     col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3395     col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3396     col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3397     col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3398     col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3399     col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3400     col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3401     col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3402     col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3403     col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3404     col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3405     col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3406     col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3407     col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3408     col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3409     col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3410     col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3411     col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3412     col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3413     col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3414     col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3415   }
3416   for (i = 0; i < 4; i++) {
3417     // Second 1-D idct
3418     j = i << 3;
3419 
3420     // Transpose 32x8 block to 8x32 block
3421     array_transpose_8x8(col + j, in);
3422     array_transpose_8x8(col + j + 32, in + 8);
3423     array_transpose_8x8(col + j + 64, in + 16);
3424     array_transpose_8x8(col + j + 96, in + 24);
3425 
3426     IDCT32
3427 
3428     // 2_D: Calculate the results and store them to destination.
3429     in[0] = _mm_add_epi16(stp1_0, stp1_31);
3430     in[1] = _mm_add_epi16(stp1_1, stp1_30);
3431     in[2] = _mm_add_epi16(stp1_2, stp1_29);
3432     in[3] = _mm_add_epi16(stp1_3, stp1_28);
3433     in[4] = _mm_add_epi16(stp1_4, stp1_27);
3434     in[5] = _mm_add_epi16(stp1_5, stp1_26);
3435     in[6] = _mm_add_epi16(stp1_6, stp1_25);
3436     in[7] = _mm_add_epi16(stp1_7, stp1_24);
3437     in[8] = _mm_add_epi16(stp1_8, stp1_23);
3438     in[9] = _mm_add_epi16(stp1_9, stp1_22);
3439     in[10] = _mm_add_epi16(stp1_10, stp1_21);
3440     in[11] = _mm_add_epi16(stp1_11, stp1_20);
3441     in[12] = _mm_add_epi16(stp1_12, stp1_19);
3442     in[13] = _mm_add_epi16(stp1_13, stp1_18);
3443     in[14] = _mm_add_epi16(stp1_14, stp1_17);
3444     in[15] = _mm_add_epi16(stp1_15, stp1_16);
3445     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3446     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3447     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3448     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3449     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3450     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3451     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3452     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3453     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3454     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3455     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3456     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3457     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3458     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3459     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3460     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3461 
3462     for (j = 0; j < 32; ++j) {
3463       // Final rounding and shift
3464       in[j] = _mm_adds_epi16(in[j], final_rounding);
3465       in[j] = _mm_srai_epi16(in[j], 6);
3466       RECON_AND_STORE(dest + j * stride, in[j]);
3467     }
3468 
3469     dest += 8;
3470   }
3471 }
3472 
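// DC-only case (eob == 1): only input[0] is non-zero, so every pixel of the
// 32x32 block receives the same offset.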
void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
3475   __m128i dc_value;
3476   const __m128i zero = _mm_setzero_si128();
3477   int a, j;
3478 
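  // Scalar sketch of the DC value, assuming DCT_CONST_BITS == 14 and
  // cospi_16_64 == 11585 (~ 2^14 / sqrt(2)) from vpx_dsp/txfm_common.h:
  //   a = ROUND_POWER_OF_TWO(input[0] * 11585, 14);  // row scaling
  //   a = ROUND_POWER_OF_TWO(a * 11585, 14);         // column scaling
  //   a = ROUND_POWER_OF_TWO(a, 6);                  // final shift
  // i.e. roughly input[0] / 128.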
3479   a = dct_const_round_shift(input[0] * cospi_16_64);
3480   a = dct_const_round_shift(a * cospi_16_64);
3481   a = ROUND_POWER_OF_TWO(a, 6);
3482 
3483   dc_value = _mm_set1_epi16(a);
3484 
3485   for (j = 0; j < 32; ++j) {
3486     RECON_AND_STORE(dest +  0 + j * stride, dc_value);
3487     RECON_AND_STORE(dest +  8 + j * stride, dc_value);
3488     RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3489     RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3490   }
3491 }
3492 
3493 #if CONFIG_VP9_HIGHBITDEPTH
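// Clamp 16-bit pixel values to the valid range [0, (1 << bd) - 1] after the
// residual has been added.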
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3495   __m128i ubounded, retval;
3496   const __m128i zero = _mm_set1_epi16(0);
3497   const __m128i one = _mm_set1_epi16(1);
3498   const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3499   ubounded = _mm_cmpgt_epi16(value, max);
3500   retval = _mm_andnot_si128(ubounded, value);
3501   ubounded = _mm_and_si128(ubounded, max);
3502   retval = _mm_or_si128(retval, ubounded);
3503   retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
3504   return retval;
3505 }
3506 
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
3509   tran_low_t out[4 * 4];
3510   tran_low_t *outptr = out;
3511   int i, j;
3512   __m128i inptr[4];
3513   __m128i sign_bits[2];
3514   __m128i temp_mm, min_input, max_input;
3515   int test;
3516   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3517   int optimised_cols = 0;
3518   const __m128i zero = _mm_set1_epi16(0);
3519   const __m128i eight = _mm_set1_epi16(8);
3520   const __m128i max = _mm_set1_epi16(12043);
3521   const __m128i min = _mm_set1_epi16(-12043);
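  // If any packed coefficient falls outside [-12043, 12043] the 16-bit SSE2
  // path could overflow, so the code below falls back to the high-bitdepth C
  // transform instead; the 8x8 and 16x16 versions use the tighter bounds
  // +/-6201 and +/-3155.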
3522   // Load input into __m128i
3523   inptr[0] = _mm_loadu_si128((const __m128i *)input);
3524   inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3525   inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3526   inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
3527 
3528   // Pack to 16 bits
3529   inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3530   inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
3531 
3532   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3533   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3534   max_input = _mm_cmpgt_epi16(max_input, max);
3535   min_input = _mm_cmplt_epi16(min_input, min);
3536   temp_mm = _mm_or_si128(max_input, min_input);
3537   test = _mm_movemask_epi8(temp_mm);
3538 
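  // Three possible paths: if both the row and the column pass stay in range,
  // everything runs through idct4_sse2(); if only the row pass is safe, its
  // result is written back to `out` and the columns fall back to
  // vpx_highbd_idct4_c(); otherwise both passes use the C transform.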
3539   if (!test) {
3540     // Do the row transform
3541     idct4_sse2(inptr);
3542 
3543     // Check the min & max values
3544     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3545     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3546     max_input = _mm_cmpgt_epi16(max_input, max);
3547     min_input = _mm_cmplt_epi16(min_input, min);
3548     temp_mm = _mm_or_si128(max_input, min_input);
3549     test = _mm_movemask_epi8(temp_mm);
3550 
3551     if (test) {
3552       transpose_4x4(inptr);
3553       sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3554       sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3555       inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3556       inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3557       inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3558       inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3559       _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3560       _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3561       _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3562       _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3563     } else {
3564       // Set to use the optimised transform for the column
3565       optimised_cols = 1;
3566     }
3567   } else {
3568     // Run the un-optimised row transform
3569     for (i = 0; i < 4; ++i) {
3570       vpx_highbd_idct4_c(input, outptr, bd);
3571       input += 4;
3572       outptr += 4;
3573     }
3574   }
3575 
3576   if (optimised_cols) {
3577     idct4_sse2(inptr);
3578 
3579     // Final round and shift
3580     inptr[0] = _mm_add_epi16(inptr[0], eight);
3581     inptr[1] = _mm_add_epi16(inptr[1], eight);
3582 
3583     inptr[0] = _mm_srai_epi16(inptr[0], 4);
3584     inptr[1] = _mm_srai_epi16(inptr[1], 4);
3585 
3586     // Reconstruction and Store
3587     {
3588       __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3589       __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3590       d0 = _mm_unpacklo_epi64(
3591           d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3592       d2 = _mm_unpacklo_epi64(
3593           d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3594       d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3595       d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
3596       // store input0
3597       _mm_storel_epi64((__m128i *)dest, d0);
3598       // store input1
3599       d0 = _mm_srli_si128(d0, 8);
3600       _mm_storel_epi64((__m128i *)(dest + stride), d0);
3601       // store input2
3602       _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3603       // store input3
3604       d2 = _mm_srli_si128(d2, 8);
3605       _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3606     }
3607   } else {
3608     // Run the un-optimised column transform
3609     tran_low_t temp_in[4], temp_out[4];
3610     // Columns
3611     for (i = 0; i < 4; ++i) {
3612       for (j = 0; j < 4; ++j)
3613         temp_in[j] = out[j * 4 + i];
3614       vpx_highbd_idct4_c(temp_in, temp_out, bd);
3615       for (j = 0; j < 4; ++j) {
3616         dest[j * stride + i] = highbd_clip_pixel_add(
3617             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3618       }
3619     }
3620   }
3621 }
3622 
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
3625   tran_low_t out[8 * 8];
3626   tran_low_t *outptr = out;
3627   int i, j, test;
3628   __m128i inptr[8];
3629   __m128i min_input, max_input, temp1, temp2, sign_bits;
3630   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3631   const __m128i zero = _mm_set1_epi16(0);
3632   const __m128i sixteen = _mm_set1_epi16(16);
3633   const __m128i max = _mm_set1_epi16(6201);
3634   const __m128i min = _mm_set1_epi16(-6201);
3635   int optimised_cols = 0;
3636 
3637   // Load input into __m128i & pack to 16 bits
3638   for (i = 0; i < 8; i++) {
3639     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3640     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3641     inptr[i] = _mm_packs_epi32(temp1, temp2);
3642   }
3643 
3644   // Find the min & max for the row transform
3645   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3646   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3647   for (i = 2; i < 8; i++) {
3648     max_input = _mm_max_epi16(max_input, inptr[i]);
3649     min_input = _mm_min_epi16(min_input, inptr[i]);
3650   }
3651   max_input = _mm_cmpgt_epi16(max_input, max);
3652   min_input = _mm_cmplt_epi16(min_input, min);
3653   temp1 = _mm_or_si128(max_input, min_input);
3654   test = _mm_movemask_epi8(temp1);
3655 
3656   if (!test) {
3657     // Do the row transform
3658     idct8_sse2(inptr);
3659 
3660     // Find the min & max for the column transform
3661     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3662     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3663     for (i = 2; i < 8; i++) {
3664       max_input = _mm_max_epi16(max_input, inptr[i]);
3665       min_input = _mm_min_epi16(min_input, inptr[i]);
3666     }
3667     max_input = _mm_cmpgt_epi16(max_input, max);
3668     min_input = _mm_cmplt_epi16(min_input, min);
3669     temp1 = _mm_or_si128(max_input, min_input);
3670     test = _mm_movemask_epi8(temp1);
3671 
3672     if (test) {
3673       array_transpose_8x8(inptr, inptr);
3674       for (i = 0; i < 8; i++) {
3675         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3676         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3677         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3678         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3679         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3680       }
3681     } else {
3682       // Set to use the optimised transform for the column
3683       optimised_cols = 1;
3684     }
3685   } else {
3686     // Run the un-optimised row transform
3687     for (i = 0; i < 8; ++i) {
3688       vpx_highbd_idct8_c(input, outptr, bd);
3689       input += 8;
3690       outptr += 8;
3691     }
3692   }
3693 
3694   if (optimised_cols) {
3695     idct8_sse2(inptr);
3696 
3697     // Final round & shift and Reconstruction and Store
3698     {
3699       __m128i d[8];
3700       for (i = 0; i < 8; i++) {
3701         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3702         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3703         inptr[i] = _mm_srai_epi16(inptr[i], 5);
3704         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3705         // Store
3706         _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3707       }
3708     }
3709   } else {
3710     // Run the un-optimised column transform
3711     tran_low_t temp_in[8], temp_out[8];
3712     for (i = 0; i < 8; ++i) {
3713       for (j = 0; j < 8; ++j)
3714         temp_in[j] = out[j * 8 + i];
3715       vpx_highbd_idct8_c(temp_in, temp_out, bd);
3716       for (j = 0; j < 8; ++j) {
3717         dest[j * stride + i] = highbd_clip_pixel_add(
3718             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3719       }
3720     }
3721   }
3722 }
3723 
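// As above, but for blocks with at most 10 non-zero coefficients: only the
// top-left 4x4 quadrant of the 8x8 coefficient block can be non-zero, so just
// four rows need the row transform.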
void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
3726   tran_low_t out[8 * 8] = { 0 };
3727   tran_low_t *outptr = out;
3728   int i, j, test;
3729   __m128i inptr[8];
3730   __m128i min_input, max_input, temp1, temp2, sign_bits;
3731   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3732   const __m128i zero = _mm_set1_epi16(0);
3733   const __m128i sixteen = _mm_set1_epi16(16);
3734   const __m128i max = _mm_set1_epi16(6201);
3735   const __m128i min = _mm_set1_epi16(-6201);
3736   int optimised_cols = 0;
3737 
3738   // Load input into __m128i & pack to 16 bits
3739   for (i = 0; i < 8; i++) {
3740     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3741     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3742     inptr[i] = _mm_packs_epi32(temp1, temp2);
3743   }
3744 
3745   // Find the min & max for the row transform
  // Only the first 4 rows have non-zero coeffs.
3747   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3748   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3749   for (i = 2; i < 4; i++) {
3750     max_input = _mm_max_epi16(max_input, inptr[i]);
3751     min_input = _mm_min_epi16(min_input, inptr[i]);
3752   }
3753   max_input = _mm_cmpgt_epi16(max_input, max);
3754   min_input = _mm_cmplt_epi16(min_input, min);
3755   temp1 = _mm_or_si128(max_input, min_input);
3756   test = _mm_movemask_epi8(temp1);
3757 
3758   if (!test) {
3759     // Do the row transform
3760     idct8_sse2(inptr);
3761 
3762     // Find the min & max for the column transform
3763     // N.B. Only first 4 cols contain non-zero coeffs
3764     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3765     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3766     for (i = 2; i < 8; i++) {
3767       max_input = _mm_max_epi16(max_input, inptr[i]);
3768       min_input = _mm_min_epi16(min_input, inptr[i]);
3769     }
3770     max_input = _mm_cmpgt_epi16(max_input, max);
3771     min_input = _mm_cmplt_epi16(min_input, min);
3772     temp1 = _mm_or_si128(max_input, min_input);
3773     test = _mm_movemask_epi8(temp1);
3774 
3775     if (test) {
3776       // Use fact only first 4 rows contain non-zero coeffs
3777       array_transpose_4X8(inptr, inptr);
3778       for (i = 0; i < 4; i++) {
3779         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3780         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3781         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3782         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3783         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3784       }
3785     } else {
3786       // Set to use the optimised transform for the column
3787       optimised_cols = 1;
3788     }
3789   } else {
3790     // Run the un-optimised row transform
3791     for (i = 0; i < 4; ++i) {
3792       vpx_highbd_idct8_c(input, outptr, bd);
3793       input += 8;
3794       outptr += 8;
3795     }
3796   }
3797 
3798   if (optimised_cols) {
3799     idct8_sse2(inptr);
3800 
3801     // Final round & shift and Reconstruction and Store
3802     {
3803       __m128i d[8];
3804       for (i = 0; i < 8; i++) {
3805         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3806         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3807         inptr[i] = _mm_srai_epi16(inptr[i], 5);
3808         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3809         // Store
3810         _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3811       }
3812     }
3813   } else {
3814     // Run the un-optimised column transform
3815     tran_low_t temp_in[8], temp_out[8];
3816     for (i = 0; i < 8; ++i) {
3817       for (j = 0; j < 8; ++j)
3818         temp_in[j] = out[j * 8 + i];
3819       vpx_highbd_idct8_c(temp_in, temp_out, bd);
3820       for (j = 0; j < 8; ++j) {
3821         dest[j * stride + i] = highbd_clip_pixel_add(
3822             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3823       }
3824     }
3825   }
3826 }
3827 
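// 16x16 version of the same dispatch: each row is held in two vectors
// (inptr[i] and inptr[i + 16]) and the overflow bounds tighten to +/-3155.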
vpx_highbd_idct16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest8,int stride,int bd)3828 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                       int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

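  // As in the other high-bitdepth variants, out-of-range coefficients force
  // the C fallback below so the 16-bit SSE2 transform is not overflowed.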
  if (!test) {
    // Do the row transform
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      array_transpose_16x16(inptr, inptr + 16);
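      // The 16-bit results are sign-extended to 32-bit tran_low_t and stored
      // to out[] so the C column transform can finish the reconstruction.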
      for (i = 0; i < 16; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final rounding and shift, then reconstruction and store
    {
      __m128i d[2];
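      // (x + 32) >> 6 implements the same ROUND_POWER_OF_TWO(, 6) rounding as
      // the C fallback; each rounded residual row is added to dest and
      // clamped to the valid pixel range for the bit depth.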
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}

void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
  // we only need to consider the first 4 rows here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform (N.B. This transposes inptr)
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    // N.B. Only the first 4 columns contain non-zero coefficients
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Use the fact that only the first 4 rows contain non-zero coefficients
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
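      // Sign-extend each 16-bit row back to 32-bit tran_low_t in out[] so the
      // C column transform below can take over.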
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final rounding and shift, then reconstruction and store
    {
      __m128i d[2];
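      // Same rounding as ROUND_POWER_OF_TWO(, 6) in the C path: add 32, then
      // arithmetic shift right by 6 before adding to dest and clamping.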
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH