/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "vpx_dsp/fwd_txfm.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

// TODO(jingning) The high bit-depth version needs re-work for performance.
// The current SSE2 implementation also cross-references static functions in
// the C implementation file.
#if DCT_HIGH_BIT_DEPTH
#define ADD_EPI16 _mm_adds_epi16
#define SUB_EPI16 _mm_subs_epi16
#if FDCT32x32_HIGH_PRECISION
void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
    int i, j;
    for (i = 0; i < 32; ++i) {
      tran_high_t temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = intermediate[j * 32 + i];
      vpx_fdct32(temp_in, temp_out, 0);
      // Add 1 (2 for negative values) before the arithmetic shift so that the
      // rounding is symmetric about zero.
      for (j = 0; j < 32; ++j)
        out[j + i * 32] =
            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
    }
}
  #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
  #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
#else
void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
    int i, j;
    for (i = 0; i < 32; ++i) {
      tran_high_t temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = intermediate[j * 32 + i];
      vpx_fdct32(temp_in, temp_out, 1);
      for (j = 0; j < 32; ++j)
        out[j + i * 32] = (tran_low_t)temp_out[j];
    }
}
  #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
  #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
#endif  // FDCT32x32_HIGH_PRECISION
#else
#define ADD_EPI16 _mm_add_epi16
#define SUB_EPI16 _mm_sub_epi16
#endif  // DCT_HIGH_BIT_DEPTH
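// Note: in high bit-depth builds ADD_EPI16/SUB_EPI16 map to the saturating
// _mm_adds_epi16/_mm_subs_epi16 so that a 16-bit overflow clamps instead of
// wrapping; the check_epi16_overflow_* calls below can then detect the
// clamped extremes and fall back to the C implementation.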


void FDCT32x32_2D(const int16_t *input,
                  tran_low_t *output_org, int stride) {
  // Calculate pre-multiplied strides
  const int str1 = stride;
  const int str2 = 2 * stride;
  const int str3 = 2 * stride + str1;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
  // Constants
  //    In one case the constant is the same in all lanes; in all the others
  //    it is a pair of values that must be repeated four times. This is done
  //    by constructing the 32-bit constant corresponding to that pair.
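  // pair_set_epi16(a, b) fills the register with the 16-bit lanes
  // a, b, a, b, ... so that _mm_madd_epi16 applied to interleaved inputs
  // (x, y) produces x * a + y * b in each 32-bit lane; this is the rotation
  // used by every butterfly below.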
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64,   cospi_24_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64,  cospi_8_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64,  cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64,  cospi_12_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64,   cospi_28_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64,  cospi_4_64);
  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64,  cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64,  cospi_18_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64,  cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64,   cospi_26_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64,  cospi_6_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64,  cospi_22_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64,  cospi_14_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64,   cospi_30_64);
  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64,  cospi_1_64);
  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64,  cospi_17_64);
  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64,  cospi_9_64);
  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64,   cospi_25_64);
  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64,  cospi_7_64);
  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64,   cospi_23_64);
  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64,  cospi_15_64);
  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64,   cospi_31_64);
  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64,  cospi_5_64);
  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64,  cospi_21_64);
  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64,  cospi_13_64);
  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64,   cospi_29_64);
  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64,  cospi_3_64);
  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64,  cospi_19_64);
  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64,  cospi_11_64);
  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64,   cospi_27_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
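  // DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1), so adding it and then
  // shifting right by DCT_CONST_BITS (the dct_const_round_shift() of the C
  // code) performs a round-to-nearest division by 2^DCT_CONST_BITS.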
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i kOne  = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  int pass;
#if DCT_HIGH_BIT_DEPTH
  int overflow;
#endif
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 32; column_start += 8) {
      __m128i step1[32];
      __m128i step2[32];
      __m128i step3[32];
      __m128i out[32];
      // Stage 1
      // Note: even though all the loads below are aligned, using the aligned
      //       intrinsic makes the code slightly slower.
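      // Stage 1 computes the 16 butterflies in[i] +/- in[31 - i]. On this
      // first pass the results are also scaled up by 4 (<< 2) so that the
      // column transform is carried out at higher precision.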
      if (0 == pass) {
        const int16_t *in  = &input[column_start];
        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
        {
          const int16_t *ina =  in +  0 * str1;
          const int16_t *inb =  in + 31 * str1;
          __m128i *step1a = &step1[ 0];
          __m128i *step1b = &step1[31];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina =  in +  4 * str1;
          const int16_t *inb =  in + 27 * str1;
          __m128i *step1a = &step1[ 4];
          __m128i *step1b = &step1[27];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina =  in +  8 * str1;
          const int16_t *inb =  in + 23 * str1;
          __m128i *step1a = &step1[ 8];
          __m128i *step1b = &step1[23];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina =  in + 12 * str1;
          const int16_t *inb =  in + 19 * str1;
          __m128i *step1a = &step1[12];
          __m128i *step1b = &step1[19];
          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
      } else {
        int16_t *in = &intermediate[column_start];
        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
        // Note: using the same approach as above to share a common offset is
        //       counter-productive here, as all offsets can be calculated at
        //       compile time.
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
        {
          __m128i in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 32));
          __m128i in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 32));
          __m128i in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 32));
          __m128i in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 32));
          __m128i in28  = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
          __m128i in29  = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
          __m128i in30  = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
          __m128i in31  = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
          step1[0] = ADD_EPI16(in00, in31);
          step1[1] = ADD_EPI16(in01, in30);
          step1[2] = ADD_EPI16(in02, in29);
          step1[3] = ADD_EPI16(in03, in28);
          step1[28] = SUB_EPI16(in03, in28);
          step1[29] = SUB_EPI16(in02, in29);
          step1[30] = SUB_EPI16(in01, in30);
          step1[31] = SUB_EPI16(in00, in31);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
                                             &step1[3], &step1[28], &step1[29],
                                             &step1[30], &step1[31]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          __m128i in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 32));
          __m128i in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 32));
          __m128i in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 32));
          __m128i in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 32));
          __m128i in24  = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
          __m128i in25  = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
          __m128i in26  = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
          __m128i in27  = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
          step1[4] = ADD_EPI16(in04, in27);
          step1[5] = ADD_EPI16(in05, in26);
          step1[6] = ADD_EPI16(in06, in25);
          step1[7] = ADD_EPI16(in07, in24);
          step1[24] = SUB_EPI16(in07, in24);
          step1[25] = SUB_EPI16(in06, in25);
          step1[26] = SUB_EPI16(in05, in26);
          step1[27] = SUB_EPI16(in04, in27);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
                                             &step1[7], &step1[24], &step1[25],
                                             &step1[26], &step1[27]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          __m128i in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 32));
          __m128i in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 32));
          __m128i in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
          __m128i in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
          __m128i in20  = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
          __m128i in21  = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
          __m128i in22  = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
          __m128i in23  = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
          step1[8] = ADD_EPI16(in08, in23);
          step1[9] = ADD_EPI16(in09, in22);
          step1[10] = ADD_EPI16(in10, in21);
          step1[11] = ADD_EPI16(in11, in20);
          step1[20] = SUB_EPI16(in11, in20);
          step1[21] = SUB_EPI16(in10, in21);
          step1[22] = SUB_EPI16(in09, in22);
          step1[23] = SUB_EPI16(in08, in23);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
                                             &step1[11], &step1[20], &step1[21],
                                             &step1[22], &step1[23]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          __m128i in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
          __m128i in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
          __m128i in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
          __m128i in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
          __m128i in16  = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
          __m128i in17  = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
          __m128i in18  = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
          __m128i in19  = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
          step1[12] = ADD_EPI16(in12, in19);
          step1[13] = ADD_EPI16(in13, in18);
          step1[14] = ADD_EPI16(in14, in17);
          step1[15] = ADD_EPI16(in15, in16);
          step1[16] = SUB_EPI16(in15, in16);
          step1[17] = SUB_EPI16(in14, in17);
          step1[18] = SUB_EPI16(in13, in18);
          step1[19] = SUB_EPI16(in12, in19);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
                                             &step1[15], &step1[16], &step1[17],
                                             &step1[18], &step1[19]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
      }
      // Stage 2
      {
        step2[0] = ADD_EPI16(step1[0], step1[15]);
        step2[1] = ADD_EPI16(step1[1], step1[14]);
        step2[2] = ADD_EPI16(step1[2], step1[13]);
        step2[3] = ADD_EPI16(step1[3], step1[12]);
        step2[4] = ADD_EPI16(step1[4], step1[11]);
        step2[5] = ADD_EPI16(step1[5], step1[10]);
        step2[6] = ADD_EPI16(step1[6], step1[ 9]);
        step2[7] = ADD_EPI16(step1[7], step1[ 8]);
        step2[8] = SUB_EPI16(step1[7], step1[ 8]);
        step2[9] = SUB_EPI16(step1[6], step1[ 9]);
        step2[10] = SUB_EPI16(step1[5], step1[10]);
        step2[11] = SUB_EPI16(step1[4], step1[11]);
        step2[12] = SUB_EPI16(step1[3], step1[12]);
        step2[13] = SUB_EPI16(step1[2], step1[13]);
        step2[14] = SUB_EPI16(step1[1], step1[14]);
        step2[15] = SUB_EPI16(step1[0], step1[15]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x16(
            &step2[0], &step2[1], &step2[2], &step2[3],
            &step2[4], &step2[5], &step2[6], &step2[7],
            &step2[8], &step2[9], &step2[10], &step2[11],
            &step2[12], &step2[13], &step2[14], &step2[15]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
        // dct_const_round_shift
        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
        // Combine
        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
                                           &step2[23], &step2[24], &step2[25],
                                           &step2[26], &step2[27]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }

#if !FDCT32x32_HIGH_PRECISION
      // Scale the magnitude down so that the intermediate values stay within
      // the range of 16 bits.
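      // _mm_cmplt_epi16 yields 0xFFFF (i.e. -1) in every negative lane, so
      // subtracting the mask adds 1 to the negative values. Together with the
      // +1 and the arithmetic >> 2 below, this computes
      // (x + 1 + (x < 0)) >> 2, the same symmetric rounding as the C row
      // transform above.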
      if (1 == pass) {
        __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
        __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
        __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
        __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
        __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
        __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
        __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
        __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);

        step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
        step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
        step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
        step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
        step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
        step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
        step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
        step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
        step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
        step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
        step2[10] = SUB_EPI16(step2[10], s3_10_0);
        step2[11] = SUB_EPI16(step2[11], s3_11_0);
        step2[12] = SUB_EPI16(step2[12], s3_12_0);
        step2[13] = SUB_EPI16(step2[13], s3_13_0);
        step2[14] = SUB_EPI16(step2[14], s2_14_0);
        step2[15] = SUB_EPI16(step2[15], s2_15_0);
        step1[16] = SUB_EPI16(step1[16], s3_16_0);
        step1[17] = SUB_EPI16(step1[17], s3_17_0);
        step1[18] = SUB_EPI16(step1[18], s3_18_0);
        step1[19] = SUB_EPI16(step1[19], s3_19_0);
        step2[20] = SUB_EPI16(step2[20], s3_20_0);
        step2[21] = SUB_EPI16(step2[21], s3_21_0);
        step2[22] = SUB_EPI16(step2[22], s3_22_0);
        step2[23] = SUB_EPI16(step2[23], s3_23_0);
        step2[24] = SUB_EPI16(step2[24], s3_24_0);
        step2[25] = SUB_EPI16(step2[25], s3_25_0);
        step2[26] = SUB_EPI16(step2[26], s3_26_0);
        step2[27] = SUB_EPI16(step2[27], s3_27_0);
        step1[28] = SUB_EPI16(step1[28], s3_28_0);
        step1[29] = SUB_EPI16(step1[29], s3_29_0);
        step1[30] = SUB_EPI16(step1[30], s3_30_0);
        step1[31] = SUB_EPI16(step1[31], s3_31_0);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x32(
            &step2[0], &step2[1], &step2[2], &step2[3],
            &step2[4], &step2[5], &step2[6], &step2[7],
            &step2[8], &step2[9], &step2[10], &step2[11],
            &step2[12], &step2[13], &step2[14], &step2[15],
            &step1[16], &step1[17], &step1[18], &step1[19],
            &step2[20], &step2[21], &step2[22], &step2[23],
            &step2[24], &step2[25], &step2[26], &step2[27],
            &step1[28], &step1[29], &step1[30], &step1[31]);
        if (overflow) {
          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        step2[0] = _mm_add_epi16(step2[ 0], kOne);
        step2[1] = _mm_add_epi16(step2[ 1], kOne);
        step2[2] = _mm_add_epi16(step2[ 2], kOne);
        step2[3] = _mm_add_epi16(step2[ 3], kOne);
        step2[4] = _mm_add_epi16(step2[ 4], kOne);
        step2[5] = _mm_add_epi16(step2[ 5], kOne);
        step2[6] = _mm_add_epi16(step2[ 6], kOne);
        step2[7] = _mm_add_epi16(step2[ 7], kOne);
        step2[8] = _mm_add_epi16(step2[ 8], kOne);
        step2[9] = _mm_add_epi16(step2[ 9], kOne);
        step2[10] = _mm_add_epi16(step2[10], kOne);
        step2[11] = _mm_add_epi16(step2[11], kOne);
        step2[12] = _mm_add_epi16(step2[12], kOne);
        step2[13] = _mm_add_epi16(step2[13], kOne);
        step2[14] = _mm_add_epi16(step2[14], kOne);
        step2[15] = _mm_add_epi16(step2[15], kOne);
        step1[16] = _mm_add_epi16(step1[16], kOne);
        step1[17] = _mm_add_epi16(step1[17], kOne);
        step1[18] = _mm_add_epi16(step1[18], kOne);
        step1[19] = _mm_add_epi16(step1[19], kOne);
        step2[20] = _mm_add_epi16(step2[20], kOne);
        step2[21] = _mm_add_epi16(step2[21], kOne);
        step2[22] = _mm_add_epi16(step2[22], kOne);
        step2[23] = _mm_add_epi16(step2[23], kOne);
        step2[24] = _mm_add_epi16(step2[24], kOne);
        step2[25] = _mm_add_epi16(step2[25], kOne);
        step2[26] = _mm_add_epi16(step2[26], kOne);
        step2[27] = _mm_add_epi16(step2[27], kOne);
        step1[28] = _mm_add_epi16(step1[28], kOne);
        step1[29] = _mm_add_epi16(step1[29], kOne);
        step1[30] = _mm_add_epi16(step1[30], kOne);
        step1[31] = _mm_add_epi16(step1[31], kOne);

        step2[0] = _mm_srai_epi16(step2[ 0], 2);
        step2[1] = _mm_srai_epi16(step2[ 1], 2);
        step2[2] = _mm_srai_epi16(step2[ 2], 2);
        step2[3] = _mm_srai_epi16(step2[ 3], 2);
        step2[4] = _mm_srai_epi16(step2[ 4], 2);
        step2[5] = _mm_srai_epi16(step2[ 5], 2);
        step2[6] = _mm_srai_epi16(step2[ 6], 2);
        step2[7] = _mm_srai_epi16(step2[ 7], 2);
        step2[8] = _mm_srai_epi16(step2[ 8], 2);
        step2[9] = _mm_srai_epi16(step2[ 9], 2);
        step2[10] = _mm_srai_epi16(step2[10], 2);
        step2[11] = _mm_srai_epi16(step2[11], 2);
        step2[12] = _mm_srai_epi16(step2[12], 2);
        step2[13] = _mm_srai_epi16(step2[13], 2);
        step2[14] = _mm_srai_epi16(step2[14], 2);
        step2[15] = _mm_srai_epi16(step2[15], 2);
        step1[16] = _mm_srai_epi16(step1[16], 2);
        step1[17] = _mm_srai_epi16(step1[17], 2);
        step1[18] = _mm_srai_epi16(step1[18], 2);
        step1[19] = _mm_srai_epi16(step1[19], 2);
        step2[20] = _mm_srai_epi16(step2[20], 2);
        step2[21] = _mm_srai_epi16(step2[21], 2);
        step2[22] = _mm_srai_epi16(step2[22], 2);
        step2[23] = _mm_srai_epi16(step2[23], 2);
        step2[24] = _mm_srai_epi16(step2[24], 2);
        step2[25] = _mm_srai_epi16(step2[25], 2);
        step2[26] = _mm_srai_epi16(step2[26], 2);
        step2[27] = _mm_srai_epi16(step2[27], 2);
        step1[28] = _mm_srai_epi16(step1[28], 2);
        step1[29] = _mm_srai_epi16(step1[29], 2);
        step1[30] = _mm_srai_epi16(step1[30], 2);
        step1[31] = _mm_srai_epi16(step1[31], 2);
      }
#endif  // !FDCT32x32_HIGH_PRECISION

#if FDCT32x32_HIGH_PRECISION
      if (pass == 0) {
#endif
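      // Note: under FDCT32x32_HIGH_PRECISION the 16-bit stages below are only
      // used on the first pass; the second pass is expected to be redone at
      // 32-bit precision later in this function.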
636       // Stage 3
637       {
638         step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
639         step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
640         step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
641         step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
642         step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
643         step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
644         step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
645         step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
646 #if DCT_HIGH_BIT_DEPTH
647         overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
648                                            &step3[3], &step3[4], &step3[5],
649                                            &step3[6], &step3[7]);
650         if (overflow) {
651           if (pass == 0)
652             HIGH_FDCT32x32_2D_C(input, output_org, stride);
653           else
654             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
655           return;
656         }
657 #endif  // DCT_HIGH_BIT_DEPTH
658       }
659       {
660         const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
661         const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
662         const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
663         const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
664         const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
665         const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
666         const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
667         const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
668         const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
669         const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
670         const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
671         const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
672         // dct_const_round_shift
673         const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
674         const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
675         const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
676         const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
677         const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
678         const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
679         const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
680         const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
681         const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
682         const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
683         const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
684         const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
685         const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
686         const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
687         const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
688         const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
689         // Combine
690         step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
691         step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
692         step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
693         step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
694 #if DCT_HIGH_BIT_DEPTH
695         overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
696                                            &step3[12], &step3[13]);
697         if (overflow) {
698           if (pass == 0)
699             HIGH_FDCT32x32_2D_C(input, output_org, stride);
700           else
701             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
702           return;
703         }
704 #endif  // DCT_HIGH_BIT_DEPTH
705       }
706       {
707         step3[16] = ADD_EPI16(step2[23], step1[16]);
708         step3[17] = ADD_EPI16(step2[22], step1[17]);
709         step3[18] = ADD_EPI16(step2[21], step1[18]);
710         step3[19] = ADD_EPI16(step2[20], step1[19]);
711         step3[20] = SUB_EPI16(step1[19], step2[20]);
712         step3[21] = SUB_EPI16(step1[18], step2[21]);
713         step3[22] = SUB_EPI16(step1[17], step2[22]);
714         step3[23] = SUB_EPI16(step1[16], step2[23]);
715         step3[24] = SUB_EPI16(step1[31], step2[24]);
716         step3[25] = SUB_EPI16(step1[30], step2[25]);
717         step3[26] = SUB_EPI16(step1[29], step2[26]);
718         step3[27] = SUB_EPI16(step1[28], step2[27]);
719         step3[28] = ADD_EPI16(step2[27], step1[28]);
720         step3[29] = ADD_EPI16(step2[26], step1[29]);
721         step3[30] = ADD_EPI16(step2[25], step1[30]);
722         step3[31] = ADD_EPI16(step2[24], step1[31]);
723 #if DCT_HIGH_BIT_DEPTH
724         overflow = check_epi16_overflow_x16(
725             &step3[16], &step3[17], &step3[18], &step3[19],
726             &step3[20], &step3[21], &step3[22], &step3[23],
727             &step3[24], &step3[25], &step3[26], &step3[27],
728             &step3[28], &step3[29], &step3[30], &step3[31]);
729         if (overflow) {
730           if (pass == 0)
731             HIGH_FDCT32x32_2D_C(input, output_org, stride);
732           else
733             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
734           return;
735         }
736 #endif  // DCT_HIGH_BIT_DEPTH
737       }
738 
739       // Stage 4
740       {
741         step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
742         step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
743         step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
744         step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
745         step1[8] = ADD_EPI16(step3[11], step2[ 8]);
746         step1[9] = ADD_EPI16(step3[10], step2[ 9]);
747         step1[10] = SUB_EPI16(step2[ 9], step3[10]);
748         step1[11] = SUB_EPI16(step2[ 8], step3[11]);
749         step1[12] = SUB_EPI16(step2[15], step3[12]);
750         step1[13] = SUB_EPI16(step2[14], step3[13]);
751         step1[14] = ADD_EPI16(step3[13], step2[14]);
752         step1[15] = ADD_EPI16(step3[12], step2[15]);
753 #if DCT_HIGH_BIT_DEPTH
754         overflow = check_epi16_overflow_x16(
755             &step1[0], &step1[1], &step1[2], &step1[3],
756             &step1[4], &step1[5], &step1[6], &step1[7],
757             &step1[8], &step1[9], &step1[10], &step1[11],
758             &step1[12], &step1[13], &step1[14], &step1[15]);
759         if (overflow) {
760           if (pass == 0)
761             HIGH_FDCT32x32_2D_C(input, output_org, stride);
762           else
763             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
764           return;
765         }
766 #endif  // DCT_HIGH_BIT_DEPTH
767       }
768       {
769         const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
770         const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
771         const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
772         const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
773         const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
774         const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
775         // dct_const_round_shift
776         const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
777         const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
778         const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
779         const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
780         const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
781         const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
782         const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
783         const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
784         // Combine
785         step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
786         step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
787 #if DCT_HIGH_BIT_DEPTH
788         overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
789         if (overflow) {
790           if (pass == 0)
791             HIGH_FDCT32x32_2D_C(input, output_org, stride);
792           else
793             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
794           return;
795         }
796 #endif  // DCT_HIGH_BIT_DEPTH
797       }
798       {
799         const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
800         const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
801         const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
802         const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
803         const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
804         const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
805         const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
806         const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
807         const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
808         const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
809         const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
810         const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
811         const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
812         const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
813         const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
814         const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
815         const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
816         const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
817         const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
818         const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
819         const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
820         const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
821         const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
822         const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
823         // dct_const_round_shift
824         const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
825         const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
826         const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
827         const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
828         const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
829         const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
830         const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
831         const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
832         const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
833         const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
834         const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
835         const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
836         const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
837         const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
838         const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
839         const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
840         const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
841         const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
842         const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
843         const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
844         const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
845         const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
846         const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
847         const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
848         const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
849         const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
850         const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
851         const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
852         const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
853         const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
854         const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
855         const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
856         // Combine
857         step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
858         step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
859         step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
860         step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
861         step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
862         step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
863         step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
864         step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
865 #if DCT_HIGH_BIT_DEPTH
866         overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
867                                            &step1[21], &step1[26], &step1[27],
868                                            &step1[28], &step1[29]);
869         if (overflow) {
870           if (pass == 0)
871             HIGH_FDCT32x32_2D_C(input, output_org, stride);
872           else
873             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
874           return;
875         }
876 #endif  // DCT_HIGH_BIT_DEPTH
877       }
878       // Stage 5
879       {
880         step2[4] = ADD_EPI16(step1[5], step3[4]);
881         step2[5] = SUB_EPI16(step3[4], step1[5]);
882         step2[6] = SUB_EPI16(step3[7], step1[6]);
883         step2[7] = ADD_EPI16(step1[6], step3[7]);
884 #if DCT_HIGH_BIT_DEPTH
885         overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
886                                            &step2[6], &step2[7]);
887         if (overflow) {
888           if (pass == 0)
889             HIGH_FDCT32x32_2D_C(input, output_org, stride);
890           else
891             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
892           return;
893         }
894 #endif  // DCT_HIGH_BIT_DEPTH
895       }
896       {
897         const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
898         const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
899         const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
900         const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
901         const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
902         const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
903         const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
904         const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
905         const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
906         const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
907         const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
908         const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
909         // dct_const_round_shift
910         const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
911         const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
912         const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
913         const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
914         const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
915         const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
916         const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
917         const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
918         const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
919         const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
920         const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
921         const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
922         const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
923         const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
924         const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
925         const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
926         // Combine
927         out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
928         out[16] = _mm_packs_epi32(out_16_6, out_16_7);
929         out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
930         out[24] = _mm_packs_epi32(out_24_6, out_24_7);
931 #if DCT_HIGH_BIT_DEPTH
932         overflow = check_epi16_overflow_x4(&out[0], &out[16],
933                                            &out[8], &out[24]);
934         if (overflow) {
935           if (pass == 0)
936             HIGH_FDCT32x32_2D_C(input, output_org, stride);
937           else
938             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
939           return;
940         }
941 #endif  // DCT_HIGH_BIT_DEPTH
942       }
943       {
944         const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
945         const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
946         const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
947         const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
948         const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
949         const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
950         const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
951         const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
952         const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
953         const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
954         const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
955         const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
956         // dct_const_round_shift
957         const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
958         const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
959         const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
960         const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
961         const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
962         const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
963         const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
964         const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
965         const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
966         const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
967         const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
968         const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
969         const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
970         const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
971         const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
972         const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
973         // Combine
974         step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
975         step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
976         step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
977         step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
978 #if DCT_HIGH_BIT_DEPTH
979         overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
980                                            &step2[13], &step2[14]);
981         if (overflow) {
982           if (pass == 0)
983             HIGH_FDCT32x32_2D_C(input, output_org, stride);
984           else
985             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
986           return;
987         }
988 #endif  // DCT_HIGH_BIT_DEPTH
989       }
990       {
        step2[16] = ADD_EPI16(step1[19], step3[16]);
        step2[17] = ADD_EPI16(step1[18], step3[17]);
        step2[18] = SUB_EPI16(step3[17], step1[18]);
        step2[19] = SUB_EPI16(step3[16], step1[19]);
        step2[20] = SUB_EPI16(step3[23], step1[20]);
        step2[21] = SUB_EPI16(step3[22], step1[21]);
        step2[22] = ADD_EPI16(step1[21], step3[22]);
        step2[23] = ADD_EPI16(step1[20], step3[23]);
        step2[24] = ADD_EPI16(step1[27], step3[24]);
        step2[25] = ADD_EPI16(step1[26], step3[25]);
        step2[26] = SUB_EPI16(step3[25], step1[26]);
        step2[27] = SUB_EPI16(step3[24], step1[27]);
        step2[28] = SUB_EPI16(step3[31], step1[28]);
        step2[29] = SUB_EPI16(step3[30], step1[29]);
        step2[30] = ADD_EPI16(step1[29], step3[30]);
        step2[31] = ADD_EPI16(step1[28], step3[31]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x16(
            &step2[16], &step2[17], &step2[18], &step2[19],
            &step2[20], &step2[21], &step2[22], &step2[23],
            &step2[24], &step2[25], &step2[26], &step2[27],
            &step2[28], &step2[29], &step2[30], &step2[31]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      // Stage 6
      {
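        // Rotate the step2[4..7] pairs by the cospi_28/cospi_4 and
        // cospi_12/cospi_20 constants to produce outputs 4, 20, 12 and 28.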
        const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
        const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
        const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
        const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
        const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
        const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
        const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
        const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
        const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
        const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
        const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
        const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
        const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
        const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
        const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
        const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
        // dct_const_round_shift
        const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
        const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
        const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
        const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
        const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
        const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
        const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
        const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
        const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
        const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
        const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
        const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
        const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
        const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
        const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
        const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
        // Combine
        out[4] = _mm_packs_epi32(out_04_6, out_04_7);
        out[20] = _mm_packs_epi32(out_20_6, out_20_7);
        out[12] = _mm_packs_epi32(out_12_6, out_12_7);
        out[28] = _mm_packs_epi32(out_28_6, out_28_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x4(&out[4], &out[20],
                                           &out[12], &out[28]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
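        // Merge the rotated step2[9], [10], [13], [14] back into step1[8..15]
        // with add/sub butterflies.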
        step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
        step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
        step3[10] = SUB_EPI16(step1[11], step2[10]);
        step3[11] = ADD_EPI16(step2[10], step1[11]);
        step3[12] = ADD_EPI16(step2[13], step1[12]);
        step3[13] = SUB_EPI16(step1[12], step2[13]);
        step3[14] = SUB_EPI16(step1[15], step2[14]);
        step3[15] = ADD_EPI16(step2[14], step1[15]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
                                           &step3[11], &step3[12], &step3[13],
                                           &step3[14], &step3[15]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
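        // Eight rotations on the step2[17..30] pairs; each unpacked pair is
        // reused for both a forward and a mirrored madd.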
        const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
        const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
        const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
        const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
        const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
        const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
        const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
        const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
        const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
        const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
        const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
        const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
        const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
        const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
        const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
        const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
        const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
        const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
        const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
        const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
        const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
        const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
        const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
        const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
        // dct_const_round_shift
        const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
        const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
        const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
        const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
        const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
        const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
        const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
        const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
        const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
        const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
        const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
        const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
        const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
        const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
        const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
        const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
        // Combine
        step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
        step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
        step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
        step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
        step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
        step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
        step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
        step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
                                           &step3[22], &step3[25], &step3[26],
                                           &step3[29], &step3[30]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      // Stage 7
      {
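        // Rotate the step3[8..15] pairs into outputs 2, 18, 10, 26, 6, 22,
        // 14 and 30.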
        const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
        const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
        const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
        const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
        const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
        const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
        const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
        const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
        const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
        const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
        const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
        const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
        const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
        const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
        const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
        const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
        const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
        const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
        const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
        const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
        const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
        const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
        const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
        const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
        // dct_const_round_shift
        const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
        const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
        const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
        const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
        const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
        const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
        const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
        const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
        const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
        const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
        const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
        const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
        const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
        const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
        const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
        const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
        const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
        const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
        const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
        const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
        const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
        const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
        const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
        const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
        const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
        const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
        const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
        const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
        const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
        const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
        const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
        const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
        // Combine
        out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
        out[18] = _mm_packs_epi32(out_18_6, out_18_7);
        out[10] = _mm_packs_epi32(out_10_6, out_10_7);
        out[26] = _mm_packs_epi32(out_26_6, out_26_7);
        out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
        out[22] = _mm_packs_epi32(out_22_6, out_22_7);
        out[14] = _mm_packs_epi32(out_14_6, out_14_7);
        out[30] = _mm_packs_epi32(out_30_6, out_30_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
                                           &out[26], &out[6], &out[22],
                                           &out[14], &out[30]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
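        // Last 16-bit butterflies: combine step2[16..31] with the rotated
        // step3 values to form step1[16..31] for the final stage.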
        step1[16] = ADD_EPI16(step3[17], step2[16]);
        step1[17] = SUB_EPI16(step2[16], step3[17]);
        step1[18] = SUB_EPI16(step2[19], step3[18]);
        step1[19] = ADD_EPI16(step3[18], step2[19]);
        step1[20] = ADD_EPI16(step3[21], step2[20]);
        step1[21] = SUB_EPI16(step2[20], step3[21]);
        step1[22] = SUB_EPI16(step2[23], step3[22]);
        step1[23] = ADD_EPI16(step3[22], step2[23]);
        step1[24] = ADD_EPI16(step3[25], step2[24]);
        step1[25] = SUB_EPI16(step2[24], step3[25]);
        step1[26] = SUB_EPI16(step2[27], step3[26]);
        step1[27] = ADD_EPI16(step3[26], step2[27]);
        step1[28] = ADD_EPI16(step3[29], step2[28]);
        step1[29] = SUB_EPI16(step2[28], step3[29]);
        step1[30] = SUB_EPI16(step2[31], step3[30]);
        step1[31] = ADD_EPI16(step3[30], step2[31]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x16(
            &step1[16], &step1[17], &step1[18], &step1[19],
            &step1[20], &step1[21], &step1[22], &step1[23],
            &step1[24], &step1[25], &step1[26], &step1[27],
            &step1[28], &step1[29], &step1[30], &step1[31]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      // Final stage: output indices are bit-reversed.
      {
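        // Rotate the step1[16..19]/step1[28..31] pairs into the odd outputs
        // 1, 17, 9, 25, 7, 23, 15 and 31.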
        const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
        const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
        const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
        const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
        const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
        const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
        const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
        const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
        const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
        const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
        const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
        const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
        const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
        const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
        const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
        const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
        const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
        const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
        const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
        const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
        const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
        const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
        const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
        const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
        // dct_const_round_shift
        const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
        const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
        const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
        const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
        const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
        const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
        const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
        const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
        const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
        const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
        const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
        const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
        const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
        const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
        const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
        const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
        const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
        const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
        const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
        const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
        const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
        const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
        const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
        const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
        const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
        const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
        const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
        const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
        const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
        const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
        const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
        const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
        // Combine
        out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
        out[17] = _mm_packs_epi32(out_17_6, out_17_7);
        out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
        out[25] = _mm_packs_epi32(out_25_6, out_25_7);
        out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
        out[23] = _mm_packs_epi32(out_23_6, out_23_7);
        out[15] = _mm_packs_epi32(out_15_6, out_15_7);
        out[31] = _mm_packs_epi32(out_31_6, out_31_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
                                           &out[25], &out[7], &out[23],
                                           &out[15], &out[31]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
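        // Same pattern for the remaining odd outputs 5, 21, 13, 29, 3, 19,
        // 11 and 27, driven by the step1[20..27] pairs.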
        const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
        const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
        const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
        const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
        const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
        const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
        const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
        const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
        const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
        const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
        const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
        const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
        const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
        const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
        const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
        const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
        const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
        const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
        const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
        const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
        const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
        const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
        const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
        const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
        // dct_const_round_shift
        const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
        const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
        const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
        const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
        const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
        const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
        const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
        const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
        const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
        const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
        const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
        const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
        const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
        const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
        const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
        const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
        const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
        const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
        const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
        const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
        const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
        const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
        const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
        const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
        const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
        const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
        const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
        const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
        const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
        const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
        const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
        const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
        // Combine
        out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
        out[21] = _mm_packs_epi32(out_21_6, out_21_7);
        out[13] = _mm_packs_epi32(out_13_6, out_13_7);
        out[29] = _mm_packs_epi32(out_29_6, out_29_7);
        out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
        out[19] = _mm_packs_epi32(out_19_6, out_19_7);
        out[11] = _mm_packs_epi32(out_11_6, out_11_7);
        out[27] = _mm_packs_epi32(out_27_6, out_27_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
                                           &out[29], &out[3], &out[19],
                                           &out[11], &out[27]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
#if FDCT32x32_HIGH_PRECISION
      } else {
        __m128i lstep1[64], lstep2[64], lstep3[64];
        __m128i u[32], v[32], sign[16];
        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
        // start using 32-bit operations
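        // Widening is done by unpacking each 16-bit vector against zero and
        // then madd-ing with kOne: madd treats the lanes as signed, so this
        // sign-extends the values into packed 32-bit doublewords.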
        // stage 3
        {
          // expand to 32 bits prior to the addition operations
          lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
          lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
          lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
          lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
          lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
          lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
          lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
          lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
          lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
          lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
          lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
          lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
          lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
          lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
          lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
          lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
          lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
          lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
          lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
          lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
          lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
          lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
          lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
          lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
          lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
          lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);

          lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
          lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
          lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
          lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
          lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
          lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
          lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
          lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
          lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
          lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
          lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
          lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
          lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
          lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
          lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
          lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
        }
        {
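          // This rotation still reads the 16-bit step2 values; only the
          // rounded results are stored as 32-bit lstep3[20..27].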
          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
        }
        {
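          // Widen step2[20..27] and step1[16..19]/step1[28..31], then run the
          // stage 3 butterflies at 32-bit precision.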
          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);

          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);

          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);

          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
        }

        // stage 4
        {
          // expand to 32 bits prior to the addition operations
          lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
          lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
          lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
          lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);

          lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
          lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
          lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
          lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
          lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
          lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
          lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
          lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
        }
        {
          // 32-bit rotation of lstep3[10..13] by cospi_16, producing
          // lstep1[10..13].
          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);

          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);

          // TODO(jingning): manually inline k_madd_epi32_ to further hide
          // instruction latency.
          v[0] = k_madd_epi32(u[0], k32_p16_m16);
          v[1] = k_madd_epi32(u[1], k32_p16_m16);
          v[2] = k_madd_epi32(u[2], k32_p16_m16);
          v[3] = k_madd_epi32(u[3], k32_p16_m16);
          v[4] = k_madd_epi32(u[0], k32_p16_p16);
          v[5] = k_madd_epi32(u[1], k32_p16_p16);
          v[6] = k_madd_epi32(u[2], k32_p16_p16);
          v[7] = k_madd_epi32(u[3], k32_p16_p16);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
                                              &v[4], &v[5], &v[6], &v[7],
                                              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);

          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);

          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
        }
        {
          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);

          u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
          u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
          u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
          u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
          u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
          u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
          u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
          u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
          u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
          u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);

          v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
          v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
          v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
          v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
          v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
          v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
          v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
          v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
          v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
          v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
          v[10] = k_madd_epi32(u[10], k32_m24_m08);
          v[11] = k_madd_epi32(u[11], k32_m24_m08);
          v[12] = k_madd_epi32(u[12], k32_m24_m08);
          v[13] = k_madd_epi32(u[13], k32_m24_m08);
          v[14] = k_madd_epi32(u[14], k32_m24_m08);
          v[15] = k_madd_epi32(u[15], k32_m24_m08);
          v[16] = k_madd_epi32(u[12], k32_m08_p24);
          v[17] = k_madd_epi32(u[13], k32_m08_p24);
          v[18] = k_madd_epi32(u[14], k32_m08_p24);
          v[19] = k_madd_epi32(u[15], k32_m08_p24);
          v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
          v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
          v[22] = k_madd_epi32(u[10], k32_m08_p24);
          v[23] = k_madd_epi32(u[11], k32_m08_p24);
          v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
          v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
          v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
          v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
          v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
          v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
          v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
          v[31] = k_madd_epi32(u[ 3], k32_p24_p08);

#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
          u[ 5] = k_packs_epi64(v[10], v[11]);
          u[ 6] = k_packs_epi64(v[12], v[13]);
          u[ 7] = k_packs_epi64(v[14], v[15]);
          u[ 8] = k_packs_epi64(v[16], v[17]);
          u[ 9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);

          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

          lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
          lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
          lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
          lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
          lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
          lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
          lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
          lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
          lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
          lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
        }
        // stage 5
        {
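          // 32-bit butterflies that fold the rotated lstep1[10..13] back into
          // lstep3[8..15], producing lstep2[8..15].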
          lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
          lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
          lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
          lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
        }
        {
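          // Final rotation for outputs 0, 16, 8 and 24: k_madd_epi32 keeps
          // the 32x32-bit products in 64 bits and k_packs_epi64 narrows the
          // sums back to 32 bits.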
1862           const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
1863           const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
1864           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1865           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1866 
1867           u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
1868           u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
1869           u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
1870           u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
1871           u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
1872           u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
1873           u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
1874           u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
1875 
1876           // TODO(jingning): manually inline k_madd_epi32_ to further hide
1877           // instruction latency.
1878           v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
1879           v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
1880           v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
1881           v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
1882           v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
1883           v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
1884           v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
1885           v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
1886           v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
1887           v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
1888           v[10] = k_madd_epi32(u[6], k32_p24_p08);
1889           v[11] = k_madd_epi32(u[7], k32_p24_p08);
1890           v[12] = k_madd_epi32(u[4], k32_m08_p24);
1891           v[13] = k_madd_epi32(u[5], k32_m08_p24);
1892           v[14] = k_madd_epi32(u[6], k32_m08_p24);
1893           v[15] = k_madd_epi32(u[7], k32_m08_p24);
1894 
1895 #if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_16(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
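          // Narrow each 64-bit product to its low 32 bits; k_packs_epi64()
          // repacks two vectors of two results into one vector of four.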
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);

          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

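          // Rounding for the final >>2 of the 2D transform: sign[] is -1 for
          // negative lanes, so the sub/add/shift sequence below computes
          // (x + 1 + (x < 0)) >> 2 per lane.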
          sign[0] = _mm_cmplt_epi32(u[0], kZero);
          sign[1] = _mm_cmplt_epi32(u[1], kZero);
          sign[2] = _mm_cmplt_epi32(u[2], kZero);
          sign[3] = _mm_cmplt_epi32(u[3], kZero);
          sign[4] = _mm_cmplt_epi32(u[4], kZero);
          sign[5] = _mm_cmplt_epi32(u[5], kZero);
          sign[6] = _mm_cmplt_epi32(u[6], kZero);
          sign[7] = _mm_cmplt_epi32(u[7], kZero);

          u[0] = _mm_sub_epi32(u[0], sign[0]);
          u[1] = _mm_sub_epi32(u[1], sign[1]);
          u[2] = _mm_sub_epi32(u[2], sign[2]);
          u[3] = _mm_sub_epi32(u[3], sign[3]);
          u[4] = _mm_sub_epi32(u[4], sign[4]);
          u[5] = _mm_sub_epi32(u[5], sign[5]);
          u[6] = _mm_sub_epi32(u[6], sign[6]);
          u[7] = _mm_sub_epi32(u[7], sign[7]);

          u[0] = _mm_add_epi32(u[0], K32One);
          u[1] = _mm_add_epi32(u[1], K32One);
          u[2] = _mm_add_epi32(u[2], K32One);
          u[3] = _mm_add_epi32(u[3], K32One);
          u[4] = _mm_add_epi32(u[4], K32One);
          u[5] = _mm_add_epi32(u[5], K32One);
          u[6] = _mm_add_epi32(u[6], K32One);
          u[7] = _mm_add_epi32(u[7], K32One);

          u[0] = _mm_srai_epi32(u[0], 2);
          u[1] = _mm_srai_epi32(u[1], 2);
          u[2] = _mm_srai_epi32(u[2], 2);
          u[3] = _mm_srai_epi32(u[3], 2);
          u[4] = _mm_srai_epi32(u[4], 2);
          u[5] = _mm_srai_epi32(u[5], 2);
          u[6] = _mm_srai_epi32(u[6], 2);
          u[7] = _mm_srai_epi32(u[7], 2);

          // Combine: pack to 16 bits and store even outputs 0, 16, 8, 24.
          out[ 0] = _mm_packs_epi32(u[0], u[1]);
          out[16] = _mm_packs_epi32(u[2], u[3]);
          out[ 8] = _mm_packs_epi32(u[4], u[5]);
          out[24] = _mm_packs_epi32(u[6], u[7]);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&out[0], &out[16],
                                             &out[8], &out[24]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);

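          // Rotate lstep1[18..21]/[26..29] by the +/-(8, 24) cosine pair; the
          // results land in lstep2[18..21] and lstep2[26..29] for stage 6.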
          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);

          v[ 0] = k_madd_epi32(u[0], k32_m08_p24);
          v[ 1] = k_madd_epi32(u[1], k32_m08_p24);
          v[ 2] = k_madd_epi32(u[2], k32_m08_p24);
          v[ 3] = k_madd_epi32(u[3], k32_m08_p24);
          v[ 4] = k_madd_epi32(u[4], k32_m24_m08);
          v[ 5] = k_madd_epi32(u[5], k32_m24_m08);
          v[ 6] = k_madd_epi32(u[6], k32_m24_m08);
          v[ 7] = k_madd_epi32(u[7], k32_m24_m08);
          v[ 8] = k_madd_epi32(u[4], k32_m08_p24);
          v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
          v[10] = k_madd_epi32(u[6], k32_m08_p24);
          v[11] = k_madd_epi32(u[7], k32_m08_p24);
          v[12] = k_madd_epi32(u[0], k32_p24_p08);
          v[13] = k_madd_epi32(u[1], k32_p24_p08);
          v[14] = k_madd_epi32(u[2], k32_p24_p08);
          v[15] = k_madd_epi32(u[3], k32_p24_p08);

#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_16(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);

          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
        }
        {
          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
        }
        // stage 6
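        // The first block below produces even outputs 4, 20, 12 and 28; the
        // following blocks are butterflies and rotations whose results feed
        // stages 7 and 8.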
        {
          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);

          u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
          u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
          u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
          u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
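          // Note: u[8..11] repeat u[4..7] and u[12..15] repeat u[0..3]; the
          // same unpacked inputs are consumed twice with different constants.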
          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
          u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
          u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
          u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
          u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);

          v[ 0] = k_madd_epi32(u[ 0], k32_p28_p04);
          v[ 1] = k_madd_epi32(u[ 1], k32_p28_p04);
          v[ 2] = k_madd_epi32(u[ 2], k32_p28_p04);
          v[ 3] = k_madd_epi32(u[ 3], k32_p28_p04);
          v[ 4] = k_madd_epi32(u[ 4], k32_p12_p20);
          v[ 5] = k_madd_epi32(u[ 5], k32_p12_p20);
          v[ 6] = k_madd_epi32(u[ 6], k32_p12_p20);
          v[ 7] = k_madd_epi32(u[ 7], k32_p12_p20);
          v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
          v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
          v[10] = k_madd_epi32(u[10], k32_m20_p12);
          v[11] = k_madd_epi32(u[11], k32_m20_p12);
          v[12] = k_madd_epi32(u[12], k32_m04_p28);
          v[13] = k_madd_epi32(u[13], k32_m04_p28);
          v[14] = k_madd_epi32(u[14], k32_m04_p28);
          v[15] = k_madd_epi32(u[15], k32_m04_p28);

#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_16(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);

          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

          sign[0] = _mm_cmplt_epi32(u[0], kZero);
          sign[1] = _mm_cmplt_epi32(u[1], kZero);
          sign[2] = _mm_cmplt_epi32(u[2], kZero);
          sign[3] = _mm_cmplt_epi32(u[3], kZero);
          sign[4] = _mm_cmplt_epi32(u[4], kZero);
          sign[5] = _mm_cmplt_epi32(u[5], kZero);
          sign[6] = _mm_cmplt_epi32(u[6], kZero);
          sign[7] = _mm_cmplt_epi32(u[7], kZero);

          u[0] = _mm_sub_epi32(u[0], sign[0]);
          u[1] = _mm_sub_epi32(u[1], sign[1]);
          u[2] = _mm_sub_epi32(u[2], sign[2]);
          u[3] = _mm_sub_epi32(u[3], sign[3]);
          u[4] = _mm_sub_epi32(u[4], sign[4]);
          u[5] = _mm_sub_epi32(u[5], sign[5]);
          u[6] = _mm_sub_epi32(u[6], sign[6]);
          u[7] = _mm_sub_epi32(u[7], sign[7]);

          u[0] = _mm_add_epi32(u[0], K32One);
          u[1] = _mm_add_epi32(u[1], K32One);
          u[2] = _mm_add_epi32(u[2], K32One);
          u[3] = _mm_add_epi32(u[3], K32One);
          u[4] = _mm_add_epi32(u[4], K32One);
          u[5] = _mm_add_epi32(u[5], K32One);
          u[6] = _mm_add_epi32(u[6], K32One);
          u[7] = _mm_add_epi32(u[7], K32One);

          u[0] = _mm_srai_epi32(u[0], 2);
          u[1] = _mm_srai_epi32(u[1], 2);
          u[2] = _mm_srai_epi32(u[2], 2);
          u[3] = _mm_srai_epi32(u[3], 2);
          u[4] = _mm_srai_epi32(u[4], 2);
          u[5] = _mm_srai_epi32(u[5], 2);
          u[6] = _mm_srai_epi32(u[6], 2);
          u[7] = _mm_srai_epi32(u[7], 2);

          out[ 4] = _mm_packs_epi32(u[0], u[1]);
          out[20] = _mm_packs_epi32(u[2], u[3]);
          out[12] = _mm_packs_epi32(u[4], u[5]);
          out[28] = _mm_packs_epi32(u[6], u[7]);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&out[4], &out[20],
                                             &out[12], &out[28]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
        }
        {
          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
          const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
                                                     -cospi_20_64);
          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);

          u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
          u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
          u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
          u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
          u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
          u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
          u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
          u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
          u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
          u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);

          v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);
          v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
          v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
          v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
          v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);
          v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
          v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
          v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
          v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
          v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
          v[10] = k_madd_epi32(u[10], k32_m20_p12);
          v[11] = k_madd_epi32(u[11], k32_m20_p12);
          v[12] = k_madd_epi32(u[12], k32_m12_m20);
          v[13] = k_madd_epi32(u[13], k32_m12_m20);
          v[14] = k_madd_epi32(u[14], k32_m12_m20);
          v[15] = k_madd_epi32(u[15], k32_m12_m20);
          v[16] = k_madd_epi32(u[12], k32_m20_p12);
          v[17] = k_madd_epi32(u[13], k32_m20_p12);
          v[18] = k_madd_epi32(u[14], k32_m20_p12);
          v[19] = k_madd_epi32(u[15], k32_m20_p12);
          v[20] = k_madd_epi32(u[ 8], k32_p12_p20);
          v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
          v[22] = k_madd_epi32(u[10], k32_p12_p20);
          v[23] = k_madd_epi32(u[11], k32_p12_p20);
          v[24] = k_madd_epi32(u[ 4], k32_m04_p28);
          v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
          v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
          v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
          v[28] = k_madd_epi32(u[ 0], k32_p28_p04);
          v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
          v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
          v[31] = k_madd_epi32(u[ 3], k32_p28_p04);

#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
          u[ 5] = k_packs_epi64(v[10], v[11]);
          u[ 6] = k_packs_epi64(v[12], v[13]);
          u[ 7] = k_packs_epi64(v[14], v[15]);
          u[ 8] = k_packs_epi64(v[16], v[17]);
          u[ 9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);

          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

          lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
          lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
          lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
          lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
          lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
          lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
          lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
          lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
          lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
          lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
        }
        // stage 7
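        // The first block below produces outputs 2, 18, 10, 26, 6, 22, 14
        // and 30; the second is the butterfly feeding the stage 8 rotations.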
        {
          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64,  cospi_26_64);
          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);

          u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
          u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
          u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
          u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
          u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
          u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
          u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
          u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
          u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
          u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);

          v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);
          v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
          v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
          v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
          v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);
          v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
          v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
          v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
          v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);
          v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
          v[10] = k_madd_epi32(u[10], k32_p22_p10);
          v[11] = k_madd_epi32(u[11], k32_p22_p10);
          v[12] = k_madd_epi32(u[12], k32_p06_p26);
          v[13] = k_madd_epi32(u[13], k32_p06_p26);
          v[14] = k_madd_epi32(u[14], k32_p06_p26);
          v[15] = k_madd_epi32(u[15], k32_p06_p26);
          v[16] = k_madd_epi32(u[12], k32_m26_p06);
          v[17] = k_madd_epi32(u[13], k32_m26_p06);
          v[18] = k_madd_epi32(u[14], k32_m26_p06);
          v[19] = k_madd_epi32(u[15], k32_m26_p06);
          v[20] = k_madd_epi32(u[ 8], k32_m10_p22);
          v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
          v[22] = k_madd_epi32(u[10], k32_m10_p22);
          v[23] = k_madd_epi32(u[11], k32_m10_p22);
          v[24] = k_madd_epi32(u[ 4], k32_m18_p14);
          v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
          v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
          v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
          v[28] = k_madd_epi32(u[ 0], k32_m02_p30);
          v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
          v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
          v[31] = k_madd_epi32(u[ 3], k32_m02_p30);

#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
          u[ 5] = k_packs_epi64(v[10], v[11]);
          u[ 6] = k_packs_epi64(v[12], v[13]);
          u[ 7] = k_packs_epi64(v[14], v[15]);
          u[ 8] = k_packs_epi64(v[16], v[17]);
          u[ 9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);

          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
          v[10] = _mm_cmplt_epi32(u[10], kZero);
          v[11] = _mm_cmplt_epi32(u[11], kZero);
          v[12] = _mm_cmplt_epi32(u[12], kZero);
          v[13] = _mm_cmplt_epi32(u[13], kZero);
          v[14] = _mm_cmplt_epi32(u[14], kZero);
          v[15] = _mm_cmplt_epi32(u[15], kZero);

          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
          u[10] = _mm_sub_epi32(u[10], v[10]);
          u[11] = _mm_sub_epi32(u[11], v[11]);
          u[12] = _mm_sub_epi32(u[12], v[12]);
          u[13] = _mm_sub_epi32(u[13], v[13]);
          u[14] = _mm_sub_epi32(u[14], v[14]);
          u[15] = _mm_sub_epi32(u[15], v[15]);

          v[ 0] = _mm_add_epi32(u[ 0], K32One);
          v[ 1] = _mm_add_epi32(u[ 1], K32One);
          v[ 2] = _mm_add_epi32(u[ 2], K32One);
          v[ 3] = _mm_add_epi32(u[ 3], K32One);
          v[ 4] = _mm_add_epi32(u[ 4], K32One);
          v[ 5] = _mm_add_epi32(u[ 5], K32One);
          v[ 6] = _mm_add_epi32(u[ 6], K32One);
          v[ 7] = _mm_add_epi32(u[ 7], K32One);
          v[ 8] = _mm_add_epi32(u[ 8], K32One);
          v[ 9] = _mm_add_epi32(u[ 9], K32One);
          v[10] = _mm_add_epi32(u[10], K32One);
          v[11] = _mm_add_epi32(u[11], K32One);
          v[12] = _mm_add_epi32(u[12], K32One);
          v[13] = _mm_add_epi32(u[13], K32One);
          v[14] = _mm_add_epi32(u[14], K32One);
          v[15] = _mm_add_epi32(u[15], K32One);

          u[ 0] = _mm_srai_epi32(v[ 0], 2);
          u[ 1] = _mm_srai_epi32(v[ 1], 2);
          u[ 2] = _mm_srai_epi32(v[ 2], 2);
          u[ 3] = _mm_srai_epi32(v[ 3], 2);
          u[ 4] = _mm_srai_epi32(v[ 4], 2);
          u[ 5] = _mm_srai_epi32(v[ 5], 2);
          u[ 6] = _mm_srai_epi32(v[ 6], 2);
          u[ 7] = _mm_srai_epi32(v[ 7], 2);
          u[ 8] = _mm_srai_epi32(v[ 8], 2);
          u[ 9] = _mm_srai_epi32(v[ 9], 2);
          u[10] = _mm_srai_epi32(v[10], 2);
          u[11] = _mm_srai_epi32(v[11], 2);
          u[12] = _mm_srai_epi32(v[12], 2);
          u[13] = _mm_srai_epi32(v[13], 2);
          u[14] = _mm_srai_epi32(v[14], 2);
          u[15] = _mm_srai_epi32(v[15], 2);

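          // Pack to 16 bits: odd outputs 2, 18, 10, 26, 6, 22, 14, 30.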
          out[ 2] = _mm_packs_epi32(u[0], u[1]);
          out[18] = _mm_packs_epi32(u[2], u[3]);
          out[10] = _mm_packs_epi32(u[4], u[5]);
          out[26] = _mm_packs_epi32(u[6], u[7]);
          out[ 6] = _mm_packs_epi32(u[8], u[9]);
          out[22] = _mm_packs_epi32(u[10], u[11]);
          out[14] = _mm_packs_epi32(u[12], u[13]);
          out[30] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
                                             &out[26], &out[6], &out[22],
                                             &out[14], &out[30]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
        }
        // stage 8
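        // The 16 odd-indexed outputs: each is a two-term rotation of the
        // stage 7 intermediates.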
        {
          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);

          u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
          u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
          u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
          u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
          u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
          u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
          u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
          u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
          u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
          u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);

          v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
          v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
          v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
          v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
          v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
          v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
          v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
          v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
          v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
          v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
          v[10] = k_madd_epi32(u[10], k32_p23_p09);
          v[11] = k_madd_epi32(u[11], k32_p23_p09);
          v[12] = k_madd_epi32(u[12], k32_p07_p25);
          v[13] = k_madd_epi32(u[13], k32_p07_p25);
          v[14] = k_madd_epi32(u[14], k32_p07_p25);
          v[15] = k_madd_epi32(u[15], k32_p07_p25);
          v[16] = k_madd_epi32(u[12], k32_m25_p07);
          v[17] = k_madd_epi32(u[13], k32_m25_p07);
          v[18] = k_madd_epi32(u[14], k32_m25_p07);
          v[19] = k_madd_epi32(u[15], k32_m25_p07);
          v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
          v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
          v[22] = k_madd_epi32(u[10], k32_m09_p23);
          v[23] = k_madd_epi32(u[11], k32_m09_p23);
          v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
          v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
          v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
          v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
          v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
          v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
          v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
          v[31] = k_madd_epi32(u[ 3], k32_m01_p31);

#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
              &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
          u[ 5] = k_packs_epi64(v[10], v[11]);
          u[ 6] = k_packs_epi64(v[12], v[13]);
          u[ 7] = k_packs_epi64(v[14], v[15]);
          u[ 8] = k_packs_epi64(v[16], v[17]);
          u[ 9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);

          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
          v[10] = _mm_cmplt_epi32(u[10], kZero);
          v[11] = _mm_cmplt_epi32(u[11], kZero);
          v[12] = _mm_cmplt_epi32(u[12], kZero);
          v[13] = _mm_cmplt_epi32(u[13], kZero);
          v[14] = _mm_cmplt_epi32(u[14], kZero);
          v[15] = _mm_cmplt_epi32(u[15], kZero);

          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
          u[10] = _mm_sub_epi32(u[10], v[10]);
          u[11] = _mm_sub_epi32(u[11], v[11]);
          u[12] = _mm_sub_epi32(u[12], v[12]);
          u[13] = _mm_sub_epi32(u[13], v[13]);
          u[14] = _mm_sub_epi32(u[14], v[14]);
          u[15] = _mm_sub_epi32(u[15], v[15]);

          v[0] = _mm_add_epi32(u[0], K32One);
          v[1] = _mm_add_epi32(u[1], K32One);
          v[2] = _mm_add_epi32(u[2], K32One);
          v[3] = _mm_add_epi32(u[3], K32One);
          v[4] = _mm_add_epi32(u[4], K32One);
          v[5] = _mm_add_epi32(u[5], K32One);
          v[6] = _mm_add_epi32(u[6], K32One);
          v[7] = _mm_add_epi32(u[7], K32One);
          v[8] = _mm_add_epi32(u[8], K32One);
          v[9] = _mm_add_epi32(u[9], K32One);
          v[10] = _mm_add_epi32(u[10], K32One);
          v[11] = _mm_add_epi32(u[11], K32One);
          v[12] = _mm_add_epi32(u[12], K32One);
          v[13] = _mm_add_epi32(u[13], K32One);
          v[14] = _mm_add_epi32(u[14], K32One);
          v[15] = _mm_add_epi32(u[15], K32One);

          u[0] = _mm_srai_epi32(v[0], 2);
          u[1] = _mm_srai_epi32(v[1], 2);
          u[2] = _mm_srai_epi32(v[2], 2);
          u[3] = _mm_srai_epi32(v[3], 2);
          u[4] = _mm_srai_epi32(v[4], 2);
          u[5] = _mm_srai_epi32(v[5], 2);
          u[6] = _mm_srai_epi32(v[6], 2);
          u[7] = _mm_srai_epi32(v[7], 2);
          u[8] = _mm_srai_epi32(v[8], 2);
          u[9] = _mm_srai_epi32(v[9], 2);
          u[10] = _mm_srai_epi32(v[10], 2);
          u[11] = _mm_srai_epi32(v[11], 2);
          u[12] = _mm_srai_epi32(v[12], 2);
          u[13] = _mm_srai_epi32(v[13], 2);
          u[14] = _mm_srai_epi32(v[14], 2);
          u[15] = _mm_srai_epi32(v[15], 2);

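          // Pack to 16 bits: odd outputs 1, 17, 9, 25, 7, 23, 15, 31.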
2787           out[ 1] = _mm_packs_epi32(u[0], u[1]);
2788           out[17] = _mm_packs_epi32(u[2], u[3]);
2789           out[ 9] = _mm_packs_epi32(u[4], u[5]);
2790           out[25] = _mm_packs_epi32(u[6], u[7]);
2791           out[ 7] = _mm_packs_epi32(u[8], u[9]);
2792           out[23] = _mm_packs_epi32(u[10], u[11]);
2793           out[15] = _mm_packs_epi32(u[12], u[13]);
2794           out[31] = _mm_packs_epi32(u[14], u[15]);
2795 #if DCT_HIGH_BIT_DEPTH
2796           overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
2797                                              &out[25], &out[7], &out[23],
2798                                              &out[15], &out[31]);
2799           if (overflow) {
2800             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2801             return;
2802           }
2803 #endif  // DCT_HIGH_BIT_DEPTH
2804         }
2805         {
2806           const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
2807           const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
2808           const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
2809           const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
2810           const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
2811           const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
2812           const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
2813           const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
2814 
2815           u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
2816           u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
2817           u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
2818           u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
2819           u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
2820           u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
2821           u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
2822           u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
2823           u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
2824           u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
2825           u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
2826           u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
2827           u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
2828           u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
2829           u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
2830           u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
2831 
2832           v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
2833           v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
2834           v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
2835           v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
2836           v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
2837           v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
2838           v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
2839           v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
2840           v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
2841           v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
2842           v[10] = k_madd_epi32(u[10], k32_p19_p13);
2843           v[11] = k_madd_epi32(u[11], k32_p19_p13);
2844           v[12] = k_madd_epi32(u[12], k32_p03_p29);
2845           v[13] = k_madd_epi32(u[13], k32_p03_p29);
2846           v[14] = k_madd_epi32(u[14], k32_p03_p29);
2847           v[15] = k_madd_epi32(u[15], k32_p03_p29);
2848           v[16] = k_madd_epi32(u[12], k32_m29_p03);
2849           v[17] = k_madd_epi32(u[13], k32_m29_p03);
2850           v[18] = k_madd_epi32(u[14], k32_m29_p03);
2851           v[19] = k_madd_epi32(u[15], k32_m29_p03);
2852           v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
2853           v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
2854           v[22] = k_madd_epi32(u[10], k32_m13_p19);
2855           v[23] = k_madd_epi32(u[11], k32_m13_p19);
2856           v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
2857           v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
2858           v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
2859           v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
2860           v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
2861           v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
2862           v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
2863           v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
2864 
2865 #if DCT_HIGH_BIT_DEPTH
2866           overflow = k_check_epi32_overflow_32(
2867               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
2868               &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
2869               &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
2870               &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
2871               &kZero);
2872           if (overflow) {
2873             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2874             return;
2875           }
2876 #endif  // DCT_HIGH_BIT_DEPTH
2877           u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
2878           u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
2879           u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
2880           u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
2881           u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
2882           u[ 5] = k_packs_epi64(v[10], v[11]);
2883           u[ 6] = k_packs_epi64(v[12], v[13]);
2884           u[ 7] = k_packs_epi64(v[14], v[15]);
2885           u[ 8] = k_packs_epi64(v[16], v[17]);
2886           u[ 9] = k_packs_epi64(v[18], v[19]);
2887           u[10] = k_packs_epi64(v[20], v[21]);
2888           u[11] = k_packs_epi64(v[22], v[23]);
2889           u[12] = k_packs_epi64(v[24], v[25]);
2890           u[13] = k_packs_epi64(v[26], v[27]);
2891           u[14] = k_packs_epi64(v[28], v[29]);
2892           u[15] = k_packs_epi64(v[30], v[31]);
2893 
2894           v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
2895           v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
2896           v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
2897           v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
2898           v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
2899           v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
2900           v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
2901           v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
2902           v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
2903           v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
2904           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2905           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2906           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2907           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2908           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2909           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2910 
2911           u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
2912           u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
2913           u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
2914           u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
2915           u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
2916           u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
2917           u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
2918           u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
2919           u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
2920           u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
2921           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2922           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2923           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2924           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2925           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2926           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2927 
          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
          v[10] = _mm_cmplt_epi32(u[10], kZero);
          v[11] = _mm_cmplt_epi32(u[11], kZero);
          v[12] = _mm_cmplt_epi32(u[12], kZero);
          v[13] = _mm_cmplt_epi32(u[13], kZero);
          v[14] = _mm_cmplt_epi32(u[14], kZero);
          v[15] = _mm_cmplt_epi32(u[15], kZero);

          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
          u[10] = _mm_sub_epi32(u[10], v[10]);
          u[11] = _mm_sub_epi32(u[11], v[11]);
          u[12] = _mm_sub_epi32(u[12], v[12]);
          u[13] = _mm_sub_epi32(u[13], v[13]);
          u[14] = _mm_sub_epi32(u[14], v[14]);
          u[15] = _mm_sub_epi32(u[15], v[15]);

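          // Combined with the increment above, the +1 and >> 2 below compute
          // (x + 1 + (x < 0)) >> 2 per lane, the same rounding the C
          // fallback applies to its row-pass output.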
          v[0] = _mm_add_epi32(u[0], K32One);
          v[1] = _mm_add_epi32(u[1], K32One);
          v[2] = _mm_add_epi32(u[2], K32One);
          v[3] = _mm_add_epi32(u[3], K32One);
          v[4] = _mm_add_epi32(u[4], K32One);
          v[5] = _mm_add_epi32(u[5], K32One);
          v[6] = _mm_add_epi32(u[6], K32One);
          v[7] = _mm_add_epi32(u[7], K32One);
          v[8] = _mm_add_epi32(u[8], K32One);
          v[9] = _mm_add_epi32(u[9], K32One);
          v[10] = _mm_add_epi32(u[10], K32One);
          v[11] = _mm_add_epi32(u[11], K32One);
          v[12] = _mm_add_epi32(u[12], K32One);
          v[13] = _mm_add_epi32(u[13], K32One);
          v[14] = _mm_add_epi32(u[14], K32One);
          v[15] = _mm_add_epi32(u[15], K32One);

          u[0] = _mm_srai_epi32(v[0], 2);
          u[1] = _mm_srai_epi32(v[1], 2);
          u[2] = _mm_srai_epi32(v[2], 2);
          u[3] = _mm_srai_epi32(v[3], 2);
          u[4] = _mm_srai_epi32(v[4], 2);
          u[5] = _mm_srai_epi32(v[5], 2);
          u[6] = _mm_srai_epi32(v[6], 2);
          u[7] = _mm_srai_epi32(v[7], 2);
          u[8] = _mm_srai_epi32(v[8], 2);
          u[9] = _mm_srai_epi32(v[9], 2);
          u[10] = _mm_srai_epi32(v[10], 2);
          u[11] = _mm_srai_epi32(v[11], 2);
          u[12] = _mm_srai_epi32(v[12], 2);
          u[13] = _mm_srai_epi32(v[13], 2);
          u[14] = _mm_srai_epi32(v[14], 2);
          u[15] = _mm_srai_epi32(v[15], 2);

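          // Saturating-pack the rounded results back to 16 bits and scatter
          // them to the output rows this block of the transform produces.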
          out[ 5] = _mm_packs_epi32(u[0], u[1]);
          out[21] = _mm_packs_epi32(u[2], u[3]);
          out[13] = _mm_packs_epi32(u[4], u[5]);
          out[29] = _mm_packs_epi32(u[6], u[7]);
          out[ 3] = _mm_packs_epi32(u[8], u[9]);
          out[19] = _mm_packs_epi32(u[10], u[11]);
          out[11] = _mm_packs_epi32(u[12], u[13]);
          out[27] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
                                             &out[29], &out[3], &out[19],
                                             &out[11], &out[27]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
      }
#endif  // FDCT32x32_HIGH_PRECISION
      // Transpose the results, do it as four 8x8 transposes.
      {
        int transpose_block;
        int16_t *output0 = &intermediate[column_start * 32];
        tran_low_t *output1 = &output_org[column_start * 32];
        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
          __m128i *this_out = &out[8 * transpose_block];
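          // Each 8x8 transpose runs in three interleave stages: 16-bit
          // unpacks of adjacent row pairs, then 32-bit unpacks, then 64-bit
          // unpacks. The element layout after each stage is traced in the
          // comments below (row/column digit pairs).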
          // 00 01 02 03 04 05 06 07
          // 10 11 12 13 14 15 16 17
          // 20 21 22 23 24 25 26 27
          // 30 31 32 33 34 35 36 37
          // 40 41 42 43 44 45 46 47
          // 50 51 52 53 54 55 56 57
          // 60 61 62 63 64 65 66 67
          // 70 71 72 73 74 75 76 77
          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
          // 00 10 01 11 02 12 03 13
          // 20 30 21 31 22 32 23 33
          // 04 14 05 15 06 16 07 17
          // 24 34 25 35 26 36 27 37
          // 40 50 41 51 42 52 43 53
          // 60 70 61 71 62 72 63 73
          // 44 54 45 55 46 56 47 57
          // 64 74 65 75 66 76 67 77
          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
          // 00 10 20 30 01 11 21 31
          // 40 50 60 70 41 51 61 71
          // 02 12 22 32 03 13 23 33
          // 42 52 62 72 43 53 63 73
          // 04 14 24 34 05 15 25 35
          // 44 54 64 74 45 55 65 75
          // 06 16 26 36 07 17 27 37
          // 46 56 66 76 47 57 67 77
          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
          // 00 10 20 30 40 50 60 70
          // 01 11 21 31 41 51 61 71
          // 02 12 22 32 42 52 62 72
          // 03 13 23 33 43 53 63 73
          // 04 14 24 34 44 54 64 74
          // 05 15 25 35 45 55 65 75
          // 06 16 26 36 46 56 66 76
          // 07 17 27 37 47 57 67 77
          if (pass == 0) {
            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
            // TODO(cd): see quality impact of only doing
            //           output[j] = (output[j] + 1) >> 2;
            //           which would remove the code between here ...
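            // Mirroring the 32-bit rounding earlier, _mm_cmpgt_epi16 returns
            // an all-ones mask (-1) in each positive lane, so the subtraction
            // increments exactly the positive lanes: x += (x > 0).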
            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
            //           ... and here.
            //           PS: also change code in vp9/encoder/vp9_dct.c
            tr2_0 = _mm_add_epi16(tr2_0, kOne);
            tr2_1 = _mm_add_epi16(tr2_1, kOne);
            tr2_2 = _mm_add_epi16(tr2_2, kOne);
            tr2_3 = _mm_add_epi16(tr2_3, kOne);
            tr2_4 = _mm_add_epi16(tr2_4, kOne);
            tr2_5 = _mm_add_epi16(tr2_5, kOne);
            tr2_6 = _mm_add_epi16(tr2_6, kOne);
            tr2_7 = _mm_add_epi16(tr2_7, kOne);
            tr2_0 = _mm_srai_epi16(tr2_0, 2);
            tr2_1 = _mm_srai_epi16(tr2_1, 2);
            tr2_2 = _mm_srai_epi16(tr2_2, 2);
            tr2_3 = _mm_srai_epi16(tr2_3, 2);
            tr2_4 = _mm_srai_epi16(tr2_4, 2);
            tr2_5 = _mm_srai_epi16(tr2_5, 2);
            tr2_6 = _mm_srai_epi16(tr2_6, 2);
            tr2_7 = _mm_srai_epi16(tr2_7, 2);
          }
          // Note: even though all these stores are aligned, using the aligned
          //       intrinsic makes the code slightly slower.
          if (pass == 0) {
            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
            // Process next 8x8
            output0 += 8;
          } else {
            storeu_output(&tr2_0, (output1 + 0 * 32));
            storeu_output(&tr2_1, (output1 + 1 * 32));
            storeu_output(&tr2_2, (output1 + 2 * 32));
            storeu_output(&tr2_3, (output1 + 3 * 32));
            storeu_output(&tr2_4, (output1 + 4 * 32));
            storeu_output(&tr2_5, (output1 + 5 * 32));
            storeu_output(&tr2_6, (output1 + 6 * 32));
            storeu_output(&tr2_7, (output1 + 7 * 32));
            // Process next 8x8
            output1 += 8;
          }
        }
      }
    }
  }
}  // NOLINT

#undef ADD_EPI16
#undef SUB_EPI16
#undef HIGH_FDCT32x32_2D_C
#undef HIGH_FDCT32x32_2D_ROWS_C