1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
12 #define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
13 
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
// Builds a vector of four 32-bit lanes holding the repeating pair (a, b):
// lanes 0 and 2 receive (int)(a), lanes 1 and 3 receive (int)(b).
// Each argument appears exactly once in the expansion, so an argument with
// side effects is evaluated a single time.
#define pair_set_epi32(a, b) \
  _mm_unpacklo_epi32(_mm_set1_epi32((int)(a)), _mm_set1_epi32((int)(b)))
20 
// Multiplies corresponding 32-bit lanes of a and b and adds adjacent products,
// yielding two 64-bit sums: (a0 * b0 + a1 * b1) and (a2 * b2 + a3 * b3).
// The products come from _mm_mul_epu32, i.e. the lanes are multiplied as
// unsigned 32-bit values.
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  // Lanes 0 and 2 are already in the position _mm_mul_epu32 reads from.
  const __m128i even_products = _mm_mul_epu32(a, b);
  // Shift lanes 1 and 3 down into the even positions and multiply those too.
  const __m128i odd_products =
      _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
  return _mm_add_epi64(even_products, odd_products);
}
29 
// Narrows the four 64-bit lanes of a and b to 32 bits by keeping only the low
// dword of each lane (no saturation is applied). The result holds, in order,
// a's two lanes followed by b's two lanes.
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  // The shuffle moves dwords 0 and 2 (the low half of each 64-bit lane) into
  // the low 64 bits; the upper 64 bits of the shuffled value are not used.
  const __m128i a_low_dwords = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i b_low_dwords = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(a_low_dwords, b_low_dwords);
}
35 
// Reports whether any 16-bit lane of *preg0 or *preg1 equals 0x7fff or 0x8000
// (the largest and smallest int16 values). Returns 0 when no lane matches;
// otherwise a byte mask (from _mm_movemask_epi8) with bits set at the matching
// lane positions of either register.
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i int16_max = _mm_set1_epi16(0x7fff);
  const __m128i int16_min = _mm_set1_epi16((short)0x8000);
  const __m128i at_max = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_max),
                                      _mm_cmpeq_epi16(*preg1, int16_max));
  const __m128i at_min = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_min),
                                      _mm_cmpeq_epi16(*preg1, int16_min));
  return _mm_movemask_epi8(_mm_or_si128(at_max, at_min));
}
47 
// Reports whether any 16-bit lane of the four registers equals 0x7fff or
// 0x8000 (the largest and smallest int16 values). Returns 0 when no lane
// matches; otherwise a byte mask (from _mm_movemask_epi8) with bits set at the
// matching lane positions of any of the registers.
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i int16_max = _mm_set1_epi16(0x7fff);
  const __m128i int16_min = _mm_set1_epi16((short)0x8000);
  // Lanes sitting at the upper limit, accumulated over all four registers.
  const __m128i at_max_01 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_max),
                                         _mm_cmpeq_epi16(*preg1, int16_max));
  const __m128i at_max_23 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, int16_max),
                                         _mm_cmpeq_epi16(*preg3, int16_max));
  // Lanes sitting at the lower limit, accumulated the same way.
  const __m128i at_min_01 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, int16_min),
                                         _mm_cmpeq_epi16(*preg1, int16_min));
  const __m128i at_min_23 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, int16_min),
                                         _mm_cmpeq_epi16(*preg3, int16_min));
  const __m128i at_limit = _mm_or_si128(_mm_or_si128(at_max_01, at_max_23),
                                        _mm_or_si128(at_min_01, at_min_23));
  return _mm_movemask_epi8(at_limit);
}
65 
// Runs check_epi16_overflow_x4() on registers 0-3 and on registers 4-7 and
// returns the sum of the two results, which is 0 only when both groups
// report 0.
static INLINE int check_epi16_overflow_x8(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7) {
  const int first_group = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int second_group = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  return first_group + second_group;
}
75 
// Checks twelve registers in groups of four with check_epi16_overflow_x4().
// Registers 0-3 and 4-7 are always checked; registers 8-11 are checked only
// when the first group reported nothing. The return value is 0 only when no
// checked group reported a hit.
static INLINE int check_epi16_overflow_x12(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
  const int group_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int group_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (group_a) {
    // A hit was already found; the last group does not need to be examined.
    return group_a + group_b;
  }
  return check_epi16_overflow_x4(preg8, preg9, preg10, preg11) + group_b;
}
87 
// Checks sixteen registers in groups of four with check_epi16_overflow_x4().
// Two running results are kept; once either one is non-zero the remaining
// groups feeding it are skipped. The return value is their sum, which is 0
// only when every group that was examined reported 0.
static INLINE int check_epi16_overflow_x16(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15) {
  int hits_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int hits_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (hits_a) return hits_a + hits_b;

  hits_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  if (!hits_b) {
    hits_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  }
  return hits_a + hits_b;
}
104 
// Checks thirty-two registers in groups of four with
// check_epi16_overflow_x4(). Two running results are updated alternately;
// as soon as the result about to be refreshed is already non-zero, the
// remaining groups are skipped. The return value is the sum of the two
// results, which is 0 only when every group that was examined reported 0.
static INLINE int check_epi16_overflow_x32(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
    const __m128i *preg30, const __m128i *preg31) {
  int hits_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int hits_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (hits_a) return hits_a + hits_b;

  hits_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  if (hits_b) return hits_a + hits_b;

  hits_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  if (hits_a) return hits_a + hits_b;

  hits_a = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
  if (hits_b) return hits_a + hits_b;

  hits_b = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
  if (hits_a) return hits_a + hits_b;

  hits_a = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
  if (hits_b) return hits_a + hits_b;

  hits_b = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
  return hits_a + hits_b;
}
139 
// Tests whether each 64-bit lane of the four registers can be represented as
// a signed 32-bit value. Returns 0 when every lane fits; otherwise the sum of
// two byte masks (registers 0/1 and registers 2/3) marking the offending
// lanes. *zero is compared against directly and is presumably an all-zero
// vector supplied by the caller.
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *zero) {
  const __m128i all_ones = _mm_set1_epi32(-1);
  // After shifting left by one, the upper dword of each 64-bit lane holds the
  // original bits 62..31. For a value that fits in 32 bits those are all
  // copies of the sign bit, so the dword is either 0 or -1.
  const __m128i doubled0 = _mm_slli_epi64(*preg0, 1);
  const __m128i doubled1 = _mm_slli_epi64(*preg1, 1);
  const __m128i doubled2 = _mm_slli_epi64(*preg2, 1);
  const __m128i doubled3 = _mm_slli_epi64(*preg3, 1);
  // Collect the upper dwords (dwords 1 and 3) of two registers per vector.
  const __m128i upper_01 =
      _mm_unpacklo_epi64(_mm_shuffle_epi32(doubled0, _MM_SHUFFLE(0, 0, 3, 1)),
                         _mm_shuffle_epi32(doubled1, _MM_SHUFFLE(0, 0, 3, 1)));
  const __m128i upper_23 =
      _mm_unpacklo_epi64(_mm_shuffle_epi32(doubled2, _MM_SHUFFLE(0, 0, 3, 1)),
                         _mm_shuffle_epi32(doubled3, _MM_SHUFFLE(0, 0, 3, 1)));
  const __m128i in_range_pos_01 = _mm_cmpeq_epi32(upper_01, *zero);
  const __m128i in_range_neg_01 = _mm_cmpeq_epi32(upper_01, all_ones);
  const __m128i in_range_pos_23 = _mm_cmpeq_epi32(upper_23, *zero);
  const __m128i in_range_neg_23 = _mm_cmpeq_epi32(upper_23, all_ones);
  // A dword cannot be both 0 and -1, so the two masks agree only where
  // neither comparison matched, i.e. where the value is out of range.
  const int out_of_range_01 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(in_range_pos_01, in_range_neg_01));
  const int out_of_range_23 =
      _mm_movemask_epi8(_mm_cmpeq_epi32(in_range_pos_23, in_range_neg_23));
  return out_of_range_01 + out_of_range_23;
}
171 
// Applies k_check_epi32_overflow_4() to registers 0-3 and, only if that
// returns 0, to registers 4-7. Returns the first non-zero result, or 0 when
// both groups return 0.
static INLINE int k_check_epi32_overflow_8(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
  const int first_group =
      k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (first_group) return first_group;
  return k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
}
182 
// Applies k_check_epi32_overflow_4() to the sixteen registers four at a time,
// in order, stopping at the first group that returns non-zero. Returns that
// non-zero result, or 0 when every group returns 0.
static INLINE int k_check_epi32_overflow_16(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *zero) {
  int result = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (result) return result;

  return k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
}
203 
// Applies k_check_epi32_overflow_4() to the thirty-two registers four at a
// time, in order, stopping at the first group that returns non-zero. Returns
// that non-zero result, or 0 when every group returns 0.
static INLINE int k_check_epi32_overflow_32(
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
    const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
  int result = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
  if (result) return result;

  result = k_check_epi32_overflow_4(preg24, preg25, preg26, preg27, zero);
  if (result) return result;

  return k_check_epi32_overflow_4(preg28, preg29, preg30, preg31, zero);
}
245 
// Writes the eight 16-bit values of *poutput to dst_ptr with aligned 128-bit
// stores (_mm_store_si128).
// With CONFIG_VP9_HIGHBITDEPTH every value is first sign-extended to 32 bits
// and written as two vectors, at dst_ptr and dst_ptr + 4; otherwise the
// register is stored unchanged at dst_ptr.
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i values = *poutput;
  // An arithmetic right shift by 15 fills each 16-bit lane with its sign bit
  // (0x0000 for non-negative lanes, 0xffff for negative ones).
  const __m128i sign_fill = _mm_srai_epi16(values, 15);
  const __m128i widened_lo = _mm_unpacklo_epi16(values, sign_fill);
  const __m128i widened_hi = _mm_unpackhi_epi16(values, sign_fill);
  _mm_store_si128((__m128i *)(dst_ptr), widened_lo);
  _mm_store_si128((__m128i *)(dst_ptr + 4), widened_hi);
#else
  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
258 
// Writes the eight 16-bit values of *poutput to dst_ptr with unaligned
// 128-bit stores (_mm_storeu_si128).
// With CONFIG_VP9_HIGHBITDEPTH every value is first sign-extended to 32 bits
// and written as two vectors, at dst_ptr and dst_ptr + 4; otherwise the
// register is stored unchanged at dst_ptr.
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i values = *poutput;
  // An arithmetic right shift by 15 fills each 16-bit lane with its sign bit
  // (0x0000 for non-negative lanes, 0xffff for negative ones).
  const __m128i sign_fill = _mm_srai_epi16(values, 15);
  const __m128i widened_lo = _mm_unpacklo_epi16(values, sign_fill);
  const __m128i widened_hi = _mm_unpackhi_epi16(values, sign_fill);
  _mm_storeu_si128((__m128i *)(dst_ptr), widened_lo);
  _mm_storeu_si128((__m128i *)(dst_ptr + 4), widened_hi);
#else
  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
271 
// For each input register, multiplies 16-bit lanes by *pmultiplier and adds
// adjacent products into 32-bit sums (_mm_madd_epi16), adds *prounding, and
// shifts right arithmetically by `shift`. The four results from *pin0 and the
// four from *pin1 are then packed, with signed saturation, into the low and
// high halves of the returned vector of eight 16-bit values.
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
                                       const __m128i *pmultiplier,
                                       const __m128i *prounding,
                                       const int shift) {
  const __m128i sums0 = _mm_madd_epi16(*pin0, *pmultiplier);
  const __m128i sums1 = _mm_madd_epi16(*pin1, *pmultiplier);
  const __m128i scaled0 =
      _mm_srai_epi32(_mm_add_epi32(sums0, *prounding), shift);
  const __m128i scaled1 =
      _mm_srai_epi32(_mm_add_epi32(sums1, *prounding), shift);
  return _mm_packs_epi32(scaled0, scaled1);
}
284 
// Transposes an 8x8 block of 16-bit values supplied as eight registers (one
// row per register) and writes the eight transposed rows 16 elements apart.
// When pass == 0 the rows are stored to out0_ptr as 16-bit data; otherwise
// they are written to out1_ptr through storeu_output(). Only the pointer
// selected by `pass` is written to.
static INLINE void transpose_and_output8x8(
    const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
    const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
    const __m128i *pin06, const __m128i *pin07, const int pass,
    int16_t *out0_ptr, tran_low_t *out1_ptr) {
  // Input layout ("rc" is the element at row r, column c):
  //   pin00: 00 01 02 03 04 05 06 07
  //   pin01: 10 11 12 13 14 15 16 17
  //   pin02: 20 21 22 23 24 25 26 27
  //   pin03: 30 31 32 33 34 35 36 37
  //   pin04: 40 41 42 43 44 45 46 47
  //   pin05: 50 51 52 53 54 55 56 57
  //   pin06: 60 61 62 63 64 65 66 67
  //   pin07: 70 71 72 73 74 75 76 77

  // Step 1: interleave the 16-bit elements of neighbouring rows.
  const __m128i rows01_lo = _mm_unpacklo_epi16(*pin00, *pin01);
  const __m128i rows01_hi = _mm_unpackhi_epi16(*pin00, *pin01);
  const __m128i rows23_lo = _mm_unpacklo_epi16(*pin02, *pin03);
  const __m128i rows23_hi = _mm_unpackhi_epi16(*pin02, *pin03);
  const __m128i rows45_lo = _mm_unpacklo_epi16(*pin04, *pin05);
  const __m128i rows45_hi = _mm_unpackhi_epi16(*pin04, *pin05);
  const __m128i rows67_lo = _mm_unpacklo_epi16(*pin06, *pin07);
  const __m128i rows67_hi = _mm_unpackhi_epi16(*pin06, *pin07);
  //   rows01_lo: 00 10 01 11 02 12 03 13
  //   rows01_hi: 04 14 05 15 06 16 07 17
  //   rows23_lo: 20 30 21 31 22 32 23 33
  //   rows23_hi: 24 34 25 35 26 36 27 37
  //   rows45_lo: 40 50 41 51 42 52 43 53
  //   rows45_hi: 44 54 45 55 46 56 47 57
  //   rows67_lo: 60 70 61 71 62 72 63 73
  //   rows67_hi: 64 74 65 75 66 76 67 77

  // Step 2: interleave 32-bit pairs, giving half-columns of four elements.
  const __m128i top_cols01 = _mm_unpacklo_epi32(rows01_lo, rows23_lo);
  const __m128i top_cols23 = _mm_unpackhi_epi32(rows01_lo, rows23_lo);
  const __m128i top_cols45 = _mm_unpacklo_epi32(rows01_hi, rows23_hi);
  const __m128i top_cols67 = _mm_unpackhi_epi32(rows01_hi, rows23_hi);
  const __m128i bot_cols01 = _mm_unpacklo_epi32(rows45_lo, rows67_lo);
  const __m128i bot_cols23 = _mm_unpackhi_epi32(rows45_lo, rows67_lo);
  const __m128i bot_cols45 = _mm_unpacklo_epi32(rows45_hi, rows67_hi);
  const __m128i bot_cols67 = _mm_unpackhi_epi32(rows45_hi, rows67_hi);
  //   top_cols01: 00 10 20 30 01 11 21 31
  //   top_cols23: 02 12 22 32 03 13 23 33
  //   top_cols45: 04 14 24 34 05 15 25 35
  //   top_cols67: 06 16 26 36 07 17 27 37
  //   bot_cols01: 40 50 60 70 41 51 61 71
  //   bot_cols23: 42 52 62 72 43 53 63 73
  //   bot_cols45: 44 54 64 74 45 55 65 75
  //   bot_cols67: 46 56 66 76 47 57 67 77

  // Step 3: join the top and bottom half-columns into full columns.
  const __m128i col0 = _mm_unpacklo_epi64(top_cols01, bot_cols01);
  const __m128i col1 = _mm_unpackhi_epi64(top_cols01, bot_cols01);
  const __m128i col2 = _mm_unpacklo_epi64(top_cols23, bot_cols23);
  const __m128i col3 = _mm_unpackhi_epi64(top_cols23, bot_cols23);
  const __m128i col4 = _mm_unpacklo_epi64(top_cols45, bot_cols45);
  const __m128i col5 = _mm_unpackhi_epi64(top_cols45, bot_cols45);
  const __m128i col6 = _mm_unpacklo_epi64(top_cols67, bot_cols67);
  const __m128i col7 = _mm_unpackhi_epi64(top_cols67, bot_cols67);
  //   col0: 00 10 20 30 40 50 60 70
  //   col1: 01 11 21 31 41 51 61 71
  //   col2: 02 12 22 32 42 52 62 72
  //   col3: 03 13 23 33 43 53 63 73
  //   col4: 04 14 24 34 44 54 64 74
  //   col5: 05 15 25 35 45 55 65 75
  //   col6: 06 16 26 36 46 56 66 76
  //   col7: 07 17 27 37 47 57 67 77

  if (pass != 0) {
    storeu_output(&col0, (out1_ptr + 0 * 16));
    storeu_output(&col1, (out1_ptr + 1 * 16));
    storeu_output(&col2, (out1_ptr + 2 * 16));
    storeu_output(&col3, (out1_ptr + 3 * 16));
    storeu_output(&col4, (out1_ptr + 4 * 16));
    storeu_output(&col5, (out1_ptr + 5 * 16));
    storeu_output(&col6, (out1_ptr + 6 * 16));
    storeu_output(&col7, (out1_ptr + 7 * 16));
  } else {
    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), col0);
    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), col1);
    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), col2);
    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), col3);
    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), col4);
    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), col5);
    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), col6);
    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), col7);
  }
}
366 
367 #ifdef __cplusplus
368 }  // extern "C"
369 #endif
370 
371 #endif  // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
372