/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
12 #define VPX_DSP_X86_FWD_TXFM_SSE2_H_
13
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17
// Build a register whose 32-bit lanes, from low to high, are { a, b, a, b }.
#define pair_set_epi32(a, b) \
  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
20
// For each 64-bit lane, computes lo32(a) * lo32(b) + hi32(a) * hi32(b)
// using unsigned 32x32->64 multiplies, returning the two 64-bit sums.
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  const __m128i lo_prod = _mm_mul_epu32(a, b);
  const __m128i hi_prod =
      _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
  return _mm_add_epi64(lo_prod, hi_prod);
}
29
// Pack the low 32 bits of each 64-bit lane of |a| and |b| into one
// register; 32-bit lanes of the result, low to high: { a0, a2, b0, b2 }.
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  const __m128i low_a = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i low_b = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(low_a, low_b);
}
35
// Returns nonzero iff any 16-bit lane of *preg0 or *preg1 equals
// INT16_MAX (0x7fff) or INT16_MIN (0x8000), the values produced by
// saturating 16-bit arithmetic on overflow.
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                          const __m128i *preg1) {
  const __m128i max_val = _mm_set1_epi16(0x7fff);
  const __m128i min_val = _mm_set1_epi16(0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_val),
                                    _mm_cmpeq_epi16(*preg0, min_val));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_val),
                                    _mm_cmpeq_epi16(*preg1, min_val));
  return _mm_movemask_epi8(_mm_or_si128(hit0, hit1));
}
47
// Returns nonzero iff any 16-bit lane of the four registers equals
// INT16_MAX (0x7fff) or INT16_MIN (0x8000).
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3) {
  const __m128i max_val = _mm_set1_epi16(0x7fff);
  const __m128i min_val = _mm_set1_epi16(0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_val),
                                    _mm_cmpeq_epi16(*preg0, min_val));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_val),
                                    _mm_cmpeq_epi16(*preg1, min_val));
  const __m128i hit2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_val),
                                    _mm_cmpeq_epi16(*preg2, min_val));
  const __m128i hit3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_val),
                                    _mm_cmpeq_epi16(*preg3, min_val));
  const __m128i any =
      _mm_or_si128(_mm_or_si128(hit0, hit1), _mm_or_si128(hit2, hit3));
  return _mm_movemask_epi8(any);
}
65
// Returns nonzero iff any 16-bit lane of the eight registers equals
// INT16_MAX (0x7fff) or INT16_MIN (0x8000).
static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
                                          const __m128i *preg1,
                                          const __m128i *preg2,
                                          const __m128i *preg3,
                                          const __m128i *preg4,
                                          const __m128i *preg5,
                                          const __m128i *preg6,
                                          const __m128i *preg7) {
  const int res_lo = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int res_hi = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  return res_lo + res_hi;
}
79
// Returns nonzero iff any 16-bit lane of the twelve registers equals
// INT16_MAX or INT16_MIN.  The last group of four is only scanned when
// the first group came back clean; callers only compare the result
// against zero, so the skipped scan cannot change the outcome.
static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *preg8,
                                           const __m128i *preg9,
                                           const __m128i *preg10,
                                           const __m128i *preg11) {
  int res_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  const int res_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (!res_a)
    res_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
  return res_a + res_b;
}
99
// Returns nonzero iff any 16-bit lane of the sixteen registers equals
// INT16_MAX or INT16_MIN.  Later groups are only scanned while the two
// running results are still zero; callers only test against zero.
static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *preg8,
                                           const __m128i *preg9,
                                           const __m128i *preg10,
                                           const __m128i *preg11,
                                           const __m128i *preg12,
                                           const __m128i *preg13,
                                           const __m128i *preg14,
                                           const __m128i *preg15) {
  int res_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int res_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (!res_a) {
    res_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
    if (!res_b)
      res_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
  }
  return res_a + res_b;
}
126
// Returns nonzero iff any 16-bit lane of the thirty-two registers
// equals INT16_MAX or INT16_MIN.  As in the smaller variants, each
// successive group of four is only scanned while the result variable it
// would update is still zero; callers only test the sum against zero.
static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *preg8,
                                           const __m128i *preg9,
                                           const __m128i *preg10,
                                           const __m128i *preg11,
                                           const __m128i *preg12,
                                           const __m128i *preg13,
                                           const __m128i *preg14,
                                           const __m128i *preg15,
                                           const __m128i *preg16,
                                           const __m128i *preg17,
                                           const __m128i *preg18,
                                           const __m128i *preg19,
                                           const __m128i *preg20,
                                           const __m128i *preg21,
                                           const __m128i *preg22,
                                           const __m128i *preg23,
                                           const __m128i *preg24,
                                           const __m128i *preg25,
                                           const __m128i *preg26,
                                           const __m128i *preg27,
                                           const __m128i *preg28,
                                           const __m128i *preg29,
                                           const __m128i *preg30,
                                           const __m128i *preg31) {
  int res_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
  int res_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
  if (!res_a) {
    res_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
    if (!res_b) {
      res_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
      if (!res_a) {
        res_a = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
        if (!res_b) {
          res_b = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
          if (!res_a) {
            res_a = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
            if (!res_b)
              res_b = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
          }
        }
      }
    }
  }
  return res_a + res_b;
}
181
// Returns nonzero when, after shifting each 64-bit lane of the four
// registers left by one bit, the upper 32 bits of any lane are neither
// all zeros nor all ones (i.e. not a pure sign extension).  |zero| must
// point to an all-zero register.  NOTE(review): this appears to verify
// that each 64-bit lane still fits in a signed 32-bit range — confirm
// against the high-bit-depth callers.
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *zero) {
  const __m128i minus_one = _mm_set1_epi32(-1);
  // Shift each 64-bit lane left by one, then gather the high dword of
  // every lane into the low half of a register.
  const __m128i sh0 = _mm_slli_epi64(*preg0, 1);
  const __m128i sh1 = _mm_slli_epi64(*preg1, 1);
  const __m128i sh2 = _mm_slli_epi64(*preg2, 1);
  const __m128i sh3 = _mm_slli_epi64(*preg3, 1);
  const __m128i top0 = _mm_shuffle_epi32(sh0, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top1 = _mm_shuffle_epi32(sh1, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top2 = _mm_shuffle_epi32(sh2, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top3 = _mm_shuffle_epi32(sh3, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i tops_01 = _mm_unpacklo_epi64(top0, top1);
  const __m128i tops_23 = _mm_unpacklo_epi64(top2, top3);
  const __m128i valid_positive_01 = _mm_cmpeq_epi32(tops_01, *zero);
  const __m128i valid_positive_23 = _mm_cmpeq_epi32(tops_23, *zero);
  const __m128i valid_negative_01 = _mm_cmpeq_epi32(tops_01, minus_one);
  const __m128i valid_negative_23 = _mm_cmpeq_epi32(tops_23, minus_one);
  // A dword overflows when it is neither 0 nor -1; both "valid" masks
  // are then false, so cmpeq of the two masks yields all ones there.
  const int overflow_01 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
  const int overflow_23 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
  return overflow_01 + overflow_23;
}
213
// Returns nonzero when k_check_epi32_overflow_4() flags any of the
// eight registers; the second group is only examined when the first
// comes back clean.
static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
                                           const __m128i *preg1,
                                           const __m128i *preg2,
                                           const __m128i *preg3,
                                           const __m128i *preg4,
                                           const __m128i *preg5,
                                           const __m128i *preg6,
                                           const __m128i *preg7,
                                           const __m128i *zero) {
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (!overflow)
    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  return overflow;
}
229
// Returns nonzero when k_check_epi32_overflow_4() flags any of the
// sixteen registers.  Groups of four are checked in order, stopping at
// the first group that reports overflow.
static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
                                            const __m128i *preg1,
                                            const __m128i *preg2,
                                            const __m128i *preg3,
                                            const __m128i *preg4,
                                            const __m128i *preg5,
                                            const __m128i *preg6,
                                            const __m128i *preg7,
                                            const __m128i *preg8,
                                            const __m128i *preg9,
                                            const __m128i *preg10,
                                            const __m128i *preg11,
                                            const __m128i *preg12,
                                            const __m128i *preg13,
                                            const __m128i *preg14,
                                            const __m128i *preg15,
                                            const __m128i *zero) {
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
  if (!overflow)
    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
  if (!overflow)
    overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
  if (!overflow)
    overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
  return overflow;
}
261
// Returns nonzero when k_check_epi32_overflow_4() flags any of the
// thirty-two registers.  Groups of four are checked in order and the
// loop stops at the first group that reports overflow, matching the
// short-circuit behavior of the smaller variants.
static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
                                            const __m128i *preg1,
                                            const __m128i *preg2,
                                            const __m128i *preg3,
                                            const __m128i *preg4,
                                            const __m128i *preg5,
                                            const __m128i *preg6,
                                            const __m128i *preg7,
                                            const __m128i *preg8,
                                            const __m128i *preg9,
                                            const __m128i *preg10,
                                            const __m128i *preg11,
                                            const __m128i *preg12,
                                            const __m128i *preg13,
                                            const __m128i *preg14,
                                            const __m128i *preg15,
                                            const __m128i *preg16,
                                            const __m128i *preg17,
                                            const __m128i *preg18,
                                            const __m128i *preg19,
                                            const __m128i *preg20,
                                            const __m128i *preg21,
                                            const __m128i *preg22,
                                            const __m128i *preg23,
                                            const __m128i *preg24,
                                            const __m128i *preg25,
                                            const __m128i *preg26,
                                            const __m128i *preg27,
                                            const __m128i *preg28,
                                            const __m128i *preg29,
                                            const __m128i *preg30,
                                            const __m128i *preg31,
                                            const __m128i *zero) {
  const __m128i *const group[8][4] = {
    { preg0,  preg1,  preg2,  preg3  },
    { preg4,  preg5,  preg6,  preg7  },
    { preg8,  preg9,  preg10, preg11 },
    { preg12, preg13, preg14, preg15 },
    { preg16, preg17, preg18, preg19 },
    { preg20, preg21, preg22, preg23 },
    { preg24, preg25, preg26, preg27 },
    { preg28, preg29, preg30, preg31 }
  };
  int i;
  int overflow = 0;
  for (i = 0; i < 8 && !overflow; ++i) {
    overflow = k_check_epi32_overflow_4(group[i][0], group[i][1],
                                        group[i][2], group[i][3], zero);
  }
  return overflow;
}
324
// Store eight 16-bit coefficients to |dst_ptr| with an aligned store.
// When CONFIG_VP9_HIGHBITDEPTH is enabled the lanes are sign-extended
// to 32 bits first, to match the wider tran_low_t element size.
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i zero = _mm_setzero_si128();
  // cmplt against zero yields all-ones for negative lanes, giving the
  // high halves of the sign-extended 32-bit values.
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  const __m128i lo = _mm_unpacklo_epi16(*poutput, sign_bits);
  const __m128i hi = _mm_unpackhi_epi16(*poutput, sign_bits);
  _mm_store_si128((__m128i *)dst_ptr, lo);
  _mm_store_si128((__m128i *)(dst_ptr + 4), hi);
#else
  _mm_store_si128((__m128i *)dst_ptr, *poutput);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
337
// Same as store_output() but uses unaligned stores, so |dst_ptr| need
// not be 16-byte aligned.
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i zero = _mm_setzero_si128();
  // All-ones for negative lanes: the high halves of the sign-extended
  // 32-bit values.
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  const __m128i lo = _mm_unpacklo_epi16(*poutput, sign_bits);
  const __m128i hi = _mm_unpackhi_epi16(*poutput, sign_bits);
  _mm_storeu_si128((__m128i *)dst_ptr, lo);
  _mm_storeu_si128((__m128i *)(dst_ptr + 4), hi);
#else
  _mm_storeu_si128((__m128i *)dst_ptr, *poutput);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
350
351
// Computes (madd_epi16(in, multiplier) + rounding) >> shift on 32-bit
// products for both inputs, then saturating-packs the two 32-bit
// results back into one register of 16-bit lanes.
static INLINE __m128i mult_round_shift(const __m128i *pin0,
                                       const __m128i *pin1,
                                       const __m128i *pmultiplier,
                                       const __m128i *prounding,
                                       const int shift) {
  const __m128i prod0 = _mm_madd_epi16(*pin0, *pmultiplier);
  const __m128i prod1 = _mm_madd_epi16(*pin1, *pmultiplier);
  const __m128i res0 = _mm_srai_epi32(_mm_add_epi32(prod0, *prounding), shift);
  const __m128i res1 = _mm_srai_epi32(_mm_add_epi32(prod1, *prounding), shift);
  return _mm_packs_epi32(res0, res1);
}
365
// Transpose the 8x8 block of 16-bit values held one row per register in
// |pin00|..|pin07|.  On pass 0 the transposed rows are written as
// int16_t to |out0_ptr| with a row stride of 16; on later passes they
// are written through storeu_output() to |out1_ptr|, same stride.
static INLINE void transpose_and_output8x8(
    const __m128i *pin00, const __m128i *pin01,
    const __m128i *pin02, const __m128i *pin03,
    const __m128i *pin04, const __m128i *pin05,
    const __m128i *pin06, const __m128i *pin07,
    const int pass, int16_t *out0_ptr,
    tran_low_t *out1_ptr) {
  // Input rows (element rc = row r, column c):
  // 00 01 02 03 04 05 06 07 ... 70 71 72 73 74 75 76 77
  // Stage 1: interleave 16-bit lanes of adjacent rows.
  // a0: 00 10 01 11 02 12 03 13    a4: 40 50 41 51 42 52 43 53
  // a1: 20 30 21 31 22 32 23 33    a5: 60 70 61 71 62 72 63 73
  // a2: 04 14 05 15 06 16 07 17    a6: 44 54 45 55 46 56 47 57
  // a3: 24 34 25 35 26 36 27 37    a7: 64 74 65 75 66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi16(*pin00, *pin01);
  const __m128i a1 = _mm_unpacklo_epi16(*pin02, *pin03);
  const __m128i a2 = _mm_unpackhi_epi16(*pin00, *pin01);
  const __m128i a3 = _mm_unpackhi_epi16(*pin02, *pin03);
  const __m128i a4 = _mm_unpacklo_epi16(*pin04, *pin05);
  const __m128i a5 = _mm_unpacklo_epi16(*pin06, *pin07);
  const __m128i a6 = _mm_unpackhi_epi16(*pin04, *pin05);
  const __m128i a7 = _mm_unpackhi_epi16(*pin06, *pin07);
  // Stage 2: interleave 32-bit pairs.
  // b0: 00 10 20 30 01 11 21 31    b4: 40 50 60 70 41 51 61 71
  // b1: 04 14 24 34 05 15 25 35    b5: 44 54 64 74 45 55 65 75
  // b2: 02 12 22 32 03 13 23 33    b6: 42 52 62 72 43 53 63 73
  // b3: 06 16 26 36 07 17 27 37    b7: 46 56 66 76 47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b5 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  // Stage 3: interleave 64-bit halves; c0..c7 are the transposed rows.
  // c0: 00 10 20 30 40 50 60 70 ... c7: 07 17 27 37 47 57 67 77
  const __m128i c0 = _mm_unpacklo_epi64(b0, b4);
  const __m128i c1 = _mm_unpackhi_epi64(b0, b4);
  const __m128i c2 = _mm_unpacklo_epi64(b2, b6);
  const __m128i c3 = _mm_unpackhi_epi64(b2, b6);
  const __m128i c4 = _mm_unpacklo_epi64(b1, b5);
  const __m128i c5 = _mm_unpackhi_epi64(b1, b5);
  const __m128i c6 = _mm_unpacklo_epi64(b3, b7);
  const __m128i c7 = _mm_unpackhi_epi64(b3, b7);
  if (pass == 0) {
    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), c0);
    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), c1);
    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), c2);
    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), c3);
    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), c4);
    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), c5);
    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), c6);
    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), c7);
  } else {
    storeu_output(&c0, out1_ptr + 0 * 16);
    storeu_output(&c1, out1_ptr + 1 * 16);
    storeu_output(&c2, out1_ptr + 2 * 16);
    storeu_output(&c3, out1_ptr + 3 * 16);
    storeu_output(&c4, out1_ptr + 4 * 16);
    storeu_output(&c5, out1_ptr + 5 * 16);
    storeu_output(&c6, out1_ptr + 6 * 16);
    storeu_output(&c7, out1_ptr + 7 * 16);
  }
}
449
450 #ifdef __cplusplus
451 } // extern "C"
452 #endif
453
454 #endif // VPX_DSP_X86_FWD_TXFM_SSE2_H_
455