1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 #include "vp9/common/vp9_idct.h"  // for cospi constants
13 #include "vpx_ports/mem.h"
14 
vp9_fdct4x4_1_sse2(const int16_t * input,int16_t * output,int stride)15 void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
16   __m128i in0, in1;
17   __m128i tmp;
18   const __m128i zero = _mm_setzero_si128();
19   in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
20   in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
21   in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
22          (input +  2 * stride)));
23   in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
24          (input +  3 * stride)));
25 
26   tmp = _mm_add_epi16(in0, in1);
27   in0 = _mm_unpacklo_epi16(zero, tmp);
28   in1 = _mm_unpackhi_epi16(zero, tmp);
29   in0 = _mm_srai_epi32(in0, 16);
30   in1 = _mm_srai_epi32(in1, 16);
31 
32   tmp = _mm_add_epi32(in0, in1);
33   in0 = _mm_unpacklo_epi32(tmp, zero);
34   in1 = _mm_unpackhi_epi32(tmp, zero);
35 
36   tmp = _mm_add_epi32(in0, in1);
37   in0 = _mm_srli_si128(tmp, 8);
38 
39   in1 = _mm_add_epi32(tmp, in0);
40   in0 = _mm_slli_epi32(in1, 1);
41   _mm_store_si128((__m128i *)(output), in0);
42 }
43 
vp9_fdct4x4_sse2(const int16_t * input,int16_t * output,int stride)44 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
45   // This 2D transform implements 4 vertical 1D transforms followed
46   // by 4 horizontal 1D transforms.  The multiplies and adds are as given
47   // by Chen, Smith and Fralick ('77).  The commands for moving the data
48   // around have been minimized by hand.
49   // For the purposes of the comments, the 16 inputs are referred to at i0
50   // through iF (in raster order), intermediate variables are a0, b0, c0
51   // through f, and correspond to the in-place computations mapped to input
52   // locations.  The outputs, o0 through oF are labeled according to the
53   // output locations.
54 
55   // Constants
56   // These are the coefficients used for the multiplies.
57   // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
58   // where cospi_N_64 = cos(N pi /64)
59   const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
60                                             cospi_16_64, cospi_16_64,
61                                             cospi_16_64, -cospi_16_64,
62                                             cospi_16_64, -cospi_16_64);
63   const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
64                                             cospi_16_64, -cospi_16_64,
65                                             cospi_16_64, cospi_16_64,
66                                             cospi_16_64, cospi_16_64);
67   const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
68                                             cospi_8_64, cospi_24_64,
69                                             cospi_24_64, -cospi_8_64,
70                                             cospi_24_64, -cospi_8_64);
71   const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
72                                             cospi_24_64, -cospi_8_64,
73                                             cospi_8_64, cospi_24_64,
74                                             cospi_8_64, cospi_24_64);
75   const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
76                                             cospi_16_64, cospi_16_64,
77                                             cospi_16_64, cospi_16_64,
78                                             cospi_16_64, cospi_16_64);
79   const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
80                                             cospi_16_64, -cospi_16_64,
81                                             cospi_16_64, -cospi_16_64,
82                                             cospi_16_64, -cospi_16_64);
83   const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
84                                             cospi_8_64, cospi_24_64,
85                                             -cospi_8_64, -cospi_24_64,
86                                             -cospi_8_64, -cospi_24_64);
87   const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
88                                             cospi_24_64, -cospi_8_64,
89                                             -cospi_24_64, cospi_8_64,
90                                             -cospi_24_64, cospi_8_64);
91 
92   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
93   // This second rounding constant saves doing some extra adds at the end
94   const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
95                                                +(DCT_CONST_ROUNDING << 1));
96   const int DCT_CONST_BITS2 =  DCT_CONST_BITS+2;
97   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
98   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
99   __m128i in0, in1;
100 
101   // Load inputs.
102   {
103     in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
104     in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
105     in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
106            (input +  2 * stride)));
107     in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
108            (input +  3 * stride)));
109     // in0 = [i0 i1 i2 i3 iC iD iE iF]
110     // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
111 
112 
113     // multiply by 16 to give some extra precision
114     in0 = _mm_slli_epi16(in0, 4);
115     in1 = _mm_slli_epi16(in1, 4);
116     // if (i == 0 && input[0]) input[0] += 1;
117     // add 1 to the upper left pixel if it is non-zero, which helps reduce
118     // the round-trip error
119     {
120       // The mask will only contain whether the first value is zero, all
121       // other comparison will fail as something shifted by 4 (above << 4)
122       // can never be equal to one. To increment in the non-zero case, we
123       // add the mask and one for the first element:
124       //   - if zero, mask = -1, v = v - 1 + 1 = v
125       //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
126       __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
127       in0 = _mm_add_epi16(in0, mask);
128       in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
129     }
130   }
131   // There are 4 total stages, alternating between an add/subtract stage
132   // followed by an multiply-and-add stage.
133   {
134     // Stage 1: Add/subtract
135 
136     // in0 = [i0 i1 i2 i3 iC iD iE iF]
137     // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
138     const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
139     const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
140     // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
141     // r1 = [iC i8 iD i9 iE iA iF iB]
142     const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
143     const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
144     // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
145     // r3 = [iC i8 iD i9 iF iB iE iA]
146 
147     const __m128i t0 = _mm_add_epi16(r2, r3);
148     const __m128i t1 = _mm_sub_epi16(r2, r3);
149     // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
150     // t1 = [aC a8 aD a9 aF aB aE aA]
151 
152     // Stage 2: multiply by constants (which gets us into 32 bits).
153     // The constants needed here are:
154     // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
155     // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
156     // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
157     // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
158     const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
159     const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
160     const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
161     const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
162     // Then add and right-shift to get back to 16-bit range
163     const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
164     const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
165     const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
166     const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
167     const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
168     const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
169     const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
170     const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
171     // w0 = [b0 b1 b7 b6]
172     // w1 = [b8 b9 bF bE]
173     // w2 = [b4 b5 b3 b2]
174     // w3 = [bC bD bB bA]
175     const __m128i x0 = _mm_packs_epi32(w0, w1);
176     const __m128i x1 = _mm_packs_epi32(w2, w3);
177     // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
178     // x1 = [b4 b5 b3 b2 bC bD bB bA]
179     in0 = _mm_shuffle_epi32(x0, 0xD8);
180     in1 = _mm_shuffle_epi32(x1, 0x8D);
181     // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
182     // in1 = [b3 b2 bB bA b4 b5 bC bD]
183   }
184   {
185     // vertical DCTs finished. Now we do the horizontal DCTs.
186     // Stage 3: Add/subtract
187 
188     const __m128i t0 = _mm_add_epi16(in0, in1);
189     const __m128i t1 = _mm_sub_epi16(in0, in1);
190     // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
191     // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
192 
193     // Stage 4: multiply by constants (which gets us into 32 bits).
194     // The constants needed here are:
195     // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
196     // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
197     // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
198     // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
199     const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
200     const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
201     const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
202     const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
203     // Then add and right-shift to get back to 16-bit range
204     // but this combines the final right-shift as well to save operations
205     // This unusual rounding operations is to maintain bit-accurate
206     // compatibility with the c version of this function which has two
207     // rounding steps in a row.
208     const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
209     const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
210     const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
211     const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
212     const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
213     const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
214     const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
215     const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
216     // w0 = [o0 o4 o8 oC]
217     // w1 = [o2 o6 oA oE]
218     // w2 = [o1 o5 o9 oD]
219     // w3 = [o3 o7 oB oF]
220     // remember the o's are numbered according to the correct output location
221     const __m128i x0 = _mm_packs_epi32(w0, w1);
222     const __m128i x1 = _mm_packs_epi32(w2, w3);
223     // x0 = [o0 o4 o8 oC o2 o6 oA oE]
224     // x1 = [o1 o5 o9 oD o3 o7 oB oF]
225     const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
226     const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
227     // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
228     // y1 = [o2 o3 o6 o7 oA oB oE oF]
229     in0 = _mm_unpacklo_epi32(y0, y1);
230     // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
231     in1 = _mm_unpackhi_epi32(y0, y1);
232     // in1 = [o8 o9 oA oB oC oD oE oF]
233   }
234   // Post-condition (v + 1) >> 2 is now incorporated into previous
235   // add and right-shift commands.  Only 2 store instructions needed
236   // because we are using the fact that 1/3 are stored just after 0/2.
237   {
238      _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
239      _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
240   }
241 }
242 
243 
load_buffer_4x4(const int16_t * input,__m128i * in,int stride)244 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
245                                    int stride) {
246   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
247   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
248   __m128i mask;
249 
250   in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
251   in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
252   in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
253   in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
254 
255   in[0] = _mm_slli_epi16(in[0], 4);
256   in[1] = _mm_slli_epi16(in[1], 4);
257   in[2] = _mm_slli_epi16(in[2], 4);
258   in[3] = _mm_slli_epi16(in[3], 4);
259 
260   mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
261   in[0] = _mm_add_epi16(in[0], mask);
262   in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
263 }
264 
write_buffer_4x4(int16_t * output,__m128i * res)265 static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
266   const __m128i kOne = _mm_set1_epi16(1);
267   __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
268   __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
269   __m128i out01 = _mm_add_epi16(in01, kOne);
270   __m128i out23 = _mm_add_epi16(in23, kOne);
271   out01 = _mm_srai_epi16(out01, 2);
272   out23 = _mm_srai_epi16(out23, 2);
273   _mm_store_si128((__m128i *)(output + 0 * 8), out01);
274   _mm_store_si128((__m128i *)(output + 1 * 8), out23);
275 }
276 
transpose_4x4(__m128i * res)277 static INLINE void transpose_4x4(__m128i *res) {
278   // Combine and transpose
279   // 00 01 02 03 20 21 22 23
280   // 10 11 12 13 30 31 32 33
281   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
282   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
283 
284   // 00 10 01 11 02 12 03 13
285   // 20 30 21 31 22 32 23 33
286   res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
287   res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
288 
289   // 00 10 20 30 01 11 21 31
290   // 02 12 22 32 03 13 23 33
291   // only use the first 4 16-bit integers
292   res[1] = _mm_unpackhi_epi64(res[0], res[0]);
293   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
294 }
295 
fdct4_sse2(__m128i * in)296 void fdct4_sse2(__m128i *in) {
297   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
298   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
299   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
300   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
301   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
302 
303   __m128i u[4], v[4];
304   u[0]=_mm_unpacklo_epi16(in[0], in[1]);
305   u[1]=_mm_unpacklo_epi16(in[3], in[2]);
306 
307   v[0] = _mm_add_epi16(u[0], u[1]);
308   v[1] = _mm_sub_epi16(u[0], u[1]);
309 
310   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
311   u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
312   u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
313   u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
314 
315   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
316   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
317   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
318   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
319   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
320   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
321   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
322   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
323 
324   in[0] = _mm_packs_epi32(u[0], u[1]);
325   in[1] = _mm_packs_epi32(u[2], u[3]);
326   transpose_4x4(in);
327 }
328 
fadst4_sse2(__m128i * in)329 void fadst4_sse2(__m128i *in) {
330   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
331   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
332   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
333   const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
334   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
335   const __m128i kZero = _mm_set1_epi16(0);
336   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
337   __m128i u[8], v[8];
338   __m128i in7 = _mm_add_epi16(in[0], in[1]);
339 
340   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
341   u[1] = _mm_unpacklo_epi16(in[2], in[3]);
342   u[2] = _mm_unpacklo_epi16(in7, kZero);
343   u[3] = _mm_unpacklo_epi16(in[2], kZero);
344   u[4] = _mm_unpacklo_epi16(in[3], kZero);
345 
346   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
347   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
348   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
349   v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
350   v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
351   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
352   v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
353 
354   u[0] = _mm_add_epi32(v[0], v[1]);
355   u[1] = _mm_sub_epi32(v[2], v[6]);
356   u[2] = _mm_add_epi32(v[3], v[4]);
357   u[3] = _mm_sub_epi32(u[2], u[0]);
358   u[4] = _mm_slli_epi32(v[5], 2);
359   u[5] = _mm_sub_epi32(u[4], v[5]);
360   u[6] = _mm_add_epi32(u[3], u[5]);
361 
362   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
363   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
364   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
365   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
366 
367   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
368   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
369   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
370   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
371 
372   in[0] = _mm_packs_epi32(u[0], u[2]);
373   in[1] = _mm_packs_epi32(u[1], u[3]);
374   transpose_4x4(in);
375 }
376 
vp9_fht4x4_sse2(const int16_t * input,int16_t * output,int stride,int tx_type)377 void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
378                      int stride, int tx_type) {
379   __m128i in[4];
380 
381   switch (tx_type) {
382     case DCT_DCT:
383       vp9_fdct4x4_sse2(input, output, stride);
384       break;
385     case ADST_DCT:
386       load_buffer_4x4(input, in, stride);
387       fadst4_sse2(in);
388       fdct4_sse2(in);
389       write_buffer_4x4(output, in);
390       break;
391     case DCT_ADST:
392       load_buffer_4x4(input, in, stride);
393       fdct4_sse2(in);
394       fadst4_sse2(in);
395       write_buffer_4x4(output, in);
396       break;
397     case ADST_ADST:
398       load_buffer_4x4(input, in, stride);
399       fadst4_sse2(in);
400       fadst4_sse2(in);
401       write_buffer_4x4(output, in);
402       break;
403    default:
404      assert(0);
405      break;
406   }
407 }
408 
vp9_fdct8x8_1_sse2(const int16_t * input,int16_t * output,int stride)409 void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
410   __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
411   __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
412   __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
413   __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
414   __m128i u0, u1, sum;
415 
416   u0 = _mm_add_epi16(in0, in1);
417   u1 = _mm_add_epi16(in2, in3);
418 
419   in0  = _mm_load_si128((const __m128i *)(input + 4 * stride));
420   in1  = _mm_load_si128((const __m128i *)(input + 5 * stride));
421   in2  = _mm_load_si128((const __m128i *)(input + 6 * stride));
422   in3  = _mm_load_si128((const __m128i *)(input + 7 * stride));
423 
424   sum = _mm_add_epi16(u0, u1);
425 
426   in0 = _mm_add_epi16(in0, in1);
427   in2 = _mm_add_epi16(in2, in3);
428   sum = _mm_add_epi16(sum, in0);
429 
430   u0  = _mm_setzero_si128();
431   sum = _mm_add_epi16(sum, in2);
432 
433   in0 = _mm_unpacklo_epi16(u0, sum);
434   in1 = _mm_unpackhi_epi16(u0, sum);
435   in0 = _mm_srai_epi32(in0, 16);
436   in1 = _mm_srai_epi32(in1, 16);
437 
438   sum = _mm_add_epi32(in0, in1);
439   in0 = _mm_unpacklo_epi32(sum, u0);
440   in1 = _mm_unpackhi_epi32(sum, u0);
441 
442   sum = _mm_add_epi32(in0, in1);
443   in0 = _mm_srli_si128(sum, 8);
444 
445   in1 = _mm_add_epi32(sum, in0);
446   _mm_store_si128((__m128i *)(output), in1);
447 }
448 
vp9_fdct8x8_sse2(const int16_t * input,int16_t * output,int stride)449 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
450   int pass;
451   // Constants
452   //    When we use them, in one case, they are all the same. In all others
453   //    it's a pair of them that we need to repeat four times. This is done
454   //    by constructing the 32 bit constant corresponding to that pair.
455   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
456   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
457   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
458   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
459   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
460   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
461   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
462   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
463   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
464   // Load input
465   __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
466   __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
467   __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
468   __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
469   __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
470   __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
471   __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
472   __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
473   // Pre-condition input (shift by two)
474   in0 = _mm_slli_epi16(in0, 2);
475   in1 = _mm_slli_epi16(in1, 2);
476   in2 = _mm_slli_epi16(in2, 2);
477   in3 = _mm_slli_epi16(in3, 2);
478   in4 = _mm_slli_epi16(in4, 2);
479   in5 = _mm_slli_epi16(in5, 2);
480   in6 = _mm_slli_epi16(in6, 2);
481   in7 = _mm_slli_epi16(in7, 2);
482 
483   // We do two passes, first the columns, then the rows. The results of the
484   // first pass are transposed so that the same column code can be reused. The
485   // results of the second pass are also transposed so that the rows (processed
486   // as columns) are put back in row positions.
487   for (pass = 0; pass < 2; pass++) {
488     // To store results of each pass before the transpose.
489     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
490     // Add/subtract
491     const __m128i q0 = _mm_add_epi16(in0, in7);
492     const __m128i q1 = _mm_add_epi16(in1, in6);
493     const __m128i q2 = _mm_add_epi16(in2, in5);
494     const __m128i q3 = _mm_add_epi16(in3, in4);
495     const __m128i q4 = _mm_sub_epi16(in3, in4);
496     const __m128i q5 = _mm_sub_epi16(in2, in5);
497     const __m128i q6 = _mm_sub_epi16(in1, in6);
498     const __m128i q7 = _mm_sub_epi16(in0, in7);
499     // Work on first four results
500     {
501       // Add/subtract
502       const __m128i r0 = _mm_add_epi16(q0, q3);
503       const __m128i r1 = _mm_add_epi16(q1, q2);
504       const __m128i r2 = _mm_sub_epi16(q1, q2);
505       const __m128i r3 = _mm_sub_epi16(q0, q3);
506       // Interleave to do the multiply by constants which gets us into 32bits
507       const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
508       const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
509       const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
510       const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
511       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
512       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
513       const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
514       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
515       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
516       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
517       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
518       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
519       // dct_const_round_shift
520       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
521       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
522       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
523       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
524       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
525       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
526       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
527       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
528       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
529       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
530       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
531       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
532       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
533       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
534       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
535       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
536       // Combine
537       res0 = _mm_packs_epi32(w0, w1);
538       res4 = _mm_packs_epi32(w2, w3);
539       res2 = _mm_packs_epi32(w4, w5);
540       res6 = _mm_packs_epi32(w6, w7);
541     }
542     // Work on next four results
543     {
544       // Interleave to do the multiply by constants which gets us into 32bits
545       const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
546       const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
547       const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
548       const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
549       const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
550       const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
551       // dct_const_round_shift
552       const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
553       const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
554       const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
555       const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
556       const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
557       const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
558       const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
559       const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
560       // Combine
561       const __m128i r0 = _mm_packs_epi32(s0, s1);
562       const __m128i r1 = _mm_packs_epi32(s2, s3);
563       // Add/subtract
564       const __m128i x0 = _mm_add_epi16(q4, r0);
565       const __m128i x1 = _mm_sub_epi16(q4, r0);
566       const __m128i x2 = _mm_sub_epi16(q7, r1);
567       const __m128i x3 = _mm_add_epi16(q7, r1);
568       // Interleave to do the multiply by constants which gets us into 32bits
569       const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
570       const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
571       const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
572       const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
573       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
574       const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
575       const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
576       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
577       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
578       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
579       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
580       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
581       // dct_const_round_shift
582       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
583       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
584       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
585       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
586       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
587       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
588       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
589       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
590       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
591       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
592       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
593       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
594       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
595       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
596       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
597       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
598       // Combine
599       res1 = _mm_packs_epi32(w0, w1);
600       res7 = _mm_packs_epi32(w2, w3);
601       res5 = _mm_packs_epi32(w4, w5);
602       res3 = _mm_packs_epi32(w6, w7);
603     }
604     // Transpose the 8x8.
605     {
606       // 00 01 02 03 04 05 06 07
607       // 10 11 12 13 14 15 16 17
608       // 20 21 22 23 24 25 26 27
609       // 30 31 32 33 34 35 36 37
610       // 40 41 42 43 44 45 46 47
611       // 50 51 52 53 54 55 56 57
612       // 60 61 62 63 64 65 66 67
613       // 70 71 72 73 74 75 76 77
614       const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
615       const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
616       const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
617       const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
618       const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
619       const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
620       const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
621       const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
622       // 00 10 01 11 02 12 03 13
623       // 20 30 21 31 22 32 23 33
624       // 04 14 05 15 06 16 07 17
625       // 24 34 25 35 26 36 27 37
626       // 40 50 41 51 42 52 43 53
627       // 60 70 61 71 62 72 63 73
628       // 54 54 55 55 56 56 57 57
629       // 64 74 65 75 66 76 67 77
630       const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
631       const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
632       const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
633       const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
634       const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
635       const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
636       const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
637       const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
638       // 00 10 20 30 01 11 21 31
639       // 40 50 60 70 41 51 61 71
640       // 02 12 22 32 03 13 23 33
641       // 42 52 62 72 43 53 63 73
642       // 04 14 24 34 05 15 21 36
643       // 44 54 64 74 45 55 61 76
644       // 06 16 26 36 07 17 27 37
645       // 46 56 66 76 47 57 67 77
646       in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
647       in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
648       in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
649       in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
650       in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
651       in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
652       in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
653       in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
654       // 00 10 20 30 40 50 60 70
655       // 01 11 21 31 41 51 61 71
656       // 02 12 22 32 42 52 62 72
657       // 03 13 23 33 43 53 63 73
658       // 04 14 24 34 44 54 64 74
659       // 05 15 25 35 45 55 65 75
660       // 06 16 26 36 46 56 66 76
661       // 07 17 27 37 47 57 67 77
662     }
663   }
664   // Post-condition output and store it
665   {
666     // Post-condition (division by two)
667     //    division of two 16 bits signed numbers using shifts
668     //    n / 2 = (n - (n >> 15)) >> 1
669     const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
670     const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
671     const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
672     const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
673     const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
674     const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
675     const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
676     const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
677     in0 = _mm_sub_epi16(in0, sign_in0);
678     in1 = _mm_sub_epi16(in1, sign_in1);
679     in2 = _mm_sub_epi16(in2, sign_in2);
680     in3 = _mm_sub_epi16(in3, sign_in3);
681     in4 = _mm_sub_epi16(in4, sign_in4);
682     in5 = _mm_sub_epi16(in5, sign_in5);
683     in6 = _mm_sub_epi16(in6, sign_in6);
684     in7 = _mm_sub_epi16(in7, sign_in7);
685     in0 = _mm_srai_epi16(in0, 1);
686     in1 = _mm_srai_epi16(in1, 1);
687     in2 = _mm_srai_epi16(in2, 1);
688     in3 = _mm_srai_epi16(in3, 1);
689     in4 = _mm_srai_epi16(in4, 1);
690     in5 = _mm_srai_epi16(in5, 1);
691     in6 = _mm_srai_epi16(in6, 1);
692     in7 = _mm_srai_epi16(in7, 1);
693     // store results
694     _mm_store_si128((__m128i *)(output + 0 * 8), in0);
695     _mm_store_si128((__m128i *)(output + 1 * 8), in1);
696     _mm_store_si128((__m128i *)(output + 2 * 8), in2);
697     _mm_store_si128((__m128i *)(output + 3 * 8), in3);
698     _mm_store_si128((__m128i *)(output + 4 * 8), in4);
699     _mm_store_si128((__m128i *)(output + 5 * 8), in5);
700     _mm_store_si128((__m128i *)(output + 6 * 8), in6);
701     _mm_store_si128((__m128i *)(output + 7 * 8), in7);
702   }
703 }
704 
705 // load 8x8 array
load_buffer_8x8(const int16_t * input,__m128i * in,int stride)706 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
707                                    int stride) {
708   in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
709   in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
710   in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
711   in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
712   in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
713   in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
714   in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
715   in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
716 
717   in[0] = _mm_slli_epi16(in[0], 2);
718   in[1] = _mm_slli_epi16(in[1], 2);
719   in[2] = _mm_slli_epi16(in[2], 2);
720   in[3] = _mm_slli_epi16(in[3], 2);
721   in[4] = _mm_slli_epi16(in[4], 2);
722   in[5] = _mm_slli_epi16(in[5], 2);
723   in[6] = _mm_slli_epi16(in[6], 2);
724   in[7] = _mm_slli_epi16(in[7], 2);
725 }
726 
727 // right shift and rounding
right_shift_8x8(__m128i * res,int const bit)728 static INLINE void right_shift_8x8(__m128i *res, int const bit) {
729   const __m128i kOne = _mm_set1_epi16(1);
730   const int bit_m02 = bit - 2;
731   __m128i sign0 = _mm_srai_epi16(res[0], 15);
732   __m128i sign1 = _mm_srai_epi16(res[1], 15);
733   __m128i sign2 = _mm_srai_epi16(res[2], 15);
734   __m128i sign3 = _mm_srai_epi16(res[3], 15);
735   __m128i sign4 = _mm_srai_epi16(res[4], 15);
736   __m128i sign5 = _mm_srai_epi16(res[5], 15);
737   __m128i sign6 = _mm_srai_epi16(res[6], 15);
738   __m128i sign7 = _mm_srai_epi16(res[7], 15);
739 
740   if (bit_m02 >= 0) {
741     __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
742     res[0] = _mm_add_epi16(res[0], k_const_rounding);
743     res[1] = _mm_add_epi16(res[1], k_const_rounding);
744     res[2] = _mm_add_epi16(res[2], k_const_rounding);
745     res[3] = _mm_add_epi16(res[3], k_const_rounding);
746     res[4] = _mm_add_epi16(res[4], k_const_rounding);
747     res[5] = _mm_add_epi16(res[5], k_const_rounding);
748     res[6] = _mm_add_epi16(res[6], k_const_rounding);
749     res[7] = _mm_add_epi16(res[7], k_const_rounding);
750   }
751 
752   res[0] = _mm_sub_epi16(res[0], sign0);
753   res[1] = _mm_sub_epi16(res[1], sign1);
754   res[2] = _mm_sub_epi16(res[2], sign2);
755   res[3] = _mm_sub_epi16(res[3], sign3);
756   res[4] = _mm_sub_epi16(res[4], sign4);
757   res[5] = _mm_sub_epi16(res[5], sign5);
758   res[6] = _mm_sub_epi16(res[6], sign6);
759   res[7] = _mm_sub_epi16(res[7], sign7);
760 
761   res[0] = _mm_srai_epi16(res[0], bit);
762   res[1] = _mm_srai_epi16(res[1], bit);
763   res[2] = _mm_srai_epi16(res[2], bit);
764   res[3] = _mm_srai_epi16(res[3], bit);
765   res[4] = _mm_srai_epi16(res[4], bit);
766   res[5] = _mm_srai_epi16(res[5], bit);
767   res[6] = _mm_srai_epi16(res[6], bit);
768   res[7] = _mm_srai_epi16(res[7], bit);
769 }
770 
771 // write 8x8 array
write_buffer_8x8(int16_t * output,__m128i * res,int stride)772 static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
773   _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
774   _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
775   _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
776   _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
777   _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
778   _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
779   _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
780   _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
781 }
782 
783 // perform in-place transpose
array_transpose_8x8(__m128i * in,__m128i * res)784 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
785   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
786   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
787   const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
788   const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
789   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
790   const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
791   const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
792   const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
793   // 00 10 01 11 02 12 03 13
794   // 20 30 21 31 22 32 23 33
795   // 04 14 05 15 06 16 07 17
796   // 24 34 25 35 26 36 27 37
797   // 40 50 41 51 42 52 43 53
798   // 60 70 61 71 62 72 63 73
799   // 44 54 45 55 46 56 47 57
800   // 64 74 65 75 66 76 67 77
801   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
802   const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
803   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
804   const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
805   const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
806   const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
807   const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
808   const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
809   // 00 10 20 30 01 11 21 31
810   // 40 50 60 70 41 51 61 71
811   // 02 12 22 32 03 13 23 33
812   // 42 52 62 72 43 53 63 73
813   // 04 14 24 34 05 15 25 35
814   // 44 54 64 74 45 55 65 75
815   // 06 16 26 36 07 17 27 37
816   // 46 56 66 76 47 57 67 77
817   res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
818   res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
819   res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
820   res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
821   res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
822   res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
823   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
824   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
825   // 00 10 20 30 40 50 60 70
826   // 01 11 21 31 41 51 61 71
827   // 02 12 22 32 42 52 62 72
828   // 03 13 23 33 43 53 63 73
829   // 04 14 24 34 44 54 64 74
830   // 05 15 25 35 45 55 65 75
831   // 06 16 26 36 46 56 66 76
832   // 07 17 27 37 47 57 67 77
833 }
834 
fdct8_sse2(__m128i * in)835 void fdct8_sse2(__m128i *in) {
836   // constants
837   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
838   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
839   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
840   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
841   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
842   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
843   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
844   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
845   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
846   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
847   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
848   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
849 
850   // stage 1
851   s0 = _mm_add_epi16(in[0], in[7]);
852   s1 = _mm_add_epi16(in[1], in[6]);
853   s2 = _mm_add_epi16(in[2], in[5]);
854   s3 = _mm_add_epi16(in[3], in[4]);
855   s4 = _mm_sub_epi16(in[3], in[4]);
856   s5 = _mm_sub_epi16(in[2], in[5]);
857   s6 = _mm_sub_epi16(in[1], in[6]);
858   s7 = _mm_sub_epi16(in[0], in[7]);
859 
860   u0 = _mm_add_epi16(s0, s3);
861   u1 = _mm_add_epi16(s1, s2);
862   u2 = _mm_sub_epi16(s1, s2);
863   u3 = _mm_sub_epi16(s0, s3);
864   // interleave and perform butterfly multiplication/addition
865   v0 = _mm_unpacklo_epi16(u0, u1);
866   v1 = _mm_unpackhi_epi16(u0, u1);
867   v2 = _mm_unpacklo_epi16(u2, u3);
868   v3 = _mm_unpackhi_epi16(u2, u3);
869 
870   u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
871   u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
872   u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
873   u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
874   u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
875   u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
876   u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
877   u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
878 
879   // shift and rounding
880   v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
881   v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
882   v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
883   v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
884   v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
885   v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
886   v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
887   v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
888 
889   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
890   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
891   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
892   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
893   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
894   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
895   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
896   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
897 
898   in[0] = _mm_packs_epi32(u0, u1);
899   in[2] = _mm_packs_epi32(u4, u5);
900   in[4] = _mm_packs_epi32(u2, u3);
901   in[6] = _mm_packs_epi32(u6, u7);
902 
903   // stage 2
904   // interleave and perform butterfly multiplication/addition
905   u0 = _mm_unpacklo_epi16(s6, s5);
906   u1 = _mm_unpackhi_epi16(s6, s5);
907   v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
908   v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
909   v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
910   v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
911 
912   // shift and rounding
913   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
914   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
915   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
916   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
917 
918   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
919   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
920   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
921   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
922 
923   u0 = _mm_packs_epi32(v0, v1);
924   u1 = _mm_packs_epi32(v2, v3);
925 
926   // stage 3
927   s0 = _mm_add_epi16(s4, u0);
928   s1 = _mm_sub_epi16(s4, u0);
929   s2 = _mm_sub_epi16(s7, u1);
930   s3 = _mm_add_epi16(s7, u1);
931 
932   // stage 4
933   u0 = _mm_unpacklo_epi16(s0, s3);
934   u1 = _mm_unpackhi_epi16(s0, s3);
935   u2 = _mm_unpacklo_epi16(s1, s2);
936   u3 = _mm_unpackhi_epi16(s1, s2);
937 
938   v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
939   v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
940   v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
941   v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
942   v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
943   v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
944   v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
945   v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
946 
947   // shift and rounding
948   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
949   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
950   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
951   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
952   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
953   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
954   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
955   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
956 
957   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
958   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
959   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
960   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
961   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
962   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
963   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
964   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
965 
966   in[1] = _mm_packs_epi32(v0, v1);
967   in[3] = _mm_packs_epi32(v4, v5);
968   in[5] = _mm_packs_epi32(v2, v3);
969   in[7] = _mm_packs_epi32(v6, v7);
970 
971   // transpose
972   array_transpose_8x8(in, in);
973 }
974 
fadst8_sse2(__m128i * in)975 void fadst8_sse2(__m128i *in) {
976   // Constants
977   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
978   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
979   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
980   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
981   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
982   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
983   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
984   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
985   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
986   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
987   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
988   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
989   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
990   const __m128i k__const_0 = _mm_set1_epi16(0);
991   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
992 
993   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
994   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
995   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
996   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
997   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
998 
999   // properly aligned for butterfly input
1000   in0  = in[7];
1001   in1  = in[0];
1002   in2  = in[5];
1003   in3  = in[2];
1004   in4  = in[3];
1005   in5  = in[4];
1006   in6  = in[1];
1007   in7  = in[6];
1008 
1009   // column transformation
1010   // stage 1
1011   // interleave and multiply/add into 32-bit integer
1012   s0 = _mm_unpacklo_epi16(in0, in1);
1013   s1 = _mm_unpackhi_epi16(in0, in1);
1014   s2 = _mm_unpacklo_epi16(in2, in3);
1015   s3 = _mm_unpackhi_epi16(in2, in3);
1016   s4 = _mm_unpacklo_epi16(in4, in5);
1017   s5 = _mm_unpackhi_epi16(in4, in5);
1018   s6 = _mm_unpacklo_epi16(in6, in7);
1019   s7 = _mm_unpackhi_epi16(in6, in7);
1020 
1021   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
1022   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
1023   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
1024   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
1025   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
1026   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
1027   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
1028   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
1029   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
1030   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
1031   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
1032   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
1033   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
1034   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
1035   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
1036   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
1037 
1038   // addition
1039   w0 = _mm_add_epi32(u0, u8);
1040   w1 = _mm_add_epi32(u1, u9);
1041   w2 = _mm_add_epi32(u2, u10);
1042   w3 = _mm_add_epi32(u3, u11);
1043   w4 = _mm_add_epi32(u4, u12);
1044   w5 = _mm_add_epi32(u5, u13);
1045   w6 = _mm_add_epi32(u6, u14);
1046   w7 = _mm_add_epi32(u7, u15);
1047   w8 = _mm_sub_epi32(u0, u8);
1048   w9 = _mm_sub_epi32(u1, u9);
1049   w10 = _mm_sub_epi32(u2, u10);
1050   w11 = _mm_sub_epi32(u3, u11);
1051   w12 = _mm_sub_epi32(u4, u12);
1052   w13 = _mm_sub_epi32(u5, u13);
1053   w14 = _mm_sub_epi32(u6, u14);
1054   w15 = _mm_sub_epi32(u7, u15);
1055 
1056   // shift and rounding
1057   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
1058   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
1059   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
1060   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
1061   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
1062   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
1063   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
1064   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
1065   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
1066   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
1067   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
1068   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
1069   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
1070   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
1071   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
1072   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
1073 
1074   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1075   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1076   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1077   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1078   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1079   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1080   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1081   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1082   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
1083   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
1084   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
1085   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
1086   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
1087   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
1088   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
1089   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
1090 
1091   // back to 16-bit and pack 8 integers into __m128i
1092   in[0] = _mm_packs_epi32(u0, u1);
1093   in[1] = _mm_packs_epi32(u2, u3);
1094   in[2] = _mm_packs_epi32(u4, u5);
1095   in[3] = _mm_packs_epi32(u6, u7);
1096   in[4] = _mm_packs_epi32(u8, u9);
1097   in[5] = _mm_packs_epi32(u10, u11);
1098   in[6] = _mm_packs_epi32(u12, u13);
1099   in[7] = _mm_packs_epi32(u14, u15);
1100 
1101   // stage 2
1102   s0 = _mm_add_epi16(in[0], in[2]);
1103   s1 = _mm_add_epi16(in[1], in[3]);
1104   s2 = _mm_sub_epi16(in[0], in[2]);
1105   s3 = _mm_sub_epi16(in[1], in[3]);
1106   u0 = _mm_unpacklo_epi16(in[4], in[5]);
1107   u1 = _mm_unpackhi_epi16(in[4], in[5]);
1108   u2 = _mm_unpacklo_epi16(in[6], in[7]);
1109   u3 = _mm_unpackhi_epi16(in[6], in[7]);
1110 
1111   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
1112   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
1113   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
1114   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
1115   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
1116   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
1117   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
1118   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
1119 
1120   w0 = _mm_add_epi32(v0, v4);
1121   w1 = _mm_add_epi32(v1, v5);
1122   w2 = _mm_add_epi32(v2, v6);
1123   w3 = _mm_add_epi32(v3, v7);
1124   w4 = _mm_sub_epi32(v0, v4);
1125   w5 = _mm_sub_epi32(v1, v5);
1126   w6 = _mm_sub_epi32(v2, v6);
1127   w7 = _mm_sub_epi32(v3, v7);
1128 
1129   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
1130   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
1131   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
1132   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
1133   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
1134   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
1135   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
1136   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
1137 
1138   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1139   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1140   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1141   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1142   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1143   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1144   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1145   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1146 
1147   // back to 16-bit intergers
1148   s4 = _mm_packs_epi32(u0, u1);
1149   s5 = _mm_packs_epi32(u2, u3);
1150   s6 = _mm_packs_epi32(u4, u5);
1151   s7 = _mm_packs_epi32(u6, u7);
1152 
1153   // stage 3
1154   u0 = _mm_unpacklo_epi16(s2, s3);
1155   u1 = _mm_unpackhi_epi16(s2, s3);
1156   u2 = _mm_unpacklo_epi16(s6, s7);
1157   u3 = _mm_unpackhi_epi16(s6, s7);
1158 
1159   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
1160   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
1161   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
1162   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
1163   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
1164   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
1165   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
1166   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
1167 
1168   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
1169   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
1170   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
1171   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
1172   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
1173   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
1174   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
1175   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
1176 
1177   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
1178   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
1179   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
1180   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
1181   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
1182   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
1183   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
1184   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
1185 
1186   s2 = _mm_packs_epi32(v0, v1);
1187   s3 = _mm_packs_epi32(v2, v3);
1188   s6 = _mm_packs_epi32(v4, v5);
1189   s7 = _mm_packs_epi32(v6, v7);
1190 
1191   // FIXME(jingning): do subtract using bit inversion?
1192   in[0] = s0;
1193   in[1] = _mm_sub_epi16(k__const_0, s4);
1194   in[2] = s6;
1195   in[3] = _mm_sub_epi16(k__const_0, s2);
1196   in[4] = s3;
1197   in[5] = _mm_sub_epi16(k__const_0, s7);
1198   in[6] = s5;
1199   in[7] = _mm_sub_epi16(k__const_0, s1);
1200 
1201   // transpose
1202   array_transpose_8x8(in, in);
1203 }
1204 
vp9_fht8x8_sse2(const int16_t * input,int16_t * output,int stride,int tx_type)1205 void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
1206                      int stride, int tx_type) {
1207   __m128i in[8];
1208 
1209   switch (tx_type) {
1210     case DCT_DCT:
1211       vp9_fdct8x8_sse2(input, output, stride);
1212       break;
1213     case ADST_DCT:
1214       load_buffer_8x8(input, in, stride);
1215       fadst8_sse2(in);
1216       fdct8_sse2(in);
1217       right_shift_8x8(in, 1);
1218       write_buffer_8x8(output, in, 8);
1219       break;
1220     case DCT_ADST:
1221       load_buffer_8x8(input, in, stride);
1222       fdct8_sse2(in);
1223       fadst8_sse2(in);
1224       right_shift_8x8(in, 1);
1225       write_buffer_8x8(output, in, 8);
1226       break;
1227     case ADST_ADST:
1228       load_buffer_8x8(input, in, stride);
1229       fadst8_sse2(in);
1230       fadst8_sse2(in);
1231       right_shift_8x8(in, 1);
1232       write_buffer_8x8(output, in, 8);
1233       break;
1234     default:
1235       assert(0);
1236       break;
1237   }
1238 }
1239 
vp9_fdct16x16_1_sse2(const int16_t * input,int16_t * output,int stride)1240 void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
1241   __m128i in0, in1, in2, in3;
1242   __m128i u0, u1;
1243   __m128i sum = _mm_setzero_si128();
1244   int i;
1245 
1246   for (i = 0; i < 2; ++i) {
1247     input += 8 * i;
1248     in0  = _mm_load_si128((const __m128i *)(input +  0 * stride));
1249     in1  = _mm_load_si128((const __m128i *)(input +  1 * stride));
1250     in2  = _mm_load_si128((const __m128i *)(input +  2 * stride));
1251     in3  = _mm_load_si128((const __m128i *)(input +  3 * stride));
1252 
1253     u0 = _mm_add_epi16(in0, in1);
1254     u1 = _mm_add_epi16(in2, in3);
1255     sum = _mm_add_epi16(sum, u0);
1256 
1257     in0  = _mm_load_si128((const __m128i *)(input +  4 * stride));
1258     in1  = _mm_load_si128((const __m128i *)(input +  5 * stride));
1259     in2  = _mm_load_si128((const __m128i *)(input +  6 * stride));
1260     in3  = _mm_load_si128((const __m128i *)(input +  7 * stride));
1261 
1262     sum = _mm_add_epi16(sum, u1);
1263     u0  = _mm_add_epi16(in0, in1);
1264     u1  = _mm_add_epi16(in2, in3);
1265     sum = _mm_add_epi16(sum, u0);
1266 
1267     in0  = _mm_load_si128((const __m128i *)(input +  8 * stride));
1268     in1  = _mm_load_si128((const __m128i *)(input +  9 * stride));
1269     in2  = _mm_load_si128((const __m128i *)(input + 10 * stride));
1270     in3  = _mm_load_si128((const __m128i *)(input + 11 * stride));
1271 
1272     sum = _mm_add_epi16(sum, u1);
1273     u0  = _mm_add_epi16(in0, in1);
1274     u1  = _mm_add_epi16(in2, in3);
1275     sum = _mm_add_epi16(sum, u0);
1276 
1277     in0  = _mm_load_si128((const __m128i *)(input + 12 * stride));
1278     in1  = _mm_load_si128((const __m128i *)(input + 13 * stride));
1279     in2  = _mm_load_si128((const __m128i *)(input + 14 * stride));
1280     in3  = _mm_load_si128((const __m128i *)(input + 15 * stride));
1281 
1282     sum = _mm_add_epi16(sum, u1);
1283     u0  = _mm_add_epi16(in0, in1);
1284     u1  = _mm_add_epi16(in2, in3);
1285     sum = _mm_add_epi16(sum, u0);
1286 
1287     sum = _mm_add_epi16(sum, u1);
1288   }
1289 
1290   u0  = _mm_setzero_si128();
1291   in0 = _mm_unpacklo_epi16(u0, sum);
1292   in1 = _mm_unpackhi_epi16(u0, sum);
1293   in0 = _mm_srai_epi32(in0, 16);
1294   in1 = _mm_srai_epi32(in1, 16);
1295 
1296   sum = _mm_add_epi32(in0, in1);
1297   in0 = _mm_unpacklo_epi32(sum, u0);
1298   in1 = _mm_unpackhi_epi32(sum, u0);
1299 
1300   sum = _mm_add_epi32(in0, in1);
1301   in0 = _mm_srli_si128(sum, 8);
1302 
1303   in1 = _mm_add_epi32(sum, in0);
1304   in1 = _mm_srai_epi32(in1, 1);
1305   _mm_store_si128((__m128i *)(output), in1);
1306 }
1307 
vp9_fdct16x16_sse2(const int16_t * input,int16_t * output,int stride)1308 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
1309   // The 2D transform is done with two passes which are actually pretty
1310   // similar. In the first one, we transform the columns and transpose
1311   // the results. In the second one, we transform the rows. To achieve that,
1312   // as the first pass results are transposed, we transpose the columns (that
1313   // is the transposed rows) and transpose the results (so that it goes back
1314   // in normal/row positions).
1315   int pass;
1316   // We need an intermediate buffer between passes.
1317   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
1318   const int16_t *in = input;
1319   int16_t *out = intermediate;
1320   // Constants
1321   //    When we use them, in one case, they are all the same. In all others
1322   //    it's a pair of them that we need to repeat four times. This is done
1323   //    by constructing the 32 bit constant corresponding to that pair.
1324   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1325   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1326   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1327   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
1328   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1329   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
1330   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
1331   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
1332   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1333   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
1334   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
1335   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
1336   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
1337   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
1338   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
1339   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
1340   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
1341   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1342   const __m128i kOne = _mm_set1_epi16(1);
1343   // Do the two transform/transpose passes
1344   for (pass = 0; pass < 2; ++pass) {
1345     // We process eight columns (transposed rows in second pass) at a time.
1346     int column_start;
1347     for (column_start = 0; column_start < 16; column_start += 8) {
1348       __m128i in00, in01, in02, in03, in04, in05, in06, in07;
1349       __m128i in08, in09, in10, in11, in12, in13, in14, in15;
1350       __m128i input0, input1, input2, input3, input4, input5, input6, input7;
1351       __m128i step1_0, step1_1, step1_2, step1_3;
1352       __m128i step1_4, step1_5, step1_6, step1_7;
1353       __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
1354       __m128i step3_0, step3_1, step3_2, step3_3;
1355       __m128i step3_4, step3_5, step3_6, step3_7;
1356       __m128i res00, res01, res02, res03, res04, res05, res06, res07;
1357       __m128i res08, res09, res10, res11, res12, res13, res14, res15;
1358       // Load and pre-condition input.
1359       if (0 == pass) {
1360         in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
1361         in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
1362         in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
1363         in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
1364         in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
1365         in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
1366         in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
1367         in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
1368         in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
1369         in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
1370         in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
1371         in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
1372         in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
1373         in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
1374         in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
1375         in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
1376         // x = x << 2
1377         in00 = _mm_slli_epi16(in00, 2);
1378         in01 = _mm_slli_epi16(in01, 2);
1379         in02 = _mm_slli_epi16(in02, 2);
1380         in03 = _mm_slli_epi16(in03, 2);
1381         in04 = _mm_slli_epi16(in04, 2);
1382         in05 = _mm_slli_epi16(in05, 2);
1383         in06 = _mm_slli_epi16(in06, 2);
1384         in07 = _mm_slli_epi16(in07, 2);
1385         in08 = _mm_slli_epi16(in08, 2);
1386         in09 = _mm_slli_epi16(in09, 2);
1387         in10 = _mm_slli_epi16(in10, 2);
1388         in11 = _mm_slli_epi16(in11, 2);
1389         in12 = _mm_slli_epi16(in12, 2);
1390         in13 = _mm_slli_epi16(in13, 2);
1391         in14 = _mm_slli_epi16(in14, 2);
1392         in15 = _mm_slli_epi16(in15, 2);
1393       } else {
1394         in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
1395         in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
1396         in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
1397         in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
1398         in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
1399         in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
1400         in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
1401         in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
1402         in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
1403         in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
1404         in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
1405         in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
1406         in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
1407         in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
1408         in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
1409         in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
1410         // x = (x + 1) >> 2
1411         in00 = _mm_add_epi16(in00, kOne);
1412         in01 = _mm_add_epi16(in01, kOne);
1413         in02 = _mm_add_epi16(in02, kOne);
1414         in03 = _mm_add_epi16(in03, kOne);
1415         in04 = _mm_add_epi16(in04, kOne);
1416         in05 = _mm_add_epi16(in05, kOne);
1417         in06 = _mm_add_epi16(in06, kOne);
1418         in07 = _mm_add_epi16(in07, kOne);
1419         in08 = _mm_add_epi16(in08, kOne);
1420         in09 = _mm_add_epi16(in09, kOne);
1421         in10 = _mm_add_epi16(in10, kOne);
1422         in11 = _mm_add_epi16(in11, kOne);
1423         in12 = _mm_add_epi16(in12, kOne);
1424         in13 = _mm_add_epi16(in13, kOne);
1425         in14 = _mm_add_epi16(in14, kOne);
1426         in15 = _mm_add_epi16(in15, kOne);
1427         in00 = _mm_srai_epi16(in00, 2);
1428         in01 = _mm_srai_epi16(in01, 2);
1429         in02 = _mm_srai_epi16(in02, 2);
1430         in03 = _mm_srai_epi16(in03, 2);
1431         in04 = _mm_srai_epi16(in04, 2);
1432         in05 = _mm_srai_epi16(in05, 2);
1433         in06 = _mm_srai_epi16(in06, 2);
1434         in07 = _mm_srai_epi16(in07, 2);
1435         in08 = _mm_srai_epi16(in08, 2);
1436         in09 = _mm_srai_epi16(in09, 2);
1437         in10 = _mm_srai_epi16(in10, 2);
1438         in11 = _mm_srai_epi16(in11, 2);
1439         in12 = _mm_srai_epi16(in12, 2);
1440         in13 = _mm_srai_epi16(in13, 2);
1441         in14 = _mm_srai_epi16(in14, 2);
1442         in15 = _mm_srai_epi16(in15, 2);
1443       }
1444       in += 8;
1445       // Calculate input for the first 8 results.
1446       {
1447         input0 = _mm_add_epi16(in00, in15);
1448         input1 = _mm_add_epi16(in01, in14);
1449         input2 = _mm_add_epi16(in02, in13);
1450         input3 = _mm_add_epi16(in03, in12);
1451         input4 = _mm_add_epi16(in04, in11);
1452         input5 = _mm_add_epi16(in05, in10);
1453         input6 = _mm_add_epi16(in06, in09);
1454         input7 = _mm_add_epi16(in07, in08);
1455       }
1456       // Calculate input for the next 8 results.
1457       {
1458         step1_0 = _mm_sub_epi16(in07, in08);
1459         step1_1 = _mm_sub_epi16(in06, in09);
1460         step1_2 = _mm_sub_epi16(in05, in10);
1461         step1_3 = _mm_sub_epi16(in04, in11);
1462         step1_4 = _mm_sub_epi16(in03, in12);
1463         step1_5 = _mm_sub_epi16(in02, in13);
1464         step1_6 = _mm_sub_epi16(in01, in14);
1465         step1_7 = _mm_sub_epi16(in00, in15);
1466       }
1467       // Work on the first eight values; fdct8(input, even_results);
1468       {
1469         // Add/subtract
1470         const __m128i q0 = _mm_add_epi16(input0, input7);
1471         const __m128i q1 = _mm_add_epi16(input1, input6);
1472         const __m128i q2 = _mm_add_epi16(input2, input5);
1473         const __m128i q3 = _mm_add_epi16(input3, input4);
1474         const __m128i q4 = _mm_sub_epi16(input3, input4);
1475         const __m128i q5 = _mm_sub_epi16(input2, input5);
1476         const __m128i q6 = _mm_sub_epi16(input1, input6);
1477         const __m128i q7 = _mm_sub_epi16(input0, input7);
1478         // Work on first four results
1479         {
1480           // Add/subtract
1481           const __m128i r0 = _mm_add_epi16(q0, q3);
1482           const __m128i r1 = _mm_add_epi16(q1, q2);
1483           const __m128i r2 = _mm_sub_epi16(q1, q2);
1484           const __m128i r3 = _mm_sub_epi16(q0, q3);
1485           // Interleave to do the multiply by constants which gets us
1486           // into 32 bits.
1487           const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
1488           const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
1489           const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
1490           const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
1491           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
1492           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
1493           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
1494           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
1495           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
1496           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
1497           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
1498           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
1499           // dct_const_round_shift
1500           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1501           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1502           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1503           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1504           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
1505           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
1506           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
1507           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
1508           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1509           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1510           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1511           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1512           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1513           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1514           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1515           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1516           // Combine
1517           res00 = _mm_packs_epi32(w0, w1);
1518           res08 = _mm_packs_epi32(w2, w3);
1519           res04 = _mm_packs_epi32(w4, w5);
1520           res12 = _mm_packs_epi32(w6, w7);
1521         }
1522         // Work on next four results
1523         {
1524           // Interleave to do the multiply by constants which gets us
1525           // into 32 bits.
1526           const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
1527           const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
1528           const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
1529           const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
1530           const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
1531           const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
1532           // dct_const_round_shift
1533           const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
1534           const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
1535           const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
1536           const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
1537           const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
1538           const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
1539           const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
1540           const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
1541           // Combine
1542           const __m128i r0 = _mm_packs_epi32(s0, s1);
1543           const __m128i r1 = _mm_packs_epi32(s2, s3);
1544           // Add/subtract
1545           const __m128i x0 = _mm_add_epi16(q4, r0);
1546           const __m128i x1 = _mm_sub_epi16(q4, r0);
1547           const __m128i x2 = _mm_sub_epi16(q7, r1);
1548           const __m128i x3 = _mm_add_epi16(q7, r1);
1549           // Interleave to do the multiply by constants which gets us
1550           // into 32 bits.
1551           const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
1552           const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
1553           const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
1554           const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
1555           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
1556           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
1557           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
1558           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
1559           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
1560           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
1561           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
1562           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
1563           // dct_const_round_shift
1564           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1565           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1566           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1567           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1568           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
1569           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
1570           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
1571           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
1572           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1573           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1574           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1575           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1576           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1577           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1578           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1579           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1580           // Combine
1581           res02 = _mm_packs_epi32(w0, w1);
1582           res14 = _mm_packs_epi32(w2, w3);
1583           res10 = _mm_packs_epi32(w4, w5);
1584           res06 = _mm_packs_epi32(w6, w7);
1585         }
1586       }
1587       // Work on the next eight values; step1 -> odd_results
1588       {
1589         // step 2
1590         {
1591           const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
1592           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
1593           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
1594           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
1595           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
1596           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
1597           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
1598           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
1599           // dct_const_round_shift
1600           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1601           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1602           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1603           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1604           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1605           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1606           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1607           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1608           // Combine
1609           step2_2 = _mm_packs_epi32(w0, w1);
1610           step2_3 = _mm_packs_epi32(w2, w3);
1611         }
1612         {
1613           const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
1614           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
1615           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
1616           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
1617           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
1618           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
1619           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
1620           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
1621           // dct_const_round_shift
1622           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1623           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1624           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1625           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1626           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1627           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1628           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1629           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1630           // Combine
1631           step2_5 = _mm_packs_epi32(w0, w1);
1632           step2_4 = _mm_packs_epi32(w2, w3);
1633         }
1634         // step 3
1635         {
1636           step3_0 = _mm_add_epi16(step1_0, step2_3);
1637           step3_1 = _mm_add_epi16(step1_1, step2_2);
1638           step3_2 = _mm_sub_epi16(step1_1, step2_2);
1639           step3_3 = _mm_sub_epi16(step1_0, step2_3);
1640           step3_4 = _mm_sub_epi16(step1_7, step2_4);
1641           step3_5 = _mm_sub_epi16(step1_6, step2_5);
1642           step3_6 = _mm_add_epi16(step1_6, step2_5);
1643           step3_7 = _mm_add_epi16(step1_7, step2_4);
1644         }
1645         // step 4
1646         {
1647           const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
1648           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
1649           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
1650           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
1651           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
1652           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
1653           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
1654           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
1655           // dct_const_round_shift
1656           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1657           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1658           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1659           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1660           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1661           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1662           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1663           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1664           // Combine
1665           step2_1 = _mm_packs_epi32(w0, w1);
1666           step2_2 = _mm_packs_epi32(w2, w3);
1667         }
1668         {
1669           const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
1670           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
1671           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
1672           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
1673           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
1674           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
1675           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
1676           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
1677           // dct_const_round_shift
1678           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1679           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1680           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1681           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1682           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1683           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1684           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1685           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1686           // Combine
1687           step2_6 = _mm_packs_epi32(w0, w1);
1688           step2_5 = _mm_packs_epi32(w2, w3);
1689         }
1690         // step 5
1691         {
1692           step1_0 = _mm_add_epi16(step3_0, step2_1);
1693           step1_1 = _mm_sub_epi16(step3_0, step2_1);
1694           step1_2 = _mm_add_epi16(step3_3, step2_2);
1695           step1_3 = _mm_sub_epi16(step3_3, step2_2);
1696           step1_4 = _mm_sub_epi16(step3_4, step2_5);
1697           step1_5 = _mm_add_epi16(step3_4, step2_5);
1698           step1_6 = _mm_sub_epi16(step3_7, step2_6);
1699           step1_7 = _mm_add_epi16(step3_7, step2_6);
1700         }
1701         // step 6
1702         {
1703           const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
1704           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
1705           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
1706           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
1707           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
1708           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
1709           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
1710           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
1711           // dct_const_round_shift
1712           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1713           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1714           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1715           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1716           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1717           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1718           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1719           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1720           // Combine
1721           res01 = _mm_packs_epi32(w0, w1);
1722           res09 = _mm_packs_epi32(w2, w3);
1723         }
1724         {
1725           const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
1726           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
1727           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
1728           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
1729           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
1730           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
1731           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
1732           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
1733           // dct_const_round_shift
1734           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1735           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1736           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1737           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1738           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1739           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1740           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1741           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1742           // Combine
1743           res05 = _mm_packs_epi32(w0, w1);
1744           res13 = _mm_packs_epi32(w2, w3);
1745         }
1746         {
1747           const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
1748           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
1749           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
1750           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
1751           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
1752           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
1753           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
1754           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
1755           // dct_const_round_shift
1756           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1757           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1758           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1759           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1760           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1761           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1762           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1763           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1764           // Combine
1765           res11 = _mm_packs_epi32(w0, w1);
1766           res03 = _mm_packs_epi32(w2, w3);
1767         }
1768         {
1769           const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
1770           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
1771           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
1772           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
1773           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
1774           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
1775           const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
1776           const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
1777           // dct_const_round_shift
1778           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1779           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1780           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1781           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1782           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1783           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1784           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1785           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1786           // Combine
1787           res15 = _mm_packs_epi32(w0, w1);
1788           res07 = _mm_packs_epi32(w2, w3);
1789         }
1790       }
1791       // Transpose the results, do it as two 8x8 transposes.
1792       {
1793         // 00 01 02 03 04 05 06 07
1794         // 10 11 12 13 14 15 16 17
1795         // 20 21 22 23 24 25 26 27
1796         // 30 31 32 33 34 35 36 37
1797         // 40 41 42 43 44 45 46 47
1798         // 50 51 52 53 54 55 56 57
1799         // 60 61 62 63 64 65 66 67
1800         // 70 71 72 73 74 75 76 77
1801         const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
1802         const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
1803         const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
1804         const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
1805         const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
1806         const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
1807         const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
1808         const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
1809         // 00 10 01 11 02 12 03 13
1810         // 20 30 21 31 22 32 23 33
1811         // 04 14 05 15 06 16 07 17
1812         // 24 34 25 35 26 36 27 37
1813         // 40 50 41 51 42 52 43 53
1814         // 60 70 61 71 62 72 63 73
1815         // 54 54 55 55 56 56 57 57
1816         // 64 74 65 75 66 76 67 77
1817         const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
1818         const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
1819         const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
1820         const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
1821         const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
1822         const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
1823         const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
1824         const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
1825         // 00 10 20 30 01 11 21 31
1826         // 40 50 60 70 41 51 61 71
1827         // 02 12 22 32 03 13 23 33
1828         // 42 52 62 72 43 53 63 73
1829         // 04 14 24 34 05 15 21 36
1830         // 44 54 64 74 45 55 61 76
1831         // 06 16 26 36 07 17 27 37
1832         // 46 56 66 76 47 57 67 77
1833         const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
1834         const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
1835         const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
1836         const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
1837         const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
1838         const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
1839         const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
1840         const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
1841         // 00 10 20 30 40 50 60 70
1842         // 01 11 21 31 41 51 61 71
1843         // 02 12 22 32 42 52 62 72
1844         // 03 13 23 33 43 53 63 73
1845         // 04 14 24 34 44 54 64 74
1846         // 05 15 25 35 45 55 65 75
1847         // 06 16 26 36 46 56 66 76
1848         // 07 17 27 37 47 57 67 77
1849         _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
1850         _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
1851         _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
1852         _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
1853         _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
1854         _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
1855         _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
1856         _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
1857       }
1858       {
1859         // 00 01 02 03 04 05 06 07
1860         // 10 11 12 13 14 15 16 17
1861         // 20 21 22 23 24 25 26 27
1862         // 30 31 32 33 34 35 36 37
1863         // 40 41 42 43 44 45 46 47
1864         // 50 51 52 53 54 55 56 57
1865         // 60 61 62 63 64 65 66 67
1866         // 70 71 72 73 74 75 76 77
1867         const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
1868         const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
1869         const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
1870         const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
1871         const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
1872         const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
1873         const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
1874         const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
1875         // 00 10 01 11 02 12 03 13
1876         // 20 30 21 31 22 32 23 33
1877         // 04 14 05 15 06 16 07 17
1878         // 24 34 25 35 26 36 27 37
1879         // 40 50 41 51 42 52 43 53
1880         // 60 70 61 71 62 72 63 73
1881         // 54 54 55 55 56 56 57 57
1882         // 64 74 65 75 66 76 67 77
1883         const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
1884         const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
1885         const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
1886         const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
1887         const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
1888         const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
1889         const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
1890         const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
1891         // 00 10 20 30 01 11 21 31
1892         // 40 50 60 70 41 51 61 71
1893         // 02 12 22 32 03 13 23 33
1894         // 42 52 62 72 43 53 63 73
1895         // 04 14 24 34 05 15 21 36
1896         // 44 54 64 74 45 55 61 76
1897         // 06 16 26 36 07 17 27 37
1898         // 46 56 66 76 47 57 67 77
1899         const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
1900         const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
1901         const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
1902         const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
1903         const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
1904         const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
1905         const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
1906         const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
1907         // 00 10 20 30 40 50 60 70
1908         // 01 11 21 31 41 51 61 71
1909         // 02 12 22 32 42 52 62 72
1910         // 03 13 23 33 43 53 63 73
1911         // 04 14 24 34 44 54 64 74
1912         // 05 15 25 35 45 55 65 75
1913         // 06 16 26 36 46 56 66 76
1914         // 07 17 27 37 47 57 67 77
1915         // Store results
1916         _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
1917         _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
1918         _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
1919         _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
1920         _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
1921         _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
1922         _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
1923         _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
1924       }
1925       out += 8*16;
1926     }
1927     // Setup in/out for next pass.
1928     in = intermediate;
1929     out = output;
1930   }
1931 }
1932 
load_buffer_16x16(const int16_t * input,__m128i * in0,__m128i * in1,int stride)1933 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
1934                                      __m128i *in1, int stride) {
1935   // load first 8 columns
1936   load_buffer_8x8(input, in0, stride);
1937   load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
1938 
1939   input += 8;
1940   // load second 8 columns
1941   load_buffer_8x8(input, in1, stride);
1942   load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
1943 }
1944 
write_buffer_16x16(int16_t * output,__m128i * in0,__m128i * in1,int stride)1945 static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
1946                                       __m128i *in1, int stride) {
1947   // write first 8 columns
1948   write_buffer_8x8(output, in0, stride);
1949   write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
1950   // write second 8 columns
1951   output += 8;
1952   write_buffer_8x8(output, in1, stride);
1953   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
1954 }
1955 
array_transpose_16x16(__m128i * res0,__m128i * res1)1956 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
1957   __m128i tbuf[8];
1958   array_transpose_8x8(res0, res0);
1959   array_transpose_8x8(res1, tbuf);
1960   array_transpose_8x8(res0 + 8, res1);
1961   array_transpose_8x8(res1 + 8, res1 + 8);
1962 
1963   res0[8] = tbuf[0];
1964   res0[9] = tbuf[1];
1965   res0[10] = tbuf[2];
1966   res0[11] = tbuf[3];
1967   res0[12] = tbuf[4];
1968   res0[13] = tbuf[5];
1969   res0[14] = tbuf[6];
1970   res0[15] = tbuf[7];
1971 }
1972 
right_shift_16x16(__m128i * res0,__m128i * res1)1973 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
1974   // perform rounding operations
1975   right_shift_8x8(res0, 2);
1976   right_shift_8x8(res0 + 8, 2);
1977   right_shift_8x8(res1, 2);
1978   right_shift_8x8(res1 + 8, 2);
1979 }
1980 
fdct16_8col(__m128i * in)1981 void fdct16_8col(__m128i *in) {
1982   // perform 16x16 1-D DCT for 8 columns
1983   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
1984   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1985   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1986   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1987   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1988   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
1989   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1990   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
1991   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
1992   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
1993   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1994   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
1995   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
1996   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
1997   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
1998   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
1999   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
2000   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
2001   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
2002   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
2003 
2004   // stage 1
2005   i[0] = _mm_add_epi16(in[0], in[15]);
2006   i[1] = _mm_add_epi16(in[1], in[14]);
2007   i[2] = _mm_add_epi16(in[2], in[13]);
2008   i[3] = _mm_add_epi16(in[3], in[12]);
2009   i[4] = _mm_add_epi16(in[4], in[11]);
2010   i[5] = _mm_add_epi16(in[5], in[10]);
2011   i[6] = _mm_add_epi16(in[6], in[9]);
2012   i[7] = _mm_add_epi16(in[7], in[8]);
2013 
2014   s[0] = _mm_sub_epi16(in[7], in[8]);
2015   s[1] = _mm_sub_epi16(in[6], in[9]);
2016   s[2] = _mm_sub_epi16(in[5], in[10]);
2017   s[3] = _mm_sub_epi16(in[4], in[11]);
2018   s[4] = _mm_sub_epi16(in[3], in[12]);
2019   s[5] = _mm_sub_epi16(in[2], in[13]);
2020   s[6] = _mm_sub_epi16(in[1], in[14]);
2021   s[7] = _mm_sub_epi16(in[0], in[15]);
2022 
2023   p[0] = _mm_add_epi16(i[0], i[7]);
2024   p[1] = _mm_add_epi16(i[1], i[6]);
2025   p[2] = _mm_add_epi16(i[2], i[5]);
2026   p[3] = _mm_add_epi16(i[3], i[4]);
2027   p[4] = _mm_sub_epi16(i[3], i[4]);
2028   p[5] = _mm_sub_epi16(i[2], i[5]);
2029   p[6] = _mm_sub_epi16(i[1], i[6]);
2030   p[7] = _mm_sub_epi16(i[0], i[7]);
2031 
2032   u[0] = _mm_add_epi16(p[0], p[3]);
2033   u[1] = _mm_add_epi16(p[1], p[2]);
2034   u[2] = _mm_sub_epi16(p[1], p[2]);
2035   u[3] = _mm_sub_epi16(p[0], p[3]);
2036 
2037   v[0] = _mm_unpacklo_epi16(u[0], u[1]);
2038   v[1] = _mm_unpackhi_epi16(u[0], u[1]);
2039   v[2] = _mm_unpacklo_epi16(u[2], u[3]);
2040   v[3] = _mm_unpackhi_epi16(u[2], u[3]);
2041 
2042   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
2043   u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
2044   u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
2045   u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
2046   u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
2047   u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
2048   u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
2049   u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
2050 
2051   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2052   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2053   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2054   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2055   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2056   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2057   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2058   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2059 
2060   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2061   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2062   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2063   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2064   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2065   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2066   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2067   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2068 
2069   in[0] = _mm_packs_epi32(u[0], u[1]);
2070   in[4] = _mm_packs_epi32(u[4], u[5]);
2071   in[8] = _mm_packs_epi32(u[2], u[3]);
2072   in[12] = _mm_packs_epi32(u[6], u[7]);
2073 
2074   u[0] = _mm_unpacklo_epi16(p[5], p[6]);
2075   u[1] = _mm_unpackhi_epi16(p[5], p[6]);
2076   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2077   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2078   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2079   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2080 
2081   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2082   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2083   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2084   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2085 
2086   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2087   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2088   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2089   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2090 
2091   u[0] = _mm_packs_epi32(v[0], v[1]);
2092   u[1] = _mm_packs_epi32(v[2], v[3]);
2093 
2094   t[0] = _mm_add_epi16(p[4], u[0]);
2095   t[1] = _mm_sub_epi16(p[4], u[0]);
2096   t[2] = _mm_sub_epi16(p[7], u[1]);
2097   t[3] = _mm_add_epi16(p[7], u[1]);
2098 
2099   u[0] = _mm_unpacklo_epi16(t[0], t[3]);
2100   u[1] = _mm_unpackhi_epi16(t[0], t[3]);
2101   u[2] = _mm_unpacklo_epi16(t[1], t[2]);
2102   u[3] = _mm_unpackhi_epi16(t[1], t[2]);
2103 
2104   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
2105   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
2106   v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
2107   v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
2108   v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
2109   v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
2110   v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
2111   v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
2112 
2113   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2114   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2115   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2116   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2117   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2118   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2119   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2120   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2121 
2122   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2123   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2124   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2125   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2126   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2127   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2128   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2129   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2130 
2131   in[2] = _mm_packs_epi32(v[0], v[1]);
2132   in[6] = _mm_packs_epi32(v[4], v[5]);
2133   in[10] = _mm_packs_epi32(v[2], v[3]);
2134   in[14] = _mm_packs_epi32(v[6], v[7]);
2135 
2136   // stage 2
2137   u[0] = _mm_unpacklo_epi16(s[2], s[5]);
2138   u[1] = _mm_unpackhi_epi16(s[2], s[5]);
2139   u[2] = _mm_unpacklo_epi16(s[3], s[4]);
2140   u[3] = _mm_unpackhi_epi16(s[3], s[4]);
2141 
2142   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2143   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2144   v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2145   v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2146   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2147   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2148   v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2149   v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2150 
2151   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2152   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2153   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2154   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2155   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2156   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2157   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2158   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2159 
2160   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2161   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2162   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2163   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2164   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2165   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2166   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2167   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2168 
2169   t[2] = _mm_packs_epi32(v[0], v[1]);
2170   t[3] = _mm_packs_epi32(v[2], v[3]);
2171   t[4] = _mm_packs_epi32(v[4], v[5]);
2172   t[5] = _mm_packs_epi32(v[6], v[7]);
2173 
2174   // stage 3
2175   p[0] = _mm_add_epi16(s[0], t[3]);
2176   p[1] = _mm_add_epi16(s[1], t[2]);
2177   p[2] = _mm_sub_epi16(s[1], t[2]);
2178   p[3] = _mm_sub_epi16(s[0], t[3]);
2179   p[4] = _mm_sub_epi16(s[7], t[4]);
2180   p[5] = _mm_sub_epi16(s[6], t[5]);
2181   p[6] = _mm_add_epi16(s[6], t[5]);
2182   p[7] = _mm_add_epi16(s[7], t[4]);
2183 
2184   // stage 4
2185   u[0] = _mm_unpacklo_epi16(p[1], p[6]);
2186   u[1] = _mm_unpackhi_epi16(p[1], p[6]);
2187   u[2] = _mm_unpacklo_epi16(p[2], p[5]);
2188   u[3] = _mm_unpackhi_epi16(p[2], p[5]);
2189 
2190   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
2191   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
2192   v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
2193   v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
2194   v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
2195   v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
2196   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
2197   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
2198 
2199   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2200   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2201   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2202   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2203   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2204   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2205   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2206   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2207 
2208   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2209   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2210   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2211   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2212   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2213   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2214   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2215   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2216 
2217   t[1] = _mm_packs_epi32(v[0], v[1]);
2218   t[2] = _mm_packs_epi32(v[2], v[3]);
2219   t[5] = _mm_packs_epi32(v[4], v[5]);
2220   t[6] = _mm_packs_epi32(v[6], v[7]);
2221 
2222   // stage 5
2223   s[0] = _mm_add_epi16(p[0], t[1]);
2224   s[1] = _mm_sub_epi16(p[0], t[1]);
2225   s[2] = _mm_add_epi16(p[3], t[2]);
2226   s[3] = _mm_sub_epi16(p[3], t[2]);
2227   s[4] = _mm_sub_epi16(p[4], t[5]);
2228   s[5] = _mm_add_epi16(p[4], t[5]);
2229   s[6] = _mm_sub_epi16(p[7], t[6]);
2230   s[7] = _mm_add_epi16(p[7], t[6]);
2231 
2232   // stage 6
2233   u[0] = _mm_unpacklo_epi16(s[0], s[7]);
2234   u[1] = _mm_unpackhi_epi16(s[0], s[7]);
2235   u[2] = _mm_unpacklo_epi16(s[1], s[6]);
2236   u[3] = _mm_unpackhi_epi16(s[1], s[6]);
2237   u[4] = _mm_unpacklo_epi16(s[2], s[5]);
2238   u[5] = _mm_unpackhi_epi16(s[2], s[5]);
2239   u[6] = _mm_unpacklo_epi16(s[3], s[4]);
2240   u[7] = _mm_unpackhi_epi16(s[3], s[4]);
2241 
2242   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
2243   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
2244   v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
2245   v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
2246   v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
2247   v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
2248   v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
2249   v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
2250   v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
2251   v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
2252   v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
2253   v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
2254   v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
2255   v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
2256   v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
2257   v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
2258 
2259   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2260   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2261   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2262   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2263   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2264   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2265   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2266   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2267   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2268   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2269   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2270   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2271   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2272   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2273   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2274   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2275 
2276   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2277   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2278   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2279   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2280   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2281   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2282   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2283   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2284   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2285   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2286   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2287   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2288   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2289   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2290   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2291   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2292 
2293   in[1]  = _mm_packs_epi32(v[0], v[1]);
2294   in[9]  = _mm_packs_epi32(v[2], v[3]);
2295   in[5]  = _mm_packs_epi32(v[4], v[5]);
2296   in[13] = _mm_packs_epi32(v[6], v[7]);
2297   in[3]  = _mm_packs_epi32(v[8], v[9]);
2298   in[11] = _mm_packs_epi32(v[10], v[11]);
2299   in[7]  = _mm_packs_epi32(v[12], v[13]);
2300   in[15] = _mm_packs_epi32(v[14], v[15]);
2301 }
2302 
fadst16_8col(__m128i * in)2303 void fadst16_8col(__m128i *in) {
2304   // perform 16x16 1-D ADST for 8 columns
2305   __m128i s[16], x[16], u[32], v[32];
2306   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
2307   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
2308   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
2309   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
2310   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
2311   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
2312   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
2313   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
2314   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
2315   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
2316   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
2317   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
2318   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
2319   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
2320   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
2321   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
2322   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
2323   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2324   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
2325   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2326   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
2327   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
2328   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
2329   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2330   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
2331   const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
2332   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
2333   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2334   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2335   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
2336   const __m128i kZero = _mm_set1_epi16(0);
2337 
2338   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
2339   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
2340   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
2341   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
2342   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
2343   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
2344   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
2345   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
2346   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
2347   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
2348   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
2349   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
2350   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
2351   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
2352   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
2353   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
2354 
2355   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
2356   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
2357   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
2358   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
2359   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
2360   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
2361   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
2362   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
2363   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
2364   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
2365   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
2366   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
2367   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
2368   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
2369   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
2370   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
2371   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
2372   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
2373   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
2374   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
2375   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
2376   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
2377   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
2378   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
2379   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
2380   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
2381   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
2382   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
2383   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
2384   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
2385   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
2386   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
2387 
2388   u[0] = _mm_add_epi32(v[0], v[16]);
2389   u[1] = _mm_add_epi32(v[1], v[17]);
2390   u[2] = _mm_add_epi32(v[2], v[18]);
2391   u[3] = _mm_add_epi32(v[3], v[19]);
2392   u[4] = _mm_add_epi32(v[4], v[20]);
2393   u[5] = _mm_add_epi32(v[5], v[21]);
2394   u[6] = _mm_add_epi32(v[6], v[22]);
2395   u[7] = _mm_add_epi32(v[7], v[23]);
2396   u[8] = _mm_add_epi32(v[8], v[24]);
2397   u[9] = _mm_add_epi32(v[9], v[25]);
2398   u[10] = _mm_add_epi32(v[10], v[26]);
2399   u[11] = _mm_add_epi32(v[11], v[27]);
2400   u[12] = _mm_add_epi32(v[12], v[28]);
2401   u[13] = _mm_add_epi32(v[13], v[29]);
2402   u[14] = _mm_add_epi32(v[14], v[30]);
2403   u[15] = _mm_add_epi32(v[15], v[31]);
2404   u[16] = _mm_sub_epi32(v[0], v[16]);
2405   u[17] = _mm_sub_epi32(v[1], v[17]);
2406   u[18] = _mm_sub_epi32(v[2], v[18]);
2407   u[19] = _mm_sub_epi32(v[3], v[19]);
2408   u[20] = _mm_sub_epi32(v[4], v[20]);
2409   u[21] = _mm_sub_epi32(v[5], v[21]);
2410   u[22] = _mm_sub_epi32(v[6], v[22]);
2411   u[23] = _mm_sub_epi32(v[7], v[23]);
2412   u[24] = _mm_sub_epi32(v[8], v[24]);
2413   u[25] = _mm_sub_epi32(v[9], v[25]);
2414   u[26] = _mm_sub_epi32(v[10], v[26]);
2415   u[27] = _mm_sub_epi32(v[11], v[27]);
2416   u[28] = _mm_sub_epi32(v[12], v[28]);
2417   u[29] = _mm_sub_epi32(v[13], v[29]);
2418   u[30] = _mm_sub_epi32(v[14], v[30]);
2419   u[31] = _mm_sub_epi32(v[15], v[31]);
2420 
2421   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2422   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2423   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2424   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2425   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2426   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2427   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2428   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2429   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2430   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2431   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2432   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2433   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2434   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2435   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2436   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2437   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
2438   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
2439   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
2440   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
2441   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
2442   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
2443   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
2444   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
2445   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
2446   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
2447   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
2448   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
2449   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
2450   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
2451   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
2452   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
2453 
2454   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2455   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2456   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2457   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2458   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2459   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2460   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2461   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2462   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2463   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2464   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2465   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2466   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2467   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2468   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2469   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2470   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
2471   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
2472   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
2473   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
2474   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
2475   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
2476   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
2477   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
2478   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
2479   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
2480   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
2481   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
2482   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
2483   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
2484   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
2485   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
2486 
2487   s[0] = _mm_packs_epi32(u[0], u[1]);
2488   s[1] = _mm_packs_epi32(u[2], u[3]);
2489   s[2] = _mm_packs_epi32(u[4], u[5]);
2490   s[3] = _mm_packs_epi32(u[6], u[7]);
2491   s[4] = _mm_packs_epi32(u[8], u[9]);
2492   s[5] = _mm_packs_epi32(u[10], u[11]);
2493   s[6] = _mm_packs_epi32(u[12], u[13]);
2494   s[7] = _mm_packs_epi32(u[14], u[15]);
2495   s[8] = _mm_packs_epi32(u[16], u[17]);
2496   s[9] = _mm_packs_epi32(u[18], u[19]);
2497   s[10] = _mm_packs_epi32(u[20], u[21]);
2498   s[11] = _mm_packs_epi32(u[22], u[23]);
2499   s[12] = _mm_packs_epi32(u[24], u[25]);
2500   s[13] = _mm_packs_epi32(u[26], u[27]);
2501   s[14] = _mm_packs_epi32(u[28], u[29]);
2502   s[15] = _mm_packs_epi32(u[30], u[31]);
2503 
2504   // stage 2
2505   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
2506   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
2507   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
2508   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
2509   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
2510   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
2511   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
2512   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
2513 
2514   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2515   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2516   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2517   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2518   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2519   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
2520   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2521   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2522   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
2523   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
2524   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
2525   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
2526   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
2527   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
2528   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
2529   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
2530 
2531   u[0] = _mm_add_epi32(v[0], v[8]);
2532   u[1] = _mm_add_epi32(v[1], v[9]);
2533   u[2] = _mm_add_epi32(v[2], v[10]);
2534   u[3] = _mm_add_epi32(v[3], v[11]);
2535   u[4] = _mm_add_epi32(v[4], v[12]);
2536   u[5] = _mm_add_epi32(v[5], v[13]);
2537   u[6] = _mm_add_epi32(v[6], v[14]);
2538   u[7] = _mm_add_epi32(v[7], v[15]);
2539   u[8] = _mm_sub_epi32(v[0], v[8]);
2540   u[9] = _mm_sub_epi32(v[1], v[9]);
2541   u[10] = _mm_sub_epi32(v[2], v[10]);
2542   u[11] = _mm_sub_epi32(v[3], v[11]);
2543   u[12] = _mm_sub_epi32(v[4], v[12]);
2544   u[13] = _mm_sub_epi32(v[5], v[13]);
2545   u[14] = _mm_sub_epi32(v[6], v[14]);
2546   u[15] = _mm_sub_epi32(v[7], v[15]);
2547 
2548   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2549   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2550   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2551   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2552   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2553   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2554   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2555   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2556   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2557   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2558   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2559   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2560   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2561   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2562   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2563   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2564 
2565   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2566   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2567   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2568   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2569   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2570   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2571   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2572   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2573   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2574   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2575   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2576   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2577   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2578   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2579   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2580   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2581 
2582   x[0] = _mm_add_epi16(s[0], s[4]);
2583   x[1] = _mm_add_epi16(s[1], s[5]);
2584   x[2] = _mm_add_epi16(s[2], s[6]);
2585   x[3] = _mm_add_epi16(s[3], s[7]);
2586   x[4] = _mm_sub_epi16(s[0], s[4]);
2587   x[5] = _mm_sub_epi16(s[1], s[5]);
2588   x[6] = _mm_sub_epi16(s[2], s[6]);
2589   x[7] = _mm_sub_epi16(s[3], s[7]);
2590   x[8] = _mm_packs_epi32(u[0], u[1]);
2591   x[9] = _mm_packs_epi32(u[2], u[3]);
2592   x[10] = _mm_packs_epi32(u[4], u[5]);
2593   x[11] = _mm_packs_epi32(u[6], u[7]);
2594   x[12] = _mm_packs_epi32(u[8], u[9]);
2595   x[13] = _mm_packs_epi32(u[10], u[11]);
2596   x[14] = _mm_packs_epi32(u[12], u[13]);
2597   x[15] = _mm_packs_epi32(u[14], u[15]);
2598 
2599   // stage 3
2600   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
2601   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
2602   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
2603   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
2604   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
2605   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
2606   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
2607   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
2608 
2609   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
2610   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
2611   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
2612   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
2613   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
2614   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
2615   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
2616   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
2617   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
2618   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
2619   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
2620   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
2621   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
2622   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
2623   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
2624   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
2625 
2626   u[0] = _mm_add_epi32(v[0], v[4]);
2627   u[1] = _mm_add_epi32(v[1], v[5]);
2628   u[2] = _mm_add_epi32(v[2], v[6]);
2629   u[3] = _mm_add_epi32(v[3], v[7]);
2630   u[4] = _mm_sub_epi32(v[0], v[4]);
2631   u[5] = _mm_sub_epi32(v[1], v[5]);
2632   u[6] = _mm_sub_epi32(v[2], v[6]);
2633   u[7] = _mm_sub_epi32(v[3], v[7]);
2634   u[8] = _mm_add_epi32(v[8], v[12]);
2635   u[9] = _mm_add_epi32(v[9], v[13]);
2636   u[10] = _mm_add_epi32(v[10], v[14]);
2637   u[11] = _mm_add_epi32(v[11], v[15]);
2638   u[12] = _mm_sub_epi32(v[8], v[12]);
2639   u[13] = _mm_sub_epi32(v[9], v[13]);
2640   u[14] = _mm_sub_epi32(v[10], v[14]);
2641   u[15] = _mm_sub_epi32(v[11], v[15]);
2642 
2643   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2644   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2645   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2646   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2647   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2648   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2649   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2650   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2651   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2652   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2653   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2654   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2655   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2656   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2657   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2658   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2659 
2660   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2661   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2662   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2663   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2664   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2665   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2666   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2667   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2668   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2669   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2670   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2671   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2672   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2673   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2674   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2675   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2676 
2677   s[0] = _mm_add_epi16(x[0], x[2]);
2678   s[1] = _mm_add_epi16(x[1], x[3]);
2679   s[2] = _mm_sub_epi16(x[0], x[2]);
2680   s[3] = _mm_sub_epi16(x[1], x[3]);
2681   s[4] = _mm_packs_epi32(v[0], v[1]);
2682   s[5] = _mm_packs_epi32(v[2], v[3]);
2683   s[6] = _mm_packs_epi32(v[4], v[5]);
2684   s[7] = _mm_packs_epi32(v[6], v[7]);
2685   s[8] = _mm_add_epi16(x[8], x[10]);
2686   s[9] = _mm_add_epi16(x[9], x[11]);
2687   s[10] = _mm_sub_epi16(x[8], x[10]);
2688   s[11] = _mm_sub_epi16(x[9], x[11]);
2689   s[12] = _mm_packs_epi32(v[8], v[9]);
2690   s[13] = _mm_packs_epi32(v[10], v[11]);
2691   s[14] = _mm_packs_epi32(v[12], v[13]);
2692   s[15] = _mm_packs_epi32(v[14], v[15]);
2693 
2694   // stage 4
2695   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
2696   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
2697   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
2698   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
2699   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
2700   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
2701   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
2702   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
2703 
2704   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
2705   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
2706   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
2707   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
2708   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2709   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2710   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2711   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2712   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
2713   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
2714   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
2715   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
2716   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
2717   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
2718   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
2719   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
2720 
2721   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2722   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2723   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2724   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2725   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2726   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2727   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2728   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2729   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2730   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2731   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2732   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2733   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2734   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2735   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2736   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2737 
2738   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2739   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2740   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2741   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2742   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2743   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2744   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2745   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2746   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2747   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2748   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2749   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2750   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2751   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2752   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2753   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2754 
2755   in[0] = s[0];
2756   in[1] = _mm_sub_epi16(kZero, s[8]);
2757   in[2] = s[12];
2758   in[3] = _mm_sub_epi16(kZero, s[4]);
2759   in[4] = _mm_packs_epi32(v[4], v[5]);
2760   in[5] = _mm_packs_epi32(v[12], v[13]);
2761   in[6] = _mm_packs_epi32(v[8], v[9]);
2762   in[7] = _mm_packs_epi32(v[0], v[1]);
2763   in[8] = _mm_packs_epi32(v[2], v[3]);
2764   in[9] = _mm_packs_epi32(v[10], v[11]);
2765   in[10] = _mm_packs_epi32(v[14], v[15]);
2766   in[11] = _mm_packs_epi32(v[6], v[7]);
2767   in[12] = s[5];
2768   in[13] = _mm_sub_epi16(kZero, s[13]);
2769   in[14] = s[9];
2770   in[15] = _mm_sub_epi16(kZero, s[1]);
2771 }
2772 
fdct16_sse2(__m128i * in0,__m128i * in1)2773 void fdct16_sse2(__m128i *in0, __m128i *in1) {
2774   fdct16_8col(in0);
2775   fdct16_8col(in1);
2776   array_transpose_16x16(in0, in1);
2777 }
2778 
fadst16_sse2(__m128i * in0,__m128i * in1)2779 void fadst16_sse2(__m128i *in0, __m128i *in1) {
2780   fadst16_8col(in0);
2781   fadst16_8col(in1);
2782   array_transpose_16x16(in0, in1);
2783 }
2784 
vp9_fht16x16_sse2(const int16_t * input,int16_t * output,int stride,int tx_type)2785 void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
2786                        int stride, int tx_type) {
2787   __m128i in0[16], in1[16];
2788 
2789   switch (tx_type) {
2790     case DCT_DCT:
2791       vp9_fdct16x16_sse2(input, output, stride);
2792       break;
2793     case ADST_DCT:
2794       load_buffer_16x16(input, in0, in1, stride);
2795       fadst16_sse2(in0, in1);
2796       right_shift_16x16(in0, in1);
2797       fdct16_sse2(in0, in1);
2798       write_buffer_16x16(output, in0, in1, 16);
2799       break;
2800     case DCT_ADST:
2801       load_buffer_16x16(input, in0, in1, stride);
2802       fdct16_sse2(in0, in1);
2803       right_shift_16x16(in0, in1);
2804       fadst16_sse2(in0, in1);
2805       write_buffer_16x16(output, in0, in1, 16);
2806       break;
2807     case ADST_ADST:
2808       load_buffer_16x16(input, in0, in1, stride);
2809       fadst16_sse2(in0, in1);
2810       right_shift_16x16(in0, in1);
2811       fadst16_sse2(in0, in1);
2812       write_buffer_16x16(output, in0, in1, 16);
2813       break;
2814     default:
2815       assert(0);
2816       break;
2817   }
2818 }
2819 
vp9_fdct32x32_1_sse2(const int16_t * input,int16_t * output,int stride)2820 void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
2821   __m128i in0, in1, in2, in3;
2822   __m128i u0, u1;
2823   __m128i sum = _mm_setzero_si128();
2824   int i;
2825 
2826   for (i = 0; i < 8; ++i) {
2827     in0  = _mm_load_si128((const __m128i *)(input +  0));
2828     in1  = _mm_load_si128((const __m128i *)(input +  8));
2829     in2  = _mm_load_si128((const __m128i *)(input + 16));
2830     in3  = _mm_load_si128((const __m128i *)(input + 24));
2831 
2832     input += stride;
2833     u0 = _mm_add_epi16(in0, in1);
2834     u1 = _mm_add_epi16(in2, in3);
2835     sum = _mm_add_epi16(sum, u0);
2836 
2837     in0  = _mm_load_si128((const __m128i *)(input +  0));
2838     in1  = _mm_load_si128((const __m128i *)(input +  8));
2839     in2  = _mm_load_si128((const __m128i *)(input + 16));
2840     in3  = _mm_load_si128((const __m128i *)(input + 24));
2841 
2842     input += stride;
2843     sum = _mm_add_epi16(sum, u1);
2844     u0  = _mm_add_epi16(in0, in1);
2845     u1  = _mm_add_epi16(in2, in3);
2846     sum = _mm_add_epi16(sum, u0);
2847 
2848     in0  = _mm_load_si128((const __m128i *)(input +  0));
2849     in1  = _mm_load_si128((const __m128i *)(input +  8));
2850     in2  = _mm_load_si128((const __m128i *)(input + 16));
2851     in3  = _mm_load_si128((const __m128i *)(input + 24));
2852 
2853     input += stride;
2854     sum = _mm_add_epi16(sum, u1);
2855     u0  = _mm_add_epi16(in0, in1);
2856     u1  = _mm_add_epi16(in2, in3);
2857     sum = _mm_add_epi16(sum, u0);
2858 
2859     in0  = _mm_load_si128((const __m128i *)(input +  0));
2860     in1  = _mm_load_si128((const __m128i *)(input +  8));
2861     in2  = _mm_load_si128((const __m128i *)(input + 16));
2862     in3  = _mm_load_si128((const __m128i *)(input + 24));
2863 
2864     input += stride;
2865     sum = _mm_add_epi16(sum, u1);
2866     u0  = _mm_add_epi16(in0, in1);
2867     u1  = _mm_add_epi16(in2, in3);
2868     sum = _mm_add_epi16(sum, u0);
2869 
2870     sum = _mm_add_epi16(sum, u1);
2871   }
2872 
2873   u0  = _mm_setzero_si128();
2874   in0 = _mm_unpacklo_epi16(u0, sum);
2875   in1 = _mm_unpackhi_epi16(u0, sum);
2876   in0 = _mm_srai_epi32(in0, 16);
2877   in1 = _mm_srai_epi32(in1, 16);
2878 
2879   sum = _mm_add_epi32(in0, in1);
2880   in0 = _mm_unpacklo_epi32(sum, u0);
2881   in1 = _mm_unpackhi_epi32(sum, u0);
2882 
2883   sum = _mm_add_epi32(in0, in1);
2884   in0 = _mm_srli_si128(sum, 8);
2885 
2886   in1 = _mm_add_epi32(sum, in0);
2887   in1 = _mm_srai_epi32(in1, 3);
2888   _mm_store_si128((__m128i *)(output), in1);
2889 }
2890 
2891 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
2892 #define FDCT32x32_HIGH_PRECISION 0
2893 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
2894 #undef  FDCT32x32_2D
2895 #undef  FDCT32x32_HIGH_PRECISION
2896 
2897 #define FDCT32x32_2D vp9_fdct32x32_sse2
2898 #define FDCT32x32_HIGH_PRECISION 1
2899 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
2900 #undef  FDCT32x32_2D
2901 #undef  FDCT32x32_HIGH_PRECISION
2902