/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#if defined(_MSC_VER) && _MSC_VER <= 1500
// Need to include math.h before including tmmintrin.h/intrin.h
// in certain versions of MSVS.
#include <math.h>
#endif
#include <tmmintrin.h>  // SSSE3
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"

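// Performs one 1-D pass of the 16-point inverse DCT on eight columns of
// 16-bit coefficients held in in[0..15], writing the results back in place.
// 'round' selects the stage-6 half-butterfly path: 1 uses the exact
// madd/round/shift sequence, anything else the faster _mm_mulhrs_epi16
// shortcut.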
static void idct16_8col(__m128i *in, int round) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
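  // 23170 == 2 * cospi_16_64, pre-doubled so that a single _mm_mulhrs_epi16
  // can replace the usual multiply, round and shift by DCT_CONST_BITS.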
  const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
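  // Interleave each symmetric coefficient pair so that _mm_madd_epi16
  // produces one butterfly rotation output (x * c0 + y * c1) per 32-bit lane.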
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_add_epi16(t[0], t[1]);
  u[1] = _mm_sub_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

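  // _mm_mulhrs_epi16(x, 2 * cospi_16_64) == (x * cospi_16_64 + 8192) >> 14,
  // i.e. the exact madd/round/shift result, provided the 16-bit sums above
  // did not overflow.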
  s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
  s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_sub_epi16(s[6], s[5]);
  u[1] = _mm_add_epi16(s[6], s[5]);
  t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
  t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
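  // Only the final pass (round == 1) needs the exact 32-bit rounding for the
  // cospi_16_64 butterflies; the first pass takes the cheaper mulhrs path.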
  if (round == 1) {
    s[0] = _mm_add_epi16(t[0], t[7]);
    s[1] = _mm_add_epi16(t[1], t[6]);
    s[2] = _mm_add_epi16(t[2], t[5]);
    s[3] = _mm_add_epi16(t[3], t[4]);
    s[4] = _mm_sub_epi16(t[3], t[4]);
    s[5] = _mm_sub_epi16(t[2], t[5]);
    s[6] = _mm_sub_epi16(t[1], t[6]);
    s[7] = _mm_sub_epi16(t[0], t[7]);
    s[8] = t[8];
    s[9] = t[9];

    u[0] = _mm_unpacklo_epi16(t[10], t[13]);
    u[1] = _mm_unpackhi_epi16(t[10], t[13]);
    u[2] = _mm_unpacklo_epi16(t[11], t[12]);
    u[3] = _mm_unpackhi_epi16(t[11], t[12]);

    v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
    v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
    v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
    v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
    v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
    v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
    v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
    v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

    u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
    u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
    u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
    u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
    u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
    u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
    u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
    u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

    u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
    u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
    u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
    u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
    u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
    u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
    u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
    u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

    s[10] = _mm_packs_epi32(u[0], u[1]);
    s[13] = _mm_packs_epi32(u[2], u[3]);
    s[11] = _mm_packs_epi32(u[4], u[5]);
    s[12] = _mm_packs_epi32(u[6], u[7]);
    s[14] = t[14];
    s[15] = t[15];
  } else {
    s[0] = _mm_add_epi16(t[0], t[7]);
    s[1] = _mm_add_epi16(t[1], t[6]);
    s[2] = _mm_add_epi16(t[2], t[5]);
    s[3] = _mm_add_epi16(t[3], t[4]);
    s[4] = _mm_sub_epi16(t[3], t[4]);
    s[5] = _mm_sub_epi16(t[2], t[5]);
    s[6] = _mm_sub_epi16(t[1], t[6]);
    s[7] = _mm_sub_epi16(t[0], t[7]);
    s[8] = t[8];
    s[9] = t[9];

    u[0] = _mm_sub_epi16(t[13], t[10]);
    u[1] = _mm_add_epi16(t[13], t[10]);
    u[2] = _mm_sub_epi16(t[12], t[11]);
    u[3] = _mm_add_epi16(t[12], t[11]);

    s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
    s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
    s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
    s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
    s[14] = t[14];
    s[15] = t[15];
  }

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

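// One 1-D pass over the full 16x16 block: transpose, then run the 16-point
// IDCT on each 8-column half.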
static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0, round);
  idct16_8col(in1, round);
}

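// Full 2-D 16x16 inverse DCT: two 1-D passes (the transpose inside
// idct16_sse2 turns rows into columns), then reconstruction into dest.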
void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
                                 int stride) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  idct16_sse2(in0, in1, 0);
  idct16_sse2(in0, in1, 1);

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}

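// First 1-D pass for the case where only the top-left 4x4 coefficients are
// nonzero (at most 10 coded coefficients); in[0..1] hold the transposed 4x4
// input and the left 8x16 half of the intermediate block is written to
// l[0..15].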
static void idct16_10_r1(__m128i *in, __m128i *l) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i zero = _mm_setzero_si128();

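  // dual_set_epi16 constants are pre-doubled cospi values (e.g. 3212 ==
  // 2 * cospi_30_64) so the stage rotations reduce to _mm_mulhrs_epi16.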
  const __m128i stg2_01 = dual_set_epi16(3212, 32610);
  const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
  const __m128i stg3_01 = dual_set_epi16(6392, 32138);
  const __m128i stg4_01 = dual_set_epi16(23170, 23170);

  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  __m128i stp1_0, stp1_1, stp1_4, stp1_6,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4;

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
    const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);

    stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01);
    stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
    stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);

    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

    stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
    stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp2, tmp4);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
    const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
    const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
    const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
    const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);

    tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
    tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
    tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);

    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
    tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01);
    tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01);

    stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
    stp2_12 = _mm_unpackhi_epi64(tmp4, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
}

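// Second 1-D pass for the 10-coefficient case: in[0..3] hold a transposed
// 8x4 strip of the first-pass output and all 16 result rows are written
// back to in[0..15].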
static void idct16_10_r2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

  const __m128i stg2_0 = dual_set_epi16(3212, 3212);
  const __m128i stg2_1 = dual_set_epi16(32610, 32610);
  const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
  const __m128i stg2_7 = dual_set_epi16(31358, 31358);
  const __m128i stg3_0 = dual_set_epi16(6392, 6392);
  const __m128i stg3_1 = dual_set_epi16(32138, 32138);
  const __m128i stg4_01 = dual_set_epi16(23170, 23170);

  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  /* Stage2 */
  {
    stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0);
    stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1);
    stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6);
    stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
  }

  /* Stage3 */
  {
    stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
    stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);

    stp1_9 = stp1_8_0;
    stp1_10 = stp1_11;

    stp1_13 = stp1_12_0;
    stp1_14 = stp1_15;
  }

  /* Stage4 */
  {
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);

    stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);

    stp2_5 = stp2_4;
    stp2_6 = stp2_7;

    tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
    tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
    tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
    tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
    tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, 14);
    tmp1 = _mm_srai_epi32(tmp1, 14);
    tmp2 = _mm_srai_epi32(tmp2, 14);
    tmp3 = _mm_srai_epi32(tmp3, 14);
    tmp4 = _mm_srai_epi32(tmp4, 14);
    tmp5 = _mm_srai_epi32(tmp5, 14);
    tmp6 = _mm_srai_epi32(tmp6, 14);
    tmp7 = _mm_srai_epi32(tmp7, 14);

    stp2_9 = _mm_packs_epi32(tmp0, tmp1);
    stp2_14 = _mm_packs_epi32(tmp2, tmp3);
    stp2_10 = _mm_packs_epi32(tmp4, tmp5);
    stp2_13 = _mm_packs_epi32(tmp6, tmp7);
  }

  /* Stage5 */
  {
    stp1_2 = stp1_0;
    stp1_3 = stp1_0;

    tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
    tmp1 = _mm_add_epi16(stp2_6, stp2_5);

    stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);

    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);

    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
  }

  /* Stage6 */
  {
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
    stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);

    tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
    tmp1 = _mm_add_epi16(stp1_13, stp1_10);
    tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
    tmp3 = _mm_add_epi16(stp1_12, stp1_11);

    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);

    stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
    stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
    stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
    stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
  }

  // Stage7
  in[0] = _mm_add_epi16(stp2_0, stp1_15);
  in[1] = _mm_add_epi16(stp2_1, stp1_14);
  in[2] = _mm_add_epi16(stp2_2, stp2_13);
  in[3] = _mm_add_epi16(stp2_3, stp2_12);
  in[4] = _mm_add_epi16(stp2_4, stp2_11);
  in[5] = _mm_add_epi16(stp2_5, stp2_10);
  in[6] = _mm_add_epi16(stp2_6, stp1_9);
  in[7] = _mm_add_epi16(stp2_7, stp1_8);
  in[8] = _mm_sub_epi16(stp2_7, stp1_8);
  in[9] = _mm_sub_epi16(stp2_6, stp1_9);
  in[10] = _mm_sub_epi16(stp2_5, stp2_10);
  in[11] = _mm_sub_epi16(stp2_4, stp2_11);
  in[12] = _mm_sub_epi16(stp2_3, stp2_12);
  in[13] = _mm_sub_epi16(stp2_2, stp2_13);
  in[14] = _mm_sub_epi16(stp2_1, stp1_14);
  in[15] = _mm_sub_epi16(stp2_0, stp1_15);
}

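// 16x16 inverse DCT with reconstruction for blocks where at most the first
// 10 coefficients (the top-left 4x4) are nonzero.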
void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  __m128i in[16], l[16];
  int i;

  // First 1-D inverse DCT
  // Load input data.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  idct16_10_r1(in, l);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    array_transpose_4X8(l + 8 * i, in);

    idct16_10_r2(in);

    // Final rounding and shift
    in[0] = _mm_adds_epi16(in[0], final_rounding);
    in[1] = _mm_adds_epi16(in[1], final_rounding);
    in[2] = _mm_adds_epi16(in[2], final_rounding);
    in[3] = _mm_adds_epi16(in[3], final_rounding);
    in[4] = _mm_adds_epi16(in[4], final_rounding);
    in[5] = _mm_adds_epi16(in[5], final_rounding);
    in[6] = _mm_adds_epi16(in[6], final_rounding);
    in[7] = _mm_adds_epi16(in[7], final_rounding);
    in[8] = _mm_adds_epi16(in[8], final_rounding);
    in[9] = _mm_adds_epi16(in[9], final_rounding);
    in[10] = _mm_adds_epi16(in[10], final_rounding);
    in[11] = _mm_adds_epi16(in[11], final_rounding);
    in[12] = _mm_adds_epi16(in[12], final_rounding);
    in[13] = _mm_adds_epi16(in[13], final_rounding);
    in[14] = _mm_adds_epi16(in[14], final_rounding);
    in[15] = _mm_adds_epi16(in[15], final_rounding);

    in[0] = _mm_srai_epi16(in[0], 6);
    in[1] = _mm_srai_epi16(in[1], 6);
    in[2] = _mm_srai_epi16(in[2], 6);
    in[3] = _mm_srai_epi16(in[3], 6);
    in[4] = _mm_srai_epi16(in[4], 6);
    in[5] = _mm_srai_epi16(in[5], 6);
    in[6] = _mm_srai_epi16(in[6], 6);
    in[7] = _mm_srai_epi16(in[7], 6);
    in[8] = _mm_srai_epi16(in[8], 6);
    in[9] = _mm_srai_epi16(in[9], 6);
    in[10] = _mm_srai_epi16(in[10], 6);
    in[11] = _mm_srai_epi16(in[11], 6);
    in[12] = _mm_srai_epi16(in[12], 6);
    in[13] = _mm_srai_epi16(in[13], 6);
    in[14] = _mm_srai_epi16(in[14], 6);
    in[15] = _mm_srai_epi16(in[15], 6);

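    // RECON_AND_STORE (from vp9_idct_intrin_sse2.h) is expected to add each
    // 8-wide residual row to the destination pixels with unsigned saturation
    // and advance dest by stride.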
    RECON_AND_STORE(dest, in[0]);
    RECON_AND_STORE(dest, in[1]);
    RECON_AND_STORE(dest, in[2]);
    RECON_AND_STORE(dest, in[3]);
    RECON_AND_STORE(dest, in[4]);
    RECON_AND_STORE(dest, in[5]);
    RECON_AND_STORE(dest, in[6]);
    RECON_AND_STORE(dest, in[7]);
    RECON_AND_STORE(dest, in[8]);
    RECON_AND_STORE(dest, in[9]);
    RECON_AND_STORE(dest, in[10]);
    RECON_AND_STORE(dest, in[11]);
    RECON_AND_STORE(dest, in[12]);
    RECON_AND_STORE(dest, in[13]);
    RECON_AND_STORE(dest, in[14]);
    RECON_AND_STORE(dest, in[15]);

    dest += 8 - (stride * 16);
  }
}