/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include <emmintrin.h> // SSE2
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/inv_txfm_sse2.h"
16 #include "vpx_dsp/x86/transpose_sse2.h"
17 #include "vpx_dsp/x86/txfm_common_sse2.h"
18
highbd_idct16_4col_stage5(const __m128i * const in,__m128i * const out)19 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
20 __m128i *const out) {
21 // stage 5
22 out[0] = _mm_add_epi32(in[0], in[3]);
23 out[1] = _mm_add_epi32(in[1], in[2]);
24 out[2] = _mm_sub_epi32(in[1], in[2]);
25 out[3] = _mm_sub_epi32(in[0], in[3]);
26 highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
27 out[8] = _mm_add_epi32(in[8], in[11]);
28 out[9] = _mm_add_epi32(in[9], in[10]);
29 out[10] = _mm_sub_epi32(in[9], in[10]);
30 out[11] = _mm_sub_epi32(in[8], in[11]);
31 out[12] = _mm_sub_epi32(in[15], in[12]);
32 out[13] = _mm_sub_epi32(in[14], in[13]);
33 out[14] = _mm_add_epi32(in[14], in[13]);
34 out[15] = _mm_add_epi32(in[15], in[12]);
35 }
36
highbd_idct16_4col_stage6(const __m128i * const in,__m128i * const out)37 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
38 __m128i *const out) {
39 out[0] = _mm_add_epi32(in[0], in[7]);
40 out[1] = _mm_add_epi32(in[1], in[6]);
41 out[2] = _mm_add_epi32(in[2], in[5]);
42 out[3] = _mm_add_epi32(in[3], in[4]);
43 out[4] = _mm_sub_epi32(in[3], in[4]);
44 out[5] = _mm_sub_epi32(in[2], in[5]);
45 out[6] = _mm_sub_epi32(in[1], in[6]);
46 out[7] = _mm_sub_epi32(in[0], in[7]);
47 out[8] = in[8];
48 out[9] = in[9];
49 highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
50 highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
51 out[14] = in[14];
52 out[15] = in[15];
53 }
54
highbd_idct16_4col(__m128i * const io)55 static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
56 __m128i step1[16], step2[16];
57
58 // stage 2
59 highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
60 &step2[15]);
61 highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
62 &step2[14]);
63 highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
64 &step2[13]);
65 highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
66 &step2[12]);
67
68 // stage 3
69 highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
70 &step1[7]);
71 highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
72 &step1[6]);
73 step1[8] = _mm_add_epi32(step2[8], step2[9]);
74 step1[9] = _mm_sub_epi32(step2[8], step2[9]);
75 step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
76 step1[11] = _mm_add_epi32(step2[10], step2[11]);
77 step1[12] = _mm_add_epi32(step2[13], step2[12]);
78 step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
79 step1[14] = _mm_sub_epi32(step2[15], step2[14]);
80 step1[15] = _mm_add_epi32(step2[15], step2[14]);
81
82 // stage 4
83 highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
84 highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
85 &step2[3]);
86 highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
87 &step2[14]);
88 highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
89 &step2[13], &step2[10]);
90 step2[5] = _mm_sub_epi32(step1[4], step1[5]);
91 step1[4] = _mm_add_epi32(step1[4], step1[5]);
92 step2[6] = _mm_sub_epi32(step1[7], step1[6]);
93 step1[7] = _mm_add_epi32(step1[7], step1[6]);
94 step2[8] = step1[8];
95 step2[11] = step1[11];
96 step2[12] = step1[12];
97 step2[15] = step1[15];
98
99 highbd_idct16_4col_stage5(step2, step1);
100 highbd_idct16_4col_stage6(step1, step2);
101 highbd_idct16_4col_stage7(step2, io);
102 }
103
highbd_idct16x16_38_4col(__m128i * const io)104 static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
105 __m128i step1[16], step2[16];
106 __m128i temp1[2], sign[2];
107
108 // stage 2
109 highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
110 &step2[15]);
111 highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
112 &step2[14]);
113 highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
114 &step2[13]);
115 highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
116 &step2[12]);
117
118 // stage 3
119 highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
120 &step1[7]);
121 highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
122 &step1[6]);
123 step1[8] = _mm_add_epi32(step2[8], step2[9]);
124 step1[9] = _mm_sub_epi32(step2[8], step2[9]);
125 step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
126 step1[11] = _mm_add_epi32(step2[10], step2[11]);
127 step1[12] = _mm_add_epi32(step2[13], step2[12]);
128 step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
129 step1[14] = _mm_sub_epi32(step2[15], step2[14]);
130 step1[15] = _mm_add_epi32(step2[15], step2[14]);
131
132 // stage 4
133 abs_extend_64bit_sse2(io[0], temp1, sign);
134 step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
135 step2[1] = step2[0];
136 highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
137 &step2[3]);
138 highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
139 &step2[14]);
140 highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
141 &step2[13], &step2[10]);
142 step2[5] = _mm_sub_epi32(step1[4], step1[5]);
143 step1[4] = _mm_add_epi32(step1[4], step1[5]);
144 step2[6] = _mm_sub_epi32(step1[7], step1[6]);
145 step1[7] = _mm_add_epi32(step1[7], step1[6]);
146 step2[8] = step1[8];
147 step2[11] = step1[11];
148 step2[12] = step1[12];
149 step2[15] = step1[15];
150
151 highbd_idct16_4col_stage5(step2, step1);
152 highbd_idct16_4col_stage6(step1, step2);
153 highbd_idct16_4col_stage7(step2, io);
154 }
155
highbd_idct16x16_10_4col(__m128i * const io)156 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
157 __m128i step1[16], step2[16];
158 __m128i temp[2], sign[2];
159
160 // stage 2
161 highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
162 &step2[15]);
163 highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
164 &step2[12]);
165
166 // stage 3
167 highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
168 &step1[7]);
169 step1[8] = step2[8];
170 step1[9] = step2[8];
171 step1[10] =
172 _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
173 step1[11] = step2[11];
174 step1[12] = step2[12];
175 step1[13] =
176 _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
177 step1[14] = step2[15];
178 step1[15] = step2[15];
179
180 // stage 4
181 abs_extend_64bit_sse2(io[0], temp, sign);
182 step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
183 step2[1] = step2[0];
184 step2[2] = _mm_setzero_si128();
185 step2[3] = _mm_setzero_si128();
186 highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
187 &step2[14]);
188 highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
189 &step2[13], &step2[10]);
190 step2[5] = step1[4];
191 step2[6] = step1[7];
192 step2[8] = step1[8];
193 step2[11] = step1[11];
194 step2[12] = step1[12];
195 step2[15] = step1[15];
196
197 highbd_idct16_4col_stage5(step2, step1);
198 highbd_idct16_4col_stage6(step1, step2);
199 highbd_idct16_4col_stage7(step2, io);
200 }
201
// 16x16 inverse DCT using all 256 coefficients, followed by a write to dest
// through highbd_write_buffer_8() / highbd_write_buffer_4().
//
// input:  256 coefficients laid out with a row pitch of 16.
// dest:   destination pixels; row r starts at dest + r * stride.
// bd:     bit depth. bd == 8 takes the packed 16-bit path (8 lanes per
//         vector); any other value takes the 32-bit path (4 lanes per
//         vector).
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  __m128i out[16];
  int pass, col, row;

  if (bd == 8) {
    // half[0] holds the result for coefficients 0..127, half[1] for 128..255.
    __m128i half[2][16];

    // First pass: each iteration consumes 128 coefficients.
    for (pass = 0; pass < 2; ++pass) {
      const tran_low_t *const src = input + pass * 128;
      __m128i *const cur = half[pass];

      highbd_load_pack_transpose_32bit_8x8(&src[0], 16, &cur[0]);
      highbd_load_pack_transpose_32bit_8x8(&src[8], 16, &cur[8]);
      idct16_8col(cur, cur);
    }

    // Second pass: eight destination columns per iteration.
    for (col = 0; col < 16; col += 8) {
      uint16_t *const dst = dest + col;

      transpose_16bit_8x8(half[0] + col, out);
      transpose_16bit_8x8(half[1] + col, out + 8);
      idct16_8col(out, out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dst + row * stride, out[row], bd);
      }
    }
  } else {
    // quarter[g] holds the first-pass result for coefficients g*64..g*64+63.
    __m128i quarter[4][16];
    int grp;

    // First pass: each iteration consumes 4 * 16 coefficients.
    for (pass = 0; pass < 4; ++pass) {
      const tran_low_t *const src = input + pass * 4 * 16;
      __m128i *const cur = quarter[pass];

      highbd_load_transpose_32bit_8x4(&src[0], 16, &cur[0]);
      highbd_load_transpose_32bit_8x4(&src[8], 16, &cur[8]);
      highbd_idct16_4col(cur);
    }

    // Second pass: four destination columns per iteration.
    for (col = 0; col < 16; col += 4) {
      uint16_t *const dst = dest + col;

      for (grp = 0; grp < 4; ++grp) {
        transpose_32bit_4x4(quarter[grp] + col, out + 4 * grp);
      }
      highbd_idct16_4col(out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dst + row * stride, out[row], bd);
      }
    }
  }
}
256
// 16x16 inverse DCT that reads only the top-left 8x8 coefficients, followed
// by a write to dest through highbd_write_buffer_8() /
// highbd_write_buffer_4().
//
// input:  coefficients with a row pitch of 16; only rows 0..7, columns 0..7
//         are loaded.
// dest:   destination pixels; row r starts at dest + r * stride.
// bd:     bit depth. bd == 8 takes the packed 16-bit path; any other value
//         takes the 32-bit path.
void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  __m128i out[16];
  int col, row;

  if (bd == 8) {
    __m128i in[16], temp[16];
    int k;

    // First pass: one 8x8 load; in[8..15] are cleared and stay zero for the
    // second pass too, since the transposes below only write in[0..7].
    highbd_load_pack_transpose_32bit_8x8(input, 16, in);
    for (k = 8; k < 16; ++k) {
      in[k] = _mm_setzero_si128();
    }
    idct16_8col(in, temp);

    // Second pass: eight destination columns per iteration.
    for (col = 0; col < 16; col += 8) {
      uint16_t *const dst = dest + col;

      transpose_16bit_8x8(temp + col, in);
      idct16_8col(in, out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dst + row * stride, out[row], bd);
      }
    }
  } else {
    // half[0] holds the first-pass result for rows 0..3, half[1] for 4..7.
    __m128i half[2][16];
    int pass;

    // First pass: each iteration loads an 8x4 tile, 4 * 16 coefficients
    // further into the block than the previous one.
    for (pass = 0; pass < 2; ++pass) {
      __m128i *const cur = half[pass];

      highbd_load_transpose_32bit_8x4(input + pass * 4 * 16, 16, cur);
      highbd_idct16x16_38_4col(cur);
    }

    // Second pass: four destination columns per iteration. Only out[0..7]
    // are refreshed; the 38-coefficient kernel reads nothing beyond them.
    for (col = 0; col < 16; col += 4) {
      uint16_t *const dst = dest + col;

      transpose_32bit_4x4(half[0] + col, out + 0);
      transpose_32bit_4x4(half[1] + col, out + 4);
      highbd_idct16x16_38_4col(out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dst + row * stride, out[row], bd);
      }
    }
  }
}
304
vpx_highbd_idct16x16_10_add_sse2(const tran_low_t * input,uint16_t * dest,int stride,int bd)305 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
306 int stride, int bd) {
307 int i;
308 __m128i out[16];
309
310 if (bd == 8) {
311 __m128i in[16], l[16];
312
313 in[0] = load_pack_8_32bit(input + 0 * 16);
314 in[1] = load_pack_8_32bit(input + 1 * 16);
315 in[2] = load_pack_8_32bit(input + 2 * 16);
316 in[3] = load_pack_8_32bit(input + 3 * 16);
317
318 idct16x16_10_pass1(in, l);
319
320 for (i = 0; i < 16; i += 8) {
321 int j;
322 idct16x16_10_pass2(l + i, in);
323
324 for (j = 0; j < 16; ++j) {
325 highbd_write_buffer_8(dest + j * stride, in[j], bd);
326 }
327 dest += 8;
328 }
329 } else {
330 __m128i all[2][16], *in;
331
332 for (i = 0; i < 2; i++) {
333 in = all[i];
334 highbd_load_transpose_32bit_4x4(input, 16, in);
335 highbd_idct16x16_10_4col(in);
336 input += 4 * 16;
337 }
338
339 for (i = 0; i < 16; i += 4) {
340 int j;
341 transpose_32bit_4x4(&all[0][i], out);
342 highbd_idct16x16_10_4col(out);
343
344 for (j = 0; j < 16; ++j) {
345 highbd_write_buffer_4(dest + j * stride, out[j], bd);
346 }
347 dest += 4;
348 }
349 }
350 }
351
// Single-coefficient variant of the 16x16 inverse transform: forwards all
// arguments to highbd_idct_1_add_kernel() with 16 as the last argument
// (presumably the block dimension -- see the helper's definition).
void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  const int size = 16;

  highbd_idct_1_add_kernel(input, dest, stride, bd, size);
}
356