1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/inv_txfm_sse2.h"
16 #include "vpx_dsp/x86/transpose_sse2.h"
17 #include "vpx_dsp/x86/txfm_common_sse2.h"
18 
highbd_idct16_4col_stage5(const __m128i * const in,__m128i * const out)19 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
20                                              __m128i *const out) {
21   // stage 5
22   out[0] = _mm_add_epi32(in[0], in[3]);
23   out[1] = _mm_add_epi32(in[1], in[2]);
24   out[2] = _mm_sub_epi32(in[1], in[2]);
25   out[3] = _mm_sub_epi32(in[0], in[3]);
26   highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
27   out[8] = _mm_add_epi32(in[8], in[11]);
28   out[9] = _mm_add_epi32(in[9], in[10]);
29   out[10] = _mm_sub_epi32(in[9], in[10]);
30   out[11] = _mm_sub_epi32(in[8], in[11]);
31   out[12] = _mm_sub_epi32(in[15], in[12]);
32   out[13] = _mm_sub_epi32(in[14], in[13]);
33   out[14] = _mm_add_epi32(in[14], in[13]);
34   out[15] = _mm_add_epi32(in[15], in[12]);
35 }
36 
highbd_idct16_4col_stage6(const __m128i * const in,__m128i * const out)37 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
38                                              __m128i *const out) {
39   out[0] = _mm_add_epi32(in[0], in[7]);
40   out[1] = _mm_add_epi32(in[1], in[6]);
41   out[2] = _mm_add_epi32(in[2], in[5]);
42   out[3] = _mm_add_epi32(in[3], in[4]);
43   out[4] = _mm_sub_epi32(in[3], in[4]);
44   out[5] = _mm_sub_epi32(in[2], in[5]);
45   out[6] = _mm_sub_epi32(in[1], in[6]);
46   out[7] = _mm_sub_epi32(in[0], in[7]);
47   out[8] = in[8];
48   out[9] = in[9];
49   highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
50   highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
51   out[14] = in[14];
52   out[15] = in[15];
53 }
54 
highbd_idct16_4col(__m128i * const io)55 static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
56   __m128i step1[16], step2[16];
57 
58   // stage 2
59   highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
60                         &step2[15]);
61   highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
62                         &step2[14]);
63   highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
64                         &step2[13]);
65   highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
66                         &step2[12]);
67 
68   // stage 3
69   highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
70                         &step1[7]);
71   highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
72                         &step1[6]);
73   step1[8] = _mm_add_epi32(step2[8], step2[9]);
74   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
75   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
76   step1[11] = _mm_add_epi32(step2[10], step2[11]);
77   step1[12] = _mm_add_epi32(step2[13], step2[12]);
78   step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
79   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
80   step1[15] = _mm_add_epi32(step2[15], step2[14]);
81 
82   // stage 4
83   highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
84   highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
85                         &step2[3]);
86   highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
87                         &step2[14]);
88   highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
89                         &step2[13], &step2[10]);
90   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
91   step1[4] = _mm_add_epi32(step1[4], step1[5]);
92   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
93   step1[7] = _mm_add_epi32(step1[7], step1[6]);
94   step2[8] = step1[8];
95   step2[11] = step1[11];
96   step2[12] = step1[12];
97   step2[15] = step1[15];
98 
99   highbd_idct16_4col_stage5(step2, step1);
100   highbd_idct16_4col_stage6(step1, step2);
101   highbd_idct16_4col_stage7(step2, io);
102 }
103 
highbd_idct16x16_38_4col(__m128i * const io)104 static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
105   __m128i step1[16], step2[16];
106   __m128i temp1[2], sign[2];
107 
108   // stage 2
109   highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
110                                 &step2[15]);
111   highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
112                                     &step2[14]);
113   highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
114                                 &step2[13]);
115   highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
116                                     &step2[12]);
117 
118   // stage 3
119   highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
120                                 &step1[7]);
121   highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
122                                     &step1[6]);
123   step1[8] = _mm_add_epi32(step2[8], step2[9]);
124   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
125   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
126   step1[11] = _mm_add_epi32(step2[10], step2[11]);
127   step1[12] = _mm_add_epi32(step2[13], step2[12]);
128   step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
129   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
130   step1[15] = _mm_add_epi32(step2[15], step2[14]);
131 
132   // stage 4
133   abs_extend_64bit_sse2(io[0], temp1, sign);
134   step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
135   step2[1] = step2[0];
136   highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
137                                 &step2[3]);
138   highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
139                         &step2[14]);
140   highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
141                         &step2[13], &step2[10]);
142   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
143   step1[4] = _mm_add_epi32(step1[4], step1[5]);
144   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
145   step1[7] = _mm_add_epi32(step1[7], step1[6]);
146   step2[8] = step1[8];
147   step2[11] = step1[11];
148   step2[12] = step1[12];
149   step2[15] = step1[15];
150 
151   highbd_idct16_4col_stage5(step2, step1);
152   highbd_idct16_4col_stage6(step1, step2);
153   highbd_idct16_4col_stage7(step2, io);
154 }
155 
highbd_idct16x16_10_4col(__m128i * const io)156 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
157   __m128i step1[16], step2[16];
158   __m128i temp[2], sign[2];
159 
160   // stage 2
161   highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
162                                 &step2[15]);
163   highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
164                                     &step2[12]);
165 
166   // stage 3
167   highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
168                                 &step1[7]);
169   step1[8] = step2[8];
170   step1[9] = step2[8];
171   step1[10] =
172       _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
173   step1[11] = step2[11];
174   step1[12] = step2[12];
175   step1[13] =
176       _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
177   step1[14] = step2[15];
178   step1[15] = step2[15];
179 
180   // stage 4
181   abs_extend_64bit_sse2(io[0], temp, sign);
182   step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
183   step2[1] = step2[0];
184   step2[2] = _mm_setzero_si128();
185   step2[3] = _mm_setzero_si128();
186   highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
187                         &step2[14]);
188   highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
189                         &step2[13], &step2[10]);
190   step2[5] = step1[4];
191   step2[6] = step1[7];
192   step2[8] = step1[8];
193   step2[11] = step1[11];
194   step2[12] = step1[12];
195   step2[15] = step1[15];
196 
197   highbd_idct16_4col_stage5(step2, step1);
198   highbd_idct16_4col_stage6(step1, step2);
199   highbd_idct16_4col_stage7(step2, io);
200 }
201 
// Full 16x16 inverse transform: two 1-D passes over all 256 coefficients,
// with the result handed to highbd_write_buffer_{4,8}() one output row at a
// time.
//
//   input  - coefficients, read 16 per row
//   dest   - destination pixels
//   stride - distance between consecutive destination rows, in uint16_t units
//   bd     - bit depth; 8 selects the idct16_8col() path, any other value
//            the 32-bit highbd_idct16_4col() path
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  __m128i col[16];
  int n, row;

  if (bd != 8) {
    // First pass: four groups of 4 input rows, each transformed on its own.
    __m128i quads[4][16];

    for (n = 0; n < 4; ++n) {
      __m128i *const quad = quads[n];
      highbd_load_transpose_32bit_8x4(&input[0], 16, &quad[0]);
      highbd_load_transpose_32bit_8x4(&input[8], 16, &quad[8]);
      highbd_idct16_4col(quad);
      input += 4 * 16;
    }

    // Second pass: gather 4 entries from every group, transform them and
    // emit 16 rows of 4 pixels, then step 4 pixels to the right.
    for (n = 0; n < 16; n += 4) {
      transpose_32bit_4x4(quads[0] + n, col + 0);
      transpose_32bit_4x4(quads[1] + n, col + 4);
      transpose_32bit_4x4(quads[2] + n, col + 8);
      transpose_32bit_4x4(quads[3] + n, col + 12);
      highbd_idct16_4col(col);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dest + row * stride, col[row], bd);
      }
      dest += 4;
    }
  } else {
    // First pass: two groups of 8 input rows (128 coefficients each).
    __m128i half0[16], half1[16];

    for (n = 0; n < 2; ++n) {
      __m128i *const half = (n == 0) ? half0 : half1;
      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &half[0]);
      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &half[8]);
      idct16_8col(half, half);
      input += 128;
    }

    // Second pass: gather 8 entries from each group, transform them and emit
    // 16 rows of 8 pixels, then step 8 pixels to the right.
    for (n = 0; n < 16; n += 8) {
      transpose_16bit_8x8(half0 + n, col);
      transpose_16bit_8x8(half1 + n, col + 8);
      idct16_8col(col, col);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dest + row * stride, col[row], bd);
      }
      dest += 8;
    }
  }
}
256 
// Reduced 16x16 inverse transform: only the first 8 coefficients of the
// first 8 input rows are loaded; the rest of the block is never read.
//
//   input  - coefficients, read 16 per row
//   dest   - destination pixels
//   stride - distance between consecutive destination rows, in uint16_t units
//   bd     - bit depth; 8 selects the idct16_8col() path, any other value
//            the 32-bit highbd_idct16x16_38_4col() path
void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  __m128i col[16];
  int n, row;

  if (bd != 8) {
    // First pass: two groups of 4 input rows.
    __m128i quads[2][16];

    for (n = 0; n < 2; ++n) {
      __m128i *const quad = quads[n];
      highbd_load_transpose_32bit_8x4(input, 16, quad);
      highbd_idct16x16_38_4col(quad);
      input += 4 * 16;
    }

    // Second pass: the reduced kernel only reads col[0..7], which the two
    // transposes fill on every iteration.
    for (n = 0; n < 16; n += 4) {
      transpose_32bit_4x4(quads[0] + n, col + 0);
      transpose_32bit_4x4(quads[1] + n, col + 4);
      highbd_idct16x16_38_4col(col);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dest + row * stride, col[row], bd);
      }
      dest += 4;
    }
  } else {
    __m128i src[16], pass1[16];

    // First pass: one 8x8 load; the upper half of the kernel input is zeroed
    // explicitly.
    highbd_load_pack_transpose_32bit_8x8(input, 16, src);
    for (n = 8; n < 16; ++n) {
      src[n] = _mm_setzero_si128();
    }
    idct16_8col(src, pass1);

    // Second pass: |src| is reused as the kernel input for each 8-wide strip.
    for (n = 0; n < 16; n += 8) {
      transpose_16bit_8x8(pass1 + n, src);
      idct16_8col(src, col);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dest + row * stride, col[row], bd);
      }
      dest += 8;
    }
  }
}
304 
vpx_highbd_idct16x16_10_add_sse2(const tran_low_t * input,uint16_t * dest,int stride,int bd)305 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
306                                       int stride, int bd) {
307   int i;
308   __m128i out[16];
309 
310   if (bd == 8) {
311     __m128i in[16], l[16];
312 
313     in[0] = load_pack_8_32bit(input + 0 * 16);
314     in[1] = load_pack_8_32bit(input + 1 * 16);
315     in[2] = load_pack_8_32bit(input + 2 * 16);
316     in[3] = load_pack_8_32bit(input + 3 * 16);
317 
318     idct16x16_10_pass1(in, l);
319 
320     for (i = 0; i < 16; i += 8) {
321       int j;
322       idct16x16_10_pass2(l + i, in);
323 
324       for (j = 0; j < 16; ++j) {
325         highbd_write_buffer_8(dest + j * stride, in[j], bd);
326       }
327       dest += 8;
328     }
329   } else {
330     __m128i all[2][16], *in;
331 
332     for (i = 0; i < 2; i++) {
333       in = all[i];
334       highbd_load_transpose_32bit_4x4(input, 16, in);
335       highbd_idct16x16_10_4col(in);
336       input += 4 * 16;
337     }
338 
339     for (i = 0; i < 16; i += 4) {
340       int j;
341       transpose_32bit_4x4(&all[0][i], out);
342       highbd_idct16x16_10_4col(out);
343 
344       for (j = 0; j < 16; ++j) {
345         highbd_write_buffer_4(dest + j * stride, out[j], bd);
346       }
347       dest += 4;
348     }
349   }
350 }
351 
// Single-coefficient variant of the 16x16 inverse transform: forwards all
// arguments to the shared highbd_idct_1_add_kernel() helper.
void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  // Last argument of the shared kernel; presumably the block dimension
  // (16 for a 16x16 transform) -- confirm against the helper's definition.
  const int size = 16;

  highbd_idct_1_add_kernel(input, dest, stride, bd, size);
}
356