/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

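// 16x16 inverse transform for high bit-depth input where any of the 256
// coefficients may be non-zero. The 32-bit coefficients are packed to 16 bits
// and the faster 16-bit SSE2 idct is used when the values are small enough
// for that path; otherwise the code falls back to the C transform.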
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
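  // Bounds for the 16-bit fast path: if every packed coefficient lies in
  // [-3155, 3155] the SSE2 16-bit transform is used, otherwise the code falls
  // back to the C implementation (presumably to avoid overflowing the 16-bit
  // intermediate values).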
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

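  // test is non-zero if any lane of the comparison result is set, i.e. if
  // some coefficient falls outside the range checked above.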
  if (!test) {
    // Do the row transform
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

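    // If the column check fails, transpose back and sign-extend the 16-bit
    // row-transform output to 32 bits in out[] so the C column transform
    // below can finish the job.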
    if (test) {
      array_transpose_16x16(inptr, inptr + 16);
      for (i = 0; i < 16; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set flag to use the optimised column transform
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

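  // Both range checks passed, so the 16-bit SSE2 column transform is safe.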
  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final rounding and shift, then reconstruction and store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}

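// Variant used when all non-zero coefficients lie in the top-left 4x4 corner
// of the block; only those four rows need to be range-checked or sent through
// the C row-transform fallback.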
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  // Since all non-zero dct coefficients are in the upper-left 4x4 area,
  // we only need to consider the first 4 rows here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform (N.B. This transposes inptr)
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    // N.B. Only the first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

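    // If the column check fails, transpose the relevant 8x8 blocks back and
    // sign-extend the first four rows into out[]; the rest of out[] is
    // already zero-initialised for the C column transform below.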
    if (test) {
      // Use the fact that only the first 4 rows contain non-zero coeffs
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set flag to use the optimised column transform
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final rounding and shift, then reconstruction and store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}

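// DC-only case: all work is delegated to the shared high bit-depth DC add
// kernel.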
void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
}