1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/inv_txfm_sse2.h"
14 #include "vpx_dsp/x86/transpose_sse2.h"
15 #include "vpx_dsp/x86/txfm_common_sse2.h"
16
// Inverse 16x16 DCT for high bit-depth input (general case, up to all 256
// coefficients non-zero), adding the reconstructed residual into the
// uint16_t frame buffer |dest| (row pitch |stride| pixels, bit depth |bd|).
//
// The fast SSE2 transform (idct16_sse2) works on 16-bit lanes, so it is only
// usable when every coefficient is small enough that intermediates cannot
// overflow int16. Each pass is therefore range-checked against +/-3155
// (threshold presumably derived from the worst-case transform gain --
// TODO(review): confirm against the scalar reference); when a pass fails the
// check, it falls back to the 32-bit C implementation vpx_highbd_idct16_c.
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  // Register layout: inptr[i] holds row i, columns 0-7; inptr[i + 16] holds
  // row i, columns 8-15 (eight 16-bit lanes per register).
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);  // bias for the final >> 6
  const __m128i max = _mm_set1_epi16(3155);     // 16-bit overflow guard bound
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits.
  // _mm_packs_epi32 saturates, but any coefficient outside int16 range would
  // also fail the +/-3155 test below, so the fallback path (which re-reads
  // the original 32-bit |input|) is taken in that case.
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  // Non-zero iff some lane exceeded the safe range in either direction.
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Rows were done in SSE2 but the columns would overflow 16 bits:
      // transpose back to row order and sign-extend the 16-bit results into
      // the 32-bit |out| buffer so the C column transform can consume them.
      array_transpose_16x16(inptr, inptr + 16);
      for (i = 0; i < 16; i++) {
        // Sign-extend 16 -> 32 bit by interleaving each value with its sign
        // mask; row i lands at out[16 * i .. 16 * i + 15].
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform (32-bit C path on the original
    // coefficients).
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift and Reconstruction and Store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        // (x + 32) >> 6 == ROUND_POWER_OF_TWO(x, 6) for the values here.
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        // Add the residual to the prediction and clamp to [0, 2^bd - 1].
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform on |out| (filled by whichever
    // row path executed above), reconstructing one column at a time.
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
128
// Inverse 16x16 DCT for high bit-depth input, specialized for streams where
// only the upper-left 4x4 block of coefficients can be non-zero (small eob).
// Same structure as the _256_ variant: range-check against +/-3155
// (presumably the 16-bit overflow bound of idct16_sse2 -- TODO(review):
// confirm), use the SSE2 transform when safe, otherwise fall back to
// vpx_highbd_idct16_c -- but the checks and fallbacks only touch the rows
// that can actually be non-zero.
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  // Zero-initialized because the C row fallback below writes only the first
  // 4 rows; the column pass reads all 16.
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  // inptr[i] = row i, cols 0-7; inptr[i + 16] = row i, cols 8-15.
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);  // bias for the final >> 6
  const __m128i max = _mm_set1_epi16(3155);     // 16-bit overflow guard bound
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits (saturating; out-of-range
  // values fail the range test below and take the 32-bit fallback).
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  // Since all non-zero dct coefficients are in upper-left 4x4 area,
  // we only need to consider first 4 rows here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  // Non-zero iff some lane exceeded the safe range.
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform (N.B. This transposes inptr)
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    // N.B. Only first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Columns would overflow 16 bits: transpose back and sign-extend the
      // row-transform results into |out| for the C column pass.
      // Use fact only first 4 rows contain non-zero coeffs
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
      for (i = 0; i < 4; i++) {
        // Sign-extend 16 -> 32 bit; row i lands at out[16 * i .. 16 * i + 15].
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform (only the 4 potentially non-zero
    // rows; the rest of |out| stays zero from its initializer).
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift and Reconstruction and Store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        // (x + 32) >> 6 == ROUND_POWER_OF_TWO(x, 6) for the values here.
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        // Add the residual to the prediction and clamp to [0, 2^bd - 1].
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform on |out|, one column at a time.
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
245
// Inverse 16x16 DCT for the DC-only case (a single non-zero coefficient):
// delegates to the shared kernel with block size 16, which reconstructs the
// block directly into |dest|.
void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
}
250