/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <string.h>

#include "vpx_dsp/inv_txfm.h"

void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}
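
/* Illustrative usage sketch (not part of the library; buffer contents are
 * hypothetical). The 4x4 WHT path is used by VP9's lossless mode, where the
 * dequantized coefficients arrive pre-scaled by the unit quantizer:
 *
 *   tran_low_t coeffs[16];  // dequantized WHT coefficients, row-major
 *   uint8_t recon[4 * 4];   // prediction pixels, reconstructed in place
 *   vpx_iwht4x4_16_add_c(coeffs, recon, 4);  // stride 4 = packed 4x4 block
 */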

void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}

void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}
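
/* Fixed-point sketch of the butterflies above (under the usual libvpx
 * conventions): each cospi_k_64 holds round(2^14 * cos(k * pi / 64)) and
 * dct_const_round_shift() rounds the product back down, roughly
 * (x + 2^13) >> 14. Stage 1 therefore computes
 *
 *   step[0] ~ (input[0] + input[2]) * cos(pi/4)
 *   step[2] ~ input[1] * cos(3*pi/8) - input[3] * sin(3*pi/8)
 *
 * in integer arithmetic, and stage 2 recombines the even/odd halves with
 * plain adds and subtracts. WRAPLOW(., 8) wraps intermediates to 16 bits
 * when hardware emulation is enabled and is a plain cast otherwise.
 */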

void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}
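
/* The 2-D inverse transform is separable: one 1-D idct4 pass over the rows
 * into a scratch block, one over the columns, then ROUND_POWER_OF_TWO(., 4)
 * and clip_pixel_add() to fold the residual into the prediction. The final
 * shift undoes scaling left in place by the forward transform and grows with
 * block size: 4 here, 5 for 8x8, 6 for 16x16 and 32x32 (see the *_add_c
 * functions below).
 */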

void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}
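
/* DC-only fast path (taken when eob == 1, so input[0] is the only non-zero
 * coefficient): both 1-D passes collapse to a multiply by cospi_16_64
 * (~2^14 * cos(pi/4)), and two such rounds scale the DC term by roughly 1/2
 * before the usual >> 4; the single value a1 is then added to all 16 pixels.
 */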

void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}

void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
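
/* Unlike the DCT paths, the 4-point inverse ADST above is a direct multiply
 * by fixed-point constants derived from sin(k * pi / 9) (with the transform's
 * normalization folded into the sinpi_*_9 values); there is no butterfly
 * structure to exploit, which is why the all-zero early-out is worthwhile
 * here.
 */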

void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}

void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
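
/* Partial-transform note: the decoder dispatches to this variant for small
 * end-of-block values (at most 12 non-zero coefficients), which with the VP9
 * scan orders confines them to the first four rows. Zero-initializing out[]
 * and running the row pass on those four rows only matches the output of
 * vpx_idct8x8_64_add_c at roughly half the row-pass cost.
 */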

void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}
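
/* The input[k/2] indexing in stage 1 of idct16_c appears to be a notational
 * leftover: the numerators 0, 16, 8, 24, ... are even positions of a 32-point
 * lattice, and halving them yields the bit-reversed load order
 * 0, 8, 4, 12, ... of the 16 stored coefficients. The divisions are
 * compile-time constants, so no run-time cost is involved.
 */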

void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}

void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
  step2[31] = WRAPLOW(step1[30] + step1[31], 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
  step2[23] = WRAPLOW(step1[20] + step1[23], 8);

  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
  step2[31] = WRAPLOW(step1[28] + step1[31], 8);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
  step2[23] = WRAPLOW(step1[16] - step1[23], 8);

  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
  step2[31] = WRAPLOW(step1[24] + step1[31], 8);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
  step1[15] = WRAPLOW(step2[0] - step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], 8);
  output[1] = WRAPLOW(step1[1] + step1[30], 8);
  output[2] = WRAPLOW(step1[2] + step1[29], 8);
  output[3] = WRAPLOW(step1[3] + step1[28], 8);
  output[4] = WRAPLOW(step1[4] + step1[27], 8);
  output[5] = WRAPLOW(step1[5] + step1[26], 8);
  output[6] = WRAPLOW(step1[6] + step1[25], 8);
  output[7] = WRAPLOW(step1[7] + step1[24], 8);
  output[8] = WRAPLOW(step1[8] + step1[23], 8);
  output[9] = WRAPLOW(step1[9] + step1[22], 8);
  output[10] = WRAPLOW(step1[10] + step1[21], 8);
  output[11] = WRAPLOW(step1[11] + step1[20], 8);
  output[12] = WRAPLOW(step1[12] + step1[19], 8);
  output[13] = WRAPLOW(step1[13] + step1[18], 8);
  output[14] = WRAPLOW(step1[14] + step1[17], 8);
  output[15] = WRAPLOW(step1[15] + step1[16], 8);
  output[16] = WRAPLOW(step1[15] - step1[16], 8);
  output[17] = WRAPLOW(step1[14] - step1[17], 8);
  output[18] = WRAPLOW(step1[13] - step1[18], 8);
  output[19] = WRAPLOW(step1[12] - step1[19], 8);
  output[20] = WRAPLOW(step1[11] - step1[20], 8);
  output[21] = WRAPLOW(step1[10] - step1[21], 8);
  output[22] = WRAPLOW(step1[9] - step1[22], 8);
  output[23] = WRAPLOW(step1[8] - step1[23], 8);
  output[24] = WRAPLOW(step1[7] - step1[24], 8);
  output[25] = WRAPLOW(step1[6] - step1[25], 8);
  output[26] = WRAPLOW(step1[5] - step1[26], 8);
  output[27] = WRAPLOW(step1[4] - step1[27], 8);
  output[28] = WRAPLOW(step1[3] - step1[28], 8);
  output[29] = WRAPLOW(step1[2] - step1[29], 8);
  output[30] = WRAPLOW(step1[1] - step1[30], 8);
  output[31] = WRAPLOW(step1[0] - step1[31], 8);
}

void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
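
/* The zero_coeff[] tree in the row loop above folds each 32-coefficient row
 * by repeated pairwise ORs (16 -> 8 -> 4 -> 2 partial results), so the
 * all-zero test at the end only has to look at two values. A simpler
 * equivalent, for reference (sketch, not library code):
 *
 *   int nonzero = 0;
 *   for (j = 0; j < 32; ++j) nonzero |= (int)input[j];
 *   if (nonzero) idct32_c(input, outptr);
 *   else memset(outptr, 0, sizeof(tran_low_t) * 32);
 *
 * Skipping idct32_c for all-zero rows pays off because high-frequency rows
 * are usually empty.
 */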

void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;

  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, bd);
    op[1] = WRAPLOW(b1, bd);
    op[2] = WRAPLOW(c1, bd);
    op[3] = WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}
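
/* The high-bitdepth variants (this function and the ones below) mirror the
 * 8-bit functions above but write uint16_t pixels: CONVERT_TO_SHORTPTR()
 * recovers the uint16_t pointer from the uint8_t *dest8 handle, and
 * WRAPLOW()/highbd_clip_pixel_add() take the bit depth bd so intermediate
 * wrapping and the final clamp track the wider (10- or 12-bit) pixel range.
 */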

void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = highbd_clip_pixel_add(
        dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = highbd_clip_pixel_add(
        dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = highbd_clip_pixel_add(
        dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = highbd_clip_pixel_add(
        dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}

void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], bd);
  output[1] = WRAPLOW(step[1] + step[2], bd);
  output[2] = WRAPLOW(step[1] - step[2], bd);
  output[3] = WRAPLOW(step[0] - step[3], bd);
}

void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct4_c(input, outptr, bd);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    vpx_highbd_idct4_c(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    }
  }
}

void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
    dest += dest_stride;
  }
}

void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2 & stage 3 - even half
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], bd);
  output[1] = WRAPLOW(step1[1] + step1[6], bd);
  output[2] = WRAPLOW(step1[2] + step1[5], bd);
  output[3] = WRAPLOW(step1[3] + step1[4], bd);
  output[4] = WRAPLOW(step1[3] - step1[4], bd);
  output[5] = WRAPLOW(step1[2] - step1[5], bd);
  output[6] = WRAPLOW(step1[1] - step1[6], bd);
  output[7] = WRAPLOW(step1[0] - step1[7], bd);
}
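
/* Note the structural difference from the 8-bit idct8_c above: here the even
 * half is delegated to vpx_highbd_idct4_c, which transforms step1[0..3] in
 * place (stages 2 and 3), and only the odd half is spelled out before the
 * stage 4 recombination.
 */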

void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 8; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void) bd;

  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
1520
vpx_highbd_iadst8_c(const tran_low_t * input,tran_low_t * output,int bd)1521 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1522 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1523
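  // The ADST reads its inputs in a permuted order; the matching inverse
  // permutation (with sign flips) is applied when the outputs are written.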
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);

  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x4, bd);
  output[2] = WRAPLOW(x6, bd);
  output[3] = WRAPLOW(-x2, bd);
  output[4] = WRAPLOW(x3, bd);
  output[5] = WRAPLOW(-x7, bd);
  output[6] = WRAPLOW(x5, bd);
  output[7] = WRAPLOW(-x1, bd);
}

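// The _10 suffix mirrors the lowbd version: this is the specialization the
// decoder is expected to dispatch to when at most the first 10 coefficients
// (all inside the top-left 4x4 of the 8x8 block) can be non-zero.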
void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }
  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];
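  // The N/2 indices are compile-time constants kept for comparison with the
  // 32-point transform below, where the same positions appear undivided; the
  // compiler folds the divisions away.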

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], bd);
  output[1] = WRAPLOW(step2[1] + step2[14], bd);
  output[2] = WRAPLOW(step2[2] + step2[13], bd);
  output[3] = WRAPLOW(step2[3] + step2[12], bd);
  output[4] = WRAPLOW(step2[4] + step2[11], bd);
  output[5] = WRAPLOW(step2[5] + step2[10], bd);
  output[6] = WRAPLOW(step2[6] + step2[9], bd);
  output[7] = WRAPLOW(step2[7] + step2[8], bd);
  output[8] = WRAPLOW(step2[7] - step2[8], bd);
  output[9] = WRAPLOW(step2[6] - step2[9], bd);
  output[10] = WRAPLOW(step2[5] - step2[10], bd);
  output[11] = WRAPLOW(step2[4] - step2[11], bd);
  output[12] = WRAPLOW(step2[3] - step2[12], bd);
  output[13] = WRAPLOW(step2[2] - step2[13], bd);
  output[14] = WRAPLOW(step2[1] - step2[14], bd);
  output[15] = WRAPLOW(step2[0] - step2[15], bd);
}

void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 16; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, bd);
  x1 = WRAPLOW(s1 + s5, bd);
  x2 = WRAPLOW(s2 + s6, bd);
  x3 = WRAPLOW(s3 + s7, bd);
  x4 = WRAPLOW(s0 - s4, bd);
  x5 = WRAPLOW(s1 - s5, bd);
  x6 = WRAPLOW(s2 - s6, bd);
  x7 = WRAPLOW(s3 - s7, bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
  x8 = WRAPLOW(s8 + s10, bd);
  x9 = WRAPLOW(s9 + s11, bd);
  x10 = WRAPLOW(s8 - s10, bd);
  x11 = WRAPLOW(s9 - s11, bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);

  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x8, bd);
  output[2] = WRAPLOW(x12, bd);
  output[3] = WRAPLOW(-x4, bd);
  output[4] = WRAPLOW(x6, bd);
  output[5] = WRAPLOW(x14, bd);
  output[6] = WRAPLOW(x10, bd);
  output[7] = WRAPLOW(x2, bd);
  output[8] = WRAPLOW(x3, bd);
  output[9] = WRAPLOW(x11, bd);
  output[10] = WRAPLOW(x15, bd);
  output[11] = WRAPLOW(x7, bd);
  output[12] = WRAPLOW(x5, bd);
  output[13] = WRAPLOW(-x13, bd);
  output[14] = WRAPLOW(x9, bd);
  output[15] = WRAPLOW(-x1, bd);
}

void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

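// 32-point 1-D inverse transform shared by the 32x32 variants below.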
static void highbd_idct32_c(const tran_low_t *input,
                            tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
  step2[31] = WRAPLOW(step1[30] + step1[31], bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
  step2[23] = WRAPLOW(step1[20] + step1[23], bd);

  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = WRAPLOW(step1[28] + step1[31], bd);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = WRAPLOW(step1[16] - step1[23], bd);

  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = WRAPLOW(step1[24] + step1[31], bd);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = WRAPLOW(step2[0] - step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], bd);
  output[1] = WRAPLOW(step1[1] + step1[30], bd);
  output[2] = WRAPLOW(step1[2] + step1[29], bd);
  output[3] = WRAPLOW(step1[3] + step1[28], bd);
  output[4] = WRAPLOW(step1[4] + step1[27], bd);
  output[5] = WRAPLOW(step1[5] + step1[26], bd);
  output[6] = WRAPLOW(step1[6] + step1[25], bd);
  output[7] = WRAPLOW(step1[7] + step1[24], bd);
  output[8] = WRAPLOW(step1[8] + step1[23], bd);
  output[9] = WRAPLOW(step1[9] + step1[22], bd);
  output[10] = WRAPLOW(step1[10] + step1[21], bd);
  output[11] = WRAPLOW(step1[11] + step1[20], bd);
  output[12] = WRAPLOW(step1[12] + step1[19], bd);
  output[13] = WRAPLOW(step1[13] + step1[18], bd);
  output[14] = WRAPLOW(step1[14] + step1[17], bd);
  output[15] = WRAPLOW(step1[15] + step1[16], bd);
  output[16] = WRAPLOW(step1[15] - step1[16], bd);
  output[17] = WRAPLOW(step1[14] - step1[17], bd);
  output[18] = WRAPLOW(step1[13] - step1[18], bd);
  output[19] = WRAPLOW(step1[12] - step1[19], bd);
  output[20] = WRAPLOW(step1[11] - step1[20], bd);
  output[21] = WRAPLOW(step1[10] - step1[21], bd);
  output[22] = WRAPLOW(step1[9] - step1[22], bd);
  output[23] = WRAPLOW(step1[8] - step1[23], bd);
  output[24] = WRAPLOW(step1[7] - step1[24], bd);
  output[25] = WRAPLOW(step1[6] - step1[25], bd);
  output[26] = WRAPLOW(step1[5] - step1[26], bd);
  output[27] = WRAPLOW(step1[4] - step1[27], bd);
  output[28] = WRAPLOW(step1[3] - step1[28], bd);
  output[29] = WRAPLOW(step1[2] - step1[29], bd);
  output[30] = WRAPLOW(step1[1] - step1[30], bd);
  output[31] = WRAPLOW(step1[0] - step1[31], bd);
}

void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
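  // An O(log n) OR-reduction folds each 32-coefficient row into a single
  // flag, so all-zero rows skip the 32-point transform and are cleared
  // instead.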
  for (i = 0; i < 32; ++i) {
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      highbd_idct32_c(input, outptr, bd);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    highbd_idct32_c(input, outptr, bd);
    input += 32;
    outptr += 32;
  }
  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  int a1;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH